From eaa9f5162fbca2ebcb2682eb861bc7e5510a2b66 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Tue, 24 Oct 2023 01:49:05 +0800
Subject: Add CPU fp8 support

Since norm layer need fp32, I only convert the linear operation layer(conv2d/linear)

And TE have some pytorch function not support bf16 amp in CPU. I add a condition to indicate if the autocast is for unet.
---
 modules/processing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'modules/processing.py')

diff --git a/modules/processing.py b/modules/processing.py
index 40598f5c..2df8a7ea 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -865,7 +865,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
             if p.n_iter > 1:
                 shared.state.job = f"Batch {n+1} out of {p.n_iter}"
 
-            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast():
+            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast(unet=True):
                 samples_ddim = p.sample(conditioning=p.c, unconditional_conditioning=p.uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts)
 
             if getattr(samples_ddim, 'already_decoded', False):
-- 
cgit v1.2.3


From d4d3134f6d2d232c7bcfa80900a362921e644976 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sat, 28 Oct 2023 15:24:26 +0800
Subject: ManualCast for 10/16 series gpu

---
 modules/devices.py    | 57 +++++++++++++++++++++++++++++++++++++++++++++------
 modules/processing.py |  2 +-
 modules/sd_models.py  | 21 +++++++++++--------
 3 files changed, 64 insertions(+), 16 deletions(-)

(limited to 'modules/processing.py')

diff --git a/modules/devices.py b/modules/devices.py
index 0cd2b55d..c05f2b35 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -16,6 +16,23 @@ def has_mps() -> bool:
         return mac_specific.has_mps
 
 
+def cuda_no_autocast(device_id=None) -> bool:
+    if device_id is None:
+        device_id = get_cuda_device_id()
+    return (
+        torch.cuda.get_device_capability(device_id) == (7, 5) 
+        and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16")
+    )
+
+
+def get_cuda_device_id():
+    return (
+        int(shared.cmd_opts.device_id) 
+        if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit() 
+        else 0
+    ) or torch.cuda.current_device()
+
+
 def get_cuda_device_string():
     if shared.cmd_opts.device_id is not None:
         return f"cuda:{shared.cmd_opts.device_id}"
@@ -60,8 +77,7 @@ def enable_tf32():
 
         # enabling benchmark option seems to enable a range of cards to do fp16 when they otherwise can't
         # see https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/4407
-        device_id = (int(shared.cmd_opts.device_id) if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit() else 0) or torch.cuda.current_device()
-        if torch.cuda.get_device_capability(device_id) == (7, 5) and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16"):
+        if cuda_no_autocast():
             torch.backends.cudnn.benchmark = True
 
         torch.backends.cuda.matmul.allow_tf32 = True
@@ -92,15 +108,44 @@ def cond_cast_float(input):
 
 
 nv_rng = None
-
-
-def autocast(disable=False, unet=False):
+patch_module_list = [
+    torch.nn.Linear,
+    torch.nn.Conv2d,
+    torch.nn.MultiheadAttention,
+    torch.nn.GroupNorm,
+    torch.nn.LayerNorm,
+]
+
+@contextlib.contextmanager
+def manual_autocast():
+    def manual_cast_forward(self, *args, **kwargs):
+        org_dtype = next(self.parameters()).dtype
+        self.to(dtype)
+        result = self.org_forward(*args, **kwargs)
+        self.to(org_dtype)
+        return result
+    for module_type in patch_module_list:
+        org_forward = module_type.forward
+        module_type.forward = manual_cast_forward
+        module_type.org_forward = org_forward
+    try:
+        yield None
+    finally:
+        for module_type in patch_module_list:
+            module_type.forward = module_type.org_forward
+
+
+def autocast(disable=False):
+    print(fp8, dtype, shared.cmd_opts.precision, device)
     if disable:
         return contextlib.nullcontext()
 
-    if unet and fp8 and device==cpu:
+    if fp8 and device==cpu:
         return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)
 
+    if fp8 and (dtype == torch.float32 or shared.cmd_opts.precision == "full" or cuda_no_autocast()):
+        return manual_autocast()
+
     if dtype == torch.float32 or shared.cmd_opts.precision == "full":
         return contextlib.nullcontext()
 
diff --git a/modules/processing.py b/modules/processing.py
index 2df8a7ea..40598f5c 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -865,7 +865,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
             if p.n_iter > 1:
                 shared.state.job = f"Batch {n+1} out of {p.n_iter}"
 
-            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast(unet=True):
+            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast():
                 samples_ddim = p.sample(conditioning=p.c, unconditional_conditioning=p.uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts)
 
             if getattr(samples_ddim, 'already_decoded', False):
diff --git a/modules/sd_models.py b/modules/sd_models.py
index ccb6afd2..31bcb913 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -403,23 +403,26 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
 
     if enable_fp8:
         devices.fp8 = True
+        if model.is_sdxl:
+            cond_stage = model.conditioner
+        else:
+            cond_stage = model.cond_stage_model
+
+        for module in cond_stage.modules():
+            if isinstance(module, torch.nn.Linear):
+                module.to(torch.float8_e4m3fn)
+
         if devices.device == devices.cpu:
             for module in model.model.diffusion_model.modules():
                 if isinstance(module, torch.nn.Conv2d):
                     module.to(torch.float8_e4m3fn)
                 elif isinstance(module, torch.nn.Linear):
                     module.to(torch.float8_e4m3fn)
-            timer.record("apply fp8 unet for cpu")
         else:
-            if model.is_sdxl:
-                cond_stage = model.conditioner
-            else:
-                cond_stage = model.cond_stage_model
-            for module in cond_stage.modules():
-                if isinstance(module, torch.nn.Linear):
-                    module.to(torch.float8_e4m3fn)
             model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
-            timer.record("apply fp8 unet")
+        timer.record("apply fp8")
+    else:
+        devices.fp8 = False
 
     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16
 
-- 
cgit v1.2.3


From ea272152e0b50dbb2bd675ec020607f3d50c37d0 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sat, 16 Dec 2023 15:08:08 +0800
Subject: Add FP8 settings into PNG info

---
 modules/generation_parameters_copypaste.py | 6 ++++++
 modules/processing.py                      | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'modules/processing.py')

diff --git a/modules/generation_parameters_copypaste.py b/modules/generation_parameters_copypaste.py
index 4efe53e0..dbffe494 100644
--- a/modules/generation_parameters_copypaste.py
+++ b/modules/generation_parameters_copypaste.py
@@ -314,6 +314,12 @@ Steps: 20, Sampler: Euler a, CFG scale: 7, Seed: 965400086, Size: 512x512, Model
     if "VAE Decoder" not in res:
         res["VAE Decoder"] = "Full"
 
+    if "FP8 weight" not in res:
+        res["FP8 weight"] = "Disable"
+
+    if "Cache FP16 weight for LoRA" not in res and res["FP8 weight"] != "Disable":
+        res["Cache FP16 weight for LoRA"] = False
+
     skip = set(shared.opts.infotext_skip_pasting)
     res = {k: v for k, v in res.items() if k not in skip}
 
diff --git a/modules/processing.py b/modules/processing.py
index bea01ec6..179f2c0f 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -688,6 +688,8 @@ def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments=None, iter
         "Size": f"{p.width}x{p.height}",
         "Model hash": p.sd_model_hash if opts.add_model_hash_to_info else None,
         "Model": p.sd_model_name if opts.add_model_name_to_info else None,
+        "FP8 weight": opts.fp8_storage if devices.fp8 else None,
+        "Cache FP16 weight for LoRA": opts.cache_fp16_weight if devices.fp8 else None,
         "VAE hash": p.sd_vae_hash if opts.add_vae_hash_to_info else None,
         "VAE": p.sd_vae_name if opts.add_vae_name_to_info else None,
         "Variation seed": (None if p.subseed_strength == 0 else (p.all_subseeds[0] if use_main_prompt else all_subseeds[index])),
-- 
cgit v1.2.3