From 7c128bbdac0da1767c239174e91af6f327845372 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Thu, 19 Oct 2023 13:56:17 +0800
Subject: Add fp8 for sd unet

---
 modules/sd_models.py | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 3b6cdea1..3b8ff820 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -391,6 +391,9 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
             devices.dtype_unet = torch.float16
             timer.record("apply half()")

+    if shared.cmd_opts.opt_unet_fp8_storage:
+        model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
+        timer.record("apply fp8 unet")
     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16

--
cgit v1.2.3

From 5f9ddfa46f28ca2aa9e0bd832f6bbd67069be63e Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Thu, 19 Oct 2023 23:57:22 +0800
Subject: Add sdxl only arg

---
 modules/cmd_args.py  | 1 +
 modules/sd_models.py | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'modules/sd_models.py')

diff --git a/modules/cmd_args.py b/modules/cmd_args.py
index 0f14c71e..20bfb2c4 100644
--- a/modules/cmd_args.py
+++ b/modules/cmd_args.py
@@ -119,3 +119,4 @@ parser.add_argument("--disable-all-extensions", action='store_true', help="preve
 parser.add_argument("--disable-extra-extensions", action='store_true', help="prevent all extensions except built-in from running regardless of any other settings", default=False)
 parser.add_argument("--skip-load-model-at-start", action='store_true', help="if load a model at web start, only take effect when --nowebui", )
 parser.add_argument("--opt-unet-fp8-storage", action='store_true', help="use fp8 for SD UNet to save vram", default=False)
+parser.add_argument("--opt-unet-fp8-storage-xl", action='store_true', help="use fp8 for SD UNet to save vram", default=False)
diff --git a/modules/sd_models.py b/modules/sd_models.py
index 3b8ff820..08af128f 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -394,6 +394,9 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
     if shared.cmd_opts.opt_unet_fp8_storage:
         model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
         timer.record("apply fp8 unet")
+    elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
+        model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
+        timer.record("apply fp8 unet for sdxl")
     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16

--
cgit v1.2.3
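What the two patches above buy is a storage-only cast: weights are kept in fp8 and nothing is computed in fp8 yet. A minimal standalone sketch, assuming torch>=2.1.0 (the first release with float8_e4m3fn); the toy "unet" stands in for model.model.diffusion_model:

import torch

unet = torch.nn.Sequential(torch.nn.Linear(1024, 4096), torch.nn.Linear(4096, 1024))

def param_bytes(m: torch.nn.Module) -> int:
    return sum(p.numel() * p.element_size() for p in m.parameters())

unet.half()
print(param_bytes(unet))      # fp16: 2 bytes per weight
unet.to(torch.float8_e4m3fn)  # storage-only cast, as in the patch above
print(param_bytes(unet))      # fp8: 1 byte per weight, halving weight memory again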
From eaa9f5162fbca2ebcb2682eb861bc7e5510a2b66 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Tue, 24 Oct 2023 01:49:05 +0800
Subject: Add CPU fp8 support

Since norm layers need fp32, only the linear operation layers (Conv2d/Linear)
are converted. The text encoder also uses some PyTorch functions that do not
support bf16 autocast on CPU, so a condition is added to indicate whether the
autocast is for the UNet.
---
 modules/devices.py    |  6 +++++-
 modules/processing.py |  2 +-
 modules/sd_models.py  | 20 ++++++++++++++++----
 3 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/devices.py b/modules/devices.py
index 1d4eb563..0cd2b55d 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -71,6 +71,7 @@ def enable_tf32():
 errors.run(enable_tf32, "Enabling TF32")

 cpu: torch.device = torch.device("cpu")
+fp8: bool = False
 device: torch.device = None
 device_interrogate: torch.device = None
 device_gfpgan: torch.device = None
@@ -93,10 +94,13 @@ def cond_cast_float(input):

 nv_rng = None

-def autocast(disable=False):
+def autocast(disable=False, unet=False):
     if disable:
         return contextlib.nullcontext()

+    if unet and fp8 and device==cpu:
+        return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)
+
     if dtype == torch.float32 or shared.cmd_opts.precision == "full":
         return contextlib.nullcontext()

diff --git a/modules/processing.py b/modules/processing.py
index 40598f5c..2df8a7ea 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -865,7 +865,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
             if p.n_iter > 1:
                 shared.state.job = f"Batch {n+1} out of {p.n_iter}"

-            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast():
+            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast(unet=True):
                 samples_ddim = p.sample(conditioning=p.c, unconditional_conditioning=p.uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts)

             if getattr(samples_ddim, 'already_decoded', False):
diff --git a/modules/sd_models.py b/modules/sd_models.py
index 08af128f..c5fe57bf 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -391,12 +391,24 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
             devices.dtype_unet = torch.float16
             timer.record("apply half()")

-    if shared.cmd_opts.opt_unet_fp8_storage:
+
+    if shared.cmd_opts.opt_unet_fp8_storage:
+        enable_fp8 = True
+    elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
+        enable_fp8 = True
+
+    if enable_fp8:
+        devices.fp8 = True
+        if devices.device == devices.cpu:
+            for module in model.model.diffusion_model.modules():
+                if isinstance(module, torch.nn.Conv2d):
+                    module.to(torch.float8_e4m3fn)
+                elif isinstance(module, torch.nn.Linear):
+                    module.to(torch.float8_e4m3fn)
+            timer.record("apply fp8 unet for cpu")
+        else:
             model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
             timer.record("apply fp8 unet")
-    elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
-        model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
-        timer.record("apply fp8 unet for sdxl")
     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16

--
cgit v1.2.3

From 9c1eba2af3a6f9cd6282b3a367656793cbe70c01 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Tue, 24 Oct 2023 02:11:27 +0800
Subject: Fix lint

---
 modules/sd_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index c5fe57bf..44d4038b 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -396,7 +396,7 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         enable_fp8 = True
     elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
         enable_fp8 = True
-
+
     if enable_fp8:
         devices.fp8 = True
         if devices.device == devices.cpu:

--
cgit v1.2.3
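The CPU path above converts per-module rather than casting the whole UNet, so norm layers keep their precision. A standalone sketch of that filter (the loop body mirrors the patch; the function wrapper and name are illustrative; assumes torch>=2.1.0):

import torch

def unet_to_fp8_storage_cpu(diffusion_model: torch.nn.Module) -> None:
    # only matmul/conv weights go to fp8; GroupNorm/LayerNorm stay as-is
    for module in diffusion_model.modules():
        if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)):
            module.to(torch.float8_e4m3fn)

# on CPU, sampling then runs under bf16 autocast, mirroring the devices.py change:
# torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)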
From 1df6c8bfec4715610d64684b6ad2fa38c76c1df6 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 25 Oct 2023 11:36:43 +0800
Subject: fp8 for TE

---
 modules/sd_models.py | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 44d4038b..69395294 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -407,6 +407,13 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
                     module.to(torch.float8_e4m3fn)
             timer.record("apply fp8 unet for cpu")
         else:
+            if model.is_sdxl:
+                cond_stage = model.conditioner
+            else:
+                cond_stage = model.cond_stage_model
+            for module in cond_stage.modules():
+                if isinstance(module, torch.nn.Linear):
+                    module.to(torch.float8_e4m3fn)
             model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
             timer.record("apply fp8 unet")

--
cgit v1.2.3

From 4830b251366436ee8499c003fe87e46ddb4a4581 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 25 Oct 2023 11:53:37 +0800
Subject: Fix alphas_cumprod dtype

---
 modules/sd_models.py | 1 +
 1 file changed, 1 insertion(+)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 69395294..23660454 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -416,6 +416,7 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
                     module.to(torch.float8_e4m3fn)
             model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
             timer.record("apply fp8 unet")
+            model.alphas_cumprod = model.alphas_cumprod.to(torch.float32)

     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16

--
cgit v1.2.3

From bf5067f50ca32cd4764638702e3cc38bca8bfd8b Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 25 Oct 2023 12:54:28 +0800
Subject: Fix alphas cumprod

---
 modules/sd_models.py    | 3 ++-
 modules/sd_models_xl.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 23660454..7ed89a9c 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -396,6 +396,8 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         enable_fp8 = True
     elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
         enable_fp8 = True
+    else:
+        enable_fp8 = False

     if enable_fp8:
         devices.fp8 = True
@@ -416,7 +418,6 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
                     module.to(torch.float8_e4m3fn)
             model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
             timer.record("apply fp8 unet")
-            model.alphas_cumprod = model.alphas_cumprod.to(torch.float32)

     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16

diff --git a/modules/sd_models_xl.py b/modules/sd_models_xl.py
index 01123321..11259a36 100644
--- a/modules/sd_models_xl.py
+++ b/modules/sd_models_xl.py
@@ -93,7 +93,7 @@ def extend_sdxl(model):
     model.parameterization = "v" if isinstance(model.denoiser.scaling, sgm.modules.diffusionmodules.denoiser_scaling.VScaling) else "eps"

     discretization = sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization()
-    model.alphas_cumprod = torch.asarray(discretization.alphas_cumprod, device=devices.device, dtype=dtype)
+    model.alphas_cumprod = torch.asarray(discretization.alphas_cumprod, device=devices.device, dtype=torch.float32)

     model.conditioner.wrapped = torch.nn.Module()

--
cgit v1.2.3
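Why the two fixes above pin alphas_cumprod to fp32: it is a running product of roughly a thousand factors just below 1, and rounding every partial product to fp16 compounds the error (the tail even falls into fp16's subnormal range). A self-contained illustration with a made-up schedule, not the model's actual betas:

import torch

betas = torch.linspace(1e-4, 2e-2, 1000)
alphas = 1.0 - betas

acc32 = torch.tensor(1.0)
acc16 = torch.tensor(1.0, dtype=torch.float16)
for a in alphas:
    acc32 = acc32 * a
    acc16 = acc16 * a.half()  # rounds the partial product at every step

print(acc32.item(), acc16.float().item())  # the fp16 tail drifts noticeably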
From dda067f64d3289cee3ffd65767126cb30ae73b13 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 25 Oct 2023 19:53:22 +0800
Subject: ignore mps for fp8

---
 modules/sd_models.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 7ed89a9c..ccb6afd2 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -392,7 +392,9 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
             devices.dtype_unet = torch.float16
             timer.record("apply half()")

-    if shared.cmd_opts.opt_unet_fp8_storage:
+    if devices.get_optimal_device_name() == "mps":
+        enable_fp8 = False
+    elif shared.cmd_opts.opt_unet_fp8_storage:
         enable_fp8 = True
     elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
         enable_fp8 = True

--
cgit v1.2.3

From d4d3134f6d2d232c7bcfa80900a362921e644976 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sat, 28 Oct 2023 15:24:26 +0800
Subject: ManualCast for 10/16 series gpu

---
 modules/devices.py    | 57 +++++++++++++++++++++++++++++++++++++++++++++------
 modules/processing.py |  2 +-
 modules/sd_models.py  | 21 +++++++++++--------
 3 files changed, 64 insertions(+), 16 deletions(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/devices.py b/modules/devices.py
index 0cd2b55d..c05f2b35 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -16,6 +16,23 @@ def has_mps() -> bool:
     return mac_specific.has_mps


+def cuda_no_autocast(device_id=None) -> bool:
+    if device_id is None:
+        device_id = get_cuda_device_id()
+    return (
+        torch.cuda.get_device_capability(device_id) == (7, 5)
+        and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16")
+    )
+
+
+def get_cuda_device_id():
+    return (
+        int(shared.cmd_opts.device_id)
+        if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit()
+        else 0
+    ) or torch.cuda.current_device()
+
+
 def get_cuda_device_string():
     if shared.cmd_opts.device_id is not None:
         return f"cuda:{shared.cmd_opts.device_id}"
@@ -60,8 +77,7 @@ def enable_tf32():

         # enabling benchmark option seems to enable a range of cards to do fp16 when they otherwise can't
         # see https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/4407
-        device_id = (int(shared.cmd_opts.device_id) if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit() else 0) or torch.cuda.current_device()
-        if torch.cuda.get_device_capability(device_id) == (7, 5) and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16"):
+        if cuda_no_autocast():
             torch.backends.cudnn.benchmark = True

         torch.backends.cuda.matmul.allow_tf32 = True
@@ -92,15 +108,44 @@ def cond_cast_float(input):

 nv_rng = None
-
-
-def autocast(disable=False, unet=False):
+patch_module_list = [
+    torch.nn.Linear,
+    torch.nn.Conv2d,
+    torch.nn.MultiheadAttention,
+    torch.nn.GroupNorm,
+    torch.nn.LayerNorm,
+]
+
+@contextlib.contextmanager
+def manual_autocast():
+    def manual_cast_forward(self, *args, **kwargs):
+        org_dtype = next(self.parameters()).dtype
+        self.to(dtype)
+        result = self.org_forward(*args, **kwargs)
+        self.to(org_dtype)
+        return result
+    for module_type in patch_module_list:
+        org_forward = module_type.forward
+        module_type.forward = manual_cast_forward
+        module_type.org_forward = org_forward
+    try:
+        yield None
+    finally:
+        for module_type in patch_module_list:
+            module_type.forward = module_type.org_forward
+
+
+def autocast(disable=False):
+    print(fp8, dtype, shared.cmd_opts.precision, device)
     if disable:
         return contextlib.nullcontext()

-    if unet and fp8 and device==cpu:
+    if fp8 and device==cpu:
         return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)

+    if fp8 and (dtype == torch.float32 or shared.cmd_opts.precision == "full" or cuda_no_autocast()):
+        return manual_autocast()
+
     if dtype == torch.float32 or shared.cmd_opts.precision == "full":
         return contextlib.nullcontext()

diff --git a/modules/processing.py b/modules/processing.py
index 2df8a7ea..40598f5c 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -865,7 +865,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
             if p.n_iter > 1:
                 shared.state.job = f"Batch {n+1} out of {p.n_iter}"

-            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast(unet=True):
+            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast():
                 samples_ddim = p.sample(conditioning=p.c, unconditional_conditioning=p.uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts)

             if getattr(samples_ddim, 'already_decoded', False):
diff --git a/modules/sd_models.py b/modules/sd_models.py
index ccb6afd2..31bcb913 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -403,23 +403,26 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer

     if enable_fp8:
         devices.fp8 = True
+        if model.is_sdxl:
+            cond_stage = model.conditioner
+        else:
+            cond_stage = model.cond_stage_model
+
+        for module in cond_stage.modules():
+            if isinstance(module, torch.nn.Linear):
+                module.to(torch.float8_e4m3fn)
+
         if devices.device == devices.cpu:
             for module in model.model.diffusion_model.modules():
                 if isinstance(module, torch.nn.Conv2d):
                     module.to(torch.float8_e4m3fn)
                 elif isinstance(module, torch.nn.Linear):
                     module.to(torch.float8_e4m3fn)
-            timer.record("apply fp8 unet for cpu")
         else:
-            if model.is_sdxl:
-                cond_stage = model.conditioner
-            else:
-                cond_stage = model.cond_stage_model
-            for module in cond_stage.modules():
-                if isinstance(module, torch.nn.Linear):
-                    module.to(torch.float8_e4m3fn)
             model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
-            timer.record("apply fp8 unet")
+        timer.record("apply fp8")
+    else:
+        devices.fp8 = False

     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16

--
cgit v1.2.3
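The heart of the ManualCast patch is a forward wrapper: weights stay in the storage dtype and each call temporarily hops to a compute dtype. A runnable reduction to a single module type (the compute dtype is fixed to fp32 here so the sketch also runs on CPU; the webui uses devices.dtype instead):

import torch

compute_dtype = torch.float32  # stand-in for devices.dtype

def manual_cast_forward(self, *args, **kwargs):
    org_dtype = next(self.parameters()).dtype
    self.to(compute_dtype)
    result = self.org_forward(*args, **kwargs)
    self.to(org_dtype)  # back to fp8 storage
    return result

torch.nn.Linear.org_forward = torch.nn.Linear.forward
torch.nn.Linear.forward = manual_cast_forward

layer = torch.nn.Linear(4, 4).to(torch.float8_e4m3fn)
print(layer(torch.randn(1, 4)).dtype)                   # torch.float32
torch.nn.Linear.forward = torch.nn.Linear.org_forward   # undo, as the context manager does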
From 598da5cd4928618b166886d3485ce30ce3a43490 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sun, 19 Nov 2023 15:50:06 +0800
Subject: Use options instead of cmd_args

---
 modules/cmd_args.py        |  2 --
 modules/devices.py         | 25 ++++++++++---------
 modules/initialize_util.py |  1 +
 modules/sd_models.py       | 61 ++++++++++++++++++++++++----------------------
 modules/shared_options.py  |  1 +
 scripts/xyz_grid.py        |  1 +
 6 files changed, 49 insertions(+), 42 deletions(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/cmd_args.py b/modules/cmd_args.py
index 088d5dea..a9fb9bfa 100644
--- a/modules/cmd_args.py
+++ b/modules/cmd_args.py
@@ -118,5 +118,3 @@ parser.add_argument('--timeout-keep-alive', type=int, default=30, help='set time
 parser.add_argument("--disable-all-extensions", action='store_true', help="prevent all extensions from running regardless of any other settings", default=False)
 parser.add_argument("--disable-extra-extensions", action='store_true', help="prevent all extensions except built-in from running regardless of any other settings", default=False)
 parser.add_argument("--skip-load-model-at-start", action='store_true', help="if load a model at web start, only take effect when --nowebui", )
-parser.add_argument("--opt-unet-fp8-storage", action='store_true', help="use fp8 for SD UNet to save vram", default=False)
-parser.add_argument("--opt-unet-fp8-storage-xl", action='store_true', help="use fp8 for SD UNet to save vram", default=False)
diff --git a/modules/devices.py b/modules/devices.py
index d7c905c2..03e7bdb7 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -20,15 +20,15 @@ def cuda_no_autocast(device_id=None) -> bool:
     if device_id is None:
         device_id = get_cuda_device_id()
     return (
-        torch.cuda.get_device_capability(device_id) == (7, 5) 
+        torch.cuda.get_device_capability(device_id) == (7, 5)
         and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16")
     )


 def get_cuda_device_id():
     return (
-        int(shared.cmd_opts.device_id) 
-        if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit() 
+        int(shared.cmd_opts.device_id)
+        if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit()
         else 0
     ) or torch.cuda.current_device()

@@ -116,16 +116,19 @@ patch_module_list = [
     torch.nn.LayerNorm,
 ]

+
+def manual_cast_forward(self, *args, **kwargs):
+    org_dtype = next(self.parameters()).dtype
+    self.to(dtype)
+    args = [arg.to(dtype) if isinstance(arg, torch.Tensor) else arg for arg in args]
+    kwargs = {k: v.to(dtype) if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}
+    result = self.org_forward(*args, **kwargs)
+    self.to(org_dtype)
+    return result
+
+
 @contextlib.contextmanager
 def manual_autocast():
-    def manual_cast_forward(self, *args, **kwargs):
-        org_dtype = next(self.parameters()).dtype
-        self.to(dtype)
-        args = [arg.to(dtype) if isinstance(arg, torch.Tensor) else arg for arg in args]
-        kwargs = {k: v.to(dtype) if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}
-        result = self.org_forward(*args, **kwargs)
-        self.to(org_dtype)
-        return result
     for module_type in patch_module_list:
         org_forward = module_type.forward
         module_type.forward = manual_cast_forward
diff --git a/modules/initialize_util.py b/modules/initialize_util.py
index 2e9b6d89..1b11ead6 100644
--- a/modules/initialize_util.py
+++ b/modules/initialize_util.py
@@ -177,6 +177,7 @@ def configure_opts_onchange():
     shared.opts.onchange("temp_dir", ui_tempdir.on_tmpdir_changed)
     shared.opts.onchange("gradio_theme", shared.reload_gradio_theme)
     shared.opts.onchange("cross_attention_optimization", wrap_queued_call(lambda: sd_hijack.model_hijack.redo_hijack(shared.sd_model)), call=False)
+    shared.opts.onchange("fp8_storage", wrap_queued_call(lambda: sd_models.reload_model_weights()), call=False)
     startup_timer.record("opts onchange")

diff --git a/modules/sd_models.py b/modules/sd_models.py
index a6c8b2fa..eb491434 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -339,10 +339,28 @@ class SkipWritingToConfig:
         SkipWritingToConfig.skip = self.previous


+def check_fp8(model):
+    if model is None:
+        return None
+    if devices.get_optimal_device_name() == "mps":
+        enable_fp8 = False
+    elif shared.opts.fp8_storage == "Enable":
+        enable_fp8 = True
+    elif getattr(model, "is_sdxl", False) and shared.opts.fp8_storage == "Enable for SDXL":
+        enable_fp8 = True
+    else:
+        enable_fp8 = False
+    return enable_fp8
+
+
 def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer):
     sd_model_hash = checkpoint_info.calculate_shorthash()
     timer.record("calculate hash")

+    if not check_fp8(model) and devices.fp8:
+        # prevent model to load state dict in fp8
+        model.half()
+
     if not SkipWritingToConfig.skip:
         shared.opts.data["sd_model_checkpoint"] = checkpoint_info.title

@@ -395,34 +413,16 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         devices.dtype_unet = torch.float16
         timer.record("apply half()")

-    if devices.get_optimal_device_name() == "mps":
-        enable_fp8 = False
-    elif shared.cmd_opts.opt_unet_fp8_storage:
-        enable_fp8 = True
-    elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
-        enable_fp8 = True
-    else:
-        enable_fp8 = False
-
-    if enable_fp8:
+    if check_fp8(model):
         devices.fp8 = True
-        if model.is_sdxl:
-            cond_stage = model.conditioner
-        else:
-            cond_stage = model.cond_stage_model
-
-        for module in cond_stage.modules():
-            if isinstance(module, torch.nn.Linear):
+        first_stage = model.first_stage_model
+        model.first_stage_model = None
+        for module in model.modules():
+            if isinstance(module, torch.nn.Conv2d):
                 module.to(torch.float8_e4m3fn)
-
-        if devices.device == devices.cpu:
-            for module in model.model.diffusion_model.modules():
-                if isinstance(module, torch.nn.Conv2d):
-                    module.to(torch.float8_e4m3fn)
-                elif isinstance(module, torch.nn.Linear):
-                    module.to(torch.float8_e4m3fn)
-        else:
-            model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
+            elif isinstance(module, torch.nn.Linear):
+                module.to(torch.float8_e4m3fn)
+        model.first_stage_model = first_stage
         timer.record("apply fp8")
     else:
         devices.fp8 = False
@@ -769,7 +769,7 @@ def reuse_model_from_already_loaded(sd_model, checkpoint_info, timer):
     return None


-def reload_model_weights(sd_model=None, info=None):
+def reload_model_weights(sd_model=None, info=None, forced_reload=False):
     checkpoint_info = info or select_checkpoint()

     timer = Timer()
@@ -781,11 +781,14 @@ def reload_model_weights(sd_model=None, info=None):
         current_checkpoint_info = None
     else:
         current_checkpoint_info = sd_model.sd_checkpoint_info
-        if sd_model.sd_model_checkpoint == checkpoint_info.filename:
+        if check_fp8(sd_model) != devices.fp8:
+            # load from state dict again to prevent extra numerical errors
+            forced_reload = True
+        elif sd_model.sd_model_checkpoint == checkpoint_info.filename:
             return sd_model

     sd_model = reuse_model_from_already_loaded(sd_model, checkpoint_info, timer)
-    if sd_model is not None and sd_model.sd_checkpoint_info.filename == checkpoint_info.filename:
+    if not forced_reload and sd_model is not None and sd_model.sd_checkpoint_info.filename == checkpoint_info.filename:
         return sd_model

     if sd_model is not None:
diff --git a/modules/shared_options.py b/modules/shared_options.py
index f1003f21..d27f35e9 100644
--- a/modules/shared_options.py
+++ b/modules/shared_options.py
@@ -200,6 +200,7 @@ options_templates.update(options_section(('optimizations', "Optimizations"), {
     "pad_cond_uncond": OptionInfo(False, "Pad prompt/negative prompt to be same length", infotext='Pad conds').info("improves performance when prompt and negative prompt have different lengths; changes seeds"),
     "persistent_cond_cache": OptionInfo(True, "Persistent cond cache").info("do not recalculate conds from prompts if prompts have not changed since previous calculation"),
     "batch_cond_uncond": OptionInfo(True, "Batch cond/uncond").info("do both conditional and unconditional denoising in one batch; uses a bit more VRAM during sampling, but improves speed; previously this was controlled by --always-batch-cond-uncond comandline argument"),
+    "fp8_storage": OptionInfo("Disable", "FP8 weight", gr.Dropdown, {"choices": ["Disable", "Enable for SDXL", "Enable"]}).info("Use FP8 to store Linear/Conv layers' weight. Require pytorch>=2.1.0."),
 }))

 options_templates.update(options_section(('compatibility', "Compatibility"), {
diff --git a/scripts/xyz_grid.py b/scripts/xyz_grid.py
index 0dc255bc..b2250c04 100644
--- a/scripts/xyz_grid.py
+++ b/scripts/xyz_grid.py
@@ -270,6 +270,7 @@ axis_options = [
     AxisOption("Refiner checkpoint", str, apply_field('refiner_checkpoint'), format_value=format_remove_path, confirm=confirm_checkpoints_or_none, cost=1.0, choices=lambda: ['None'] + sorted(sd_models.checkpoints_list, key=str.casefold)),
     AxisOption("Refiner switch at", float, apply_field('refiner_switch_at')),
     AxisOption("RNG source", str, apply_override("randn_source"), choices=lambda: ["GPU", "CPU", "NV"]),
+    AxisOption("FP8 mode", str, apply_override("fp8_storage"), cost=0.9, choices=lambda: ["Disable", "Enable for SDXL", "Enable"]),
 ]

--
cgit v1.2.3
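The decision logic this patch introduces, condensed into a pure function that can be tested in isolation (parameters replace the shared.opts and devices globals, so the names here are illustrative):

def check_fp8(model, fp8_storage: str, optimal_device: str):
    if model is None:
        return None
    if optimal_device == "mps":
        return False  # no usable fp8 path on macOS/MPS
    if fp8_storage == "Enable":
        return True
    return bool(getattr(model, "is_sdxl", False) and fp8_storage == "Enable for SDXL")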
From 370a77f8e78e65a8a1339289d684cb43df142f70 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Tue, 21 Nov 2023 19:59:34 +0800
Subject: Option for using fp16 weight when apply lora

---
 extensions-builtin/Lora/networks.py | 16 ++++++++++++----
 modules/initialize_util.py          |  1 +
 modules/sd_models.py                | 14 +++++++++++---
 modules/shared_options.py           |  1 +
 4 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'modules/sd_models.py')

diff --git a/extensions-builtin/Lora/networks.py b/extensions-builtin/Lora/networks.py
index 0170dbfb..d22ed843 100644
--- a/extensions-builtin/Lora/networks.py
+++ b/extensions-builtin/Lora/networks.py
@@ -388,18 +388,26 @@ def network_apply_weights(self: Union[torch.nn.Conv2d, torch.nn.Linear, torch.nn
     if module is not None and hasattr(self, 'weight'):
         try:
             with torch.no_grad():
-                updown, ex_bias = module.calc_updown(self.weight)
+                if getattr(self, 'fp16_weight', None) is None:
+                    weight = self.weight
+                    bias = self.bias
+                else:
+                    weight = self.fp16_weight.clone().to(self.weight.device)
+                    bias = getattr(self, 'fp16_bias', None)
+                    if bias is not None:
+                        bias = bias.clone().to(self.bias.device)
+                updown, ex_bias = module.calc_updown(weight)

-                if len(self.weight.shape) == 4 and self.weight.shape[1] == 9:
+                if len(weight.shape) == 4 and weight.shape[1] == 9:
                     # inpainting model. zero pad updown to make channel[1] 4 to 9
                     updown = torch.nn.functional.pad(updown, (0, 0, 0, 0, 0, 5))

-                self.weight.copy_((self.weight.to(dtype=updown.dtype) + updown).to(dtype=self.weight.dtype))
+                self.weight.copy_((weight.to(dtype=updown.dtype) + updown).to(dtype=self.weight.dtype))

                 if ex_bias is not None and hasattr(self, 'bias'):
                     if self.bias is None:
                         self.bias = torch.nn.Parameter(ex_bias).to(self.weight.dtype)
                     else:
-                        self.bias.copy_((self.bias.to(dtype=ex_bias.dtype) + ex_bias).to(dtype=self.bias.dtype))
+                        self.bias.copy_((bias + ex_bias).to(dtype=self.bias.dtype))
         except RuntimeError as e:
             logging.debug(f"Network {net.name} layer {network_layer_name}: {e}")
             extra_network_lora.errors[net.name] = extra_network_lora.errors.get(net.name, 0) + 1
diff --git a/modules/initialize_util.py b/modules/initialize_util.py
index 1b11ead6..7fb1d8d5 100644
--- a/modules/initialize_util.py
+++ b/modules/initialize_util.py
@@ -178,6 +178,7 @@ def configure_opts_onchange():
     shared.opts.onchange("gradio_theme", shared.reload_gradio_theme)
     shared.opts.onchange("cross_attention_optimization", wrap_queued_call(lambda: sd_hijack.model_hijack.redo_hijack(shared.sd_model)), call=False)
     shared.opts.onchange("fp8_storage", wrap_queued_call(lambda: sd_models.reload_model_weights()), call=False)
+    shared.opts.onchange("cache_fp16_weight", wrap_queued_call(lambda: sd_models.reload_model_weights()), call=False)
     startup_timer.record("opts onchange")

diff --git a/modules/sd_models.py b/modules/sd_models.py
index eb491434..0a7777f1 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -413,14 +413,22 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         devices.dtype_unet = torch.float16
         timer.record("apply half()")

+    for module in model.modules():
+        if hasattr(module, 'fp16_weight'):
+            del module.fp16_weight
+        if hasattr(module, 'fp16_bias'):
+            del module.fp16_bias
+
     if check_fp8(model):
         devices.fp8 = True
         first_stage = model.first_stage_model
         model.first_stage_model = None
         for module in model.modules():
-            if isinstance(module, torch.nn.Conv2d):
-                module.to(torch.float8_e4m3fn)
-            elif isinstance(module, torch.nn.Linear):
+            if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)):
+                if shared.opts.cache_fp16_weight:
+                    module.fp16_weight = module.weight.clone().half()
+                    if module.bias is not None:
+                        module.fp16_bias = module.bias.clone().half()
                 module.to(torch.float8_e4m3fn)
         model.first_stage_model = first_stage
         timer.record("apply fp8")
diff --git a/modules/shared_options.py b/modules/shared_options.py
index d27f35e9..eaa9f135 100644
--- a/modules/shared_options.py
+++ b/modules/shared_options.py
@@ -201,6 +201,7 @@ options_templates.update(options_section(('optimizations', "Optimizations"), {
     "persistent_cond_cache": OptionInfo(True, "Persistent cond cache").info("do not recalculate conds from prompts if prompts have not changed since previous calculation"),
     "batch_cond_uncond": OptionInfo(True, "Batch cond/uncond").info("do both conditional and unconditional denoising in one batch; uses a bit more VRAM during sampling, but improves speed; previously this was controlled by --always-batch-cond-uncond comandline argument"),
     "fp8_storage": OptionInfo("Disable", "FP8 weight", gr.Dropdown, {"choices": ["Disable", "Enable for SDXL", "Enable"]}).info("Use FP8 to store Linear/Conv layers' weight. Require pytorch>=2.1.0."),
+    "cache_fp16_weight": OptionInfo(False, "Cache FP16 weight for LoRA").info("Cache fp16 weight when enabling FP8, will increase the quality of LoRA. Use more system ram."),
 }))

 options_templates.update(options_section(('compatibility', "Compatibility"), {

--
cgit v1.2.3
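Rationale for the fp16 cache above: without it, LoRA's calc_updown() reads weights that were already quantized to fp8, so the delta is computed against a degraded base. A standalone sketch of the cache-then-cast step (assumes torch>=2.1.0; the function name is illustrative):

import torch

def cast_with_fp16_cache(module: torch.nn.Module) -> None:
    # keep an fp16 master copy, then quantize the live weight to fp8
    module.fp16_weight = module.weight.data.clone().half()
    if module.bias is not None:
        module.fp16_bias = module.bias.data.clone().half()
    module.to(torch.float8_e4m3fn)

layer = torch.nn.Linear(16, 16)
cast_with_fp16_cache(layer)
print(layer.weight.dtype, layer.fp16_weight.dtype)  # float8_e4m3fn, float16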
From 40ac134c553ac824d4a96666bba14d550300daa5 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sat, 25 Nov 2023 12:35:09 +0800
Subject: Fix pre-fp8

---
 modules/sd_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 0a7777f1..90437c87 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -357,7 +357,7 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
     sd_model_hash = checkpoint_info.calculate_shorthash()
     timer.record("calculate hash")

-    if not check_fp8(model) and devices.fp8:
+    if devices.fp8:
         # prevent model to load state dict in fp8
         model.half()

--
cgit v1.2.3

From b25c126ccdbc4da22ade46597a9addf808998989 Mon Sep 17 00:00:00 2001
From: drhead <1313496+drhead@users.noreply.github.com>
Date: Wed, 29 Nov 2023 17:38:53 -0500
Subject: Protect alphas_cumprod from downcasting

---
 modules/sd_models.py | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 841402e8..de80a493 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -387,7 +387,11 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         if shared.cmd_opts.upcast_sampling and depth_model:
             model.depth_model = None

+        alphas_cumprod = model.alphas_cumprod
+        model.alphas_cumprod = None
         model.half()
+        model.alphas_cumprod = alphas_cumprod
+        model.alphas_cumprod_original = alphas_cumprod
         model.first_stage_model = vae
         if depth_model:
             model.depth_model = depth_model
@@ -642,6 +646,7 @@ def load_model(checkpoint_info=None, already_loaded_state_dict=None):
     else:
         weight_dtype_conversion = {
             'first_stage_model': None,
+            'alphas_cumprod': None,
             '': torch.float16,
         }

--
cgit v1.2.3
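The protection pattern from the patch above, runnable on a toy module: detach the fp32 buffer before half() so the cast never touches it, then put it back.

import torch

class ToyDiffusion(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("alphas_cumprod", torch.linspace(1.0, 0.01, 10))

model = ToyDiffusion()
alphas_cumprod = model.alphas_cumprod
model.alphas_cumprod = None        # setting a registered buffer to None is allowed
model.half()                       # half() skips None buffers
model.alphas_cumprod = alphas_cumprod
print(model.alphas_cumprod.dtype)  # torch.float32, untouched by half()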
seeds"), "persistent_cond_cache": OptionInfo(True, "Persistent cond cache").info("do not recalculate conds from prompts if prompts have not changed since previous calculation"), "batch_cond_uncond": OptionInfo(True, "Batch cond/uncond").info("do both conditional and unconditional denoising in one batch; uses a bit more VRAM during sampling, but improves speed; previously this was controlled by --always-batch-cond-uncond comandline argument"), + "fp8_storage": OptionInfo("Disable", "FP8 weight", gr.Dropdown, {"choices": ["Disable", "Enable for SDXL", "Enable"]}).info("Use FP8 to store Linear/Conv layers' weight. Require pytorch>=2.1.0."), })) options_templates.update(options_section(('compatibility', "Compatibility"), { diff --git a/scripts/xyz_grid.py b/scripts/xyz_grid.py index 0dc255bc..b2250c04 100644 --- a/scripts/xyz_grid.py +++ b/scripts/xyz_grid.py @@ -270,6 +270,7 @@ axis_options = [ AxisOption("Refiner checkpoint", str, apply_field('refiner_checkpoint'), format_value=format_remove_path, confirm=confirm_checkpoints_or_none, cost=1.0, choices=lambda: ['None'] + sorted(sd_models.checkpoints_list, key=str.casefold)), AxisOption("Refiner switch at", float, apply_field('refiner_switch_at')), AxisOption("RNG source", str, apply_override("randn_source"), choices=lambda: ["GPU", "CPU", "NV"]), + AxisOption("FP8 mode", str, apply_override("fp8_storage"), cost=0.9, choices=lambda: ["Disable", "Enable for SDXL", "Enable"]), ] -- cgit v1.2.3 From 370a77f8e78e65a8a1339289d684cb43df142f70 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Tue, 21 Nov 2023 19:59:34 +0800 Subject: Option for using fp16 weight when apply lora --- extensions-builtin/Lora/networks.py | 16 ++++++++++++---- modules/initialize_util.py | 1 + modules/sd_models.py | 14 +++++++++++--- modules/shared_options.py | 1 + 4 files changed, 25 insertions(+), 7 deletions(-) (limited to 'modules/sd_models.py') diff --git a/extensions-builtin/Lora/networks.py b/extensions-builtin/Lora/networks.py index 0170dbfb..d22ed843 100644 --- a/extensions-builtin/Lora/networks.py +++ b/extensions-builtin/Lora/networks.py @@ -388,18 +388,26 @@ def network_apply_weights(self: Union[torch.nn.Conv2d, torch.nn.Linear, torch.nn if module is not None and hasattr(self, 'weight'): try: with torch.no_grad(): - updown, ex_bias = module.calc_updown(self.weight) + if getattr(self, 'fp16_weight', None) is None: + weight = self.weight + bias = self.bias + else: + weight = self.fp16_weight.clone().to(self.weight.device) + bias = getattr(self, 'fp16_bias', None) + if bias is not None: + bias = bias.clone().to(self.bias.device) + updown, ex_bias = module.calc_updown(weight) - if len(self.weight.shape) == 4 and self.weight.shape[1] == 9: + if len(weight.shape) == 4 and weight.shape[1] == 9: # inpainting model. 
From 78acdcf677a96894651ff0d7d8287f2a994f3781 Mon Sep 17 00:00:00 2001
From: drhead
Date: Sat, 2 Dec 2023 14:09:18 -0500
Subject: fix variable

---
 modules/sd_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 976c7d5b..5a19a00a 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -374,7 +374,7 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer

     if shared.cmd_opts.no_half:
         model.float()
-        model.alphas_cumprod_original = alphas_cumprod
+        model.alphas_cumprod_original = model.alphas_cumprod
         devices.dtype_unet = torch.float32
         timer.record("apply float()")
     else:

--
cgit v1.2.3

From 672dc4efa8e0da38426b121e7c7216d0a8e465fd Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 6 Dec 2023 15:16:10 +0800
Subject: Fix forced reload

---
 modules/sd_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index dcf816b3..d0046f88 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -801,7 +801,7 @@ def reload_model_weights(sd_model=None, info=None, forced_reload=False):
         if check_fp8(sd_model) != devices.fp8:
             # load from state dict again to prevent extra numerical errors
             forced_reload = True
-        elif sd_model.sd_model_checkpoint == checkpoint_info.filename:
+        elif sd_model.sd_model_checkpoint == checkpoint_info.filename and not forced_reload:
             return sd_model

     sd_model = reuse_model_from_already_loaded(sd_model, checkpoint_info, timer)

--
cgit v1.2.3
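Condensed shape of the reload decision after these two fixes: an fp8 mismatch always forces a fresh state-dict load, and a forced reload is never short-circuited by the same-checkpoint check. Argument names here are illustrative, not the webui API:

def should_return_early(current_fp8, wanted_fp8, same_checkpoint, forced_reload):
    if wanted_fp8 != current_fp8:
        forced_reload = True  # reload weights to avoid compounding quantization error
    return same_checkpoint and not forced_reload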
From a183de04e3f965083e7f3462201327d30c36b958 Mon Sep 17 00:00:00 2001
From: Nuullll
Date: Sat, 6 Jan 2024 20:03:33 +0800
Subject: Execute model_loaded_callback after moving to target device

---
 modules/sd_models.py | 6 +++---
 modules/sd_vae.py    | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 50bc209e..2c045771 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -842,13 +842,13 @@ def reload_model_weights(sd_model=None, info=None, forced_reload=False):
         sd_hijack.model_hijack.hijack(sd_model)
         timer.record("hijack")

-        script_callbacks.model_loaded_callback(sd_model)
-        timer.record("script callbacks")
-
         if not sd_model.lowvram:
             sd_model.to(devices.device)
             timer.record("move model to device")

+        script_callbacks.model_loaded_callback(sd_model)
+        timer.record("script callbacks")
+
     print(f"Weights loaded in {timer.summary()}.")

     model_data.set_sd_model(sd_model)
diff --git a/modules/sd_vae.py b/modules/sd_vae.py
index 31306d8b..43687e48 100644
--- a/modules/sd_vae.py
+++ b/modules/sd_vae.py
@@ -273,10 +273,11 @@ def reload_vae_weights(sd_model=None, vae_file=unspecified):
     load_vae(sd_model, vae_file, vae_source)

     sd_hijack.model_hijack.hijack(sd_model)
-    script_callbacks.model_loaded_callback(sd_model)

     if not sd_model.lowvram:
         sd_model.to(devices.device)

+    script_callbacks.model_loaded_callback(sd_model)
+
     print("VAE weights loaded.")
     return sd_model

--
cgit v1.2.3
From 28bc85a20a282285710e17c4d86cf9db5e00d7db Mon Sep 17 00:00:00 2001
From: AUTOMATIC1111 <16777216c@gmail.com>
Date: Sat, 2 Mar 2024 06:40:32 +0300
Subject: Merge pull request #14979 from drhead/refiner_cumprod_fix

Protect alphas_cumprod during refiner switchover
---
 modules/processing.py | 28 +---------------------------
 modules/sd_models.py  | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 27 deletions(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/processing.py b/modules/processing.py
index d208a922..411c7c3f 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -915,33 +915,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
             if p.n_iter > 1:
                 shared.state.job = f"Batch {n+1} out of {p.n_iter}"

-            def rescale_zero_terminal_snr_abar(alphas_cumprod):
-                alphas_bar_sqrt = alphas_cumprod.sqrt()
-
-                # Store old values.
-                alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
-                alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
-
-                # Shift so the last timestep is zero.
-                alphas_bar_sqrt -= (alphas_bar_sqrt_T)
-
-                # Scale so the first timestep is back to the old value.
-                alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
-
-                # Convert alphas_bar_sqrt to betas
-                alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
-                alphas_bar[-1] = 4.8973451890853435e-08
-                return alphas_bar
-
-            if hasattr(p.sd_model, 'alphas_cumprod') and hasattr(p.sd_model, 'alphas_cumprod_original'):
-                p.sd_model.alphas_cumprod = p.sd_model.alphas_cumprod_original.to(shared.device)
-
-                if opts.use_downcasted_alpha_bar:
-                    p.extra_generation_params['Downcast alphas_cumprod'] = opts.use_downcasted_alpha_bar
-                    p.sd_model.alphas_cumprod = p.sd_model.alphas_cumprod.half().to(shared.device)
-                if opts.sd_noise_schedule == "Zero Terminal SNR":
-                    p.extra_generation_params['Noise Schedule'] = opts.sd_noise_schedule
-                    p.sd_model.alphas_cumprod = rescale_zero_terminal_snr_abar(p.sd_model.alphas_cumprod).to(shared.device)
+            sd_models.apply_alpha_schedule_override(p.sd_model, p)

             with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast():
                 samples_ddim = p.sample(conditioning=p.c, unconditional_conditioning=p.uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts)

             if getattr(samples_ddim, 'already_decoded', False):
diff --git a/modules/sd_models.py b/modules/sd_models.py
index 2c045771..fbd53adb 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -15,6 +15,7 @@ from ldm.util import instantiate_from_config

 from modules import paths, shared, modelloader, devices, script_callbacks, sd_vae, sd_disable_initialization, errors, hashes, sd_models_config, sd_unet, sd_models_xl, cache, extra_networks, processing, lowvram, sd_hijack, patches
 from modules.timer import Timer
+from modules.shared import opts
 import tomesd
 import numpy as np

@@ -549,6 +550,36 @@ def repair_config(sd_config):
             karlo_path = os.path.join(paths.models_path, 'karlo')
             sd_config.model.params.noise_aug_config.params.clip_stats_path = sd_config.model.params.noise_aug_config.params.clip_stats_path.replace("checkpoints/karlo_models", karlo_path)

+def apply_alpha_schedule_override(sd_model, p=None):
+    def rescale_zero_terminal_snr_abar(alphas_cumprod):
+        alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+        # Store old values.
+        alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+        alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+        # Shift so the last timestep is zero.
+        alphas_bar_sqrt -= (alphas_bar_sqrt_T)
+
+        # Scale so the first timestep is back to the old value.
+        alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+        # Convert alphas_bar_sqrt to betas
+        alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+        alphas_bar[-1] = 4.8973451890853435e-08
+        return alphas_bar
+
+    if hasattr(sd_model, 'alphas_cumprod') and hasattr(sd_model, 'alphas_cumprod_original'):
+        sd_model.alphas_cumprod = sd_model.alphas_cumprod_original.to(shared.device)
+
+        if opts.use_downcasted_alpha_bar:
+            if p is not None:
+                p.extra_generation_params['Downcast alphas_cumprod'] = opts.use_downcasted_alpha_bar
+            sd_model.alphas_cumprod = sd_model.alphas_cumprod.half().to(shared.device)
+        if opts.sd_noise_schedule == "Zero Terminal SNR":
+            if p is not None:
+                p.extra_generation_params['Noise Schedule'] = opts.sd_noise_schedule
+            sd_model.alphas_cumprod = rescale_zero_terminal_snr_abar(sd_model.alphas_cumprod).to(shared.device)

 sd1_clip_weight = 'cond_stage_model.transformer.text_model.embeddings.token_embedding.weight'
 sd2_clip_weight = 'cond_stage_model.model.transformer.resblocks.0.attn.in_proj_weight'
@@ -812,6 +843,7 @@ def reload_model_weights(sd_model=None, info=None, forced_reload=False):

     sd_model = reuse_model_from_already_loaded(sd_model, checkpoint_info, timer)
     if not forced_reload and sd_model is not None and sd_model.sd_checkpoint_info.filename == checkpoint_info.filename:
+        apply_alpha_schedule_override(sd_model)
         return sd_model

     if sd_model is not None:

--
cgit v1.2.3
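A quick numerical check of the rescale helper this merge introduces (the function body is copied from the patch; the beta schedule below is illustrative). After rescaling, the terminal alpha-bar is pinned near zero, meaning the last timestep is pure noise:

import torch

def rescale_zero_terminal_snr_abar(alphas_cumprod):
    alphas_bar_sqrt = alphas_cumprod.sqrt()
    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
    alphas_bar_sqrt -= alphas_bar_sqrt_T                                             # shift: last timestep to zero
    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)   # rescale: first timestep restored
    alphas_bar = alphas_bar_sqrt ** 2
    alphas_bar[-1] = 4.8973451890853435e-08
    return alphas_bar

abar = torch.cumprod(1 - torch.linspace(1e-4, 2e-2, 1000), dim=0)
print(abar[-1].item(), rescale_zero_terminal_snr_abar(abar)[-1].item())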
From da67afe5f68497a04d1fd9173bbd256b73d9d251 Mon Sep 17 00:00:00 2001
From: AUTOMATIC1111 <16777216c@gmail.com>
Date: Sat, 2 Mar 2024 06:53:53 +0300
Subject: call apply_alpha_schedule_override in load_model_weights for #14979

---
 modules/sd_models.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index fbd53adb..db72e120 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -428,6 +428,8 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         devices.dtype_unet = torch.float16
         timer.record("apply half()")

+    apply_alpha_schedule_override(model)
+
     for module in model.modules():
         if hasattr(module, 'fp16_weight'):
             del module.fp16_weight
         if hasattr(module, 'fp16_bias'):
             del module.fp16_bias
@@ -843,7 +845,6 @@ def reload_model_weights(sd_model=None, info=None, forced_reload=False):

     sd_model = reuse_model_from_already_loaded(sd_model, checkpoint_info, timer)
     if not forced_reload and sd_model is not None and sd_model.sd_checkpoint_info.filename == checkpoint_info.filename:
-        apply_alpha_schedule_override(sd_model)
         return sd_model

     if sd_model is not None:

--
cgit v1.2.3
From 141a17e9693065c33a2b1d30f04a0083bb687775 Mon Sep 17 00:00:00 2001
From: AUTOMATIC1111 <16777216c@gmail.com>
Date: Sat, 2 Mar 2024 06:54:11 +0300
Subject: style changes for #14979

---
 modules/sd_models.py | 70 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 41 insertions(+), 29 deletions(-)

(limited to 'modules/sd_models.py')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index db72e120..747fc39e 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -552,36 +552,48 @@ def repair_config(sd_config):
             karlo_path = os.path.join(paths.models_path, 'karlo')
             sd_config.model.params.noise_aug_config.params.clip_stats_path = sd_config.model.params.noise_aug_config.params.clip_stats_path.replace("checkpoints/karlo_models", karlo_path)

+
+def rescale_zero_terminal_snr_abar(alphas_cumprod):
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= (alphas_bar_sqrt_T)
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt ** 2  # Revert sqrt
+    alphas_bar[-1] = 4.8973451890853435e-08
+    return alphas_bar
+
+
 def apply_alpha_schedule_override(sd_model, p=None):
-    def rescale_zero_terminal_snr_abar(alphas_cumprod):
-        alphas_bar_sqrt = alphas_cumprod.sqrt()
-
-        # Store old values.
-        alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
-        alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
-
-        # Shift so the last timestep is zero.
-        alphas_bar_sqrt -= (alphas_bar_sqrt_T)
-
-        # Scale so the first timestep is back to the old value.
-        alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
-
-        # Convert alphas_bar_sqrt to betas
-        alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
-        alphas_bar[-1] = 4.8973451890853435e-08
-        return alphas_bar
-
-    if hasattr(sd_model, 'alphas_cumprod') and hasattr(sd_model, 'alphas_cumprod_original'):
-        sd_model.alphas_cumprod = sd_model.alphas_cumprod_original.to(shared.device)
-
-        if opts.use_downcasted_alpha_bar:
-            if p is not None:
-                p.extra_generation_params['Downcast alphas_cumprod'] = opts.use_downcasted_alpha_bar
-            sd_model.alphas_cumprod = sd_model.alphas_cumprod.half().to(shared.device)
-        if opts.sd_noise_schedule == "Zero Terminal SNR":
-            if p is not None:
-                p.extra_generation_params['Noise Schedule'] = opts.sd_noise_schedule
-            sd_model.alphas_cumprod = rescale_zero_terminal_snr_abar(sd_model.alphas_cumprod).to(shared.device)
+    """
+    Applies an override to the alpha schedule of the model according to settings.
+    - downcasts the alpha schedule to half precision
+    - rescales the alpha schedule to have zero terminal SNR
+    """
+
+    if not hasattr(sd_model, 'alphas_cumprod') or not hasattr(sd_model, 'alphas_cumprod_original'):
+        return
+
+    sd_model.alphas_cumprod = sd_model.alphas_cumprod_original.to(shared.device)
+
+    if opts.use_downcasted_alpha_bar:
+        if p is not None:
+            p.extra_generation_params['Downcast alphas_cumprod'] = opts.use_downcasted_alpha_bar
+        sd_model.alphas_cumprod = sd_model.alphas_cumprod.half().to(shared.device)
+
+    if opts.sd_noise_schedule == "Zero Terminal SNR":
+        if p is not None:
+            p.extra_generation_params['Noise Schedule'] = opts.sd_noise_schedule
+        sd_model.alphas_cumprod = rescale_zero_terminal_snr_abar(sd_model.alphas_cumprod).to(shared.device)
+

 sd1_clip_weight = 'cond_stage_model.transformer.text_model.embeddings.token_embedding.weight'
 sd2_clip_weight = 'cond_stage_model.model.transformer.resblocks.0.attn.in_proj_weight'

--
cgit v1.2.3
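Taken together, the series converges on the following flow: decide fp8 from options, optionally cache fp16 masters for LoRA, quantize Linear/Conv weights to fp8 storage, and run forwards through a manual-cast wrapper. A compact, self-contained sketch under stated assumptions (torch>=2.1.0; all names are illustrative stand-ins for the webui internals shown above, not its API):

import contextlib
import torch

def apply_fp8_storage(model: torch.nn.Module, cache_fp16: bool = False) -> None:
    # quantize Linear/Conv weights to fp8, optionally keeping fp16 masters for LoRA
    for m in model.modules():
        if isinstance(m, (torch.nn.Linear, torch.nn.Conv2d)):
            if cache_fp16:
                m.fp16_weight = m.weight.data.clone().cpu().half()
            m.to(torch.float8_e4m3fn)

@contextlib.contextmanager
def manual_cast(module_types=(torch.nn.Linear, torch.nn.Conv2d), dtype=torch.float32):
    # patch forward so each call computes in `dtype` while weights stay fp8
    def casted_forward(self, *args, **kwargs):
        org_dtype = next(self.parameters()).dtype
        self.to(dtype)
        result = self.org_forward(*args, **kwargs)
        self.to(org_dtype)
        return result
    originals = {t: t.forward for t in module_types}
    for t in module_types:
        t.org_forward = originals[t]
        t.forward = casted_forward
    try:
        yield
    finally:
        for t, f in originals.items():
            t.forward = f

net = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8))
apply_fp8_storage(net, cache_fp16=True)
with manual_cast():
    out = net(torch.randn(1, 8))
print(out.dtype)  # torch.float32; the weights remain torch.float8_e4m3fn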