author     Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>   2023-10-23 17:49:05 +0000
committer  Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>   2023-10-23 17:49:05 +0000
commit     eaa9f5162fbca2ebcb2682eb861bc7e5510a2b66 (patch)
tree       f8bf60786db8d42a0a0e85deb56c885780bda654 /modules/sd_models.py
parent     5f9ddfa46f28ca2aa9e0bd832f6bbd67069be63e (diff)
Add CPU fp8 support
Since norm layers need fp32, I only convert the linear-operation layers (Conv2d/Linear).
The TE also has some PyTorch functions that don't support bf16 autocast on CPU, so I added a condition to indicate whether the autocast is for the unet.
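
As a rough illustration of the layer-selective cast described above, a minimal standalone sketch (assumed helper name, not the repo's exact code) might look like this:

import torch

def cast_linear_ops_to_fp8(model: torch.nn.Module) -> torch.nn.Module:
    # Cast only the linear-operation layers (Conv2d/Linear) to fp8 storage;
    # norm layers and everything else keep their original dtype.
    for module in model.modules():
        if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)):
            module.to(torch.float8_e4m3fn)  # fp8 dtypes require PyTorch >= 2.1
    return model

Note that the cast only changes how the weights are stored; running the model still means upcasting them before each matmul, since general fp8 compute kernels are not available in eager PyTorch.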
Diffstat (limited to 'modules/sd_models.py')
-rw-r--r--  modules/sd_models.py  20
1 file changed, 16 insertions, 4 deletions
diff --git a/modules/sd_models.py b/modules/sd_models.py
index 08af128f..c5fe57bf 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -391,12 +391,24 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer):
         devices.dtype_unet = torch.float16
         timer.record("apply half()")
 
-    if shared.cmd_opts.opt_unet_fp8_storage:
+
+    if shared.cmd_opts.opt_unet_fp8_storage:
+        enable_fp8 = True
+    elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
+        enable_fp8 = True
+
+    if enable_fp8:
+        devices.fp8 = True
+        if devices.device == devices.cpu:
+            for module in model.model.diffusion_model.modules():
+                if isinstance(module, torch.nn.Conv2d):
+                    module.to(torch.float8_e4m3fn)
+                elif isinstance(module, torch.nn.Linear):
+                    module.to(torch.float8_e4m3fn)
+            timer.record("apply fp8 unet for cpu")
+        else:
         model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
         timer.record("apply fp8 unet")
-    elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
-        model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
-        timer.record("apply fp8 unet for sdxl")
 
     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16
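
Read out of diff form, the new control flow amounts to the sketch below; the function wrapper and the explicit enable_fp8 = False default are assumptions added here for a self-contained reading, not part of the commit:

import torch

def apply_fp8_storage(model, devices, shared, timer):
    # `model`, `devices`, `shared` and `timer` stand in for the webui's
    # module-level objects of the same names.
    enable_fp8 = False  # explicit default; the hunk relies on it being set elsewhere
    if shared.cmd_opts.opt_unet_fp8_storage:
        enable_fp8 = True
    elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
        enable_fp8 = True

    if enable_fp8:
        devices.fp8 = True
        if devices.device == devices.cpu:
            # CPU path: cast layer-by-layer so norm layers stay in fp32
            for module in model.model.diffusion_model.modules():
                if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)):
                    module.to(torch.float8_e4m3fn)
            timer.record("apply fp8 unet for cpu")
        else:
            # GPU path: the whole unet can be stored in fp8 directly
            model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
            timer.record("apply fp8 unet")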