From bdaa36c84470adbdce3e98c01a69af5e95adfb02 Mon Sep 17 00:00:00 2001
From: brkirch <brkirch@users.noreply.github.com>
Date: Fri, 30 Sep 2022 23:53:25 -0400
Subject: When device is MPS, use CPU for GFPGAN instead

GFPGAN will not work if the device is MPS, so default to CPU instead.
---
 modules/devices.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'modules/devices.py')

diff --git a/modules/devices.py b/modules/devices.py
index 07bb2339..08bb26d6 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -34,7 +34,7 @@ errors.run(enable_tf32, "Enabling TF32")
 
 
 device = get_optimal_device()
-device_codeformer = cpu if has_mps else device
+device_gfpgan = device_codeformer = cpu if device.type == 'mps' else device
 
 
 def randn(seed, shape):
-- 
cgit v1.2.3


From eeab7aedf532680a6ae9058ee272450bb07e41eb Mon Sep 17 00:00:00 2001
From: brkirch <brkirch@users.noreply.github.com>
Date: Tue, 4 Oct 2022 04:24:35 -0400
Subject: Add --use-cpu command line option

Remove MPS detection to use CPU for GFPGAN / CodeFormer and add a --use-cpu command line option.
---
 modules/devices.py      | 5 ++---
 modules/esrgan_model.py | 9 ++++-----
 modules/scunet_model.py | 8 ++++----
 modules/shared.py       | 9 +++++++--
 4 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'modules/devices.py')

diff --git a/modules/devices.py b/modules/devices.py
index 5d9c7a07..b5a0cd29 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -1,8 +1,8 @@
 import torch
 
-# has_mps is only available in nightly pytorch (for now), `getattr` for compatibility
 from modules import errors
 
+# has_mps is only available in nightly pytorch (for now), `getattr` for compatibility
 has_mps = getattr(torch, 'has_mps', False)
 
 cpu = torch.device("cpu")
@@ -32,8 +32,7 @@ def enable_tf32():
 
 errors.run(enable_tf32, "Enabling TF32")
 
-device = get_optimal_device()
-device_gfpgan = device_codeformer = cpu if device.type == 'mps' else device
+device = device_gfpgan = device_esrgan = device_scunet = device_codeformer = get_optimal_device()
 dtype = torch.float16
 
 def randn(seed, shape):
diff --git a/modules/esrgan_model.py b/modules/esrgan_model.py
index 4aed9283..d17e730f 100644
--- a/modules/esrgan_model.py
+++ b/modules/esrgan_model.py
@@ -6,8 +6,7 @@ from PIL import Image
 from basicsr.utils.download_util import load_file_from_url
 
 import modules.esrgam_model_arch as arch
-from modules import shared, modelloader, images
-from modules.devices import has_mps
+from modules import shared, modelloader, images, devices
 from modules.paths import models_path
 from modules.upscaler import Upscaler, UpscalerData
 from modules.shared import opts
@@ -97,7 +96,7 @@ class UpscalerESRGAN(Upscaler):
         model = self.load_model(selected_model)
         if model is None:
             return img
-        model.to(shared.device)
+        model.to(devices.device_esrgan)
         img = esrgan_upscale(model, img)
         return img
 
@@ -112,7 +111,7 @@ class UpscalerESRGAN(Upscaler):
             print("Unable to load %s from %s" % (self.model_path, filename))
             return None
 
-        pretrained_net = torch.load(filename, map_location='cpu' if has_mps else None)
+        pretrained_net = torch.load(filename, map_location='cpu' if shared.device.type == 'mps' else None)
         crt_model = arch.RRDBNet(3, 3, 64, 23, gc=32)
 
         pretrained_net = fix_model_layers(crt_model, pretrained_net)
@@ -127,7 +126,7 @@ def upscale_without_tiling(model, img):
     img = img[:, :, ::-1]
     img = np.moveaxis(img, 2, 0) / 255
     img = torch.from_numpy(img).float()
-    img = img.unsqueeze(0).to(shared.device)
+    img = img.unsqueeze(0).to(devices.device_esrgan)
     with torch.no_grad():
         output = model(img)
     output = output.squeeze().float().cpu().clamp_(0, 1).numpy()
diff --git a/modules/scunet_model.py b/modules/scunet_model.py
index 7987ac14..fb64b740 100644
--- a/modules/scunet_model.py
+++ b/modules/scunet_model.py
@@ -8,7 +8,7 @@ import torch
 from basicsr.utils.download_util import load_file_from_url
 
 import modules.upscaler
-from modules import shared, modelloader
+from modules import devices, modelloader
 from modules.paths import models_path
 from modules.scunet_model_arch import SCUNet as net
 
@@ -51,12 +51,12 @@ class UpscalerScuNET(modules.upscaler.Upscaler):
         if model is None:
             return img
 
-        device = shared.device
+        device = devices.device_scunet
         img = np.array(img)
         img = img[:, :, ::-1]
         img = np.moveaxis(img, 2, 0) / 255
         img = torch.from_numpy(img).float()
-        img = img.unsqueeze(0).to(shared.device)
+        img = img.unsqueeze(0).to(device)
 
         img = img.to(device)
         with torch.no_grad():
@@ -69,7 +69,7 @@ class UpscalerScuNET(modules.upscaler.Upscaler):
         return PIL.Image.fromarray(output, 'RGB')
 
     def load_model(self, path: str):
-        device = shared.device
+        device = devices.device_scunet
         if "http" in path:
             filename = load_file_from_url(url=self.model_url, model_dir=self.model_path, file_name="%s.pth" % self.name,
                                           progress=True)
diff --git a/modules/shared.py b/modules/shared.py
index 2a599e9c..7899ab8d 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -12,7 +12,7 @@ import modules.interrogate
 import modules.memmon
 import modules.sd_models
 import modules.styles
-from modules.devices import get_optimal_device
+import modules.devices as devices
 from modules.paths import script_path, sd_path
 
 sd_model_file = os.path.join(script_path, 'model.ckpt')
@@ -46,6 +46,7 @@ parser.add_argument("--ldsr-models-path", type=str, help="Path to directory with
 parser.add_argument("--opt-split-attention", action='store_true', help="force-enables cross-attention layer optimization. By default, it's on for torch.cuda and off for other torch devices.")
 parser.add_argument("--disable-opt-split-attention", action='store_true', help="force-disables cross-attention layer optimization")
 parser.add_argument("--opt-split-attention-v1", action='store_true', help="enable older version of split attention optimization that does not consume all the VRAM it can find")
+parser.add_argument("--use-cpu", nargs='+',choices=['SD', 'GFPGAN', 'ESRGAN', 'SCUNet', 'CodeFormer'], help="use CPU for specified modules", default=[])
 parser.add_argument("--listen", action='store_true', help="launch gradio with 0.0.0.0 as server name, allowing to respond to network requests")
 parser.add_argument("--port", type=int, help="launch gradio with given server port, you need root/admin rights for ports < 1024, defaults to 7860 if available", default=None)
 parser.add_argument("--show-negative-prompt", action='store_true', help="does not do anything", default=False)
@@ -63,7 +64,11 @@ parser.add_argument("--enable-console-prompts", action='store_true', help="print
 
 
 cmd_opts = parser.parse_args()
-device = get_optimal_device()
+
+devices.device, devices.device_gfpgan, devices.device_esrgan, devices.device_scunet, devices.device_codeformer = \
+(devices.cpu if x in cmd_opts.use_cpu else devices.get_optimal_device() for x in ['SD', 'GFPGAN', 'ESRGAN', 'SCUNet', 'CodeFormer'])
+
+device = devices.device
 
 batch_cond_uncond = cmd_opts.always_batch_cond_uncond or not (cmd_opts.lowvram or cmd_opts.medvram)
 parallel_processing_allowed = not cmd_opts.lowvram and not cmd_opts.medvram
-- 
cgit v1.2.3


From 27ddc24fdee1fbe709054a43235ab7f9c51b3e9f Mon Sep 17 00:00:00 2001
From: brkirch <brkirch@users.noreply.github.com>
Date: Tue, 4 Oct 2022 05:18:17 -0400
Subject: Add BSRGAN to --use-cpu

---
 modules/bsrgan_model.py | 6 +++---
 modules/devices.py      | 2 +-
 modules/shared.py       | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'modules/devices.py')

diff --git a/modules/bsrgan_model.py b/modules/bsrgan_model.py
index e62c6657..3bd80791 100644
--- a/modules/bsrgan_model.py
+++ b/modules/bsrgan_model.py
@@ -8,7 +8,7 @@ import torch
 from basicsr.utils.download_util import load_file_from_url
 
 import modules.upscaler
-from modules import shared, modelloader
+from modules import devices, modelloader
 from modules.bsrgan_model_arch import RRDBNet
 from modules.paths import models_path
 
@@ -44,13 +44,13 @@ class UpscalerBSRGAN(modules.upscaler.Upscaler):
         model = self.load_model(selected_file)
         if model is None:
             return img
-        model.to(shared.device)
+        model.to(devices.device_bsrgan)
         torch.cuda.empty_cache()
         img = np.array(img)
         img = img[:, :, ::-1]
         img = np.moveaxis(img, 2, 0) / 255
         img = torch.from_numpy(img).float()
-        img = img.unsqueeze(0).to(shared.device)
+        img = img.unsqueeze(0).to(devices.device_bsrgan)
         with torch.no_grad():
             output = model(img)
         output = output.squeeze().float().cpu().clamp_(0, 1).numpy()
diff --git a/modules/devices.py b/modules/devices.py
index b5a0cd29..b7899632 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -32,7 +32,7 @@ def enable_tf32():
 
 errors.run(enable_tf32, "Enabling TF32")
 
-device = device_gfpgan = device_esrgan = device_scunet = device_codeformer = get_optimal_device()
+device = device_gfpgan = device_bsrgan = device_esrgan = device_scunet = device_codeformer = get_optimal_device()
 dtype = torch.float16
 
 def randn(seed, shape):
diff --git a/modules/shared.py b/modules/shared.py
index 7899ab8d..95b98a06 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -46,7 +46,7 @@ parser.add_argument("--ldsr-models-path", type=str, help="Path to directory with
 parser.add_argument("--opt-split-attention", action='store_true', help="force-enables cross-attention layer optimization. By default, it's on for torch.cuda and off for other torch devices.")
 parser.add_argument("--disable-opt-split-attention", action='store_true', help="force-disables cross-attention layer optimization")
 parser.add_argument("--opt-split-attention-v1", action='store_true', help="enable older version of split attention optimization that does not consume all the VRAM it can find")
-parser.add_argument("--use-cpu", nargs='+',choices=['SD', 'GFPGAN', 'ESRGAN', 'SCUNet', 'CodeFormer'], help="use CPU for specified modules", default=[])
+parser.add_argument("--use-cpu", nargs='+',choices=['SD', 'GFPGAN', 'BSRGAN', 'ESRGAN', 'SCUNet', 'CodeFormer'], help="use CPU for specified modules", default=[])
 parser.add_argument("--listen", action='store_true', help="launch gradio with 0.0.0.0 as server name, allowing to respond to network requests")
 parser.add_argument("--port", type=int, help="launch gradio with given server port, you need root/admin rights for ports < 1024, defaults to 7860 if available", default=None)
 parser.add_argument("--show-negative-prompt", action='store_true', help="does not do anything", default=False)
@@ -65,8 +65,8 @@ parser.add_argument("--enable-console-prompts", action='store_true', help="print
 
 cmd_opts = parser.parse_args()
 
-devices.device, devices.device_gfpgan, devices.device_esrgan, devices.device_scunet, devices.device_codeformer = \
-(devices.cpu if x in cmd_opts.use_cpu else devices.get_optimal_device() for x in ['SD', 'GFPGAN', 'ESRGAN', 'SCUNet', 'CodeFormer'])
+devices.device, devices.device_gfpgan, devices.device_bsrgan, devices.device_esrgan, devices.device_scunet, devices.device_codeformer = \
+(devices.cpu if x in cmd_opts.use_cpu else devices.get_optimal_device() for x in ['SD', 'GFPGAN', 'BSRGAN', 'ESRGAN', 'SCUNet', 'CodeFormer'])
 
 device = devices.device
 
-- 
cgit v1.2.3


From 6c6ae28bf5fd1e8bc3e8f64a3430b6f29f338f77 Mon Sep 17 00:00:00 2001
From: AUTOMATIC <16777216c@gmail.com>
Date: Tue, 4 Oct 2022 12:32:22 +0300
Subject: send all three of GFPGAN's and codeformer's models to CPU memory
 instead of just one for #1283

---
 modules/codeformer_model.py | 12 ++++++++++--
 modules/devices.py          | 10 ++++++++++
 modules/gfpgan_model.py     | 14 ++++++++++++--
 modules/processing.py       | 16 +++++++++-------
 4 files changed, 41 insertions(+), 11 deletions(-)

(limited to 'modules/devices.py')

diff --git a/modules/codeformer_model.py b/modules/codeformer_model.py
index a29f3855..e6d9fa4f 100644
--- a/modules/codeformer_model.py
+++ b/modules/codeformer_model.py
@@ -69,10 +69,14 @@ def setup_model(dirname):
 
                 self.net = net
                 self.face_helper = face_helper
-                self.net.to(devices.device_codeformer)
 
                 return net, face_helper
 
+            def send_model_to(self, device):
+                self.net.to(device)
+                self.face_helper.face_det.to(device)
+                self.face_helper.face_parse.to(device)
+
             def restore(self, np_image, w=None):
                 np_image = np_image[:, :, ::-1]
 
@@ -82,6 +86,8 @@ def setup_model(dirname):
                 if self.net is None or self.face_helper is None:
                     return np_image
 
+                self.send_model_to(devices.device_codeformer)
+
                 self.face_helper.clean_all()
                 self.face_helper.read_image(np_image)
                 self.face_helper.get_face_landmarks_5(only_center_face=False, resize=640, eye_dist_threshold=5)
@@ -113,8 +119,10 @@ def setup_model(dirname):
                 if original_resolution != restored_img.shape[0:2]:
                     restored_img = cv2.resize(restored_img, (0, 0), fx=original_resolution[1]/restored_img.shape[1], fy=original_resolution[0]/restored_img.shape[0], interpolation=cv2.INTER_LINEAR)
 
+                self.face_helper.clean_all()
+
                 if shared.opts.face_restoration_unload:
-                    self.net.to(devices.cpu)
+                    self.send_model_to(devices.cpu)
 
                 return restored_img
 
diff --git a/modules/devices.py b/modules/devices.py
index ff82f2f6..12aab665 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -1,3 +1,5 @@
+import contextlib
+
 import torch
 
 # has_mps is only available in nightly pytorch (for now), `getattr` for compatibility
@@ -57,3 +59,11 @@ def randn_without_seed(shape):
 
     return torch.randn(shape, device=device)
 
+
+def autocast():
+    from modules import shared
+
+    if dtype == torch.float32 or shared.cmd_opts.precision == "full":
+        return contextlib.nullcontext()
+
+    return torch.autocast("cuda")
diff --git a/modules/gfpgan_model.py b/modules/gfpgan_model.py
index dd3fbcab..5586b554 100644
--- a/modules/gfpgan_model.py
+++ b/modules/gfpgan_model.py
@@ -37,22 +37,32 @@ def gfpgann():
         print("Unable to load gfpgan model!")
         return None
     model = gfpgan_constructor(model_path=model_file, upscale=1, arch='clean', channel_multiplier=2, bg_upsampler=None)
-    model.gfpgan.to(shared.device)
     loaded_gfpgan_model = model
 
     return model
 
 
+def send_model_to(model, device):
+    model.gfpgan.to(device)
+    model.face_helper.face_det.to(device)
+    model.face_helper.face_parse.to(device)
+
+
 def gfpgan_fix_faces(np_image):
     model = gfpgann()
     if model is None:
         return np_image
+
+    send_model_to(model, devices.device)
+
     np_image_bgr = np_image[:, :, ::-1]
     cropped_faces, restored_faces, gfpgan_output_bgr = model.enhance(np_image_bgr, has_aligned=False, only_center_face=False, paste_back=True)
     np_image = gfpgan_output_bgr[:, :, ::-1]
 
+    model.face_helper.clean_all()
+
     if shared.opts.face_restoration_unload:
-        model.gfpgan.to(devices.cpu)
+        send_model_to(model, devices.cpu)
 
     return np_image
 
diff --git a/modules/processing.py b/modules/processing.py
index 0a4b6198..9cbecdd8 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -1,4 +1,3 @@
-import contextlib
 import json
 import math
 import os
@@ -330,9 +329,8 @@ def process_images(p: StableDiffusionProcessing) -> Processed:
 
     infotexts = []
     output_images = []
-    precision_scope = torch.autocast if cmd_opts.precision == "autocast" else contextlib.nullcontext
-    ema_scope = (contextlib.nullcontext if cmd_opts.lowvram else p.sd_model.ema_scope)
-    with torch.no_grad(), precision_scope("cuda"), ema_scope():
+
+    with torch.no_grad():
         p.init(all_prompts, all_seeds, all_subseeds)
 
         if state.job_count == -1:
@@ -351,8 +349,9 @@ def process_images(p: StableDiffusionProcessing) -> Processed:
 
             #uc = p.sd_model.get_learned_conditioning(len(prompts) * [p.negative_prompt])
             #c = p.sd_model.get_learned_conditioning(prompts)
-            uc = prompt_parser.get_learned_conditioning(len(prompts) * [p.negative_prompt], p.steps)
-            c = prompt_parser.get_learned_conditioning(prompts, p.steps)
+            with devices.autocast():
+                uc = prompt_parser.get_learned_conditioning(len(prompts) * [p.negative_prompt], p.steps)
+                c = prompt_parser.get_learned_conditioning(prompts, p.steps)
 
             if len(model_hijack.comments) > 0:
                 for comment in model_hijack.comments:
@@ -361,7 +360,9 @@ def process_images(p: StableDiffusionProcessing) -> Processed:
             if p.n_iter > 1:
                 shared.state.job = f"Batch {n+1} out of {p.n_iter}"
 
-            samples_ddim = p.sample(conditioning=c, unconditional_conditioning=uc, seeds=seeds, subseeds=subseeds, subseed_strength=p.subseed_strength)
+            with devices.autocast():
+                samples_ddim = p.sample(conditioning=c, unconditional_conditioning=uc, seeds=seeds, subseeds=subseeds, subseed_strength=p.subseed_strength).to(devices.dtype)
+
             if state.interrupted:
 
                 # if we are interruped, sample returns just noise
@@ -386,6 +387,7 @@ def process_images(p: StableDiffusionProcessing) -> Processed:
                     devices.torch_gc()
 
                     x_sample = modules.face_restoration.restore_faces(x_sample)
+                    devices.torch_gc()
 
                 image = Image.fromarray(x_sample)
 
-- 
cgit v1.2.3