From 57eb54b838faa383c10079e1bb5471b7bee6a695 Mon Sep 17 00:00:00 2001
From: Extraltodeus
Date: Sat, 22 Oct 2022 00:11:07 +0200
Subject: implement CUDA device selection by ID

---
 modules/devices.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'modules/devices.py')

diff --git a/modules/devices.py b/modules/devices.py
index eb422583..8a159282 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -1,7 +1,6 @@
+import sys, os, shlex
 import contextlib
-
 import torch
-
 from modules import errors
 
 # has_mps is only available in nightly pytorch (for now), `getattr` for compatibility
@@ -9,10 +8,26 @@ has_mps = getattr(torch, 'has_mps', False)
 
 cpu = torch.device("cpu")
 
+def extract_device_id(args, name):
+    for x in range(len(args)):
+        if name in args[x]: return args[x+1]
+    return None
 
 def get_optimal_device():
     if torch.cuda.is_available():
-        return torch.device("cuda")
+        # CUDA device selection support:
+        if "shared" not in sys.modules:
+            commandline_args = os.environ.get('COMMANDLINE_ARGS', "") #re-parse the commandline arguments because using the shared.py module creates an import loop.
+            sys.argv += shlex.split(commandline_args)
+            device_id = extract_device_id(sys.argv, '--device-id')
+        else:
+            device_id = shared.cmd_opts.device_id
+
+        if device_id is not None:
+            cuda_device = f"cuda:{device_id}"
+            return torch.device(cuda_device)
+        else:
+            return torch.device("cuda")
 
     if has_mps:
         return torch.device("mps")
-- 
cgit v1.2.3
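
The patch above sidesteps argparse: `extract_device_id` scans raw `sys.argv` for any token containing the flag name and returns the token that follows, because importing `shared` (which owns the parsed options) from `devices.py` would create an import loop. A minimal standalone sketch of that lookup, under a hypothetical command line; the substring match and the IndexError risk when the flag is the last token are both faithful to the patch:

    import torch

    def extract_device_id(args, name):
        # Substring match, as in the patch: the token *after* the first
        # argument containing `name` is treated as the value.
        for x in range(len(args)):
            if name in args[x]:
                return args[x + 1]
        return None

    argv = ["launch.py", "--device-id", "1"]  # hypothetical arguments
    device_id = extract_device_id(argv, "--device-id")
    device = torch.device(f"cuda:{device_id}" if device_id is not None else "cuda")
    print(device)  # cuda:1
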
From 50b5504401e50b6c94eba41b37fe212b2f27b792 Mon Sep 17 00:00:00 2001
From: AUTOMATIC <16777216c@gmail.com>
Date: Sat, 22 Oct 2022 14:04:14 +0300
Subject: remove parsing command line from devices.py

---
 modules/devices.py | 14 +++++---------
 modules/lowvram.py |  9 ++++-----
 2 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'modules/devices.py')

diff --git a/modules/devices.py b/modules/devices.py
index 8a159282..dc1f3cdd 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -15,14 +15,10 @@ def extract_device_id(args, name):
 
 def get_optimal_device():
     if torch.cuda.is_available():
-        # CUDA device selection support:
-        if "shared" not in sys.modules:
-            commandline_args = os.environ.get('COMMANDLINE_ARGS', "") #re-parse the commandline arguments because using the shared.py module creates an import loop.
-            sys.argv += shlex.split(commandline_args)
-            device_id = extract_device_id(sys.argv, '--device-id')
-        else:
-            device_id = shared.cmd_opts.device_id
-
+        from modules import shared
+
+        device_id = shared.cmd_opts.device_id
+
         if device_id is not None:
             cuda_device = f"cuda:{device_id}"
             return torch.device(cuda_device)
@@ -49,7 +45,7 @@ def enable_tf32():
 
 errors.run(enable_tf32, "Enabling TF32")
 
-device = device_interrogate = device_gfpgan = device_bsrgan = device_esrgan = device_scunet = device_codeformer = get_optimal_device()
+device = device_interrogate = device_gfpgan = device_bsrgan = device_esrgan = device_scunet = device_codeformer = None
 dtype = torch.float16
 dtype_vae = torch.float16
 
diff --git a/modules/lowvram.py b/modules/lowvram.py
index 7eba1349..f327c3df 100644
--- a/modules/lowvram.py
+++ b/modules/lowvram.py
@@ -1,9 +1,8 @@
 import torch
-from modules.devices import get_optimal_device
+from modules import devices
 
 module_in_gpu = None
 cpu = torch.device("cpu")
-device = gpu = get_optimal_device()
 
 
 def send_everything_to_cpu():
@@ -33,7 +32,7 @@ def setup_for_low_vram(sd_model, use_medvram):
         if module_in_gpu is not None:
             module_in_gpu.to(cpu)
 
-        module.to(gpu)
+        module.to(devices.device)
         module_in_gpu = module
 
     # see below for register_forward_pre_hook;
@@ -51,7 +50,7 @@ def setup_for_low_vram(sd_model, use_medvram):
     # send the model to GPU. Then put modules back. the modules will be in CPU.
     stored = sd_model.cond_stage_model.transformer, sd_model.first_stage_model, sd_model.model
     sd_model.cond_stage_model.transformer, sd_model.first_stage_model, sd_model.model = None, None, None
-    sd_model.to(device)
+    sd_model.to(devices.device)
     sd_model.cond_stage_model.transformer, sd_model.first_stage_model, sd_model.model = stored
 
     # register hooks for those the first two models
@@ -70,7 +69,7 @@ def setup_for_low_vram(sd_model, use_medvram):
     # so that only one of them is in GPU at a time
    stored = diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed
     diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed = None, None, None, None
-    sd_model.model.to(device)
+    sd_model.model.to(devices.device)
     diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed = stored
 
     # install hooks for bits of third model
-- 
cgit v1.2.3
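
The fix above breaks the import loop properly: a function-level `from modules import shared` runs only when `get_optimal_device()` is first called, by which time `shared` has parsed the command line, and the seven `device_*` globals are left as `None` for `shared.py` to assign. A self-contained sketch of that deferred-import pattern; the `cmd_opts` stand-in here is hypothetical, not the webui object:

    import torch

    def get_optimal_device():
        # Deferred import pattern: resolve the options provider at call
        # time, not at module-import time, so a circular import between
        # the two modules never triggers.
        from argparse import Namespace
        cmd_opts = Namespace(device_id=None)  # stand-in for shared.cmd_opts

        if torch.cuda.is_available():
            if cmd_opts.device_id is not None:
                return torch.device(f"cuda:{cmd_opts.device_id}")
            return torch.device("cuda")
        if getattr(torch, 'has_mps', False):
            return torch.device("mps")
        return torch.device("cpu")

    print(get_optimal_device())
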
help="force-enables InvokeAI's cross-attention layer optimization. By default, it's on when cuda is unavailable.") parser.add_argument("--opt-split-attention-v1", action='store_true', help="enable older version of split attention optimization that does not consume all the VRAM it can find") parser.add_argument("--disable-opt-split-attention", action='store_true', help="force-disables cross-attention layer optimization") -parser.add_argument("--use-cpu", nargs='+',choices=['all', 'sd', 'interrogate', 'gfpgan', 'bsrgan', 'esrgan', 'scunet', 'codeformer'], help="use CPU as torch device for specified modules", default=[], type=str.lower) +parser.add_argument("--use-cpu", nargs='+',choices=['all', 'sd', 'interrogate', 'gfpgan', 'swinir', 'esrgan', 'scunet', 'codeformer'], help="use CPU as torch device for specified modules", default=[], type=str.lower) parser.add_argument("--listen", action='store_true', help="launch gradio with 0.0.0.0 as server name, allowing to respond to network requests") parser.add_argument("--port", type=int, help="launch gradio with given server port, you need root/admin rights for ports < 1024, defaults to 7860 if available", default=None) parser.add_argument("--show-negative-prompt", action='store_true', help="does not do anything", default=False) @@ -96,8 +96,8 @@ restricted_opts = [ "outdir_save", ] -devices.device, devices.device_interrogate, devices.device_gfpgan, devices.device_bsrgan, devices.device_esrgan, devices.device_scunet, devices.device_codeformer = \ -(devices.cpu if any(y in cmd_opts.use_cpu for y in [x, 'all']) else devices.get_optimal_device() for x in ['sd', 'interrogate', 'gfpgan', 'bsrgan', 'esrgan', 'scunet', 'codeformer']) +devices.device, devices.device_interrogate, devices.device_gfpgan, devices.device_swinir, devices.device_esrgan, devices.device_scunet, devices.device_codeformer = \ +(devices.cpu if any(y in cmd_opts.use_cpu for y in [x, 'all']) else devices.get_optimal_device() for x in ['sd', 'interrogate', 'gfpgan', 'swinir', 'esrgan', 'scunet', 'codeformer']) device = devices.device weight_load_location = None if cmd_opts.lowram else "cpu" diff --git a/modules/swinir_model.py b/modules/swinir_model.py index baa02e3d..facd262d 100644 --- a/modules/swinir_model.py +++ b/modules/swinir_model.py @@ -7,8 +7,8 @@ from PIL import Image from basicsr.utils.download_util import load_file_from_url from tqdm import tqdm -from modules import modelloader -from modules.shared import cmd_opts, opts, device +from modules import modelloader, devices +from modules.shared import cmd_opts, opts from modules.swinir_model_arch import SwinIR as net from modules.swinir_model_arch_v2 import Swin2SR as net2 from modules.upscaler import Upscaler, UpscalerData @@ -42,7 +42,7 @@ class UpscalerSwinIR(Upscaler): model = self.load_model(model_file) if model is None: return img - model = model.to(device) + model = model.to(devices.device_swinir) img = upscale(img, model) try: torch.cuda.empty_cache() @@ -111,7 +111,7 @@ def upscale( img = img[:, :, ::-1] img = np.moveaxis(img, 2, 0) / 255 img = torch.from_numpy(img).float() - img = img.unsqueeze(0).to(device) + img = img.unsqueeze(0).to(devices.device_swinir) with torch.no_grad(), precision_scope("cuda"): _, _, h_old, w_old = img.size() h_pad = (h_old // window_size + 1) * window_size - h_old @@ -139,8 +139,8 @@ def inference(img, model, tile, tile_overlap, window_size, scale): stride = tile - tile_overlap h_idx_list = list(range(0, h - tile, stride)) + [h - tile] w_idx_list = list(range(0, w - tile, stride)) + [w - 
From faed465a0b1a7d19669568738c93e04907c10415 Mon Sep 17 00:00:00 2001
From: brkirch
Date: Tue, 25 Oct 2022 02:01:57 -0400
Subject: MPS Upscalers Fix

Get ESRGAN, SCUNet, and SwinIR working correctly on MPS by ensuring memory
is contiguous for tensor views before sending to MPS device.
---
 modules/devices.py      | 4 ++++
 modules/esrgan_model.py | 2 +-
 modules/scunet_model.py | 3 +--
 modules/swinir_model.py | 2 +-
 4 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'modules/devices.py')

diff --git a/modules/devices.py b/modules/devices.py
index 033a42d5..7511e1dc 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -81,3 +81,7 @@ def autocast(disable=False):
         return contextlib.nullcontext()
 
     return torch.autocast("cuda")
+
+# MPS workaround for https://github.com/pytorch/pytorch/issues/79383
+def mps_contiguous(input_tensor, device): return input_tensor.contiguous() if device.type == 'mps' else input_tensor
+def mps_contiguous_to(input_tensor, device): return mps_contiguous(input_tensor, device).to(device)
diff --git a/modules/esrgan_model.py b/modules/esrgan_model.py
index a49e2258..a13cf6ac 100644
--- a/modules/esrgan_model.py
+++ b/modules/esrgan_model.py
@@ -190,7 +190,7 @@ def upscale_without_tiling(model, img):
     img = img[:, :, ::-1]
     img = np.ascontiguousarray(np.transpose(img, (2, 0, 1))) / 255
     img = torch.from_numpy(img).float()
-    img = img.unsqueeze(0).to(devices.device_esrgan)
+    img = devices.mps_contiguous_to(img.unsqueeze(0), devices.device_esrgan)
     with torch.no_grad():
         output = model(img)
     output = output.squeeze().float().cpu().clamp_(0, 1).numpy()
diff --git a/modules/scunet_model.py b/modules/scunet_model.py
index 36a996bf..59532274 100644
--- a/modules/scunet_model.py
+++ b/modules/scunet_model.py
@@ -54,9 +54,8 @@ class UpscalerScuNET(modules.upscaler.Upscaler):
         img = img[:, :, ::-1]
         img = np.moveaxis(img, 2, 0) / 255
         img = torch.from_numpy(img).float()
-        img = img.unsqueeze(0).to(device)
+        img = devices.mps_contiguous_to(img.unsqueeze(0), device)
 
-        img = img.to(device)
         with torch.no_grad():
             output = model(img)
         output = output.squeeze().float().cpu().clamp_(0, 1).numpy()
diff --git a/modules/swinir_model.py b/modules/swinir_model.py
index facd262d..4253b66d 100644
--- a/modules/swinir_model.py
+++ b/modules/swinir_model.py
@@ -111,7 +111,7 @@ def upscale(
     img = img[:, :, ::-1]
     img = np.moveaxis(img, 2, 0) / 255
     img = torch.from_numpy(img).float()
-    img = img.unsqueeze(0).to(devices.device_swinir)
+    img = devices.mps_contiguous_to(img.unsqueeze(0), devices.device_swinir)
     with torch.no_grad(), precision_scope("cuda"):
         _, _, h_old, w_old = img.size()
         h_pad = (h_old // window_size + 1) * window_size - h_old
-- 
cgit v1.2.3
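
The two one-liner helpers added to `devices.py` address pytorch/pytorch#79383, where copying a non-contiguous tensor view to the MPS device produced incorrect results: views are made contiguous before the move, a no-op on every other device type. A short demonstration, safe to run with or without an MPS build; the sample tensor is hypothetical:

    import torch

    def mps_contiguous(input_tensor, device):
        # Only MPS needs the defensive copy; other devices pass the view through.
        return input_tensor.contiguous() if device.type == 'mps' else input_tensor

    def mps_contiguous_to(input_tensor, device):
        return mps_contiguous(input_tensor, device).to(device)

    mps_backend = getattr(torch.backends, 'mps', None)
    device = torch.device('mps') if mps_backend is not None and mps_backend.is_available() else torch.device('cpu')

    img = torch.rand(64, 64, 3).permute(2, 0, 1).unsqueeze(0)  # HWC -> NCHW view
    print(img.is_contiguous())  # False: permute returns a view, not a copy
    print(mps_contiguous_to(img, device).device)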