aboutsummaryrefslogtreecommitdiffstats
path: root/modules/devices.py
diff options
context:
space:
mode:
authorAUTOMATIC1111 <16777216c@gmail.com>2023-01-13 11:57:38 +0000
committerGitHub <noreply@github.com>2023-01-13 11:57:38 +0000
commit9cd7716753c5be47f76b8e5555cc3e7c0f17d34d (patch)
tree345be78dd1991b77fcf4519bc44097e975e0b0c4 /modules/devices.py
parent18f86e41f6f289042c075bff1498e620ab997b8c (diff)
parent544e7a233e994f379dd67df08f5f519290b10293 (diff)
downloadstable-diffusion-webui-gfx803-9cd7716753c5be47f76b8e5555cc3e7c0f17d34d.tar.gz
stable-diffusion-webui-gfx803-9cd7716753c5be47f76b8e5555cc3e7c0f17d34d.tar.bz2
stable-diffusion-webui-gfx803-9cd7716753c5be47f76b8e5555cc3e7c0f17d34d.zip
Merge branch 'master' into tensorboard
Diffstat (limited to 'modules/devices.py')
-rw-r--r--modules/devices.py130
1 files changed, 108 insertions, 22 deletions
diff --git a/modules/devices.py b/modules/devices.py
index eb422583..caeb0276 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -1,62 +1,96 @@
+import sys, os, shlex
import contextlib
-
import torch
-
from modules import errors
+from packaging import version
-# has_mps is only available in nightly pytorch (for now), `getattr` for compatibility
-has_mps = getattr(torch, 'has_mps', False)
-cpu = torch.device("cpu")
+# has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
+# check `getattr` and try it for compatibility
+def has_mps() -> bool:
+ if not getattr(torch, 'has_mps', False):
+ return False
+ try:
+ torch.zeros(1).to(torch.device("mps"))
+ return True
+ except Exception:
+ return False
+
+
+def extract_device_id(args, name):
+ for x in range(len(args)):
+ if name in args[x]:
+ return args[x + 1]
+
+ return None
+
+
+def get_cuda_device_string():
+ from modules import shared
+
+ if shared.cmd_opts.device_id is not None:
+ return f"cuda:{shared.cmd_opts.device_id}"
+
+ return "cuda"
def get_optimal_device():
if torch.cuda.is_available():
- return torch.device("cuda")
+ return torch.device(get_cuda_device_string())
- if has_mps:
+ if has_mps():
return torch.device("mps")
return cpu
+def get_device_for(task):
+ from modules import shared
+
+ if task in shared.cmd_opts.use_cpu:
+ return cpu
+
+ return get_optimal_device()
+
+
def torch_gc():
if torch.cuda.is_available():
- torch.cuda.empty_cache()
- torch.cuda.ipc_collect()
+ with torch.cuda.device(get_cuda_device_string()):
+ torch.cuda.empty_cache()
+ torch.cuda.ipc_collect()
def enable_tf32():
if torch.cuda.is_available():
+
+ # enabling benchmark option seems to enable a range of cards to do fp16 when they otherwise can't
+ # see https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/4407
+ if any([torch.cuda.get_device_capability(devid) == (7, 5) for devid in range(0, torch.cuda.device_count())]):
+ torch.backends.cudnn.benchmark = True
+
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
+
errors.run(enable_tf32, "Enabling TF32")
-device = device_interrogate = device_gfpgan = device_bsrgan = device_esrgan = device_scunet = device_codeformer = get_optimal_device()
+cpu = torch.device("cpu")
+device = device_interrogate = device_gfpgan = device_esrgan = device_codeformer = None
dtype = torch.float16
dtype_vae = torch.float16
-def randn(seed, shape):
- # Pytorch currently doesn't handle setting randomness correctly when the metal backend is used.
- if device.type == 'mps':
- generator = torch.Generator(device=cpu)
- generator.manual_seed(seed)
- noise = torch.randn(shape, generator=generator, device=cpu).to(device)
- return noise
+def randn(seed, shape):
torch.manual_seed(seed)
+ if device.type == 'mps':
+ return torch.randn(shape, device=cpu).to(device)
return torch.randn(shape, device=device)
def randn_without_seed(shape):
- # Pytorch currently doesn't handle setting randomness correctly when the metal backend is used.
if device.type == 'mps':
- generator = torch.Generator(device=cpu)
- noise = torch.randn(shape, generator=generator, device=cpu).to(device)
- return noise
-
+ return torch.randn(shape, device=cpu).to(device)
return torch.randn(shape, device=device)
@@ -70,3 +104,55 @@ def autocast(disable=False):
return contextlib.nullcontext()
return torch.autocast("cuda")
+
+
+# MPS workaround for https://github.com/pytorch/pytorch/issues/79383
+orig_tensor_to = torch.Tensor.to
+def tensor_to_fix(self, *args, **kwargs):
+ if self.device.type != 'mps' and \
+ ((len(args) > 0 and isinstance(args[0], torch.device) and args[0].type == 'mps') or \
+ (isinstance(kwargs.get('device'), torch.device) and kwargs['device'].type == 'mps')):
+ self = self.contiguous()
+ return orig_tensor_to(self, *args, **kwargs)
+
+
+# MPS workaround for https://github.com/pytorch/pytorch/issues/80800
+orig_layer_norm = torch.nn.functional.layer_norm
+def layer_norm_fix(*args, **kwargs):
+ if len(args) > 0 and isinstance(args[0], torch.Tensor) and args[0].device.type == 'mps':
+ args = list(args)
+ args[0] = args[0].contiguous()
+ return orig_layer_norm(*args, **kwargs)
+
+
+# MPS workaround for https://github.com/pytorch/pytorch/issues/90532
+orig_tensor_numpy = torch.Tensor.numpy
+def numpy_fix(self, *args, **kwargs):
+ if self.requires_grad:
+ self = self.detach()
+ return orig_tensor_numpy(self, *args, **kwargs)
+
+
+# MPS workaround for https://github.com/pytorch/pytorch/issues/89784
+orig_cumsum = torch.cumsum
+orig_Tensor_cumsum = torch.Tensor.cumsum
+def cumsum_fix(input, cumsum_func, *args, **kwargs):
+ if input.device.type == 'mps':
+ output_dtype = kwargs.get('dtype', input.dtype)
+ if any(output_dtype == broken_dtype for broken_dtype in [torch.bool, torch.int8, torch.int16, torch.int64]):
+ return cumsum_func(input.cpu(), *args, **kwargs).to(input.device)
+ return cumsum_func(input, *args, **kwargs)
+
+
+if has_mps():
+ if version.parse(torch.__version__) < version.parse("1.13"):
+ # PyTorch 1.13 doesn't need these fixes but unfortunately is slower and has regressions that prevent training from working
+ torch.Tensor.to = tensor_to_fix
+ torch.nn.functional.layer_norm = layer_norm_fix
+ torch.Tensor.numpy = numpy_fix
+ elif version.parse(torch.__version__) > version.parse("1.13.1"):
+ if not torch.Tensor([1,2]).to(torch.device("mps")).equal(torch.Tensor([1,1]).to(torch.device("mps")).cumsum(0, dtype=torch.int16)):
+ torch.cumsum = lambda input, *args, **kwargs: ( cumsum_fix(input, orig_cumsum, *args, **kwargs) )
+ torch.Tensor.cumsum = lambda self, *args, **kwargs: ( cumsum_fix(self, orig_Tensor_cumsum, *args, **kwargs) )
+ orig_narrow = torch.narrow
+ torch.narrow = lambda *args, **kwargs: ( orig_narrow(*args, **kwargs).clone() )