From fec0a895119a124a295e3dad5205de5766031dc7 Mon Sep 17 00:00:00 2001 From: Pam Date: Tue, 7 Mar 2023 00:33:13 +0500 Subject: scaled dot product attention --- modules/sd_hijack.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'modules/sd_hijack.py') diff --git a/modules/sd_hijack.py b/modules/sd_hijack.py index 79476783..76cb9120 100644 --- a/modules/sd_hijack.py +++ b/modules/sd_hijack.py @@ -42,6 +42,10 @@ def apply_optimizations(): ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.xformers_attention_forward ldm.modules.diffusionmodules.model.AttnBlock.forward = sd_hijack_optimizations.xformers_attnblock_forward optimization_method = 'xformers' + elif cmd_opts.opt_sdp_attention and (hasattr(torch.nn.functional, "scaled_dot_product_attention") and callable(getattr(torch.nn.functional, "scaled_dot_product_attention"))): + print("Applying scaled dot product cross attention optimization.") + ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.scaled_dot_product_attention_forward + optimization_method = 'sdp' elif cmd_opts.opt_sub_quad_attention: print("Applying sub-quadratic cross attention optimization.") ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.sub_quad_attention_forward -- cgit v1.2.3 From 37acba263389e22bc46cfffc80b2ca8b76a85287 Mon Sep 17 00:00:00 2001 From: Pam Date: Fri, 10 Mar 2023 12:19:36 +0500 Subject: argument to disable memory efficient for sdp --- modules/sd_hijack.py | 11 ++++++++--- modules/sd_hijack_optimizations.py | 4 ++++ modules/shared.py | 1 + 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'modules/sd_hijack.py') diff --git a/modules/sd_hijack.py b/modules/sd_hijack.py index 76cb9120..f62e9adb 100644 --- a/modules/sd_hijack.py +++ b/modules/sd_hijack.py @@ -43,9 +43,14 @@ def apply_optimizations(): ldm.modules.diffusionmodules.model.AttnBlock.forward = sd_hijack_optimizations.xformers_attnblock_forward optimization_method = 'xformers' elif cmd_opts.opt_sdp_attention and (hasattr(torch.nn.functional, "scaled_dot_product_attention") and callable(getattr(torch.nn.functional, "scaled_dot_product_attention"))): - print("Applying scaled dot product cross attention optimization.") - ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.scaled_dot_product_attention_forward - optimization_method = 'sdp' + if cmd_opts.opt_sdp_no_mem_attention: + print("Applying scaled dot product cross attention optimization (without memory efficient attention).") + ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.scaled_dot_product_no_mem_attention_forward + optimization_method = 'sdp-no-mem' + else: + print("Applying scaled dot product cross attention optimization.") + ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.scaled_dot_product_attention_forward + optimization_method = 'sdp' elif cmd_opts.opt_sub_quad_attention: print("Applying sub-quadratic cross attention optimization.") ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.sub_quad_attention_forward diff --git a/modules/sd_hijack_optimizations.py b/modules/sd_hijack_optimizations.py index a324a592..68b1dd84 100644 --- a/modules/sd_hijack_optimizations.py +++ b/modules/sd_hijack_optimizations.py @@ -388,6 +388,10 @@ def scaled_dot_product_attention_forward(self, x, context=None, mask=None): hidden_states = self.to_out[1](hidden_states) return hidden_states +def scaled_dot_product_no_mem_attention_forward(self, x, context=None, mask=None): + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=False): + return scaled_dot_product_attention_forward(self, x, context, mask) + def cross_attention_attnblock_forward(self, x): h_ = x h_ = self.norm(h_) diff --git a/modules/shared.py b/modules/shared.py index 12d0756b..4b81c591 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -70,6 +70,7 @@ parser.add_argument("--sub-quad-chunk-threshold", type=int, help="the percentage parser.add_argument("--opt-split-attention-invokeai", action='store_true', help="force-enables InvokeAI's cross-attention layer optimization. By default, it's on when cuda is unavailable.") parser.add_argument("--opt-split-attention-v1", action='store_true', help="enable older version of split attention optimization that does not consume all the VRAM it can find") parser.add_argument("--opt-sdp-attention", action='store_true', help="enable scaled dot product cross-attention layer optimization; requires PyTorch 2.*") +parser.add_argument("--opt-sdp-no-mem-attention", action='store_true', help="disables memory efficient sdp, makes image generation deterministic; requires --opt-sdp-attention") parser.add_argument("--disable-opt-split-attention", action='store_true', help="force-disables cross-attention layer optimization") parser.add_argument("--disable-nan-check", action='store_true', help="do not check if produced images/latent spaces have nans; useful for running without a checkpoint in CI") parser.add_argument("--use-cpu", nargs='+', help="use CPU as torch device for specified modules", default=[], type=str.lower) -- cgit v1.2.3 From 0981dea94832f34d638b1aa8964cfaeffd223b47 Mon Sep 17 00:00:00 2001 From: Pam Date: Fri, 10 Mar 2023 12:58:10 +0500 Subject: sdp refactoring --- modules/sd_hijack.py | 19 ++++++++++--------- modules/shared.py | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'modules/sd_hijack.py') diff --git a/modules/sd_hijack.py b/modules/sd_hijack.py index f62e9adb..e98ae51a 100644 --- a/modules/sd_hijack.py +++ b/modules/sd_hijack.py @@ -37,20 +37,21 @@ def apply_optimizations(): optimization_method = None + can_use_sdp = hasattr(torch.nn.functional, "scaled_dot_product_attention") and callable(getattr(torch.nn.functional, "scaled_dot_product_attention")) # not everyone has torch 2.x to use sdp + if cmd_opts.force_enable_xformers or (cmd_opts.xformers and shared.xformers_available and torch.version.cuda and (6, 0) <= torch.cuda.get_device_capability(shared.device) <= (9, 0)): print("Applying xformers cross attention optimization.") ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.xformers_attention_forward ldm.modules.diffusionmodules.model.AttnBlock.forward = sd_hijack_optimizations.xformers_attnblock_forward optimization_method = 'xformers' - elif cmd_opts.opt_sdp_attention and (hasattr(torch.nn.functional, "scaled_dot_product_attention") and callable(getattr(torch.nn.functional, "scaled_dot_product_attention"))): - if cmd_opts.opt_sdp_no_mem_attention: - print("Applying scaled dot product cross attention optimization (without memory efficient attention).") - ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.scaled_dot_product_no_mem_attention_forward - optimization_method = 'sdp-no-mem' - else: - print("Applying scaled dot product cross attention optimization.") - ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.scaled_dot_product_attention_forward - optimization_method = 'sdp' + elif cmd_opts.opt_sdp_no_mem_attention and can_use_sdp: + print("Applying scaled dot product cross attention optimization (without memory efficient attention).") + ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.scaled_dot_product_no_mem_attention_forward + optimization_method = 'sdp-no-mem' + elif cmd_opts.opt_sdp_attention and can_use_sdp: + print("Applying scaled dot product cross attention optimization.") + ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.scaled_dot_product_attention_forward + optimization_method = 'sdp' elif cmd_opts.opt_sub_quad_attention: print("Applying sub-quadratic cross attention optimization.") ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.sub_quad_attention_forward diff --git a/modules/shared.py b/modules/shared.py index 4b81c591..66a6bfa5 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -70,7 +70,7 @@ parser.add_argument("--sub-quad-chunk-threshold", type=int, help="the percentage parser.add_argument("--opt-split-attention-invokeai", action='store_true', help="force-enables InvokeAI's cross-attention layer optimization. By default, it's on when cuda is unavailable.") parser.add_argument("--opt-split-attention-v1", action='store_true', help="enable older version of split attention optimization that does not consume all the VRAM it can find") parser.add_argument("--opt-sdp-attention", action='store_true', help="enable scaled dot product cross-attention layer optimization; requires PyTorch 2.*") -parser.add_argument("--opt-sdp-no-mem-attention", action='store_true', help="disables memory efficient sdp, makes image generation deterministic; requires --opt-sdp-attention") +parser.add_argument("--opt-sdp-no-mem-attention", action='store_true', help="enable scaled dot product cross-attention layer optimization without memory efficient attention, makes image generation deterministic; requires PyTorch 2.*") parser.add_argument("--disable-opt-split-attention", action='store_true', help="force-disables cross-attention layer optimization") parser.add_argument("--disable-nan-check", action='store_true', help="do not check if produced images/latent spaces have nans; useful for running without a checkpoint in CI") parser.add_argument("--use-cpu", nargs='+', help="use CPU as torch device for specified modules", default=[], type=str.lower) -- cgit v1.2.3 From 8d7fa2f67cb0554d8902d5d407166876020e067e Mon Sep 17 00:00:00 2001 From: Pam Date: Fri, 10 Mar 2023 22:48:41 +0500 Subject: sdp_attnblock_forward hijack --- modules/sd_hijack.py | 2 ++ modules/sd_hijack_optimizations.py | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'modules/sd_hijack.py') diff --git a/modules/sd_hijack.py b/modules/sd_hijack.py index e98ae51a..f4bb0266 100644 --- a/modules/sd_hijack.py +++ b/modules/sd_hijack.py @@ -47,10 +47,12 @@ def apply_optimizations(): elif cmd_opts.opt_sdp_no_mem_attention and can_use_sdp: print("Applying scaled dot product cross attention optimization (without memory efficient attention).") ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.scaled_dot_product_no_mem_attention_forward + ldm.modules.diffusionmodules.model.AttnBlock.forward = sd_hijack_optimizations.sdp_no_mem_attnblock_forward optimization_method = 'sdp-no-mem' elif cmd_opts.opt_sdp_attention and can_use_sdp: print("Applying scaled dot product cross attention optimization.") ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.scaled_dot_product_attention_forward + ldm.modules.diffusionmodules.model.AttnBlock.forward = sd_hijack_optimizations.sdp_attnblock_forward optimization_method = 'sdp' elif cmd_opts.opt_sub_quad_attention: print("Applying sub-quadratic cross attention optimization.") diff --git a/modules/sd_hijack_optimizations.py b/modules/sd_hijack_optimizations.py index 68b1dd84..2e307b5d 100644 --- a/modules/sd_hijack_optimizations.py +++ b/modules/sd_hijack_optimizations.py @@ -473,6 +473,30 @@ def xformers_attnblock_forward(self, x): except NotImplementedError: return cross_attention_attnblock_forward(self, x) +def sdp_attnblock_forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + b, c, h, w = q.shape + q, k, v = map(lambda t: rearrange(t, 'b c h w -> b (h w) c'), (q, k, v)) + dtype = q.dtype + if shared.opts.upcast_attn: + q, k = q.float(), k.float() + q = q.contiguous() + k = k.contiguous() + v = v.contiguous() + out = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=False) + out = out.to(dtype) + out = rearrange(out, 'b (h w) c -> b c h w', h=h) + out = self.proj_out(out) + return x + out + +def sdp_no_mem_attnblock_forward(self, x): + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=False): + return sdp_attnblock_forward(self, x) + def sub_quad_attnblock_forward(self, x): h_ = x h_ = self.norm(h_) -- cgit v1.2.3