From da464a3fb39ecc6ea7b22fe87271194480d8501c Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Wed, 12 Jul 2023 23:52:43 +0300 Subject: SDXL support --- modules/sd_hijack_clip.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'modules/sd_hijack_clip.py') diff --git a/modules/sd_hijack_clip.py b/modules/sd_hijack_clip.py index 3b5a7666..6c17a81d 100644 --- a/modules/sd_hijack_clip.py +++ b/modules/sd_hijack_clip.py @@ -42,6 +42,10 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): self.hijack: sd_hijack.StableDiffusionModelHijack = hijack self.chunk_length = 75 + self.is_trainable = getattr(wrapped, 'is_trainable', False) + self.input_key = getattr(wrapped, 'input_key', 'txt') + self.legacy_ucg_val = None + def empty_chunk(self): """creates an empty PromptChunk and returns it""" @@ -199,8 +203,9 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): """ Accepts an array of texts; Passes texts through transformers network to create a tensor with numerical representation of those texts. Returns a tensor with shape of (B, T, C), where B is length of the array; T is length, in tokens, of texts (including padding) - T will - be a multiple of 77; and C is dimensionality of each token - for SD1 it's 768, and for SD2 it's 1024. + be a multiple of 77; and C is dimensionality of each token - for SD1 it's 768, for SD2 it's 1024, and for SDXL it's 1280. An example shape returned by this function can be: (2, 77, 768). + For SDXL, instead of returning one tensor avobe, it returns a tuple with two: the other one with shape (B, 1280) with pooled values. Webui usually sends just one text at a time through this function - the only time when texts is an array with more than one elemenet is when you do prompt editing: "a picture of a [cat:dog:0.4] eating ice cream" """ @@ -233,7 +238,10 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): embeddings_list = ", ".join([f'{name} [{embedding.checksum()}]' for name, embedding in used_embeddings.items()]) self.hijack.comments.append(f"Used embeddings: {embeddings_list}") - return torch.hstack(zs) + if getattr(self.wrapped, 'return_pooled', False): + return torch.hstack(zs), zs[0].pooled + else: + return torch.hstack(zs) def process_tokens(self, remade_batch_tokens, batch_multipliers): """ @@ -256,9 +264,9 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): # restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise batch_multipliers = torch.asarray(batch_multipliers).to(devices.device) original_mean = z.mean() - z = z * batch_multipliers.reshape(batch_multipliers.shape + (1,)).expand(z.shape) + z *= batch_multipliers.reshape(batch_multipliers.shape + (1,)).expand(z.shape) new_mean = z.mean() - z = z * (original_mean / new_mean) + z *= (original_mean / new_mean) return z -- cgit v1.2.3 From 594c8e7b263d9b37f4b18b56b159aeb6d1bba1b4 Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Thu, 13 Jul 2023 11:35:52 +0300 Subject: fix CLIP doing the unneeded normalization revert SD2.1 back to use the original repo add SDXL's force_zero_embeddings to negative prompt --- modules/processing.py | 2 +- modules/prompt_parser.py | 14 ++++++++++---- modules/sd_hijack.py | 2 +- modules/sd_hijack_clip.py | 15 +++++++++++++++ modules/sd_models_config.py | 1 - modules/sd_models_xl.py | 3 ++- 6 files changed, 29 insertions(+), 8 deletions(-) (limited to 'modules/sd_hijack_clip.py') diff --git a/modules/processing.py b/modules/processing.py index 85d35423..f01a6907 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -344,7 +344,7 @@ class StableDiffusionProcessing: def setup_conds(self): prompts = prompt_parser.SdConditioning(self.prompts, width=self.width, height=self.height) - negative_prompts = prompt_parser.SdConditioning(self.negative_prompts, width=self.width, height=self.height) + negative_prompts = prompt_parser.SdConditioning(self.negative_prompts, width=self.width, height=self.height, is_negative_prompt=True) sampler_config = sd_samplers.find_sampler_config(self.sampler_name) self.step_multiplier = 2 if sampler_config and sampler_config.options.get("second_order", False) else 1 diff --git a/modules/prompt_parser.py b/modules/prompt_parser.py index 33810669..b29d079d 100644 --- a/modules/prompt_parser.py +++ b/modules/prompt_parser.py @@ -116,11 +116,17 @@ class SdConditioning(list): A list with prompts for stable diffusion's conditioner model. Can also specify width and height of created image - SDXL needs it. """ - def __init__(self, prompts, width=None, height=None): + def __init__(self, prompts, is_negative_prompt=False, width=None, height=None, copy_from=None): super().__init__() self.extend(prompts) - self.width = width or getattr(prompts, 'width', None) - self.height = height or getattr(prompts, 'height', None) + + if copy_from is None: + copy_from = prompts + + self.is_negative_prompt = is_negative_prompt or getattr(copy_from, 'is_negative_prompt', False) + self.width = width or getattr(copy_from, 'width', None) + self.height = height or getattr(copy_from, 'height', None) + def get_learned_conditioning(model, prompts: SdConditioning | list[str], steps): @@ -153,7 +159,7 @@ def get_learned_conditioning(model, prompts: SdConditioning | list[str], steps): res.append(cached) continue - texts = [x[1] for x in prompt_schedule] + texts = SdConditioning([x[1] for x in prompt_schedule], copy_from=prompts) conds = model.get_learned_conditioning(texts) cond_schedule = [] diff --git a/modules/sd_hijack.py b/modules/sd_hijack.py index 266811f9..647cdfbe 100644 --- a/modules/sd_hijack.py +++ b/modules/sd_hijack.py @@ -190,7 +190,7 @@ class StableDiffusionModelHijack: if typename == 'FrozenCLIPEmbedder': model_embeddings = m.cond_stage_model.transformer.text_model.embeddings model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.token_embedding, self) - m.cond_stage_model = sd_hijack_clip.FrozenCLIPEmbedderWithCustomWords(embedder, self) + m.cond_stage_model = sd_hijack_clip.FrozenCLIPEmbedderForSDXLWithCustomWords(embedder, self) conditioner.embedders[i] = m.cond_stage_model if typename == 'FrozenOpenCLIPEmbedder2': embedder.model.token_embedding = EmbeddingsWithFixes(embedder.model.token_embedding, self) diff --git a/modules/sd_hijack_clip.py b/modules/sd_hijack_clip.py index 6c17a81d..b3771909 100644 --- a/modules/sd_hijack_clip.py +++ b/modules/sd_hijack_clip.py @@ -323,3 +323,18 @@ class FrozenCLIPEmbedderWithCustomWords(FrozenCLIPEmbedderWithCustomWordsBase): embedded = embedding_layer.token_embedding.wrapped(ids.to(embedding_layer.token_embedding.wrapped.weight.device)).squeeze(0) return embedded + + +class FrozenCLIPEmbedderForSDXLWithCustomWords(FrozenCLIPEmbedderWithCustomWords): + def __init__(self, wrapped, hijack): + super().__init__(wrapped, hijack) + + def encode_with_transformers(self, tokens): + outputs = self.wrapped.transformer(input_ids=tokens, output_hidden_states=self.wrapped.layer == "hidden") + + if self.wrapped.layer == "last": + z = outputs.last_hidden_state + else: + z = outputs.hidden_states[self.wrapped.layer_idx] + + return z diff --git a/modules/sd_models_config.py b/modules/sd_models_config.py index 2e92479a..04c09ab0 100644 --- a/modules/sd_models_config.py +++ b/modules/sd_models_config.py @@ -12,7 +12,6 @@ sd_xl_repo_configs_path = os.path.join(paths.paths['Stable Diffusion XL'], "conf config_default = shared.sd_default_config config_sd2 = os.path.join(sd_repo_configs_path, "v2-inference.yaml") config_sd2v = os.path.join(sd_repo_configs_path, "v2-inference-v.yaml") -config_sd2v = os.path.join(sd_xl_repo_configs_path, "sd_2_1_768.yaml") config_sd2_inpainting = os.path.join(sd_repo_configs_path, "v2-inpainting-inference.yaml") config_sdxl = os.path.join(sd_xl_repo_configs_path, "sd_xl_base.yaml") config_depth_model = os.path.join(sd_repo_configs_path, "v2-midas-inference.yaml") diff --git a/modules/sd_models_xl.py b/modules/sd_models_xl.py index 1dd4459f..b799ff46 100644 --- a/modules/sd_models_xl.py +++ b/modules/sd_models_xl.py @@ -22,7 +22,8 @@ def get_learned_conditioning(self: sgm.models.diffusion.DiffusionEngine, batch: "target_size_as_tuple": torch.tensor([height, width]).repeat(len(batch), 1).to(devices.device, devices.dtype), } - c = self.conditioner(sdxl_conds) + force_zero_negative_prompt = getattr(batch, 'is_negative_prompt', False) and all(x == '' for x in batch) + c = self.conditioner(sdxl_conds, force_zero_embeddings=['txt'] if force_zero_negative_prompt else []) return c -- cgit v1.2.3 From 8284ebd94c4d4f7e7141ebb47af206faab66ffed Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Wed, 26 Jul 2023 13:03:52 +0300 Subject: fix autograd which i broke for no good reason when implementing SDXL --- modules/sd_hijack_clip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'modules/sd_hijack_clip.py') diff --git a/modules/sd_hijack_clip.py b/modules/sd_hijack_clip.py index 5443e609..990533fe 100644 --- a/modules/sd_hijack_clip.py +++ b/modules/sd_hijack_clip.py @@ -273,9 +273,9 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): # restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise batch_multipliers = torch.asarray(batch_multipliers).to(devices.device) original_mean = z.mean() - z *= batch_multipliers.reshape(batch_multipliers.shape + (1,)).expand(z.shape) + z = z * batch_multipliers.reshape(batch_multipliers.shape + (1,)).expand(z.shape) new_mean = z.mean() - z *= (original_mean / new_mean) + z = z * (original_mean / new_mean) return z -- cgit v1.2.3 From 89e6dfff717e2c2e57009fdabb0ed2b4f57b98a2 Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Wed, 26 Jul 2023 15:07:56 +0300 Subject: repair SDXL --- modules/sd_hijack_clip.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'modules/sd_hijack_clip.py') diff --git a/modules/sd_hijack_clip.py b/modules/sd_hijack_clip.py index 990533fe..16a5500e 100644 --- a/modules/sd_hijack_clip.py +++ b/modules/sd_hijack_clip.py @@ -270,6 +270,8 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): z = self.encode_with_transformers(tokens) + pooled = getattr(z, 'pooled', None) + # restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise batch_multipliers = torch.asarray(batch_multipliers).to(devices.device) original_mean = z.mean() @@ -277,6 +279,9 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): new_mean = z.mean() z = z * (original_mean / new_mean) + if pooled is not None: + z.pooled = pooled + return z -- cgit v1.2.3 From 6f0abbb71a3f29d6df63fed82d5d5e196ca0d4de Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Sat, 29 Jul 2023 15:15:06 +0300 Subject: textual inversion support for SDXL --- modules/sd_hijack.py | 8 +++++--- modules/sd_hijack_clip.py | 2 +- modules/sd_models_xl.py | 9 +++++++++ modules/textual_inversion/textual_inversion.py | 19 ++++++++++++++----- 4 files changed, 29 insertions(+), 9 deletions(-) (limited to 'modules/sd_hijack_clip.py') diff --git a/modules/sd_hijack.py b/modules/sd_hijack.py index c8fdd4f1..cfa5f0eb 100644 --- a/modules/sd_hijack.py +++ b/modules/sd_hijack.py @@ -197,7 +197,7 @@ class StableDiffusionModelHijack: conditioner.embedders[i] = sd_hijack_clip.FrozenCLIPEmbedderForSDXLWithCustomWords(embedder, self) text_cond_models.append(conditioner.embedders[i]) if typename == 'FrozenOpenCLIPEmbedder2': - embedder.model.token_embedding = EmbeddingsWithFixes(embedder.model.token_embedding, self) + embedder.model.token_embedding = EmbeddingsWithFixes(embedder.model.token_embedding, self, textual_inversion_key='clip_g') conditioner.embedders[i] = sd_hijack_open_clip.FrozenOpenCLIPEmbedder2WithCustomWords(embedder, self) text_cond_models.append(conditioner.embedders[i]) @@ -292,10 +292,11 @@ class StableDiffusionModelHijack: class EmbeddingsWithFixes(torch.nn.Module): - def __init__(self, wrapped, embeddings): + def __init__(self, wrapped, embeddings, textual_inversion_key='clip_l'): super().__init__() self.wrapped = wrapped self.embeddings = embeddings + self.textual_inversion_key = textual_inversion_key def forward(self, input_ids): batch_fixes = self.embeddings.fixes @@ -309,7 +310,8 @@ class EmbeddingsWithFixes(torch.nn.Module): vecs = [] for fixes, tensor in zip(batch_fixes, inputs_embeds): for offset, embedding in fixes: - emb = devices.cond_cast_unet(embedding.vec) + vec = embedding.vec[self.textual_inversion_key] if isinstance(embedding.vec, dict) else embedding.vec + emb = devices.cond_cast_unet(vec) emb_len = min(tensor.shape[0] - offset - 1, emb.shape[0]) tensor = torch.cat([tensor[0:offset + 1], emb[0:emb_len], tensor[offset + 1 + emb_len:]]) diff --git a/modules/sd_hijack_clip.py b/modules/sd_hijack_clip.py index 16a5500e..2f9d569b 100644 --- a/modules/sd_hijack_clip.py +++ b/modules/sd_hijack_clip.py @@ -161,7 +161,7 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): position += 1 continue - emb_len = int(embedding.vec.shape[0]) + emb_len = int(embedding.vectors) if len(chunk.tokens) + emb_len > self.chunk_length: next_chunk() diff --git a/modules/sd_models_xl.py b/modules/sd_models_xl.py index 40559208..bc219508 100644 --- a/modules/sd_models_xl.py +++ b/modules/sd_models_xl.py @@ -56,6 +56,14 @@ def encode_embedding_init_text(self: sgm.modules.GeneralConditioner, init_text, return torch.cat(res, dim=1) +def tokenize(self: sgm.modules.GeneralConditioner, texts): + for embedder in [embedder for embedder in self.embedders if hasattr(embedder, 'tokenize')]: + return embedder.tokenize(texts) + + raise AssertionError('no tokenizer available') + + + def process_texts(self, texts): for embedder in [embedder for embedder in self.embedders if hasattr(embedder, 'process_texts')]: return embedder.process_texts(texts) @@ -68,6 +76,7 @@ def get_target_prompt_token_count(self, token_count): # those additions to GeneralConditioner make it possible to use it as model.cond_stage_model from SD1.5 in exist sgm.modules.GeneralConditioner.encode_embedding_init_text = encode_embedding_init_text +sgm.modules.GeneralConditioner.tokenize = tokenize sgm.modules.GeneralConditioner.process_texts = process_texts sgm.modules.GeneralConditioner.get_target_prompt_token_count = get_target_prompt_token_count diff --git a/modules/textual_inversion/textual_inversion.py b/modules/textual_inversion/textual_inversion.py index 6166c76f..4713bc2d 100644 --- a/modules/textual_inversion/textual_inversion.py +++ b/modules/textual_inversion/textual_inversion.py @@ -181,29 +181,38 @@ class EmbeddingDatabase: else: return + # textual inversion embeddings if 'string_to_param' in data: param_dict = data['string_to_param'] param_dict = getattr(param_dict, '_parameters', param_dict) # fix for torch 1.12.1 loading saved file from torch 1.11 assert len(param_dict) == 1, 'embedding file has multiple terms in it' emb = next(iter(param_dict.items()))[1] - # diffuser concepts - elif type(data) == dict and type(next(iter(data.values()))) == torch.Tensor: + vec = emb.detach().to(devices.device, dtype=torch.float32) + shape = vec.shape[-1] + vectors = vec.shape[0] + elif type(data) == dict and 'clip_g' in data and 'clip_l' in data: # SDXL embedding + vec = {k: v.detach().to(devices.device, dtype=torch.float32) for k, v in data.items()} + shape = data['clip_g'].shape[-1] + data['clip_l'].shape[-1] + vectors = data['clip_g'].shape[0] + elif type(data) == dict and type(next(iter(data.values()))) == torch.Tensor: # diffuser concepts assert len(data.keys()) == 1, 'embedding file has multiple terms in it' emb = next(iter(data.values())) if len(emb.shape) == 1: emb = emb.unsqueeze(0) + vec = emb.detach().to(devices.device, dtype=torch.float32) + shape = vec.shape[-1] + vectors = vec.shape[0] else: raise Exception(f"Couldn't identify {filename} as neither textual inversion embedding nor diffuser concept.") - vec = emb.detach().to(devices.device, dtype=torch.float32) embedding = Embedding(vec, name) embedding.step = data.get('step', None) embedding.sd_checkpoint = data.get('sd_checkpoint', None) embedding.sd_checkpoint_name = data.get('sd_checkpoint_name', None) - embedding.vectors = vec.shape[0] - embedding.shape = vec.shape[-1] + embedding.vectors = vectors + embedding.shape = shape embedding.filename = path embedding.set_hash(hashes.sha256(embedding.filename, "textual_inversion/" + name) or '') -- cgit v1.2.3 From f56a309432996936a41040ea723d76a8cb488f87 Mon Sep 17 00:00:00 2001 From: w-e-w <40751091+w-e-w@users.noreply.github.com> Date: Thu, 3 Aug 2023 10:25:49 +0900 Subject: fix missing TI hash --- modules/sd_hijack_clip.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'modules/sd_hijack_clip.py') diff --git a/modules/sd_hijack_clip.py b/modules/sd_hijack_clip.py index 2f9d569b..8f29057a 100644 --- a/modules/sd_hijack_clip.py +++ b/modules/sd_hijack_clip.py @@ -245,6 +245,8 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module): hashes.append(f"{name}: {shorthash}") if hashes: + if self.hijack.extra_generation_params.get("TI hashes"): + hashes.append(self.hijack.extra_generation_params.get("TI hashes")) self.hijack.extra_generation_params["TI hashes"] = ", ".join(hashes) if getattr(self.wrapped, 'return_pooled', False): -- cgit v1.2.3