author     MalumaDev <piano.lu92@gmail.com>    2022-10-16 15:53:56 +0000
committer  MalumaDev <piano.lu92@gmail.com>    2022-10-16 15:53:56 +0000
commit     9324cdaa3199d65c182858785dd1eca42b192b8e (patch)
tree       47dc89324b5b4528a1744e269d82a5106bfa4e60
parent     e4f8b5f00dd33b7547cc6b76fbed26bb83b37a64 (diff)
UI fix, reorganization of the code
-rw-r--r--  modules/aesthetic_clip.py            154
-rw-r--r--  modules/img2img.py                    14
-rw-r--r--  modules/processing.py                 29
-rw-r--r--  modules/sd_hijack.py                 102
-rw-r--r--  modules/sd_models.py                   5
-rw-r--r--  modules/shared.py                     14
-rw-r--r--  modules/textual_inversion/dataset.py   2
-rw-r--r--  modules/txt2img.py                    18
-rw-r--r--  modules/ui.py                         52
9 files changed, 233 insertions(+), 157 deletions(-)
diff --git a/modules/aesthetic_clip.py b/modules/aesthetic_clip.py
index ccb35c73..34efa931 100644
--- a/modules/aesthetic_clip.py
+++ b/modules/aesthetic_clip.py
@@ -1,3 +1,4 @@
+import copy
import itertools
import os
from pathlib import Path
@@ -7,11 +8,12 @@ import gc
import gradio as gr
import torch
from PIL import Image
-from modules import shared
-from modules.shared import device
-from transformers import CLIPModel, CLIPProcessor
+from torch import optim
-from tqdm.auto import tqdm
+from modules import shared
+from transformers import CLIPModel, CLIPProcessor, CLIPTokenizer
+from tqdm.auto import tqdm, trange
+from modules.shared import opts, device
def get_all_images_in_folder(folder):
@@ -37,12 +39,39 @@ def iter_to_batched(iterable, n=1):
yield chunk
+def create_ui():
+ with gr.Group():
+ with gr.Accordion("Open for Clip Aesthetic!", open=False):
+ with gr.Row():
+ aesthetic_weight = gr.Slider(minimum=0, maximum=1, step=0.01, label="Aesthetic weight",
+ value=0.9)
+ aesthetic_steps = gr.Slider(minimum=0, maximum=50, step=1, label="Aesthetic steps", value=5)
+
+ with gr.Row():
+ aesthetic_lr = gr.Textbox(label='Aesthetic learning rate',
+ placeholder="Aesthetic learning rate", value="0.0001")
+ aesthetic_slerp = gr.Checkbox(label="Slerp interpolation", value=False)
+ aesthetic_imgs = gr.Dropdown(sorted(shared.aesthetic_embeddings.keys()),
+ label="Aesthetic imgs embedding",
+ value="None")
+
+ with gr.Row():
+ aesthetic_imgs_text = gr.Textbox(label='Aesthetic text for imgs',
+ placeholder="This text is used to rotate the feature space of the imgs embs",
+ value="")
+ aesthetic_slerp_angle = gr.Slider(label='Slerp angle', minimum=0, maximum=1, step=0.01,
+ value=0.1)
+ aesthetic_text_negative = gr.Checkbox(label="Is negative text", value=False)
+
+ return aesthetic_weight, aesthetic_steps, aesthetic_lr, aesthetic_slerp, aesthetic_imgs, aesthetic_imgs_text, aesthetic_slerp_angle, aesthetic_text_negative
+
+
def generate_imgs_embd(name, folder, batch_size):
# clipModel = CLIPModel.from_pretrained(
# shared.sd_model.cond_stage_model.clipModel.name_or_path
# )
- model = CLIPModel.from_pretrained(shared.sd_model.cond_stage_model.clipModel.name_or_path).to(device)
- processor = CLIPProcessor.from_pretrained(shared.sd_model.cond_stage_model.clipModel.name_or_path)
+ model = shared.clip_model.to(device)
+ processor = CLIPProcessor.from_pretrained(model.name_or_path)
with torch.no_grad():
embs = []
@@ -63,7 +92,6 @@ def generate_imgs_embd(name, folder, batch_size):
torch.save(embs, path)
model = model.cpu()
- del model
del processor
del embs
gc.collect()
@@ -74,4 +102,114 @@ def generate_imgs_embd(name, folder, batch_size):
"""
shared.update_aesthetic_embeddings()
return gr.Dropdown.update(choices=sorted(shared.aesthetic_embeddings.keys()), label="Imgs embedding",
- value="None"), res, ""
+ value="None"), \
+ gr.Dropdown.update(choices=sorted(shared.aesthetic_embeddings.keys()),
+ label="Imgs embedding",
+ value="None"), res, ""
+
+
+def slerp(low, high, val):
+ low_norm = low / torch.norm(low, dim=1, keepdim=True)
+ high_norm = high / torch.norm(high, dim=1, keepdim=True)
+ omega = torch.acos((low_norm * high_norm).sum(1))
+ so = torch.sin(omega)
+ res = (torch.sin((1.0 - val) * omega) / so).unsqueeze(1) * low + (torch.sin(val * omega) / so).unsqueeze(1) * high
+ return res
+
+
+class AestheticCLIP:
+ def __init__(self):
+ self.skip = False
+ self.aesthetic_steps = 0
+ self.aesthetic_weight = 0
+ self.aesthetic_lr = 0
+ self.slerp = False
+ self.aesthetic_text_negative = ""
+ self.aesthetic_slerp_angle = 0
+ self.aesthetic_imgs_text = ""
+
+ self.image_embs_name = None
+ self.image_embs = None
+ self.load_image_embs(None)
+
+ def set_aesthetic_params(self, aesthetic_lr=0, aesthetic_weight=0, aesthetic_steps=0, image_embs_name=None,
+ aesthetic_slerp=True, aesthetic_imgs_text="",
+ aesthetic_slerp_angle=0.15,
+ aesthetic_text_negative=False):
+ self.aesthetic_imgs_text = aesthetic_imgs_text
+ self.aesthetic_slerp_angle = aesthetic_slerp_angle
+ self.aesthetic_text_negative = aesthetic_text_negative
+ self.slerp = aesthetic_slerp
+ self.aesthetic_lr = aesthetic_lr
+ self.aesthetic_weight = aesthetic_weight
+ self.aesthetic_steps = aesthetic_steps
+ self.load_image_embs(image_embs_name)
+
+ def set_skip(self, skip):
+ self.skip = skip
+
+ def load_image_embs(self, image_embs_name):
+ if image_embs_name is None or len(image_embs_name) == 0 or image_embs_name == "None":
+ image_embs_name = None
+ self.image_embs_name = None
+ if image_embs_name is not None and self.image_embs_name != image_embs_name:
+ self.image_embs_name = image_embs_name
+ self.image_embs = torch.load(shared.aesthetic_embeddings[self.image_embs_name], map_location=device)
+ self.image_embs /= self.image_embs.norm(dim=-1, keepdim=True)
+ self.image_embs.requires_grad_(False)
+
+ def __call__(self, z, remade_batch_tokens):
+ if not self.skip and self.aesthetic_steps != 0 and self.aesthetic_lr != 0 and self.aesthetic_weight != 0 and self.image_embs_name is not None:
+ tokenizer = shared.sd_model.cond_stage_model.tokenizer
+ if not opts.use_old_emphasis_implementation:
+ remade_batch_tokens = [
+ [tokenizer.bos_token_id] + x[:75] + [tokenizer.eos_token_id] for x in
+ remade_batch_tokens]
+
+ tokens = torch.asarray(remade_batch_tokens).to(device)
+
+ model = copy.deepcopy(shared.clip_model).to(device)
+ model.requires_grad_(True)
+ if self.aesthetic_imgs_text is not None and len(self.aesthetic_imgs_text) > 0:
+ text_embs_2 = model.get_text_features(
+ **tokenizer([self.aesthetic_imgs_text], padding=True, return_tensors="pt").to(device))
+ if self.aesthetic_text_negative:
+ text_embs_2 = self.image_embs - text_embs_2
+ text_embs_2 /= text_embs_2.norm(dim=-1, keepdim=True)
+ img_embs = slerp(self.image_embs, text_embs_2, self.aesthetic_slerp_angle)
+ else:
+ img_embs = self.image_embs
+
+ with torch.enable_grad():
+
+ # We optimize the model to maximize the similarity
+ optimizer = optim.Adam(
+ model.text_model.parameters(), lr=self.aesthetic_lr
+ )
+
+ for _ in trange(self.aesthetic_steps, desc="Aesthetic optimization"):
+ text_embs = model.get_text_features(input_ids=tokens)
+ text_embs = text_embs / text_embs.norm(dim=-1, keepdim=True)
+ sim = text_embs @ img_embs.T
+ loss = -sim
+ optimizer.zero_grad()
+ loss.mean().backward()
+ optimizer.step()
+
+ zn = model.text_model(input_ids=tokens, output_hidden_states=-opts.CLIP_stop_at_last_layers)
+ if opts.CLIP_stop_at_last_layers > 1:
+ zn = zn.hidden_states[-opts.CLIP_stop_at_last_layers]
+ zn = model.text_model.final_layer_norm(zn)
+ else:
+ zn = zn.last_hidden_state
+ model.cpu()
+ del model
+ gc.collect()
+ torch.cuda.empty_cache()
+ zn = torch.concat([zn[77 * i:77 * (i + 1)] for i in range(max(z.shape[1] // 77, 1))], 1)
+ if self.slerp:
+ z = slerp(z, zn, self.aesthetic_weight)
+ else:
+ z = z * (1 - self.aesthetic_weight) + zn * self.aesthetic_weight
+
+ return z
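The aesthetic_clip.py changes above move both the embedding blend (slerp) and the short per-prompt CLIP fine-tune into AestheticCLIP.__call__: when steps, learning rate and weight are all non-zero and an image embedding is selected, a deep copy of shared.clip_model is trained for a few Adam steps to maximize similarity between the prompt's text features and the stored image embedding, and the resulting hidden states are blended back into the conditioning. A minimal sketch of just the blending step, on dummy 2D tensors (shapes and the weight value are illustrative assumptions; the real call blends the 77-token conditioning tensors):

import torch

def slerp(low, high, val):
    # Same spherical interpolation as in modules/aesthetic_clip.py above.
    low_norm = low / torch.norm(low, dim=1, keepdim=True)
    high_norm = high / torch.norm(high, dim=1, keepdim=True)
    omega = torch.acos((low_norm * high_norm).sum(1))
    so = torch.sin(omega)
    return (torch.sin((1.0 - val) * omega) / so).unsqueeze(1) * low \
         + (torch.sin(val * omega) / so).unsqueeze(1) * high

z = torch.randn(2, 768)    # stand-in for the original prompt conditioning
zn = torch.randn(2, 768)   # stand-in for the aesthetically optimized conditioning
weight = 0.9               # "Aesthetic weight" slider value

z_slerp = slerp(z, zn, weight)             # "Slerp interpolation" checked
z_lerp = z * (1 - weight) + zn * weight    # default linear blend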
diff --git a/modules/img2img.py b/modules/img2img.py
index 24126774..4ed80c4b 100644
--- a/modules/img2img.py
+++ b/modules/img2img.py
@@ -56,7 +56,14 @@ def process_batch(p, input_dir, output_dir, args):
processed_image.save(os.path.join(output_dir, filename))
-def img2img(mode: int, prompt: str, negative_prompt: str, prompt_style: str, prompt_style2: str, init_img, init_img_with_mask, init_img_inpaint, init_mask_inpaint, mask_mode, steps: int, sampler_index: int, mask_blur: int, inpainting_fill: int, restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, denoising_strength: float, seed: int, subseed: int, subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool, height: int, width: int, resize_mode: int, inpaint_full_res: bool, inpaint_full_res_padding: int, inpainting_mask_invert: int, img2img_batch_input_dir: str, img2img_batch_output_dir: str, *args):
+def img2img(mode: int, prompt: str, negative_prompt: str, prompt_style: str, prompt_style2: str, init_img, init_img_with_mask, init_img_inpaint, init_mask_inpaint, mask_mode, steps: int, sampler_index: int, mask_blur: int, inpainting_fill: int, restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, denoising_strength: float, seed: int, subseed: int, subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool, height: int, width: int, resize_mode: int, inpaint_full_res: bool, inpaint_full_res_padding: int, inpainting_mask_invert: int, img2img_batch_input_dir: str, img2img_batch_output_dir: str,
+ aesthetic_lr=0,
+ aesthetic_weight=0, aesthetic_steps=0,
+ aesthetic_imgs=None,
+ aesthetic_slerp=False,
+ aesthetic_imgs_text="",
+ aesthetic_slerp_angle=0.15,
+ aesthetic_text_negative=False, *args):
is_inpaint = mode == 1
is_batch = mode == 2
@@ -109,6 +116,11 @@ def img2img(mode: int, prompt: str, negative_prompt: str, prompt_style: str, pro
inpainting_mask_invert=inpainting_mask_invert,
)
+ shared.aesthetic_clip.set_aesthetic_params(float(aesthetic_lr), float(aesthetic_weight), int(aesthetic_steps),
+ aesthetic_imgs, aesthetic_slerp, aesthetic_imgs_text,
+ aesthetic_slerp_angle,
+ aesthetic_text_negative)
+
if shared.cmd_opts.enable_console_prompts:
print(f"\nimg2img: {prompt}", file=shared.progress_print_out)
diff --git a/modules/processing.py b/modules/processing.py
index 1db26c3e..685f9fcd 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -146,7 +146,8 @@ class Processed:
self.prompt = self.prompt if type(self.prompt) != list else self.prompt[0]
self.negative_prompt = self.negative_prompt if type(self.negative_prompt) != list else self.negative_prompt[0]
self.seed = int(self.seed if type(self.seed) != list else self.seed[0]) if self.seed is not None else -1
- self.subseed = int(self.subseed if type(self.subseed) != list else self.subseed[0]) if self.subseed is not None else -1
+ self.subseed = int(
+ self.subseed if type(self.subseed) != list else self.subseed[0]) if self.subseed is not None else -1
self.all_prompts = all_prompts or [self.prompt]
self.all_seeds = all_seeds or [self.seed]
@@ -332,16 +333,9 @@ def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments, iteration
return f"{all_prompts[index]}{negative_prompt_text}\n{generation_params_text}".strip()
-def process_images(p: StableDiffusionProcessing, aesthetic_lr=0, aesthetic_weight=0, aesthetic_steps=0,
- aesthetic_imgs=None, aesthetic_slerp=False, aesthetic_imgs_text="",
- aesthetic_slerp_angle=0.15,
- aesthetic_text_negative=False) -> Processed:
+def process_images(p: StableDiffusionProcessing) -> Processed:
"""this is the main loop that both txt2img and img2img use; it calls func_init once inside all the scopes and func_sample once per batch"""
- aesthetic_lr = float(aesthetic_lr)
- aesthetic_weight = float(aesthetic_weight)
- aesthetic_steps = int(aesthetic_steps)
-
if type(p.prompt) == list:
assert (len(p.prompt) > 0)
else:
@@ -417,16 +411,10 @@ def process_images(p: StableDiffusionProcessing, aesthetic_lr=0, aesthetic_weigh
# uc = p.sd_model.get_learned_conditioning(len(prompts) * [p.negative_prompt])
# c = p.sd_model.get_learned_conditioning(prompts)
with devices.autocast():
- if hasattr(shared.sd_model.cond_stage_model, "set_aesthetic_params"):
- shared.sd_model.cond_stage_model.set_aesthetic_params()
+ shared.aesthetic_clip.set_skip(True)
uc = prompt_parser.get_learned_conditioning(shared.sd_model, len(prompts) * [p.negative_prompt],
p.steps)
- if hasattr(shared.sd_model.cond_stage_model, "set_aesthetic_params"):
- shared.sd_model.cond_stage_model.set_aesthetic_params(aesthetic_lr, aesthetic_weight,
- aesthetic_steps, aesthetic_imgs,
- aesthetic_slerp, aesthetic_imgs_text,
- aesthetic_slerp_angle,
- aesthetic_text_negative)
+ shared.aesthetic_clip.set_skip(False)
c = prompt_parser.get_multicond_learned_conditioning(shared.sd_model, prompts, p.steps)
if len(model_hijack.comments) > 0:
@@ -582,7 +570,6 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
self.truncate_x = int(self.firstphase_width - firstphase_width_truncated) // opt_f
self.truncate_y = int(self.firstphase_height - firstphase_height_truncated) // opt_f
-
def sample(self, conditioning, unconditional_conditioning, seeds, subseeds, subseed_strength):
self.sampler = sd_samplers.create_sampler_with_index(sd_samplers.samplers, self.sampler_index, self.sd_model)
@@ -600,10 +587,12 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
seed_resize_from_w=self.seed_resize_from_w, p=self)
samples = self.sampler.sample(self, x, conditioning, unconditional_conditioning)
- samples = samples[:, :, self.truncate_y//2:samples.shape[2]-self.truncate_y//2, self.truncate_x//2:samples.shape[3]-self.truncate_x//2]
+ samples = samples[:, :, self.truncate_y // 2:samples.shape[2] - self.truncate_y // 2,
+ self.truncate_x // 2:samples.shape[3] - self.truncate_x // 2]
if opts.use_scale_latent_for_hires_fix:
- samples = torch.nn.functional.interpolate(samples, size=(self.height // opt_f, self.width // opt_f), mode="bilinear")
+ samples = torch.nn.functional.interpolate(samples, size=(self.height // opt_f, self.width // opt_f),
+ mode="bilinear")
else:
decoded_samples = decode_first_stage(self.sd_model, samples)
lowres_samples = torch.clamp((decoded_samples + 1.0) / 2.0, min=0.0, max=1.0)
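With this change process_images no longer receives the aesthetic arguments; it only toggles the shared singleton so the negative-prompt conditioning is computed with the optimization skipped and the positive conditioning with it active. A minimal sketch of that pattern as a context manager (aesthetic_skipped and get_cond are hypothetical names, not part of the codebase):

import contextlib

@contextlib.contextmanager
def aesthetic_skipped(aesthetic_clip):
    # Temporarily disable the aesthetic optimization, mirroring the
    # set_skip(True)/set_skip(False) pair in process_images above.
    aesthetic_clip.set_skip(True)
    try:
        yield
    finally:
        aesthetic_clip.set_skip(False)

# usage sketch (stand-in names):
# with aesthetic_skipped(shared.aesthetic_clip):
#     uc = get_cond(negative_prompts)   # negative prompt: never optimized
# c = get_cond(prompts)                 # positive prompt: optimized inside AestheticCLIP.__call__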
diff --git a/modules/sd_hijack.py b/modules/sd_hijack.py
index 5d0590af..227e7670 100644
--- a/modules/sd_hijack.py
+++ b/modules/sd_hijack.py
@@ -29,8 +29,8 @@ def apply_optimizations():
ldm.modules.diffusionmodules.model.nonlinearity = silu
-
- if cmd_opts.force_enable_xformers or (cmd_opts.xformers and shared.xformers_available and torch.version.cuda and (6, 0) <= torch.cuda.get_device_capability(shared.device) <= (9, 0)):
+ if cmd_opts.force_enable_xformers or (cmd_opts.xformers and shared.xformers_available and torch.version.cuda and (
+ 6, 0) <= torch.cuda.get_device_capability(shared.device) <= (9, 0)):
print("Applying xformers cross attention optimization.")
ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.xformers_attention_forward
ldm.modules.diffusionmodules.model.AttnBlock.forward = sd_hijack_optimizations.xformers_attnblock_forward
@@ -118,33 +118,14 @@ class StableDiffusionModelHijack:
return remade_batch_tokens[0], token_count, get_target_prompt_token_count(token_count)
-def slerp(low, high, val):
- low_norm = low / torch.norm(low, dim=1, keepdim=True)
- high_norm = high / torch.norm(high, dim=1, keepdim=True)
- omega = torch.acos((low_norm * high_norm).sum(1))
- so = torch.sin(omega)
- res = (torch.sin((1.0 - val) * omega) / so).unsqueeze(1) * low + (torch.sin(val * omega) / so).unsqueeze(1) * high
- return res
-
-
class FrozenCLIPEmbedderWithCustomWords(torch.nn.Module):
def __init__(self, wrapped, hijack):
super().__init__()
self.wrapped = wrapped
- self.clipModel = CLIPModel.from_pretrained(
- self.wrapped.transformer.name_or_path
- )
- del self.clipModel.vision_model
- self.tokenizer = CLIPTokenizer.from_pretrained(self.wrapped.transformer.name_or_path)
- self.hijack: StableDiffusionModelHijack = hijack
- self.tokenizer = wrapped.tokenizer
- # self.vision = CLIPVisionModel.from_pretrained(self.wrapped.transformer.name_or_path).eval()
- self.image_embs_name = None
- self.image_embs = None
- self.load_image_embs(None)
self.token_mults = {}
-
+ self.hijack: StableDiffusionModelHijack = hijack
+ self.tokenizer = wrapped.tokenizer
self.comma_token = [v for k, v in self.tokenizer.get_vocab().items() if k == ',</w>'][0]
tokens_with_parens = [(k, v) for k, v in self.tokenizer.get_vocab().items() if
@@ -164,28 +145,6 @@ class FrozenCLIPEmbedderWithCustomWords(torch.nn.Module):
if mult != 1.0:
self.token_mults[ident] = mult
- def set_aesthetic_params(self, aesthetic_lr=0, aesthetic_weight=0, aesthetic_steps=0, image_embs_name=None,
- aesthetic_slerp=True, aesthetic_imgs_text="",
- aesthetic_slerp_angle=0.15,
- aesthetic_text_negative=False):
- self.aesthetic_imgs_text = aesthetic_imgs_text
- self.aesthetic_slerp_angle = aesthetic_slerp_angle
- self.aesthetic_text_negative = aesthetic_text_negative
- self.slerp = aesthetic_slerp
- self.aesthetic_lr = aesthetic_lr
- self.aesthetic_weight = aesthetic_weight
- self.aesthetic_steps = aesthetic_steps
- self.load_image_embs(image_embs_name)
-
- def load_image_embs(self, image_embs_name):
- if image_embs_name is None or len(image_embs_name) == 0 or image_embs_name == "None":
- image_embs_name = None
- if image_embs_name is not None and self.image_embs_name != image_embs_name:
- self.image_embs_name = image_embs_name
- self.image_embs = torch.load(shared.aesthetic_embeddings[self.image_embs_name], map_location=device)
- self.image_embs /= self.image_embs.norm(dim=-1, keepdim=True)
- self.image_embs.requires_grad_(False)
-
def tokenize_line(self, line, used_custom_terms, hijack_comments):
id_end = self.wrapped.tokenizer.eos_token_id
@@ -391,58 +350,7 @@ class FrozenCLIPEmbedderWithCustomWords(torch.nn.Module):
z1 = self.process_tokens(tokens, multipliers)
z = z1 if z is None else torch.cat((z, z1), axis=-2)
-
- if self.aesthetic_steps != 0 and self.aesthetic_lr != 0 and self.aesthetic_weight != 0 and self.image_embs_name != None:
- if not opts.use_old_emphasis_implementation:
- remade_batch_tokens = [
- [self.wrapped.tokenizer.bos_token_id] + x[:75] + [self.wrapped.tokenizer.eos_token_id] for x in
- remade_batch_tokens]
-
- tokens = torch.asarray(remade_batch_tokens).to(device)
-
- model = copy.deepcopy(self.clipModel).to(device)
- model.requires_grad_(True)
- if self.aesthetic_imgs_text is not None and len(self.aesthetic_imgs_text) > 0:
- text_embs_2 = model.get_text_features(
- **self.tokenizer([self.aesthetic_imgs_text], padding=True, return_tensors="pt").to(device))
- if self.aesthetic_text_negative:
- text_embs_2 = self.image_embs - text_embs_2
- text_embs_2 /= text_embs_2.norm(dim=-1, keepdim=True)
- img_embs = slerp(self.image_embs, text_embs_2, self.aesthetic_slerp_angle)
- else:
- img_embs = self.image_embs
-
- with torch.enable_grad():
-
- # We optimize the model to maximize the similarity
- optimizer = optim.Adam(
- model.text_model.parameters(), lr=self.aesthetic_lr
- )
-
- for i in trange(self.aesthetic_steps, desc="Aesthetic optimization"):
- text_embs = model.get_text_features(input_ids=tokens)
- text_embs = text_embs / text_embs.norm(dim=-1, keepdim=True)
- sim = text_embs @ img_embs.T
- loss = -sim
- optimizer.zero_grad()
- loss.mean().backward()
- optimizer.step()
-
- zn = model.text_model(input_ids=tokens, output_hidden_states=-opts.CLIP_stop_at_last_layers)
- if opts.CLIP_stop_at_last_layers > 1:
- zn = zn.hidden_states[-opts.CLIP_stop_at_last_layers]
- zn = model.text_model.final_layer_norm(zn)
- else:
- zn = zn.last_hidden_state
- model.cpu()
- del model
-
- zn = torch.concat([zn for i in range(z.shape[1] // 77)], 1)
- if self.slerp:
- z = slerp(z, zn, self.aesthetic_weight)
- else:
- z = z * (1 - self.aesthetic_weight) + zn * self.aesthetic_weight
-
+ z = shared.aesthetic_clip(z, remade_batch_tokens)
remade_batch_tokens = rem_tokens
batch_multipliers = rem_multipliers
i += 1
diff --git a/modules/sd_models.py b/modules/sd_models.py
index 3aa21ec1..8e4ee435 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -20,7 +20,7 @@ checkpoints_loaded = collections.OrderedDict()
try:
# this silences the annoying "Some weights of the model checkpoint were not used when initializing..." message at start.
- from transformers import logging
+ from transformers import logging, CLIPModel
logging.set_verbosity_error()
except Exception:
@@ -196,6 +196,9 @@ def load_model():
sd_hijack.model_hijack.hijack(sd_model)
+ if shared.clip_model is None or shared.clip_model.transformer.name_or_path != sd_model.cond_stage_model.wrapped.transformer.name_or_path:
+ shared.clip_model = CLIPModel.from_pretrained(sd_model.cond_stage_model.wrapped.transformer.name_or_path)
+
sd_model.eval()
print(f"Model loaded.")
diff --git a/modules/shared.py b/modules/shared.py
index e2c98b2d..e19ca779 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -3,6 +3,7 @@ import datetime
import json
import os
import sys
+from collections import OrderedDict
import gradio as gr
import tqdm
@@ -94,15 +95,15 @@ os.makedirs(cmd_opts.hypernetwork_dir, exist_ok=True)
hypernetworks = hypernetwork.list_hypernetworks(cmd_opts.hypernetwork_dir)
loaded_hypernetwork = None
-aesthetic_embeddings = {f.replace(".pt",""): os.path.join(cmd_opts.aesthetic_embeddings_dir, f) for f in
- os.listdir(cmd_opts.aesthetic_embeddings_dir) if f.endswith(".pt")}
-aesthetic_embeddings = aesthetic_embeddings | {"None": None}
+aesthetic_embeddings = {}
def update_aesthetic_embeddings():
global aesthetic_embeddings
aesthetic_embeddings = {f.replace(".pt",""): os.path.join(cmd_opts.aesthetic_embeddings_dir, f) for f in
os.listdir(cmd_opts.aesthetic_embeddings_dir) if f.endswith(".pt")}
- aesthetic_embeddings = aesthetic_embeddings | {"None": None}
+ aesthetic_embeddings = OrderedDict(**{"None": None}, **aesthetic_embeddings)
+
+update_aesthetic_embeddings()
def reload_hypernetworks():
global hypernetworks
@@ -381,6 +382,11 @@ sd_upscalers = []
sd_model = None
+clip_model = None
+
+from modules.aesthetic_clip import AestheticCLIP
+aesthetic_clip = AestheticCLIP()
+
progress_print_out = sys.stdout
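shared.py now starts from an empty mapping and calls update_aesthetic_embeddings() at import time, so the "None" sentinel is always the first dropdown choice; it also exposes the module-level AestheticCLIP singleton used by the hooks above. A minimal sketch of the directory scan on its own (the directory path and the example entry in the comment are illustrative):

import os
from collections import OrderedDict

def scan_aesthetic_embeddings(embeddings_dir):
    # Map "name" -> path for every *.pt file, keeping "None" as the first entry.
    found = {f.replace(".pt", ""): os.path.join(embeddings_dir, f)
             for f in os.listdir(embeddings_dir) if f.endswith(".pt")}
    return OrderedDict(**{"None": None}, **found)

# e.g. scan_aesthetic_embeddings("aesthetic_embeddings")
# -> OrderedDict([('None', None), ('my_style', 'aesthetic_embeddings/my_style.pt'), ...])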
diff --git a/modules/textual_inversion/dataset.py b/modules/textual_inversion/dataset.py
index 68ceffe3..23bb4b6a 100644
--- a/modules/textual_inversion/dataset.py
+++ b/modules/textual_inversion/dataset.py
@@ -49,7 +49,7 @@ class PersonalizedBase(Dataset):
print("Preparing dataset...")
for path in tqdm.tqdm(self.image_paths):
try:
- image = Image.open(path).convert('RGB').resize((self.width, self.height), PIL.Image.Resampling.BICUBIC)
+ image = Image.open(path).convert('RGB').resize((self.width, self.height), PIL.Image.BICUBIC)
except Exception:
continue
diff --git a/modules/txt2img.py b/modules/txt2img.py
index 8f394d05..6cbc50fc 100644
--- a/modules/txt2img.py
+++ b/modules/txt2img.py
@@ -1,12 +1,17 @@
import modules.scripts
-from modules.processing import StableDiffusionProcessing, Processed, StableDiffusionProcessingTxt2Img, StableDiffusionProcessingImg2Img, process_images
+from modules.processing import StableDiffusionProcessing, Processed, StableDiffusionProcessingTxt2Img, \
+ StableDiffusionProcessingImg2Img, process_images
from modules.shared import opts, cmd_opts
import modules.shared as shared
import modules.processing as processing
from modules.ui import plaintext_to_html
-def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2: str, steps: int, sampler_index: int, restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, seed: int, subseed: int, subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool, height: int, width: int, enable_hr: bool, denoising_strength: float, firstphase_width: int, firstphase_height: int,aesthetic_lr=0,
+def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2: str, steps: int, sampler_index: int,
+ restore_faces: bool, tiling: bool, n_iter: int, batch_size: int, cfg_scale: float, seed: int, subseed: int,
+ subseed_strength: float, seed_resize_from_h: int, seed_resize_from_w: int, seed_enable_extras: bool,
+ height: int, width: int, enable_hr: bool, denoising_strength: float, firstphase_width: int,
+ firstphase_height: int, aesthetic_lr=0,
aesthetic_weight=0, aesthetic_steps=0,
aesthetic_imgs=None,
aesthetic_slerp=False,
@@ -41,15 +46,17 @@ def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2:
firstphase_height=firstphase_height if enable_hr else None,
)
+ shared.aesthetic_clip.set_aesthetic_params(float(aesthetic_lr), float(aesthetic_weight), int(aesthetic_steps),
+ aesthetic_imgs, aesthetic_slerp, aesthetic_imgs_text, aesthetic_slerp_angle,
+ aesthetic_text_negative)
+
if cmd_opts.enable_console_prompts:
print(f"\ntxt2img: {prompt}", file=shared.progress_print_out)
processed = modules.scripts.scripts_txt2img.run(p, *args)
if processed is None:
- processed = process_images(p, aesthetic_lr, aesthetic_weight, aesthetic_steps, aesthetic_imgs, aesthetic_slerp,aesthetic_imgs_text,
- aesthetic_slerp_angle,
- aesthetic_text_negative)
+ processed = process_images(p)
shared.total_tqdm.clear()
@@ -61,4 +68,3 @@ def txt2img(prompt: str, negative_prompt: str, prompt_style: str, prompt_style2:
processed.images = []
return processed.images, generation_info_js, plaintext_to_html(processed.info)
-
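txt2img and img2img now hand the aesthetic controls to the shared singleton themselves, before process_images runs; the learning rate arrives from a Textbox as a string, hence the float()/int() coercion. A minimal sketch of that hand-off as a helper (the function name and default values are illustrative, not part of the codebase):

import modules.shared as shared

def apply_aesthetic_ui_values(aesthetic_lr="0.0001", aesthetic_weight=0.9, aesthetic_steps=5,
                              aesthetic_imgs="None", aesthetic_slerp=False, aesthetic_imgs_text="",
                              aesthetic_slerp_angle=0.1, aesthetic_text_negative=False):
    # Coerce the Textbox/Slider values the same way txt2img/img2img do, then
    # configure the shared AestheticCLIP instance before process_images is called.
    shared.aesthetic_clip.set_aesthetic_params(float(aesthetic_lr), float(aesthetic_weight),
                                               int(aesthetic_steps), aesthetic_imgs,
                                               aesthetic_slerp, aesthetic_imgs_text,
                                               aesthetic_slerp_angle, aesthetic_text_negative)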
diff --git a/modules/ui.py b/modules/ui.py
index 4069f0d2..0e5d73f0 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -43,7 +43,7 @@ from modules.images import save_image
import modules.textual_inversion.ui
import modules.hypernetworks.ui
-import modules.aesthetic_clip
+import modules.aesthetic_clip as aesthetic_clip
import modules.images_history as img_his
@@ -593,23 +593,25 @@ def create_ui(wrap_gradio_gpu_call):
width = gr.Slider(minimum=64, maximum=2048, step=64, label="Width", value=512)
height = gr.Slider(minimum=64, maximum=2048, step=64, label="Height", value=512)
- with gr.Group():
- with gr.Accordion("Open for Clip Aesthetic!",open=False):
- with gr.Row():
- aesthetic_weight = gr.Slider(minimum=0, maximum=1, step=0.01, label="Aesthetic weight", value=0.9)
- aesthetic_steps = gr.Slider(minimum=0, maximum=50, step=1, label="Aesthetic steps", value=5)
-
- with gr.Row():
- aesthetic_lr = gr.Textbox(label='Aesthetic learning rate', placeholder="Aesthetic learning rate", value="0.0001")
- aesthetic_slerp = gr.Checkbox(label="Slerp interpolation", value=False)
- aesthetic_imgs = gr.Dropdown(sorted(aesthetic_embeddings.keys()),
- label="Aesthetic imgs embedding",
- value="None")
-
- with gr.Row():
- aesthetic_imgs_text = gr.Textbox(label='Aesthetic text for imgs', placeholder="This text is used to rotate the feature space of the imgs embs", value="")
- aesthetic_slerp_angle = gr.Slider(label='Slerp angle',minimum=0, maximum=1, step=0.01, value=0.1)
- aesthetic_text_negative = gr.Checkbox(label="Is negative text", value=False)
+ # with gr.Group():
+ # with gr.Accordion("Open for Clip Aesthetic!",open=False):
+ # with gr.Row():
+ # aesthetic_weight = gr.Slider(minimum=0, maximum=1, step=0.01, label="Aesthetic weight", value=0.9)
+ # aesthetic_steps = gr.Slider(minimum=0, maximum=50, step=1, label="Aesthetic steps", value=5)
+ #
+ # with gr.Row():
+ # aesthetic_lr = gr.Textbox(label='Aesthetic learning rate', placeholder="Aesthetic learning rate", value="0.0001")
+ # aesthetic_slerp = gr.Checkbox(label="Slerp interpolation", value=False)
+ # aesthetic_imgs = gr.Dropdown(sorted(aesthetic_embeddings.keys()),
+ # label="Aesthetic imgs embedding",
+ # value="None")
+ #
+ # with gr.Row():
+ # aesthetic_imgs_text = gr.Textbox(label='Aesthetic text for imgs', placeholder="This text is used to rotate the feature space of the imgs embs", value="")
+ # aesthetic_slerp_angle = gr.Slider(label='Slerp angle',minimum=0, maximum=1, step=0.01, value=0.1)
+ # aesthetic_text_negative = gr.Checkbox(label="Is negative text", value=False)
+
+ aesthetic_weight, aesthetic_steps, aesthetic_lr, aesthetic_slerp, aesthetic_imgs, aesthetic_imgs_text, aesthetic_slerp_angle, aesthetic_text_negative = aesthetic_clip.create_ui()
with gr.Row():
@@ -840,6 +842,9 @@ def create_ui(wrap_gradio_gpu_call):
width = gr.Slider(minimum=64, maximum=2048, step=64, label="Width", value=512)
height = gr.Slider(minimum=64, maximum=2048, step=64, label="Height", value=512)
+ aesthetic_weight_im, aesthetic_steps_im, aesthetic_lr_im, aesthetic_slerp_im, aesthetic_imgs_im, aesthetic_imgs_text_im, aesthetic_slerp_angle_im, aesthetic_text_negative_im = aesthetic_clip.create_ui()
+
+
with gr.Row():
restore_faces = gr.Checkbox(label='Restore faces', value=False, visible=len(shared.face_restorers) > 1)
tiling = gr.Checkbox(label='Tiling', value=False)
@@ -944,6 +949,14 @@ def create_ui(wrap_gradio_gpu_call):
inpainting_mask_invert,
img2img_batch_input_dir,
img2img_batch_output_dir,
+ aesthetic_lr_im,
+ aesthetic_weight_im,
+ aesthetic_steps_im,
+ aesthetic_imgs_im,
+ aesthetic_slerp_im,
+ aesthetic_imgs_text_im,
+ aesthetic_slerp_angle_im,
+ aesthetic_text_negative_im,
] + custom_inputs,
outputs=[
img2img_gallery,
@@ -1283,7 +1296,7 @@ def create_ui(wrap_gradio_gpu_call):
)
create_embedding_ae.click(
- fn=modules.aesthetic_clip.generate_imgs_embd,
+ fn=aesthetic_clip.generate_imgs_embd,
inputs=[
new_embedding_name_ae,
process_src_ae,
@@ -1291,6 +1304,7 @@ def create_ui(wrap_gradio_gpu_call):
],
outputs=[
aesthetic_imgs,
+ aesthetic_imgs_im,
ti_output,
ti_outcome,
]
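Both tabs now build the aesthetic controls through aesthetic_clip.create_ui(); the img2img copies carry an _im suffix, are appended to the img2img submit inputs in the order the new img2img() signature expects, and the embedding-creation button updates both dropdowns. A minimal sketch of that wiring inside the img2img tab (it assumes the surrounding Gradio Blocks context from modules/ui.py):

import modules.aesthetic_clip as aesthetic_clip

# Inside the img2img column of create_ui(...):
(aesthetic_weight_im, aesthetic_steps_im, aesthetic_lr_im, aesthetic_slerp_im,
 aesthetic_imgs_im, aesthetic_imgs_text_im, aesthetic_slerp_angle_im,
 aesthetic_text_negative_im) = aesthetic_clip.create_ui()

# Appended to the img2img submit inputs after img2img_batch_output_dir, in the
# order matching the extra parameters added to img2img():
aesthetic_inputs = [aesthetic_lr_im, aesthetic_weight_im, aesthetic_steps_im, aesthetic_imgs_im,
                    aesthetic_slerp_im, aesthetic_imgs_text_im, aesthetic_slerp_angle_im,
                    aesthetic_text_negative_im]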