From 3bca90b249d749ed5429f76e380d2ffa52fc0d41 Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Sun, 30 Jul 2023 13:48:27 +0300 Subject: hires fix checkpoint selection --- modules/processing.py | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'modules/processing.py') diff --git a/modules/processing.py b/modules/processing.py index b0992ee1..7026487a 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -935,7 +935,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): cached_hr_uc = [None, None] cached_hr_c = [None, None] - def __init__(self, enable_hr: bool = False, denoising_strength: float = 0.75, firstphase_width: int = 0, firstphase_height: int = 0, hr_scale: float = 2.0, hr_upscaler: str = None, hr_second_pass_steps: int = 0, hr_resize_x: int = 0, hr_resize_y: int = 0, hr_sampler_name: str = None, hr_prompt: str = '', hr_negative_prompt: str = '', **kwargs): + def __init__(self, enable_hr: bool = False, denoising_strength: float = 0.75, firstphase_width: int = 0, firstphase_height: int = 0, hr_scale: float = 2.0, hr_upscaler: str = None, hr_second_pass_steps: int = 0, hr_resize_x: int = 0, hr_resize_y: int = 0, hr_checkpoint_name: str = None, hr_sampler_name: str = None, hr_prompt: str = '', hr_negative_prompt: str = '', **kwargs): super().__init__(**kwargs) self.enable_hr = enable_hr self.denoising_strength = denoising_strength @@ -946,11 +946,14 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): self.hr_resize_y = hr_resize_y self.hr_upscale_to_x = hr_resize_x self.hr_upscale_to_y = hr_resize_y + self.hr_checkpoint_name = hr_checkpoint_name + self.hr_checkpoint_info = None self.hr_sampler_name = hr_sampler_name self.hr_prompt = hr_prompt self.hr_negative_prompt = hr_negative_prompt self.all_hr_prompts = None self.all_hr_negative_prompts = None + self.latent_scale_mode = None if firstphase_width != 0 or firstphase_height != 0: self.hr_upscale_to_x = self.width @@ -973,6 +976,14 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): def init(self, all_prompts, all_seeds, all_subseeds): if self.enable_hr: + if self.hr_checkpoint_name: + self.hr_checkpoint_info = sd_models.get_closet_checkpoint_match(self.hr_checkpoint_name) + + if self.hr_checkpoint_info is None: + raise Exception(f'Could not find checkpoint with name {self.hr_checkpoint_name}') + + self.extra_generation_params["Hires checkpoint"] = self.hr_checkpoint_info.short_title + if self.hr_sampler_name is not None and self.hr_sampler_name != self.sampler_name: self.extra_generation_params["Hires sampler"] = self.hr_sampler_name @@ -982,6 +993,11 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): if tuple(self.hr_negative_prompt) != tuple(self.negative_prompt): self.extra_generation_params["Hires negative prompt"] = self.hr_negative_prompt + self.latent_scale_mode = shared.latent_upscale_modes.get(self.hr_upscaler, None) if self.hr_upscaler is not None else shared.latent_upscale_modes.get(shared.latent_upscale_default_mode, "nearest") + if self.enable_hr and self.latent_scale_mode is None: + if not any(x.name == self.hr_upscaler for x in shared.sd_upscalers): + raise Exception(f"could not find upscaler named {self.hr_upscaler}") + if opts.use_old_hires_fix_width_height and self.applied_old_hires_behavior_to != (self.width, self.height): self.hr_resize_x = self.width self.hr_resize_y = self.height @@ -1020,14 +1036,6 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): self.truncate_x = (self.hr_upscale_to_x - target_w) // opt_f self.truncate_y = (self.hr_upscale_to_y - target_h) // opt_f - # special case: the user has chosen to do nothing - if self.hr_upscale_to_x == self.width and self.hr_upscale_to_y == self.height: - self.enable_hr = False - self.denoising_strength = None - self.extra_generation_params.pop("Hires upscale", None) - self.extra_generation_params.pop("Hires resize", None) - return - if not state.processing_has_refined_job_count: if state.job_count == -1: state.job_count = self.n_iter @@ -1045,17 +1053,22 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): def sample(self, conditioning, unconditional_conditioning, seeds, subseeds, subseed_strength, prompts): self.sampler = sd_samplers.create_sampler(self.sampler_name, self.sd_model) - latent_scale_mode = shared.latent_upscale_modes.get(self.hr_upscaler, None) if self.hr_upscaler is not None else shared.latent_upscale_modes.get(shared.latent_upscale_default_mode, "nearest") - if self.enable_hr and latent_scale_mode is None: - if not any(x.name == self.hr_upscaler for x in shared.sd_upscalers): - raise Exception(f"could not find upscaler named {self.hr_upscaler}") - x = create_random_tensors([opt_C, self.height // opt_f, self.width // opt_f], seeds=seeds, subseeds=subseeds, subseed_strength=self.subseed_strength, seed_resize_from_h=self.seed_resize_from_h, seed_resize_from_w=self.seed_resize_from_w, p=self) samples = self.sampler.sample(self, x, conditioning, unconditional_conditioning, image_conditioning=self.txt2img_image_conditioning(x)) if not self.enable_hr: return samples + current = shared.sd_model.sd_checkpoint_info + try: + if self.hr_checkpoint_info is not None: + sd_models.reload_model_weights(info=self.hr_checkpoint_info) + + return self.sample_hr_pass(samples, seeds, subseeds, subseed_strength, prompts) + finally: + sd_models.reload_model_weights(info=current) + + def sample_hr_pass(self, samples, seeds, subseeds, subseed_strength, prompts): self.is_hr_pass = True target_width = self.hr_upscale_to_x @@ -1073,11 +1086,11 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): info = create_infotext(self, self.all_prompts, self.all_seeds, self.all_subseeds, [], iteration=self.iteration, position_in_batch=index) images.save_image(image, self.outpath_samples, "", seeds[index], prompts[index], opts.samples_format, info=info, p=self, suffix="-before-highres-fix") - if latent_scale_mode is not None: + if self.latent_scale_mode is not None: for i in range(samples.shape[0]): save_intermediate(samples, i) - samples = torch.nn.functional.interpolate(samples, size=(target_height // opt_f, target_width // opt_f), mode=latent_scale_mode["mode"], antialias=latent_scale_mode["antialias"]) + samples = torch.nn.functional.interpolate(samples, size=(target_height // opt_f, target_width // opt_f), mode=self.latent_scale_mode["mode"], antialias=self.latent_scale_mode["antialias"]) # Avoid making the inpainting conditioning unless necessary as # this does need some extra compute to decode / encode the image again. @@ -1193,7 +1206,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): self.hr_uc = None self.hr_c = None - if self.enable_hr: + if self.enable_hr and self.hr_checkpoint_info is None: if shared.opts.hires_fix_use_firstpass_conds: self.calculate_hr_conds() -- cgit v1.2.3 From 40cd59207b96f9e522fdc104b43279880b671ce4 Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Sun, 30 Jul 2023 14:10:26 +0300 Subject: make it work with SDXL --- modules/processing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'modules/processing.py') diff --git a/modules/processing.py b/modules/processing.py index 7026487a..b8af1301 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -1197,8 +1197,11 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): if self.hr_c is not None: return - self.hr_uc = self.get_conds_with_caching(prompt_parser.get_learned_conditioning, self.hr_negative_prompts, self.steps * self.step_multiplier, [self.cached_hr_uc, self.cached_uc], self.hr_extra_network_data) - self.hr_c = self.get_conds_with_caching(prompt_parser.get_multicond_learned_conditioning, self.hr_prompts, self.steps * self.step_multiplier, [self.cached_hr_c, self.cached_c], self.hr_extra_network_data) + hr_prompts = prompt_parser.SdConditioning(self.hr_prompts, width=self.hr_upscale_to_x, height=self.hr_upscale_to_y) + hr_negative_prompts = prompt_parser.SdConditioning(self.hr_negative_prompts, width=self.hr_upscale_to_x, height=self.hr_upscale_to_y, is_negative_prompt=True) + + self.hr_uc = self.get_conds_with_caching(prompt_parser.get_learned_conditioning, hr_negative_prompts, self.steps * self.step_multiplier, [self.cached_hr_uc, self.cached_uc], self.hr_extra_network_data) + self.hr_c = self.get_conds_with_caching(prompt_parser.get_multicond_learned_conditioning, hr_prompts, self.steps * self.step_multiplier, [self.cached_hr_c, self.cached_c], self.hr_extra_network_data) def setup_conds(self): super().setup_conds() -- cgit v1.2.3 From 77761e7bad8a7cbffc9028dc0b2f63169aaf25f9 Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Sun, 30 Jul 2023 14:10:33 +0300 Subject: linter --- modules/processing.py | 2 +- modules/ui.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'modules/processing.py') diff --git a/modules/processing.py b/modules/processing.py index b8af1301..21dbef16 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -1055,6 +1055,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): x = create_random_tensors([opt_C, self.height // opt_f, self.width // opt_f], seeds=seeds, subseeds=subseeds, subseed_strength=self.subseed_strength, seed_resize_from_h=self.seed_resize_from_h, seed_resize_from_w=self.seed_resize_from_w, p=self) samples = self.sampler.sample(self, x, conditioning, unconditional_conditioning, image_conditioning=self.txt2img_image_conditioning(x)) + del x if not self.enable_hr: return samples @@ -1137,7 +1138,6 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): noise = create_random_tensors(samples.shape[1:], seeds=seeds, subseeds=subseeds, subseed_strength=subseed_strength, p=self) # GC now before running the next img2img to prevent running out of memory - x = None devices.torch_gc() if not self.disable_extra_networks: diff --git a/modules/ui.py b/modules/ui.py index 6d8265f2..6fc9de83 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -476,9 +476,9 @@ def create_ui(): hr_resize_y = gr.Slider(minimum=0, maximum=2048, step=8, label="Resize height to", value=0, elem_id="txt2img_hr_resize_y") with FormRow(elem_id="txt2img_hires_fix_row3", variant="compact", visible=opts.hires_fix_show_sampler) as hr_sampler_container: - checkpoint_choices = lambda: ["Use same checkpoint"] + modules.sd_models.checkpoint_tiles(use_short=True) - hr_checkpoint_name = gr.Dropdown(label='Hires checkpoint', elem_id="hr_checkpoint", choices=checkpoint_choices(), value="Use same checkpoint") - create_refresh_button(hr_checkpoint_name, modules.sd_models.list_models, lambda: {"choices": checkpoint_choices()}, "hr_checkpoint_refresh") + + hr_checkpoint_name = gr.Dropdown(label='Hires checkpoint', elem_id="hr_checkpoint", choices=["Use same checkpoint"] + modules.sd_models.checkpoint_tiles(use_short=True), value="Use same checkpoint") + create_refresh_button(hr_checkpoint_name, modules.sd_models.list_models, lambda: {"choices": ["Use same checkpoint"] + modules.sd_models.checkpoint_tiles(use_short=True)}, "hr_checkpoint_refresh") hr_sampler_index = gr.Dropdown(label='Hires sampling method', elem_id="hr_sampler", choices=["Use same sampler"] + [x.name for x in samplers_for_img2img], value="Use same sampler", type="index") -- cgit v1.2.3 From eec540b22798ddcf8a03d947519c36635d77d722 Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Sun, 30 Jul 2023 15:04:12 +0300 Subject: repair non-latent upscaling broken for SDXL --- modules/processing.py | 1 + 1 file changed, 1 insertion(+) (limited to 'modules/processing.py') diff --git a/modules/processing.py b/modules/processing.py index 21dbef16..6fb14516 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -1119,6 +1119,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): decoded_samples = torch.from_numpy(np.array(batch_images)) decoded_samples = decoded_samples.to(shared.device) decoded_samples = 2. * decoded_samples - 1. + decoded_samples = decoded_samples.to(shared.device, dtype=devices.dtype_vae) samples = self.sd_model.get_first_stage_encoding(self.sd_model.encode_first_stage(decoded_samples)) -- cgit v1.2.3 From a64fbe89288802f8b5ec8ca7bcab5aaf2c7bfea5 Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Sun, 30 Jul 2023 15:12:09 +0300 Subject: make it possible to use checkpoints of different types (SD1, SDXL) in first and second pass of hires fix --- modules/processing.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'modules/processing.py') diff --git a/modules/processing.py b/modules/processing.py index 6fb14516..c4da208f 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -1060,16 +1060,21 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): if not self.enable_hr: return samples + if self.latent_scale_mode is None: + decoded_samples = decode_first_stage(self.sd_model, samples) + else: + decoded_samples = None + current = shared.sd_model.sd_checkpoint_info try: if self.hr_checkpoint_info is not None: sd_models.reload_model_weights(info=self.hr_checkpoint_info) - return self.sample_hr_pass(samples, seeds, subseeds, subseed_strength, prompts) + return self.sample_hr_pass(samples, decoded_samples, seeds, subseeds, subseed_strength, prompts) finally: sd_models.reload_model_weights(info=current) - def sample_hr_pass(self, samples, seeds, subseeds, subseed_strength, prompts): + def sample_hr_pass(self, samples, decoded_samples, seeds, subseeds, subseed_strength, prompts): self.is_hr_pass = True target_width = self.hr_upscale_to_x @@ -1100,7 +1105,6 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): else: image_conditioning = self.txt2img_image_conditioning(samples) else: - decoded_samples = decode_first_stage(self.sd_model, samples) lowres_samples = torch.clamp((decoded_samples + 1.0) / 2.0, min=0.0, max=1.0) batch_images = [] -- cgit v1.2.3 From cc53db6652b11e6f7bca42c3aa93bd6761ed3d3f Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Sun, 30 Jul 2023 15:30:33 +0300 Subject: this time for sure --- modules/processing.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'modules/processing.py') diff --git a/modules/processing.py b/modules/processing.py index c4da208f..3190b964 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -538,8 +538,12 @@ def create_random_tensors(shape, seeds, subseeds=None, subseed_strength=0.0, see return x +class DecodedSamples(list): + already_decoded = True + + def decode_latent_batch(model, batch, target_device=None, check_for_nans=False): - samples = [] + samples = DecodedSamples() for i in range(batch.shape[0]): sample = decode_first_stage(model, batch[i:i + 1])[0] @@ -793,7 +797,11 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed: with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast(): samples_ddim = p.sample(conditioning=p.c, unconditional_conditioning=p.uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts) - x_samples_ddim = decode_latent_batch(p.sd_model, samples_ddim, target_device=devices.cpu, check_for_nans=True) + if getattr(samples_ddim, 'already_decoded', False): + x_samples_ddim = samples_ddim + else: + x_samples_ddim = decode_latent_batch(p.sd_model, samples_ddim, target_device=devices.cpu, check_for_nans=True) + x_samples_ddim = torch.stack(x_samples_ddim).float() x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0) @@ -1161,9 +1169,11 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): sd_models.apply_token_merging(self.sd_model, self.get_token_merging_ratio()) + decoded_samples = decode_latent_batch(self.sd_model, samples, target_device=devices.cpu, check_for_nans=True) + self.is_hr_pass = False - return samples + return decoded_samples def close(self): super().close() -- cgit v1.2.3 From 02038036ff571e0f04a94c3e279609666e239dec Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Sun, 30 Jul 2023 16:16:31 +0300 Subject: make it so that VAE NaNs autodetection also works during first pass of hires fix --- modules/processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'modules/processing.py') diff --git a/modules/processing.py b/modules/processing.py index 3190b964..0677de81 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -1069,7 +1069,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): return samples if self.latent_scale_mode is None: - decoded_samples = decode_first_stage(self.sd_model, samples) + decoded_samples = torch.stack(decode_latent_batch(self.sd_model, samples, target_device=devices.cpu, check_for_nans=True)) else: decoded_samples = None -- cgit v1.2.3 From 0af4127fd14360ebb12c6569d98aebf8047abbfc Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Sun, 30 Jul 2023 19:36:24 +0300 Subject: delete the field that is preventing the model from being unloaded and is causing increased RAM usage --- modules/processing.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'modules/processing.py') diff --git a/modules/processing.py b/modules/processing.py index 0677de81..b09433b0 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -1076,11 +1076,15 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): current = shared.sd_model.sd_checkpoint_info try: if self.hr_checkpoint_info is not None: + del self.sampler sd_models.reload_model_weights(info=self.hr_checkpoint_info) + devices.torch_gc() return self.sample_hr_pass(samples, decoded_samples, seeds, subseeds, subseed_strength, prompts) finally: + del self.sampler sd_models.reload_model_weights(info=current) + devices.torch_gc() def sample_hr_pass(self, samples, decoded_samples, seeds, subseeds, subseed_strength, prompts): self.is_hr_pass = True -- cgit v1.2.3 From dca121e9035ba36b3f7484c8a31a7776d85c0960 Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Mon, 31 Jul 2023 09:13:07 +0300 Subject: set the field to None instead --- modules/processing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'modules/processing.py') diff --git a/modules/processing.py b/modules/processing.py index b09433b0..35e7b87e 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -1076,13 +1076,13 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): current = shared.sd_model.sd_checkpoint_info try: if self.hr_checkpoint_info is not None: - del self.sampler + self.sampler = None sd_models.reload_model_weights(info=self.hr_checkpoint_info) devices.torch_gc() return self.sample_hr_pass(samples, decoded_samples, seeds, subseeds, subseed_strength, prompts) finally: - del self.sampler + self.sampler = None sd_models.reload_model_weights(info=current) devices.torch_gc() -- cgit v1.2.3 From 29d7e31d89e9d686784eacbdbfc5b15959eb4449 Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Mon, 31 Jul 2023 10:43:26 +0300 Subject: repair AttributeError: 'NoneType' object has no attribute 'conditioning_key' --- modules/processing.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'modules/processing.py') diff --git a/modules/processing.py b/modules/processing.py index 35e7b87e..1f0c0b3b 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -1104,6 +1104,13 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): info = create_infotext(self, self.all_prompts, self.all_seeds, self.all_subseeds, [], iteration=self.iteration, position_in_batch=index) images.save_image(image, self.outpath_samples, "", seeds[index], prompts[index], opts.samples_format, info=info, p=self, suffix="-before-highres-fix") + img2img_sampler_name = self.hr_sampler_name or self.sampler_name + + if self.sampler_name in ['PLMS', 'UniPC']: # PLMS/UniPC do not support img2img so we just silently switch to DDIM + img2img_sampler_name = 'DDIM' + + self.sampler = sd_samplers.create_sampler(img2img_sampler_name, self.sd_model) + if self.latent_scale_mode is not None: for i in range(samples.shape[0]): save_intermediate(samples, i) @@ -1143,13 +1150,6 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): shared.state.nextjob() - img2img_sampler_name = self.hr_sampler_name or self.sampler_name - - if self.sampler_name in ['PLMS', 'UniPC']: # PLMS/UniPC do not support img2img so we just silently switch to DDIM - img2img_sampler_name = 'DDIM' - - self.sampler = sd_samplers.create_sampler(img2img_sampler_name, self.sd_model) - samples = samples[:, :, self.truncate_y//2:samples.shape[2]-(self.truncate_y+1)//2, self.truncate_x//2:samples.shape[3]-(self.truncate_x+1)//2] noise = create_random_tensors(samples.shape[1:], seeds=seeds, subseeds=subseeds, subseed_strength=subseed_strength, p=self) -- cgit v1.2.3 From c09bc2c60856ca1ab2243386176badf909affdbe Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Mon, 31 Jul 2023 13:20:26 +0300 Subject: fix "clamp_scalar_cpu" not implemented for 'Half' --- modules/processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'modules/processing.py') diff --git a/modules/processing.py b/modules/processing.py index 1f0c0b3b..f8f8bddc 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -1069,7 +1069,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing): return samples if self.latent_scale_mode is None: - decoded_samples = torch.stack(decode_latent_batch(self.sd_model, samples, target_device=devices.cpu, check_for_nans=True)) + decoded_samples = torch.stack(decode_latent_batch(self.sd_model, samples, target_device=devices.cpu, check_for_nans=True)).to(dtype=torch.float32) else: decoded_samples = None -- cgit v1.2.3 From ccb92339348f6973de39cde062982a51a4cd0818 Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Wed, 2 Aug 2023 18:53:09 +0300 Subject: add yet another torch_gc to reclaim some of VRAM after the initial stage of img2img --- modules/processing.py | 1 + 1 file changed, 1 insertion(+) (limited to 'modules/processing.py') diff --git a/modules/processing.py b/modules/processing.py index b0992ee1..0b66cd2a 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -1348,6 +1348,7 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing): image = image.to(shared.device, dtype=devices.dtype_vae) self.init_latent = self.sd_model.get_first_stage_encoding(self.sd_model.encode_first_stage(image)) + devices.torch_gc() if self.resize_mode == 3: self.init_latent = torch.nn.functional.interpolate(self.init_latent, size=(self.height // opt_f, self.width // opt_f), mode="bilinear") -- cgit v1.2.3 From 84b6fcd02ca6d6ab48c4b6be4bb8724b1c2e7014 Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Thu, 3 Aug 2023 00:00:23 +0300 Subject: add NV option for Random number generator source setting, which allows to generate same pictures on CPU/AMD/Mac as on NVidia videocards. --- modules/devices.py | 39 ++++++++++++++- modules/processing.py | 6 +-- modules/rng_philox.py | 100 ++++++++++++++++++++++++++++++++++++++ modules/sd_samplers_kdiffusion.py | 5 +- modules/shared.py | 2 +- 5 files changed, 142 insertions(+), 10 deletions(-) create mode 100644 modules/rng_philox.py (limited to 'modules/processing.py') diff --git a/modules/devices.py b/modules/devices.py index 57e51da3..b58776d8 100644 --- a/modules/devices.py +++ b/modules/devices.py @@ -3,7 +3,7 @@ import contextlib from functools import lru_cache import torch -from modules import errors +from modules import errors, rng_philox if sys.platform == "darwin": from modules import mac_specific @@ -90,23 +90,58 @@ def cond_cast_float(input): return input.float() if unet_needs_upcast else input +nv_rng = None + + def randn(seed, shape): from modules.shared import opts - torch.manual_seed(seed) + manual_seed(seed) + + if opts.randn_source == "NV": + return torch.asarray(nv_rng.randn(shape), device=device) + if opts.randn_source == "CPU" or device.type == 'mps': return torch.randn(shape, device=cpu).to(device) + return torch.randn(shape, device=device) +def randn_like(x): + from modules.shared import opts + + if opts.randn_source == "NV": + return torch.asarray(nv_rng.randn(x.shape), device=x.device, dtype=x.dtype) + + if opts.randn_source == "CPU" or x.device.type == 'mps': + return torch.randn_like(x, device=cpu).to(x.device) + + return torch.randn_like(x) + + def randn_without_seed(shape): from modules.shared import opts + if opts.randn_source == "NV": + return torch.asarray(nv_rng.randn(shape), device=device) + if opts.randn_source == "CPU" or device.type == 'mps': return torch.randn(shape, device=cpu).to(device) + return torch.randn(shape, device=device) +def manual_seed(seed): + from modules.shared import opts + + if opts.randn_source == "NV": + global nv_rng + nv_rng = rng_philox.Generator(seed) + return + + torch.manual_seed(seed) + + def autocast(disable=False): from modules import shared diff --git a/modules/processing.py b/modules/processing.py index 0b66cd2a..8f34c8b4 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -492,7 +492,7 @@ def create_random_tensors(shape, seeds, subseeds=None, subseed_strength=0.0, see noise_shape = shape if seed_resize_from_h <= 0 or seed_resize_from_w <= 0 else (shape[0], seed_resize_from_h//8, seed_resize_from_w//8) subnoise = None - if subseeds is not None: + if subseeds is not None and subseed_strength != 0: subseed = 0 if i >= len(subseeds) else subseeds[i] subnoise = devices.randn(subseed, noise_shape) @@ -524,7 +524,7 @@ def create_random_tensors(shape, seeds, subseeds=None, subseed_strength=0.0, see cnt = p.sampler.number_of_needed_noises(p) if eta_noise_seed_delta > 0: - torch.manual_seed(seed + eta_noise_seed_delta) + devices.manual_seed(seed + eta_noise_seed_delta) for j in range(cnt): sampler_noises[j].append(devices.randn_without_seed(tuple(noise_shape))) @@ -636,7 +636,7 @@ def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments=None, iter "Token merging ratio": None if token_merging_ratio == 0 else token_merging_ratio, "Token merging ratio hr": None if not enable_hr or token_merging_ratio_hr == 0 else token_merging_ratio_hr, "Init image hash": getattr(p, 'init_img_hash', None), - "RNG": opts.randn_source if opts.randn_source != "GPU" else None, + "RNG": opts.randn_source if opts.randn_source != "GPU" and opts.randn_source != "NV" else None, "NGMS": None if p.s_min_uncond == 0 else p.s_min_uncond, **p.extra_generation_params, "Version": program_version() if opts.add_version_to_infotext else None, diff --git a/modules/rng_philox.py b/modules/rng_philox.py new file mode 100644 index 00000000..b5c02483 --- /dev/null +++ b/modules/rng_philox.py @@ -0,0 +1,100 @@ +"""RNG imitiating torch cuda randn on CPU. You are welcome. + +Usage: + +``` +g = Generator(seed=0) +print(g.randn(shape=(3, 4))) +``` + +Expected output: +``` +[[-0.92466259 -0.42534415 -2.6438457 0.14518388] + [-0.12086647 -0.57972564 -0.62285122 -0.32838709] + [-1.07454231 -0.36314407 -1.67105067 2.26550497]] +``` +""" + +import numpy as np + +philox_m = [0xD2511F53, 0xCD9E8D57] +philox_w = [0x9E3779B9, 0xBB67AE85] + +two_pow32_inv = np.array([2.3283064e-10], dtype=np.float32) +two_pow32_inv_2pi = np.array([2.3283064e-10 * 6.2831855], dtype=np.float32) + + +def uint32(x): + """Converts (N,) np.uint64 array into (2, N) np.unit32 array.""" + return np.moveaxis(x.view(np.uint32).reshape(-1, 2), 0, 1) + + +def philox4_round(counter, key): + """A single round of the Philox 4x32 random number generator.""" + + v1 = uint32(counter[0].astype(np.uint64) * philox_m[0]) + v2 = uint32(counter[2].astype(np.uint64) * philox_m[1]) + + counter[0] = v2[1] ^ counter[1] ^ key[0] + counter[1] = v2[0] + counter[2] = v1[1] ^ counter[3] ^ key[1] + counter[3] = v1[0] + + +def philox4_32(counter, key, rounds=10): + """Generates 32-bit random numbers using the Philox 4x32 random number generator. + + Parameters: + counter (numpy.ndarray): A 4xN array of 32-bit integers representing the counter values (offset into generation). + key (numpy.ndarray): A 2xN array of 32-bit integers representing the key values (seed). + rounds (int): The number of rounds to perform. + + Returns: + numpy.ndarray: A 4xN array of 32-bit integers containing the generated random numbers. + """ + + for _ in range(rounds - 1): + philox4_round(counter, key) + + key[0] = key[0] + philox_w[0] + key[1] = key[1] + philox_w[1] + + philox4_round(counter, key) + return counter + + +def box_muller(x, y): + """Returns just the first out of two numbers generated by Box–Muller transform algorithm.""" + u = x.astype(np.float32) * two_pow32_inv + two_pow32_inv / 2 + v = y.astype(np.float32) * two_pow32_inv_2pi + two_pow32_inv_2pi / 2 + + s = np.sqrt(-2.0 * np.log(u)) + + r1 = s * np.sin(v) + return r1.astype(np.float32) + + +class Generator: + """RNG that produces same outputs as torch.randn(..., device='cuda') on CPU""" + + def __init__(self, seed): + self.seed = seed + self.offset = 0 + + def randn(self, shape): + """Generate a sequence of n standard normal random variables using the Philox 4x32 random number generator and the Box-Muller transform.""" + + n = 1 + for x in shape: + n *= x + + counter = np.zeros((4, n), dtype=np.uint32) + counter[0] = self.offset + counter[2] = np.arange(n, dtype=np.uint32) # up to 2^32 numbers can be generated - if you want more you'd need to spill into counter[3] + self.offset += 1 + + key = uint32(np.array([[self.seed] * n], dtype=np.uint64)) + + g = philox4_32(counter, key) + + return box_muller(g[0], g[1]).reshape(shape) # discard g[2] and g[3] diff --git a/modules/sd_samplers_kdiffusion.py b/modules/sd_samplers_kdiffusion.py index e0da3425..d72c1b5f 100644 --- a/modules/sd_samplers_kdiffusion.py +++ b/modules/sd_samplers_kdiffusion.py @@ -260,10 +260,7 @@ class TorchHijack: if noise.shape == x.shape: return noise - if opts.randn_source == "CPU" or x.device.type == 'mps': - return torch.randn_like(x, device=devices.cpu).to(x.device) - else: - return torch.randn_like(x) + return devices.randn_like(x) class KDiffusionSampler: diff --git a/modules/shared.py b/modules/shared.py index aa72c9c8..7103b4ca 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -428,7 +428,7 @@ options_templates.update(options_section(('sd', "Stable Diffusion"), { "CLIP_stop_at_last_layers": OptionInfo(1, "Clip skip", gr.Slider, {"minimum": 1, "maximum": 12, "step": 1}).link("wiki", "https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#clip-skip").info("ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer"), "upcast_attn": OptionInfo(False, "Upcast cross attention layer to float32"), "auto_vae_precision": OptionInfo(True, "Automaticlly revert VAE to 32-bit floats").info("triggers when a tensor with NaNs is produced in VAE; disabling the option in this case will result in a black square image"), - "randn_source": OptionInfo("GPU", "Random number generator source.", gr.Radio, {"choices": ["GPU", "CPU"]}).info("changes seeds drastically; use CPU to produce the same picture across different videocard vendors"), + "randn_source": OptionInfo("GPU", "Random number generator source.", gr.Radio, {"choices": ["GPU", "CPU", "NV"]}).info("changes seeds drastically; use CPU to produce the same picture across different videocard vendors; use NV to produce same picture as on NVidia videocards"), })) options_templates.update(options_section(('sdxl', "Stable Diffusion XL"), { -- cgit v1.2.3 From f0c1063a707a4a43823b0ed00e2a8eeb22a9ed0a Mon Sep 17 00:00:00 2001 From: AUTOMATIC1111 <16777216c@gmail.com> Date: Fri, 4 Aug 2023 09:09:09 +0300 Subject: resolve some of circular import issues for kohaku --- modules/hypernetworks/hypernetwork.py | 5 ++--- modules/processing.py | 7 +------ modules/sd_hijack.py | 6 +++--- modules/sd_samplers_common.py | 10 ++++++++-- modules/textual_inversion/textual_inversion.py | 4 +++- 5 files changed, 17 insertions(+), 15 deletions(-) (limited to 'modules/processing.py') diff --git a/modules/hypernetworks/hypernetwork.py b/modules/hypernetworks/hypernetwork.py index c4821d21..70f1cbd2 100644 --- a/modules/hypernetworks/hypernetwork.py +++ b/modules/hypernetworks/hypernetwork.py @@ -10,7 +10,7 @@ import torch import tqdm from einops import rearrange, repeat from ldm.util import default -from modules import devices, processing, sd_models, shared, sd_samplers, hashes, sd_hijack_checkpoint, errors +from modules import devices, sd_models, shared, sd_samplers, hashes, sd_hijack_checkpoint, errors from modules.textual_inversion import textual_inversion, logging from modules.textual_inversion.learn_schedule import LearnRateScheduler from torch import einsum @@ -469,8 +469,7 @@ def create_hypernetwork(name, enable_sizes, overwrite_old, layer_structure=None, def train_hypernetwork(id_task, hypernetwork_name, learn_rate, batch_size, gradient_step, data_root, log_directory, training_width, training_height, varsize, steps, clip_grad_mode, clip_grad_value, shuffle_tags, tag_drop_out, latent_sampling_method, use_weight, create_image_every, save_hypernetwork_every, template_filename, preview_from_txt2img, preview_prompt, preview_negative_prompt, preview_steps, preview_sampler_index, preview_cfg_scale, preview_seed, preview_width, preview_height): - # images allows training previews to have infotext. Importing it at the top causes a circular import problem. - from modules import images + from modules import images, processing save_hypernetwork_every = save_hypernetwork_every or 0 create_image_every = create_image_every or 0 diff --git a/modules/processing.py b/modules/processing.py index 8f34c8b4..8086a2b0 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -30,6 +30,7 @@ from ldm.models.diffusion.ddpm import LatentDepth2ImageDiffusion from einops import repeat, rearrange from blendmodes.blend import blendLayers, BlendType +decode_first_stage = sd_samplers_common.decode_first_stage # some of those options should not be changed at all because they would break the model, so I removed them from options. opt_C = 4 @@ -572,12 +573,6 @@ def decode_latent_batch(model, batch, target_device=None, check_for_nans=False): return samples -def decode_first_stage(model, x): - x = model.decode_first_stage(x.to(devices.dtype_vae)) - - return x - - def get_fixed_seed(seed): if seed is None or seed == '' or seed == -1: return int(random.randrange(4294967294)) diff --git a/modules/sd_hijack.py b/modules/sd_hijack.py index cfa5f0eb..609fd56c 100644 --- a/modules/sd_hijack.py +++ b/modules/sd_hijack.py @@ -2,7 +2,6 @@ import torch from torch.nn.functional import silu from types import MethodType -import modules.textual_inversion.textual_inversion from modules import devices, sd_hijack_optimizations, shared, script_callbacks, errors, sd_unet from modules.hypernetworks import hypernetwork from modules.shared import cmd_opts @@ -164,12 +163,13 @@ class StableDiffusionModelHijack: clip = None optimization_method = None - embedding_db = modules.textual_inversion.textual_inversion.EmbeddingDatabase() - def __init__(self): + import modules.textual_inversion.textual_inversion + self.extra_generation_params = {} self.comments = [] + self.embedding_db = modules.textual_inversion.textual_inversion.EmbeddingDatabase() self.embedding_db.add_embedding_dir(cmd_opts.embeddings_dir) def apply_optimizations(self, option=None): diff --git a/modules/sd_samplers_common.py b/modules/sd_samplers_common.py index 5deda761..b3d344e7 100644 --- a/modules/sd_samplers_common.py +++ b/modules/sd_samplers_common.py @@ -2,7 +2,7 @@ from collections import namedtuple import numpy as np import torch from PIL import Image -from modules import devices, processing, images, sd_vae_approx, sd_samplers, sd_vae_taesd, shared +from modules import devices, images, sd_vae_approx, sd_samplers, sd_vae_taesd, shared from modules.shared import opts, state SamplerData = namedtuple('SamplerData', ['name', 'constructor', 'aliases', 'options']) @@ -35,7 +35,7 @@ def single_sample_to_image(sample, approximation=None): x_sample = sample * 1.5 x_sample = sd_vae_taesd.model()(x_sample.to(devices.device, devices.dtype).unsqueeze(0))[0].detach() else: - x_sample = processing.decode_first_stage(shared.sd_model, sample.unsqueeze(0))[0] * 0.5 + 0.5 + x_sample = decode_first_stage(shared.sd_model, sample.unsqueeze(0))[0] * 0.5 + 0.5 x_sample = torch.clamp(x_sample, min=0.0, max=1.0) x_sample = 255. * np.moveaxis(x_sample.cpu().numpy(), 0, 2) @@ -44,6 +44,12 @@ def single_sample_to_image(sample, approximation=None): return Image.fromarray(x_sample) +def decode_first_stage(model, x): + x = model.decode_first_stage(x.to(devices.dtype_vae)) + + return x + + def sample_to_image(samples, index=0, approximation=None): return single_sample_to_image(samples[index], approximation) diff --git a/modules/textual_inversion/textual_inversion.py b/modules/textual_inversion/textual_inversion.py index 4713bc2d..aa79dc09 100644 --- a/modules/textual_inversion/textual_inversion.py +++ b/modules/textual_inversion/textual_inversion.py @@ -13,7 +13,7 @@ import numpy as np from PIL import Image, PngImagePlugin from torch.utils.tensorboard import SummaryWriter -from modules import shared, devices, sd_hijack, processing, sd_models, images, sd_samplers, sd_hijack_checkpoint, errors, hashes +from modules import shared, devices, sd_hijack, sd_models, images, sd_samplers, sd_hijack_checkpoint, errors, hashes import modules.textual_inversion.dataset from modules.textual_inversion.learn_schedule import LearnRateScheduler @@ -387,6 +387,8 @@ def validate_train_inputs(model_name, learn_rate, batch_size, gradient_step, dat def train_embedding(id_task, embedding_name, learn_rate, batch_size, gradient_step, data_root, log_directory, training_width, training_height, varsize, steps, clip_grad_mode, clip_grad_value, shuffle_tags, tag_drop_out, latent_sampling_method, use_weight, create_image_every, save_embedding_every, template_filename, save_image_with_stored_embedding, preview_from_txt2img, preview_prompt, preview_negative_prompt, preview_steps, preview_sampler_index, preview_cfg_scale, preview_seed, preview_width, preview_height): + from modules import processing + save_embedding_every = save_embedding_every or 0 create_image_every = create_image_every or 0 template_file = textual_inversion_templates.get(template_filename, None) -- cgit v1.2.3