From 39541d7725bc42f456a604b07c50aba503a5a09a Mon Sep 17 00:00:00 2001
From: Fampai <>
Date: Fri, 4 Nov 2022 04:50:22 -0400
Subject: Fixes race condition in training when VAE is unloaded
set_current_image can attempt to use the VAE when it is unloaded to
the CPU while training
---
modules/hypernetworks/hypernetwork.py | 4 ++++
1 file changed, 4 insertions(+)
(limited to 'modules/hypernetworks')
diff --git a/modules/hypernetworks/hypernetwork.py b/modules/hypernetworks/hypernetwork.py
index 6e1a10cf..fcb96059 100644
--- a/modules/hypernetworks/hypernetwork.py
+++ b/modules/hypernetworks/hypernetwork.py
@@ -390,7 +390,10 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, data_root, log
with torch.autocast("cuda"):
ds = modules.textual_inversion.dataset.PersonalizedBase(data_root=data_root, width=training_width, height=training_height, repeats=shared.opts.training_image_repeats_per_epoch, placeholder_token=hypernetwork_name, model=shared.sd_model, device=devices.device, template_file=template_file, include_cond=True, batch_size=batch_size)
+ old_parallel_processing_allowed = shared.parallel_processing_allowed
+
if unload:
+ shared.parallel_processing_allowed = False
shared.sd_model.cond_stage_model.to(devices.cpu)
shared.sd_model.first_stage_model.to(devices.cpu)
@@ -531,6 +534,7 @@ Last saved image: {html.escape(last_saved_image)}
filename = os.path.join(shared.cmd_opts.hypernetwork_dir, f'{hypernetwork_name}.pt')
save_hypernetwork(hypernetwork, checkpoint, hypernetwork_name, filename)
+ shared.parallel_processing_allowed = old_parallel_processing_allowed
return hypernetwork, filename
--
cgit v1.2.3
From 4d5f1691dda971ec7b461dd880426300fd54ccee Mon Sep 17 00:00:00 2001
From: brkirch
Date: Mon, 28 Nov 2022 21:36:35 -0500
Subject: Use devices.autocast instead of torch.autocast
---
modules/hypernetworks/hypernetwork.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'modules/hypernetworks')
diff --git a/modules/hypernetworks/hypernetwork.py b/modules/hypernetworks/hypernetwork.py
index 8466887f..eb5ae372 100644
--- a/modules/hypernetworks/hypernetwork.py
+++ b/modules/hypernetworks/hypernetwork.py
@@ -495,7 +495,7 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, gradient_step,
if shared.state.interrupted:
break
- with torch.autocast("cuda"):
+ with devices.autocast():
x = batch.latent_sample.to(devices.device, non_blocking=pin_memory)
if tag_drop_out != 0 or shuffle_tags:
shared.sd_model.cond_stage_model.to(devices.device)
--
cgit v1.2.3