From 8f5912984794c4c69e429c4636e984854d911b6a Mon Sep 17 00:00:00 2001
From: Melan
Date: Thu, 20 Oct 2022 22:37:16 +0200
Subject: Some changes to the tensorboard code and hypernetwork support

---
 modules/hypernetworks/hypernetwork.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'modules/hypernetworks')

diff --git a/modules/hypernetworks/hypernetwork.py b/modules/hypernetworks/hypernetwork.py
index 74300122..5e919775 100644
--- a/modules/hypernetworks/hypernetwork.py
+++ b/modules/hypernetworks/hypernetwork.py
@@ -4,6 +4,7 @@ import html
 import os
 import sys
 import traceback
+import tensorboard
 
 import tqdm
 import csv
@@ -18,7 +19,6 @@ import modules.textual_inversion.dataset
 from modules.textual_inversion import textual_inversion
 from modules.textual_inversion.learn_schedule import LearnRateScheduler
 
-
 class HypernetworkModule(torch.nn.Module):
     multiplier = 1.0
 
@@ -291,6 +291,9 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, data_root, log
     scheduler = LearnRateScheduler(learn_rate, steps, ititial_step)
     optimizer = torch.optim.AdamW(weights, lr=scheduler.learn_rate)
 
+    if shared.opts.training_enable_tensorboard:
+        tensorboard_writer = textual_inversion.tensorboard_setup(log_directory)
+
     pbar = tqdm.tqdm(enumerate(ds), total=steps - ititial_step)
     for i, entries in pbar:
         hypernetwork.step = i + ititial_step
@@ -315,6 +318,7 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, data_root, log
         optimizer.zero_grad()
         loss.backward()
         optimizer.step()
+
         mean_loss = losses.mean()
         if torch.isnan(mean_loss):
             raise RuntimeError("Loss diverged.")
@@ -323,6 +327,14 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, data_root, log
         if hypernetwork.step > 0 and hypernetwork_dir is not None and hypernetwork.step % save_hypernetwork_every == 0:
             last_saved_file = os.path.join(hypernetwork_dir, f'{hypernetwork_name}-{hypernetwork.step}.pt')
             hypernetwork.save(last_saved_file)
+
+        if shared.opts.training_enable_tensorboard:
+            epoch_num = hypernetwork.step // len(ds)
+            epoch_step = hypernetwork.step - (epoch_num * len(ds)) + 1
+
+            textual_inversion.tensorboard_add(tensorboard_writer, loss=mean_loss,
+                                              global_step=hypernetwork.step, step=epoch_step,
+                                              learn_rate=scheduler.learn_rate, epoch_num=epoch_num)
 
         textual_inversion.write_loss(log_directory, "hypernetwork_loss.csv", hypernetwork.step, len(ds), {
             "loss": f"{mean_loss:.7f}",
@@ -360,6 +372,10 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, data_root, log
             processed = processing.process_images(p)
             image = processed.images[0] if len(processed.images)>0 else None
 
+            if shared.opts.training_enable_tensorboard and shared.opts.training_tensorboard_save_images:
+                textual_inversion.tensorboard_add_image(tensorboard_writer, f"Validation at epoch {epoch_num}",
+                                                        image, hypernetwork.step)
+
             if unload:
                 shared.sd_model.cond_stage_model.to(devices.cpu)
                 shared.sd_model.first_stage_model.to(devices.cpu)
-- 
cgit v1.2.3
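
Note: the patch calls three helpers — textual_inversion.tensorboard_setup, textual_inversion.tensorboard_add and textual_inversion.tensorboard_add_image — that live in modules/textual_inversion/textual_inversion.py and are not shown here (the diff is limited to 'modules/hypernetworks'). What follows is only a minimal sketch of what such helpers could look like, built on torch.utils.tensorboard.SummaryWriter; the tag names, directory layout and image conversion are assumptions for illustration, not the repository's actual implementation.

    # Hedged sketch, not the repository's code: one plausible shape for the
    # tensorboard_* helpers the hypernetwork training loop calls above.
    import os
    import numpy as np
    from torch.utils.tensorboard import SummaryWriter

    def tensorboard_setup(log_directory):
        # Create a dedicated event-file directory and return a writer the
        # training loop can reuse for every logging call. (Assumed layout.)
        tb_dir = os.path.join(log_directory, "tensorboard")
        os.makedirs(tb_dir, exist_ok=True)
        return SummaryWriter(log_dir=tb_dir)

    def tensorboard_add(tensorboard_writer, loss, global_step, step, learn_rate, epoch_num):
        # Log the scalars the loop passes in; tag names are illustrative only.
        tensorboard_writer.add_scalar("Loss/train", loss, global_step)
        tensorboard_writer.add_scalar(f"Loss/Epoch-{epoch_num}", loss, step)
        tensorboard_writer.add_scalar("Learn rate/train", learn_rate, global_step)
        tensorboard_writer.add_scalar("Epoch", epoch_num, global_step)

    def tensorboard_add_image(tensorboard_writer, tag, pil_image, step):
        # SummaryWriter.add_image expects array data, not a PIL image, so
        # convert first and state the channel layout explicitly.
        img = np.asarray(pil_image)
        tensorboard_writer.add_image(tag, img, step, dataformats="HWC")

With helpers along these lines, the blocks guarded by shared.opts.training_enable_tensorboard in the patch would produce per-step loss and learn-rate curves plus periodic validation images viewable with `tensorboard --logdir <log_directory>/tensorboard` (directory name assumed as above).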