diff --git a/.gitignore b/.gitignore index ecef2713bc..33f0de4df2 100644 --- a/.gitignore +++ b/.gitignore @@ -199,7 +199,13 @@ checkpoints .scratch/ .vscode/ gfpgan/ -models/ldm/stable-diffusion-v1/model.sha256 +models/ldm/stable-diffusion-v1/*.sha256 # GFPGAN model files gfpgan/ + +# config file (will be created by installer) +configs/models.yaml + +# weights (will be created by installer) +models/ldm/stable-diffusion-v1/*.ckpt \ No newline at end of file diff --git a/configs/autoencoder/autoencoder_kl_16x16x16.yaml b/configs/autoencoder/autoencoder_kl_16x16x16.yaml deleted file mode 100644 index 5f1d10ec75..0000000000 --- a/configs/autoencoder/autoencoder_kl_16x16x16.yaml +++ /dev/null @@ -1,54 +0,0 @@ -model: - base_learning_rate: 4.5e-6 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: "val/rec_loss" - embed_dim: 16 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 0.000001 - disc_weight: 0.5 - - ddconfig: - double_z: True - z_channels: 16 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1 - num_res_blocks: 2 - attn_resolutions: [16] - dropout: 0.0 - - -data: - target: main.DataModuleFromConfig - params: - batch_size: 12 - wrap: True - train: - target: ldm.data.imagenet.ImageNetSRTrain - params: - size: 256 - degradation: pil_nearest - validation: - target: ldm.data.imagenet.ImageNetSRValidation - params: - size: 256 - degradation: pil_nearest - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 1000 - max_images: 8 - increase_log_steps: True - - trainer: - benchmark: True - accumulate_grad_batches: 2 diff --git a/configs/autoencoder/autoencoder_kl_32x32x4.yaml b/configs/autoencoder/autoencoder_kl_32x32x4.yaml deleted file mode 100644 index ab8b36fe6e..0000000000 --- a/configs/autoencoder/autoencoder_kl_32x32x4.yaml +++ /dev/null @@ -1,53 +0,0 @@ -model: - base_learning_rate: 4.5e-6 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: "val/rec_loss" - embed_dim: 4 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 0.000001 - disc_weight: 0.5 - - ddconfig: - double_z: True - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: [ 1,2,4,4 ] # num_down = len(ch_mult)-1 - num_res_blocks: 2 - attn_resolutions: [ ] - dropout: 0.0 - -data: - target: main.DataModuleFromConfig - params: - batch_size: 12 - wrap: True - train: - target: ldm.data.imagenet.ImageNetSRTrain - params: - size: 256 - degradation: pil_nearest - validation: - target: ldm.data.imagenet.ImageNetSRValidation - params: - size: 256 - degradation: pil_nearest - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 1000 - max_images: 8 - increase_log_steps: True - - trainer: - benchmark: True - accumulate_grad_batches: 2 diff --git a/configs/autoencoder/autoencoder_kl_64x64x3.yaml b/configs/autoencoder/autoencoder_kl_64x64x3.yaml deleted file mode 100644 index 5e3db5c4e2..0000000000 --- a/configs/autoencoder/autoencoder_kl_64x64x3.yaml +++ /dev/null @@ -1,54 +0,0 @@ -model: - base_learning_rate: 4.5e-6 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: "val/rec_loss" - embed_dim: 3 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 0.000001 - disc_weight: 0.5 - - ddconfig: - double_z: True - z_channels: 3 - resolution: 256 - 
in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1 - num_res_blocks: 2 - attn_resolutions: [ ] - dropout: 0.0 - - -data: - target: main.DataModuleFromConfig - params: - batch_size: 12 - wrap: True - train: - target: ldm.data.imagenet.ImageNetSRTrain - params: - size: 256 - degradation: pil_nearest - validation: - target: ldm.data.imagenet.ImageNetSRValidation - params: - size: 256 - degradation: pil_nearest - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 1000 - max_images: 8 - increase_log_steps: True - - trainer: - benchmark: True - accumulate_grad_batches: 2 diff --git a/configs/autoencoder/autoencoder_kl_8x8x64.yaml b/configs/autoencoder/autoencoder_kl_8x8x64.yaml deleted file mode 100644 index 5ccd09d38e..0000000000 --- a/configs/autoencoder/autoencoder_kl_8x8x64.yaml +++ /dev/null @@ -1,53 +0,0 @@ -model: - base_learning_rate: 4.5e-6 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: "val/rec_loss" - embed_dim: 64 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 0.000001 - disc_weight: 0.5 - - ddconfig: - double_z: True - z_channels: 64 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: [ 1,1,2,2,4,4] # num_down = len(ch_mult)-1 - num_res_blocks: 2 - attn_resolutions: [16,8] - dropout: 0.0 - -data: - target: main.DataModuleFromConfig - params: - batch_size: 12 - wrap: True - train: - target: ldm.data.imagenet.ImageNetSRTrain - params: - size: 256 - degradation: pil_nearest - validation: - target: ldm.data.imagenet.ImageNetSRValidation - params: - size: 256 - degradation: pil_nearest - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 1000 - max_images: 8 - increase_log_steps: True - - trainer: - benchmark: True - accumulate_grad_batches: 2 diff --git a/configs/latent-diffusion/celebahq-ldm-vq-4.yaml b/configs/latent-diffusion/celebahq-ldm-vq-4.yaml deleted file mode 100644 index 89b3df4fe1..0000000000 --- a/configs/latent-diffusion/celebahq-ldm-vq-4.yaml +++ /dev/null @@ -1,86 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - image_size: 64 - channels: 3 - monitor: val/loss_simple_ema - - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 224 - attention_resolutions: - # note: this isn\t actually the resolution but - # the downsampling factor, i.e. 
this corresnponds to - # attention on spatial resolution 8,16,32, as the - # spatial reolution of the latents is 64 for f4 - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ckpt_path: models/first_stage_models/vq-f4/model.ckpt - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: __is_unconditional__ -data: - target: main.DataModuleFromConfig - params: - batch_size: 48 - num_workers: 5 - wrap: false - train: - target: taming.data.faceshq.CelebAHQTrain - params: - size: 256 - validation: - target: taming.data.faceshq.CelebAHQValidation - params: - size: 256 - - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 5000 - max_images: 8 - increase_log_steps: False - - trainer: - benchmark: True \ No newline at end of file diff --git a/configs/latent-diffusion/cin-ldm-vq-f8.yaml b/configs/latent-diffusion/cin-ldm-vq-f8.yaml deleted file mode 100644 index b8cd9e2ef5..0000000000 --- a/configs/latent-diffusion/cin-ldm-vq-f8.yaml +++ /dev/null @@ -1,98 +0,0 @@ -model: - base_learning_rate: 1.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: class_label - image_size: 32 - channels: 4 - cond_stage_trainable: true - conditioning_key: crossattn - monitor: val/loss_simple_ema - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 32 - in_channels: 4 - out_channels: 4 - model_channels: 256 - attention_resolutions: - #note: this isn\t actually the resolution but - # the downsampling factor, i.e. 
this corresnponds to - # attention on spatial resolution 8,16,32, as the - # spatial reolution of the latents is 32 for f8 - - 4 - - 2 - - 1 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 4 - num_head_channels: 32 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 512 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 4 - n_embed: 16384 - ckpt_path: configs/first_stage_models/vq-f8/model.yaml - ddconfig: - double_z: false - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 32 - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: ldm.modules.encoders.modules.ClassEmbedder - params: - embed_dim: 512 - key: class_label -data: - target: main.DataModuleFromConfig - params: - batch_size: 64 - num_workers: 12 - wrap: false - train: - target: ldm.data.imagenet.ImageNetTrain - params: - config: - size: 256 - validation: - target: ldm.data.imagenet.ImageNetValidation - params: - config: - size: 256 - - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 5000 - max_images: 8 - increase_log_steps: False - - trainer: - benchmark: True \ No newline at end of file diff --git a/configs/latent-diffusion/cin256-v2.yaml b/configs/latent-diffusion/cin256-v2.yaml deleted file mode 100644 index b7c1aa240c..0000000000 --- a/configs/latent-diffusion/cin256-v2.yaml +++ /dev/null @@ -1,68 +0,0 @@ -model: - base_learning_rate: 0.0001 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: class_label - image_size: 64 - channels: 3 - cond_stage_trainable: true - conditioning_key: crossattn - monitor: val/loss - use_ema: False - - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 192 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 5 - num_heads: 1 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 512 - - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - - cond_stage_config: - target: ldm.modules.encoders.modules.ClassEmbedder - params: - n_classes: 1001 - embed_dim: 512 - key: class_label diff --git a/configs/latent-diffusion/ffhq-ldm-vq-4.yaml b/configs/latent-diffusion/ffhq-ldm-vq-4.yaml deleted file mode 100644 index 1899e30f77..0000000000 --- a/configs/latent-diffusion/ffhq-ldm-vq-4.yaml +++ /dev/null @@ -1,85 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - image_size: 64 - channels: 3 - monitor: val/loss_simple_ema - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 224 - attention_resolutions: - # note: this isn\t actually the resolution but - # the downsampling 
factor, i.e. this corresnponds to - # attention on spatial resolution 8,16,32, as the - # spatial reolution of the latents is 64 for f4 - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ckpt_path: configs/first_stage_models/vq-f4/model.yaml - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: __is_unconditional__ -data: - target: main.DataModuleFromConfig - params: - batch_size: 42 - num_workers: 5 - wrap: false - train: - target: taming.data.faceshq.FFHQTrain - params: - size: 256 - validation: - target: taming.data.faceshq.FFHQValidation - params: - size: 256 - - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 5000 - max_images: 8 - increase_log_steps: False - - trainer: - benchmark: True \ No newline at end of file diff --git a/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml b/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml deleted file mode 100644 index c4ca66c16c..0000000000 --- a/configs/latent-diffusion/lsun_bedrooms-ldm-vq-4.yaml +++ /dev/null @@ -1,85 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - image_size: 64 - channels: 3 - monitor: val/loss_simple_ema - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 224 - attention_resolutions: - # note: this isn\t actually the resolution but - # the downsampling factor, i.e. 
this corresnponds to - # attention on spatial resolution 8,16,32, as the - # spatial reolution of the latents is 64 for f4 - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - ckpt_path: configs/first_stage_models/vq-f4/model.yaml - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: __is_unconditional__ -data: - target: main.DataModuleFromConfig - params: - batch_size: 48 - num_workers: 5 - wrap: false - train: - target: ldm.data.lsun.LSUNBedroomsTrain - params: - size: 256 - validation: - target: ldm.data.lsun.LSUNBedroomsValidation - params: - size: 256 - - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 5000 - max_images: 8 - increase_log_steps: False - - trainer: - benchmark: True \ No newline at end of file diff --git a/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml b/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml deleted file mode 100644 index 18dc8c2d9c..0000000000 --- a/configs/latent-diffusion/lsun_churches-ldm-kl-8.yaml +++ /dev/null @@ -1,91 +0,0 @@ -model: - base_learning_rate: 5.0e-5 # set to target_lr by starting main.py with '--scale_lr False' - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0155 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - loss_type: l1 - first_stage_key: "image" - cond_stage_key: "image" - image_size: 32 - channels: 4 - cond_stage_trainable: False - concat_mode: False - scale_by_std: True - monitor: 'val/loss_simple_ema' - - scheduler_config: # 10000 warmup steps - target: ldm.lr_scheduler.LambdaLinearScheduler - params: - warm_up_steps: [10000] - cycle_lengths: [10000000000000] - f_start: [1.e-6] - f_max: [1.] - f_min: [ 1.] 
- - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 32 - in_channels: 4 - out_channels: 4 - model_channels: 192 - attention_resolutions: [ 1, 2, 4, 8 ] # 32, 16, 8, 4 - num_res_blocks: 2 - channel_mult: [ 1,2,2,4,4 ] # 32, 16, 8, 4, 2 - num_heads: 8 - use_scale_shift_norm: True - resblock_updown: True - - first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: "val/rec_loss" - ckpt_path: "models/first_stage_models/kl-f8/model.ckpt" - ddconfig: - double_z: True - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: [ 1,2,4,4 ] # num_down = len(ch_mult)-1 - num_res_blocks: 2 - attn_resolutions: [ ] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - - cond_stage_config: "__is_unconditional__" - -data: - target: main.DataModuleFromConfig - params: - batch_size: 96 - num_workers: 5 - wrap: False - train: - target: ldm.data.lsun.LSUNChurchesTrain - params: - size: 256 - validation: - target: ldm.data.lsun.LSUNChurchesValidation - params: - size: 256 - -lightning: - callbacks: - image_logger: - target: main.ImageLogger - params: - batch_frequency: 5000 - max_images: 8 - increase_log_steps: False - - - trainer: - benchmark: True \ No newline at end of file diff --git a/configs/latent-diffusion/txt2img-1p4B-eval.yaml b/configs/latent-diffusion/txt2img-1p4B-eval.yaml deleted file mode 100644 index 8e331cbfdf..0000000000 --- a/configs/latent-diffusion/txt2img-1p4B-eval.yaml +++ /dev/null @@ -1,71 +0,0 @@ -model: - base_learning_rate: 5.0e-05 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.00085 - linear_end: 0.012 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: caption - image_size: 32 - channels: 4 - cond_stage_trainable: true - conditioning_key: crossattn - monitor: val/loss_simple_ema - scale_factor: 0.18215 - use_ema: False - - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 32 - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: - - 4 - - 2 - - 1 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 4 - - 4 - num_heads: 8 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 1280 - use_checkpoint: true - legacy: False - - first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - - cond_stage_config: - target: ldm.modules.encoders.modules.BERTEmbedder - params: - n_embed: 1280 - n_layer: 32 diff --git a/configs/models.yaml b/configs/models.yaml index 162da38da2..cb4191c503 100644 --- a/configs/models.yaml +++ b/configs/models.yaml @@ -1,5 +1,5 @@ # This file describes the alternative machine learning models -# available to the dream script. +# available to InvokeAI script. # # To add a new model, follow the examples below. 
Each # model requires a model config file, a weights file, @@ -8,22 +8,29 @@ stable-diffusion-1.4: config: configs/stable-diffusion/v1-inference.yaml weights: models/ldm/stable-diffusion-v1/model.ckpt -# vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt + vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt description: Stable Diffusion inference model version 1.4 width: 512 height: 512 +stable-diffusion-1.5: + description: The newest Stable Diffusion version 1.5 weight file (4.27 GB) + weights: ./models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt + config: ./configs/stable-diffusion/v1-inference.yaml + width: 512 + height: 512 + vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt default: true inpainting-1.5: - description: runwayML tuned inpainting model v1.5 - weights: models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt - config: configs/stable-diffusion/v1-inpainting-inference.yaml -# vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt + description: RunwayML SD 1.5 model optimized for inpainting (4.27 GB) + weights: ./models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt + config: ./configs/stable-diffusion/v1-inpainting-inference.yaml width: 512 height: 512 -stable-diffusion-1.5: - config: configs/stable-diffusion/v1-inference.yaml - weights: models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt -# vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt - description: Stable Diffusion inference model version 1.5 + vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt +waifu-diffusion-1.3: + description: Stable Diffusion 1.4 fine tuned on anime-styled images (4.27 GB) + weights: ./models/ldm/stable-diffusion-v1/model-epoch09-float32.ckpt + config: ./configs/stable-diffusion/v1-inference.yaml width: 512 height: 512 + vae: ./models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt diff --git a/configs/retrieval-augmented-diffusion/768x768.yaml b/configs/retrieval-augmented-diffusion/768x768.yaml deleted file mode 100644 index b51b1d8373..0000000000 --- a/configs/retrieval-augmented-diffusion/768x768.yaml +++ /dev/null @@ -1,68 +0,0 @@ -model: - base_learning_rate: 0.0001 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.015 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: jpg - cond_stage_key: nix - image_size: 48 - channels: 16 - cond_stage_trainable: false - conditioning_key: crossattn - monitor: val/loss_simple_ema - scale_by_std: false - scale_factor: 0.22765929 - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 48 - in_channels: 16 - out_channels: 16 - model_channels: 448 - attention_resolutions: - - 4 - - 2 - - 1 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - use_scale_shift_norm: false - resblock_updown: false - num_head_channels: 32 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 768 - use_checkpoint: true - first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: val/rec_loss - embed_dim: 16 - ddconfig: - double_z: true - z_channels: 16 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 16 - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: torch.nn.Identity \ No newline at end of file diff --git a/docs/features/INSTALLING_MODELS.md
b/docs/features/INSTALLING_MODELS.md new file mode 100644 index 0000000000..aa04cc14b7 --- /dev/null +++ b/docs/features/INSTALLING_MODELS.md @@ -0,0 +1,9 @@ +--- +title: Installing Models +--- + +# :octicons-paintbrush-16: Installing Models + +## TO COME + + diff --git a/ldm/invoke/model_cache.py b/ldm/invoke/model_cache.py index f972a9eb16..a5aa343303 100644 --- a/ldm/invoke/model_cache.py +++ b/ldm/invoke/model_cache.py @@ -281,7 +281,7 @@ class ModelCache(object): Returns the preamble for the config file. ''' return '''# This file describes the alternative machine learning models -# available to the dream script. +# available to InvokeAI script. # # To add a new model, follow the examples below. Each # model requires a model config file, a weights file, diff --git a/models/first_stage_models/kl-f16/config.yaml b/models/first_stage_models/kl-f16/config.yaml deleted file mode 100644 index 661921cf75..0000000000 --- a/models/first_stage_models/kl-f16/config.yaml +++ /dev/null @@ -1,44 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: val/rec_loss - embed_dim: 16 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 1.0e-06 - disc_weight: 0.5 - ddconfig: - double_z: true - z_channels: 16 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 16 - dropout: 0.0 -data: - target: main.DataModuleFromConfig - params: - batch_size: 6 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/first_stage_models/kl-f32/config.yaml b/models/first_stage_models/kl-f32/config.yaml deleted file mode 100644 index 7b642b136a..0000000000 --- a/models/first_stage_models/kl-f32/config.yaml +++ /dev/null @@ -1,46 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: val/rec_loss - embed_dim: 64 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 1.0e-06 - disc_weight: 0.5 - ddconfig: - double_z: true - z_channels: 64 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 1 - - 2 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 16 - - 8 - dropout: 0.0 -data: - target: main.DataModuleFromConfig - params: - batch_size: 6 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/first_stage_models/kl-f4/config.yaml b/models/first_stage_models/kl-f4/config.yaml deleted file mode 100644 index 85cfb3e94e..0000000000 --- a/models/first_stage_models/kl-f4/config.yaml +++ /dev/null @@ -1,41 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: val/rec_loss - embed_dim: 3 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 1.0e-06 - disc_weight: 0.5 - ddconfig: - double_z: true - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 -data: - target: main.DataModuleFromConfig - params: - 
batch_size: 10 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/first_stage_models/kl-f8/config.yaml b/models/first_stage_models/kl-f8/config.yaml deleted file mode 100644 index 921aa42533..0000000000 --- a/models/first_stage_models/kl-f8/config.yaml +++ /dev/null @@ -1,42 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.AutoencoderKL - params: - monitor: val/rec_loss - embed_dim: 4 - lossconfig: - target: ldm.modules.losses.LPIPSWithDiscriminator - params: - disc_start: 50001 - kl_weight: 1.0e-06 - disc_weight: 0.5 - ddconfig: - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 -data: - target: main.DataModuleFromConfig - params: - batch_size: 4 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/first_stage_models/vq-f16/config.yaml b/models/first_stage_models/vq-f16/config.yaml deleted file mode 100644 index 91c7454906..0000000000 --- a/models/first_stage_models/vq-f16/config.yaml +++ /dev/null @@ -1,49 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.VQModel - params: - embed_dim: 8 - n_embed: 16384 - ddconfig: - double_z: false - z_channels: 8 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 16 - dropout: 0.0 - lossconfig: - target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator - params: - disc_conditional: false - disc_in_channels: 3 - disc_start: 250001 - disc_weight: 0.75 - disc_num_layers: 2 - codebook_weight: 1.0 - -data: - target: main.DataModuleFromConfig - params: - batch_size: 14 - num_workers: 20 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/first_stage_models/vq-f4-noattn/config.yaml b/models/first_stage_models/vq-f4-noattn/config.yaml deleted file mode 100644 index f8e499fa2a..0000000000 --- a/models/first_stage_models/vq-f4-noattn/config.yaml +++ /dev/null @@ -1,46 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.VQModel - params: - embed_dim: 3 - n_embed: 8192 - monitor: val/rec_loss - - ddconfig: - attn_type: none - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator - params: - disc_conditional: false - disc_in_channels: 3 - disc_start: 11 - disc_weight: 0.75 - codebook_weight: 1.0 - -data: - target: main.DataModuleFromConfig - params: - batch_size: 8 - num_workers: 12 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - crop_size: 256 diff --git a/models/first_stage_models/vq-f4/config.yaml b/models/first_stage_models/vq-f4/config.yaml deleted file mode 
100644 index 7d8cef3252..0000000000 --- a/models/first_stage_models/vq-f4/config.yaml +++ /dev/null @@ -1,45 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.VQModel - params: - embed_dim: 3 - n_embed: 8192 - monitor: val/rec_loss - - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator - params: - disc_conditional: false - disc_in_channels: 3 - disc_start: 0 - disc_weight: 0.75 - codebook_weight: 1.0 - -data: - target: main.DataModuleFromConfig - params: - batch_size: 8 - num_workers: 16 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - crop_size: 256 diff --git a/models/first_stage_models/vq-f8-n256/config.yaml b/models/first_stage_models/vq-f8-n256/config.yaml deleted file mode 100644 index 8519e13d61..0000000000 --- a/models/first_stage_models/vq-f8-n256/config.yaml +++ /dev/null @@ -1,48 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.VQModel - params: - embed_dim: 4 - n_embed: 256 - monitor: val/rec_loss - ddconfig: - double_z: false - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 32 - dropout: 0.0 - lossconfig: - target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator - params: - disc_conditional: false - disc_in_channels: 3 - disc_start: 250001 - disc_weight: 0.75 - codebook_weight: 1.0 - -data: - target: main.DataModuleFromConfig - params: - batch_size: 10 - num_workers: 20 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/first_stage_models/vq-f8/config.yaml b/models/first_stage_models/vq-f8/config.yaml deleted file mode 100644 index efd6801ca9..0000000000 --- a/models/first_stage_models/vq-f8/config.yaml +++ /dev/null @@ -1,48 +0,0 @@ -model: - base_learning_rate: 4.5e-06 - target: ldm.models.autoencoder.VQModel - params: - embed_dim: 4 - n_embed: 16384 - monitor: val/rec_loss - ddconfig: - double_z: false - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 32 - dropout: 0.0 - lossconfig: - target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator - params: - disc_conditional: false - disc_in_channels: 3 - disc_num_layers: 2 - disc_start: 1 - disc_weight: 0.6 - codebook_weight: 1.0 -data: - target: main.DataModuleFromConfig - params: - batch_size: 10 - num_workers: 20 - wrap: true - train: - target: ldm.data.openimages.FullOpenImagesTrain - params: - size: 384 - crop_size: 256 - validation: - target: ldm.data.openimages.FullOpenImagesValidation - params: - size: 384 - crop_size: 256 diff --git a/models/ldm/bsr_sr/config.yaml b/models/ldm/bsr_sr/config.yaml deleted file mode 100644 index 861692a8d1..0000000000 --- a/models/ldm/bsr_sr/config.yaml +++ /dev/null @@ -1,80 +0,0 @@ -model: - base_learning_rate: 1.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0155 - log_every_t: 100 - timesteps: 1000 - loss_type: l2 - 
first_stage_key: image - cond_stage_key: LR_image - image_size: 64 - channels: 3 - concat_mode: true - cond_stage_trainable: false - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 6 - out_channels: 3 - model_channels: 160 - attention_resolutions: - - 16 - - 8 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 2 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - monitor: val/rec_loss - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: torch.nn.Identity -data: - target: main.DataModuleFromConfig - params: - batch_size: 64 - wrap: false - num_workers: 12 - train: - target: ldm.data.openimages.SuperresOpenImagesAdvancedTrain - params: - size: 256 - degradation: bsrgan_light - downscale_f: 4 - min_crop_f: 0.5 - max_crop_f: 1.0 - random_crop: true - validation: - target: ldm.data.openimages.SuperresOpenImagesAdvancedValidation - params: - size: 256 - degradation: bsrgan_light - downscale_f: 4 - min_crop_f: 0.5 - max_crop_f: 1.0 - random_crop: true diff --git a/models/ldm/celeba256/config.yaml b/models/ldm/celeba256/config.yaml deleted file mode 100644 index a12f4e9d39..0000000000 --- a/models/ldm/celeba256/config.yaml +++ /dev/null @@ -1,70 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: class_label - image_size: 64 - channels: 3 - cond_stage_trainable: false - concat_mode: false - monitor: val/loss - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 224 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: __is_unconditional__ -data: - target: main.DataModuleFromConfig - params: - batch_size: 48 - num_workers: 5 - wrap: false - train: - target: ldm.data.faceshq.CelebAHQTrain - params: - size: 256 - validation: - target: ldm.data.faceshq.CelebAHQValidation - params: - size: 256 diff --git a/models/ldm/cin256/config.yaml b/models/ldm/cin256/config.yaml deleted file mode 100644 index 9bc1b4566a..0000000000 --- a/models/ldm/cin256/config.yaml +++ /dev/null @@ -1,80 +0,0 @@ -model: - base_learning_rate: 1.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: class_label - image_size: 32 - channels: 4 - cond_stage_trainable: true - conditioning_key: crossattn - monitor: val/loss_simple_ema - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 32 - in_channels: 4 - out_channels: 4 - 
model_channels: 256 - attention_resolutions: - - 4 - - 2 - - 1 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 4 - num_head_channels: 32 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 512 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 4 - n_embed: 16384 - ddconfig: - double_z: false - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: - - 32 - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: ldm.modules.encoders.modules.ClassEmbedder - params: - embed_dim: 512 - key: class_label -data: - target: main.DataModuleFromConfig - params: - batch_size: 64 - num_workers: 12 - wrap: false - train: - target: ldm.data.imagenet.ImageNetTrain - params: - config: - size: 256 - validation: - target: ldm.data.imagenet.ImageNetValidation - params: - config: - size: 256 diff --git a/models/ldm/ffhq256/config.yaml b/models/ldm/ffhq256/config.yaml deleted file mode 100644 index 0ddfd1b93e..0000000000 --- a/models/ldm/ffhq256/config.yaml +++ /dev/null @@ -1,70 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: class_label - image_size: 64 - channels: 3 - cond_stage_trainable: false - concat_mode: false - monitor: val/loss - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 224 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: __is_unconditional__ -data: - target: main.DataModuleFromConfig - params: - batch_size: 42 - num_workers: 5 - wrap: false - train: - target: ldm.data.faceshq.FFHQTrain - params: - size: 256 - validation: - target: ldm.data.faceshq.FFHQValidation - params: - size: 256 diff --git a/models/ldm/inpainting_big/config.yaml b/models/ldm/inpainting_big/config.yaml deleted file mode 100644 index da5fd5ea50..0000000000 --- a/models/ldm/inpainting_big/config.yaml +++ /dev/null @@ -1,67 +0,0 @@ -model: - base_learning_rate: 1.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0205 - log_every_t: 100 - timesteps: 1000 - loss_type: l1 - first_stage_key: image - cond_stage_key: masked_image - image_size: 64 - channels: 3 - concat_mode: true - monitor: val/loss - scheduler_config: - target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler - params: - verbosity_interval: 0 - warm_up_steps: 1000 - max_decay_steps: 50000 - lr_start: 0.001 - lr_max: 0.1 - lr_min: 0.0001 - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 7 - out_channels: 3 - model_channels: 256 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_heads: 8 - resblock_updown: true - first_stage_config: - target: 
ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - monitor: val/rec_loss - ddconfig: - attn_type: none - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: ldm.modules.losses.contperceptual.DummyLoss - cond_stage_config: __is_first_stage__ diff --git a/models/ldm/layout2img-openimages256/config.yaml b/models/ldm/layout2img-openimages256/config.yaml deleted file mode 100644 index 9e1dc15fe2..0000000000 --- a/models/ldm/layout2img-openimages256/config.yaml +++ /dev/null @@ -1,81 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0205 - log_every_t: 100 - timesteps: 1000 - loss_type: l1 - first_stage_key: image - cond_stage_key: coordinates_bbox - image_size: 64 - channels: 3 - conditioning_key: crossattn - cond_stage_trainable: true - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 128 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - use_spatial_transformer: true - transformer_depth: 3 - context_dim: 512 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - monitor: val/rec_loss - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: ldm.modules.encoders.modules.BERTEmbedder - params: - n_embed: 512 - n_layer: 16 - vocab_size: 8192 - max_seq_len: 92 - use_tokenizer: false - monitor: val/loss_simple_ema -data: - target: main.DataModuleFromConfig - params: - batch_size: 24 - wrap: false - num_workers: 10 - train: - target: ldm.data.openimages.OpenImagesBBoxTrain - params: - size: 256 - validation: - target: ldm.data.openimages.OpenImagesBBoxValidation - params: - size: 256 diff --git a/models/ldm/lsun_beds256/config.yaml b/models/ldm/lsun_beds256/config.yaml deleted file mode 100644 index 1a50c766a5..0000000000 --- a/models/ldm/lsun_beds256/config.yaml +++ /dev/null @@ -1,70 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: class_label - image_size: 64 - channels: 3 - cond_stage_trainable: false - concat_mode: false - monitor: val/loss - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 224 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 4 - num_head_channels: 32 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: __is_unconditional__ -data: - target: main.DataModuleFromConfig - params: - batch_size: 48 - 
num_workers: 5 - wrap: false - train: - target: ldm.data.lsun.LSUNBedroomsTrain - params: - size: 256 - validation: - target: ldm.data.lsun.LSUNBedroomsValidation - params: - size: 256 diff --git a/models/ldm/lsun_churches256/config.yaml b/models/ldm/lsun_churches256/config.yaml deleted file mode 100644 index 424d0914c9..0000000000 --- a/models/ldm/lsun_churches256/config.yaml +++ /dev/null @@ -1,92 +0,0 @@ -model: - base_learning_rate: 5.0e-05 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0155 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - loss_type: l1 - first_stage_key: image - cond_stage_key: image - image_size: 32 - channels: 4 - cond_stage_trainable: false - concat_mode: false - scale_by_std: true - monitor: val/loss_simple_ema - scheduler_config: - target: ldm.lr_scheduler.LambdaLinearScheduler - params: - warm_up_steps: - - 10000 - cycle_lengths: - - 10000000000000 - f_start: - - 1.0e-06 - f_max: - - 1.0 - f_min: - - 1.0 - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 32 - in_channels: 4 - out_channels: 4 - model_channels: 192 - attention_resolutions: - - 1 - - 2 - - 4 - - 8 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 2 - - 4 - - 4 - num_heads: 8 - use_scale_shift_norm: true - resblock_updown: true - first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - - cond_stage_config: '__is_unconditional__' - -data: - target: main.DataModuleFromConfig - params: - batch_size: 96 - num_workers: 5 - wrap: false - train: - target: ldm.data.lsun.LSUNChurchesTrain - params: - size: 256 - validation: - target: ldm.data.lsun.LSUNChurchesValidation - params: - size: 256 diff --git a/models/ldm/semantic_synthesis256/config.yaml b/models/ldm/semantic_synthesis256/config.yaml deleted file mode 100644 index 1a721cfffa..0000000000 --- a/models/ldm/semantic_synthesis256/config.yaml +++ /dev/null @@ -1,59 +0,0 @@ -model: - base_learning_rate: 1.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0205 - log_every_t: 100 - timesteps: 1000 - loss_type: l1 - first_stage_key: image - cond_stage_key: segmentation - image_size: 64 - channels: 3 - concat_mode: true - cond_stage_trainable: true - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 6 - out_channels: 3 - model_channels: 128 - attention_resolutions: - - 32 - - 16 - - 8 - num_res_blocks: 2 - channel_mult: - - 1 - - 4 - - 8 - num_heads: 8 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: ldm.modules.encoders.modules.SpatialRescaler - params: - n_stages: 2 - in_channels: 182 - out_channels: 3 diff --git a/models/ldm/semantic_synthesis512/config.yaml b/models/ldm/semantic_synthesis512/config.yaml deleted file mode 100644 index 8faded2eec..0000000000 --- 
a/models/ldm/semantic_synthesis512/config.yaml +++ /dev/null @@ -1,78 +0,0 @@ -model: - base_learning_rate: 1.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0205 - log_every_t: 100 - timesteps: 1000 - loss_type: l1 - first_stage_key: image - cond_stage_key: segmentation - image_size: 128 - channels: 3 - concat_mode: true - cond_stage_trainable: true - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 128 - in_channels: 6 - out_channels: 3 - model_channels: 128 - attention_resolutions: - - 32 - - 16 - - 8 - num_res_blocks: 2 - channel_mult: - - 1 - - 4 - - 8 - num_heads: 8 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - monitor: val/rec_loss - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: ldm.modules.encoders.modules.SpatialRescaler - params: - n_stages: 2 - in_channels: 182 - out_channels: 3 -data: - target: main.DataModuleFromConfig - params: - batch_size: 8 - wrap: false - num_workers: 10 - train: - target: ldm.data.landscapes.RFWTrain - params: - size: 768 - crop_size: 512 - segmentation_to_float32: true - validation: - target: ldm.data.landscapes.RFWValidation - params: - size: 768 - crop_size: 512 - segmentation_to_float32: true diff --git a/models/ldm/text2img256/config.yaml b/models/ldm/text2img256/config.yaml deleted file mode 100644 index 3f54a01515..0000000000 --- a/models/ldm/text2img256/config.yaml +++ /dev/null @@ -1,77 +0,0 @@ -model: - base_learning_rate: 2.0e-06 - target: ldm.models.diffusion.ddpm.LatentDiffusion - params: - linear_start: 0.0015 - linear_end: 0.0195 - num_timesteps_cond: 1 - log_every_t: 200 - timesteps: 1000 - first_stage_key: image - cond_stage_key: caption - image_size: 64 - channels: 3 - cond_stage_trainable: true - conditioning_key: crossattn - monitor: val/loss_simple_ema - unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - image_size: 64 - in_channels: 3 - out_channels: 3 - model_channels: 192 - attention_resolutions: - - 8 - - 4 - - 2 - num_res_blocks: 2 - channel_mult: - - 1 - - 2 - - 3 - - 5 - num_head_channels: 32 - use_spatial_transformer: true - transformer_depth: 1 - context_dim: 640 - first_stage_config: - target: ldm.models.autoencoder.VQModelInterface - params: - embed_dim: 3 - n_embed: 8192 - ddconfig: - double_z: false - z_channels: 3 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity - cond_stage_config: - target: ldm.modules.encoders.modules.BERTEmbedder - params: - n_embed: 640 - n_layer: 32 -data: - target: main.DataModuleFromConfig - params: - batch_size: 28 - num_workers: 10 - wrap: false - train: - target: ldm.data.previews.pytorch_dataset.PreviewsTrain - params: - size: 256 - validation: - target: ldm.data.previews.pytorch_dataset.PreviewsValidation - params: - size: 256 diff --git a/scripts/preload_models.py b/scripts/preload_models.py index bf0a5ffb99..e5a4850a8b 100644 --- a/scripts/preload_models.py +++ b/scripts/preload_models.py @@ -3,9 +3,11 @@ # Before running stable-diffusion on an internet-isolated machine, # run this script from one with internet connectivity. 
The # two machines must share a common .cache directory. -from transformers import CLIPTokenizer, CLIPTextModel +# +# Coauthor: Kevin Turner http://github.com/keturn +# +print('Loading Python libraries...\n') import clip -from transformers import BertTokenizerFast, AutoFeatureExtractor import sys import transformers import os @@ -14,9 +16,247 @@ import torch import urllib.request import zipfile import traceback +import getpass +from omegaconf import OmegaConf +from pathlib import Path +from transformers import CLIPTokenizer, CLIPTextModel +from transformers import BertTokenizerFast, AutoFeatureExtractor +from huggingface_hub import hf_hub_download, HfFolder, hf_hub_url transformers.logging.set_verbosity_error() +#--------------------------globals-- +Model_dir = './models/ldm/stable-diffusion-v1/' +Config_file = './configs/models.yaml' +SD_Configs = './configs/stable-diffusion' +Datasets = { + 'stable-diffusion-1.5': { + 'description': 'The newest Stable Diffusion version 1.5 weight file (4.27 GB)', + 'repo_id': 'runwayml/stable-diffusion-v1-5', + 'config': 'v1-inference.yaml', + 'file': 'v1-5-pruned-emaonly.ckpt', + 'recommended': True, + 'width': 512, + 'height': 512, + }, + 'inpainting-1.5': { + 'description': 'RunwayML SD 1.5 model optimized for inpainting (4.27 GB)', + 'repo_id': 'runwayml/stable-diffusion-inpainting', + 'config': 'v1-inpainting-inference.yaml', + 'file': 'sd-v1-5-inpainting.ckpt', + 'recommended': True, + 'width': 512, + 'height': 512, + }, + 'stable-diffusion-1.4': { + 'description': 'The original Stable Diffusion version 1.4 weight file (4.27 GB)', + 'repo_id': 'CompVis/stable-diffusion-v-1-4-original', + 'config': 'v1-inference.yaml', + 'file': 'sd-v1-4.ckpt', + 'recommended': False, + 'width': 512, + 'height': 512, + }, + 'waifu-diffusion-1.3': { + 'description': 'Stable Diffusion 1.4 fine tuned on anime-styled images (4.27 GB)', + 'repo_id': 'hakurei/waifu-diffusion-v1-3', + 'config': 'v1-inference.yaml', + 'file': 'model-epoch09-float32.ckpt', + 'recommended': False, + 'width': 512, + 'height': 512, + }, + 'ft-mse-improved-autoencoder-840000': { + 'description': 'StabilityAI improved autoencoder fine-tuned for human faces (recommended; 335 MB)', + 'repo_id': 'stabilityai/sd-vae-ft-mse-original', + 'config': 'VAE', + 'file': 'vae-ft-mse-840000-ema-pruned.ckpt', + 'recommended': True, + 'width': 512, + 'height': 512, + }, +} +Config_preamble = '''# This file describes the alternative machine learning models +# available to InvokeAI script. +# +# To add a new model, follow the examples below. Each +# model requires a model config file, a weights file, +# and the width and height of the images it +# was trained on. +''' + +#--------------------------------------------- +def introduction(): + print( + '''Welcome to InvokeAI. This script will help download the Stable Diffusion weight files +and other large models that are needed for text to image generation.
At any point you may interrupt +this program and resume later.\n''' + ) + +#--------------------------------------------- +def yes_or_no(prompt:str, default_yes=True): + default = "y" if default_yes else 'n' + response = input(f'{prompt} [{default}] ') or default + if default_yes: + return response[0] not in ('n','N') + else: + return response[0] in ('y','Y') + +#--------------------------------------------- +def user_wants_to_download_weights(): + return yes_or_no('Would you like to download the Stable Diffusion model weights now?') + +#--------------------------------------------- +def select_datasets(): + done = False + while not done: + print(''' +Choose the weight file(s) you wish to download. Before downloading you +will be given the option to view and change your selections. +''' + ) + datasets = dict() + + counter = 1 + dflt = None # the first model selected will be the default; TODO let user change + for ds in Datasets.keys(): + recommended = '(recommended)' if Datasets[ds]['recommended'] else '' + print(f'[{counter}] {ds}:\n {Datasets[ds]["description"]} {recommended}') + if yes_or_no(' Download?',default_yes=Datasets[ds]['recommended']): + datasets[ds]=counter + counter += 1 + + print('The following weight files will be downloaded:') + for ds in datasets: + dflt = '*' if dflt is None else '' + print(f' [{datasets[ds]}] {ds}{dflt}') + print("*default") + ok_to_download = yes_or_no('Ok to download?') + if not ok_to_download: + if yes_or_no('Change your selection?'): + pass + else: + done = True + else: + done = True + return datasets if ok_to_download else None + +#-------------------------------Authenticate against Hugging Face +def authenticate(): + print(''' +To download the Stable Diffusion weight files you need to read and accept the +CreativeML Responsible AI license. If you have not already done so, please +create an account at https://huggingface.co. Then login under your account and +read and accept the license available at https://huggingface.co/CompVis/stable-diffusion-v-1-4-original. +''' + ) + input('Press <enter> when you are ready to continue:') + access_token = HfFolder.get_token() + if access_token is None: + print(''' +Thank you! Now you need to authenticate with your HuggingFace access token. +Go to https://huggingface.co/settings/tokens and create a token.
Copy it to the +clipboard and paste it here: ''' + ) + access_token = getpass.getpass() + HfFolder.save_token(access_token) + return access_token + +#--------------------------------------------- +# look for legacy model.ckpt in models directory and offer to +# normalize its name +def migrate_models_ckpt(): + if not os.path.exists(os.path.join(Model_dir,'model.ckpt')): + return + new_name = Datasets['stable-diffusion-1.4']['file'] + print('You seem to have the Stable Diffusion v1.4 "model.ckpt" already installed.') + rename = yes_or_no(f'Ok to rename it to "{new_name}" for future reference?') + if rename: + print(f'model.ckpt => {new_name}') + os.rename(os.path.join(Model_dir,'model.ckpt'),os.path.join(Model_dir,new_name)) + +#--------------------------------------------- +def download_weight_datasets(models:dict, access_token:str): + migrate_models_ckpt() + successful = dict() + for mod in models.keys(): + repo_id = Datasets[mod]['repo_id'] + filename = Datasets[mod]['file'] + success = conditional_download( + repo_id=repo_id, + model_name=filename, + access_token=access_token + ) + if success: + successful[mod] = True + keys = ', '.join(successful.keys()) + print(f'Successfully installed {keys}') + return successful + +#--------------------------------------------- +def conditional_download(repo_id:str, model_name:str, access_token:str): + model_dest = os.path.join(Model_dir, model_name) + if os.path.exists(model_dest): + print(f' * {model_name}: exists') + return True + os.makedirs(os.path.dirname(model_dest), exist_ok=True) + + try: + print(f' * {model_name}: downloading or retrieving from cache...') + path = Path(hf_hub_download(repo_id, model_name, use_auth_token=access_token)) + path.resolve(strict=True).link_to(model_dest) + except Exception as e: + print(f'** Error downloading {model_name}: {str(e)} **') + return False + return True + +#--------------------------------------------- +def update_config_file(successfully_downloaded:dict): + try: + yaml = new_config_file_contents(successfully_downloaded) + tmpfile = os.path.join(os.path.dirname(Config_file),'new_config.tmp') + with open(tmpfile, 'w') as outfile: + outfile.write(Config_preamble) + outfile.write(yaml) + os.rename(tmpfile,Config_file) + except Exception as e: + print(f'**Error creating config file {Config_file}: {str(e)} **') + return + print(f'Successfully created new configuration file {Config_file}') + + +#--------------------------------------------- +def new_config_file_contents(successfully_downloaded:dict)->str: + conf = OmegaConf.load(Config_file) + + # find the VAE file, if there is one + vae = None + default_selected = False + + for model in successfully_downloaded: + if Datasets[model]['config'] == 'VAE': + vae = Datasets[model]['file'] + + for model in successfully_downloaded: + if Datasets[model]['config'] == 'VAE': # skip VAE entries + continue + stanza = conf[model] if model in conf else { } + + stanza['description'] = Datasets[model]['description'] + stanza['weights'] = os.path.join(Model_dir,Datasets[model]['file']) + stanza['config'] = os.path.join(SD_Configs, Datasets[model]['config']) + stanza['width'] = Datasets[model]['width'] + stanza['height'] = Datasets[model]['height'] + stanza.pop('default',None) # this will be set later + if vae: + stanza['vae'] = os.path.join(Model_dir,vae) + # BUG - the first stanza is always the default. User should select.
+ if not default_selected: + stanza['default'] = True + default_selected = True + conf[model] = stanza + return OmegaConf.to_yaml(conf) + #--------------------------------------------- # this will preload the Bert tokenizer fles def download_bert(): @@ -66,7 +306,6 @@ def download_gfpgan(): print(traceback.format_exc()) print('Loading models from GFPGAN') - import urllib.request for model in ( [ 'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth', @@ -152,6 +391,16 @@ def download_safety_checker(): #------------------------------------- if __name__ == '__main__': + introduction() + if user_wants_to_download_weights(): + models = select_datasets() + while models is None: + if yes_or_no('Quit?',default_yes=False): + sys.exit(0) + models = select_datasets() + access_token = authenticate() + successfully_downloaded = download_weight_datasets(models, access_token) + update_config_file(successfully_downloaded) download_bert() download_kornia() download_clip()
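Reviewer note: new_config_file_contents() above emits plain OmegaConf-compatible YAML, so any consumer (for example ldm/invoke/model_cache.py) can locate the default model stanza in a few lines. The sketch below is illustrative only and is not part of this patch; the helper name pick_default_model is hypothetical, and it assumes the regenerated ./configs/models.yaml produced by update_config_file().

from omegaconf import OmegaConf

def pick_default_model(config_path='./configs/models.yaml'):
    # Return (name, stanza) for the entry marked `default: true`,
    # falling back to the first entry if none is marked.
    conf = OmegaConf.load(config_path)
    for name in conf:
        if conf[name].get('default', False):
            return name, conf[name]
    first = next(iter(conf))
    return first, conf[first]

name, stanza = pick_default_model()
print(f'default model: {name}')
print(f'  weights: {stanza["weights"]}')
print(f'  config:  {stanza["config"]}')
print(f'  vae:     {stanza.get("vae", "<none>")}')

The fallback branch deliberately mirrors the "first stanza is the default" behavior flagged in the BUG comment above.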