From 83a3cc9eb480a0cedf4b7ff73a8427880af0171a Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 00:30:48 -0400 Subject: [PATCH 01/15] start support for 1.5 inpainting model, not complete --- ldm/generate.py | 11 ++++- ldm/invoke/generator/base.py | 19 ++++---- ldm/invoke/model_cache.py | 2 + ldm/models/autoencoder.py | 2 +- ldm/models/diffusion/ddim.py | 14 +++++- ldm/models/diffusion/ddpm.py | 79 +++++++++++++++++++++++++++++++- ldm/models/diffusion/ksampler.py | 2 + ldm/models/diffusion/sampler.py | 30 +++++++++++- 8 files changed, 145 insertions(+), 14 deletions(-) diff --git a/ldm/generate.py b/ldm/generate.py index 8ffb7110a3..a834844e7e 100644 --- a/ldm/generate.py +++ b/ldm/generate.py @@ -404,7 +404,10 @@ class Generate: ) # TODO: Hacky selection of operation to perform. Needs to be refactored. - if (init_image is not None) and (mask_image is not None): + if self.sampler.conditioning_key() in ('hybrid','concat'): + print(f'** Inpainting model detected. Will try it! **') + generator = self._make_omnibus() + elif (init_image is not None) and (mask_image is not None): generator = self._make_inpaint() elif (embiggen != None or embiggen_tiles != None): generator = self._make_embiggen() @@ -690,6 +693,12 @@ class Generate: self.generators['inpaint'] = Inpaint(self.model, self.precision) return self.generators['inpaint'] + def _make_omnibus(self): + if not self.generators.get('omnibus'): + from ldm.invoke.generator.omnibus import Omnibus + self.generators['omnibus'] = Omnibus(self.model, self.precision) + return self.generators['omnibus'] + def load_model(self): ''' preload model identified in self.model_name diff --git a/ldm/invoke/generator/base.py b/ldm/invoke/generator/base.py index 89476cd216..c70924449b 100644 --- a/ldm/invoke/generator/base.py +++ b/ldm/invoke/generator/base.py @@ -40,12 +40,13 @@ class Generator(): self.variation_amount = variation_amount self.with_variations = with_variations - def generate(self,prompt,init_image,width,height,iterations=1,seed=None, + def generate(self,prompt,init_image,width,height,sampler, iterations=1,seed=None, image_callback=None, step_callback=None, threshold=0.0, perlin=0.0, **kwargs): scope = choose_autocast(self.precision) - make_image = self.get_make_image( + make_image = self.get_make_image( prompt, + sampler = sampler, init_image = init_image, width = width, height = height, @@ -54,13 +55,16 @@ class Generator(): perlin = perlin, **kwargs ) - results = [] seed = seed if seed is not None else self.new_seed() first_seed = seed seed, initial_noise = self.generate_initial_noise(seed, width, height) - with scope(self.model.device.type), self.model.ema_scope(): + + scope = (scope(self.model.device.type), self.model.ema_scope()) if sampler.conditioning_key() not in ('hybrid','concat') else scope(self.model.device.type) + + with scope: for n in trange(iterations, desc='Generating'): + print('DEBUG: in iterations loop() called') x_T = None if self.variation_amount > 0: seed_everything(seed) @@ -75,7 +79,6 @@ class Generator(): x_T = self.get_noise(width,height) except: pass - image = make_image(x_T) results.append([image, seed]) if image_callback is not None: @@ -83,10 +86,10 @@ class Generator(): seed = self.new_seed() return results - def sample_to_image(self,samples): + def sample_to_image(self,samples)->Image.Image: """ - Returns a function returning an image derived from the prompt and the initial image - Return value depends on the seed at the time you call it + Given samples returned from a sampler, converts + it into 
a PIL Image """ x_samples = self.model.decode_first_stage(samples) x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0) diff --git a/ldm/invoke/model_cache.py b/ldm/invoke/model_cache.py index f580dfba25..f972a9eb16 100644 --- a/ldm/invoke/model_cache.py +++ b/ldm/invoke/model_cache.py @@ -13,6 +13,7 @@ import gc import hashlib import psutil import transformers +import traceback import os from sys import getrefcount from omegaconf import OmegaConf @@ -73,6 +74,7 @@ class ModelCache(object): self.models[model_name]['hash'] = hash except Exception as e: print(f'** model {model_name} could not be loaded: {str(e)}') + print(traceback.format_exc()) print(f'** restoring {self.current_model}') self.get_model(self.current_model) return None diff --git a/ldm/models/autoencoder.py b/ldm/models/autoencoder.py index 359f5688d1..3db7b6fd73 100644 --- a/ldm/models/autoencoder.py +++ b/ldm/models/autoencoder.py @@ -66,7 +66,7 @@ class VQModel(pl.LightningModule): self.use_ema = use_ema if self.use_ema: self.model_ema = LitEma(self) - print(f'Keeping EMAs of {len(list(self.model_ema.buffers()))}.') + print(f'>> Keeping EMAs of {len(list(self.model_ema.buffers()))}.') if ckpt_path is not None: self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) diff --git a/ldm/models/diffusion/ddim.py b/ldm/models/diffusion/ddim.py index f5dada8627..bfb78c1397 100644 --- a/ldm/models/diffusion/ddim.py +++ b/ldm/models/diffusion/ddim.py @@ -41,7 +41,19 @@ class DDIMSampler(Sampler): else: x_in = torch.cat([x] * 2) t_in = torch.cat([t] * 2) - c_in = torch.cat([unconditional_conditioning, c]) + if isinstance(c, dict): + assert isinstance(unconditional_conditioning, dict) + c_in = dict() + for k in c: + if isinstance(c[k], list): + c_in[k] = [ + torch.cat([unconditional_conditioning[k][i], c[k][i]]) + for i in range(len(c[k])) + ] + else: + c_in[k] = torch.cat([unconditional_conditioning[k], c[k]]) + else: + c_in = torch.cat([unconditional_conditioning, c]) e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2) e_t = e_t_uncond + unconditional_guidance_scale * ( e_t - e_t_uncond diff --git a/ldm/models/diffusion/ddpm.py b/ldm/models/diffusion/ddpm.py index 4b62b5e393..fd3b6688f3 100644 --- a/ldm/models/diffusion/ddpm.py +++ b/ldm/models/diffusion/ddpm.py @@ -19,6 +19,7 @@ from functools import partial from tqdm import tqdm from torchvision.utils import make_grid from pytorch_lightning.utilities.distributed import rank_zero_only +from omegaconf import ListConfig import urllib from ldm.util import ( @@ -120,7 +121,7 @@ class DDPM(pl.LightningModule): self.use_ema = use_ema if self.use_ema: self.model_ema = LitEma(self.model) - print(f'Keeping EMAs of {len(list(self.model_ema.buffers()))}.') + print(f' | Keeping EMAs of {len(list(self.model_ema.buffers()))}.') self.use_scheduler = scheduler_config is not None if self.use_scheduler: @@ -1883,6 +1884,24 @@ class LatentDiffusion(DDPM): return samples, intermediates + @torch.no_grad() + def get_unconditional_conditioning(self, batch_size, null_label=None): + if null_label is not None: + xc = null_label + if isinstance(xc, ListConfig): + xc = list(xc) + if isinstance(xc, dict) or isinstance(xc, list): + c = self.get_learned_conditioning(xc) + else: + if hasattr(xc, "to"): + xc = xc.to(self.device) + c = self.get_learned_conditioning(xc) + else: + # todo: get null label from cond_stage_model + raise NotImplementedError() + c = repeat(c, "1 ... 
-> b ...", b=batch_size).to(self.device) + return c + @torch.no_grad() def log_images( self, @@ -2138,6 +2157,7 @@ class DiffusionWrapper(pl.LightningModule): ] def forward(self, x, t, c_concat: list = None, c_crossattn: list = None): + print(f'DEBUG (ddpm) c_concat = {c_concat}') if self.conditioning_key is None: out = self.diffusion_model(x, t) elif self.conditioning_key == 'concat': @@ -2147,8 +2167,8 @@ class DiffusionWrapper(pl.LightningModule): cc = torch.cat(c_crossattn, 1) out = self.diffusion_model(x, t, context=cc) elif self.conditioning_key == 'hybrid': - xc = torch.cat([x] + c_concat, dim=1) cc = torch.cat(c_crossattn, 1) + xc = torch.cat([x] + c_concat, dim=1) out = self.diffusion_model(xc, t, context=cc) elif self.conditioning_key == 'adm': cc = c_crossattn[0] @@ -2187,3 +2207,58 @@ class Layout2ImgDiffusion(LatentDiffusion): cond_img = torch.stack(bbox_imgs, dim=0) logs['bbox_image'] = cond_img return logs + +class LatentInpaintDiffusion(LatentDiffusion): + def __init__( + self, + concat_keys=("mask", "masked_image"), + masked_image_key="masked_image", + finetune_keys=None, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.masked_image_key = masked_image_key + assert self.masked_image_key in concat_keys + self.concat_keys = concat_keys + + + @torch.no_grad() + def get_input( + self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False + ): + # note: restricted to non-trainable encoders currently + assert ( + not self.cond_stage_trainable + ), "trainable cond stages not yet supported for inpainting" + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs, + ) + + assert exists(self.concat_keys) + c_cat = list() + for ck in self.concat_keys: + cc = ( + rearrange(batch[ck], "b h w c -> b c h w") + .to(memory_format=torch.contiguous_format) + .float() + ) + if bs is not None: + cc = cc[:bs] + cc = cc.to(self.device) + bchw = z.shape + if ck != self.masked_image_key: + cc = torch.nn.functional.interpolate(cc, size=bchw[-2:]) + else: + cc = self.get_first_stage_encoding(self.encode_first_stage(cc)) + c_cat.append(cc) + c_cat = torch.cat(c_cat, dim=1) + all_conds = {"c_concat": [c_cat], "c_crossattn": [c]} + if return_first_stage_outputs: + return z, all_conds, x, xrec, xc + return z, all_conds diff --git a/ldm/models/diffusion/ksampler.py b/ldm/models/diffusion/ksampler.py index ac0615b30c..68c26b5d6c 100644 --- a/ldm/models/diffusion/ksampler.py +++ b/ldm/models/diffusion/ksampler.py @@ -281,3 +281,5 @@ class KSampler(Sampler): ''' return self.model.inner_model.q_sample(x0,ts) + def conditioning_key(self)->str: + return self.model.inner_model.model.conditioning_key diff --git a/ldm/models/diffusion/sampler.py b/ldm/models/diffusion/sampler.py index ff705513f8..9e57bc25d4 100644 --- a/ldm/models/diffusion/sampler.py +++ b/ldm/models/diffusion/sampler.py @@ -158,6 +158,18 @@ class Sampler(object): **kwargs, ): + if conditioning is not None: + if isinstance(conditioning, dict): + ctmp = conditioning[list(conditioning.keys())[0]] + while isinstance(ctmp, list): + ctmp = ctmp[0] + cbs = ctmp.shape[0] + if cbs != batch_size: + print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") + else: + if conditioning.shape[0] != batch_size: + print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") + # check to see if make_schedule() has run, and if not, run it if self.ddim_timesteps is None: 
self.make_schedule( @@ -193,7 +205,7 @@ class Sampler(object): ) return samples, intermediates - #torch.no_grad() + @torch.no_grad() def do_sampling( self, cond, @@ -307,6 +319,19 @@ class Sampler(object): mask = None, ): + print(f'DEBUG(sampler): cond = {cond}') + if cond is not None: + if isinstance(cond, dict): + ctmp = cond[list(cond.keys())[0]] + while isinstance(ctmp, list): + ctmp = ctmp[0] + cbs = ctmp.shape[0] + if cbs != batch_size: + print(f"Warning: Got {cbs} conds but batch-size is {batch_size}") + else: + if cond.shape[0] != batch_size: + print(f"Warning: Got {cond.shape[0]} conditionings but batch-size is {batch_size}") + timesteps = ( np.arange(self.ddpm_num_timesteps) if use_original_steps @@ -411,3 +436,6 @@ class Sampler(object): return self.model.inner_model.q_sample(x0,ts) ''' return self.model.q_sample(x0,ts) + + def conditioning_key(self)->str: + return self.model.model.conditioning_key From be8a992b85f9c10eac380161849b9e4e69978908 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 00:38:24 -0400 Subject: [PATCH 02/15] add missing file --- ldm/invoke/generator/omnibus.py | 144 ++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 ldm/invoke/generator/omnibus.py diff --git a/ldm/invoke/generator/omnibus.py b/ldm/invoke/generator/omnibus.py new file mode 100644 index 0000000000..43192cd152 --- /dev/null +++ b/ldm/invoke/generator/omnibus.py @@ -0,0 +1,144 @@ +"""omnibus module to be used with the runwayml 9-channel custom inpainting model""" + +import torch +import numpy as np +from PIL import Image +from ldm.invoke.generator.base import downsampling +from ldm.invoke.generator.img2img import Img2Img +from ldm.invoke.generator.txt2img import Txt2Img + +class Omnibus(Img2Img,Txt2Img): + def __init__(self, model, precision): + super().__init__(model, precision) + + def get_make_image( + self, + prompt, + sampler, + steps, + cfg_scale, + ddim_eta, + conditioning, + width, + height, + init_image = None, + mask_image = None, + strength = None, + step_callback=None, + threshold=0.0, + perlin=0.0, + **kwargs): + """ + Returns a function returning an image derived from the prompt and the initial image + Return value depends on the seed at the time you call it. 
+ """ + self.perlin = perlin + + sampler.make_schedule( + ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False + ) + + if isinstance(init_image, Image.Image): + init_image = self._image_to_tensor(init_image) + + if isinstance(mask_image, Image.Image): + mask_image = self._image_to_tensor(mask_image,normalize=False) + + t_enc = steps + + if init_image is not None and mask_image is not None: # inpainting + masked_image = init_image * (1 - mask_image) # masked image is the image masked by mask - masked regions zero + + elif init_image is not None: # img2img + scope = choose_autocast(self.precision) + with scope(self.model.device.type): + self.init_latent = self.model.get_first_stage_encoding( + self.model.encode_first_stage(init_image) + ) # move to latent space + # create a completely black mask (1s) + mask_image = torch.ones(init_image.shape[0], 3, init_image.width, init_image.height, device=self.model.device) + # and the masked image is just a copy of the original + masked_image = init_image + t_enc = int(strength * steps) + + else: # txt2img + mask_image = torch.zeros(init_image.shape[0], 3, init_image.width, init_image.height, device=self.model.device) + masked_image = mask_image + + model = self.model + + def make_image(x_T): + with torch.no_grad(): + with torch.autocast("cuda"): + + batch = self.make_batch_sd( + init_image, + mask_image, + masked_image, + prompt=prompt, + device=model.device, + num_samples=1 + ) + + c = model.cond_stage_model.encode(batch["txt"]) + + c_cat = list() + for ck in model.concat_keys: + cc = batch[ck].float() + if ck != model.masked_image_key: + bchw = [num_samples, 4, h//8, w//8] + cc = torch.nn.functional.interpolate(cc, size=bchw[-2:]) + else: + cc = model.get_first_stage_encoding(model.encode_first_stage(cc)) + c_cat.append(cc) + c_cat = torch.cat(c_cat, dim=1) + + # cond + cond={"c_concat": [c_cat], "c_crossattn": [c]} + + # uncond cond + uc_cross = model.get_unconditional_conditioning(num_samples, "") + uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]} + shape = [model.channels, h//8, w//8] + + samples, = sampler.sample( + batch_size = 1, + S = t_enc, + x_T = x_T, + conditioning = cond, + shape = shape, + verbose = False, + unconditional_guidance_scale = cfg_scale, + unconditional_conditioning = uc_full, + eta = 1.0, + img_callback = step_callback, + threshold = threshold, + ) + if self.free_gpu_mem: + self.model.model.to("cpu") + return self.sample_to_image(samples) + + return make_image + + def make_batch_sd( + image, + mask, + masked_image, + prompt, + device, + num_samples=1): + batch = { + "image": repeat(image.to(device=device), "1 ... -> n ...", n=num_samples), + "txt": num_samples * [prompt], + "mask": repeat(mask.to(device=device), "1 ... -> n ...", n=num_samples), + "masked_image": repeat(masked_image.to(device=device), "1 ... 
-> n ...", n=num_samples), + } + return batch + + def get_noise(self, width:int, height:int): + if self.init_latent: + print('DEBUG: returning Img2Img.getnoise()') + return super(Img2Img,self).get_noise(width,height) + else: + print('DEBUG: returning Txt2Img.getnoise()') + return super(Txt2Img,self).get_noise(width,height) From a2e53892ec97fa4057e2a2a022ffd92c372bad17 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 00:47:13 -0400 Subject: [PATCH 03/15] fixed synax errors; now channel mismatch issue --- ldm/invoke/generator/omnibus.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ldm/invoke/generator/omnibus.py b/ldm/invoke/generator/omnibus.py index 43192cd152..b6ddbfdb03 100644 --- a/ldm/invoke/generator/omnibus.py +++ b/ldm/invoke/generator/omnibus.py @@ -2,6 +2,7 @@ import torch import numpy as np +from einops import repeat from PIL import Image from ldm.invoke.generator.base import downsampling from ldm.invoke.generator.img2img import Img2Img @@ -33,6 +34,7 @@ class Omnibus(Img2Img,Txt2Img): Return value depends on the seed at the time you call it. """ self.perlin = perlin + num_samples = 1 sampler.make_schedule( ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False @@ -77,7 +79,7 @@ class Omnibus(Img2Img,Txt2Img): masked_image, prompt=prompt, device=model.device, - num_samples=1 + num_samples=num_samples, ) c = model.cond_stage_model.encode(batch["txt"]) @@ -86,7 +88,7 @@ class Omnibus(Img2Img,Txt2Img): for ck in model.concat_keys: cc = batch[ck].float() if ck != model.masked_image_key: - bchw = [num_samples, 4, h//8, w//8] + bchw = [num_samples, 4, height//8, width//8] cc = torch.nn.functional.interpolate(cc, size=bchw[-2:]) else: cc = model.get_first_stage_encoding(model.encode_first_stage(cc)) @@ -99,7 +101,7 @@ class Omnibus(Img2Img,Txt2Img): # uncond cond uc_cross = model.get_unconditional_conditioning(num_samples, "") uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]} - shape = [model.channels, h//8, w//8] + shape = [model.channels, height//8, width//8] samples, = sampler.sample( batch_size = 1, @@ -121,6 +123,7 @@ class Omnibus(Img2Img,Txt2Img): return make_image def make_batch_sd( + self, image, mask, masked_image, From 175c7bddfc531fe59ded96ef881186ff06331c48 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 07:12:31 -0400 Subject: [PATCH 04/15] add missing inpainting yaml file --- .../v1-inpainting-inference.yaml | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 configs/stable-diffusion/v1-inpainting-inference.yaml diff --git a/configs/stable-diffusion/v1-inpainting-inference.yaml b/configs/stable-diffusion/v1-inpainting-inference.yaml new file mode 100644 index 0000000000..5652e04374 --- /dev/null +++ b/configs/stable-diffusion/v1-inpainting-inference.yaml @@ -0,0 +1,79 @@ +model: + base_learning_rate: 7.5e-05 + target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 64 + channels: 4 + cond_stage_trainable: false # Note: different from the one we trained before + conditioning_key: hybrid # important + monitor: val/loss_simple_ema + scale_factor: 0.18215 + finetune_keys: null + + scheduler_config: # 10000 warmup steps + target: ldm.lr_scheduler.LambdaLinearScheduler + params: + warm_up_steps: [ 2500 ] # NOTE for resuming. 
use 10000 if starting from scratch + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1. ] + f_min: [ 1. ] + + personalization_config: + target: ldm.modules.embedding_manager.EmbeddingManager + params: + placeholder_strings: ["*"] + initializer_words: ['face', 'man', 'photo', 'africanmale'] + per_image_tokens: false + num_vectors_per_token: 1 + progressive_words: False + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + image_size: 32 # unused + in_channels: 9 # 4 data + 4 downscaled image + 1 mask + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder From aaf7a4f1d3573f832a4231f4e2f7976f09ec3a89 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 10:00:28 -0400 Subject: [PATCH 05/15] inpaint and txt2img working with ddim sampler --- ldm/generate.py | 5 +++++ ldm/invoke/generator/img2img.py | 5 ++++- ldm/invoke/generator/omnibus.py | 18 ++++++++++++------ ldm/models/diffusion/ddpm.py | 1 - 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/ldm/generate.py b/ldm/generate.py index a834844e7e..e1d35e6607 100644 --- a/ldm/generate.py +++ b/ldm/generate.py @@ -655,6 +655,7 @@ class Generate: return init_image,init_mask + # lots o' repeated code here! 
Turn into a make_func() def _make_base(self): if not self.generators.get('base'): from ldm.invoke.generator import Generator @@ -665,6 +666,7 @@ class Generate: if not self.generators.get('img2img'): from ldm.invoke.generator.img2img import Img2Img self.generators['img2img'] = Img2Img(self.model, self.precision) + self.generators['img2img'].free_gpu_mem = self.free_gpu_mem return self.generators['img2img'] def _make_embiggen(self): @@ -693,10 +695,13 @@ class Generate: self.generators['inpaint'] = Inpaint(self.model, self.precision) return self.generators['inpaint'] + # "omnibus" supports the runwayML custom inpainting model, which does + # txt2img, img2img and inpainting using slight variations on the same code def _make_omnibus(self): if not self.generators.get('omnibus'): from ldm.invoke.generator.omnibus import Omnibus self.generators['omnibus'] = Omnibus(self.model, self.precision) + self.generators['omnibus'].free_gpu_mem = self.free_gpu_mem return self.generators['omnibus'] def load_model(self): diff --git a/ldm/invoke/generator/img2img.py b/ldm/invoke/generator/img2img.py index 613f1aca31..31c3ca256e 100644 --- a/ldm/invoke/generator/img2img.py +++ b/ldm/invoke/generator/img2img.py @@ -77,7 +77,10 @@ class Img2Img(Generator): def _image_to_tensor(self, image:Image, normalize:bool=True)->Tensor: image = np.array(image).astype(np.float32) / 255.0 - image = image[None].transpose(0, 3, 1, 2) + if len(image.shape) == 2: # 'L' image, as in a mask + image = image[None,None] + else: # 'RGB' image + image = image[None].transpose(0, 3, 1, 2) image = torch.from_numpy(image) if normalize: image = 2.0 * image - 1.0 diff --git a/ldm/invoke/generator/omnibus.py b/ldm/invoke/generator/omnibus.py index b6ddbfdb03..fd3d3d47f2 100644 --- a/ldm/invoke/generator/omnibus.py +++ b/ldm/invoke/generator/omnibus.py @@ -3,7 +3,7 @@ import torch import numpy as np from einops import repeat -from PIL import Image +from PIL import Image, ImageOps from ldm.invoke.generator.base import downsampling from ldm.invoke.generator.img2img import Img2Img from ldm.invoke.generator.txt2img import Txt2Img @@ -44,7 +44,7 @@ class Omnibus(Img2Img,Txt2Img): init_image = self._image_to_tensor(init_image) if isinstance(mask_image, Image.Image): - mask_image = self._image_to_tensor(mask_image,normalize=False) + mask_image = self._image_to_tensor(ImageOps.invert(mask_image).convert('L'),normalize=False) t_enc = steps @@ -53,10 +53,12 @@ class Omnibus(Img2Img,Txt2Img): elif init_image is not None: # img2img scope = choose_autocast(self.precision) + with scope(self.model.device.type): self.init_latent = self.model.get_first_stage_encoding( self.model.encode_first_stage(init_image) ) # move to latent space + # create a completely black mask (1s) mask_image = torch.ones(init_image.shape[0], 3, init_image.width, init_image.height, device=self.model.device) # and the masked image is just a copy of the original @@ -64,8 +66,9 @@ class Omnibus(Img2Img,Txt2Img): t_enc = int(strength * steps) else: # txt2img - mask_image = torch.zeros(init_image.shape[0], 3, init_image.width, init_image.height, device=self.model.device) - masked_image = mask_image + init_image = torch.zeros(1, 3, width, height, device=self.model.device) + mask_image = torch.ones(1, 1, width, height, device=self.model.device) + masked_image = init_image model = self.model @@ -102,8 +105,8 @@ class Omnibus(Img2Img,Txt2Img): uc_cross = model.get_unconditional_conditioning(num_samples, "") uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]} shape = [model.channels, 
height//8, width//8] - - samples, = sampler.sample( + + samples, _ = sampler.sample( batch_size = 1, S = t_enc, x_T = x_T, @@ -136,6 +139,9 @@ class Omnibus(Img2Img,Txt2Img): "mask": repeat(mask.to(device=device), "1 ... -> n ...", n=num_samples), "masked_image": repeat(masked_image.to(device=device), "1 ... -> n ...", n=num_samples), } + print(f'DEBUG: image = {batch["image"]} shape={batch["image"].shape}') + print(f'DEBUG: mask = {batch["mask"]} shape={batch["mask"].shape}') + print(f'DEBUG: masked_image = {batch["masked_image"]} shape={batch["masked_image"].shape}') return batch def get_noise(self, width:int, height:int): diff --git a/ldm/models/diffusion/ddpm.py b/ldm/models/diffusion/ddpm.py index fd3b6688f3..26492baf7c 100644 --- a/ldm/models/diffusion/ddpm.py +++ b/ldm/models/diffusion/ddpm.py @@ -2157,7 +2157,6 @@ class DiffusionWrapper(pl.LightningModule): ] def forward(self, x, t, c_concat: list = None, c_crossattn: list = None): - print(f'DEBUG (ddpm) c_concat = {c_concat}') if self.conditioning_key is None: out = self.diffusion_model(x, t) elif self.conditioning_key == 'concat': From b101be041bb1de73f637711b40936aff8c82aceb Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 10:45:12 -0400 Subject: [PATCH 06/15] add support for runwayML custom inpainting model This is still a work in progress but seems functional. It supports inpainting, txt2img and img2img on the ddim and k* samplers (plms still needs work, but I know what to do). To test this, get the file `sd-v1-5-inpainting.ckpt' from https://huggingface.co/runwayml/stable-diffusion-inpainting and place it at `models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt` Launch invoke.py with --model inpainting-1.5 and proceed as usual. Caveats: 1. The inpainting model takes about 800 Mb more memory than the standard 1.5 model. This model will not work on 4 GB cards. 2. The inpainting model is temperamental. It wants you to describe the entire scene and not just the masked area to replace. So if you want to replace the parrot on a man's shoulder with a crow, the prompt "crow" may fail. Try "man with a crow on shoulder" instead. The symptom of a failed inpainting is that the area will be erased and replaced with background. 3. This has not been tested well. Please report bugs. 
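For example, assuming the checkpoint has been downloaded from the
HuggingFace page above and that you start the CLI via scripts/invoke.py
(the copy step below is only illustrative), the workflow is roughly:

    # put the weights where configs/models.yaml expects them
    cp sd-v1-5-inpainting.ckpt models/ldm/stable-diffusion-v1/

    # select the inpainting model at startup
    python scripts/invoke.py --model inpainting-1.5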
--- configs/models.yaml | 7 +++++++ ldm/invoke/generator/omnibus.py | 11 ++++------- ldm/models/diffusion/ksampler.py | 22 +++++++++++++++++++--- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/configs/models.yaml b/configs/models.yaml index 46bc2df8b4..9522f025b1 100644 --- a/configs/models.yaml +++ b/configs/models.yaml @@ -13,6 +13,13 @@ stable-diffusion-1.4: default: true width: 512 height: 512 +inpainting-1.5: + description: runwayML tuned inpainting model v1.5 + weights: models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt + config: configs/stable-diffusion/v1-inpainting-inference.yaml +# vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt + width: 512 + height: 512 stable-diffusion-1.5: config: configs/stable-diffusion/v1-inference.yaml weights: models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt diff --git a/ldm/invoke/generator/omnibus.py b/ldm/invoke/generator/omnibus.py index fd3d3d47f2..99fe046654 100644 --- a/ldm/invoke/generator/omnibus.py +++ b/ldm/invoke/generator/omnibus.py @@ -4,6 +4,7 @@ import torch import numpy as np from einops import repeat from PIL import Image, ImageOps +from ldm.invoke.devices import choose_autocast from ldm.invoke.generator.base import downsampling from ldm.invoke.generator.img2img import Img2Img from ldm.invoke.generator.txt2img import Txt2Img @@ -60,7 +61,7 @@ class Omnibus(Img2Img,Txt2Img): ) # move to latent space # create a completely black mask (1s) - mask_image = torch.ones(init_image.shape[0], 3, init_image.width, init_image.height, device=self.model.device) + mask_image = torch.ones(1, 1, init_image.shape[2], init_image.shape[3], device=self.model.device) # and the masked image is just a copy of the original masked_image = init_image t_enc = int(strength * steps) @@ -74,7 +75,8 @@ class Omnibus(Img2Img,Txt2Img): def make_image(x_T): with torch.no_grad(): - with torch.autocast("cuda"): + scope = choose_autocast(self.precision) + with scope(self.model.device.type): batch = self.make_batch_sd( init_image, @@ -139,15 +141,10 @@ class Omnibus(Img2Img,Txt2Img): "mask": repeat(mask.to(device=device), "1 ... -> n ...", n=num_samples), "masked_image": repeat(masked_image.to(device=device), "1 ... 
-> n ...", n=num_samples), } - print(f'DEBUG: image = {batch["image"]} shape={batch["image"].shape}') - print(f'DEBUG: mask = {batch["mask"]} shape={batch["mask"].shape}') - print(f'DEBUG: masked_image = {batch["masked_image"]} shape={batch["masked_image"].shape}') return batch def get_noise(self, width:int, height:int): if self.init_latent: - print('DEBUG: returning Img2Img.getnoise()') return super(Img2Img,self).get_noise(width,height) else: - print('DEBUG: returning Txt2Img.getnoise()') return super(Txt2Img,self).get_noise(width,height) diff --git a/ldm/models/diffusion/ksampler.py b/ldm/models/diffusion/ksampler.py index 68c26b5d6c..59a3bebe4d 100644 --- a/ldm/models/diffusion/ksampler.py +++ b/ldm/models/diffusion/ksampler.py @@ -12,6 +12,22 @@ from ldm.modules.diffusionmodules.util import ( extract_into_tensor, ) +def make_cond_in(uncond, cond): + if isinstance(cond, dict): + assert isinstance(uncond, dict) + cond_in = dict() + for k in cond: + if isinstance(cond[k], list): + cond_in[k] = [ + torch.cat([uncond[k][i], cond[k][i]]) + for i in range(len(cond[k])) + ] + else: + cond_in[k] = torch.cat([uncond[k], cond[k]]) + else: + cond_in = torch.cat([uncond, cond]) + return cond_in + def cfg_apply_threshold(result, threshold = 0.0, scale = 0.7): if threshold <= 0.0: return result @@ -37,7 +53,7 @@ class CFGDenoiser(nn.Module): def forward(self, x, sigma, uncond, cond, cond_scale): x_in = torch.cat([x] * 2) sigma_in = torch.cat([sigma] * 2) - cond_in = torch.cat([uncond, cond]) + cond_in = make_cond_in(uncond,cond) uncond, cond = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2) if self.warmup < self.warmup_max: thresh = max(1, 1 + (self.threshold - 1) * (self.warmup / self.warmup_max)) @@ -64,13 +80,12 @@ class KSampler(Sampler): def forward(self, x, sigma, uncond, cond, cond_scale): x_in = torch.cat([x] * 2) sigma_in = torch.cat([sigma] * 2) - cond_in = torch.cat([uncond, cond]) + cond_in = make_cond_in(uncond, cond) uncond, cond = self.inner_model( x_in, sigma_in, cond=cond_in ).chunk(2) return uncond + (cond - uncond) * cond_scale - def make_schedule( self, ddim_num_steps, @@ -283,3 +298,4 @@ class KSampler(Sampler): def conditioning_key(self)->str: return self.model.inner_model.model.conditioning_key + From 83e1c39ab85de131175edef209ff416ee1fd8814 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 11:42:30 -0400 Subject: [PATCH 07/15] plms works, bugs quashed - The plms sampler now works with custom inpainting model - Quashed bug that was causing generation on normal models to fail (oops!) - Can now generate non-square images with custom inpainting model --- ldm/invoke/generator/base.py | 7 +++---- ldm/invoke/generator/omnibus.py | 4 ++-- ldm/models/diffusion/ksampler.py | 27 ++++++--------------------- ldm/models/diffusion/plms.py | 2 +- ldm/models/diffusion/sampler.py | 21 +++++++++++++++++++++ 5 files changed, 33 insertions(+), 28 deletions(-) diff --git a/ldm/invoke/generator/base.py b/ldm/invoke/generator/base.py index c70924449b..03f066323c 100644 --- a/ldm/invoke/generator/base.py +++ b/ldm/invoke/generator/base.py @@ -60,11 +60,10 @@ class Generator(): first_seed = seed seed, initial_noise = self.generate_initial_noise(seed, width, height) - scope = (scope(self.model.device.type), self.model.ema_scope()) if sampler.conditioning_key() not in ('hybrid','concat') else scope(self.model.device.type) - - with scope: + # There used to be an additional self.model.ema_scope() here, but it breaks + # the inpaint-1.5 model. Not sure what it did.... ? 
+ with scope(self.model.device.type): for n in trange(iterations, desc='Generating'): - print('DEBUG: in iterations loop() called') x_T = None if self.variation_amount > 0: seed_everything(seed) diff --git a/ldm/invoke/generator/omnibus.py b/ldm/invoke/generator/omnibus.py index 99fe046654..c8de01addb 100644 --- a/ldm/invoke/generator/omnibus.py +++ b/ldm/invoke/generator/omnibus.py @@ -67,8 +67,8 @@ class Omnibus(Img2Img,Txt2Img): t_enc = int(strength * steps) else: # txt2img - init_image = torch.zeros(1, 3, width, height, device=self.model.device) - mask_image = torch.ones(1, 1, width, height, device=self.model.device) + init_image = torch.zeros(1, 3, height, width, device=self.model.device) + mask_image = torch.ones(1, 1, height, width, device=self.model.device) masked_image = init_image model = self.model diff --git a/ldm/models/diffusion/ksampler.py b/ldm/models/diffusion/ksampler.py index 59a3bebe4d..5f223cdf46 100644 --- a/ldm/models/diffusion/ksampler.py +++ b/ldm/models/diffusion/ksampler.py @@ -12,22 +12,6 @@ from ldm.modules.diffusionmodules.util import ( extract_into_tensor, ) -def make_cond_in(uncond, cond): - if isinstance(cond, dict): - assert isinstance(uncond, dict) - cond_in = dict() - for k in cond: - if isinstance(cond[k], list): - cond_in[k] = [ - torch.cat([uncond[k][i], cond[k][i]]) - for i in range(len(cond[k])) - ] - else: - cond_in[k] = torch.cat([uncond[k], cond[k]]) - else: - cond_in = torch.cat([uncond, cond]) - return cond_in - def cfg_apply_threshold(result, threshold = 0.0, scale = 0.7): if threshold <= 0.0: return result @@ -43,9 +27,10 @@ def cfg_apply_threshold(result, threshold = 0.0, scale = 0.7): class CFGDenoiser(nn.Module): - def __init__(self, model, threshold = 0, warmup = 0): + def __init__(self, sampler, threshold = 0, warmup = 0): super().__init__() - self.inner_model = model + self.inner_model = sampler.model + self.sampler = sampler self.threshold = threshold self.warmup_max = warmup self.warmup = max(warmup / 10, 1) @@ -53,7 +38,7 @@ class CFGDenoiser(nn.Module): def forward(self, x, sigma, uncond, cond, cond_scale): x_in = torch.cat([x] * 2) sigma_in = torch.cat([sigma] * 2) - cond_in = make_cond_in(uncond,cond) + cond_in = self.sampler.make_cond_in(uncond,cond) uncond, cond = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2) if self.warmup < self.warmup_max: thresh = max(1, 1 + (self.threshold - 1) * (self.warmup / self.warmup_max)) @@ -80,7 +65,7 @@ class KSampler(Sampler): def forward(self, x, sigma, uncond, cond, cond_scale): x_in = torch.cat([x] * 2) sigma_in = torch.cat([sigma] * 2) - cond_in = make_cond_in(uncond, cond) + cond_in = self.make_cond_in(uncond, cond) uncond, cond = self.inner_model( x_in, sigma_in, cond=cond_in ).chunk(2) @@ -209,7 +194,7 @@ class KSampler(Sampler): else: x = torch.randn([batch_size, *shape], device=self.device) * sigmas[0] - model_wrap_cfg = CFGDenoiser(self.model, threshold=threshold, warmup=max(0.8*S,S-10)) + model_wrap_cfg = CFGDenoiser(self, threshold=threshold, warmup=max(0.8*S,S-10)) extra_args = { 'cond': conditioning, 'uncond': unconditional_conditioning, diff --git a/ldm/models/diffusion/plms.py b/ldm/models/diffusion/plms.py index 9e722eb932..4261f549d2 100644 --- a/ldm/models/diffusion/plms.py +++ b/ldm/models/diffusion/plms.py @@ -45,7 +45,7 @@ class PLMSSampler(Sampler): else: x_in = torch.cat([x] * 2) t_in = torch.cat([t] * 2) - c_in = torch.cat([unconditional_conditioning, c]) + c_in = self.make_cond_in(unconditional_conditioning, c) e_t_uncond, e_t = self.model.apply_model( x_in, 
t_in, c_in ).chunk(2) diff --git a/ldm/models/diffusion/sampler.py b/ldm/models/diffusion/sampler.py index 9e57bc25d4..fd7ba106c1 100644 --- a/ldm/models/diffusion/sampler.py +++ b/ldm/models/diffusion/sampler.py @@ -439,3 +439,24 @@ class Sampler(object): def conditioning_key(self)->str: return self.model.model.conditioning_key + + def make_cond_in(self, uncond, cond): + ''' + This handles the choice between a conditional conditioning + that is a tensor (used by cross attention) vs one that is a dict + used by 'hybrid' + ''' + if isinstance(cond, dict): + assert isinstance(uncond, dict) + cond_in = dict() + for k in cond: + if isinstance(cond[k], list): + cond_in[k] = [ + torch.cat([uncond[k][i], cond[k][i]]) + for i in range(len(cond[k])) + ] + else: + cond_in[k] = torch.cat([uncond[k], cond[k]]) + else: + cond_in = torch.cat([uncond, cond]) + return cond_in From e33971fe2cb046d08548366bc775a4091b0739ea Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 11:42:30 -0400 Subject: [PATCH 08/15] plms works, bugs quashed - The plms sampler now works with custom inpainting model - Quashed bug that was causing generation on normal models to fail (oops!) - Can now generate non-square images with custom inpainting model Credits for advice and assistance during porting: @any-winter-4079 (http://github.com/any-winter-4079) @db3000 (Danny Beer http://github.com/db3000) --- ldm/invoke/generator/base.py | 7 +++---- ldm/invoke/generator/omnibus.py | 4 ++-- ldm/models/diffusion/ksampler.py | 27 ++++++--------------------- ldm/models/diffusion/plms.py | 2 +- ldm/models/diffusion/sampler.py | 21 +++++++++++++++++++++ 5 files changed, 33 insertions(+), 28 deletions(-) diff --git a/ldm/invoke/generator/base.py b/ldm/invoke/generator/base.py index c70924449b..03f066323c 100644 --- a/ldm/invoke/generator/base.py +++ b/ldm/invoke/generator/base.py @@ -60,11 +60,10 @@ class Generator(): first_seed = seed seed, initial_noise = self.generate_initial_noise(seed, width, height) - scope = (scope(self.model.device.type), self.model.ema_scope()) if sampler.conditioning_key() not in ('hybrid','concat') else scope(self.model.device.type) - - with scope: + # There used to be an additional self.model.ema_scope() here, but it breaks + # the inpaint-1.5 model. Not sure what it did.... ? 
+ with scope(self.model.device.type): for n in trange(iterations, desc='Generating'): - print('DEBUG: in iterations loop() called') x_T = None if self.variation_amount > 0: seed_everything(seed) diff --git a/ldm/invoke/generator/omnibus.py b/ldm/invoke/generator/omnibus.py index 99fe046654..c8de01addb 100644 --- a/ldm/invoke/generator/omnibus.py +++ b/ldm/invoke/generator/omnibus.py @@ -67,8 +67,8 @@ class Omnibus(Img2Img,Txt2Img): t_enc = int(strength * steps) else: # txt2img - init_image = torch.zeros(1, 3, width, height, device=self.model.device) - mask_image = torch.ones(1, 1, width, height, device=self.model.device) + init_image = torch.zeros(1, 3, height, width, device=self.model.device) + mask_image = torch.ones(1, 1, height, width, device=self.model.device) masked_image = init_image model = self.model diff --git a/ldm/models/diffusion/ksampler.py b/ldm/models/diffusion/ksampler.py index 59a3bebe4d..5f223cdf46 100644 --- a/ldm/models/diffusion/ksampler.py +++ b/ldm/models/diffusion/ksampler.py @@ -12,22 +12,6 @@ from ldm.modules.diffusionmodules.util import ( extract_into_tensor, ) -def make_cond_in(uncond, cond): - if isinstance(cond, dict): - assert isinstance(uncond, dict) - cond_in = dict() - for k in cond: - if isinstance(cond[k], list): - cond_in[k] = [ - torch.cat([uncond[k][i], cond[k][i]]) - for i in range(len(cond[k])) - ] - else: - cond_in[k] = torch.cat([uncond[k], cond[k]]) - else: - cond_in = torch.cat([uncond, cond]) - return cond_in - def cfg_apply_threshold(result, threshold = 0.0, scale = 0.7): if threshold <= 0.0: return result @@ -43,9 +27,10 @@ def cfg_apply_threshold(result, threshold = 0.0, scale = 0.7): class CFGDenoiser(nn.Module): - def __init__(self, model, threshold = 0, warmup = 0): + def __init__(self, sampler, threshold = 0, warmup = 0): super().__init__() - self.inner_model = model + self.inner_model = sampler.model + self.sampler = sampler self.threshold = threshold self.warmup_max = warmup self.warmup = max(warmup / 10, 1) @@ -53,7 +38,7 @@ class CFGDenoiser(nn.Module): def forward(self, x, sigma, uncond, cond, cond_scale): x_in = torch.cat([x] * 2) sigma_in = torch.cat([sigma] * 2) - cond_in = make_cond_in(uncond,cond) + cond_in = self.sampler.make_cond_in(uncond,cond) uncond, cond = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2) if self.warmup < self.warmup_max: thresh = max(1, 1 + (self.threshold - 1) * (self.warmup / self.warmup_max)) @@ -80,7 +65,7 @@ class KSampler(Sampler): def forward(self, x, sigma, uncond, cond, cond_scale): x_in = torch.cat([x] * 2) sigma_in = torch.cat([sigma] * 2) - cond_in = make_cond_in(uncond, cond) + cond_in = self.make_cond_in(uncond, cond) uncond, cond = self.inner_model( x_in, sigma_in, cond=cond_in ).chunk(2) @@ -209,7 +194,7 @@ class KSampler(Sampler): else: x = torch.randn([batch_size, *shape], device=self.device) * sigmas[0] - model_wrap_cfg = CFGDenoiser(self.model, threshold=threshold, warmup=max(0.8*S,S-10)) + model_wrap_cfg = CFGDenoiser(self, threshold=threshold, warmup=max(0.8*S,S-10)) extra_args = { 'cond': conditioning, 'uncond': unconditional_conditioning, diff --git a/ldm/models/diffusion/plms.py b/ldm/models/diffusion/plms.py index 9e722eb932..4261f549d2 100644 --- a/ldm/models/diffusion/plms.py +++ b/ldm/models/diffusion/plms.py @@ -45,7 +45,7 @@ class PLMSSampler(Sampler): else: x_in = torch.cat([x] * 2) t_in = torch.cat([t] * 2) - c_in = torch.cat([unconditional_conditioning, c]) + c_in = self.make_cond_in(unconditional_conditioning, c) e_t_uncond, e_t = self.model.apply_model( x_in, 
t_in, c_in ).chunk(2) diff --git a/ldm/models/diffusion/sampler.py b/ldm/models/diffusion/sampler.py index 9e57bc25d4..fd7ba106c1 100644 --- a/ldm/models/diffusion/sampler.py +++ b/ldm/models/diffusion/sampler.py @@ -439,3 +439,24 @@ class Sampler(object): def conditioning_key(self)->str: return self.model.model.conditioning_key + + def make_cond_in(self, uncond, cond): + ''' + This handles the choice between a conditional conditioning + that is a tensor (used by cross attention) vs one that is a dict + used by 'hybrid' + ''' + if isinstance(cond, dict): + assert isinstance(uncond, dict) + cond_in = dict() + for k in cond: + if isinstance(cond[k], list): + cond_in[k] = [ + torch.cat([uncond[k][i], cond[k][i]]) + for i in range(len(cond[k])) + ] + else: + cond_in[k] = torch.cat([uncond[k], cond[k]]) + else: + cond_in = torch.cat([uncond, cond]) + return cond_in From 4352eb6628bcc0369a739c5b1055cc2c7e407da6 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 13:17:06 -0400 Subject: [PATCH 09/15] stop crashes on non-square images --- ldm/invoke/generator/omnibus.py | 3 ++- ldm/invoke/restoration/outcrop.py | 7 ++++++- ldm/models/diffusion/sampler.py | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ldm/invoke/generator/omnibus.py b/ldm/invoke/generator/omnibus.py index c8de01addb..2fccd7c6f7 100644 --- a/ldm/invoke/generator/omnibus.py +++ b/ldm/invoke/generator/omnibus.py @@ -71,6 +71,8 @@ class Omnibus(Img2Img,Txt2Img): mask_image = torch.ones(1, 1, height, width, device=self.model.device) masked_image = init_image + height = init_image.shape[2] + width = init_image.shape[3] model = self.model def make_image(x_T): @@ -88,7 +90,6 @@ class Omnibus(Img2Img,Txt2Img): ) c = model.cond_stage_model.encode(batch["txt"]) - c_cat = list() for ck in model.concat_keys: cc = batch[ck].float() diff --git a/ldm/invoke/restoration/outcrop.py b/ldm/invoke/restoration/outcrop.py index 017d9de7e1..0c4831dd84 100644 --- a/ldm/invoke/restoration/outcrop.py +++ b/ldm/invoke/restoration/outcrop.py @@ -89,6 +89,9 @@ class Outcrop(object): def _extend(self,image:Image,pixels:int)-> Image: extended_img = Image.new('RGBA',(image.width,image.height+pixels)) + mask_height = pixels if self.generate.model.model.conditioning_key in ('hybrid','concat') \ + else pixels *2 + # first paste places old image at top of extended image, stretch # it, and applies a gaussian blur to it # take the top half region, stretch and paste it @@ -105,7 +108,9 @@ class Outcrop(object): # now make the top part transparent to use as a mask alpha = extended_img.getchannel('A') - alpha.paste(0,(0,0,extended_img.width,pixels*2)) + alpha.paste(0,(0,0,extended_img.width,mask_height)) extended_img.putalpha(alpha) + extended_img.save('outputs/curly_extended.png') + return extended_img diff --git a/ldm/models/diffusion/sampler.py b/ldm/models/diffusion/sampler.py index fd7ba106c1..4e9fce0102 100644 --- a/ldm/models/diffusion/sampler.py +++ b/ldm/models/diffusion/sampler.py @@ -265,6 +265,7 @@ class Sampler(object): ) if mask is not None: + print('DEBUG: in masking routine') assert x0 is not None img_orig = self.model.q_sample( x0, ts From 3c1ef48fe24c027131b7c846970bec963b6c2b69 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 13:57:42 -0400 Subject: [PATCH 10/15] fix crash when doing img2img with ddim sampler and SD 1.5 --- ldm/models/diffusion/sampler.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/ldm/models/diffusion/sampler.py b/ldm/models/diffusion/sampler.py index 
4e9fce0102..ccebed3bfc 100644 --- a/ldm/models/diffusion/sampler.py +++ b/ldm/models/diffusion/sampler.py @@ -319,20 +319,6 @@ class Sampler(object): init_latent = None, mask = None, ): - - print(f'DEBUG(sampler): cond = {cond}') - if cond is not None: - if isinstance(cond, dict): - ctmp = cond[list(cond.keys())[0]] - while isinstance(ctmp, list): - ctmp = ctmp[0] - cbs = ctmp.shape[0] - if cbs != batch_size: - print(f"Warning: Got {cbs} conds but batch-size is {batch_size}") - else: - if cond.shape[0] != batch_size: - print(f"Warning: Got {cond.shape[0]} conditionings but batch-size is {batch_size}") - timesteps = ( np.arange(self.ddpm_num_timesteps) if use_original_steps From ca2f579f434269a8061f3fa85c9b2772330cbd30 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 15:56:07 -0400 Subject: [PATCH 11/15] prevent crash when providing empty quoted prompt ("") --- ldm/invoke/args.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ldm/invoke/args.py b/ldm/invoke/args.py index 2f8e2303cb..7068bd83c1 100644 --- a/ldm/invoke/args.py +++ b/ldm/invoke/args.py @@ -181,7 +181,9 @@ class Args(object): switches_started = False for element in elements: - if element[0] == '-' and not switches_started: + if len(element) == 0: # empty prompt + pass + elif element[0] == '-' and not switches_started: switches_started = True if switches_started: switches.append(element) From 8d5a2250117b8542bddce3e1733207382051f326 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 17:26:00 -0400 Subject: [PATCH 12/15] allow for empty prompts (useful for inpaint removal) --- scripts/invoke.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/invoke.py b/scripts/invoke.py index faa85de80e..466536bc46 100644 --- a/scripts/invoke.py +++ b/scripts/invoke.py @@ -171,9 +171,9 @@ def main_loop(gen, opt): except (OSError, AttributeError, KeyError): pass - if len(opt.prompt) == 0: - print('\nTry again with a prompt!') - continue +# if len(opt.prompt) == 0: +# print('\nTry again with a prompt!') +# continue # width and height are set by model if not specified if not opt.width: From d3047c7cb096681226df5e2f5615923891af41e1 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Tue, 25 Oct 2022 22:44:42 -0400 Subject: [PATCH 13/15] do not encode init image in starting latent --- ldm/invoke/generator/omnibus.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ldm/invoke/generator/omnibus.py b/ldm/invoke/generator/omnibus.py index 2fccd7c6f7..d841cac64f 100644 --- a/ldm/invoke/generator/omnibus.py +++ b/ldm/invoke/generator/omnibus.py @@ -64,7 +64,6 @@ class Omnibus(Img2Img,Txt2Img): mask_image = torch.ones(1, 1, init_image.shape[2], init_image.shape[3], device=self.model.device) # and the masked image is just a copy of the original masked_image = init_image - t_enc = int(strength * steps) else: # txt2img init_image = torch.zeros(1, 3, height, width, device=self.model.device) @@ -111,7 +110,7 @@ class Omnibus(Img2Img,Txt2Img): samples, _ = sampler.sample( batch_size = 1, - S = t_enc, + S = steps, x_T = x_T, conditioning = cond, shape = shape, @@ -145,7 +144,4 @@ class Omnibus(Img2Img,Txt2Img): return batch def get_noise(self, width:int, height:int): - if self.init_latent: - return super(Img2Img,self).get_noise(width,height) - else: - return super(Txt2Img,self).get_noise(width,height) + return super(Txt2Img,self).get_noise(width,height) From 906dafe3cd286a0c7fc079d1d37fcec6756e4aec Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: 
Wed, 26 Oct 2022 00:18:31 -0400 Subject: [PATCH 14/15] make variations work with inpainting model --- ldm/invoke/generator/base.py | 5 ++++- ldm/invoke/generator/img2img.py | 2 +- ldm/invoke/generator/omnibus.py | 6 +++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/ldm/invoke/generator/base.py b/ldm/invoke/generator/base.py index e326bcfe8d..2e96c93cbb 100644 --- a/ldm/invoke/generator/base.py +++ b/ldm/invoke/generator/base.py @@ -6,6 +6,7 @@ import torch import numpy as np import random import os +import traceback from tqdm import tqdm, trange from PIL import Image, ImageFilter from einops import rearrange, repeat @@ -82,7 +83,9 @@ class Generator(): try: x_T = self.get_noise(width,height) except: - pass + print('** An error occurred while getting initial noise **') + print(traceback.format_exc()) + image = make_image(x_T) if self.safety_checker is not None: diff --git a/ldm/invoke/generator/img2img.py b/ldm/invoke/generator/img2img.py index 31c3ca256e..73eb2e6a06 100644 --- a/ldm/invoke/generator/img2img.py +++ b/ldm/invoke/generator/img2img.py @@ -14,7 +14,7 @@ from ldm.models.diffusion.ddim import DDIMSampler class Img2Img(Generator): def __init__(self, model, precision): super().__init__(model, precision) - self.init_latent = None # by get_noise() + self.init_latent = None # by get_noise() def get_make_image(self,prompt,sampler,steps,cfg_scale,ddim_eta, conditioning,init_image,strength,step_callback=None,threshold=0.0,perlin=0.0,**kwargs): diff --git a/ldm/invoke/generator/omnibus.py b/ldm/invoke/generator/omnibus.py index d841cac64f..e0705ec397 100644 --- a/ldm/invoke/generator/omnibus.py +++ b/ldm/invoke/generator/omnibus.py @@ -70,6 +70,7 @@ class Omnibus(Img2Img,Txt2Img): mask_image = torch.ones(1, 1, height, width, device=self.model.device) masked_image = init_image + self.init_latent = init_image height = init_image.shape[2] width = init_image.shape[3] model = self.model @@ -144,4 +145,7 @@ class Omnibus(Img2Img,Txt2Img): return batch def get_noise(self, width:int, height:int): - return super(Txt2Img,self).get_noise(width,height) + if self.init_latent is not None: + height = self.init_latent.shape[2] + width = self.init_latent.shape[3] + return Txt2Img.get_noise(self,width,height) From b1da13a984b7e0c8e5d7fa09f250b355040db303 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Wed, 26 Oct 2022 08:29:56 -0400 Subject: [PATCH 15/15] minor cleanups - change default model back to 1.4 - remove --fnformat from canonicalized dream prompt arguments (not needed for image reproducibility) - add -tm to canonicalized dream prompt arguments (definitely needed for image reproducibility) --- configs/models.yaml | 2 +- ldm/invoke/args.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/configs/models.yaml b/configs/models.yaml index 40bdc88cd8..162da38da2 100644 --- a/configs/models.yaml +++ b/configs/models.yaml @@ -12,6 +12,7 @@ stable-diffusion-1.4: description: Stable Diffusion inference model version 1.4 width: 512 height: 512 + default: true inpainting-1.5: description: runwayML tuned inpainting model v1.5 weights: models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt @@ -19,7 +20,6 @@ inpainting-1.5: # vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt width: 512 height: 512 - default: true stable-diffusion-1.5: config: configs/stable-diffusion/v1-inference.yaml weights: models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt diff --git a/ldm/invoke/args.py b/ldm/invoke/args.py index 7068bd83c1..a57928e22f 100644 --- 
a/ldm/invoke/args.py +++ b/ldm/invoke/args.py @@ -219,7 +219,6 @@ class Args(object): switches.append(f'-W {a["width"]}') switches.append(f'-H {a["height"]}') switches.append(f'-C {a["cfg_scale"]}') - switches.append(f'--fnformat {a["fnformat"]}') if a['perlin'] > 0: switches.append(f'--perlin {a["perlin"]}') if a['threshold'] > 0: @@ -245,6 +244,8 @@ class Args(object): switches.append(f'-f {a["strength"]}') if a['inpaint_replace']: switches.append(f'--inpaint_replace') + if a['text_mask']: + switches.append(f'-tm {" ".join([str(u) for u in a["text_mask"]])}') else: switches.append(f'-A {a["sampler_name"]}')