diff --git a/configs/stable-diffusion/v1-inpainting-inference.yaml b/configs/stable-diffusion/v1-inpainting-inference.yaml
index 5652e04374..3ea164a359 100644
--- a/configs/stable-diffusion/v1-inpainting-inference.yaml
+++ b/configs/stable-diffusion/v1-inpainting-inference.yaml
@@ -76,4 +76,4 @@ model:
           target: torch.nn.Identity
 
     cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      target: ldm.modules.encoders.modules.WeightedFrozenCLIPEmbedder
diff --git a/ldm/models/diffusion/ksampler.py b/ldm/models/diffusion/ksampler.py
index e3c8729dc3..80404179a8 100644
--- a/ldm/models/diffusion/ksampler.py
+++ b/ldm/models/diffusion/ksampler.py
@@ -43,14 +43,7 @@ class CFGDenoiser(nn.Module):
 
 
     def forward(self, x, sigma, uncond, cond, cond_scale):
-        if isinstance(cond,dict): # hybrid model
-            x_in = torch.cat([x] * 2)
-            sigma_in = torch.cat([sigma] * 2)
-            cond_in = self.sampler.make_cond_in(uncond,cond)
-            uncond, cond = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2)
-            next_x = uncond + (cond - uncond) * cond_scale
-        else: # cross attention model
-            next_x = self.invokeai_diffuser.do_diffusion_step(x, sigma, uncond, cond, cond_scale)
+        next_x = self.invokeai_diffuser.do_diffusion_step(x, sigma, uncond, cond, cond_scale)
         if self.warmup < self.warmup_max:
             thresh = max(1, 1 + (self.threshold - 1) * (self.warmup / self.warmup_max))
             self.warmup += 1
diff --git a/ldm/models/diffusion/shared_invokeai_diffusion.py b/ldm/models/diffusion/shared_invokeai_diffusion.py
index b8a7a04d0e..e985417b2b 100644
--- a/ldm/models/diffusion/shared_invokeai_diffusion.py
+++ b/ldm/models/diffusion/shared_invokeai_diffusion.py
@@ -90,7 +90,19 @@ class InvokeAIDiffuserComponent:
             # faster batched path
             x_twice = torch.cat([x]*2)
             sigma_twice = torch.cat([sigma]*2)
-            both_conditionings = torch.cat([unconditioning, conditioning])
+            if isinstance(conditioning, dict):
+                assert isinstance(unconditioning, dict)
+                both_conditionings = dict()
+                for k in conditioning:
+                    if isinstance(conditioning[k], list):
+                        both_conditionings[k] = [
+                            torch.cat([unconditioning[k][i], conditioning[k][i]])
+                            for i in range(len(conditioning[k]))
+                        ]
+                    else:
+                        both_conditionings[k] = torch.cat([unconditioning[k], conditioning[k]])
+            else:
+                both_conditionings = torch.cat([unconditioning, conditioning])
             unconditioned_next_x, conditioned_next_x = self.model_forward_callback(x_twice, sigma_twice, both_conditionings).chunk(2)
         else:
             #print('pct', percent_through, ': doing cross attention control on', cross_attention_control_types_to_do)
diff --git a/ldm/modules/encoders/modules.py b/ldm/modules/encoders/modules.py
index 8917a27a40..670ab3f298 100644
--- a/ldm/modules/encoders/modules.py
+++ b/ldm/modules/encoders/modules.py
@@ -439,7 +439,7 @@ class FrozenCLIPEmbedder(AbstractEncoder):
             param.requires_grad = False
 
     def forward(self, text, **kwargs):
-
+        print(f'DEBUG text={text}, max_length={self.max_length}')
         batch_encoding = self.tokenizer(
             text,
             truncation=True,
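
For reference, a minimal standalone sketch (not part of the patch) of how the dict-aware batching added to shared_invokeai_diffusion.py combines hybrid conditionings; the c_crossattn/c_concat keys and tensor shapes below are illustrative assumptions, not taken from the diff.

# Sketch: concatenate uncond/cond along the batch dimension, handling both
# plain tensor conditioning and hybrid (dict-of-tensors-or-lists) conditioning.
import torch

def batch_conditionings(unconditioning, conditioning):
    if isinstance(conditioning, dict):
        assert isinstance(unconditioning, dict)
        both = {}
        for k in conditioning:
            if isinstance(conditioning[k], list):
                # some models carry a list of tensors per key
                both[k] = [torch.cat([unconditioning[k][i], conditioning[k][i]])
                           for i in range(len(conditioning[k]))]
            else:
                both[k] = torch.cat([unconditioning[k], conditioning[k]])
        return both
    # plain cross-attention conditioning: a single tensor
    return torch.cat([unconditioning, conditioning])

# Usage with assumed keys/shapes resembling an inpainting-style hybrid model:
uncond = {'c_crossattn': [torch.zeros(1, 77, 768)], 'c_concat': [torch.zeros(1, 5, 64, 64)]}
cond   = {'c_crossattn': [torch.ones(1, 77, 768)],  'c_concat': [torch.ones(1, 5, 64, 64)]}
both = batch_conditionings(uncond, cond)
print(both['c_crossattn'][0].shape)  # torch.Size([2, 77, 768])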