diff --git a/backend/server.py b/backend/server.py
index f14c141e12..7b8a8a5a69 100644
--- a/backend/server.py
+++ b/backend/server.py
@@ -527,7 +527,7 @@ def parameters_to_generated_image_metadata(parameters):
     rfc_dict["sampler"] = parameters["sampler_name"]
 
     # display weighted subprompts (liable to change)
-    subprompts = split_weighted_subprompts(parameters["prompt"], skip_normalize=True)
+    subprompts = split_weighted_subprompts(parameters["prompt"])
     subprompts = [{"prompt": x[0], "weight": x[1]} for x in subprompts]
     rfc_dict["prompt"] = subprompts
 
diff --git a/cross_attention_loop.py b/cross_attention_loop.py
deleted file mode 100644
index ed3e3b0462..0000000000
--- a/cross_attention_loop.py
+++ /dev/null
@@ -1,186 +0,0 @@
-import random
-import traceback
-
-import numpy as np
-import torch
-
-from diffusers import (LMSDiscreteScheduler)
-from PIL import Image
-from torch import autocast
-from tqdm.auto import tqdm
-
-import ldm.models.diffusion.cross_attention as c_a_c
-
-
-@torch.no_grad()
-def stablediffusion(
-        clip,
-        clip_tokenizer,
-        device,
-        vae,
-        unet,
-        prompt='',
-        prompt_edit=None,
-        prompt_edit_token_weights=None,
-        prompt_edit_tokens_start=0.0,
-        prompt_edit_tokens_end=1.0,
-        prompt_edit_spatial_start=0.0,
-        prompt_edit_spatial_end=1.0,
-        guidance_scale=7.5,
-        steps=50,
-        seed=None,
-        width=512,
-        height=512,
-        init_image=None,
-        init_image_strength=0.5,
-        ):
-    if prompt_edit_token_weights is None:
-        prompt_edit_token_weights = []
-    # Change size to multiple of 64 to prevent size mismatches inside model
-    width = width - width % 64
-    height = height - height % 64
-
-    # If seed is None, randomly select seed from 0 to 2^32-1
-    if seed is None: seed = random.randrange(2**32 - 1)
-    generator = torch.manual_seed(seed)
-
-    # Set inference timesteps to scheduler
-    scheduler = LMSDiscreteScheduler(beta_start=0.00085,
-                                     beta_end=0.012,
-                                     beta_schedule='scaled_linear',
-                                     num_train_timesteps=1000,
-                                     )
-    scheduler.set_timesteps(steps)
-
-    # Preprocess image if it exists (img2img)
-    if init_image is not None:
-        # Resize and transpose for numpy b h w c -> torch b c h w
-        init_image = init_image.resize((width, height), resample=Image.Resampling.LANCZOS)
-        init_image = np.array(init_image).astype(np.float32) / 255.0 * 2.0 - 1.0
-        init_image = torch.from_numpy(init_image[np.newaxis, ...].transpose(0, 3, 1, 2))
-
-        # If there is alpha channel, composite alpha for white, as the diffusion
-        # model does not support alpha channel
-        if init_image.shape[1] > 3:
-            init_image = init_image[:, :3] * init_image[:, 3:] + (1 - init_image[:, 3:])
-
-        # Move image to GPU
-        init_image = init_image.to(device)
-
-        # Encode image
-        with autocast(device):
-            init_latent = (vae.encode(init_image)
-                           .latent_dist
-                           .sample(generator=generator)
-                           * 0.18215)
-
-        t_start = steps - int(steps * init_image_strength)
-
-    else:
-        init_latent = torch.zeros((1, unet.in_channels, height // 8, width // 8),
-                                  device=device)
-        t_start = 0
-
-    # Generate random normal noise
-    noise = torch.randn(init_latent.shape, generator=generator, device=device)
-    latent = scheduler.add_noise(init_latent,
-                                 noise,
-                                 torch.tensor([scheduler.timesteps[t_start]], device=device)
-                                 ).to(device)
-
-    # Process clip
-    with autocast(device):
-        tokens_uncond = clip_tokenizer('', padding='max_length',
-                                       max_length=clip_tokenizer.model_max_length,
-                                       truncation=True, return_tensors='pt',
-                                       return_overflowing_tokens=True
-                                       )
-        embedding_uncond = clip(tokens_uncond.input_ids.to(device)).last_hidden_state
-
-        tokens_cond = clip_tokenizer(prompt,
-                                     padding='max_length',
-                                     max_length=clip_tokenizer.model_max_length,
-                                     truncation=True, return_tensors='pt',
-                                     return_overflowing_tokens=True
-                                     )
-        embedding_cond = clip(tokens_cond.input_ids.to(device)).last_hidden_state
-
-        # Process prompt editing
-        if prompt_edit is not None:
-            tokens_cond_edit = clip_tokenizer(prompt_edit, padding='max_length',
-                                              max_length=clip_tokenizer.model_max_length,
-                                              truncation=True, return_tensors='pt',
-                                              return_overflowing_tokens=True
-                                              )
-            embedding_cond_edit = clip(tokens_cond_edit.input_ids.to(device)).last_hidden_state
-
-            c_a_c.init_attention_edit(tokens_cond, tokens_cond_edit)
-
-        c_a_c.init_attention_func()
-        c_a_c.init_attention_weights(prompt_edit_token_weights)
-
-        timesteps = scheduler.timesteps[t_start:]
-
-        for idx, timestep in tqdm(enumerate(timesteps), total=len(timesteps)):
-            t_index = t_start + idx
-
-            latent_model_input = latent
-            latent_model_input = scheduler.scale_model_input(latent_model_input, timestep)
-
-            # Predict the unconditional noise residual
-            noise_pred_uncond = unet(latent_model_input,
-                                     timestep,
-                                     encoder_hidden_states=embedding_uncond
-                                     ).sample
-
-            # Prepare the Cross-Attention layers
-            if prompt_edit is not None:
-                c_a_c.save_last_tokens_attention()
-                c_a_c.save_last_self_attention()
-            else:
-                # Use weights on non-edited prompt when edit is None
-                c_a_c.use_last_tokens_attention_weights()
-
-            # Predict the conditional noise residual and save the
-            # cross-attention layer activations
-            noise_pred_cond = unet(latent_model_input,
-                                   timestep,
-                                   encoder_hidden_states=embedding_cond
-                                   ).sample
-
-            # Edit the Cross-Attention layer activations
-            if prompt_edit is not None:
-                t_scale = timestep / scheduler.num_train_timesteps
-                if (t_scale >= prompt_edit_tokens_start
-                        and t_scale <= prompt_edit_tokens_end):
-                    c_a_c.use_last_tokens_attention()
-                if (t_scale >= prompt_edit_spatial_start
-                        and t_scale <= prompt_edit_spatial_end):
-                    c_a_c.use_last_self_attention()
-
-                # Use weights on edited prompt
-                c_a_c.use_last_tokens_attention_weights()
-
-                # Predict the edited conditional noise residual using the
-                # cross-attention masks
-                noise_pred_cond = unet(latent_model_input,
-                                       timestep,
-                                       encoder_hidden_states=embedding_cond_edit
-                                       ).sample
-
-            # Perform guidance
-            noise_pred = (noise_pred_uncond + guidance_scale
-                          * (noise_pred_cond - noise_pred_uncond))
-
-            latent = scheduler.step(noise_pred,
-                                    t_index,
-                                    latent
-                                    ).prev_sample
-
-        # scale and decode the image latents with vae
-        latent = latent / 0.18215
-        image = vae.decode(latent.to(vae.dtype)).sample
-
-    image = (image / 2 + 0.5).clamp(0, 1)
-    image = image.cpu().permute(0, 2, 3, 1).numpy()
-    image = (image[0] * 255).round().astype('uint8')
-    return Image.fromarray(image)
diff --git a/ldm/invoke/generator/img2img.py b/ldm/invoke/generator/img2img.py
index cfe3ff99bc..2f5e6e61d0 100644
--- a/ldm/invoke/generator/img2img.py
+++ b/ldm/invoke/generator/img2img.py
@@ -51,9 +51,8 @@ class Img2Img(Generator):
             img_callback = step_callback,
             unconditional_guidance_scale=cfg_scale,
             unconditional_conditioning=uc,
-            init_latent = self.init_latent,
+            init_latent = self.init_latent, # changes how noising is performed in ksampler
             extra_conditioning_info = extra_conditioning_info
-            # changes how noising is performed in ksampler
         )
 
         return self.sample_to_image(samples)
diff --git a/ldm/invoke/txt2mask.py b/ldm/invoke/txt2mask.py
index 01d93546e3..bc8251abde 100644
--- a/ldm/invoke/txt2mask.py
+++ b/ldm/invoke/txt2mask.py
@@ -29,9 +29,9 @@ work fine.
 
 import torch
 import numpy as np
-from models.clipseg import CLIPDensePredT
+from clipseg_models.clipseg import CLIPDensePredT
 from einops import rearrange, repeat
-from PIL import Image
+from PIL import Image, ImageOps
 from torchvision import transforms
 
 CLIP_VERSION = 'ViT-B/16'
@@ -50,9 +50,14 @@ class SegmentedGrayscale(object):
         discrete_heatmap = self.heatmap.lt(threshold).int()
         return self._rescale(Image.fromarray(np.uint8(discrete_heatmap*255),mode='L'))
 
-    def to_transparent(self)->Image:
+    def to_transparent(self,invert:bool=False)->Image:
         transparent_image = self.image.copy()
-        transparent_image.putalpha(self.to_grayscale())
+        gs = self.to_grayscale()
+        # The following line looks like a bug, but isn't.
+        # For img2img, we want the selected regions to be transparent,
+        # but to_grayscale() returns the opposite.
+        gs = ImageOps.invert(gs) if not invert else gs
+        transparent_image.putalpha(gs)
         return transparent_image
 
     # unscales and uncrops the 352x352 heatmap so that it matches the image again
@@ -79,7 +84,7 @@ class Txt2Mask(object):
         self.model.load_state_dict(torch.load(CLIPSEG_WEIGHTS, map_location=torch.device('cpu')), strict=False)
 
     @torch.no_grad()
-    def segment(self, image:Image, prompt:str) -> SegmentedGrayscale:
+    def segment(self, image, prompt:str) -> SegmentedGrayscale:
         '''
         Given a prompt string such as "a bagel", tries to identify the object in the
         provided image and returns a SegmentedGrayscale object in which the brighter
@@ -94,6 +99,10 @@
             transforms.Resize((CLIPSEG_SIZE, CLIPSEG_SIZE)),  # must be multiple of 64...
         ])
 
+        if type(image) is str:
+            image = Image.open(image).convert('RGB')
+
+        image = ImageOps.exif_transpose(image)
         img = self._scale_and_crop(image)
         img = transform(img).unsqueeze(0)
diff --git a/ldm/models/diffusion/ddim.py b/ldm/models/diffusion/ddim.py
index 98219fb62e..71944a9b7e 100644
--- a/ldm/models/diffusion/ddim.py
+++ b/ldm/models/diffusion/ddim.py
@@ -1,5 +1,4 @@
 """SAMPLING ONLY."""
-from typing import Union
 
 import torch
 
 from ldm.models.diffusion.shared_invokeai_diffusion import InvokeAIDiffuserComponent
@@ -29,7 +28,7 @@ class DDIMSampler(Sampler):
     def p_sample(
         self,
         x,
-        c: Union[torch.Tensor, list],
+        c,
         t,
         index,
         repeat_noise=False,
diff --git a/ldm/modules/diffusionmodules/model.py b/ldm/modules/diffusionmodules/model.py
index 73218d36f8..739710d006 100644
--- a/ldm/modules/diffusionmodules/model.py
+++ b/ldm/modules/diffusionmodules/model.py
@@ -8,7 +8,7 @@ import numpy as np
 from einops import rearrange
 
 from ldm.util import instantiate_from_config
-#from ldm.modules.attention import LinearAttention
+from ldm.modules.attention import LinearAttention
 
 import psutil
 
@@ -151,10 +151,10 @@ class ResnetBlock(nn.Module):
         return x + h
 
 
-#class LinAttnBlock(LinearAttention):
-#    """to match AttnBlock usage"""
-#    def __init__(self, in_channels):
-#        super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
+class LinAttnBlock(LinearAttention):
+    """to match AttnBlock usage"""
+    def __init__(self, in_channels):
+        super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
 
 
 class AttnBlock(nn.Module):
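Usage note for the ldm/invoke/txt2mask.py changes above: segment() now accepts either a PIL.Image or a filesystem path and normalizes EXIF orientation before masking, and to_transparent() gained an invert flag. A minimal sketch of the intended call pattern, assuming the CLIPseg weights are already installed and that Txt2Mask(device=...) matches the actual constructor; the image path and prompt below are purely illustrative:

    from ldm.invoke.txt2mask import Txt2Mask

    txt2mask = Txt2Mask(device='cpu')  # assumed constructor signature

    # segment() takes a PIL.Image or a path string; EXIF orientation is
    # normalized internally before the CLIPseg heatmap is computed.
    segmented = txt2mask.segment('photos/bagel.png', 'a bagel')  # hypothetical inputs

    mask = segmented.to_grayscale()               # brighter pixels = stronger match to the prompt
    rgba = segmented.to_transparent()             # matched region becomes transparent (img2img default)
    kept = segmented.to_transparent(invert=True)  # matched region stays opaque instead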