Merge branch 'development' into main

This commit is contained in:
Lincoln Stein
2022-09-26 02:26:13 -04:00
committed by GitHub
185 changed files with 10601 additions and 5947 deletions

View File

@@ -2,7 +2,10 @@
The Args class parses both the command line (shell) arguments, as well as the
command string passed at the dream> prompt. It serves as the definitive repository
of all the arguments used by Generate and their default values.
of all the arguments used by Generate and their default values, and implements the
preliminary metadata standards discussed here:
https://github.com/lstein/stable-diffusion/issues/266
To use:
opt = Args()
@@ -52,15 +55,40 @@ you wish to apply logic as to which one to use. For example:
To add new attributes, edit the _create_arg_parser() and
_create_dream_cmd_parser() methods.
We also export the function build_metadata
**Generating and retrieving sd-metadata**
To generate a dict representing RFC266 metadata:
metadata = metadata_dumps(opt,<seeds,model_hash,postprocesser>)
This will generate an RFC266 dictionary that can then be turned into a JSON
and written to the PNG file. The optional seeds, weights, model_hash and
postprocesser arguments are not available to the opt object and so must be
provided externally. See how dream.py does it.
Note that this function was originally called format_metadata() and a wrapper
is provided that issues a deprecation notice.
To retrieve a (series of) opt objects corresponding to the metadata, do this:
opt_list = metadata_loads(metadata)
The metadata should be pulled out of the PNG image. pngwriter has a method
retrieve_metadata that will do this, or you can do it in one swell foop
with metadata_from_png():
opt_list = metadata_from_png('/path/to/image_file.png')
"""
import argparse
from argparse import Namespace
import shlex
import json
import hashlib
import os
import copy
import base64
import ldm.dream.pngwriter
from ldm.dream.conditioning import split_weighted_subprompts
SAMPLER_CHOICES = [
@@ -74,6 +102,13 @@ SAMPLER_CHOICES = [
'plms',
]
PRECISION_CHOICES = [
'auto',
'float32',
'autocast',
'float16',
]
# is there a way to pick this up during git commits?
APP_ID = 'lstein/stable-diffusion'
APP_VERSION = 'v1.15'
@@ -105,6 +140,7 @@ class Args(object):
try:
elements = shlex.split(command)
except ValueError:
import sys, traceback
print(traceback.format_exc(), file=sys.stderr)
return
switches = ['']
@@ -141,35 +177,49 @@ class Args(object):
a = vars(self)
a.update(kwargs)
switches = list()
switches.append(f'"{a["prompt"]}')
switches.append(f'"{a["prompt"]}"')
switches.append(f'-s {a["steps"]}')
switches.append(f'-S {a["seed"]}')
switches.append(f'-W {a["width"]}')
switches.append(f'-H {a["height"]}')
switches.append(f'-C {a["cfg_scale"]}')
switches.append(f'-A {a["sampler_name"]}')
switches.append(f'-S {a["seed"]}')
if a['grid']:
switches.append('--grid')
if a['iterations'] and a['iterations']>0:
switches.append(f'-n {a["iterations"]}')
if a['seamless']:
switches.append('--seamless')
# img2img generations have parameters relevant only to them and have special handling
if a['init_img'] and len(a['init_img'])>0:
switches.append(f'-I {a["init_img"]}')
if a['fit']:
switches.append(f'--fit')
if a['strength'] and a['strength']>0:
switches.append(f'-f {a["strength"]}')
switches.append(f'-A ddim') # TODO: FIX ME WHEN IMG2IMG SUPPORTS ALL SAMPLERS
if a['fit']:
switches.append(f'--fit')
if a['init_mask'] and len(a['init_mask'])>0:
switches.append(f'-M {a["init_mask"]}')
if a['init_color'] and len(a['init_color'])>0:
switches.append(f'--init_color {a["init_color"]}')
if a['strength'] and a['strength']>0:
switches.append(f'-f {a["strength"]}')
else:
switches.append(f'-A {a["sampler_name"]}')
# gfpgan-specific parameters
if a['gfpgan_strength']:
switches.append(f'-G {a["gfpgan_strength"]}')
# esrgan-specific parameters
if a['upscale']:
switches.append(f'-U {" ".join([str(u) for u in a["upscale"]])}')
# embiggen parameters
if a['embiggen']:
switches.append(f'--embiggen {" ".join([str(u) for u in a["embiggen"]])}')
if a['embiggen_tiles']:
switches.append(f'--embiggen_tiles {" ".join([str(u) for u in a["embiggen_tiles"]])}')
if a['variation_amount'] > 0:
switches.append(f'-v {a["variation_amount"]}')
# outpainting parameters
if a['out_direction']:
switches.append(f'-D {" ".join([str(u) for u in a["out_direction"]])}')
if a['with_variations']:
formatted_variations = ','.join(f'{seed}:{weight}' for seed, weight in (a["with_variations"]))
switches.append(f'-V {formatted_variations}')
@@ -189,10 +239,10 @@ class Args(object):
pass
if cmd_switches and arg_switches and name=='__dict__':
a = arg_switches.__dict__
a.update(cmd_switches.__dict__)
return a
return self._merge_dict(
arg_switches.__dict__,
cmd_switches.__dict__,
)
try:
return object.__getattribute__(self,name)
except AttributeError:
@@ -216,13 +266,8 @@ class Args(object):
# the arg value. For example, the --grid and --individual options are a little
# funny because of their push/pull relationship. This is how to handle it.
if name=='grid':
return value_arg or value_cmd # arg supersedes cmd
if name=='individual':
return value_cmd or value_arg # cmd supersedes arg
if value_cmd is not None:
return value_cmd
else:
return value_arg
return not cmd_switches.individual and value_arg # arg supersedes cmd
return value_cmd if value_cmd is not None else value_arg
def __setattr__(self,name,value):
if name.startswith('_'):
@@ -230,6 +275,14 @@ class Args(object):
else:
self._cmd_switches.__dict__[name] = value
def _merge_dict(self,dict1,dict2):
new_dict = {}
for k in set(list(dict1.keys())+list(dict2.keys())):
value1 = dict1.get(k,None)
value2 = dict2.get(k,None)
new_dict[k] = value2 if value2 is not None else value1
return new_dict
def _create_arg_parser(self):
'''
This defines all the arguments used on the command line when you launch
@@ -268,12 +321,38 @@ class Args(object):
default='stable-diffusion-1.4',
help='Indicates which diffusion model to load. (currently "stable-diffusion-1.4" (default) or "laion400m")',
)
model_group.add_argument(
'--sampler',
'-A',
'-m',
dest='sampler_name',
type=str,
choices=SAMPLER_CHOICES,
metavar='SAMPLER_NAME',
help=f'Switch to a different sampler. Supported samplers: {", ".join(SAMPLER_CHOICES)}',
default='k_lms',
)
model_group.add_argument(
'-F',
'--full_precision',
dest='full_precision',
action='store_true',
help='Use more memory-intensive full precision math for calculations',
help='Deprecated way to set --precision=float32',
)
model_group.add_argument(
'--free_gpu_mem',
dest='free_gpu_mem',
action='store_true',
help='Force free gpu memory before final decoding',
)
model_group.add_argument(
'--precision',
dest='precision',
type=str,
choices=PRECISION_CHOICES,
metavar='PRECISION',
help=f'Set model precision. Defaults to auto selected based on device. Options: {", ".join(PRECISION_CHOICES)}',
default='auto',
)
file_group.add_argument(
'--from_file',
@@ -294,11 +373,6 @@ class Args(object):
action='store_true',
help='Place images in subdirectories named after the prompt.',
)
render_group.add_argument(
'--seamless',
action='store_true',
help='Change the model to seamless tiling (circular) mode',
)
render_group.add_argument(
'--grid',
'-g',
@@ -310,16 +384,21 @@ class Args(object):
type=str,
help='Path to a pre-trained embedding manager checkpoint - can only be set on command line',
)
# GFPGAN related args
# Restoration related args
postprocessing_group.add_argument(
'--gfpgan_bg_upsampler',
type=str,
default='realesrgan',
help='Background upsampler. Default: realesrgan. Options: realesrgan, none.',
'--no_restore',
dest='restore',
action='store_false',
help='Disable face restoration with GFPGAN or codeformer',
)
postprocessing_group.add_argument(
'--gfpgan_bg_tile',
'--no_upscale',
dest='esrgan',
action='store_false',
help='Disable upscaling with ESRGAN',
)
postprocessing_group.add_argument(
'--esrgan_bg_tile',
type=int,
default=400,
help='Tile size for background sampler, 0 for no tile during testing. Default: 400.',
@@ -327,7 +406,7 @@ class Args(object):
postprocessing_group.add_argument(
'--gfpgan_model_path',
type=str,
default='experiments/pretrained_models/GFPGANv1.3.pth',
default='experiments/pretrained_models/GFPGANv1.4.pth',
help='Indicates the path to the GFPGAN model, relative to --gfpgan_dir.',
)
postprocessing_group.add_argument(
@@ -359,7 +438,10 @@ class Args(object):
# This creates the parser that processes commands on the dream> command line
def _create_dream_cmd_parser(self):
parser = argparse.ArgumentParser(
description='Example: dream> a fantastic alien landscape -W1024 -H960 -s100 -n12'
description="""
Generate example: dream> a fantastic alien landscape -W576 -H512 -s60 -n4
Postprocess example: dream> !pp 0000045.4829112.png -G1 -U4 -ft codeformer
"""
)
render_group = parser.add_argument_group('General rendering')
img2img_group = parser.add_argument_group('Image-to-image and inpainting')
@@ -393,14 +475,12 @@ class Args(object):
'--width',
type=int,
help='Image width, multiple of 64',
default=512
)
render_group.add_argument(
'-H',
'--height',
type=int,
help='Image height, multiple of 64',
default=512,
)
render_group.add_argument(
'-C',
@@ -416,8 +496,8 @@ class Args(object):
help='generate a grid'
)
render_group.add_argument(
'--individual',
'-i',
'--individual',
action='store_true',
help='override command-line --grid setting and generate individual images'
)
@@ -436,7 +516,6 @@ class Args(object):
choices=SAMPLER_CHOICES,
metavar='SAMPLER_NAME',
help=f'Switch to a different sampler. Supported samplers: {", ".join(SAMPLER_CHOICES)}',
default='k_lms',
)
render_group.add_argument(
'-t',
@@ -448,7 +527,6 @@ class Args(object):
'--outdir',
'-o',
type=str,
default='outputs/img-samples',
help='Directory to save generated images and a log of prompts and seeds',
)
img2img_group.add_argument(
@@ -463,6 +541,11 @@ class Args(object):
type=str,
help='Path to input mask for inpainting mode (supersedes width and height)',
)
img2img_group.add_argument(
'--init_color',
type=str,
help='Path to reference image for color correction (used for repeated img2img and inpainting)'
)
img2img_group.add_argument(
'-T',
'-fit',
@@ -477,12 +560,34 @@ class Args(object):
help='Strength for noising/unnoising. 0.0 preserves image exactly, 1.0 replaces it completely',
default=0.75,
)
img2img_group.add_argument(
'-D',
'--out_direction',
nargs='+',
type=str,
metavar=('direction', 'pixels'),
help='Direction to extend the given image (left|right|top|bottom). If a distance pixel value is not specified it defaults to half the image size'
)
postprocessing_group.add_argument(
'-ft',
'--facetool',
type=str,
default='gfpgan',
help='Select the face restoration AI to use: gfpgan, codeformer',
)
postprocessing_group.add_argument(
'-G',
'--gfpgan_strength',
type=float,
help='The strength at which to apply the GFPGAN model to the result, in order to improve faces.',
default=0,
help='The strength at which to apply the face restoration to the result.',
default=0.0,
)
postprocessing_group.add_argument(
'-cf',
'--codeformer_fidelity',
type=float,
help='Used along with CodeFormer. Takes values between 0 and 1. 0 produces high quality but low accuracy. 1 produces high accuracy but low quality.',
default=0.75
)
postprocessing_group.add_argument(
'-U',
@@ -535,36 +640,55 @@ class Args(object):
)
return parser
# very partial implementation of https://github.com/lstein/stable-diffusion/issues/266
# it does not write all the required top-level metadata, writes too much image
# data, and doesn't support grids yet. But you gotta start somewhere, no?
def format_metadata(opt,
seeds=[],
weights=None,
model_hash=None,
postprocessing=None):
def format_metadata(**kwargs):
print(f'format_metadata() is deprecated. Please use metadata_dumps()')
return metadata_dumps(kwargs)
def metadata_dumps(opt,
seeds=[],
model_hash=None,
postprocessing=None):
'''
Given an Args object, returns a partial implementation of
the stable diffusion metadata standard
Given an Args object, returns a dict containing the keys and
structure of the proposed stable diffusion metadata standard
https://github.com/lstein/stable-diffusion/discussions/392
This is intended to be turned into JSON and stored in the
"sd
'''
# top-level metadata minus `image` or `images`
metadata = {
'model' : 'stable diffusion',
'model_id' : opt.model,
'model_hash' : model_hash,
'app_id' : APP_ID,
'app_version' : APP_VERSION,
}
# add some RFC266 fields that are generated internally, and not as
# user args
image_dict = opt.to_dict(
postprocessing=postprocessing
)
# TODO: This is just a hack until postprocessing pipeline work completed
image_dict['postprocessing'] = []
if image_dict['gfpgan_strength'] and image_dict['gfpgan_strength'] > 0:
image_dict['postprocessing'].append('GFPGAN (not RFC compliant)')
if image_dict['upscale'] and image_dict['upscale'][0] > 0:
image_dict['postprocessing'].append('ESRGAN (not RFC compliant)')
# 'postprocessing' is either null or an array of postprocessing metadatal
if postprocessing:
# TODO: This is just a hack until postprocessing pipeline work completed
image_dict['postprocessing'] = []
if image_dict['gfpgan_strength'] and image_dict['gfpgan_strength'] > 0:
image_dict['postprocessing'].append('GFPGAN (not RFC compliant)')
if image_dict['upscale'] and image_dict['upscale'][0] > 0:
image_dict['postprocessing'].append('ESRGAN (not RFC compliant)')
else:
image_dict['postprocessing'] = None
# remove any image keys not mentioned in RFC #266
rfc266_img_fields = ['type','postprocessing','sampler','prompt','seed','variations','steps',
'cfg_scale','step_number','width','height','extra','strength']
rfc_dict ={}
for item in image_dict.items():
key,value = item
if key in rfc266_img_fields:
@@ -572,39 +696,95 @@ def format_metadata(opt,
# semantic drift
rfc_dict['sampler'] = image_dict.get('sampler_name',None)
# display weighted subprompts (liable to change)
if opt.prompt:
subprompts = split_weighted_subprompts(opt.prompt)
subprompts = [{'prompt':x[0],'weight':x[1]} for x in subprompts]
rfc_dict['prompt'] = subprompts
# variations
if opt.with_variations:
variations = [{'seed':x[0],'weight':x[1]} for x in opt.with_variations]
rfc_dict['variations'] = variations
# 'variations' should always exist and be an array, empty or consisting of {'seed': seed, 'weight': weight} pairs
rfc_dict['variations'] = [{'seed':x[0],'weight':x[1]} for x in opt.with_variations] if opt.with_variations else []
if opt.init_img:
rfc_dict['type'] = 'img2img'
rfc_dict['strength_steps'] = rfc_dict.pop('strength')
rfc_dict['orig_hash'] = sha256(image_dict['init_img'])
rfc_dict['sampler'] = 'ddim' # FIX ME WHEN IMG2IMG SUPPORTS ALL SAMPLERS
rfc_dict['orig_hash'] = calculate_init_img_hash(opt.init_img)
rfc_dict['sampler'] = 'ddim' # TODO: FIX ME WHEN IMG2IMG SUPPORTS ALL SAMPLERS
else:
rfc_dict['type'] = 'txt2img'
rfc_dict.pop('strength')
images = []
for seed in seeds:
rfc_dict['seed'] = seed
images.append(copy.copy(rfc_dict))
if len(seeds)==0 and opt.seed:
seeds=[seed]
return {
'model' : 'stable diffusion',
'model_id' : opt.model,
'model_hash' : model_hash,
'app_id' : APP_ID,
'app_version' : APP_VERSION,
'images' : images,
}
if opt.grid:
images = []
for seed in seeds:
rfc_dict['seed'] = seed
images.append(copy.copy(rfc_dict))
metadata['images'] = images
else:
# there should only ever be a single seed if we did not generate a grid
assert len(seeds) == 1, 'Expected a single seed'
rfc_dict['seed'] = seeds[0]
metadata['image'] = rfc_dict
return metadata
def metadata_from_png(png_file_path):
'''
Given the path to a PNG file created by dream.py, retrieves
an Args object containing the image metadata
'''
meta = ldm.dream.pngwriter.retrieve_metadata(png_file_path)
opts = metadata_loads(meta)
return opts[0]
def metadata_loads(metadata):
'''
Takes the dictionary corresponding to RFC266 (https://github.com/lstein/stable-diffusion/issues/266)
and returns a series of opt objects for each of the images described in the dictionary.
'''
results = []
try:
if 'grid' in metadata['sd-metadata']:
images = metadata['sd-metadata']['images']
else:
images = [metadata['sd-metadata']['image']]
for image in images:
# repack the prompt and variations
if 'prompt' in image:
image['prompt'] = ','.join([':'.join([x['prompt'], str(x['weight'])]) for x in image['prompt']])
if 'variations' in image:
image['variations'] = ','.join([':'.join([str(x['seed']),str(x['weight'])]) for x in image['variations']])
# fix a bit of semantic drift here
image['sampler_name']=image.pop('sampler')
opt = Args()
opt._cmd_switches = Namespace(**image)
results.append(opt)
except KeyError as e:
import sys, traceback
print('>> badly-formatted metadata',file=sys.stderr)
print(traceback.format_exc(), file=sys.stderr)
return results
# image can either be a file path on disk or a base64-encoded
# representation of the file's contents
def calculate_init_img_hash(image_string):
prefix = 'data:image/png;base64,'
hash = None
if image_string.startswith(prefix):
imagebase64 = image_string[len(prefix):]
imagedata = base64.b64decode(imagebase64)
with open('outputs/test.png','wb') as file:
file.write(imagedata)
sha = hashlib.sha256()
sha.update(imagedata)
hash = sha.hexdigest()
else:
hash = sha256(image_string)
return hash
# Bah. This should be moved somewhere else...
def sha256(path):

View File

@@ -13,7 +13,20 @@ import re
import torch
def get_uc_and_c(prompt, model, log_tokens=False, skip_normalize=False):
uc = model.get_learned_conditioning([''])
# Extract Unconditioned Words From Prompt
unconditioned_words = ''
unconditional_regex = r'\[(.*?)\]'
unconditionals = re.findall(unconditional_regex, prompt)
if len(unconditionals) > 0:
unconditioned_words = ' '.join(unconditionals)
# Remove Unconditioned Words From Prompt
unconditional_regex_compile = re.compile(unconditional_regex)
clean_prompt = unconditional_regex_compile.sub(' ', prompt)
prompt = re.sub(' +', ' ', clean_prompt)
uc = model.get_learned_conditioning([unconditioned_words])
# get weighted sub-prompts
weighted_subprompts = split_weighted_subprompts(
@@ -25,15 +38,16 @@ def get_uc_and_c(prompt, model, log_tokens=False, skip_normalize=False):
c = torch.zeros_like(uc)
# normalize each "sub prompt" and add it
for subprompt, weight in weighted_subprompts:
log_tokenization(subprompt, model, log_tokens)
log_tokenization(subprompt, model, log_tokens, weight)
c = torch.add(
c,
model.get_learned_conditioning([subprompt]),
alpha=weight,
)
else: # just standard 1 prompt
log_tokenization(prompt, model, log_tokens)
log_tokenization(prompt, model, log_tokens, 1)
c = model.get_learned_conditioning([prompt])
uc = model.get_learned_conditioning([unconditioned_words])
return (uc, c)
def split_weighted_subprompts(text, skip_normalize=False)->list:
@@ -65,14 +79,14 @@ def split_weighted_subprompts(text, skip_normalize=False)->list:
if weight_sum == 0:
print(
"Warning: Subprompt weights add up to zero. Discarding and using even weights instead.")
equal_weight = 1 / len(parsed_prompts)
equal_weight = 1 / max(len(parsed_prompts), 1)
return [(x[0], equal_weight) for x in parsed_prompts]
return [(x[0], x[1] / weight_sum) for x in parsed_prompts]
# shows how the prompt is tokenized
# usually tokens have '</w>' to indicate end-of-word,
# but for readability it has been replaced with ' '
def log_tokenization(text, model, log=False):
def log_tokenization(text, model, log=False, weight=1):
if not log:
return
tokens = model.cond_stage_model.tokenizer._tokenize(text)
@@ -89,8 +103,8 @@ def log_tokenization(text, model, log=False):
usedTokens += 1
else: # over max token length
discarded = discarded + f"\x1b[0;3{s};40m{token}"
print(f"\n>> Tokens ({usedTokens}):\n{tokenized}\x1b[0m")
if discarded != "":
print(
f">> Tokens Discarded ({totalTokens-usedTokens}):\n{discarded}\x1b[0m"
)
print(f"\n>> Tokens ({usedTokens}), Weight ({weight:.2f}):\n{tokenized}\x1b[0m")
if discarded != "":
print(
f">> Tokens Discarded ({totalTokens-usedTokens}):\n{discarded}\x1b[0m"
)

View File

@@ -1,6 +1,6 @@
import torch
from torch import autocast
from contextlib import contextmanager, nullcontext
from contextlib import nullcontext
def choose_torch_device() -> str:
'''Convenience routine for guessing which GPU device to run model on'''
@@ -10,15 +10,18 @@ def choose_torch_device() -> str:
return 'mps'
return 'cpu'
def choose_autocast_device(device):
'''Returns an autocast compatible device from a torch device'''
device_type = device.type # this returns 'mps' on M1
# autocast only for cuda, but GTX 16xx have issues with it
if device_type == 'cuda':
device_name = torch.cuda.get_device_name()
if 'GeForce GTX 1660' in device_name or 'GeForce GTX 1650' in device_name:
return device_type,nullcontext
else:
return device_type,autocast
else:
return 'cpu',nullcontext
def choose_precision(device) -> str:
'''Returns an appropriate precision for the given torch device'''
if device.type == 'cuda':
device_name = torch.cuda.get_device_name(device)
if not ('GeForce GTX 1660' in device_name or 'GeForce GTX 1650' in device_name):
return 'float16'
return 'float32'
def choose_autocast(precision):
'''Returns an autocast context or nullcontext for the given precision string'''
# float16 currently requires autocast to avoid errors like:
# 'expected scalar type Half but found Float'
if precision == 'autocast' or precision == 'float16':
return autocast
return nullcontext

View File

@@ -9,13 +9,14 @@ from tqdm import tqdm, trange
from PIL import Image
from einops import rearrange, repeat
from pytorch_lightning import seed_everything
from ldm.dream.devices import choose_autocast_device
from ldm.dream.devices import choose_autocast
downsampling = 8
class Generator():
def __init__(self,model):
def __init__(self, model, precision):
self.model = model
self.precision = precision
self.seed = None
self.latent_channels = model.channels
self.downsampling_factor = downsampling # BUG: should come from model or config
@@ -38,7 +39,7 @@ class Generator():
def generate(self,prompt,init_image,width,height,iterations=1,seed=None,
image_callback=None, step_callback=None,
**kwargs):
device_type,scope = choose_autocast_device(self.model.device)
scope = choose_autocast(self.precision)
make_image = self.get_make_image(
prompt,
init_image = init_image,
@@ -50,8 +51,9 @@ class Generator():
results = []
seed = seed if seed else self.new_seed()
first_seed = seed
seed, initial_noise = self.generate_initial_noise(seed, width, height)
with scope(device_type), self.model.ema_scope():
with scope(self.model.device.type), self.model.ema_scope():
for n in trange(iterations, desc='Generating'):
x_T = None
if self.variation_amount > 0:
@@ -70,7 +72,7 @@ class Generator():
image = make_image(x_T)
results.append([image, seed])
if image_callback is not None:
image_callback(image, seed)
image_callback(image, seed, first_seed=first_seed)
seed = self.new_seed()
return results

View File

@@ -12,8 +12,8 @@ from ldm.dream.generator.base import Generator
from ldm.dream.generator.img2img import Img2Img
class Embiggen(Generator):
def __init__(self,model):
super().__init__(model)
def __init__(self, model, precision):
super().__init__(model, precision)
self.init_latent = None
# Replace generate because Embiggen doesn't need/use most of what it does normallly
@@ -62,19 +62,20 @@ class Embiggen(Generator):
Return value depends on the seed at the time you call it
"""
# Construct embiggen arg array, and sanity check arguments
if embiggen == None: # embiggen can also be called with just embiggen_tiles
embiggen = [1.0] # If not specified, assume no scaling
elif embiggen[0] < 0 :
if embiggen == None: # embiggen can also be called with just embiggen_tiles
embiggen = [1.0] # If not specified, assume no scaling
elif embiggen[0] < 0:
embiggen[0] = 1.0
print('>> Embiggen scaling factor cannot be negative, fell back to the default of 1.0 !')
print(
'>> Embiggen scaling factor cannot be negative, fell back to the default of 1.0 !')
if len(embiggen) < 2:
embiggen.append(0.75)
elif embiggen[1] > 1.0 or embiggen[1] < 0 :
elif embiggen[1] > 1.0 or embiggen[1] < 0:
embiggen[1] = 0.75
print('>> Embiggen upscaling strength for ESRGAN must be between 0 and 1, fell back to the default of 0.75 !')
if len(embiggen) < 3:
embiggen.append(0.25)
elif embiggen[2] < 0 :
elif embiggen[2] < 0:
embiggen[2] = 0.25
print('>> Overlap size for Embiggen must be a positive ratio between 0 and 1 OR a number of pixels, fell back to the default of 0.25 !')
@@ -84,8 +85,11 @@ class Embiggen(Generator):
embiggen_tiles = list(map(lambda n: n-1, embiggen_tiles))
embiggen_tiles.sort()
if strength >= 0.5:
print(f'* WARNING: Embiggen may produce mirror motifs if the strength (-f) is too high (currently {strength}). Try values between 0.35-0.45.')
# Prep img2img generator, since we wrap over it
gen_img2img = Img2Img(self.model)
gen_img2img = Img2Img(self.model,self.precision)
# Open original init image (not a tensor) to manipulate
initsuperimage = Image.open(init_img)
@@ -100,29 +104,30 @@ class Embiggen(Generator):
if embiggen[0] != 1.0:
initsuperwidth = round(initsuperwidth*embiggen[0])
initsuperheight = round(initsuperheight*embiggen[0])
if embiggen[1] > 0: # No point in ESRGAN upscaling if strength is set zero
from ldm.gfpgan.gfpgan_tools import (
real_esrgan_upscale,
)
print(f'>> ESRGAN upscaling init image prior to cutting with Embiggen with strength {embiggen[1]}')
if embiggen[1] > 0: # No point in ESRGAN upscaling if strength is set zero
from ldm.dream.restoration.realesrgan import ESRGAN
esrgan = ESRGAN()
print(
f'>> ESRGAN upscaling init image prior to cutting with Embiggen with strength {embiggen[1]}')
if embiggen[0] > 2:
initsuperimage = real_esrgan_upscale(
initsuperimage = esrgan.process(
initsuperimage,
embiggen[1], # upscale strength
4, # upscale scale
embiggen[1], # upscale strength
self.seed,
4, # upscale scale
)
else:
initsuperimage = real_esrgan_upscale(
initsuperimage = esrgan.process(
initsuperimage,
embiggen[1], # upscale strength
2, # upscale scale
embiggen[1], # upscale strength
self.seed,
2, # upscale scale
)
# We could keep recursively re-running ESRGAN for a requested embiggen[0] larger than 4x
# but from personal experiance it doesn't greatly improve anything after 4x
# Resize to target scaling factor resolution
initsuperimage = initsuperimage.resize((initsuperwidth, initsuperheight), Image.Resampling.LANCZOS)
initsuperimage = initsuperimage.resize(
(initsuperwidth, initsuperheight), Image.Resampling.LANCZOS)
# Use width and height as tile widths and height
# Determine buffer size in pixels
@@ -145,25 +150,28 @@ class Embiggen(Generator):
emb_tiles_x = 1
emb_tiles_y = 1
if (initsuperwidth - width) > 0:
emb_tiles_x = ceildiv(initsuperwidth - width, width - overlap_size_x) + 1
emb_tiles_x = ceildiv(initsuperwidth - width,
width - overlap_size_x) + 1
if (initsuperheight - height) > 0:
emb_tiles_y = ceildiv(initsuperheight - height, height - overlap_size_y) + 1
emb_tiles_y = ceildiv(initsuperheight - height,
height - overlap_size_y) + 1
# Sanity
assert emb_tiles_x > 1 or emb_tiles_y > 1, f'ERROR: Based on the requested dimensions of {initsuperwidth}x{initsuperheight} and tiles of {width}x{height} you don\'t need to Embiggen! Check your arguments.'
# Prep alpha layers --------------
# https://stackoverflow.com/questions/69321734/how-to-create-different-transparency-like-gradient-with-python-pil
# agradientL is Left-side transparent
agradientL = Image.linear_gradient('L').rotate(90).resize((overlap_size_x, height))
agradientL = Image.linear_gradient('L').rotate(
90).resize((overlap_size_x, height))
# agradientT is Top-side transparent
agradientT = Image.linear_gradient('L').resize((width, overlap_size_y))
# radial corner is the left-top corner, made full circle then cut to just the left-top quadrant
agradientC = Image.new('L', (256, 256))
for y in range(256):
for x in range(256):
#Find distance to lower right corner (numpy takes arrays)
# Find distance to lower right corner (numpy takes arrays)
distanceToLR = np.sqrt([(255 - x) ** 2 + (255 - y) ** 2])[0]
#Clamp values to max 255
# Clamp values to max 255
if distanceToLR > 255:
distanceToLR = 255
#Place the pixel as invert of distance
@@ -200,54 +208,73 @@ class Embiggen(Generator):
if embiggen_tiles:
# Individual unconnected sides
alphaLayerR = Image.new("L", (width, height), 255)
alphaLayerR.paste(agradientL.rotate(180), (width - overlap_size_x, 0))
alphaLayerR.paste(agradientL.rotate(
180), (width - overlap_size_x, 0))
alphaLayerB = Image.new("L", (width, height), 255)
alphaLayerB.paste(agradientT.rotate(180), (0, height - overlap_size_y))
alphaLayerB.paste(agradientT.rotate(
180), (0, height - overlap_size_y))
alphaLayerTB = Image.new("L", (width, height), 255)
alphaLayerTB.paste(agradientT, (0, 0))
alphaLayerTB.paste(agradientT.rotate(180), (0, height - overlap_size_y))
alphaLayerTB.paste(agradientT.rotate(
180), (0, height - overlap_size_y))
alphaLayerLR = Image.new("L", (width, height), 255)
alphaLayerLR.paste(agradientL, (0, 0))
alphaLayerLR.paste(agradientL.rotate(180), (width - overlap_size_x, 0))
alphaLayerLR.paste(agradientL.rotate(
180), (width - overlap_size_x, 0))
# Sides and corner Layers
alphaLayerRBC = Image.new("L", (width, height), 255)
alphaLayerRBC.paste(agradientL.rotate(180), (width - overlap_size_x, 0))
alphaLayerRBC.paste(agradientT.rotate(180), (0, height - overlap_size_y))
alphaLayerRBC.paste(agradientC.rotate(180).resize((overlap_size_x, overlap_size_y)), (width - overlap_size_x, height - overlap_size_y))
alphaLayerRBC.paste(agradientL.rotate(
180), (width - overlap_size_x, 0))
alphaLayerRBC.paste(agradientT.rotate(
180), (0, height - overlap_size_y))
alphaLayerRBC.paste(agradientC.rotate(180).resize(
(overlap_size_x, overlap_size_y)), (width - overlap_size_x, height - overlap_size_y))
alphaLayerLBC = Image.new("L", (width, height), 255)
alphaLayerLBC.paste(agradientL, (0, 0))
alphaLayerLBC.paste(agradientT.rotate(180), (0, height - overlap_size_y))
alphaLayerLBC.paste(agradientC.rotate(90).resize((overlap_size_x, overlap_size_y)), (0, height - overlap_size_y))
alphaLayerLBC.paste(agradientT.rotate(
180), (0, height - overlap_size_y))
alphaLayerLBC.paste(agradientC.rotate(90).resize(
(overlap_size_x, overlap_size_y)), (0, height - overlap_size_y))
alphaLayerRTC = Image.new("L", (width, height), 255)
alphaLayerRTC.paste(agradientL.rotate(180), (width - overlap_size_x, 0))
alphaLayerRTC.paste(agradientL.rotate(
180), (width - overlap_size_x, 0))
alphaLayerRTC.paste(agradientT, (0, 0))
alphaLayerRTC.paste(agradientC.rotate(270).resize((overlap_size_x, overlap_size_y)), (width - overlap_size_x, 0))
alphaLayerRTC.paste(agradientC.rotate(270).resize(
(overlap_size_x, overlap_size_y)), (width - overlap_size_x, 0))
# All but X layers
alphaLayerABT = Image.new("L", (width, height), 255)
alphaLayerABT.paste(alphaLayerLBC, (0, 0))
alphaLayerABT.paste(agradientL.rotate(180), (width - overlap_size_x, 0))
alphaLayerABT.paste(agradientC.rotate(180).resize((overlap_size_x, overlap_size_y)), (width - overlap_size_x, height - overlap_size_y))
alphaLayerABT.paste(agradientL.rotate(
180), (width - overlap_size_x, 0))
alphaLayerABT.paste(agradientC.rotate(180).resize(
(overlap_size_x, overlap_size_y)), (width - overlap_size_x, height - overlap_size_y))
alphaLayerABL = Image.new("L", (width, height), 255)
alphaLayerABL.paste(alphaLayerRTC, (0, 0))
alphaLayerABL.paste(agradientT.rotate(180), (0, height - overlap_size_y))
alphaLayerABL.paste(agradientC.rotate(180).resize((overlap_size_x, overlap_size_y)), (width - overlap_size_x, height - overlap_size_y))
alphaLayerABL.paste(agradientT.rotate(
180), (0, height - overlap_size_y))
alphaLayerABL.paste(agradientC.rotate(180).resize(
(overlap_size_x, overlap_size_y)), (width - overlap_size_x, height - overlap_size_y))
alphaLayerABR = Image.new("L", (width, height), 255)
alphaLayerABR.paste(alphaLayerLBC, (0, 0))
alphaLayerABR.paste(agradientT, (0, 0))
alphaLayerABR.paste(agradientC.resize((overlap_size_x, overlap_size_y)), (0, 0))
alphaLayerABR.paste(agradientC.resize(
(overlap_size_x, overlap_size_y)), (0, 0))
alphaLayerABB = Image.new("L", (width, height), 255)
alphaLayerABB.paste(alphaLayerRTC, (0, 0))
alphaLayerABB.paste(agradientL, (0, 0))
alphaLayerABB.paste(agradientC.resize((overlap_size_x, overlap_size_y)), (0, 0))
alphaLayerABB.paste(agradientC.resize(
(overlap_size_x, overlap_size_y)), (0, 0))
# All-around layer
alphaLayerAA = Image.new("L", (width, height), 255)
alphaLayerAA.paste(alphaLayerABT, (0, 0))
alphaLayerAA.paste(agradientT, (0, 0))
alphaLayerAA.paste(agradientC.resize((overlap_size_x, overlap_size_y)), (0, 0))
alphaLayerAA.paste(agradientC.rotate(270).resize((overlap_size_x, overlap_size_y)), (width - overlap_size_x, 0))
alphaLayerAA.paste(agradientC.resize(
(overlap_size_x, overlap_size_y)), (0, 0))
alphaLayerAA.paste(agradientC.rotate(270).resize(
(overlap_size_x, overlap_size_y)), (width - overlap_size_x, 0))
# Clean up temporary gradients
del agradientL
@@ -259,7 +286,8 @@ class Embiggen(Generator):
if embiggen_tiles:
print(f'>> Making {len(embiggen_tiles)} Embiggen tiles...')
else:
print(f'>> Making {(emb_tiles_x * emb_tiles_y)} Embiggen tiles ({emb_tiles_x}x{emb_tiles_y})...')
print(
f'>> Making {(emb_tiles_x * emb_tiles_y)} Embiggen tiles ({emb_tiles_x}x{emb_tiles_y})...')
emb_tile_store = []
# Although we could use the same seed for every tile for determinism, at higher strengths this may
@@ -294,20 +322,23 @@ class Embiggen(Generator):
top = round(emb_row_i * (height - overlap_size_y))
right = left + width
bottom = top + height
# Cropped image of above dimension (does not modify the original)
newinitimage = initsuperimage.crop((left, top, right, bottom))
# DEBUG:
# newinitimagepath = init_img[0:-4] + f'_emb_Ti{tile}.png'
# newinitimage.save(newinitimagepath)
if embiggen_tiles:
print(f'Making tile #{tile + 1} ({embiggen_tiles.index(tile) + 1} of {len(embiggen_tiles)} requested)')
print(
f'Making tile #{tile + 1} ({embiggen_tiles.index(tile) + 1} of {len(embiggen_tiles)} requested)')
else:
print(f'Starting {tile + 1} of {(emb_tiles_x * emb_tiles_y)} tiles')
print(
f'Starting {tile + 1} of {(emb_tiles_x * emb_tiles_y)} tiles')
# create a torch tensor from an Image
newinitimage = np.array(newinitimage).astype(np.float32) / 255.0
newinitimage = np.array(
newinitimage).astype(np.float32) / 255.0
newinitimage = newinitimage[None].transpose(0, 3, 1, 2)
newinitimage = torch.from_numpy(newinitimage)
newinitimage = 2.0 * newinitimage - 1.0
@@ -329,18 +360,35 @@ class Embiggen(Generator):
init_image = newinitimage, # notice that init_image is different from init_img
mask_image = None,
strength = strength,
iterations=1,
seed=self.seed,
sampler=sampler,
steps=steps,
cfg_scale=cfg_scale,
conditioning=conditioning,
ddim_eta=ddim_eta,
image_callback=None, # called only after the final image is generated
step_callback=step_callback, # called after each intermediate image is generated
width=width,
height=height,
init_img=init_img, # img2img doesn't need this, but it might in the future
init_image=newinitimage, # notice that init_image is different from init_img
mask_image=None,
strength=strength,
)
emb_tile_store.append(tile_results[0][0])
# DEBUG (but, also has other uses), worth saving if you want tiles without a transparency overlap to manually composite
# emb_tile_store[-1].save(init_img[0:-4] + f'_emb_To{tile}.png')
del newinitimage
# Sanity check we have them all
if len(emb_tile_store) == (emb_tiles_x * emb_tiles_y) or (embiggen_tiles != [] and len(emb_tile_store) == len(embiggen_tiles)):
outputsuperimage = Image.new("RGBA", (initsuperwidth, initsuperheight))
outputsuperimage = Image.new(
"RGBA", (initsuperwidth, initsuperheight))
if embiggen_tiles:
outputsuperimage.alpha_composite(initsuperimage.convert('RGBA'), (0, 0))
outputsuperimage.alpha_composite(
initsuperimage.convert('RGBA'), (0, 0))
for tile in range(emb_tiles_x * emb_tiles_y):
if embiggen_tiles:
if tile in embiggen_tiles:
@@ -361,7 +409,8 @@ class Embiggen(Generator):
if emb_column_i + 1 == emb_tiles_x:
left = initsuperwidth - width
else:
left = round(emb_column_i * (width - overlap_size_x))
left = round(emb_column_i *
(width - overlap_size_x))
if emb_row_i + 1 == emb_tiles_y:
top = initsuperheight - height
else:
@@ -372,26 +421,26 @@ class Embiggen(Generator):
# top of image
if emb_row_i == 0:
if emb_column_i == 0:
if (tile+1) in embiggen_tiles: # Look-ahead right
if (tile+emb_tiles_x) not in embiggen_tiles: # Look-ahead down
if (tile+1) in embiggen_tiles: # Look-ahead right
if (tile+emb_tiles_x) not in embiggen_tiles: # Look-ahead down
intileimage.putalpha(alphaLayerB)
# Otherwise do nothing on this tile
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
intileimage.putalpha(alphaLayerR)
else:
intileimage.putalpha(alphaLayerRBC)
elif emb_column_i == emb_tiles_x - 1:
if (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down
if (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down
intileimage.putalpha(alphaLayerL)
else:
intileimage.putalpha(alphaLayerLBC)
else:
if (tile+1) in embiggen_tiles: # Look-ahead right
if (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down
if (tile+1) in embiggen_tiles: # Look-ahead right
if (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down
intileimage.putalpha(alphaLayerL)
else:
intileimage.putalpha(alphaLayerLBC)
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
intileimage.putalpha(alphaLayerLR)
else:
intileimage.putalpha(alphaLayerABT)
@@ -418,12 +467,12 @@ class Embiggen(Generator):
intileimage.putalpha(alphaLayerTaC)
else:
intileimage.putalpha(alphaLayerTB)
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
intileimage.putalpha(alphaLayerRTC)
else:
intileimage.putalpha(alphaLayerABL)
elif emb_column_i == emb_tiles_x - 1:
if (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down
if (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down
intileimage.putalpha(alphaLayerLTC)
else:
intileimage.putalpha(alphaLayerABR)
@@ -433,7 +482,7 @@ class Embiggen(Generator):
intileimage.putalpha(alphaLayerLTaC)
else:
intileimage.putalpha(alphaLayerABR)
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
intileimage.putalpha(alphaLayerABB)
else:
intileimage.putalpha(alphaLayerAA)
@@ -459,4 +508,4 @@ class Embiggen(Generator):
# after internal loops and patching up return Embiggen image
return outputsuperimage
# end of function declaration
return make_image
return make_image

View File

@@ -4,15 +4,15 @@ ldm.dream.generator.img2img descends from ldm.dream.generator
import torch
import numpy as np
from ldm.dream.devices import choose_autocast_device
from ldm.dream.devices import choose_autocast
from ldm.dream.generator.base import Generator
from ldm.models.diffusion.ddim import DDIMSampler
class Img2Img(Generator):
def __init__(self,model):
super().__init__(model)
def __init__(self, model, precision):
super().__init__(model, precision)
self.init_latent = None # by get_noise()
@torch.no_grad()
def get_make_image(self,prompt,sampler,steps,cfg_scale,ddim_eta,
conditioning,init_image,strength,step_callback=None,**kwargs):
@@ -32,8 +32,8 @@ class Img2Img(Generator):
ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False
)
device_type,scope = choose_autocast_device(self.model.device)
with scope(device_type):
scope = choose_autocast(self.precision)
with scope(self.model.device.type):
self.init_latent = self.model.get_first_stage_encoding(
self.model.encode_first_stage(init_image)
) # move to latent space

View File

@@ -5,15 +5,15 @@ ldm.dream.generator.inpaint descends from ldm.dream.generator
import torch
import numpy as np
from einops import rearrange, repeat
from ldm.dream.devices import choose_autocast_device
from ldm.dream.devices import choose_autocast
from ldm.dream.generator.img2img import Img2Img
from ldm.models.diffusion.ddim import DDIMSampler
class Inpaint(Img2Img):
def __init__(self,model):
def __init__(self, model, precision):
self.init_latent = None
super().__init__(model)
super().__init__(model, precision)
@torch.no_grad()
def get_make_image(self,prompt,sampler,steps,cfg_scale,ddim_eta,
conditioning,init_image,mask_image,strength,
@@ -34,12 +34,12 @@ class Inpaint(Img2Img):
)
sampler = DDIMSampler(self.model, device=self.model.device)
sampler.make_schedule(
ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False
)
sampler.make_schedule(
ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False
)
device_type,scope = choose_autocast_device(self.model.device)
with scope(device_type):
scope = choose_autocast(self.precision)
with scope(self.model.device.type):
self.init_latent = self.model.get_first_stage_encoding(
self.model.encode_first_stage(init_image)
) # move to latent space

View File

@@ -7,9 +7,9 @@ import numpy as np
from ldm.dream.generator.base import Generator
class Txt2Img(Generator):
def __init__(self,model):
super().__init__(model)
def __init__(self, model, precision):
super().__init__(model, precision)
@torch.no_grad()
def get_make_image(self,prompt,sampler,steps,cfg_scale,ddim_eta,
conditioning,width,height,step_callback=None,**kwargs):
@@ -27,6 +27,10 @@ class Txt2Img(Generator):
height // self.downsampling_factor,
width // self.downsampling_factor,
]
if self.free_gpu_mem and self.model.model.device != self.model.device:
self.model.model.to(self.model.device)
samples, _ = sampler.sample(
batch_size = 1,
S = steps,
@@ -39,6 +43,10 @@ class Txt2Img(Generator):
eta = ddim_eta,
img_callback = step_callback
)
if self.free_gpu_mem:
self.model.model.to("cpu")
return self.sample_to_image(samples)
return make_image

61
ldm/dream/log.py Normal file
View File

@@ -0,0 +1,61 @@
"""
Functions for better format logging
write_log -- logs the name of the output image, prompt, and prompt args to the terminal and different types of file
1 write_log_message -- Writes a message to the console
2 write_log_files -- Writes a message to files
2.1 write_log_default -- File in plain text
2.2 write_log_txt -- File in txt format
2.3 write_log_markdown -- File in markdown format
"""
import os
def write_log(results, log_path, file_types, output_cntr):
"""
logs the name of the output image, prompt, and prompt args to the terminal and files
"""
output_cntr = write_log_message(results, output_cntr)
write_log_files(results, log_path, file_types)
return output_cntr
def write_log_message(results, output_cntr):
"""logs to the terminal"""
log_lines = [f"{path}: {prompt}\n" for path, prompt in results]
for l in log_lines:
output_cntr += 1
print(f"[{output_cntr}] {l}", end="")
return output_cntr
def write_log_files(results, log_path, file_types):
for file_type in file_types:
if file_type == "txt":
write_log_txt(log_path, results)
elif file_type == "md" or file_type == "markdown":
write_log_markdown(log_path, results)
else:
print(f"'{file_type}' format is not supported, so write in plain text")
write_log_default(log_path, results, file_type)
def write_log_default(log_path, results, file_type):
plain_txt_lines = [f"{path}: {prompt}\n" for path, prompt in results]
with open(log_path + "." + file_type, "a", encoding="utf-8") as file:
file.writelines(plain_txt_lines)
def write_log_txt(log_path, results):
txt_lines = [f"{path}: {prompt}\n" for path, prompt in results]
with open(log_path + ".txt", "a", encoding="utf-8") as file:
file.writelines(txt_lines)
def write_log_markdown(log_path, results):
md_lines = []
for path, prompt in results:
file_name = os.path.basename(path)
md_lines.append(f"## {file_name}\n![]({file_name})\n\n{prompt}\n")
with open(log_path + ".md", "a", encoding="utf-8") as file:
file.writelines(md_lines)

View File

@@ -34,7 +34,6 @@ class PngWriter:
# saves image named _image_ to outdir/name, writing metadata from prompt
# returns full path of output
def save_image_and_prompt_to_png(self, image, dream_prompt, name, metadata=None):
print(f'self.outdir={self.outdir}, name={name}')
path = os.path.join(self.outdir, name)
info = PngImagePlugin.PngInfo()
info.add_text('Dream', dream_prompt)

View File

@@ -22,11 +22,12 @@ class Completer:
def complete(self, text, state):
buffer = readline.get_line_buffer()
if text.startswith(('-I', '--init_img','-M','--init_mask')):
if text.startswith(('-I', '--init_img','-M','--init_mask',
'--init_color')):
return self._path_completions(text, state, ('.png','.jpg','.jpeg'))
if buffer.strip().endswith('cd') or text.startswith(('.', '/')):
return self._path_completions(text, state, ())
if buffer.strip().endswith('pp') or text.startswith(('.', '/')):
return self._path_completions(text, state, ('.png','.jpg','.jpeg'))
response = None
if state == 0:
@@ -57,6 +58,8 @@ class Completer:
path = text.replace('--init_mask=', '', 1).lstrip()
elif text.startswith('-M'):
path = text.replace('-M', '', 1).lstrip()
elif text.startswith('--init_color='):
path = text.replace('--init_color=', '', 1).lstrip()
else:
path = text
@@ -100,6 +103,7 @@ if readline_available:
'--individual','-i',
'--init_img','-I',
'--init_mask','-M',
'--init_color',
'--strength','-f',
'--variants','-v',
'--outdir','-o',

View File

@@ -0,0 +1,4 @@
'''
Initialization file for the ldm.dream.restoration package
'''
from .base import Restoration

View File

@@ -0,0 +1,38 @@
class Restoration():
def __init__(self) -> None:
pass
def load_face_restore_models(self, gfpgan_dir='./src/gfpgan', gfpgan_model_path='experiments/pretrained_models/GFPGANv1.4.pth'):
# Load GFPGAN
gfpgan = self.load_gfpgan(gfpgan_dir, gfpgan_model_path)
if gfpgan.gfpgan_model_exists:
print('>> GFPGAN Initialized')
else:
print('>> GFPGAN Disabled')
gfpgan = None
# Load CodeFormer
codeformer = self.load_codeformer()
if codeformer.codeformer_model_exists:
print('>> CodeFormer Initialized')
else:
print('>> CodeFormer Disabled')
codeformer = None
return gfpgan, codeformer
# Face Restore Models
def load_gfpgan(self, gfpgan_dir, gfpgan_model_path):
from ldm.dream.restoration.gfpgan import GFPGAN
return GFPGAN(gfpgan_dir, gfpgan_model_path)
def load_codeformer(self):
from ldm.dream.restoration.codeformer import CodeFormerRestoration
return CodeFormerRestoration()
# Upscale Models
def load_esrgan(self, esrgan_bg_tile=400):
from ldm.dream.restoration.realesrgan import ESRGAN
esrgan = ESRGAN(esrgan_bg_tile)
print('>> ESRGAN Initialized')
return esrgan;

View File

@@ -0,0 +1,84 @@
import os
import torch
import numpy as np
import warnings
import sys
pretrained_model_url = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth'
class CodeFormerRestoration():
def __init__(self,
codeformer_dir='ldm/dream/restoration/codeformer',
codeformer_model_path='weights/codeformer.pth') -> None:
self.model_path = os.path.join(codeformer_dir, codeformer_model_path)
self.codeformer_model_exists = os.path.isfile(self.model_path)
if not self.codeformer_model_exists:
print('## NOT FOUND: CodeFormer model not found at ' + self.model_path)
sys.path.append(os.path.abspath(codeformer_dir))
def process(self, image, strength, device, seed=None, fidelity=0.75):
if seed is not None:
print(f'>> CodeFormer - Restoring Faces for image seed:{seed}')
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
from basicsr.utils.download_util import load_file_from_url
from basicsr.utils import img2tensor, tensor2img
from facexlib.utils.face_restoration_helper import FaceRestoreHelper
from ldm.dream.restoration.codeformer_arch import CodeFormer
from torchvision.transforms.functional import normalize
from PIL import Image
cf_class = CodeFormer
cf = cf_class(dim_embd=512, codebook_size=1024, n_head=8, n_layers=9, connect_list=['32', '64', '128', '256']).to(device)
checkpoint_path = load_file_from_url(url=pretrained_model_url, model_dir=os.path.abspath('ldm/dream/restoration/codeformer/weights'), progress=True)
checkpoint = torch.load(checkpoint_path)['params_ema']
cf.load_state_dict(checkpoint)
cf.eval()
image = image.convert('RGB')
face_helper = FaceRestoreHelper(upscale_factor=1, use_parse=True, device=device)
face_helper.clean_all()
face_helper.read_image(np.array(image, dtype=np.uint8))
face_helper.get_face_landmarks_5(resize=640, eye_dist_threshold=5)
face_helper.align_warp_face()
for idx, cropped_face in enumerate(face_helper.cropped_faces):
cropped_face_t = img2tensor(cropped_face / 255., bgr2rgb=True, float32=True)
normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
cropped_face_t = cropped_face_t.unsqueeze(0).to(device)
try:
with torch.no_grad():
output = cf(cropped_face_t, w=fidelity, adain=True)[0]
restored_face = tensor2img(output.squeeze(0), rgb2bgr=True, min_max=(-1, 1))
del output
torch.cuda.empty_cache()
except RuntimeError as error:
print(f'\tFailed inference for CodeFormer: {error}.')
restored_face = cropped_face
restored_face = restored_face.astype('uint8')
face_helper.add_restored_face(restored_face)
face_helper.get_inverse_affine(None)
restored_img = face_helper.paste_faces_to_input_image()
res = Image.fromarray(restored_img)
if strength < 1.0:
# Resize the image to the new image if the sizes have changed
if restored_img.size != image.size:
image = image.resize(res.size)
res = Image.blend(image, res, strength)
cf = None
return res

View File

@@ -0,0 +1,3 @@
To use codeformer face reconstruction, you will need to copy
https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth
into this directory.

View File

@@ -0,0 +1,276 @@
import math
import numpy as np
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from typing import Optional, List
from ldm.dream.restoration.vqgan_arch import *
from basicsr.utils import get_root_logger
from basicsr.utils.registry import ARCH_REGISTRY
def calc_mean_std(feat, eps=1e-5):
"""Calculate mean and std for adaptive_instance_normalization.
Args:
feat (Tensor): 4D tensor.
eps (float): A small value added to the variance to avoid
divide-by-zero. Default: 1e-5.
"""
size = feat.size()
assert len(size) == 4, 'The input feature should be 4D tensor.'
b, c = size[:2]
feat_var = feat.view(b, c, -1).var(dim=2) + eps
feat_std = feat_var.sqrt().view(b, c, 1, 1)
feat_mean = feat.view(b, c, -1).mean(dim=2).view(b, c, 1, 1)
return feat_mean, feat_std
def adaptive_instance_normalization(content_feat, style_feat):
"""Adaptive instance normalization.
Adjust the reference features to have the similar color and illuminations
as those in the degradate features.
Args:
content_feat (Tensor): The reference feature.
style_feat (Tensor): The degradate features.
"""
size = content_feat.size()
style_mean, style_std = calc_mean_std(style_feat)
content_mean, content_std = calc_mean_std(content_feat)
normalized_feat = (content_feat - content_mean.expand(size)) / content_std.expand(size)
return normalized_feat * style_std.expand(size) + style_mean.expand(size)
class PositionEmbeddingSine(nn.Module):
"""
This is a more standard version of the position embedding, very similar to the one
used by the Attention is all you need paper, generalized to work on images.
"""
def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
super().__init__()
self.num_pos_feats = num_pos_feats
self.temperature = temperature
self.normalize = normalize
if scale is not None and normalize is False:
raise ValueError("normalize should be True if scale is passed")
if scale is None:
scale = 2 * math.pi
self.scale = scale
def forward(self, x, mask=None):
if mask is None:
mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
not_mask = ~mask
y_embed = not_mask.cumsum(1, dtype=torch.float32)
x_embed = not_mask.cumsum(2, dtype=torch.float32)
if self.normalize:
eps = 1e-6
y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
pos_x = x_embed[:, :, :, None] / dim_t
pos_y = y_embed[:, :, :, None] / dim_t
pos_x = torch.stack(
(pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
).flatten(3)
pos_y = torch.stack(
(pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
).flatten(3)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
return pos
def _get_activation_fn(activation):
"""Return an activation function given a string"""
if activation == "relu":
return F.relu
if activation == "gelu":
return F.gelu
if activation == "glu":
return F.glu
raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
class TransformerSALayer(nn.Module):
def __init__(self, embed_dim, nhead=8, dim_mlp=2048, dropout=0.0, activation="gelu"):
super().__init__()
self.self_attn = nn.MultiheadAttention(embed_dim, nhead, dropout=dropout)
# Implementation of Feedforward model - MLP
self.linear1 = nn.Linear(embed_dim, dim_mlp)
self.dropout = nn.Dropout(dropout)
self.linear2 = nn.Linear(dim_mlp, embed_dim)
self.norm1 = nn.LayerNorm(embed_dim)
self.norm2 = nn.LayerNorm(embed_dim)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.activation = _get_activation_fn(activation)
def with_pos_embed(self, tensor, pos: Optional[Tensor]):
return tensor if pos is None else tensor + pos
def forward(self, tgt,
tgt_mask: Optional[Tensor] = None,
tgt_key_padding_mask: Optional[Tensor] = None,
query_pos: Optional[Tensor] = None):
# self attention
tgt2 = self.norm1(tgt)
q = k = self.with_pos_embed(tgt2, query_pos)
tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
key_padding_mask=tgt_key_padding_mask)[0]
tgt = tgt + self.dropout1(tgt2)
# ffn
tgt2 = self.norm2(tgt)
tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
tgt = tgt + self.dropout2(tgt2)
return tgt
class Fuse_sft_block(nn.Module):
def __init__(self, in_ch, out_ch):
super().__init__()
self.encode_enc = ResBlock(2*in_ch, out_ch)
self.scale = nn.Sequential(
nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
nn.LeakyReLU(0.2, True),
nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))
self.shift = nn.Sequential(
nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
nn.LeakyReLU(0.2, True),
nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))
def forward(self, enc_feat, dec_feat, w=1):
enc_feat = self.encode_enc(torch.cat([enc_feat, dec_feat], dim=1))
scale = self.scale(enc_feat)
shift = self.shift(enc_feat)
residual = w * (dec_feat * scale + shift)
out = dec_feat + residual
return out
@ARCH_REGISTRY.register()
class CodeFormer(VQAutoEncoder):
def __init__(self, dim_embd=512, n_head=8, n_layers=9,
codebook_size=1024, latent_size=256,
connect_list=['32', '64', '128', '256'],
fix_modules=['quantize','generator']):
super(CodeFormer, self).__init__(512, 64, [1, 2, 2, 4, 4, 8], 'nearest',2, [16], codebook_size)
if fix_modules is not None:
for module in fix_modules:
for param in getattr(self, module).parameters():
param.requires_grad = False
self.connect_list = connect_list
self.n_layers = n_layers
self.dim_embd = dim_embd
self.dim_mlp = dim_embd*2
self.position_emb = nn.Parameter(torch.zeros(latent_size, self.dim_embd))
self.feat_emb = nn.Linear(256, self.dim_embd)
# transformer
self.ft_layers = nn.Sequential(*[TransformerSALayer(embed_dim=dim_embd, nhead=n_head, dim_mlp=self.dim_mlp, dropout=0.0)
for _ in range(self.n_layers)])
# logits_predict head
self.idx_pred_layer = nn.Sequential(
nn.LayerNorm(dim_embd),
nn.Linear(dim_embd, codebook_size, bias=False))
self.channels = {
'16': 512,
'32': 256,
'64': 256,
'128': 128,
'256': 128,
'512': 64,
}
# after second residual block for > 16, before attn layer for ==16
self.fuse_encoder_block = {'512':2, '256':5, '128':8, '64':11, '32':14, '16':18}
# after first residual block for > 16, before attn layer for ==16
self.fuse_generator_block = {'16':6, '32': 9, '64':12, '128':15, '256':18, '512':21}
# fuse_convs_dict
self.fuse_convs_dict = nn.ModuleDict()
for f_size in self.connect_list:
in_ch = self.channels[f_size]
self.fuse_convs_dict[f_size] = Fuse_sft_block(in_ch, in_ch)
def _init_weights(self, module):
if isinstance(module, (nn.Linear, nn.Embedding)):
module.weight.data.normal_(mean=0.0, std=0.02)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
def forward(self, x, w=0, detach_16=True, code_only=False, adain=False):
# ################### Encoder #####################
enc_feat_dict = {}
out_list = [self.fuse_encoder_block[f_size] for f_size in self.connect_list]
for i, block in enumerate(self.encoder.blocks):
x = block(x)
if i in out_list:
enc_feat_dict[str(x.shape[-1])] = x.clone()
lq_feat = x
# ################# Transformer ###################
# quant_feat, codebook_loss, quant_stats = self.quantize(lq_feat)
pos_emb = self.position_emb.unsqueeze(1).repeat(1,x.shape[0],1)
# BCHW -> BC(HW) -> (HW)BC
feat_emb = self.feat_emb(lq_feat.flatten(2).permute(2,0,1))
query_emb = feat_emb
# Transformer encoder
for layer in self.ft_layers:
query_emb = layer(query_emb, query_pos=pos_emb)
# output logits
logits = self.idx_pred_layer(query_emb) # (hw)bn
logits = logits.permute(1,0,2) # (hw)bn -> b(hw)n
if code_only: # for training stage II
# logits doesn't need softmax before cross_entropy loss
return logits, lq_feat
# ################# Quantization ###################
# if self.training:
# quant_feat = torch.einsum('btn,nc->btc', [soft_one_hot, self.quantize.embedding.weight])
# # b(hw)c -> bc(hw) -> bchw
# quant_feat = quant_feat.permute(0,2,1).view(lq_feat.shape)
# ------------
soft_one_hot = F.softmax(logits, dim=2)
_, top_idx = torch.topk(soft_one_hot, 1, dim=2)
quant_feat = self.quantize.get_codebook_feat(top_idx, shape=[x.shape[0],16,16,256])
# preserve gradients
# quant_feat = lq_feat + (quant_feat - lq_feat).detach()
if detach_16:
quant_feat = quant_feat.detach() # for training stage III
if adain:
quant_feat = adaptive_instance_normalization(quant_feat, lq_feat)
# ################## Generator ####################
x = quant_feat
fuse_list = [self.fuse_generator_block[f_size] for f_size in self.connect_list]
for i, block in enumerate(self.generator.blocks):
x = block(x)
if i in fuse_list: # fuse after i-th block
f_size = str(x.shape[-1])
if w>0:
x = self.fuse_convs_dict[f_size](enc_feat_dict[f_size].detach(), x, w)
out = x
# logits doesn't need softmax before cross_entropy loss
return out, logits, lq_feat

View File

@@ -0,0 +1,76 @@
import torch
import warnings
import os
import sys
import numpy as np
from PIL import Image
class GFPGAN():
def __init__(
self,
gfpgan_dir='src/gfpgan',
gfpgan_model_path='experiments/pretrained_models/GFPGANv1.4.pth') -> None:
self.model_path = os.path.join(gfpgan_dir, gfpgan_model_path)
self.gfpgan_model_exists = os.path.isfile(self.model_path)
if not self.gfpgan_model_exists:
print('## NOT FOUND: GFPGAN model not found at ' + self.model_path)
return None
sys.path.append(os.path.abspath(gfpgan_dir))
def model_exists(self):
return os.path.isfile(self.model_path)
def process(self, image, strength: float, seed: str = None):
if seed is not None:
print(f'>> GFPGAN - Restoring Faces for image seed:{seed}')
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
try:
from gfpgan import GFPGANer
self.gfpgan = GFPGANer(
model_path=self.model_path,
upscale=1,
arch='clean',
channel_multiplier=2,
bg_upsampler=None,
)
except Exception:
import traceback
print('>> Error loading GFPGAN:', file=sys.stderr)
print(traceback.format_exc(), file=sys.stderr)
if self.gfpgan is None:
print(
f'>> WARNING: GFPGAN not initialized.'
)
print(
f'>> Download https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth to {self.model_path}, \nor change GFPGAN directory with --gfpgan_dir.'
)
image = image.convert('RGB')
_, _, restored_img = self.gfpgan.enhance(
np.array(image, dtype=np.uint8),
has_aligned=False,
only_center_face=False,
paste_back=True,
)
res = Image.fromarray(restored_img)
if strength < 1.0:
# Resize the image to the new image if the sizes have changed
if restored_img.size != image.size:
image = image.resize(res.size)
res = Image.blend(image, res, strength)
if torch.cuda.is_available():
torch.cuda.empty_cache()
self.gfpgan = None
return res

View File

@@ -0,0 +1,102 @@
import torch
import warnings
import numpy as np
from PIL import Image
class ESRGAN():
def __init__(self, bg_tile_size=400) -> None:
self.bg_tile_size = bg_tile_size
if not torch.cuda.is_available(): # CPU or MPS on M1
use_half_precision = False
else:
use_half_precision = True
def load_esrgan_bg_upsampler(self, upsampler_scale):
if not torch.cuda.is_available(): # CPU or MPS on M1
use_half_precision = False
else:
use_half_precision = True
model_path = {
2: 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth',
4: 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth',
}
if upsampler_scale not in model_path:
return None
else:
from basicsr.archs.rrdbnet_arch import RRDBNet
from realesrgan import RealESRGANer
if upsampler_scale == 4:
model = RRDBNet(
num_in_ch=3,
num_out_ch=3,
num_feat=64,
num_block=23,
num_grow_ch=32,
scale=4,
)
if upsampler_scale == 2:
model = RRDBNet(
num_in_ch=3,
num_out_ch=3,
num_feat=64,
num_block=23,
num_grow_ch=32,
scale=2,
)
bg_upsampler = RealESRGANer(
scale=upsampler_scale,
model_path=model_path[upsampler_scale],
model=model,
tile=self.bg_tile_size,
tile_pad=10,
pre_pad=0,
half=use_half_precision,
)
return bg_upsampler
def process(self, image, strength: float, seed: str = None, upsampler_scale: int = 2):
if seed is not None:
print(
f'>> Real-ESRGAN Upscaling seed:{seed} : scale:{upsampler_scale}x'
)
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
try:
upsampler = self.load_esrgan_bg_upsampler(upsampler_scale)
except Exception:
import traceback
import sys
print('>> Error loading Real-ESRGAN:', file=sys.stderr)
print(traceback.format_exc(), file=sys.stderr)
output, _ = upsampler.enhance(
np.array(image, dtype=np.uint8),
outscale=upsampler_scale,
alpha_upsampler='realesrgan',
)
res = Image.fromarray(output)
if strength < 1.0:
# Resize the image to the new image if the sizes have changed
if output.size != image.size:
image = image.resize(res.size)
res = Image.blend(image, res, strength)
if torch.cuda.is_available():
torch.cuda.empty_cache()
upsampler = None
return res

View File

@@ -0,0 +1,435 @@
'''
VQGAN code, adapted from the original created by the Unleashing Transformers authors:
https://github.com/samb-t/unleashing-transformers/blob/master/models/vqgan.py
'''
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy
from basicsr.utils import get_root_logger
from basicsr.utils.registry import ARCH_REGISTRY
def normalize(in_channels):
return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
@torch.jit.script
def swish(x):
return x*torch.sigmoid(x)
# Define VQVAE classes
class VectorQuantizer(nn.Module):
def __init__(self, codebook_size, emb_dim, beta):
super(VectorQuantizer, self).__init__()
self.codebook_size = codebook_size # number of embeddings
self.emb_dim = emb_dim # dimension of embedding
self.beta = beta # commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2
self.embedding = nn.Embedding(self.codebook_size, self.emb_dim)
self.embedding.weight.data.uniform_(-1.0 / self.codebook_size, 1.0 / self.codebook_size)
def forward(self, z):
# reshape z -> (batch, height, width, channel) and flatten
z = z.permute(0, 2, 3, 1).contiguous()
z_flattened = z.view(-1, self.emb_dim)
# distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
d = (z_flattened ** 2).sum(dim=1, keepdim=True) + (self.embedding.weight**2).sum(1) - \
2 * torch.matmul(z_flattened, self.embedding.weight.t())
mean_distance = torch.mean(d)
# find closest encodings
# min_encoding_indices = torch.argmin(d, dim=1).unsqueeze(1)
min_encoding_scores, min_encoding_indices = torch.topk(d, 1, dim=1, largest=False)
# [0-1], higher score, higher confidence
min_encoding_scores = torch.exp(-min_encoding_scores/10)
min_encodings = torch.zeros(min_encoding_indices.shape[0], self.codebook_size).to(z)
min_encodings.scatter_(1, min_encoding_indices, 1)
# get quantized latent vectors
z_q = torch.matmul(min_encodings, self.embedding.weight).view(z.shape)
# compute loss for embedding
loss = torch.mean((z_q.detach()-z)**2) + self.beta * torch.mean((z_q - z.detach()) ** 2)
# preserve gradients
z_q = z + (z_q - z).detach()
# perplexity
e_mean = torch.mean(min_encodings, dim=0)
perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))
# reshape back to match original input shape
z_q = z_q.permute(0, 3, 1, 2).contiguous()
return z_q, loss, {
"perplexity": perplexity,
"min_encodings": min_encodings,
"min_encoding_indices": min_encoding_indices,
"min_encoding_scores": min_encoding_scores,
"mean_distance": mean_distance
}
def get_codebook_feat(self, indices, shape):
# input indices: batch*token_num -> (batch*token_num)*1
# shape: batch, height, width, channel
indices = indices.view(-1,1)
min_encodings = torch.zeros(indices.shape[0], self.codebook_size).to(indices)
min_encodings.scatter_(1, indices, 1)
# get quantized latent vectors
z_q = torch.matmul(min_encodings.float(), self.embedding.weight)
if shape is not None: # reshape back to match original input shape
z_q = z_q.view(shape).permute(0, 3, 1, 2).contiguous()
return z_q
class GumbelQuantizer(nn.Module):
def __init__(self, codebook_size, emb_dim, num_hiddens, straight_through=False, kl_weight=5e-4, temp_init=1.0):
super().__init__()
self.codebook_size = codebook_size # number of embeddings
self.emb_dim = emb_dim # dimension of embedding
self.straight_through = straight_through
self.temperature = temp_init
self.kl_weight = kl_weight
self.proj = nn.Conv2d(num_hiddens, codebook_size, 1) # projects last encoder layer to quantized logits
self.embed = nn.Embedding(codebook_size, emb_dim)
def forward(self, z):
hard = self.straight_through if self.training else True
logits = self.proj(z)
soft_one_hot = F.gumbel_softmax(logits, tau=self.temperature, dim=1, hard=hard)
z_q = torch.einsum("b n h w, n d -> b d h w", soft_one_hot, self.embed.weight)
# + kl divergence to the prior loss
qy = F.softmax(logits, dim=1)
diff = self.kl_weight * torch.sum(qy * torch.log(qy * self.codebook_size + 1e-10), dim=1).mean()
min_encoding_indices = soft_one_hot.argmax(dim=1)
return z_q, diff, {
"min_encoding_indices": min_encoding_indices
}
class Downsample(nn.Module):
def __init__(self, in_channels):
super().__init__()
self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
def forward(self, x):
pad = (0, 1, 0, 1)
x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
x = self.conv(x)
return x
class Upsample(nn.Module):
def __init__(self, in_channels):
super().__init__()
self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
def forward(self, x):
x = F.interpolate(x, scale_factor=2.0, mode="nearest")
x = self.conv(x)
return x
class ResBlock(nn.Module):
def __init__(self, in_channels, out_channels=None):
super(ResBlock, self).__init__()
self.in_channels = in_channels
self.out_channels = in_channels if out_channels is None else out_channels
self.norm1 = normalize(in_channels)
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
self.norm2 = normalize(out_channels)
self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
if self.in_channels != self.out_channels:
self.conv_out = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
def forward(self, x_in):
x = x_in
x = self.norm1(x)
x = swish(x)
x = self.conv1(x)
x = self.norm2(x)
x = swish(x)
x = self.conv2(x)
if self.in_channels != self.out_channels:
x_in = self.conv_out(x_in)
return x + x_in
class AttnBlock(nn.Module):
def __init__(self, in_channels):
super().__init__()
self.in_channels = in_channels
self.norm = normalize(in_channels)
self.q = torch.nn.Conv2d(
in_channels,
in_channels,
kernel_size=1,
stride=1,
padding=0
)
self.k = torch.nn.Conv2d(
in_channels,
in_channels,
kernel_size=1,
stride=1,
padding=0
)
self.v = torch.nn.Conv2d(
in_channels,
in_channels,
kernel_size=1,
stride=1,
padding=0
)
self.proj_out = torch.nn.Conv2d(
in_channels,
in_channels,
kernel_size=1,
stride=1,
padding=0
)
def forward(self, x):
h_ = x
h_ = self.norm(h_)
q = self.q(h_)
k = self.k(h_)
v = self.v(h_)
# compute attention
b, c, h, w = q.shape
q = q.reshape(b, c, h*w)
q = q.permute(0, 2, 1)
k = k.reshape(b, c, h*w)
w_ = torch.bmm(q, k)
w_ = w_ * (int(c)**(-0.5))
w_ = F.softmax(w_, dim=2)
# attend to values
v = v.reshape(b, c, h*w)
w_ = w_.permute(0, 2, 1)
h_ = torch.bmm(v, w_)
h_ = h_.reshape(b, c, h, w)
h_ = self.proj_out(h_)
return x+h_
class Encoder(nn.Module):
def __init__(self, in_channels, nf, emb_dim, ch_mult, num_res_blocks, resolution, attn_resolutions):
super().__init__()
self.nf = nf
self.num_resolutions = len(ch_mult)
self.num_res_blocks = num_res_blocks
self.resolution = resolution
self.attn_resolutions = attn_resolutions
curr_res = self.resolution
in_ch_mult = (1,)+tuple(ch_mult)
blocks = []
# initial convultion
blocks.append(nn.Conv2d(in_channels, nf, kernel_size=3, stride=1, padding=1))
# residual and downsampling blocks, with attention on smaller res (16x16)
for i in range(self.num_resolutions):
block_in_ch = nf * in_ch_mult[i]
block_out_ch = nf * ch_mult[i]
for _ in range(self.num_res_blocks):
blocks.append(ResBlock(block_in_ch, block_out_ch))
block_in_ch = block_out_ch
if curr_res in attn_resolutions:
blocks.append(AttnBlock(block_in_ch))
if i != self.num_resolutions - 1:
blocks.append(Downsample(block_in_ch))
curr_res = curr_res // 2
# non-local attention block
blocks.append(ResBlock(block_in_ch, block_in_ch))
blocks.append(AttnBlock(block_in_ch))
blocks.append(ResBlock(block_in_ch, block_in_ch))
# normalise and convert to latent size
blocks.append(normalize(block_in_ch))
blocks.append(nn.Conv2d(block_in_ch, emb_dim, kernel_size=3, stride=1, padding=1))
self.blocks = nn.ModuleList(blocks)
def forward(self, x):
for block in self.blocks:
x = block(x)
return x
class Generator(nn.Module):
def __init__(self, nf, emb_dim, ch_mult, res_blocks, img_size, attn_resolutions):
super().__init__()
self.nf = nf
self.ch_mult = ch_mult
self.num_resolutions = len(self.ch_mult)
self.num_res_blocks = res_blocks
self.resolution = img_size
self.attn_resolutions = attn_resolutions
self.in_channels = emb_dim
self.out_channels = 3
block_in_ch = self.nf * self.ch_mult[-1]
curr_res = self.resolution // 2 ** (self.num_resolutions-1)
blocks = []
# initial conv
blocks.append(nn.Conv2d(self.in_channels, block_in_ch, kernel_size=3, stride=1, padding=1))
# non-local attention block
blocks.append(ResBlock(block_in_ch, block_in_ch))
blocks.append(AttnBlock(block_in_ch))
blocks.append(ResBlock(block_in_ch, block_in_ch))
for i in reversed(range(self.num_resolutions)):
block_out_ch = self.nf * self.ch_mult[i]
for _ in range(self.num_res_blocks):
blocks.append(ResBlock(block_in_ch, block_out_ch))
block_in_ch = block_out_ch
if curr_res in self.attn_resolutions:
blocks.append(AttnBlock(block_in_ch))
if i != 0:
blocks.append(Upsample(block_in_ch))
curr_res = curr_res * 2
blocks.append(normalize(block_in_ch))
blocks.append(nn.Conv2d(block_in_ch, self.out_channels, kernel_size=3, stride=1, padding=1))
self.blocks = nn.ModuleList(blocks)
def forward(self, x):
for block in self.blocks:
x = block(x)
return x
@ARCH_REGISTRY.register()
class VQAutoEncoder(nn.Module):
def __init__(self, img_size, nf, ch_mult, quantizer="nearest", res_blocks=2, attn_resolutions=[16], codebook_size=1024, emb_dim=256,
beta=0.25, gumbel_straight_through=False, gumbel_kl_weight=1e-8, model_path=None):
super().__init__()
logger = get_root_logger()
self.in_channels = 3
self.nf = nf
self.n_blocks = res_blocks
self.codebook_size = codebook_size
self.embed_dim = emb_dim
self.ch_mult = ch_mult
self.resolution = img_size
self.attn_resolutions = attn_resolutions
self.quantizer_type = quantizer
self.encoder = Encoder(
self.in_channels,
self.nf,
self.embed_dim,
self.ch_mult,
self.n_blocks,
self.resolution,
self.attn_resolutions
)
if self.quantizer_type == "nearest":
self.beta = beta #0.25
self.quantize = VectorQuantizer(self.codebook_size, self.embed_dim, self.beta)
elif self.quantizer_type == "gumbel":
self.gumbel_num_hiddens = emb_dim
self.straight_through = gumbel_straight_through
self.kl_weight = gumbel_kl_weight
self.quantize = GumbelQuantizer(
self.codebook_size,
self.embed_dim,
self.gumbel_num_hiddens,
self.straight_through,
self.kl_weight
)
self.generator = Generator(
self.nf,
self.embed_dim,
self.ch_mult,
self.n_blocks,
self.resolution,
self.attn_resolutions
)
if model_path is not None:
chkpt = torch.load(model_path, map_location='cpu')
if 'params_ema' in chkpt:
self.load_state_dict(torch.load(model_path, map_location='cpu')['params_ema'])
logger.info(f'vqgan is loaded from: {model_path} [params_ema]')
elif 'params' in chkpt:
self.load_state_dict(torch.load(model_path, map_location='cpu')['params'])
logger.info(f'vqgan is loaded from: {model_path} [params]')
else:
raise ValueError(f'Wrong params!')
def forward(self, x):
x = self.encoder(x)
quant, codebook_loss, quant_stats = self.quantize(x)
x = self.generator(quant)
return x, codebook_loss, quant_stats
# patch based discriminator
@ARCH_REGISTRY.register()
class VQGANDiscriminator(nn.Module):
def __init__(self, nc=3, ndf=64, n_layers=4, model_path=None):
super().__init__()
layers = [nn.Conv2d(nc, ndf, kernel_size=4, stride=2, padding=1), nn.LeakyReLU(0.2, True)]
ndf_mult = 1
ndf_mult_prev = 1
for n in range(1, n_layers): # gradually increase the number of filters
ndf_mult_prev = ndf_mult
ndf_mult = min(2 ** n, 8)
layers += [
nn.Conv2d(ndf * ndf_mult_prev, ndf * ndf_mult, kernel_size=4, stride=2, padding=1, bias=False),
nn.BatchNorm2d(ndf * ndf_mult),
nn.LeakyReLU(0.2, True)
]
ndf_mult_prev = ndf_mult
ndf_mult = min(2 ** n_layers, 8)
layers += [
nn.Conv2d(ndf * ndf_mult_prev, ndf * ndf_mult, kernel_size=4, stride=1, padding=1, bias=False),
nn.BatchNorm2d(ndf * ndf_mult),
nn.LeakyReLU(0.2, True)
]
layers += [
nn.Conv2d(ndf * ndf_mult, 1, kernel_size=4, stride=1, padding=1)] # output 1 channel prediction map
self.main = nn.Sequential(*layers)
if model_path is not None:
chkpt = torch.load(model_path, map_location='cpu')
if 'params_d' in chkpt:
self.load_state_dict(torch.load(model_path, map_location='cpu')['params_d'])
elif 'params' in chkpt:
self.load_state_dict(torch.load(model_path, map_location='cpu')['params'])
else:
raise ValueError(f'Wrong params!')
def forward(self, x):
return self.main(x)

View File

@@ -4,7 +4,7 @@ import copy
import base64
import mimetypes
import os
from ldm.dream.args import Args, format_metadata
from ldm.dream.args import Args, metadata_dumps
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from ldm.dream.pngwriter import PngWriter
from threading import Event
@@ -37,6 +37,8 @@ def build_opt(post_data, seed, gfpgan_model_exists):
setattr(opt, 'seed', None if int(post_data['seed']) == -1 else int(post_data['seed']))
setattr(opt, 'variation_amount', float(post_data['variation_amount']) if int(post_data['seed']) != -1 else 0)
setattr(opt, 'with_variations', [])
setattr(opt, 'embiggen', None)
setattr(opt, 'embiggen_tiles', None)
broken = False
if int(post_data['seed']) != -1 and post_data['with_variations'] != '':
@@ -76,16 +78,15 @@ class DreamServer(BaseHTTPRequestHandler):
self.send_response(200)
self.send_header("Content-type", "text/html")
self.end_headers()
with open("./static/dream_web/index.html", "rb") as content:
with open("./static/legacy_web/index.html", "rb") as content:
self.wfile.write(content.read())
elif self.path == "/config.js":
# unfortunately this import can't be at the top level, since that would cause a circular import
from ldm.gfpgan.gfpgan_tools import gfpgan_model_exists
self.send_response(200)
self.send_header("Content-type", "application/javascript")
self.end_headers()
config = {
'gfpgan_model_exists': gfpgan_model_exists
'gfpgan_model_exists': self.gfpgan_model_exists
}
self.wfile.write(bytes("let config = " + json.dumps(config) + ";\n", "utf-8"))
elif self.path == "/run_log.json":
@@ -94,7 +95,7 @@ class DreamServer(BaseHTTPRequestHandler):
self.end_headers()
output = []
log_file = os.path.join(self.outdir, "dream_web_log.txt")
log_file = os.path.join(self.outdir, "legacy_web_log.txt")
if os.path.exists(log_file):
with open(log_file, "r") as log:
for line in log:
@@ -114,7 +115,7 @@ class DreamServer(BaseHTTPRequestHandler):
else:
path_dir = os.path.dirname(self.path)
out_dir = os.path.realpath(self.outdir.rstrip('/'))
if self.path.startswith('/static/dream_web/'):
if self.path.startswith('/static/legacy_web/'):
path = '.' + self.path
elif out_dir.replace('\\', '/').endswith(path_dir):
file = os.path.basename(self.path)
@@ -138,14 +139,12 @@ class DreamServer(BaseHTTPRequestHandler):
self.end_headers()
# unfortunately this import can't be at the top level, since that would cause a circular import
from ldm.gfpgan.gfpgan_tools import gfpgan_model_exists
content_length = int(self.headers['Content-Length'])
post_data = json.loads(self.rfile.read(content_length))
opt = build_opt(post_data, self.model.seed, gfpgan_model_exists)
opt = build_opt(post_data, self.model.seed, self.gfpgan_model_exists)
self.canceled.clear()
print(f">> Request to generate with prompt: {opt.prompt}")
# In order to handle upscaled images, the PngWriter needs to maintain state
# across images generated by each call to prompt2img(), so we define it in
# the outer scope of image_done()
@@ -162,7 +161,7 @@ class DreamServer(BaseHTTPRequestHandler):
# is complete. The upscaling replaces the original file, so the second
# entry should not be inserted into the image list.
# LS: This repeats code in dream.py
def image_done(image, seed, upscaled=False):
def image_done(image, seed, upscaled=False, first_seed=None):
name = f'{prefix}.{seed}.png'
iter_opt = copy.copy(opt)
if opt.variation_amount > 0:
@@ -176,10 +175,9 @@ class DreamServer(BaseHTTPRequestHandler):
path = pngwriter.save_image_and_prompt_to_png(
image,
dream_prompt = formatted_prompt,
metadata = format_metadata(iter_opt,
seeds = [seed],
weights = self.model.weights,
model_hash = self.model.model_hash
metadata = metadata_dumps(iter_opt,
seeds = [seed],
model_hash = self.model.model_hash
),
name = name,
)
@@ -188,7 +186,7 @@ class DreamServer(BaseHTTPRequestHandler):
config['seed'] = seed
# Append post_data to log, but only once!
if not upscaled:
with open(os.path.join(self.outdir, "dream_web_log.txt"), "a") as log:
with open(os.path.join(self.outdir, "legacy_web_log.txt"), "a") as log:
log.write(f"{path}: {json.dumps(config)}\n")
self.wfile.write(bytes(json.dumps(

View File

@@ -15,20 +15,59 @@ import traceback
import transformers
import io
import hashlib
import cv2
import skimage
from omegaconf import OmegaConf
from PIL import Image, ImageOps
from torch import nn
from pytorch_lightning import seed_everything, logging
from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.models.diffusion.plms import PLMSSampler
from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.models.diffusion.plms import PLMSSampler
from ldm.models.diffusion.ksampler import KSampler
from ldm.dream.pngwriter import PngWriter
from ldm.dream.image_util import InitImageResizer
from ldm.dream.devices import choose_torch_device
from ldm.dream.conditioning import get_uc_and_c
from ldm.dream.pngwriter import PngWriter
from ldm.dream.args import metadata_from_png
from ldm.dream.image_util import InitImageResizer
from ldm.dream.devices import choose_torch_device, choose_precision
from ldm.dream.conditioning import get_uc_and_c
def fix_func(orig):
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
def new_func(*args, **kw):
device = kw.get("device", "mps")
kw["device"]="cpu"
return orig(*args, **kw).to(device)
return new_func
return orig
torch.rand = fix_func(torch.rand)
torch.rand_like = fix_func(torch.rand_like)
torch.randn = fix_func(torch.randn)
torch.randn_like = fix_func(torch.randn_like)
torch.randint = fix_func(torch.randint)
torch.randint_like = fix_func(torch.randint_like)
torch.bernoulli = fix_func(torch.bernoulli)
torch.multinomial = fix_func(torch.multinomial)
def fix_func(orig):
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
def new_func(*args, **kw):
device = kw.get("device", "mps")
kw["device"]="cpu"
return orig(*args, **kw).to(device)
return new_func
return orig
torch.rand = fix_func(torch.rand)
torch.rand_like = fix_func(torch.rand_like)
torch.randn = fix_func(torch.randn)
torch.randn_like = fix_func(torch.randn_like)
torch.randint = fix_func(torch.randint)
torch.randint_like = fix_func(torch.randint_like)
torch.bernoulli = fix_func(torch.bernoulli)
torch.multinomial = fix_func(torch.multinomial)
def fix_func(orig):
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
@@ -102,7 +141,7 @@ gr = Generate(
# these values are set once and shouldn't be changed
conf = path to configuration file ('configs/models.yaml')
model = symbolic name of the model in the configuration file
full_precision = False
precision = float precision to be used
# this value is sticky and maintained between generation calls
sampler_name = ['ddim', 'k_dpm_2_a', 'k_dpm_2', 'k_euler_a', 'k_euler', 'k_heun', 'k_lms', 'plms'] // k_lms
@@ -128,9 +167,13 @@ class Generate:
sampler_name = 'k_lms',
ddim_eta = 0.0, # deterministic
full_precision = False,
precision = 'auto',
# these are deprecated; if present they override values in the conf file
weights = None,
config = None,
gfpgan=None,
codeformer=None,
esrgan=None
):
models = OmegaConf.load(conf)
mconfig = models[model]
@@ -143,7 +186,7 @@ class Generate:
self.cfg_scale = 7.5
self.sampler_name = sampler_name
self.ddim_eta = 0.0 # same seed always produces same image
self.full_precision = True if choose_torch_device() == 'mps' else full_precision
self.precision = precision
self.strength = 0.75
self.seamless = False
self.embedding_path = embedding_path
@@ -154,12 +197,23 @@ class Generate:
self.generators = {}
self.base_generator = None
self.seed = None
self.gfpgan = gfpgan
self.codeformer = codeformer
self.esrgan = esrgan
# Note that in previous versions, there was an option to pass the
# device to Generate(). However the device was then ignored, so
# it wasn't actually doing anything. This logic could be reinstated.
device_type = choose_torch_device()
self.device = torch.device(device_type)
if full_precision:
if self.precision != 'auto':
raise ValueError('Remove --full_precision / -F if using --precision')
print('Please remove deprecated --full_precision / -F')
print('If auto config does not work you can use --precision=float32')
self.precision = 'float32'
if self.precision == 'auto':
self.precision = choose_precision(self.device)
# for VRAM usage statistics
self.session_peakmem = torch.cuda.max_memory_allocated() if self._has_cuda else None
@@ -220,11 +274,15 @@ class Generate:
init_mask = None,
fit = False,
strength = None,
init_color = None,
# these are specific to embiggen (which also relies on img2img args)
embiggen = None,
embiggen_tiles = None,
out_direction = None,
# these are specific to GFPGAN/ESRGAN
facetool = None,
gfpgan_strength = 0,
codeformer_fidelity = None,
save_original = False,
upscale = None,
# Set this True to handle KeyboardInterrupt internally
@@ -269,16 +327,17 @@ class Generate:
write the prompt into the PNG metadata.
"""
# TODO: convert this into a getattr() loop
steps = steps or self.steps
width = width or self.width
height = height or self.height
seamless = seamless or self.seamless
cfg_scale = cfg_scale or self.cfg_scale
ddim_eta = ddim_eta or self.ddim_eta
iterations = iterations or self.iterations
strength = strength or self.strength
self.seed = seed
steps = steps or self.steps
width = width or self.width
height = height or self.height
seamless = seamless or self.seamless
cfg_scale = cfg_scale or self.cfg_scale
ddim_eta = ddim_eta or self.ddim_eta
iterations = iterations or self.iterations
strength = strength or self.strength
self.seed = seed
self.log_tokenization = log_tokenization
self.step_callback = step_callback
with_variations = [] if with_variations is None else with_variations
# will instantiate the model or return it from cache
@@ -287,16 +346,17 @@ class Generate:
for m in model.modules():
if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
m.padding_mode = 'circular' if seamless else m._orig_padding_mode
assert cfg_scale > 1.0, 'CFG_Scale (-C) must be >1.0'
assert (
0.0 < strength < 1.0
), 'img2img and inpaint strength can only work with 0.0 < strength < 1.0'
assert (
0.0 <= variation_amount <= 1.0
0.0 <= variation_amount <= 1.0
), '-v --variation_amount must be in [0.0, 1.0]'
assert (
(embiggen == None and embiggen_tiles == None) or ((embiggen != None or embiggen_tiles != None) and init_img != None)
(embiggen == None and embiggen_tiles == None) or (
(embiggen != None or embiggen_tiles != None) and init_img != None)
), 'Embiggen requires an init/input image to be specified'
if len(with_variations) > 0 or variation_amount > 1.0:
@@ -318,9 +378,9 @@ class Generate:
if self._has_cuda():
torch.cuda.reset_peak_memory_stats()
results = list()
init_image = None
mask_image = None
results = list()
init_image = None
mask_image = None
try:
uc, c = get_uc_and_c(
@@ -329,8 +389,14 @@ class Generate:
log_tokens =self.log_tokenization
)
(init_image,mask_image) = self._make_images(init_img,init_mask, width, height, fit)
init_image,mask_image = self._make_images(
init_img,
init_mask,
width,
height,
fit=fit,
out_direction=out_direction,
)
if (init_image is not None) and (mask_image is not None):
generator = self._make_inpaint()
elif (embiggen != None or embiggen_tiles != None):
@@ -340,32 +406,40 @@ class Generate:
else:
generator = self._make_txt2img()
generator.set_variation(self.seed, variation_amount, with_variations)
generator.set_variation(
self.seed, variation_amount, with_variations)
results = generator.generate(
prompt,
iterations = iterations,
seed = self.seed,
sampler = self.sampler,
steps = steps,
cfg_scale = cfg_scale,
conditioning = (uc,c),
ddim_eta = ddim_eta,
image_callback = image_callback, # called after the final image is generated
step_callback = step_callback, # called after each intermediate image is generated
width = width,
height = height,
init_img = init_img, # embiggen needs to manipulate from the unmodified init_img
init_image = init_image, # notice that init_image is different from init_img
mask_image = mask_image,
strength = strength,
embiggen = embiggen,
embiggen_tiles = embiggen_tiles,
iterations=iterations,
seed=self.seed,
sampler=self.sampler,
steps=steps,
cfg_scale=cfg_scale,
conditioning=(uc, c),
ddim_eta=ddim_eta,
image_callback=image_callback, # called after the final image is generated
step_callback=step_callback, # called after each intermediate image is generated
width=width,
height=height,
init_img=init_img, # embiggen needs to manipulate from the unmodified init_img
init_image=init_image, # notice that init_image is different from init_img
mask_image=mask_image,
strength=strength,
embiggen=embiggen,
embiggen_tiles=embiggen_tiles,
)
if init_color:
self.correct_colors(image_list = results,
reference_image_path = init_color,
image_callback = image_callback)
if upscale is not None or gfpgan_strength > 0:
self.upscale_and_reconstruct(results,
upscale = upscale,
facetool = facetool,
strength = gfpgan_strength,
codeformer_fidelity = codeformer_fidelity,
save_original = save_original,
image_callback = image_callback)
@@ -381,7 +455,8 @@ class Generate:
toc = time.time()
print('>> Usage stats:')
print(
f'>> {len(results)} image(s) generated in', '%4.2fs' % (toc - tic)
f'>> {len(results)} image(s) generated in', '%4.2fs' % (
toc - tic)
)
if self._has_cuda():
print(
@@ -400,53 +475,193 @@ class Generate:
)
return results
def _make_images(self, img_path, mask_path, width, height, fit=False):
# this needs to be generalized to all sorts of postprocessors, which should be wrapped
# in a nice harmonized call signature. For now we have a bunch of if/elses!
def apply_postprocessor(
self,
image_path,
tool = 'gfpgan', # one of 'upscale', 'gfpgan', 'codeformer', 'outpaint', or 'embiggen'
gfpgan_strength = 0.0,
codeformer_fidelity = 0.75,
upscale = None,
out_direction = None,
save_original = True, # to get new name
callback = None,
opt = None,
):
# retrieve the seed from the image;
# note that we will try both the new way and the old way, since not all files have the
# metadata (yet)
seed = None
image_metadata = None
prompt = None
try:
args = metadata_from_png(image_path)
if len(args) > 1:
print("* Can't postprocess a grid")
return
seed = args[0].seed
prompt = args[0].prompt
print(f'>> retrieved seed {seed} and prompt "{prompt}" from {image_path}')
except:
m = re.search('(\d+)\.png$',image_path)
if m:
seed = m.group(1)
if not seed:
print('* Could not recover seed for image. Replacing with 42. This will not affect image quality')
seed = 42
# face fixers and esrgan take an Image, but embiggen takes a path
image = Image.open(image_path)
# Note that we need to adopt a uniform API for the postprocessors.
# This is completely ad hoc ATCM
if tool in ('gfpgan','codeformer','upscale'):
if tool == 'gfpgan':
facetool = 'gfpgan'
elif tool == 'codeformer':
facetool = 'codeformer'
elif tool == 'upscale':
facetool = 'gfpgan' # but won't be run
gfpgan_strength = 0
return self.upscale_and_reconstruct(
[[image,seed]],
facetool = facetool,
strength = gfpgan_strength,
codeformer_fidelity = codeformer_fidelity,
save_original = save_original,
upscale = upscale,
image_callback = callback,
)
elif tool == 'embiggen':
# fetch the metadata from the image
generator = self._make_embiggen()
uc, c = get_uc_and_c(
prompt, model =self.model,
skip_normalize=opt.skip_normalize,
log_tokens =opt.log_tokenization
)
opt.strength = 0.40
print(f'>> Setting img2img strength to {opt.strength} for happy embiggening')
# embiggen takes a image path (sigh)
generator.generate(
prompt,
sampler = self.sampler,
steps = opt.steps,
cfg_scale = opt.cfg_scale,
ddim_eta = self.ddim_eta,
conditioning= (uc, c),
init_img = image_path, # not the Image! (sigh)
init_image = image, # embiggen wants both! (sigh)
strength = opt.strength,
width = opt.width,
height = opt.height,
embiggen = opt.embiggen,
embiggen_tiles = opt.embiggen_tiles,
image_callback = callback,
)
elif tool == 'outpaint':
oldargs = metadata_from_png(image_path)
opt.strength = 0.83
opt.init_img = image_path
return self.prompt2image(
oldargs.prompt,
out_direction = opt.out_direction,
sampler = self.sampler,
steps = opt.steps,
cfg_scale = opt.cfg_scale,
ddim_eta = self.ddim_eta,
conditioning= get_uc_and_c(
oldargs.prompt, model =self.model,
skip_normalize=opt.skip_normalize,
log_tokens =opt.log_tokenization
),
width = opt.width,
height = opt.height,
init_img = image_path, # not the Image! (sigh)
strength = opt.strength,
image_callback = callback,
)
else:
print(f'* postprocessing tool {tool} is not yet supported')
return None
def _make_images(
self,
img,
mask,
width,
height,
fit=False,
out_direction=None,
):
init_image = None
init_mask = None
if not img_path:
return None,None
if not img:
return None, None
image = self._load_img(img_path, width, height, fit=fit) # this returns an Image
image = self._load_img(
img,
width,
height,
fit=fit
) # this returns an Image
if out_direction:
image = self._create_outpaint_image(image, out_direction)
init_image = self._create_init_image(image) # this returns a torch tensor
if self._has_transparency(image) and not mask_path: # if image has a transparent area and no mask was provided, then try to generate mask
print('>> Initial image has transparent areas. Will inpaint in these regions.')
# if image has a transparent area and no mask was provided, then try to generate mask
if self._has_transparency(image) and not mask:
print(
'>> Initial image has transparent areas. Will inpaint in these regions.')
if self._check_for_erasure(image):
print(
'>> WARNING: Colors underneath the transparent region seem to have been erased.\n',
'>> Inpainting will be suboptimal. Please preserve the colors when making\n',
'>> a transparency mask, or provide mask explicitly using --init_mask (-M).'
)
init_mask = self._create_init_mask(image) # this returns a torch tensor
# this returns a torch tensor
init_mask = self._create_init_mask(image)
if mask_path:
mask_image = self._load_img(mask_path, width, height, fit=fit) # this returns an Image
init_mask = self._create_init_mask(mask_image)
if mask:
mask_image = self._load_img(
mask, width, height, fit=fit) # this returns an Image
init_mask = self._create_init_mask(mask_image)
return init_image,init_mask
return init_image, init_mask
def _make_base(self):
if not self.generators.get('base'):
from ldm.dream.generator import Generator
self.generators['base'] = Generator(self.model, self.precision)
return self.generators['base']
def _make_img2img(self):
if not self.generators.get('img2img'):
from ldm.dream.generator.img2img import Img2Img
self.generators['img2img'] = Img2Img(self.model)
self.generators['img2img'] = Img2Img(self.model, self.precision)
return self.generators['img2img']
def _make_embiggen(self):
if not self.generators.get('embiggen'):
from ldm.dream.generator.embiggen import Embiggen
self.generators['embiggen'] = Embiggen(self.model)
self.generators['embiggen'] = Embiggen(self.model, self.precision)
return self.generators['embiggen']
def _make_txt2img(self):
if not self.generators.get('txt2img'):
from ldm.dream.generator.txt2img import Txt2Img
self.generators['txt2img'] = Txt2Img(self.model)
self.generators['txt2img'] = Txt2Img(self.model, self.precision)
self.generators['txt2img'].free_gpu_mem = self.free_gpu_mem
return self.generators['txt2img']
def _make_inpaint(self):
if not self.generators.get('inpaint'):
from ldm.dream.generator.inpaint import Inpaint
self.generators['inpaint'] = Inpaint(self.model)
self.generators['inpaint'] = Inpaint(self.model, self.precision)
return self.generators['inpaint']
def load_model(self):
@@ -457,7 +672,7 @@ class Generate:
model = self._load_model_from_config(self.config, self.weights)
if self.embedding_path is not None:
model.embedding_manager.load(
self.embedding_path, self.full_precision
self.embedding_path, self.precision == 'float32' or self.precision == 'autocast'
)
self.model = model.to(self.device)
# model.to doesn't change the cond_stage_model.device used to move the tokenizer output, so set it here
@@ -475,38 +690,63 @@ class Generate:
return self.model
def correct_colors(self,
image_list,
reference_image_path,
image_callback = None):
reference_image = Image.open(reference_image_path)
correction_target = cv2.cvtColor(np.asarray(reference_image),
cv2.COLOR_RGB2LAB)
for r in image_list:
image, seed = r
image = cv2.cvtColor(np.asarray(image),
cv2.COLOR_RGB2LAB)
image = skimage.exposure.match_histograms(image,
correction_target,
channel_axis=2)
image = Image.fromarray(
cv2.cvtColor(image, cv2.COLOR_LAB2RGB).astype("uint8")
)
if image_callback is not None:
image_callback(image, seed)
else:
r[0] = image
def upscale_and_reconstruct(self,
image_list,
facetool = 'gfpgan',
upscale = None,
strength = 0.0,
codeformer_fidelity = 0.75,
save_original = False,
image_callback = None):
try:
if upscale is not None:
from ldm.gfpgan.gfpgan_tools import real_esrgan_upscale
if strength > 0:
from ldm.gfpgan.gfpgan_tools import run_gfpgan
except (ModuleNotFoundError, ImportError):
print(traceback.format_exc(), file=sys.stderr)
print('>> You may need to install the ESRGAN and/or GFPGAN modules')
return
for r in image_list:
image, seed = r
try:
if upscale is not None:
if len(upscale) < 2:
upscale.append(0.75)
image = real_esrgan_upscale(
image,
upscale[1],
int(upscale[0]),
seed,
)
if self.esrgan is not None:
if len(upscale) < 2:
upscale.append(0.75)
image = self.esrgan.process(
image, upscale[1], seed, int(upscale[0]))
else:
print(">> ESRGAN is disabled. Image not upscaled.")
if strength > 0:
image = run_gfpgan(
image, strength, seed, 1
)
if self.gfpgan is not None or self.codeformer is not None:
if facetool == 'gfpgan':
if self.gfpgan is None:
print('>> GFPGAN not found. Face restoration is disabled.')
else:
image = self.gfpgan.process(image, strength, seed)
if facetool == 'codeformer':
if self.codeformer is None:
print('>> CodeFormer not found. Face restoration is disabled.')
else:
cf_device = 'cpu' if str(self.device) == 'mps' else self.device
image = self.codeformer.process(image=image, strength=strength, device=cf_device, seed=seed, fidelity=codeformer_fidelity)
else:
print(">> Face Restoration is disabled.")
except Exception as e:
print(
f'>> Error running RealESRGAN or GFPGAN. Your image was not upscaled.\n{e}'
@@ -518,14 +758,8 @@ class Generate:
r[0] = image
# to help WebGUI - front end to generator util function
def sample_to_image(self,samples):
return self._sample_to_image(samples)
def _sample_to_image(self,samples):
if not self.base_generator:
from ldm.dream.generator import Generator
self.base_generator = Generator(self.model)
return self.base_generator.sample_to_image(samples)
def sample_to_image(self, samples):
return self._make_base().sample_to_image(samples)
def _set_sampler(self):
msg = f'>> Setting Sampler to {self.sampler_name}'
@@ -564,7 +798,7 @@ class Generate:
# for usage statistics
device_type = choose_torch_device()
if device_type == 'cuda':
torch.cuda.reset_peak_memory_stats()
torch.cuda.reset_peak_memory_stats()
tic = time.time()
# this does the work
@@ -577,16 +811,13 @@ class Generate:
sd = pl_sd['state_dict']
model = instantiate_from_config(c.model)
m, u = model.load_state_dict(sd, strict=False)
if self.full_precision:
print(
'>> Using slower but more accurate full-precision math (--full_precision)'
)
if self.precision == 'float16':
print('>> Using faster float16 precision')
model.to(torch.float16)
else:
print(
'>> Using half precision math. Call with --full_precision to use more accurate but VRAM-intensive full precision.'
)
model.half()
print('>> Using more accurate float32 precision')
model.to(self.device)
model.eval()
@@ -605,22 +836,32 @@ class Generate:
return model
def _load_img(self, path, width, height, fit=False):
assert os.path.exists(path), f'>> {path}: File not found'
def _load_img(self, img, width, height, fit=False):
if isinstance(img, Image.Image):
image = img
print(
f'>> using provided input image of size {image.width}x{image.height}'
)
elif isinstance(img, str):
assert os.path.exists(img), f'>> {img}: File not found'
image = Image.open(img)
print(
f'>> loaded input image of size {image.width}x{image.height} from {img}'
)
else:
image = Image.open(img)
print(
f'>> loaded input image of size {image.width}x{image.height}'
)
# with Image.open(path) as img:
# image = img.convert('RGBA')
image = Image.open(path)
print(
f'>> loaded input image of size {image.width}x{image.height} from {path}'
)
if fit:
image = self._fit_image(image,(width,height))
image = self._fit_image(image, (width, height))
else:
image = self._squeeze_image(image)
return image
def _create_init_image(self,image):
def _create_init_image(self, image):
image = image.convert('RGB')
# print(
# f'>> DEBUG: writing the image to img.png'
@@ -629,16 +870,77 @@ class Generate:
image = np.array(image).astype(np.float32) / 255.0
image = image[None].transpose(0, 3, 1, 2)
image = torch.from_numpy(image)
image = 2.0 * image - 1.0
image = 2.0 * image - 1.0
return image.to(self.device)
# TODO: outpainting is a post-processing application and should be made to behave
# like the other ones.
def _create_outpaint_image(self, image, direction_args):
assert len(direction_args) in [1, 2], 'Direction (-D) must have exactly one or two arguments.'
if len(direction_args) == 1:
direction = direction_args[0]
pixels = None
elif len(direction_args) == 2:
direction = direction_args[0]
pixels = int(direction_args[1])
assert direction in ['top', 'left', 'bottom', 'right'], 'Direction (-D) must be one of "top", "left", "bottom", "right"'
image = image.convert("RGBA")
# we always extend top, but rotate to extend along the requested side
if direction == 'left':
image = image.transpose(Image.Transpose.ROTATE_270)
elif direction == 'bottom':
image = image.transpose(Image.Transpose.ROTATE_180)
elif direction == 'right':
image = image.transpose(Image.Transpose.ROTATE_90)
pixels = image.height//2 if pixels is None else int(pixels)
assert 0 < pixels < image.height, 'Direction (-D) pixels length must be in the range 0 - image.size'
# the top part of the image is taken from the source image mirrored
# coordinates (0,0) are the upper left corner of an image
top = image.transpose(Image.Transpose.FLIP_TOP_BOTTOM).convert("RGBA")
top = top.crop((0, top.height - pixels, top.width, top.height))
# setting all alpha of the top part to 0
alpha = top.getchannel("A")
alpha.paste(0, (0, 0, top.width, top.height))
top.putalpha(alpha)
# taking the bottom from the original image
bottom = image.crop((0, 0, image.width, image.height - pixels))
new_img = image.copy()
new_img.paste(top, (0, 0))
new_img.paste(bottom, (0, pixels))
# create a 10% dither in the middle
dither = min(image.height//10, pixels)
for x in range(0, image.width, 2):
for y in range(pixels - dither, pixels + dither):
(r, g, b, a) = new_img.getpixel((x, y))
new_img.putpixel((x, y), (r, g, b, 0))
# let's rotate back again
if direction == 'left':
new_img = new_img.transpose(Image.Transpose.ROTATE_90)
elif direction == 'bottom':
new_img = new_img.transpose(Image.Transpose.ROTATE_180)
elif direction == 'right':
new_img = new_img.transpose(Image.Transpose.ROTATE_270)
return new_img
def _create_init_mask(self, image):
# convert into a black/white mask
image = self._image_to_mask(image)
image = image.convert('RGB')
# BUG: We need to use the model's downsample factor rather than hardcoding "8"
from ldm.dream.generator.base import downsampling
image = image.resize((image.width//downsampling, image.height//downsampling), resample=Image.Resampling.LANCZOS)
image = image.resize((image.width//downsampling, image.height //
downsampling), resample=Image.Resampling.NEAREST)
# print(
# f'>> DEBUG: writing the mask to mask.png'
# )
@@ -660,7 +962,7 @@ class Generate:
mask = ImageOps.invert(mask)
return mask
def _has_transparency(self,image):
def _has_transparency(self, image):
if image.info.get("transparency", None) is not None:
return True
if image.mode == "P":
@@ -674,11 +976,10 @@ class Generate:
return True
return False
def _check_for_erasure(self,image):
def _check_for_erasure(self, image):
width, height = image.size
pixdata = image.load()
colored = 0
pixdata = image.load()
colored = 0
for y in range(height):
for x in range(width):
if pixdata[x, y][3] == 0:
@@ -688,28 +989,28 @@ class Generate:
colored += 1
return colored == 0
def _squeeze_image(self,image):
x,y,resize_needed = self._resolution_check(image.width,image.height)
def _squeeze_image(self, image):
x, y, resize_needed = self._resolution_check(image.width, image.height)
if resize_needed:
return InitImageResizer(image).resize(x,y)
return InitImageResizer(image).resize(x, y)
return image
def _fit_image(self,image,max_dimensions):
w,h = max_dimensions
def _fit_image(self, image, max_dimensions):
w, h = max_dimensions
print(
f'>> image will be resized to fit inside a box {w}x{h} in size.'
)
if image.width > image.height:
h = None # by setting h to none, we tell InitImageResizer to fit into the width and calculate height
h = None # by setting h to none, we tell InitImageResizer to fit into the width and calculate height
elif image.height > image.width:
w = None # ditto for w
w = None # ditto for w
else:
pass
image = InitImageResizer(image).resize(w,h) # note that InitImageResizer does the multiple of 64 truncation internally
# note that InitImageResizer does the multiple of 64 truncation internally
image = InitImageResizer(image).resize(w, h)
print(
f'>> after adjusting image dimensions to be multiples of 64, init image is {image.width}x{image.height}'
)
)
return image
def _resolution_check(self, width, height, log=False):
@@ -723,7 +1024,7 @@ class Generate:
f'>> Provided width and height must be multiples of 64. Auto-resizing to {w}x{h}'
)
height = h
width = w
width = w
resize_needed = True
if (width * height) > (self.width * self.height):

View File

@@ -1,168 +0,0 @@
import torch
import warnings
import os
import sys
import numpy as np
from PIL import Image
#from scripts.dream import create_argv_parser
from ldm.dream.args import Args
opt = Args()
opt.parse_args()
model_path = os.path.join(opt.gfpgan_dir, opt.gfpgan_model_path)
gfpgan_model_exists = os.path.isfile(model_path)
def run_gfpgan(image, strength, seed, upsampler_scale=4):
print(f'>> GFPGAN - Restoring Faces for image seed:{seed}')
gfpgan = None
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
try:
if not gfpgan_model_exists:
raise Exception('GFPGAN model not found at path ' + model_path)
sys.path.append(os.path.abspath(opt.gfpgan_dir))
from gfpgan import GFPGANer
bg_upsampler = _load_gfpgan_bg_upsampler(
opt.gfpgan_bg_upsampler, upsampler_scale, opt.gfpgan_bg_tile
)
gfpgan = GFPGANer(
model_path=model_path,
upscale=upsampler_scale,
arch='clean',
channel_multiplier=2,
bg_upsampler=bg_upsampler,
)
except Exception:
import traceback
print('>> Error loading GFPGAN:', file=sys.stderr)
print(traceback.format_exc(), file=sys.stderr)
if gfpgan is None:
print(
f'>> WARNING: GFPGAN not initialized.'
)
print(
f'>> Download https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth to {model_path}, \nor change GFPGAN directory with --gfpgan_dir.'
)
return image
image = image.convert('RGB')
cropped_faces, restored_faces, restored_img = gfpgan.enhance(
np.array(image, dtype=np.uint8),
has_aligned=False,
only_center_face=False,
paste_back=True,
)
res = Image.fromarray(restored_img)
if strength < 1.0:
# Resize the image to the new image if the sizes have changed
if restored_img.size != image.size:
image = image.resize(res.size)
res = Image.blend(image, res, strength)
if torch.cuda.is_available():
torch.cuda.empty_cache()
gfpgan = None
return res
def _load_gfpgan_bg_upsampler(bg_upsampler, upsampler_scale, bg_tile=400):
if bg_upsampler == 'realesrgan':
if not torch.cuda.is_available(): # CPU or MPS on M1
use_half_precision = False
else:
use_half_precision = True
model_path = {
2: 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth',
4: 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth',
}
if upsampler_scale not in model_path:
return None
from basicsr.archs.rrdbnet_arch import RRDBNet
from realesrgan import RealESRGANer
if upsampler_scale == 4:
model = RRDBNet(
num_in_ch=3,
num_out_ch=3,
num_feat=64,
num_block=23,
num_grow_ch=32,
scale=4,
)
if upsampler_scale == 2:
model = RRDBNet(
num_in_ch=3,
num_out_ch=3,
num_feat=64,
num_block=23,
num_grow_ch=32,
scale=2,
)
bg_upsampler = RealESRGANer(
scale=upsampler_scale,
model_path=model_path[upsampler_scale],
model=model,
tile=bg_tile,
tile_pad=10,
pre_pad=0,
half=use_half_precision,
)
else:
bg_upsampler = None
return bg_upsampler
def real_esrgan_upscale(image, strength, upsampler_scale, seed):
print(
f'>> Real-ESRGAN Upscaling seed:{seed} : scale:{upsampler_scale}x'
)
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
try:
upsampler = _load_gfpgan_bg_upsampler(
opt.gfpgan_bg_upsampler, upsampler_scale, opt.gfpgan_bg_tile
)
except Exception:
import traceback
print('>> Error loading Real-ESRGAN:', file=sys.stderr)
print(traceback.format_exc(), file=sys.stderr)
output, img_mode = upsampler.enhance(
np.array(image, dtype=np.uint8),
outscale=upsampler_scale,
alpha_upsampler=opt.gfpgan_bg_upsampler,
)
res = Image.fromarray(output)
if strength < 1.0:
# Resize the image to the new image if the sizes have changed
if output.size != image.size:
image = image.resize(res.size)
res = Image.blend(image, res, strength)
if torch.cuda.is_available():
torch.cuda.empty_cache()
upsampler = None
return res

View File

@@ -90,7 +90,7 @@ class LinearAttention(nn.Module):
b, c, h, w = x.shape
qkv = self.to_qkv(x)
q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
k = k.softmax(dim=-1)
k = k.softmax(dim=-1)
context = torch.einsum('bhdn,bhen->bhde', k, v)
out = torch.einsum('bhde,bhdn->bhen', context, q)
out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
@@ -167,101 +167,85 @@ class CrossAttention(nn.Module):
nn.Linear(inner_dim, query_dim),
nn.Dropout(dropout)
)
if torch.cuda.is_available():
self.einsum_op = self.einsum_op_cuda
else:
self.mem_total = psutil.virtual_memory().total / (1024**3)
self.einsum_op = self.einsum_op_mps_v1 if self.mem_total >= 32 else self.einsum_op_mps_v2
def einsum_op_compvis(self, q, k, v, r1):
s1 = einsum('b i d, b j d -> b i j', q, k) * self.scale # faster
s2 = s1.softmax(dim=-1, dtype=q.dtype)
del s1
r1 = einsum('b i j, b j d -> b i d', s2, v)
del s2
return r1
self.mem_total_gb = psutil.virtual_memory().total // (1 << 30)
def einsum_op_mps_v1(self, q, k, v, r1):
def einsum_op_compvis(self, q, k, v):
s = einsum('b i d, b j d -> b i j', q, k)
s = s.softmax(dim=-1, dtype=s.dtype)
return einsum('b i j, b j d -> b i d', s, v)
def einsum_op_slice_0(self, q, k, v, slice_size):
r = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device, dtype=q.dtype)
for i in range(0, q.shape[0], slice_size):
end = i + slice_size
r[i:end] = self.einsum_op_compvis(q[i:end], k[i:end], v[i:end])
return r
def einsum_op_slice_1(self, q, k, v, slice_size):
r = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device, dtype=q.dtype)
for i in range(0, q.shape[1], slice_size):
end = i + slice_size
r[:, i:end] = self.einsum_op_compvis(q[:, i:end], k, v)
return r
def einsum_op_mps_v1(self, q, k, v):
if q.shape[1] <= 4096: # (512x512) max q.shape[1]: 4096
r1 = self.einsum_op_compvis(q, k, v, r1)
return self.einsum_op_compvis(q, k, v)
else:
slice_size = math.floor(2**30 / (q.shape[0] * q.shape[1]))
for i in range(0, q.shape[1], slice_size):
end = i + slice_size
s1 = einsum('b i d, b j d -> b i j', q[:, i:end], k) * self.scale
s2 = s1.softmax(dim=-1, dtype=r1.dtype)
del s1
r1[:, i:end] = einsum('b i j, b j d -> b i d', s2, v)
del s2
return r1
return self.einsum_op_slice_1(q, k, v, slice_size)
def einsum_op_mps_v2(self, q, k, v, r1):
if self.mem_total >= 8 and q.shape[1] <= 4096:
r1 = self.einsum_op_compvis(q, k, v, r1)
def einsum_op_mps_v2(self, q, k, v):
if self.mem_total_gb > 8 and q.shape[1] <= 4096:
return self.einsum_op_compvis(q, k, v)
else:
slice_size = 1
for i in range(0, q.shape[0], slice_size):
end = min(q.shape[0], i + slice_size)
s1 = einsum('b i d, b j d -> b i j', q[i:end], k[i:end])
s1 *= self.scale
s2 = s1.softmax(dim=-1, dtype=r1.dtype)
del s1
r1[i:end] = einsum('b i j, b j d -> b i d', s2, v[i:end])
del s2
return r1
def einsum_op_cuda(self, q, k, v, r1):
return self.einsum_op_slice_0(q, k, v, 1)
def einsum_op_tensor_mem(self, q, k, v, max_tensor_mb):
size_mb = q.shape[0] * q.shape[1] * k.shape[1] * q.element_size() // (1 << 20)
if size_mb <= max_tensor_mb:
return self.einsum_op_compvis(q, k, v)
div = 1 << int((size_mb - 1) / max_tensor_mb).bit_length()
if div <= q.shape[0]:
return self.einsum_op_slice_0(q, k, v, q.shape[0] // div)
return self.einsum_op_slice_1(q, k, v, max(q.shape[1] // div, 1))
def einsum_op_cuda(self, q, k, v):
stats = torch.cuda.memory_stats(q.device)
mem_active = stats['active_bytes.all.current']
mem_reserved = stats['reserved_bytes.all.current']
mem_free_cuda, _ = torch.cuda.mem_get_info(torch.cuda.current_device())
mem_free_cuda, _ = torch.cuda.mem_get_info(q.device)
mem_free_torch = mem_reserved - mem_active
mem_free_total = mem_free_cuda + mem_free_torch
# Divide factor of safety as there's copying and fragmentation
return self.einsum_op_tensor_mem(q, k, v, mem_free_total / 3.3 / (1 << 20))
gb = 1024 ** 3
tensor_size = q.shape[0] * q.shape[1] * k.shape[1] * 4
mem_required = tensor_size * 2.5
steps = 1
def einsum_op(self, q, k, v):
if q.device.type == 'cuda':
return self.einsum_op_cuda(q, k, v)
if mem_required > mem_free_total:
steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
if q.device.type == 'mps':
if self.mem_total_gb >= 32:
return self.einsum_op_mps_v1(q, k, v)
return self.einsum_op_mps_v2(q, k, v)
if steps > 64:
max_res = math.floor(math.sqrt(math.sqrt(mem_free_total / 2.5)) / 8) * 64
raise RuntimeError(f'Not enough memory, use lower resolution (max approx. {max_res}x{max_res}). '
f'Need: {mem_required/64/gb:0.1f}GB free, Have:{mem_free_total/gb:0.1f}GB free')
slice_size = q.shape[1] // steps if (q.shape[1] % steps) == 0 else q.shape[1]
for i in range(0, q.shape[1], slice_size):
end = min(q.shape[1], i + slice_size)
s1 = einsum('b i d, b j d -> b i j', q[:, i:end], k) * self.scale
s2 = s1.softmax(dim=-1, dtype=r1.dtype)
del s1
r1[:, i:end] = einsum('b i j, b j d -> b i d', s2, v)
del s2
return r1
# Smaller slices are faster due to L2/L3/SLC caches.
# Tested on i7 with 8MB L3 cache.
return self.einsum_op_tensor_mem(q, k, v, 32)
def forward(self, x, context=None, mask=None):
h = self.heads
q_in = self.to_q(x)
q = self.to_q(x)
context = default(context, x)
k_in = self.to_k(context)
v_in = self.to_v(context)
device_type = 'mps' if x.device.type == 'mps' else 'cuda'
k = self.to_k(context) * self.scale
v = self.to_v(context)
del context, x
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q_in, k_in, v_in))
del q_in, k_in, v_in
r1 = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device, dtype=q.dtype)
r1 = self.einsum_op(q, k, v, r1)
del q, k, v
r2 = rearrange(r1, '(b h) n d -> b n (h d)', h=h)
del r1
return self.to_out(r2)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
r = self.einsum_op(q, k, v)
return self.to_out(rearrange(r, '(b h) n d -> b n (h d)', h=h))
class BasicTransformerBlock(nn.Module):

View File

@@ -3,6 +3,7 @@ import gc
import math
import torch
import torch.nn as nn
from torch.nn.functional import silu
import numpy as np
from einops import rearrange
@@ -32,11 +33,6 @@ def get_timestep_embedding(timesteps, embedding_dim):
return emb
def nonlinearity(x):
# swish
return x*torch.sigmoid(x)
def Normalize(in_channels, num_groups=32):
return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
@@ -122,14 +118,14 @@ class ResnetBlock(nn.Module):
def forward(self, x, temb):
h = self.norm1(x)
h = nonlinearity(h)
h = silu(h)
h = self.conv1(h)
if temb is not None:
h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
h = h + self.temb_proj(silu(temb))[:,:,None,None]
h = self.norm2(h)
h = nonlinearity(h)
h = silu(h)
h = self.dropout(h)
h = self.conv2(h)
@@ -368,7 +364,7 @@ class Model(nn.Module):
assert t is not None
temb = get_timestep_embedding(t, self.ch)
temb = self.temb.dense[0](temb)
temb = nonlinearity(temb)
temb = silu(temb)
temb = self.temb.dense[1](temb)
else:
temb = None
@@ -402,7 +398,7 @@ class Model(nn.Module):
# end
h = self.norm_out(h)
h = nonlinearity(h)
h = silu(h)
h = self.conv_out(h)
return h
@@ -499,7 +495,7 @@ class Encoder(nn.Module):
# end
h = self.norm_out(h)
h = nonlinearity(h)
h = silu(h)
h = self.conv_out(h)
return h
@@ -611,7 +607,7 @@ class Decoder(nn.Module):
return h
h = self.norm_out(h)
h = nonlinearity(h)
h = silu(h)
h = self.conv_out(h)
if self.tanh_out:
h = torch.tanh(h)
@@ -649,7 +645,7 @@ class SimpleDecoder(nn.Module):
x = layer(x)
h = self.norm_out(x)
h = nonlinearity(h)
h = silu(h)
x = self.conv_out(h)
return x
@@ -697,7 +693,7 @@ class UpsampleDecoder(nn.Module):
if i_level != self.num_resolutions - 1:
h = self.upsample_blocks[k](h)
h = self.norm_out(h)
h = nonlinearity(h)
h = silu(h)
h = self.conv_out(h)
return h
@@ -873,7 +869,7 @@ class FirstStagePostProcessor(nn.Module):
z_fs = self.encode_with_pretrained(x)
z = self.proj_norm(z_fs)
z = self.proj(z)
z = nonlinearity(z)
z = silu(z)
for submodel, downmodel in zip(self.model,self.downsampler):
z = submodel(z,temb=None)

View File

@@ -252,12 +252,6 @@ def normalization(channels):
return GroupNorm32(32, channels)
# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
class SiLU(nn.Module):
def forward(self, x):
return x * torch.sigmoid(x)
class GroupNorm32(nn.GroupNorm):
def forward(self, x):
return super().forward(x.float()).type(x.dtype)

View File

@@ -82,7 +82,9 @@ class EmbeddingManager(nn.Module):
get_embedding_for_clip_token,
embedder.transformer.text_model.embeddings,
)
token_dim = 1280
# per bug report #572
#token_dim = 1280
token_dim = 768
else: # using LDM's BERT encoder
self.is_clip = False
get_token_for_string = partial(