mirror of
https://github.com/invoke-ai/InvokeAI.git
synced 2026-02-05 18:34:55 -05:00
Merge branch 'development' into main
This commit is contained in:
@@ -2,7 +2,10 @@
|
||||
|
||||
The Args class parses both the command line (shell) arguments, as well as the
|
||||
command string passed at the dream> prompt. It serves as the definitive repository
|
||||
of all the arguments used by Generate and their default values.
|
||||
of all the arguments used by Generate and their default values, and implements the
|
||||
preliminary metadata standards discussed here:
|
||||
|
||||
https://github.com/lstein/stable-diffusion/issues/266
|
||||
|
||||
To use:
|
||||
opt = Args()
|
||||
@@ -52,15 +55,40 @@ you wish to apply logic as to which one to use. For example:
|
||||
To add new attributes, edit the _create_arg_parser() and
|
||||
_create_dream_cmd_parser() methods.
|
||||
|
||||
We also export the function build_metadata
|
||||
**Generating and retrieving sd-metadata**
|
||||
|
||||
To generate a dict representing RFC266 metadata:
|
||||
|
||||
metadata = metadata_dumps(opt,<seeds,model_hash,postprocessing>)
|
||||
|
||||
This will generate an RFC266 dictionary that can then be turned into a JSON
|
||||
and written to the PNG file. The optional seeds, model_hash and
|
||||
postprocessing arguments are not available to the opt object and so must be
|
||||
provided externally. See how dream.py does it.
|
||||
|
||||
Note that this function was originally called format_metadata() and a wrapper
|
||||
is provided that issues a deprecation notice.
|
||||
|
||||
To retrieve a (series of) opt objects corresponding to the metadata, do this:
|
||||
|
||||
opt_list = metadata_loads(metadata)
|
||||
|
||||
The metadata should be pulled out of the PNG image. pngwriter has a method
|
||||
retrieve_metadata that will do this, or you can do it in one swell foop
|
||||
with metadata_from_png():
|
||||
|
||||
opt_list = metadata_from_png('/path/to/image_file.png')
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from argparse import Namespace
|
||||
import shlex
|
||||
import json
|
||||
import hashlib
|
||||
import os
|
||||
import copy
|
||||
import base64
|
||||
import ldm.dream.pngwriter
|
||||
from ldm.dream.conditioning import split_weighted_subprompts
|
||||
|
||||
SAMPLER_CHOICES = [
|
||||
@@ -74,6 +102,13 @@ SAMPLER_CHOICES = [
|
||||
'plms',
|
||||
]
|
||||
|
||||
PRECISION_CHOICES = [
|
||||
'auto',
|
||||
'float32',
|
||||
'autocast',
|
||||
'float16',
|
||||
]
|
||||
|
||||
# is there a way to pick this up during git commits?
|
||||
APP_ID = 'lstein/stable-diffusion'
|
||||
APP_VERSION = 'v1.15'
|
||||
@@ -105,6 +140,7 @@ class Args(object):
|
||||
try:
|
||||
elements = shlex.split(command)
|
||||
except ValueError:
|
||||
import sys, traceback
|
||||
print(traceback.format_exc(), file=sys.stderr)
|
||||
return
|
||||
switches = ['']
|
||||
@@ -141,35 +177,49 @@ class Args(object):
|
||||
a = vars(self)
|
||||
a.update(kwargs)
|
||||
switches = list()
|
||||
switches.append(f'"{a["prompt"]}')
|
||||
switches.append(f'"{a["prompt"]}"')
|
||||
switches.append(f'-s {a["steps"]}')
|
||||
switches.append(f'-S {a["seed"]}')
|
||||
switches.append(f'-W {a["width"]}')
|
||||
switches.append(f'-H {a["height"]}')
|
||||
switches.append(f'-C {a["cfg_scale"]}')
|
||||
switches.append(f'-A {a["sampler_name"]}')
|
||||
switches.append(f'-S {a["seed"]}')
|
||||
if a['grid']:
|
||||
switches.append('--grid')
|
||||
if a['iterations'] and a['iterations']>0:
|
||||
switches.append(f'-n {a["iterations"]}')
|
||||
if a['seamless']:
|
||||
switches.append('--seamless')
|
||||
|
||||
# img2img generations have parameters relevant only to them and have special handling
|
||||
if a['init_img'] and len(a['init_img'])>0:
|
||||
switches.append(f'-I {a["init_img"]}')
|
||||
if a['fit']:
|
||||
switches.append(f'--fit')
|
||||
if a['strength'] and a['strength']>0:
|
||||
switches.append(f'-f {a["strength"]}')
|
||||
switches.append(f'-A ddim') # TODO: FIX ME WHEN IMG2IMG SUPPORTS ALL SAMPLERS
|
||||
if a['fit']:
|
||||
switches.append(f'--fit')
|
||||
if a['init_mask'] and len(a['init_mask'])>0:
|
||||
switches.append(f'-M {a["init_mask"]}')
|
||||
if a['init_color'] and len(a['init_color'])>0:
|
||||
switches.append(f'--init_color {a["init_color"]}')
|
||||
if a['strength'] and a['strength']>0:
|
||||
switches.append(f'-f {a["strength"]}')
|
||||
else:
|
||||
switches.append(f'-A {a["sampler_name"]}')
|
||||
|
||||
# gfpgan-specific parameters
|
||||
if a['gfpgan_strength']:
|
||||
switches.append(f'-G {a["gfpgan_strength"]}')
|
||||
|
||||
# esrgan-specific parameters
|
||||
if a['upscale']:
|
||||
switches.append(f'-U {" ".join([str(u) for u in a["upscale"]])}')
|
||||
|
||||
# embiggen parameters
|
||||
if a['embiggen']:
|
||||
switches.append(f'--embiggen {" ".join([str(u) for u in a["embiggen"]])}')
|
||||
if a['embiggen_tiles']:
|
||||
switches.append(f'--embiggen_tiles {" ".join([str(u) for u in a["embiggen_tiles"]])}')
|
||||
if a['variation_amount'] > 0:
|
||||
switches.append(f'-v {a["variation_amount"]}')
|
||||
|
||||
# outpainting parameters
|
||||
if a['out_direction']:
|
||||
switches.append(f'-D {" ".join([str(u) for u in a["out_direction"]])}')
|
||||
if a['with_variations']:
|
||||
formatted_variations = ','.join(f'{seed}:{weight}' for seed, weight in (a["with_variations"]))
|
||||
switches.append(f'-V {formatted_variations}')
|
||||
@@ -189,10 +239,10 @@ class Args(object):
|
||||
pass
|
||||
|
||||
if cmd_switches and arg_switches and name=='__dict__':
|
||||
a = arg_switches.__dict__
|
||||
a.update(cmd_switches.__dict__)
|
||||
return a
|
||||
|
||||
return self._merge_dict(
|
||||
arg_switches.__dict__,
|
||||
cmd_switches.__dict__,
|
||||
)
|
||||
try:
|
||||
return object.__getattribute__(self,name)
|
||||
except AttributeError:
|
||||
@@ -216,13 +266,8 @@ class Args(object):
|
||||
# the arg value. For example, the --grid and --individual options are a little
|
||||
# funny because of their push/pull relationship. This is how to handle it.
|
||||
if name=='grid':
|
||||
return value_arg or value_cmd # arg supersedes cmd
|
||||
if name=='individual':
|
||||
return value_cmd or value_arg # cmd supersedes arg
|
||||
if value_cmd is not None:
|
||||
return value_cmd
|
||||
else:
|
||||
return value_arg
|
||||
return not cmd_switches.individual and value_arg # arg supersedes cmd
|
||||
return value_cmd if value_cmd is not None else value_arg
|
||||
|
||||
def __setattr__(self,name,value):
|
||||
if name.startswith('_'):
|
||||
@@ -230,6 +275,14 @@ class Args(object):
|
||||
else:
|
||||
self._cmd_switches.__dict__[name] = value
|
||||
|
||||
def _merge_dict(self,dict1,dict2):
|
||||
new_dict = {}
|
||||
for k in set(list(dict1.keys())+list(dict2.keys())):
|
||||
value1 = dict1.get(k,None)
|
||||
value2 = dict2.get(k,None)
|
||||
new_dict[k] = value2 if value2 is not None else value1
|
||||
return new_dict
|
||||
|
||||
def _create_arg_parser(self):
|
||||
'''
|
||||
This defines all the arguments used on the command line when you launch
|
||||
@@ -268,12 +321,38 @@ class Args(object):
|
||||
default='stable-diffusion-1.4',
|
||||
help='Indicates which diffusion model to load. (currently "stable-diffusion-1.4" (default) or "laion400m")',
|
||||
)
|
||||
model_group.add_argument(
|
||||
'--sampler',
|
||||
'-A',
|
||||
'-m',
|
||||
dest='sampler_name',
|
||||
type=str,
|
||||
choices=SAMPLER_CHOICES,
|
||||
metavar='SAMPLER_NAME',
|
||||
help=f'Switch to a different sampler. Supported samplers: {", ".join(SAMPLER_CHOICES)}',
|
||||
default='k_lms',
|
||||
)
|
||||
model_group.add_argument(
|
||||
'-F',
|
||||
'--full_precision',
|
||||
dest='full_precision',
|
||||
action='store_true',
|
||||
help='Use more memory-intensive full precision math for calculations',
|
||||
help='Deprecated way to set --precision=float32',
|
||||
)
|
||||
model_group.add_argument(
|
||||
'--free_gpu_mem',
|
||||
dest='free_gpu_mem',
|
||||
action='store_true',
|
||||
help='Force free gpu memory before final decoding',
|
||||
)
|
||||
model_group.add_argument(
|
||||
'--precision',
|
||||
dest='precision',
|
||||
type=str,
|
||||
choices=PRECISION_CHOICES,
|
||||
metavar='PRECISION',
|
||||
help=f'Set model precision. Defaults to auto selected based on device. Options: {", ".join(PRECISION_CHOICES)}',
|
||||
default='auto',
|
||||
)
|
||||
file_group.add_argument(
|
||||
'--from_file',
|
||||
@@ -294,11 +373,6 @@ class Args(object):
|
||||
action='store_true',
|
||||
help='Place images in subdirectories named after the prompt.',
|
||||
)
|
||||
render_group.add_argument(
|
||||
'--seamless',
|
||||
action='store_true',
|
||||
help='Change the model to seamless tiling (circular) mode',
|
||||
)
|
||||
render_group.add_argument(
|
||||
'--grid',
|
||||
'-g',
|
||||
@@ -310,16 +384,21 @@ class Args(object):
|
||||
type=str,
|
||||
help='Path to a pre-trained embedding manager checkpoint - can only be set on command line',
|
||||
)
|
||||
# GFPGAN related args
|
||||
# Restoration related args
|
||||
postprocessing_group.add_argument(
|
||||
'--gfpgan_bg_upsampler',
|
||||
type=str,
|
||||
default='realesrgan',
|
||||
help='Background upsampler. Default: realesrgan. Options: realesrgan, none.',
|
||||
|
||||
'--no_restore',
|
||||
dest='restore',
|
||||
action='store_false',
|
||||
help='Disable face restoration with GFPGAN or codeformer',
|
||||
)
|
||||
postprocessing_group.add_argument(
|
||||
'--gfpgan_bg_tile',
|
||||
'--no_upscale',
|
||||
dest='esrgan',
|
||||
action='store_false',
|
||||
help='Disable upscaling with ESRGAN',
|
||||
)
|
||||
postprocessing_group.add_argument(
|
||||
'--esrgan_bg_tile',
|
||||
type=int,
|
||||
default=400,
|
||||
help='Tile size for background sampler, 0 for no tile during testing. Default: 400.',
|
||||
@@ -327,7 +406,7 @@ class Args(object):
|
||||
postprocessing_group.add_argument(
|
||||
'--gfpgan_model_path',
|
||||
type=str,
|
||||
default='experiments/pretrained_models/GFPGANv1.3.pth',
|
||||
default='experiments/pretrained_models/GFPGANv1.4.pth',
|
||||
help='Indicates the path to the GFPGAN model, relative to --gfpgan_dir.',
|
||||
)
|
||||
postprocessing_group.add_argument(
|
||||
@@ -359,7 +438,10 @@ class Args(object):
|
||||
# This creates the parser that processes commands on the dream> command line
|
||||
def _create_dream_cmd_parser(self):
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Example: dream> a fantastic alien landscape -W1024 -H960 -s100 -n12'
|
||||
description="""
|
||||
Generate example: dream> a fantastic alien landscape -W576 -H512 -s60 -n4
|
||||
Postprocess example: dream> !pp 0000045.4829112.png -G1 -U4 -ft codeformer
|
||||
"""
|
||||
)
|
||||
render_group = parser.add_argument_group('General rendering')
|
||||
img2img_group = parser.add_argument_group('Image-to-image and inpainting')
|
||||
@@ -393,14 +475,12 @@ class Args(object):
|
||||
'--width',
|
||||
type=int,
|
||||
help='Image width, multiple of 64',
|
||||
default=512
|
||||
)
|
||||
render_group.add_argument(
|
||||
'-H',
|
||||
'--height',
|
||||
type=int,
|
||||
help='Image height, multiple of 64',
|
||||
default=512,
|
||||
)
|
||||
render_group.add_argument(
|
||||
'-C',
|
||||
@@ -416,8 +496,8 @@ class Args(object):
|
||||
help='generate a grid'
|
||||
)
|
||||
render_group.add_argument(
|
||||
'--individual',
|
||||
'-i',
|
||||
'--individual',
|
||||
action='store_true',
|
||||
help='override command-line --grid setting and generate individual images'
|
||||
)
|
||||
@@ -436,7 +516,6 @@ class Args(object):
|
||||
choices=SAMPLER_CHOICES,
|
||||
metavar='SAMPLER_NAME',
|
||||
help=f'Switch to a different sampler. Supported samplers: {", ".join(SAMPLER_CHOICES)}',
|
||||
default='k_lms',
|
||||
)
|
||||
render_group.add_argument(
|
||||
'-t',
|
||||
@@ -448,7 +527,6 @@ class Args(object):
|
||||
'--outdir',
|
||||
'-o',
|
||||
type=str,
|
||||
default='outputs/img-samples',
|
||||
help='Directory to save generated images and a log of prompts and seeds',
|
||||
)
|
||||
img2img_group.add_argument(
|
||||
@@ -463,6 +541,11 @@ class Args(object):
|
||||
type=str,
|
||||
help='Path to input mask for inpainting mode (supersedes width and height)',
|
||||
)
|
||||
img2img_group.add_argument(
|
||||
'--init_color',
|
||||
type=str,
|
||||
help='Path to reference image for color correction (used for repeated img2img and inpainting)'
|
||||
)
|
||||
img2img_group.add_argument(
|
||||
'-T',
|
||||
'-fit',
|
||||
@@ -477,12 +560,34 @@ class Args(object):
|
||||
help='Strength for noising/unnoising. 0.0 preserves image exactly, 1.0 replaces it completely',
|
||||
default=0.75,
|
||||
)
|
||||
img2img_group.add_argument(
|
||||
'-D',
|
||||
'--out_direction',
|
||||
nargs='+',
|
||||
type=str,
|
||||
metavar=('direction', 'pixels'),
|
||||
help='Direction to extend the given image (left|right|top|bottom). If a distance pixel value is not specified it defaults to half the image size'
|
||||
)
|
||||
postprocessing_group.add_argument(
|
||||
'-ft',
|
||||
'--facetool',
|
||||
type=str,
|
||||
default='gfpgan',
|
||||
help='Select the face restoration AI to use: gfpgan, codeformer',
|
||||
)
|
||||
postprocessing_group.add_argument(
|
||||
'-G',
|
||||
'--gfpgan_strength',
|
||||
type=float,
|
||||
help='The strength at which to apply the GFPGAN model to the result, in order to improve faces.',
|
||||
default=0,
|
||||
help='The strength at which to apply the face restoration to the result.',
|
||||
default=0.0,
|
||||
)
|
||||
postprocessing_group.add_argument(
|
||||
'-cf',
|
||||
'--codeformer_fidelity',
|
||||
type=float,
|
||||
help='Used along with CodeFormer. Takes values between 0 and 1. 0 produces high quality but low accuracy. 1 produces high accuracy but low quality.',
|
||||
default=0.75
|
||||
)
|
||||
postprocessing_group.add_argument(
|
||||
'-U',
|
||||
@@ -535,36 +640,55 @@ class Args(object):
|
||||
)
|
||||
return parser
|
||||
|
||||
# very partial implementation of https://github.com/lstein/stable-diffusion/issues/266
|
||||
# it does not write all the required top-level metadata, writes too much image
|
||||
# data, and doesn't support grids yet. But you gotta start somewhere, no?
|
||||
def format_metadata(opt,
|
||||
seeds=[],
|
||||
weights=None,
|
||||
model_hash=None,
|
||||
postprocessing=None):
|
||||
def format_metadata(*args, **kwargs):
    '''
    Deprecated alias for metadata_dumps().

    Prints a deprecation notice and forwards the call. Positional and
    keyword arguments are unpacked into metadata_dumps(); previously the
    whole kwargs dict was handed over as the single `opt` argument.

    The legacy `weights` keyword of the old format_metadata() signature
    is accepted but discarded, because metadata_dumps() has no such
    parameter.
    '''
    print(f'format_metadata() is deprecated. Please use metadata_dumps()')
    # tolerate callers written against the old signature
    kwargs.pop('weights', None)
    return metadata_dumps(*args, **kwargs)
|
||||
|
||||
def metadata_dumps(opt,
|
||||
seeds=[],
|
||||
model_hash=None,
|
||||
postprocessing=None):
|
||||
'''
|
||||
Given an Args object, returns a partial implementation of
|
||||
the stable diffusion metadata standard
|
||||
Given an Args object, returns a dict containing the keys and
|
||||
structure of the proposed stable diffusion metadata standard
|
||||
https://github.com/lstein/stable-diffusion/discussions/392
|
||||
This is intended to be turned into JSON and stored in the
|
||||
"sd-metadata" field of the PNG file.
|
||||
'''
|
||||
|
||||
# top-level metadata minus `image` or `images`
|
||||
metadata = {
|
||||
'model' : 'stable diffusion',
|
||||
'model_id' : opt.model,
|
||||
'model_hash' : model_hash,
|
||||
'app_id' : APP_ID,
|
||||
'app_version' : APP_VERSION,
|
||||
}
|
||||
|
||||
# add some RFC266 fields that are generated internally, and not as
|
||||
# user args
|
||||
image_dict = opt.to_dict(
|
||||
postprocessing=postprocessing
|
||||
)
|
||||
|
||||
# TODO: This is just a hack until postprocessing pipeline work completed
|
||||
image_dict['postprocessing'] = []
|
||||
if image_dict['gfpgan_strength'] and image_dict['gfpgan_strength'] > 0:
|
||||
image_dict['postprocessing'].append('GFPGAN (not RFC compliant)')
|
||||
if image_dict['upscale'] and image_dict['upscale'][0] > 0:
|
||||
image_dict['postprocessing'].append('ESRGAN (not RFC compliant)')
|
||||
# 'postprocessing' is either null or an array of postprocessing metadata
|
||||
if postprocessing:
|
||||
# TODO: This is just a hack until postprocessing pipeline work completed
|
||||
image_dict['postprocessing'] = []
|
||||
|
||||
if image_dict['gfpgan_strength'] and image_dict['gfpgan_strength'] > 0:
|
||||
image_dict['postprocessing'].append('GFPGAN (not RFC compliant)')
|
||||
if image_dict['upscale'] and image_dict['upscale'][0] > 0:
|
||||
image_dict['postprocessing'].append('ESRGAN (not RFC compliant)')
|
||||
else:
|
||||
image_dict['postprocessing'] = None
|
||||
|
||||
# remove any image keys not mentioned in RFC #266
|
||||
rfc266_img_fields = ['type','postprocessing','sampler','prompt','seed','variations','steps',
|
||||
'cfg_scale','step_number','width','height','extra','strength']
|
||||
|
||||
rfc_dict ={}
|
||||
|
||||
for item in image_dict.items():
|
||||
key,value = item
|
||||
if key in rfc266_img_fields:
|
||||
@@ -572,39 +696,95 @@ def format_metadata(opt,
|
||||
|
||||
# semantic drift
|
||||
rfc_dict['sampler'] = image_dict.get('sampler_name',None)
|
||||
|
||||
|
||||
# display weighted subprompts (liable to change)
|
||||
if opt.prompt:
|
||||
subprompts = split_weighted_subprompts(opt.prompt)
|
||||
subprompts = [{'prompt':x[0],'weight':x[1]} for x in subprompts]
|
||||
rfc_dict['prompt'] = subprompts
|
||||
|
||||
# variations
|
||||
if opt.with_variations:
|
||||
variations = [{'seed':x[0],'weight':x[1]} for x in opt.with_variations]
|
||||
rfc_dict['variations'] = variations
|
||||
# 'variations' should always exist and be an array, empty or consisting of {'seed': seed, 'weight': weight} pairs
|
||||
rfc_dict['variations'] = [{'seed':x[0],'weight':x[1]} for x in opt.with_variations] if opt.with_variations else []
|
||||
|
||||
if opt.init_img:
|
||||
rfc_dict['type'] = 'img2img'
|
||||
rfc_dict['strength_steps'] = rfc_dict.pop('strength')
|
||||
rfc_dict['orig_hash'] = sha256(image_dict['init_img'])
|
||||
rfc_dict['sampler'] = 'ddim' # FIX ME WHEN IMG2IMG SUPPORTS ALL SAMPLERS
|
||||
rfc_dict['orig_hash'] = calculate_init_img_hash(opt.init_img)
|
||||
rfc_dict['sampler'] = 'ddim' # TODO: FIX ME WHEN IMG2IMG SUPPORTS ALL SAMPLERS
|
||||
else:
|
||||
rfc_dict['type'] = 'txt2img'
|
||||
rfc_dict.pop('strength')
|
||||
|
||||
images = []
|
||||
for seed in seeds:
|
||||
rfc_dict['seed'] = seed
|
||||
images.append(copy.copy(rfc_dict))
|
||||
if len(seeds)==0 and opt.seed:
|
||||
seeds=[seed]
|
||||
|
||||
return {
|
||||
'model' : 'stable diffusion',
|
||||
'model_id' : opt.model,
|
||||
'model_hash' : model_hash,
|
||||
'app_id' : APP_ID,
|
||||
'app_version' : APP_VERSION,
|
||||
'images' : images,
|
||||
}
|
||||
if opt.grid:
|
||||
images = []
|
||||
for seed in seeds:
|
||||
rfc_dict['seed'] = seed
|
||||
images.append(copy.copy(rfc_dict))
|
||||
metadata['images'] = images
|
||||
else:
|
||||
# there should only ever be a single seed if we did not generate a grid
|
||||
assert len(seeds) == 1, 'Expected a single seed'
|
||||
rfc_dict['seed'] = seeds[0]
|
||||
metadata['image'] = rfc_dict
|
||||
|
||||
return metadata
|
||||
|
||||
def metadata_from_png(png_file_path):
    '''
    Read the metadata stored in a PNG written by dream.py and return
    the first Args object reconstructed from it.

    The raw metadata is fetched with pngwriter.retrieve_metadata() and
    converted by metadata_loads(), which returns a list; only element 0
    of that list is handed back.
    '''
    return metadata_loads(
        ldm.dream.pngwriter.retrieve_metadata(png_file_path)
    )[0]
|
||||
|
||||
def metadata_loads(metadata):
    '''
    Takes the dictionary corresponding to RFC266 (https://github.com/lstein/stable-diffusion/issues/266)
    and returns a list of opt (Args) objects, one for each of the images described in the dictionary.

    The image descriptions are read from metadata['sd-metadata']: from
    its 'images' list when present (the key written for grids),
    otherwise from its single 'image' entry.

    The caller's dictionaries are not modified; each image entry is
    copied before its fields are repacked. If a required key is
    missing, a message and traceback are printed to stderr and the
    Args objects built so far are returned.
    '''
    results = []
    try:
        sd_metadata = metadata['sd-metadata']
        # grids are stored under 'images', single images under 'image'
        if 'images' in sd_metadata:
            images = sd_metadata['images']
        else:
            images = [sd_metadata['image']]
        for image in images:
            # work on a shallow copy so that repeated calls on the same
            # metadata see the original list-of-dict structure
            image = dict(image)
            # repack the prompt and variations into their command-line string form
            if 'prompt' in image:
                image['prompt'] = ','.join([':'.join([x['prompt'], str(x['weight'])]) for x in image['prompt']])
            if 'variations' in image:
                image['variations'] = ','.join([':'.join([str(x['seed']),str(x['weight'])]) for x in image['variations']])
            # fix a bit of semantic drift here
            image['sampler_name'] = image.pop('sampler')
            opt = Args()
            opt._cmd_switches = Namespace(**image)
            results.append(opt)
    except KeyError:
        import sys
        import traceback
        print('>> badly-formatted metadata',file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
    return results
|
||||
|
||||
# image can either be a file path on disk or a base64-encoded
# representation of the file's contents
def calculate_init_img_hash(image_string):
    '''
    Return the SHA-256 hex digest identifying an init image.

    If image_string starts with 'data:image/png;base64,', the remainder
    is base64-decoded and the decoded bytes are hashed in memory.
    Otherwise the string is passed unchanged to this module's sha256()
    helper, which takes a path.

    No file is written: the earlier copy of the decoded image to
    'outputs/test.png' was a debugging leftover and failed whenever the
    'outputs' directory did not exist.
    '''
    prefix = 'data:image/png;base64,'
    if image_string.startswith(prefix):
        imagedata = base64.b64decode(image_string[len(prefix):])
        return hashlib.sha256(imagedata).hexdigest()
    return sha256(image_string)
|
||||
|
||||
# Bah. This should be moved somewhere else...
|
||||
def sha256(path):
|
||||
|
||||
@@ -13,7 +13,20 @@ import re
|
||||
import torch
|
||||
|
||||
def get_uc_and_c(prompt, model, log_tokens=False, skip_normalize=False):
|
||||
uc = model.get_learned_conditioning([''])
|
||||
# Extract Unconditioned Words From Prompt
|
||||
unconditioned_words = ''
|
||||
unconditional_regex = r'\[(.*?)\]'
|
||||
unconditionals = re.findall(unconditional_regex, prompt)
|
||||
|
||||
if len(unconditionals) > 0:
|
||||
unconditioned_words = ' '.join(unconditionals)
|
||||
|
||||
# Remove Unconditioned Words From Prompt
|
||||
unconditional_regex_compile = re.compile(unconditional_regex)
|
||||
clean_prompt = unconditional_regex_compile.sub(' ', prompt)
|
||||
prompt = re.sub(' +', ' ', clean_prompt)
|
||||
|
||||
uc = model.get_learned_conditioning([unconditioned_words])
|
||||
|
||||
# get weighted sub-prompts
|
||||
weighted_subprompts = split_weighted_subprompts(
|
||||
@@ -25,15 +38,16 @@ def get_uc_and_c(prompt, model, log_tokens=False, skip_normalize=False):
|
||||
c = torch.zeros_like(uc)
|
||||
# normalize each "sub prompt" and add it
|
||||
for subprompt, weight in weighted_subprompts:
|
||||
log_tokenization(subprompt, model, log_tokens)
|
||||
log_tokenization(subprompt, model, log_tokens, weight)
|
||||
c = torch.add(
|
||||
c,
|
||||
model.get_learned_conditioning([subprompt]),
|
||||
alpha=weight,
|
||||
)
|
||||
else: # just standard 1 prompt
|
||||
log_tokenization(prompt, model, log_tokens)
|
||||
log_tokenization(prompt, model, log_tokens, 1)
|
||||
c = model.get_learned_conditioning([prompt])
|
||||
uc = model.get_learned_conditioning([unconditioned_words])
|
||||
return (uc, c)
|
||||
|
||||
def split_weighted_subprompts(text, skip_normalize=False)->list:
|
||||
@@ -65,14 +79,14 @@ def split_weighted_subprompts(text, skip_normalize=False)->list:
|
||||
if weight_sum == 0:
|
||||
print(
|
||||
"Warning: Subprompt weights add up to zero. Discarding and using even weights instead.")
|
||||
equal_weight = 1 / len(parsed_prompts)
|
||||
equal_weight = 1 / max(len(parsed_prompts), 1)
|
||||
return [(x[0], equal_weight) for x in parsed_prompts]
|
||||
return [(x[0], x[1] / weight_sum) for x in parsed_prompts]
|
||||
|
||||
|
||||
# shows how the prompt is tokenized
|
||||
# usually tokens have '</w>' to indicate end-of-word,
|
||||
# but for readability it has been replaced with ' '
|
||||
def log_tokenization(text, model, log=False):
|
||||
def log_tokenization(text, model, log=False, weight=1):
|
||||
if not log:
|
||||
return
|
||||
tokens = model.cond_stage_model.tokenizer._tokenize(text)
|
||||
@@ -89,8 +103,8 @@ def log_tokenization(text, model, log=False):
|
||||
usedTokens += 1
|
||||
else: # over max token length
|
||||
discarded = discarded + f"\x1b[0;3{s};40m{token}"
|
||||
print(f"\n>> Tokens ({usedTokens}):\n{tokenized}\x1b[0m")
|
||||
if discarded != "":
|
||||
print(
|
||||
f">> Tokens Discarded ({totalTokens-usedTokens}):\n{discarded}\x1b[0m"
|
||||
)
|
||||
print(f"\n>> Tokens ({usedTokens}), Weight ({weight:.2f}):\n{tokenized}\x1b[0m")
|
||||
if discarded != "":
|
||||
print(
|
||||
f">> Tokens Discarded ({totalTokens-usedTokens}):\n{discarded}\x1b[0m"
|
||||
)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import torch
|
||||
from torch import autocast
|
||||
from contextlib import contextmanager, nullcontext
|
||||
from contextlib import nullcontext
|
||||
|
||||
def choose_torch_device() -> str:
|
||||
'''Convenience routine for guessing which GPU device to run model on'''
|
||||
@@ -10,15 +10,18 @@ def choose_torch_device() -> str:
|
||||
return 'mps'
|
||||
return 'cpu'
|
||||
|
||||
def choose_autocast_device(device):
    '''
    Map a torch device to a (device_type, context_factory) pair.

    Non-cuda devices always yield ('cpu', nullcontext). For cuda the
    device type is passed through, paired with autocast unless the
    current GPU's name contains GeForce GTX 1660 or GeForce GTX 1650,
    in which case nullcontext is used instead.
    '''
    kind = device.type  # this returns 'mps' on M1
    if kind != 'cuda':
        return 'cpu', nullcontext

    # autocast only for cuda, but GTX 16xx have issues with it
    gpu_name = torch.cuda.get_device_name()
    if 'GeForce GTX 1660' in gpu_name or 'GeForce GTX 1650' in gpu_name:
        return kind, nullcontext
    return kind, autocast
|
||||
def choose_precision(device) -> str:
    '''
    Pick a precision string for the given torch device.

    Returns 'float16' for a cuda device whose name contains neither
    GeForce GTX 1660 nor GeForce GTX 1650; returns 'float32' for those
    cards and for every non-cuda device.
    '''
    if device.type != 'cuda':
        return 'float32'
    gpu_name = torch.cuda.get_device_name(device)
    excluded_cards = ('GeForce GTX 1660', 'GeForce GTX 1650')
    if any(card in gpu_name for card in excluded_cards):
        return 'float32'
    return 'float16'
|
||||
|
||||
def choose_autocast(precision):
    '''
    Select the context factory matching a precision string.

    'autocast' and 'float16' map to torch's autocast; any other value
    maps to nullcontext.
    '''
    # float16 currently requires autocast to avoid errors like:
    # 'expected scalar type Half but found Float'
    needs_autocast = precision in ('autocast', 'float16')
    return autocast if needs_autocast else nullcontext
|
||||
|
||||
@@ -9,13 +9,14 @@ from tqdm import tqdm, trange
|
||||
from PIL import Image
|
||||
from einops import rearrange, repeat
|
||||
from pytorch_lightning import seed_everything
|
||||
from ldm.dream.devices import choose_autocast_device
|
||||
from ldm.dream.devices import choose_autocast
|
||||
|
||||
downsampling = 8
|
||||
|
||||
class Generator():
|
||||
def __init__(self,model):
|
||||
def __init__(self, model, precision):
|
||||
self.model = model
|
||||
self.precision = precision
|
||||
self.seed = None
|
||||
self.latent_channels = model.channels
|
||||
self.downsampling_factor = downsampling # BUG: should come from model or config
|
||||
@@ -38,7 +39,7 @@ class Generator():
|
||||
def generate(self,prompt,init_image,width,height,iterations=1,seed=None,
|
||||
image_callback=None, step_callback=None,
|
||||
**kwargs):
|
||||
device_type,scope = choose_autocast_device(self.model.device)
|
||||
scope = choose_autocast(self.precision)
|
||||
make_image = self.get_make_image(
|
||||
prompt,
|
||||
init_image = init_image,
|
||||
@@ -50,8 +51,9 @@ class Generator():
|
||||
|
||||
results = []
|
||||
seed = seed if seed else self.new_seed()
|
||||
first_seed = seed
|
||||
seed, initial_noise = self.generate_initial_noise(seed, width, height)
|
||||
with scope(device_type), self.model.ema_scope():
|
||||
with scope(self.model.device.type), self.model.ema_scope():
|
||||
for n in trange(iterations, desc='Generating'):
|
||||
x_T = None
|
||||
if self.variation_amount > 0:
|
||||
@@ -70,7 +72,7 @@ class Generator():
|
||||
image = make_image(x_T)
|
||||
results.append([image, seed])
|
||||
if image_callback is not None:
|
||||
image_callback(image, seed)
|
||||
image_callback(image, seed, first_seed=first_seed)
|
||||
seed = self.new_seed()
|
||||
return results
|
||||
|
||||
|
||||
@@ -12,8 +12,8 @@ from ldm.dream.generator.base import Generator
|
||||
from ldm.dream.generator.img2img import Img2Img
|
||||
|
||||
class Embiggen(Generator):
|
||||
def __init__(self,model):
|
||||
super().__init__(model)
|
||||
def __init__(self, model, precision):
|
||||
super().__init__(model, precision)
|
||||
self.init_latent = None
|
||||
|
||||
# Replace generate because Embiggen doesn't need/use most of what it does normally
|
||||
@@ -62,19 +62,20 @@ class Embiggen(Generator):
|
||||
Return value depends on the seed at the time you call it
|
||||
"""
|
||||
# Construct embiggen arg array, and sanity check arguments
|
||||
if embiggen == None: # embiggen can also be called with just embiggen_tiles
|
||||
embiggen = [1.0] # If not specified, assume no scaling
|
||||
elif embiggen[0] < 0 :
|
||||
if embiggen == None: # embiggen can also be called with just embiggen_tiles
|
||||
embiggen = [1.0] # If not specified, assume no scaling
|
||||
elif embiggen[0] < 0:
|
||||
embiggen[0] = 1.0
|
||||
print('>> Embiggen scaling factor cannot be negative, fell back to the default of 1.0 !')
|
||||
print(
|
||||
'>> Embiggen scaling factor cannot be negative, fell back to the default of 1.0 !')
|
||||
if len(embiggen) < 2:
|
||||
embiggen.append(0.75)
|
||||
elif embiggen[1] > 1.0 or embiggen[1] < 0 :
|
||||
elif embiggen[1] > 1.0 or embiggen[1] < 0:
|
||||
embiggen[1] = 0.75
|
||||
print('>> Embiggen upscaling strength for ESRGAN must be between 0 and 1, fell back to the default of 0.75 !')
|
||||
if len(embiggen) < 3:
|
||||
embiggen.append(0.25)
|
||||
elif embiggen[2] < 0 :
|
||||
elif embiggen[2] < 0:
|
||||
embiggen[2] = 0.25
|
||||
print('>> Overlap size for Embiggen must be a positive ratio between 0 and 1 OR a number of pixels, fell back to the default of 0.25 !')
|
||||
|
||||
@@ -84,8 +85,11 @@ class Embiggen(Generator):
|
||||
embiggen_tiles = list(map(lambda n: n-1, embiggen_tiles))
|
||||
embiggen_tiles.sort()
|
||||
|
||||
if strength >= 0.5:
|
||||
print(f'* WARNING: Embiggen may produce mirror motifs if the strength (-f) is too high (currently {strength}). Try values between 0.35-0.45.')
|
||||
|
||||
# Prep img2img generator, since we wrap over it
|
||||
gen_img2img = Img2Img(self.model)
|
||||
gen_img2img = Img2Img(self.model,self.precision)
|
||||
|
||||
# Open original init image (not a tensor) to manipulate
|
||||
initsuperimage = Image.open(init_img)
|
||||
@@ -100,29 +104,30 @@ class Embiggen(Generator):
|
||||
if embiggen[0] != 1.0:
|
||||
initsuperwidth = round(initsuperwidth*embiggen[0])
|
||||
initsuperheight = round(initsuperheight*embiggen[0])
|
||||
if embiggen[1] > 0: # No point in ESRGAN upscaling if strength is set zero
|
||||
from ldm.gfpgan.gfpgan_tools import (
|
||||
real_esrgan_upscale,
|
||||
)
|
||||
print(f'>> ESRGAN upscaling init image prior to cutting with Embiggen with strength {embiggen[1]}')
|
||||
if embiggen[1] > 0: # No point in ESRGAN upscaling if strength is set zero
|
||||
from ldm.dream.restoration.realesrgan import ESRGAN
|
||||
esrgan = ESRGAN()
|
||||
print(
|
||||
f'>> ESRGAN upscaling init image prior to cutting with Embiggen with strength {embiggen[1]}')
|
||||
if embiggen[0] > 2:
|
||||
initsuperimage = real_esrgan_upscale(
|
||||
initsuperimage = esrgan.process(
|
||||
initsuperimage,
|
||||
embiggen[1], # upscale strength
|
||||
4, # upscale scale
|
||||
embiggen[1], # upscale strength
|
||||
self.seed,
|
||||
4, # upscale scale
|
||||
)
|
||||
else:
|
||||
initsuperimage = real_esrgan_upscale(
|
||||
initsuperimage = esrgan.process(
|
||||
initsuperimage,
|
||||
embiggen[1], # upscale strength
|
||||
2, # upscale scale
|
||||
embiggen[1], # upscale strength
|
||||
self.seed,
|
||||
2, # upscale scale
|
||||
)
|
||||
# We could keep recursively re-running ESRGAN for a requested embiggen[0] larger than 4x
|
||||
# but from personal experience it doesn't greatly improve anything after 4x
|
||||
# Resize to target scaling factor resolution
|
||||
initsuperimage = initsuperimage.resize((initsuperwidth, initsuperheight), Image.Resampling.LANCZOS)
|
||||
initsuperimage = initsuperimage.resize(
|
||||
(initsuperwidth, initsuperheight), Image.Resampling.LANCZOS)
|
||||
|
||||
# Use width and height as tile widths and height
|
||||
# Determine buffer size in pixels
|
||||
@@ -145,25 +150,28 @@ class Embiggen(Generator):
|
||||
emb_tiles_x = 1
|
||||
emb_tiles_y = 1
|
||||
if (initsuperwidth - width) > 0:
|
||||
emb_tiles_x = ceildiv(initsuperwidth - width, width - overlap_size_x) + 1
|
||||
emb_tiles_x = ceildiv(initsuperwidth - width,
|
||||
width - overlap_size_x) + 1
|
||||
if (initsuperheight - height) > 0:
|
||||
emb_tiles_y = ceildiv(initsuperheight - height, height - overlap_size_y) + 1
|
||||
emb_tiles_y = ceildiv(initsuperheight - height,
|
||||
height - overlap_size_y) + 1
|
||||
# Sanity
|
||||
assert emb_tiles_x > 1 or emb_tiles_y > 1, f'ERROR: Based on the requested dimensions of {initsuperwidth}x{initsuperheight} and tiles of {width}x{height} you don\'t need to Embiggen! Check your arguments.'
|
||||
|
||||
# Prep alpha layers --------------
|
||||
# https://stackoverflow.com/questions/69321734/how-to-create-different-transparency-like-gradient-with-python-pil
|
||||
# agradientL is Left-side transparent
|
||||
agradientL = Image.linear_gradient('L').rotate(90).resize((overlap_size_x, height))
|
||||
agradientL = Image.linear_gradient('L').rotate(
|
||||
90).resize((overlap_size_x, height))
|
||||
# agradientT is Top-side transparent
|
||||
agradientT = Image.linear_gradient('L').resize((width, overlap_size_y))
|
||||
# radial corner is the left-top corner, made full circle then cut to just the left-top quadrant
|
||||
agradientC = Image.new('L', (256, 256))
|
||||
for y in range(256):
|
||||
for x in range(256):
|
||||
#Find distance to lower right corner (numpy takes arrays)
|
||||
# Find distance to lower right corner (numpy takes arrays)
|
||||
distanceToLR = np.sqrt([(255 - x) ** 2 + (255 - y) ** 2])[0]
|
||||
#Clamp values to max 255
|
||||
# Clamp values to max 255
|
||||
if distanceToLR > 255:
|
||||
distanceToLR = 255
|
||||
#Place the pixel as invert of distance
|
||||
@@ -200,54 +208,73 @@ class Embiggen(Generator):
|
||||
if embiggen_tiles:
|
||||
# Individual unconnected sides
|
||||
alphaLayerR = Image.new("L", (width, height), 255)
|
||||
alphaLayerR.paste(agradientL.rotate(180), (width - overlap_size_x, 0))
|
||||
alphaLayerR.paste(agradientL.rotate(
|
||||
180), (width - overlap_size_x, 0))
|
||||
alphaLayerB = Image.new("L", (width, height), 255)
|
||||
alphaLayerB.paste(agradientT.rotate(180), (0, height - overlap_size_y))
|
||||
alphaLayerB.paste(agradientT.rotate(
|
||||
180), (0, height - overlap_size_y))
|
||||
alphaLayerTB = Image.new("L", (width, height), 255)
|
||||
alphaLayerTB.paste(agradientT, (0, 0))
|
||||
alphaLayerTB.paste(agradientT.rotate(180), (0, height - overlap_size_y))
|
||||
alphaLayerTB.paste(agradientT.rotate(
|
||||
180), (0, height - overlap_size_y))
|
||||
alphaLayerLR = Image.new("L", (width, height), 255)
|
||||
alphaLayerLR.paste(agradientL, (0, 0))
|
||||
alphaLayerLR.paste(agradientL.rotate(180), (width - overlap_size_x, 0))
|
||||
alphaLayerLR.paste(agradientL.rotate(
|
||||
180), (width - overlap_size_x, 0))
|
||||
|
||||
# Sides and corner Layers
|
||||
alphaLayerRBC = Image.new("L", (width, height), 255)
|
||||
alphaLayerRBC.paste(agradientL.rotate(180), (width - overlap_size_x, 0))
|
||||
alphaLayerRBC.paste(agradientT.rotate(180), (0, height - overlap_size_y))
|
||||
alphaLayerRBC.paste(agradientC.rotate(180).resize((overlap_size_x, overlap_size_y)), (width - overlap_size_x, height - overlap_size_y))
|
||||
alphaLayerRBC.paste(agradientL.rotate(
|
||||
180), (width - overlap_size_x, 0))
|
||||
alphaLayerRBC.paste(agradientT.rotate(
|
||||
180), (0, height - overlap_size_y))
|
||||
alphaLayerRBC.paste(agradientC.rotate(180).resize(
|
||||
(overlap_size_x, overlap_size_y)), (width - overlap_size_x, height - overlap_size_y))
|
||||
alphaLayerLBC = Image.new("L", (width, height), 255)
|
||||
alphaLayerLBC.paste(agradientL, (0, 0))
|
||||
alphaLayerLBC.paste(agradientT.rotate(180), (0, height - overlap_size_y))
|
||||
alphaLayerLBC.paste(agradientC.rotate(90).resize((overlap_size_x, overlap_size_y)), (0, height - overlap_size_y))
|
||||
alphaLayerLBC.paste(agradientT.rotate(
|
||||
180), (0, height - overlap_size_y))
|
||||
alphaLayerLBC.paste(agradientC.rotate(90).resize(
|
||||
(overlap_size_x, overlap_size_y)), (0, height - overlap_size_y))
|
||||
alphaLayerRTC = Image.new("L", (width, height), 255)
|
||||
alphaLayerRTC.paste(agradientL.rotate(180), (width - overlap_size_x, 0))
|
||||
alphaLayerRTC.paste(agradientL.rotate(
|
||||
180), (width - overlap_size_x, 0))
|
||||
alphaLayerRTC.paste(agradientT, (0, 0))
|
||||
alphaLayerRTC.paste(agradientC.rotate(270).resize((overlap_size_x, overlap_size_y)), (width - overlap_size_x, 0))
|
||||
alphaLayerRTC.paste(agradientC.rotate(270).resize(
|
||||
(overlap_size_x, overlap_size_y)), (width - overlap_size_x, 0))
|
||||
|
||||
# All but X layers
|
||||
alphaLayerABT = Image.new("L", (width, height), 255)
|
||||
alphaLayerABT.paste(alphaLayerLBC, (0, 0))
|
||||
alphaLayerABT.paste(agradientL.rotate(180), (width - overlap_size_x, 0))
|
||||
alphaLayerABT.paste(agradientC.rotate(180).resize((overlap_size_x, overlap_size_y)), (width - overlap_size_x, height - overlap_size_y))
|
||||
alphaLayerABT.paste(agradientL.rotate(
|
||||
180), (width - overlap_size_x, 0))
|
||||
alphaLayerABT.paste(agradientC.rotate(180).resize(
|
||||
(overlap_size_x, overlap_size_y)), (width - overlap_size_x, height - overlap_size_y))
|
||||
alphaLayerABL = Image.new("L", (width, height), 255)
|
||||
alphaLayerABL.paste(alphaLayerRTC, (0, 0))
|
||||
alphaLayerABL.paste(agradientT.rotate(180), (0, height - overlap_size_y))
|
||||
alphaLayerABL.paste(agradientC.rotate(180).resize((overlap_size_x, overlap_size_y)), (width - overlap_size_x, height - overlap_size_y))
|
||||
alphaLayerABL.paste(agradientT.rotate(
|
||||
180), (0, height - overlap_size_y))
|
||||
alphaLayerABL.paste(agradientC.rotate(180).resize(
|
||||
(overlap_size_x, overlap_size_y)), (width - overlap_size_x, height - overlap_size_y))
|
||||
alphaLayerABR = Image.new("L", (width, height), 255)
|
||||
alphaLayerABR.paste(alphaLayerLBC, (0, 0))
|
||||
alphaLayerABR.paste(agradientT, (0, 0))
|
||||
alphaLayerABR.paste(agradientC.resize((overlap_size_x, overlap_size_y)), (0, 0))
|
||||
alphaLayerABR.paste(agradientC.resize(
|
||||
(overlap_size_x, overlap_size_y)), (0, 0))
|
||||
alphaLayerABB = Image.new("L", (width, height), 255)
|
||||
alphaLayerABB.paste(alphaLayerRTC, (0, 0))
|
||||
alphaLayerABB.paste(agradientL, (0, 0))
|
||||
alphaLayerABB.paste(agradientC.resize((overlap_size_x, overlap_size_y)), (0, 0))
|
||||
alphaLayerABB.paste(agradientC.resize(
|
||||
(overlap_size_x, overlap_size_y)), (0, 0))
|
||||
|
||||
# All-around layer
|
||||
alphaLayerAA = Image.new("L", (width, height), 255)
|
||||
alphaLayerAA.paste(alphaLayerABT, (0, 0))
|
||||
alphaLayerAA.paste(agradientT, (0, 0))
|
||||
alphaLayerAA.paste(agradientC.resize((overlap_size_x, overlap_size_y)), (0, 0))
|
||||
alphaLayerAA.paste(agradientC.rotate(270).resize((overlap_size_x, overlap_size_y)), (width - overlap_size_x, 0))
|
||||
alphaLayerAA.paste(agradientC.resize(
|
||||
(overlap_size_x, overlap_size_y)), (0, 0))
|
||||
alphaLayerAA.paste(agradientC.rotate(270).resize(
|
||||
(overlap_size_x, overlap_size_y)), (width - overlap_size_x, 0))
|
||||
|
||||
# Clean up temporary gradients
|
||||
del agradientL
|
||||
@@ -259,7 +286,8 @@ class Embiggen(Generator):
|
||||
if embiggen_tiles:
|
||||
print(f'>> Making {len(embiggen_tiles)} Embiggen tiles...')
|
||||
else:
|
||||
print(f'>> Making {(emb_tiles_x * emb_tiles_y)} Embiggen tiles ({emb_tiles_x}x{emb_tiles_y})...')
|
||||
print(
|
||||
f'>> Making {(emb_tiles_x * emb_tiles_y)} Embiggen tiles ({emb_tiles_x}x{emb_tiles_y})...')
|
||||
|
||||
emb_tile_store = []
|
||||
# Although we could use the same seed for every tile for determinism, at higher strengths this may
|
||||
@@ -294,20 +322,23 @@ class Embiggen(Generator):
|
||||
top = round(emb_row_i * (height - overlap_size_y))
|
||||
right = left + width
|
||||
bottom = top + height
|
||||
|
||||
|
||||
# Cropped image of above dimension (does not modify the original)
|
||||
newinitimage = initsuperimage.crop((left, top, right, bottom))
|
||||
# DEBUG:
|
||||
# newinitimagepath = init_img[0:-4] + f'_emb_Ti{tile}.png'
|
||||
# newinitimage.save(newinitimagepath)
|
||||
|
||||
|
||||
if embiggen_tiles:
|
||||
print(f'Making tile #{tile + 1} ({embiggen_tiles.index(tile) + 1} of {len(embiggen_tiles)} requested)')
|
||||
print(
|
||||
f'Making tile #{tile + 1} ({embiggen_tiles.index(tile) + 1} of {len(embiggen_tiles)} requested)')
|
||||
else:
|
||||
print(f'Starting {tile + 1} of {(emb_tiles_x * emb_tiles_y)} tiles')
|
||||
print(
|
||||
f'Starting {tile + 1} of {(emb_tiles_x * emb_tiles_y)} tiles')
|
||||
|
||||
# create a torch tensor from an Image
|
||||
newinitimage = np.array(newinitimage).astype(np.float32) / 255.0
|
||||
newinitimage = np.array(
|
||||
newinitimage).astype(np.float32) / 255.0
|
||||
newinitimage = newinitimage[None].transpose(0, 3, 1, 2)
|
||||
newinitimage = torch.from_numpy(newinitimage)
|
||||
newinitimage = 2.0 * newinitimage - 1.0
|
||||
@@ -329,18 +360,35 @@ class Embiggen(Generator):
|
||||
init_image = newinitimage, # notice that init_image is different from init_img
|
||||
mask_image = None,
|
||||
strength = strength,
|
||||
iterations=1,
|
||||
seed=self.seed,
|
||||
sampler=sampler,
|
||||
steps=steps,
|
||||
cfg_scale=cfg_scale,
|
||||
conditioning=conditioning,
|
||||
ddim_eta=ddim_eta,
|
||||
image_callback=None, # called only after the final image is generated
|
||||
step_callback=step_callback, # called after each intermediate image is generated
|
||||
width=width,
|
||||
height=height,
|
||||
init_img=init_img, # img2img doesn't need this, but it might in the future
|
||||
init_image=newinitimage, # notice that init_image is different from init_img
|
||||
mask_image=None,
|
||||
strength=strength,
|
||||
)
|
||||
|
||||
emb_tile_store.append(tile_results[0][0])
|
||||
# DEBUG (but, also has other uses), worth saving if you want tiles without a transparency overlap to manually composite
|
||||
# emb_tile_store[-1].save(init_img[0:-4] + f'_emb_To{tile}.png')
|
||||
del newinitimage
|
||||
|
||||
|
||||
# Sanity check we have them all
|
||||
if len(emb_tile_store) == (emb_tiles_x * emb_tiles_y) or (embiggen_tiles != [] and len(emb_tile_store) == len(embiggen_tiles)):
|
||||
outputsuperimage = Image.new("RGBA", (initsuperwidth, initsuperheight))
|
||||
outputsuperimage = Image.new(
|
||||
"RGBA", (initsuperwidth, initsuperheight))
|
||||
if embiggen_tiles:
|
||||
outputsuperimage.alpha_composite(initsuperimage.convert('RGBA'), (0, 0))
|
||||
outputsuperimage.alpha_composite(
|
||||
initsuperimage.convert('RGBA'), (0, 0))
|
||||
for tile in range(emb_tiles_x * emb_tiles_y):
|
||||
if embiggen_tiles:
|
||||
if tile in embiggen_tiles:
|
||||
@@ -361,7 +409,8 @@ class Embiggen(Generator):
|
||||
if emb_column_i + 1 == emb_tiles_x:
|
||||
left = initsuperwidth - width
|
||||
else:
|
||||
left = round(emb_column_i * (width - overlap_size_x))
|
||||
left = round(emb_column_i *
|
||||
(width - overlap_size_x))
|
||||
if emb_row_i + 1 == emb_tiles_y:
|
||||
top = initsuperheight - height
|
||||
else:
|
||||
@@ -372,26 +421,26 @@ class Embiggen(Generator):
|
||||
# top of image
|
||||
if emb_row_i == 0:
|
||||
if emb_column_i == 0:
|
||||
if (tile+1) in embiggen_tiles: # Look-ahead right
|
||||
if (tile+emb_tiles_x) not in embiggen_tiles: # Look-ahead down
|
||||
if (tile+1) in embiggen_tiles: # Look-ahead right
|
||||
if (tile+emb_tiles_x) not in embiggen_tiles: # Look-ahead down
|
||||
intileimage.putalpha(alphaLayerB)
|
||||
# Otherwise do nothing on this tile
|
||||
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
|
||||
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
|
||||
intileimage.putalpha(alphaLayerR)
|
||||
else:
|
||||
intileimage.putalpha(alphaLayerRBC)
|
||||
elif emb_column_i == emb_tiles_x - 1:
|
||||
if (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down
|
||||
if (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down
|
||||
intileimage.putalpha(alphaLayerL)
|
||||
else:
|
||||
intileimage.putalpha(alphaLayerLBC)
|
||||
else:
|
||||
if (tile+1) in embiggen_tiles: # Look-ahead right
|
||||
if (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down
|
||||
if (tile+1) in embiggen_tiles: # Look-ahead right
|
||||
if (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down
|
||||
intileimage.putalpha(alphaLayerL)
|
||||
else:
|
||||
intileimage.putalpha(alphaLayerLBC)
|
||||
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
|
||||
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
|
||||
intileimage.putalpha(alphaLayerLR)
|
||||
else:
|
||||
intileimage.putalpha(alphaLayerABT)
|
||||
@@ -418,12 +467,12 @@ class Embiggen(Generator):
|
||||
intileimage.putalpha(alphaLayerTaC)
|
||||
else:
|
||||
intileimage.putalpha(alphaLayerTB)
|
||||
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
|
||||
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
|
||||
intileimage.putalpha(alphaLayerRTC)
|
||||
else:
|
||||
intileimage.putalpha(alphaLayerABL)
|
||||
elif emb_column_i == emb_tiles_x - 1:
|
||||
if (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down
|
||||
if (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down
|
||||
intileimage.putalpha(alphaLayerLTC)
|
||||
else:
|
||||
intileimage.putalpha(alphaLayerABR)
|
||||
@@ -433,7 +482,7 @@ class Embiggen(Generator):
|
||||
intileimage.putalpha(alphaLayerLTaC)
|
||||
else:
|
||||
intileimage.putalpha(alphaLayerABR)
|
||||
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
|
||||
elif (tile+emb_tiles_x) in embiggen_tiles: # Look-ahead down only
|
||||
intileimage.putalpha(alphaLayerABB)
|
||||
else:
|
||||
intileimage.putalpha(alphaLayerAA)
|
||||
@@ -459,4 +508,4 @@ class Embiggen(Generator):
|
||||
# after internal loops and patching up return Embiggen image
|
||||
return outputsuperimage
|
||||
# end of function declaration
|
||||
return make_image
|
||||
return make_image
|
||||
|
||||
@@ -4,15 +4,15 @@ ldm.dream.generator.img2img descends from ldm.dream.generator
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from ldm.dream.devices import choose_autocast_device
|
||||
from ldm.dream.devices import choose_autocast
|
||||
from ldm.dream.generator.base import Generator
|
||||
from ldm.models.diffusion.ddim import DDIMSampler
|
||||
|
||||
class Img2Img(Generator):
|
||||
def __init__(self,model):
|
||||
super().__init__(model)
|
||||
def __init__(self, model, precision):
|
||||
super().__init__(model, precision)
|
||||
self.init_latent = None # by get_noise()
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def get_make_image(self,prompt,sampler,steps,cfg_scale,ddim_eta,
|
||||
conditioning,init_image,strength,step_callback=None,**kwargs):
|
||||
@@ -32,8 +32,8 @@ class Img2Img(Generator):
|
||||
ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False
|
||||
)
|
||||
|
||||
device_type,scope = choose_autocast_device(self.model.device)
|
||||
with scope(device_type):
|
||||
scope = choose_autocast(self.precision)
|
||||
with scope(self.model.device.type):
|
||||
self.init_latent = self.model.get_first_stage_encoding(
|
||||
self.model.encode_first_stage(init_image)
|
||||
) # move to latent space
|
||||
|
||||
@@ -5,15 +5,15 @@ ldm.dream.generator.inpaint descends from ldm.dream.generator
|
||||
import torch
|
||||
import numpy as np
|
||||
from einops import rearrange, repeat
|
||||
from ldm.dream.devices import choose_autocast_device
|
||||
from ldm.dream.devices import choose_autocast
|
||||
from ldm.dream.generator.img2img import Img2Img
|
||||
from ldm.models.diffusion.ddim import DDIMSampler
|
||||
|
||||
class Inpaint(Img2Img):
|
||||
def __init__(self,model):
|
||||
def __init__(self, model, precision):
|
||||
self.init_latent = None
|
||||
super().__init__(model)
|
||||
|
||||
super().__init__(model, precision)
|
||||
|
||||
@torch.no_grad()
|
||||
def get_make_image(self,prompt,sampler,steps,cfg_scale,ddim_eta,
|
||||
conditioning,init_image,mask_image,strength,
|
||||
@@ -34,12 +34,12 @@ class Inpaint(Img2Img):
|
||||
)
|
||||
sampler = DDIMSampler(self.model, device=self.model.device)
|
||||
|
||||
sampler.make_schedule(
|
||||
ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False
|
||||
)
|
||||
sampler.make_schedule(
|
||||
ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False
|
||||
)
|
||||
|
||||
device_type,scope = choose_autocast_device(self.model.device)
|
||||
with scope(device_type):
|
||||
scope = choose_autocast(self.precision)
|
||||
with scope(self.model.device.type):
|
||||
self.init_latent = self.model.get_first_stage_encoding(
|
||||
self.model.encode_first_stage(init_image)
|
||||
) # move to latent space
|
||||
|
||||
@@ -7,9 +7,9 @@ import numpy as np
|
||||
from ldm.dream.generator.base import Generator
|
||||
|
||||
class Txt2Img(Generator):
|
||||
def __init__(self,model):
|
||||
super().__init__(model)
|
||||
|
||||
def __init__(self, model, precision):
|
||||
super().__init__(model, precision)
|
||||
|
||||
@torch.no_grad()
|
||||
def get_make_image(self,prompt,sampler,steps,cfg_scale,ddim_eta,
|
||||
conditioning,width,height,step_callback=None,**kwargs):
|
||||
@@ -27,6 +27,10 @@ class Txt2Img(Generator):
|
||||
height // self.downsampling_factor,
|
||||
width // self.downsampling_factor,
|
||||
]
|
||||
|
||||
if self.free_gpu_mem and self.model.model.device != self.model.device:
|
||||
self.model.model.to(self.model.device)
|
||||
|
||||
samples, _ = sampler.sample(
|
||||
batch_size = 1,
|
||||
S = steps,
|
||||
@@ -39,6 +43,10 @@ class Txt2Img(Generator):
|
||||
eta = ddim_eta,
|
||||
img_callback = step_callback
|
||||
)
|
||||
|
||||
if self.free_gpu_mem:
|
||||
self.model.model.to("cpu")
|
||||
|
||||
return self.sample_to_image(samples)
|
||||
|
||||
return make_image
|
||||
|
||||
61
ldm/dream/log.py
Normal file
61
ldm/dream/log.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""
|
||||
Functions for better format logging
|
||||
write_log -- logs the name of the output image, prompt, and prompt args to the terminal and different types of file
|
||||
1 write_log_message -- Writes a message to the console
|
||||
2 write_log_files -- Writes a message to files
|
||||
2.1 write_log_default -- File in plain text
|
||||
2.2 write_log_txt -- File in txt format
|
||||
2.3 write_log_markdown -- File in markdown format
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
|
||||
def write_log(results, log_path, file_types, output_cntr):
    """Report generated images on the terminal and append them to log files.

    Args:
        results: iterable of ``(path, prompt)`` pairs.
        log_path: log file path without extension; each writer appends its
            own extension.
        file_types: iterable of format names handed to ``write_log_files``.
        output_cntr: running count of images reported so far.

    Returns:
        The counter as advanced by ``write_log_message``.
    """
    # Terminal output comes first so the user sees results even if a file
    # write fails afterwards.
    updated_cntr = write_log_message(results, output_cntr)
    write_log_files(results, log_path, file_types)
    return updated_cntr
|
||||
|
||||
|
||||
def write_log_message(results, output_cntr):
    """Print one numbered ``path: prompt`` line per result.

    Args:
        results: iterable of ``(path, prompt)`` pairs.
        output_cntr: number of lines already reported; incremented once per
            printed line.

    Returns:
        The counter after all lines have been printed.
    """
    # Format everything up front so a malformed entry fails before any output.
    formatted = [f"{path}: {prompt}\n" for path, prompt in results]
    for line in formatted:
        output_cntr = output_cntr + 1
        # Each line already ends with a newline, hence end="".
        print(f"[{output_cntr}] {line}", end="")
    return output_cntr
|
||||
|
||||
|
||||
def write_log_files(results, log_path, file_types):
    """Append the results to one log file per requested format.

    ``"txt"`` and ``"md"``/``"markdown"`` have dedicated writers; any other
    format name triggers a notice and falls back to the plain-text writer,
    which uses the format name as the file extension.

    Args:
        results: iterable of ``(path, prompt)`` pairs.
        log_path: log file path without extension.
        file_types: iterable of format names.
    """
    for file_type in file_types:
        if file_type == "txt":
            write_log_txt(log_path, results)
            continue
        if file_type in ("md", "markdown"):
            write_log_markdown(log_path, results)
            continue
        print(f"'{file_type}' format is not supported, so write in plain text")
        write_log_default(log_path, results, file_type)
|
||||
|
||||
|
||||
def write_log_default(log_path, results, file_type):
    """Append ``path: prompt`` lines to ``<log_path>.<file_type>`` as UTF-8 text.

    Args:
        log_path: log file path without extension.
        results: iterable of ``(path, prompt)`` pairs.
        file_type: extension (without the dot) for the output file.
    """
    # Build the whole payload before touching the file.
    payload = "".join([f"{path}: {prompt}\n" for path, prompt in results])
    destination = log_path + "." + file_type
    with open(destination, "a", encoding="utf-8") as handle:
        handle.write(payload)
|
||||
|
||||
|
||||
def write_log_txt(log_path, results):
    """Append ``path: prompt`` lines to ``<log_path>.txt`` as UTF-8 text.

    Args:
        log_path: log file path without extension.
        results: iterable of ``(path, prompt)`` pairs.
    """
    # Build the whole payload before touching the file.
    payload = "".join([f"{path}: {prompt}\n" for path, prompt in results])
    with open(log_path + ".txt", "a", encoding="utf-8") as handle:
        handle.write(payload)
|
||||
|
||||
|
||||
def write_log_markdown(log_path, results):
    """Append one Markdown section per result to ``<log_path>.md``.

    Each section is a level-2 heading holding the image's base file name,
    followed by the prompt.

    Args:
        log_path: log file path without extension.
        results: iterable of ``(path, prompt)`` pairs.
    """
    sections = [
        f"## {os.path.basename(path)}\n\n\n{prompt}\n"
        for path, prompt in results
    ]
    with open(log_path + ".md", "a", encoding="utf-8") as handle:
        handle.write("".join(sections))
|
||||
@@ -34,7 +34,6 @@ class PngWriter:
|
||||
# saves image named _image_ to outdir/name, writing metadata from prompt
|
||||
# returns full path of output
|
||||
def save_image_and_prompt_to_png(self, image, dream_prompt, name, metadata=None):
|
||||
print(f'self.outdir={self.outdir}, name={name}')
|
||||
path = os.path.join(self.outdir, name)
|
||||
info = PngImagePlugin.PngInfo()
|
||||
info.add_text('Dream', dream_prompt)
|
||||
|
||||
@@ -22,11 +22,12 @@ class Completer:
|
||||
def complete(self, text, state):
|
||||
buffer = readline.get_line_buffer()
|
||||
|
||||
if text.startswith(('-I', '--init_img','-M','--init_mask')):
|
||||
if text.startswith(('-I', '--init_img','-M','--init_mask',
|
||||
'--init_color')):
|
||||
return self._path_completions(text, state, ('.png','.jpg','.jpeg'))
|
||||
|
||||
if buffer.strip().endswith('cd') or text.startswith(('.', '/')):
|
||||
return self._path_completions(text, state, ())
|
||||
if buffer.strip().endswith('pp') or text.startswith(('.', '/')):
|
||||
return self._path_completions(text, state, ('.png','.jpg','.jpeg'))
|
||||
|
||||
response = None
|
||||
if state == 0:
|
||||
@@ -57,6 +58,8 @@ class Completer:
|
||||
path = text.replace('--init_mask=', '', 1).lstrip()
|
||||
elif text.startswith('-M'):
|
||||
path = text.replace('-M', '', 1).lstrip()
|
||||
elif text.startswith('--init_color='):
|
||||
path = text.replace('--init_color=', '', 1).lstrip()
|
||||
else:
|
||||
path = text
|
||||
|
||||
@@ -100,6 +103,7 @@ if readline_available:
|
||||
'--individual','-i',
|
||||
'--init_img','-I',
|
||||
'--init_mask','-M',
|
||||
'--init_color',
|
||||
'--strength','-f',
|
||||
'--variants','-v',
|
||||
'--outdir','-o',
|
||||
|
||||
4
ldm/dream/restoration/__init__.py
Normal file
4
ldm/dream/restoration/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
'''
|
||||
Initialization file for the ldm.dream.restoration package
|
||||
'''
|
||||
from .base import Restoration
|
||||
38
ldm/dream/restoration/base.py
Normal file
38
ldm/dream/restoration/base.py
Normal file
@@ -0,0 +1,38 @@
|
||||
class Restoration():
    """Factory for the optional post-processing models.

    Each loader imports its backend lazily, so the backend packages are only
    needed when that loader is actually called.
    """

    def __init__(self) -> None:
        pass

    def load_face_restore_models(self, gfpgan_dir='./src/gfpgan', gfpgan_model_path='experiments/pretrained_models/GFPGANv1.4.pth'):
        """Load both face-restoration backends and report their availability.

        Args:
            gfpgan_dir: directory passed through to ``load_gfpgan``.
            gfpgan_model_path: model path passed through to ``load_gfpgan``.

        Returns:
            A ``(gfpgan, codeformer)`` tuple; an entry is ``None`` when that
            backend reports that its model file does not exist.
        """
        # GFPGAN
        gfpgan = self.load_gfpgan(gfpgan_dir, gfpgan_model_path)
        if not gfpgan.gfpgan_model_exists:
            print('>> GFPGAN Disabled')
            gfpgan = None
        else:
            print('>> GFPGAN Initialized')

        # CodeFormer
        codeformer = self.load_codeformer()
        if not codeformer.codeformer_model_exists:
            print('>> CodeFormer Disabled')
            codeformer = None
        else:
            print('>> CodeFormer Initialized')

        return gfpgan, codeformer

    # Face Restore Models
    def load_gfpgan(self, gfpgan_dir, gfpgan_model_path):
        """Build the GFPGAN wrapper for the given directory and model path."""
        from ldm.dream.restoration.gfpgan import GFPGAN
        return GFPGAN(gfpgan_dir, gfpgan_model_path)

    def load_codeformer(self):
        """Build the CodeFormer wrapper with its default locations."""
        from ldm.dream.restoration.codeformer import CodeFormerRestoration
        return CodeFormerRestoration()

    # Upscale Models
    def load_esrgan(self, esrgan_bg_tile=400):
        """Build the ESRGAN upscaler, passing ``esrgan_bg_tile`` to its constructor."""
        from ldm.dream.restoration.realesrgan import ESRGAN
        upscaler = ESRGAN(esrgan_bg_tile)
        print('>> ESRGAN Initialized')
        return upscaler
|
||||
84
ldm/dream/restoration/codeformer.py
Normal file
84
ldm/dream/restoration/codeformer.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import os
|
||||
import torch
|
||||
import numpy as np
|
||||
import warnings
|
||||
import sys
|
||||
|
||||
pretrained_model_url = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth'
|
||||
|
||||
class CodeFormerRestoration():
    """Restore faces in a PIL image with the CodeFormer network.

    The constructor only records where the weights are expected and whether
    they are present. The network itself is built, loaded and discarded on
    every ``process`` call.
    """

    def __init__(self,
                 codeformer_dir='ldm/dream/restoration/codeformer',
                 codeformer_model_path='weights/codeformer.pth') -> None:
        """Record the weight location and whether the weight file exists.

        Args:
            codeformer_dir: directory holding the CodeFormer support files;
                its absolute path is appended to ``sys.path``.
            codeformer_model_path: weight file path relative to
                ``codeformer_dir``.
        """
        self.model_path = os.path.join(codeformer_dir, codeformer_model_path)
        self.codeformer_model_exists = os.path.isfile(self.model_path)

        if not self.codeformer_model_exists:
            print('## NOT FOUND: CodeFormer model not found at ' + self.model_path)
        # NOTE(review): assumed to run whether or not the weights were found;
        # confirm against upstream.
        sys.path.append(os.path.abspath(codeformer_dir))

    def process(self, image, strength, device, seed=None, fidelity=0.75):
        """Detect, restore and paste back every face found in ``image``.

        Args:
            image: PIL image; it is converted to RGB before processing.
            strength: blend factor. Values below 1.0 blend the restored image
                with the (RGB-converted) input via ``Image.blend``.
            device: device the network and the face helper are placed on.
            seed: only used in the progress message; skipped when ``None``.
            fidelity: forwarded to the network as its ``w`` argument.

        Returns:
            A PIL image with the restored faces pasted in.
        """
        if seed is not None:
            print(f'>> CodeFormer - Restoring Faces for image seed:{seed}')
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            warnings.filterwarnings('ignore', category=UserWarning)

            # Imported lazily so the optional dependencies are only required
            # when face restoration is actually requested.
            from basicsr.utils.download_util import load_file_from_url
            from basicsr.utils import img2tensor, tensor2img
            from facexlib.utils.face_restoration_helper import FaceRestoreHelper
            from ldm.dream.restoration.codeformer_arch import CodeFormer
            from torchvision.transforms.functional import normalize
            from PIL import Image

            cf_class = CodeFormer

            cf = cf_class(dim_embd=512, codebook_size=1024, n_head=8, n_layers=9, connect_list=['32', '64', '128', '256']).to(device)

            checkpoint_path = load_file_from_url(url=pretrained_model_url, model_dir=os.path.abspath('ldm/dream/restoration/codeformer/weights'), progress=True)
            # Deserialize onto the CPU so a checkpoint saved from another
            # device still loads here; load_state_dict then copies the values
            # into the parameters already placed on `device`.
            checkpoint = torch.load(checkpoint_path, map_location='cpu')['params_ema']
            cf.load_state_dict(checkpoint)
            cf.eval()

            image = image.convert('RGB')

            # NOTE(review): the array handed to the helper is in PIL's RGB
            # order; confirm the channel order the face helper expects.
            face_helper = FaceRestoreHelper(upscale_factor=1, use_parse=True, device=device)
            face_helper.clean_all()
            face_helper.read_image(np.array(image, dtype=np.uint8))
            face_helper.get_face_landmarks_5(resize=640, eye_dist_threshold=5)
            face_helper.align_warp_face()

            for cropped_face in face_helper.cropped_faces:
                cropped_face_t = img2tensor(cropped_face / 255., bgr2rgb=True, float32=True)
                normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
                cropped_face_t = cropped_face_t.unsqueeze(0).to(device)

                try:
                    with torch.no_grad():
                        output = cf(cropped_face_t, w=fidelity, adain=True)[0]
                        restored_face = tensor2img(output.squeeze(0), rgb2bgr=True, min_max=(-1, 1))
                    del output
                    torch.cuda.empty_cache()
                except RuntimeError as error:
                    # Best effort: keep the unrestored crop for this face.
                    print(f'\tFailed inference for CodeFormer: {error}.')
                    restored_face = cropped_face

                restored_face = restored_face.astype('uint8')
                face_helper.add_restored_face(restored_face)

            face_helper.get_inverse_affine(None)

            restored_img = face_helper.paste_faces_to_input_image()

            res = Image.fromarray(restored_img)

            if strength < 1.0:
                # Image.blend needs both images to be the same size. Compare
                # the PIL sizes: the NumPy array's `.size` is an element
                # count, never equal to a PIL (width, height) tuple.
                if res.size != image.size:
                    image = image.resize(res.size)
                res = Image.blend(image, res, strength)

            return res
|
||||
3
ldm/dream/restoration/codeformer/weights/README
Normal file
3
ldm/dream/restoration/codeformer/weights/README
Normal file
@@ -0,0 +1,3 @@
|
||||
To use codeformer face reconstruction, you will need to copy
|
||||
https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth
|
||||
into this directory.
|
||||
276
ldm/dream/restoration/codeformer_arch.py
Normal file
276
ldm/dream/restoration/codeformer_arch.py
Normal file
@@ -0,0 +1,276 @@
|
||||
import math
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn, Tensor
|
||||
import torch.nn.functional as F
|
||||
from typing import Optional, List
|
||||
|
||||
from ldm.dream.restoration.vqgan_arch import *
|
||||
from basicsr.utils import get_root_logger
|
||||
from basicsr.utils.registry import ARCH_REGISTRY
|
||||
|
||||
def calc_mean_std(feat, eps=1e-5):
    """Return the per-sample, per-channel mean and std of a 4D feature map.

    Used by adaptive_instance_normalization.

    Args:
        feat (Tensor): 4D tensor; the first two dimensions are kept and the
            statistics are taken over everything after them.
        eps (float): value added to the variance before the square root so
            the std is never zero. Default: 1e-5.

    Returns:
        ``(mean, std)``, each reshaped to ``(b, c, 1, 1)``.
    """
    shape = feat.size()
    assert len(shape) == 4, 'The input feature should be 4D tensor.'
    batch, channels = shape[:2]
    flattened = feat.view(batch, channels, -1)
    variance = flattened.var(dim=2) + eps
    feat_std = variance.sqrt().view(batch, channels, 1, 1)
    feat_mean = flattened.mean(dim=2).view(batch, channels, 1, 1)
    return feat_mean, feat_std
|
||||
|
||||
|
||||
def adaptive_instance_normalization(content_feat, style_feat):
    """Adaptive instance normalization.

    Re-scales ``content_feat`` so that its per-channel mean and std match
    those of ``style_feat``, i.e. the reference features take on statistics
    similar to the degraded features.

    Args:
        content_feat (Tensor): The reference feature.
        style_feat (Tensor): The degraded feature.

    Returns:
        Tensor with the size of ``content_feat``.
    """
    target_size = content_feat.size()
    style_mean, style_std = calc_mean_std(style_feat)
    content_mean, content_std = calc_mean_std(content_feat)

    # Whiten with the content statistics ...
    centered = content_feat - content_mean.expand(target_size)
    whitened = centered / content_std.expand(target_size)
    # ... then re-colour with the style statistics.
    return whitened * style_std.expand(target_size) + style_mean.expand(target_size)
|
||||
|
||||
|
||||
class PositionEmbeddingSine(nn.Module):
    """
    Sinusoidal position embedding, very similar to the one used by the
    "Attention is all you need" paper, generalized to work on images.
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        """
        Args:
            num_pos_feats: number of embedding channels per axis; the output
                has twice as many channels (y followed by x).
            temperature: base of the frequency progression.
            normalize: when true, positions are rescaled to ``[0, scale]``.
            scale: only allowed with ``normalize``; defaults to ``2 * pi``.

        Raises:
            ValueError: if ``scale`` is given while ``normalize`` is False.
        """
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, x, mask=None):
        """Return the embedding for ``x`` with layout (batch, 2 * num_pos_feats, H, W).

        Args:
            x: 4D tensor; only its batch size, last two dimensions and device
                are used.
            mask: optional boolean tensor of shape (batch, H, W); true entries
                are excluded from the position count. Defaults to all-false.
        """
        if mask is None:
            mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
        valid = ~mask
        # Running counts of unmasked positions give 1-based coordinates.
        y_embed = valid.cumsum(1, dtype=torch.float32)
        x_embed = valid.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        # Consecutive channel pairs share one frequency.
        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        def interleave_sin_cos(embed):
            # sin on even channels, cos on odd channels, interleaved again.
            scaled = embed[:, :, :, None] / dim_t
            return torch.stack(
                (scaled[:, :, :, 0::2].sin(), scaled[:, :, :, 1::2].cos()), dim=4
            ).flatten(3)

        pos_x = interleave_sin_cos(x_embed)
        pos_y = interleave_sin_cos(y_embed)
        return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
|
||||
|
||||
def _get_activation_fn(activation):
    """Return the activation function named by ``activation``.

    Args:
        activation: one of ``"relu"``, ``"gelu"`` or ``"glu"``.

    Returns:
        The matching function from ``torch.nn.functional``.

    Raises:
        RuntimeError: if ``activation`` is not one of the supported names.
    """
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    # The message lists every accepted name, including "glu".
    raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
|
||||
|
||||
|
||||
class TransformerSALayer(nn.Module):
    """Self-attention layer followed by an MLP.

    LayerNorm is applied before each sub-layer and each sub-layer's output is
    added back to its input.
    """

    def __init__(self, embed_dim, nhead=8, dim_mlp=2048, dropout=0.0, activation="gelu"):
        """
        Args:
            embed_dim: width of the input and output features.
            nhead: number of attention heads.
            dim_mlp: hidden width of the feed-forward MLP.
            dropout: dropout probability used by the attention module, inside
                the MLP and on both residual branches.
            activation: name resolved through ``_get_activation_fn``.
        """
        super().__init__()
        self.self_attn = nn.MultiheadAttention(embed_dim, nhead, dropout=dropout)
        # Feed-forward model (MLP)
        self.linear1 = nn.Linear(embed_dim, dim_mlp)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_mlp, embed_dim)

        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add ``pos`` to ``tensor``; return ``tensor`` unchanged when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward(self, tgt,
                tgt_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        """Apply the attention and MLP sub-layers to ``tgt``.

        Args:
            tgt: input features.
            tgt_mask: forwarded to the attention module as ``attn_mask``.
            tgt_key_padding_mask: forwarded as ``key_padding_mask``.
            query_pos: optional positional embedding added to the query and
                key (but not to the value).

        Returns:
            The transformed features.
        """
        # self attention
        normed = self.norm1(tgt)
        query = key = self.with_pos_embed(normed, query_pos)
        attended = self.self_attn(query, key, value=normed, attn_mask=tgt_mask,
                                  key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(attended)

        # ffn
        normed = self.norm2(tgt)
        hidden = self.linear1(normed)
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden)
        tgt = tgt + self.dropout2(self.linear2(hidden))
        return tgt
|
||||
|
||||
class Fuse_sft_block(nn.Module):
    """Modulate decoder features with a scale/shift predicted from encoder features."""

    def __init__(self, in_ch, out_ch):
        """Build the fusion ResBlock and the scale/shift convolution branches.

        Args:
            in_ch: channel count of each input feature map.
            out_ch: channel count produced by the fusion block and branches.
        """
        super().__init__()
        # Encoder and decoder features are concatenated, hence 2 * in_ch.
        self.encode_enc = ResBlock(2*in_ch, out_ch)

        def _branch():
            # conv -> LeakyReLU -> conv, identical layout for scale and shift
            return nn.Sequential(
                nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
                nn.LeakyReLU(0.2, True),
                nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))

        self.scale = _branch()
        self.shift = _branch()

    def forward(self, enc_feat, dec_feat, w=1):
        """Return ``dec_feat + w * (dec_feat * scale + shift)``.

        ``scale`` and ``shift`` are predicted from the fused
        encoder/decoder features; ``w`` weights the residual.
        """
        fused = self.encode_enc(torch.cat([enc_feat, dec_feat], dim=1))
        scale = self.scale(fused)
        shift = self.shift(fused)
        return dec_feat + w * (dec_feat * scale + shift)
|
||||
|
||||
|
||||
@ARCH_REGISTRY.register()
class CodeFormer(VQAutoEncoder):
    """VQ auto-encoder whose code indices are predicted by a transformer.

    The encoder output is turned into tokens, a stack of
    ``TransformerSALayer`` modules predicts codebook logits, the selected
    codebook entries are decoded by the generator, and encoder features can
    be fused into the generator through ``Fuse_sft_block`` modules.
    """

    def __init__(self, dim_embd=512, n_head=8, n_layers=9,
                 codebook_size=1024, latent_size=256,
                 connect_list=['32', '64', '128', '256'],
                 fix_modules=['quantize','generator']):
        """Build the transformer, prediction head and fusion modules.

        Args:
            dim_embd: transformer embedding size.
            n_head: attention heads per transformer layer.
            n_layers: number of transformer layers.
            codebook_size: number of codebook entries / output logits.
            latent_size: number of rows of the learned position embedding.
            connect_list: feature sizes (as strings) at which encoder
                features are fused into the generator.
            fix_modules: attribute names of sub-modules whose parameters
                are frozen; ``None`` freezes nothing.

        NOTE(review): the list defaults are shared between calls; they are
        only read here, never mutated.
        """
        super(CodeFormer, self).__init__(512, 64, [1, 2, 2, 4, 4, 8], 'nearest',2, [16], codebook_size)

        # Freeze the requested sub-modules.
        for module_name in (fix_modules if fix_modules is not None else ()):
            for param in getattr(self, module_name).parameters():
                param.requires_grad = False

        self.connect_list = connect_list
        self.n_layers = n_layers
        self.dim_embd = dim_embd
        self.dim_mlp = dim_embd*2

        self.position_emb = nn.Parameter(torch.zeros(latent_size, self.dim_embd))
        self.feat_emb = nn.Linear(256, self.dim_embd)

        # transformer
        transformer_layers = [
            TransformerSALayer(embed_dim=dim_embd, nhead=n_head, dim_mlp=self.dim_mlp, dropout=0.0)
            for _ in range(self.n_layers)
        ]
        self.ft_layers = nn.Sequential(*transformer_layers)

        # logits prediction head
        self.idx_pred_layer = nn.Sequential(
            nn.LayerNorm(dim_embd),
            nn.Linear(dim_embd, codebook_size, bias=False))

        # channel count of the feature map at each spatial size
        self.channels = {
            '16': 512,
            '32': 256,
            '64': 256,
            '128': 128,
            '256': 128,
            '512': 64,
        }

        # after second residual block for > 16, before attn layer for ==16
        self.fuse_encoder_block = {'512':2, '256':5, '128':8, '64':11, '32':14, '16':18}
        # after first residual block for > 16, before attn layer for ==16
        self.fuse_generator_block = {'16':6, '32': 9, '64':12, '128':15, '256':18, '512':21}

        # one fusion module per connected feature size
        self.fuse_convs_dict = nn.ModuleDict()
        for f_size in self.connect_list:
            ch = self.channels[f_size]
            self.fuse_convs_dict[f_size] = Fuse_sft_block(ch, ch)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights with N(0, 0.02) and reset LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            has_bias = isinstance(module, nn.Linear) and module.bias is not None
            if has_bias:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, x, w=0, detach_16=True, code_only=False, adain=False):
        """Encode ``x``, predict code indices and decode them.

        Args:
            x: input image batch.
            w: fusion weight; encoder features are fused into the generator
                only when ``w > 0``.
            detach_16: detach the quantised features from the graph
                (training stage III).
            code_only: return right after the logits (training stage II).
            adain: apply ``adaptive_instance_normalization`` between the
                quantised and the encoder features.

        Returns:
            ``(logits, lq_feat)`` when ``code_only`` is true, otherwise
            ``(out, logits, lq_feat)``.
        """
        # ################### Encoder #####################
        tap_indices = [self.fuse_encoder_block[f_size] for f_size in self.connect_list]
        enc_feat_dict = {}
        for idx, enc_block in enumerate(self.encoder.blocks):
            x = enc_block(x)
            if idx in tap_indices:
                # keyed by the spatial width of the feature map
                enc_feat_dict[str(x.shape[-1])] = x.clone()

        lq_feat = x
        batch = x.shape[0]

        # ################# Transformer ###################
        pos_emb = self.position_emb.unsqueeze(1).repeat(1, batch, 1)
        # BCHW -> BC(HW) -> (HW)BC
        tokens = lq_feat.flatten(2).permute(2, 0, 1)
        query_emb = self.feat_emb(tokens)
        for layer in self.ft_layers:
            query_emb = layer(query_emb, query_pos=pos_emb)

        # output logits: (hw)bn -> b(hw)n
        logits = self.idx_pred_layer(query_emb).permute(1, 0, 2)

        if code_only:  # for training stage II
            # logits doesn't need softmax before cross_entropy loss
            return logits, lq_feat

        # ################# Quantization ###################
        soft_one_hot = F.softmax(logits, dim=2)
        _, top_idx = torch.topk(soft_one_hot, 1, dim=2)
        quant_feat = self.quantize.get_codebook_feat(top_idx, shape=[batch, 16, 16, 256])

        if detach_16:
            quant_feat = quant_feat.detach()  # for training stage III
        if adain:
            quant_feat = adaptive_instance_normalization(quant_feat, lq_feat)

        # ################## Generator ####################
        fuse_indices = [self.fuse_generator_block[f_size] for f_size in self.connect_list]
        x = quant_feat
        for idx, gen_block in enumerate(self.generator.blocks):
            x = gen_block(x)
            # fuse after the idx-th block, but only when fusion is enabled
            if idx in fuse_indices and w > 0:
                f_size = str(x.shape[-1])
                x = self.fuse_convs_dict[f_size](enc_feat_dict[f_size].detach(), x, w)

        # logits doesn't need softmax before cross_entropy loss
        return x, logits, lq_feat
|
||||
76
ldm/dream/restoration/gfpgan.py
Normal file
76
ldm/dream/restoration/gfpgan.py
Normal file
@@ -0,0 +1,76 @@
|
||||
import torch
|
||||
import warnings
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
|
||||
from PIL import Image
|
||||
|
||||
|
||||
class GFPGAN():
    """Face restoration through the external ``gfpgan`` package.

    The model is loaded lazily on every ``process`` call and released again
    afterwards so that it does not occupy memory between calls.
    """

    def __init__(
            self,
            gfpgan_dir='src/gfpgan',
            gfpgan_model_path='experiments/pretrained_models/GFPGANv1.4.pth') -> None:
        """Record the model location and make ``gfpgan_dir`` importable.

        Args:
            gfpgan_dir: directory of the GFPGAN checkout.
            gfpgan_model_path: model file path relative to ``gfpgan_dir``.
        """
        # Always define the attribute so process() can test it safely.
        self.gfpgan = None
        self.model_path = os.path.join(gfpgan_dir, gfpgan_model_path)
        self.gfpgan_model_exists = os.path.isfile(self.model_path)

        if not self.gfpgan_model_exists:
            print('## NOT FOUND: GFPGAN model not found at ' + self.model_path)
            return None
        sys.path.append(os.path.abspath(gfpgan_dir))

    def model_exists(self):
        """Return True when the model file currently exists on disk."""
        return os.path.isfile(self.model_path)

    def process(self, image, strength: float, seed: str = None):
        """Restore faces in ``image`` and blend the result with the input.

        Args:
            image: PIL image to restore.
            strength: blend factor; values below 1.0 mix the restored image
                with the original via ``Image.blend``.
            seed: only used for the progress message.

        Returns:
            The restored (and possibly blended) PIL image, or ``image``
            unchanged when GFPGAN could not be loaded.
        """
        if seed is not None:
            print(f'>> GFPGAN - Restoring Faces for image seed:{seed}')

        self.gfpgan = None
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            warnings.filterwarnings('ignore', category=UserWarning)
            try:
                from gfpgan import GFPGANer
                self.gfpgan = GFPGANer(
                    model_path=self.model_path,
                    upscale=1,
                    arch='clean',
                    channel_multiplier=2,
                    bg_upsampler=None,
                )
            except Exception:
                import traceback
                print('>> Error loading GFPGAN:', file=sys.stderr)
                print(traceback.format_exc(), file=sys.stderr)

        if self.gfpgan is None:
            print(
                f'>> WARNING: GFPGAN not initialized.'
            )
            print(
                f'>> Download https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth to {self.model_path}, \nor change GFPGAN directory with --gfpgan_dir.'
            )
            # Nothing to restore with: hand back the input instead of
            # crashing on ``None.enhance``.
            return image

        image = image.convert('RGB')

        try:
            _, _, restored_img = self.gfpgan.enhance(
                np.array(image, dtype=np.uint8),
                has_aligned=False,
                only_center_face=False,
                paste_back=True,
            )
            res = Image.fromarray(restored_img)

            if strength < 1.0:
                # Resize the input to the restored size if the sizes differ.
                # Compare PIL sizes: ndarray.size is an element count, not
                # a (width, height) tuple.
                if res.size != image.size:
                    image = image.resize(res.size)
                res = Image.blend(image, res, strength)
        finally:
            # Drop the model first so empty_cache() can actually free it.
            self.gfpgan = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        return res
|
||||
102
ldm/dream/restoration/realesrgan.py
Normal file
102
ldm/dream/restoration/realesrgan.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import torch
|
||||
import warnings
|
||||
import numpy as np
|
||||
|
||||
from PIL import Image
|
||||
|
||||
|
||||
class ESRGAN():
    """Image upscaling through the external Real-ESRGAN package."""

    def __init__(self, bg_tile_size=400) -> None:
        """Store the tile size passed to ``RealESRGANer``."""
        self.bg_tile_size = bg_tile_size

    def load_esrgan_bg_upsampler(self, upsampler_scale):
        """Create a ``RealESRGANer`` for the requested scale.

        Args:
            upsampler_scale: 2 or 4.

        Returns:
            The upsampler, or ``None`` for an unsupported scale.
        """
        # Half precision only on CUDA; CPU or MPS on M1 use full precision.
        use_half_precision = torch.cuda.is_available()

        model_path = {
            2: 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth',
            4: 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth',
        }

        if upsampler_scale not in model_path:
            return None

        from basicsr.archs.rrdbnet_arch import RRDBNet
        from realesrgan import RealESRGANer

        # The two supported scales differ only in the ``scale`` argument.
        model = RRDBNet(
            num_in_ch=3,
            num_out_ch=3,
            num_feat=64,
            num_block=23,
            num_grow_ch=32,
            scale=upsampler_scale,
        )

        return RealESRGANer(
            scale=upsampler_scale,
            model_path=model_path[upsampler_scale],
            model=model,
            tile=self.bg_tile_size,
            tile_pad=10,
            pre_pad=0,
            half=use_half_precision,
        )

    def process(self, image, strength: float, seed: str = None, upsampler_scale: int = 2):
        """Upscale ``image`` and blend the result with the resized input.

        Args:
            image: PIL image to upscale.
            strength: blend factor; values below 1.0 mix the upscaled image
                with the resized original via ``Image.blend``.
            seed: only used for the progress message.
            upsampler_scale: 2 or 4.

        Returns:
            The upscaled (and possibly blended) PIL image, or ``image``
            unchanged when no upsampler could be created.
        """
        if seed is not None:
            print(
                f'>> Real-ESRGAN Upscaling seed:{seed} : scale:{upsampler_scale}x'
            )

        upsampler = None
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            warnings.filterwarnings('ignore', category=UserWarning)

            try:
                upsampler = self.load_esrgan_bg_upsampler(upsampler_scale)
            except Exception:
                import traceback
                import sys

                print('>> Error loading Real-ESRGAN:', file=sys.stderr)
                print(traceback.format_exc(), file=sys.stderr)

        if upsampler is None:
            # Loading failed or the scale is unsupported: return the input
            # instead of dereferencing a missing upsampler.
            print('>> WARNING: Real-ESRGAN not initialized; returning the original image.')
            return image

        try:
            output, _ = upsampler.enhance(
                np.array(image, dtype=np.uint8),
                outscale=upsampler_scale,
                alpha_upsampler='realesrgan',
            )

            res = Image.fromarray(output)

            if strength < 1.0:
                # Resize the input to the upscaled size if the sizes differ.
                # Compare PIL sizes: ndarray.size is an element count, not
                # a (width, height) tuple.
                if res.size != image.size:
                    image = image.resize(res.size)
                res = Image.blend(image, res, strength)
        finally:
            # Drop the model first so empty_cache() can actually free it.
            upsampler = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        return res
|
||||
435
ldm/dream/restoration/vqgan_arch.py
Normal file
435
ldm/dream/restoration/vqgan_arch.py
Normal file
@@ -0,0 +1,435 @@
|
||||
'''
|
||||
VQGAN code, adapted from the original created by the Unleashing Transformers authors:
|
||||
https://github.com/samb-t/unleashing-transformers/blob/master/models/vqgan.py
|
||||
|
||||
'''
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import copy
|
||||
from basicsr.utils import get_root_logger
|
||||
from basicsr.utils.registry import ARCH_REGISTRY
|
||||
|
||||
def normalize(in_channels):
    """Return an affine 32-group ``GroupNorm`` over ``in_channels`` channels (eps 1e-6)."""
    group_norm = torch.nn.GroupNorm(32, in_channels, eps=1e-6, affine=True)
    return group_norm
|
||||
|
||||
|
||||
@torch.jit.script
def swish(x):
    """Swish activation: the input multiplied by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate
|
||||
|
||||
|
||||
# Define VQVAE classes
|
||||
class VectorQuantizer(nn.Module):
    """Nearest-neighbour vector quantiser with a learnable codebook."""

    def __init__(self, codebook_size, emb_dim, beta):
        """Create the codebook.

        Args:
            codebook_size: number of embeddings.
            emb_dim: dimension of each embedding.
            beta: commitment cost used in the loss term,
                beta * ||z_e(x) - sg[e]||^2.
        """
        super(VectorQuantizer, self).__init__()
        self.codebook_size = codebook_size
        self.emb_dim = emb_dim
        self.beta = beta
        self.embedding = nn.Embedding(self.codebook_size, self.emb_dim)
        bound = 1.0 / self.codebook_size
        self.embedding.weight.data.uniform_(-bound, bound)

    def forward(self, z):
        """Quantise ``z`` to its nearest codebook entries.

        Args:
            z: tensor in (batch, channel, height, width) layout with
                ``channel == emb_dim``.

        Returns:
            ``(z_q, loss, stats)``: the quantised tensor in the input
            layout (straight-through gradient to ``z``), the codebook plus
            commitment loss, and a dict with ``perplexity``,
            ``min_encodings``, ``min_encoding_indices``,
            ``min_encoding_scores`` and ``mean_distance``.
        """
        # (batch, channel, height, width) -> (batch, height, width, channel),
        # then one row per spatial position.
        z = z.permute(0, 2, 3, 1).contiguous()
        flat = z.view(-1, self.emb_dim)
        codebook = self.embedding.weight

        # squared distances: (z - e)^2 = z^2 + e^2 - 2 e * z
        sq_inputs = (flat ** 2).sum(dim=1, keepdim=True)
        sq_codes = (codebook ** 2).sum(1)
        d = sq_inputs + sq_codes - 2 * torch.matmul(flat, codebook.t())
        mean_distance = torch.mean(d)

        # closest codebook entry per row
        min_encoding_scores, min_encoding_indices = torch.topk(d, 1, dim=1, largest=False)
        # [0-1], higher score, higher confidence
        min_encoding_scores = torch.exp(-min_encoding_scores/10)

        # one-hot selection matrix
        min_encodings = torch.zeros(min_encoding_indices.shape[0], self.codebook_size).to(z)
        min_encodings.scatter_(1, min_encoding_indices, 1)

        quantized = torch.matmul(min_encodings, codebook).view(z.shape)
        codebook_term = torch.mean((quantized.detach()-z)**2)
        commitment_term = torch.mean((quantized - z.detach()) ** 2)
        loss = codebook_term + self.beta * commitment_term
        # preserve gradients (straight-through estimator)
        quantized = z + (quantized - z).detach()

        # perplexity of the code usage
        e_mean = torch.mean(min_encodings, dim=0)
        perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))

        # back to the original input layout
        z_q = quantized.permute(0, 3, 1, 2).contiguous()

        stats = {
            "perplexity": perplexity,
            "min_encodings": min_encodings,
            "min_encoding_indices": min_encoding_indices,
            "min_encoding_scores": min_encoding_scores,
            "mean_distance": mean_distance
        }
        return z_q, loss, stats

    def get_codebook_feat(self, indices, shape):
        """Look up codebook vectors for ``indices``.

        Args:
            indices: integer tensor of code indices; it is flattened.
            shape: ``(batch, height, width, channel)`` used to reshape the
                result to (batch, channel, height, width), or ``None`` to
                keep one row per index.
        """
        # batch*token_num -> (batch*token_num)*1
        flat_idx = indices.view(-1,1)
        one_hot = torch.zeros(flat_idx.shape[0], self.codebook_size).to(flat_idx)
        one_hot.scatter_(1, flat_idx, 1)
        z_q = torch.matmul(one_hot.float(), self.embedding.weight)

        if shape is None:
            return z_q
        # reshape back to match original input shape
        return z_q.view(shape).permute(0, 3, 1, 2).contiguous()
|
||||
|
||||
|
||||
class GumbelQuantizer(nn.Module):
    """Quantiser that samples codes with the Gumbel-softmax relaxation."""

    def __init__(self, codebook_size, emb_dim, num_hiddens, straight_through=False, kl_weight=5e-4, temp_init=1.0):
        """Create the logit projection and the codebook.

        Args:
            codebook_size: number of embeddings.
            emb_dim: dimension of each embedding.
            num_hiddens: channel count of the input feature map.
            straight_through: use hard samples while training.
            kl_weight: weight of the KL term returned by ``forward``.
            temp_init: Gumbel-softmax temperature.
        """
        super().__init__()
        self.codebook_size = codebook_size
        self.emb_dim = emb_dim
        self.straight_through = straight_through
        self.temperature = temp_init
        self.kl_weight = kl_weight
        # projects last encoder layer to quantized logits
        self.proj = nn.Conv2d(num_hiddens, codebook_size, 1)
        self.embed = nn.Embedding(codebook_size, emb_dim)

    def forward(self, z):
        """Return ``(z_q, kl_loss, {"min_encoding_indices": indices})``.

        Hard (one-hot) samples are always used in eval mode; in training
        mode they are used only when ``straight_through`` is set.
        """
        if self.training:
            hard = self.straight_through
        else:
            hard = True

        logits = self.proj(z)
        soft_one_hot = F.gumbel_softmax(logits, tau=self.temperature, dim=1, hard=hard)
        z_q = torch.einsum("b n h w, n d -> b d h w", soft_one_hot, self.embed.weight)

        # KL divergence of the code distribution to the uniform prior
        qy = F.softmax(logits, dim=1)
        kl_per_position = torch.sum(qy * torch.log(qy * self.codebook_size + 1e-10), dim=1)
        diff = self.kl_weight * kl_per_position.mean()

        stats = {
            "min_encoding_indices": soft_one_hot.argmax(dim=1)
        }
        return z_q, diff, stats
|
||||
|
||||
|
||||
class Downsample(nn.Module):
    """Stride-2 3x3 convolution preceded by one pixel of right/bottom zero padding."""

    def __init__(self, in_channels):
        super().__init__()
        self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)

    def forward(self, x):
        """Pad the last two dims by (0, 1) each and apply the convolution."""
        # The convolution itself is unpadded; pad asymmetrically instead.
        padded = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)
|
||||
|
||||
|
||||
class Upsample(nn.Module):
    """Nearest-neighbour 2x upsampling followed by a 3x3 convolution."""

    def __init__(self, in_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Double the spatial size of ``x`` and apply the convolution."""
        enlarged = F.interpolate(x, scale_factor=2.0, mode="nearest")
        return self.conv(enlarged)
|
||||
|
||||
|
||||
class ResBlock(nn.Module):
    """Residual block: two (norm -> swish -> 3x3 conv) stages plus a skip path.

    When the input and output channel counts differ, the skip path goes
    through a 1x1 convolution so both branches can be added.
    """

    def __init__(self, in_channels, out_channels=None):
        """Build the block.

        Args:
            in_channels: channel count of the input.
            out_channels: channel count of the output; defaults to
                ``in_channels``.
        """
        super(ResBlock, self).__init__()
        self.in_channels = in_channels
        self.out_channels = in_channels if out_channels is None else out_channels
        # Use the resolved channel count everywhere: passing the raw
        # ``out_channels`` crashed when the default ``None`` was used.
        self.norm1 = normalize(self.in_channels)
        self.conv1 = nn.Conv2d(self.in_channels, self.out_channels, kernel_size=3, stride=1, padding=1)
        self.norm2 = normalize(self.out_channels)
        self.conv2 = nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1, padding=1)
        if self.in_channels != self.out_channels:
            self.conv_out = nn.Conv2d(self.in_channels, self.out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x_in):
        """Return the residual branch added to the (possibly projected) input."""
        x = x_in
        x = self.norm1(x)
        x = swish(x)
        x = self.conv1(x)
        x = self.norm2(x)
        x = swish(x)
        x = self.conv2(x)
        if self.in_channels != self.out_channels:
            # project the skip path to the output channel count
            x_in = self.conv_out(x_in)

        return x + x_in
|
||||
|
||||
|
||||
class AttnBlock(nn.Module):
    """Single-head self-attention over all spatial positions with a residual add."""

    def __init__(self, in_channels):
        """Create the norm and the four 1x1 convolutions (q, k, v, output)."""
        super().__init__()
        self.in_channels = in_channels

        def _pointwise():
            # 1x1 convolution that keeps the channel count
            return torch.nn.Conv2d(
                in_channels,
                in_channels,
                kernel_size=1,
                stride=1,
                padding=0
            )

        self.norm = normalize(in_channels)
        self.q = _pointwise()
        self.k = _pointwise()
        self.v = _pointwise()
        self.proj_out = _pointwise()

    def forward(self, x):
        """Return ``x`` plus the projected attention output (same shape as ``x``)."""
        normed = self.norm(x)
        q = self.q(normed)
        k = self.k(normed)
        v = self.v(normed)

        # compute attention weights between every pair of positions
        b, c, h, w = q.shape
        positions = h*w
        q = q.reshape(b, c, positions).permute(0, 2, 1)
        k = k.reshape(b, c, positions)
        attn = torch.bmm(q, k) * (int(c)**(-0.5))
        attn = F.softmax(attn, dim=2)

        # attend to values
        v = v.reshape(b, c, positions)
        attended = torch.bmm(v, attn.permute(0, 2, 1)).reshape(b, c, h, w)

        return x+self.proj_out(attended)
|
||||
|
||||
|
||||
class Encoder(nn.Module):
    """Convolutional encoder stored as a flat ``ModuleList`` of blocks.

    The blocks are kept in ``self.blocks`` in execution order.
    """

    def __init__(self, in_channels, nf, emb_dim, ch_mult, num_res_blocks, resolution, attn_resolutions):
        """Assemble the block list.

        Args:
            in_channels: channel count of the input image.
            nf: base number of feature channels.
            emb_dim: channel count of the produced latent.
            ch_mult: per-resolution multipliers applied to ``nf``.
            num_res_blocks: residual blocks per resolution level.
            resolution: spatial size of the input.
            attn_resolutions: spatial sizes at which attention blocks are
                inserted after each residual block.
        """
        super().__init__()
        self.nf = nf
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.attn_resolutions = attn_resolutions

        curr_res = self.resolution
        # input multiplier of level i is the output multiplier of level i-1
        in_ch_mult = (1,)+tuple(ch_mult)
        last_level = self.num_resolutions - 1

        # initial convolution
        layers = [nn.Conv2d(in_channels, nf, kernel_size=3, stride=1, padding=1)]

        # residual and downsampling blocks, with attention on smaller res (16x16)
        for level in range(self.num_resolutions):
            block_in_ch = nf * in_ch_mult[level]
            block_out_ch = nf * ch_mult[level]
            for _ in range(self.num_res_blocks):
                layers.append(ResBlock(block_in_ch, block_out_ch))
                block_in_ch = block_out_ch
                if curr_res in attn_resolutions:
                    layers.append(AttnBlock(block_in_ch))

            if level != last_level:
                layers.append(Downsample(block_in_ch))
                curr_res = curr_res // 2

        # non-local attention block
        layers += [
            ResBlock(block_in_ch, block_in_ch),
            AttnBlock(block_in_ch),
            ResBlock(block_in_ch, block_in_ch),
        ]

        # normalise and convert to latent size
        layers += [
            normalize(block_in_ch),
            nn.Conv2d(block_in_ch, emb_dim, kernel_size=3, stride=1, padding=1),
        ]
        self.blocks = nn.ModuleList(layers)

    def forward(self, x):
        """Run ``x`` through every block in order and return the result."""
        out = x
        for layer in self.blocks:
            out = layer(out)
        return out
|
||||
|
||||
|
||||
class Generator(nn.Module):
    """Convolutional decoder stored as a flat ``ModuleList`` of blocks.

    The blocks are kept in ``self.blocks`` in execution order.
    """

    def __init__(self, nf, emb_dim, ch_mult, res_blocks, img_size, attn_resolutions):
        """Assemble the block list.

        Args:
            nf: base number of feature channels.
            emb_dim: channel count of the latent input.
            ch_mult: per-resolution multipliers applied to ``nf``; they are
                traversed from last to first.
            res_blocks: residual blocks per resolution level.
            img_size: spatial size of the generated image.
            attn_resolutions: spatial sizes at which attention blocks are
                inserted after each residual block.
        """
        super().__init__()
        self.nf = nf
        self.ch_mult = ch_mult
        self.num_resolutions = len(self.ch_mult)
        self.num_res_blocks = res_blocks
        self.resolution = img_size
        self.attn_resolutions = attn_resolutions
        self.in_channels = emb_dim
        self.out_channels = 3

        block_in_ch = self.nf * self.ch_mult[-1]
        # spatial size at the coarsest level
        curr_res = self.resolution // 2 ** (self.num_resolutions-1)

        layers = [
            # initial conv
            nn.Conv2d(self.in_channels, block_in_ch, kernel_size=3, stride=1, padding=1),
            # non-local attention block
            ResBlock(block_in_ch, block_in_ch),
            AttnBlock(block_in_ch),
            ResBlock(block_in_ch, block_in_ch),
        ]

        # walk the resolution levels from coarsest to finest
        for level in range(self.num_resolutions - 1, -1, -1):
            block_out_ch = self.nf * self.ch_mult[level]

            for _ in range(self.num_res_blocks):
                layers.append(ResBlock(block_in_ch, block_out_ch))
                block_in_ch = block_out_ch

                if curr_res in self.attn_resolutions:
                    layers.append(AttnBlock(block_in_ch))

            if level != 0:
                layers.append(Upsample(block_in_ch))
                curr_res = curr_res * 2

        # final norm and projection to the output channels
        layers.append(normalize(block_in_ch))
        layers.append(nn.Conv2d(block_in_ch, self.out_channels, kernel_size=3, stride=1, padding=1))

        self.blocks = nn.ModuleList(layers)

    def forward(self, x):
        """Run ``x`` through every block in order and return the result."""
        out = x
        for layer in self.blocks:
            out = layer(out)
        return out
|
||||
|
||||
|
||||
@ARCH_REGISTRY.register()
class VQAutoEncoder(nn.Module):
    """Encoder -> quantiser -> generator auto-encoder."""

    def __init__(self, img_size, nf, ch_mult, quantizer="nearest", res_blocks=2, attn_resolutions=[16], codebook_size=1024, emb_dim=256,
                beta=0.25, gumbel_straight_through=False, gumbel_kl_weight=1e-8, model_path=None):
        """Build the sub-modules and optionally load a checkpoint.

        Args:
            img_size: spatial size of the input images.
            nf: base number of feature channels.
            ch_mult: per-resolution channel multipliers.
            quantizer: ``"nearest"`` (``VectorQuantizer``) or ``"gumbel"``
                (``GumbelQuantizer``). Any other value leaves
                ``self.quantize`` undefined.
            res_blocks: residual blocks per resolution level.
            attn_resolutions: spatial sizes that get attention blocks
                (the default list is only read, never mutated).
            codebook_size: number of codebook entries.
            emb_dim: dimension of the latent / codebook vectors.
            beta: commitment cost for the nearest quantiser.
            gumbel_straight_through: hard sampling flag for the Gumbel
                quantiser.
            gumbel_kl_weight: KL weight for the Gumbel quantiser.
            model_path: optional checkpoint containing ``params_ema`` or
                ``params``.

        Raises:
            ValueError: if the checkpoint has neither expected key.
        """
        super().__init__()
        logger = get_root_logger()
        self.in_channels = 3
        self.nf = nf
        self.n_blocks = res_blocks
        self.codebook_size = codebook_size
        self.embed_dim = emb_dim
        self.ch_mult = ch_mult
        self.resolution = img_size
        self.attn_resolutions = attn_resolutions
        self.quantizer_type = quantizer
        self.encoder = Encoder(
            self.in_channels,
            self.nf,
            self.embed_dim,
            self.ch_mult,
            self.n_blocks,
            self.resolution,
            self.attn_resolutions
        )
        if self.quantizer_type == "nearest":
            self.beta = beta #0.25
            self.quantize = VectorQuantizer(self.codebook_size, self.embed_dim, self.beta)
        elif self.quantizer_type == "gumbel":
            self.gumbel_num_hiddens = emb_dim
            self.straight_through = gumbel_straight_through
            self.kl_weight = gumbel_kl_weight
            self.quantize = GumbelQuantizer(
                self.codebook_size,
                self.embed_dim,
                self.gumbel_num_hiddens,
                self.straight_through,
                self.kl_weight
            )
        self.generator = Generator(
            self.nf,
            self.embed_dim,
            self.ch_mult,
            self.n_blocks,
            self.resolution,
            self.attn_resolutions
        )

        if model_path is not None:
            # Deserialise the checkpoint once and reuse it; the file used to
            # be loaded a second time just to index into it.
            chkpt = torch.load(model_path, map_location='cpu')
            if 'params_ema' in chkpt:
                self.load_state_dict(chkpt['params_ema'])
                logger.info(f'vqgan is loaded from: {model_path} [params_ema]')
            elif 'params' in chkpt:
                self.load_state_dict(chkpt['params'])
                logger.info(f'vqgan is loaded from: {model_path} [params]')
            else:
                raise ValueError('Wrong params!')

    def forward(self, x):
        """Return ``(reconstruction, codebook_loss, quant_stats)`` for ``x``."""
        x = self.encoder(x)
        quant, codebook_loss, quant_stats = self.quantize(x)
        x = self.generator(quant)
        return x, codebook_loss, quant_stats
|
||||
|
||||
|
||||
|
||||
# patch based discriminator
|
||||
@ARCH_REGISTRY.register()
class VQGANDiscriminator(nn.Module):
    """Patch-based convolutional discriminator producing a 1-channel prediction map."""

    def __init__(self, nc=3, ndf=64, n_layers=4, model_path=None):
        """Build the convolution stack and optionally load a checkpoint.

        Args:
            nc: channel count of the input images.
            ndf: base number of filters.
            n_layers: number of stride-2 stages (the first one has no
                batch norm); the filter multiplier doubles per stage and
                is capped at 8.
            model_path: optional checkpoint containing ``params_d`` or
                ``params``.

        Raises:
            ValueError: if the checkpoint has neither expected key.
        """
        super().__init__()

        layers = [nn.Conv2d(nc, ndf, kernel_size=4, stride=2, padding=1), nn.LeakyReLU(0.2, True)]
        ndf_mult = 1
        ndf_mult_prev = 1
        for n in range(1, n_layers):  # gradually increase the number of filters
            ndf_mult_prev = ndf_mult
            ndf_mult = min(2 ** n, 8)
            layers += [
                nn.Conv2d(ndf * ndf_mult_prev, ndf * ndf_mult, kernel_size=4, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(ndf * ndf_mult),
                nn.LeakyReLU(0.2, True)
            ]

        ndf_mult_prev = ndf_mult
        ndf_mult = min(2 ** n_layers, 8)

        # last feature stage keeps the spatial size (stride 1)
        layers += [
            nn.Conv2d(ndf * ndf_mult_prev, ndf * ndf_mult, kernel_size=4, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(ndf * ndf_mult),
            nn.LeakyReLU(0.2, True)
        ]

        layers += [
            nn.Conv2d(ndf * ndf_mult, 1, kernel_size=4, stride=1, padding=1)]  # output 1 channel prediction map
        self.main = nn.Sequential(*layers)

        if model_path is not None:
            # Deserialise the checkpoint once and reuse it; the file used to
            # be loaded a second time just to index into it.
            chkpt = torch.load(model_path, map_location='cpu')
            if 'params_d' in chkpt:
                self.load_state_dict(chkpt['params_d'])
            elif 'params' in chkpt:
                self.load_state_dict(chkpt['params'])
            else:
                raise ValueError('Wrong params!')

    def forward(self, x):
        """Return the prediction map for the image batch ``x``."""
        return self.main(x)
|
||||
@@ -4,7 +4,7 @@ import copy
|
||||
import base64
|
||||
import mimetypes
|
||||
import os
|
||||
from ldm.dream.args import Args, format_metadata
|
||||
from ldm.dream.args import Args, metadata_dumps
|
||||
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
||||
from ldm.dream.pngwriter import PngWriter
|
||||
from threading import Event
|
||||
@@ -37,6 +37,8 @@ def build_opt(post_data, seed, gfpgan_model_exists):
|
||||
setattr(opt, 'seed', None if int(post_data['seed']) == -1 else int(post_data['seed']))
|
||||
setattr(opt, 'variation_amount', float(post_data['variation_amount']) if int(post_data['seed']) != -1 else 0)
|
||||
setattr(opt, 'with_variations', [])
|
||||
setattr(opt, 'embiggen', None)
|
||||
setattr(opt, 'embiggen_tiles', None)
|
||||
|
||||
broken = False
|
||||
if int(post_data['seed']) != -1 and post_data['with_variations'] != '':
|
||||
@@ -76,16 +78,15 @@ class DreamServer(BaseHTTPRequestHandler):
|
||||
self.send_response(200)
|
||||
self.send_header("Content-type", "text/html")
|
||||
self.end_headers()
|
||||
with open("./static/dream_web/index.html", "rb") as content:
|
||||
with open("./static/legacy_web/index.html", "rb") as content:
|
||||
self.wfile.write(content.read())
|
||||
elif self.path == "/config.js":
|
||||
# unfortunately this import can't be at the top level, since that would cause a circular import
|
||||
from ldm.gfpgan.gfpgan_tools import gfpgan_model_exists
|
||||
self.send_response(200)
|
||||
self.send_header("Content-type", "application/javascript")
|
||||
self.end_headers()
|
||||
config = {
|
||||
'gfpgan_model_exists': gfpgan_model_exists
|
||||
'gfpgan_model_exists': self.gfpgan_model_exists
|
||||
}
|
||||
self.wfile.write(bytes("let config = " + json.dumps(config) + ";\n", "utf-8"))
|
||||
elif self.path == "/run_log.json":
|
||||
@@ -94,7 +95,7 @@ class DreamServer(BaseHTTPRequestHandler):
|
||||
self.end_headers()
|
||||
output = []
|
||||
|
||||
log_file = os.path.join(self.outdir, "dream_web_log.txt")
|
||||
log_file = os.path.join(self.outdir, "legacy_web_log.txt")
|
||||
if os.path.exists(log_file):
|
||||
with open(log_file, "r") as log:
|
||||
for line in log:
|
||||
@@ -114,7 +115,7 @@ class DreamServer(BaseHTTPRequestHandler):
|
||||
else:
|
||||
path_dir = os.path.dirname(self.path)
|
||||
out_dir = os.path.realpath(self.outdir.rstrip('/'))
|
||||
if self.path.startswith('/static/dream_web/'):
|
||||
if self.path.startswith('/static/legacy_web/'):
|
||||
path = '.' + self.path
|
||||
elif out_dir.replace('\\', '/').endswith(path_dir):
|
||||
file = os.path.basename(self.path)
|
||||
@@ -138,14 +139,12 @@ class DreamServer(BaseHTTPRequestHandler):
|
||||
self.end_headers()
|
||||
|
||||
# unfortunately this import can't be at the top level, since that would cause a circular import
|
||||
from ldm.gfpgan.gfpgan_tools import gfpgan_model_exists
|
||||
|
||||
content_length = int(self.headers['Content-Length'])
|
||||
post_data = json.loads(self.rfile.read(content_length))
|
||||
opt = build_opt(post_data, self.model.seed, gfpgan_model_exists)
|
||||
opt = build_opt(post_data, self.model.seed, self.gfpgan_model_exists)
|
||||
|
||||
self.canceled.clear()
|
||||
print(f">> Request to generate with prompt: {opt.prompt}")
|
||||
# In order to handle upscaled images, the PngWriter needs to maintain state
|
||||
# across images generated by each call to prompt2img(), so we define it in
|
||||
# the outer scope of image_done()
|
||||
@@ -162,7 +161,7 @@ class DreamServer(BaseHTTPRequestHandler):
|
||||
# is complete. The upscaling replaces the original file, so the second
|
||||
# entry should not be inserted into the image list.
|
||||
# LS: This repeats code in dream.py
|
||||
def image_done(image, seed, upscaled=False):
|
||||
def image_done(image, seed, upscaled=False, first_seed=None):
|
||||
name = f'{prefix}.{seed}.png'
|
||||
iter_opt = copy.copy(opt)
|
||||
if opt.variation_amount > 0:
|
||||
@@ -176,10 +175,9 @@ class DreamServer(BaseHTTPRequestHandler):
|
||||
path = pngwriter.save_image_and_prompt_to_png(
|
||||
image,
|
||||
dream_prompt = formatted_prompt,
|
||||
metadata = format_metadata(iter_opt,
|
||||
seeds = [seed],
|
||||
weights = self.model.weights,
|
||||
model_hash = self.model.model_hash
|
||||
metadata = metadata_dumps(iter_opt,
|
||||
seeds = [seed],
|
||||
model_hash = self.model.model_hash
|
||||
),
|
||||
name = name,
|
||||
)
|
||||
@@ -188,7 +186,7 @@ class DreamServer(BaseHTTPRequestHandler):
|
||||
config['seed'] = seed
|
||||
# Append post_data to log, but only once!
|
||||
if not upscaled:
|
||||
with open(os.path.join(self.outdir, "dream_web_log.txt"), "a") as log:
|
||||
with open(os.path.join(self.outdir, "legacy_web_log.txt"), "a") as log:
|
||||
log.write(f"{path}: {json.dumps(config)}\n")
|
||||
|
||||
self.wfile.write(bytes(json.dumps(
|
||||
|
||||
557
ldm/generate.py
557
ldm/generate.py
@@ -15,20 +15,59 @@ import traceback
|
||||
import transformers
|
||||
import io
|
||||
import hashlib
|
||||
import cv2
|
||||
import skimage
|
||||
|
||||
from omegaconf import OmegaConf
|
||||
from PIL import Image, ImageOps
|
||||
from torch import nn
|
||||
from pytorch_lightning import seed_everything, logging
|
||||
|
||||
from ldm.util import instantiate_from_config
|
||||
from ldm.models.diffusion.ddim import DDIMSampler
|
||||
from ldm.models.diffusion.plms import PLMSSampler
|
||||
from ldm.util import instantiate_from_config
|
||||
from ldm.models.diffusion.ddim import DDIMSampler
|
||||
from ldm.models.diffusion.plms import PLMSSampler
|
||||
from ldm.models.diffusion.ksampler import KSampler
|
||||
from ldm.dream.pngwriter import PngWriter
|
||||
from ldm.dream.image_util import InitImageResizer
|
||||
from ldm.dream.devices import choose_torch_device
|
||||
from ldm.dream.conditioning import get_uc_and_c
|
||||
from ldm.dream.pngwriter import PngWriter
|
||||
from ldm.dream.args import metadata_from_png
|
||||
from ldm.dream.image_util import InitImageResizer
|
||||
from ldm.dream.devices import choose_torch_device, choose_precision
|
||||
from ldm.dream.conditioning import get_uc_and_c
|
||||
|
||||
def fix_func(orig):
    """Return ``orig``, patched to sample on the CPU when Apple MPS is active.

    When ``torch.backends.mps`` exists and reports itself available, the
    returned wrapper forces ``device="cpu"`` for the call and then moves the
    result to the device the caller asked for (``"mps"`` when none was
    given). On every other platform ``orig`` is returned untouched.

    The wrapper keeps the wrapped function's name/docstring and is marked so
    that passing an already-patched function through ``fix_func`` again is a
    no-op instead of stacking another wrapper on top.
    """
    import functools  # local import keeps this block self-contained

    if not (hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()):
        return orig
    if getattr(orig, '_mps_cpu_fallback', False):
        # already patched by an earlier fix_func() call
        return orig

    @functools.wraps(orig)
    def new_func(*args, **kw):
        device = kw.get("device", "mps")
        kw["device"] = "cpu"
        return orig(*args, **kw).to(device)

    new_func._mps_cpu_fallback = True
    return new_func


torch.rand = fix_func(torch.rand)
torch.rand_like = fix_func(torch.rand_like)
torch.randn = fix_func(torch.randn)
torch.randn_like = fix_func(torch.randn_like)
torch.randint = fix_func(torch.randint)
torch.randint_like = fix_func(torch.randint_like)
# NOTE(review): bernoulli/multinomial do not appear to accept a ``device``
# keyword in the public torch API -- confirm these two wrappers work on MPS.
torch.bernoulli = fix_func(torch.bernoulli)
torch.multinomial = fix_func(torch.multinomial)
|
||||
|
||||
def fix_func(orig):
    """Return ``orig``, patched to sample on the CPU when Apple MPS is active.

    When ``torch.backends.mps`` exists and reports itself available, the
    returned wrapper forces ``device="cpu"`` for the call and then moves the
    result to the device the caller asked for (``"mps"`` when none was
    given). On every other platform ``orig`` is returned untouched.

    The wrapper keeps the wrapped function's name/docstring and is marked so
    that passing an already-patched function through ``fix_func`` again is a
    no-op instead of stacking another wrapper on top.
    """
    import functools  # local import keeps this block self-contained

    if not (hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()):
        return orig
    if getattr(orig, '_mps_cpu_fallback', False):
        # already patched by an earlier fix_func() call
        return orig

    @functools.wraps(orig)
    def new_func(*args, **kw):
        device = kw.get("device", "mps")
        kw["device"] = "cpu"
        return orig(*args, **kw).to(device)

    new_func._mps_cpu_fallback = True
    return new_func


torch.rand = fix_func(torch.rand)
torch.rand_like = fix_func(torch.rand_like)
torch.randn = fix_func(torch.randn)
torch.randn_like = fix_func(torch.randn_like)
torch.randint = fix_func(torch.randint)
torch.randint_like = fix_func(torch.randint_like)
# NOTE(review): bernoulli/multinomial do not appear to accept a ``device``
# keyword in the public torch API -- confirm these two wrappers work on MPS.
torch.bernoulli = fix_func(torch.bernoulli)
torch.multinomial = fix_func(torch.multinomial)
|
||||
|
||||
def fix_func(orig):
|
||||
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
|
||||
@@ -102,7 +141,7 @@ gr = Generate(
|
||||
# these values are set once and shouldn't be changed
|
||||
conf = path to configuration file ('configs/models.yaml')
|
||||
model = symbolic name of the model in the configuration file
|
||||
full_precision = False
|
||||
precision = float precision to be used
|
||||
|
||||
# this value is sticky and maintained between generation calls
|
||||
sampler_name = ['ddim', 'k_dpm_2_a', 'k_dpm_2', 'k_euler_a', 'k_euler', 'k_heun', 'k_lms', 'plms'] // k_lms
|
||||
@@ -128,9 +167,13 @@ class Generate:
|
||||
sampler_name = 'k_lms',
|
||||
ddim_eta = 0.0, # deterministic
|
||||
full_precision = False,
|
||||
precision = 'auto',
|
||||
# these are deprecated; if present they override values in the conf file
|
||||
weights = None,
|
||||
config = None,
|
||||
gfpgan=None,
|
||||
codeformer=None,
|
||||
esrgan=None
|
||||
):
|
||||
models = OmegaConf.load(conf)
|
||||
mconfig = models[model]
|
||||
@@ -143,7 +186,7 @@ class Generate:
|
||||
self.cfg_scale = 7.5
|
||||
self.sampler_name = sampler_name
|
||||
self.ddim_eta = 0.0 # same seed always produces same image
|
||||
self.full_precision = True if choose_torch_device() == 'mps' else full_precision
|
||||
self.precision = precision
|
||||
self.strength = 0.75
|
||||
self.seamless = False
|
||||
self.embedding_path = embedding_path
|
||||
@@ -154,12 +197,23 @@ class Generate:
|
||||
self.generators = {}
|
||||
self.base_generator = None
|
||||
self.seed = None
|
||||
self.gfpgan = gfpgan
|
||||
self.codeformer = codeformer
|
||||
self.esrgan = esrgan
|
||||
|
||||
# Note that in previous versions, there was an option to pass the
|
||||
# device to Generate(). However the device was then ignored, so
|
||||
# it wasn't actually doing anything. This logic could be reinstated.
|
||||
device_type = choose_torch_device()
|
||||
self.device = torch.device(device_type)
|
||||
if full_precision:
|
||||
if self.precision != 'auto':
|
||||
raise ValueError('Remove --full_precision / -F if using --precision')
|
||||
print('Please remove deprecated --full_precision / -F')
|
||||
print('If auto config does not work you can use --precision=float32')
|
||||
self.precision = 'float32'
|
||||
if self.precision == 'auto':
|
||||
self.precision = choose_precision(self.device)
|
||||
|
||||
# for VRAM usage statistics
|
||||
self.session_peakmem = torch.cuda.max_memory_allocated() if self._has_cuda else None
|
||||
@@ -220,11 +274,15 @@ class Generate:
|
||||
init_mask = None,
|
||||
fit = False,
|
||||
strength = None,
|
||||
init_color = None,
|
||||
# these are specific to embiggen (which also relies on img2img args)
|
||||
embiggen = None,
|
||||
embiggen_tiles = None,
|
||||
out_direction = None,
|
||||
# these are specific to GFPGAN/ESRGAN
|
||||
facetool = None,
|
||||
gfpgan_strength = 0,
|
||||
codeformer_fidelity = None,
|
||||
save_original = False,
|
||||
upscale = None,
|
||||
# Set this True to handle KeyboardInterrupt internally
|
||||
@@ -269,16 +327,17 @@ class Generate:
|
||||
write the prompt into the PNG metadata.
|
||||
"""
|
||||
# TODO: convert this into a getattr() loop
|
||||
steps = steps or self.steps
|
||||
width = width or self.width
|
||||
height = height or self.height
|
||||
seamless = seamless or self.seamless
|
||||
cfg_scale = cfg_scale or self.cfg_scale
|
||||
ddim_eta = ddim_eta or self.ddim_eta
|
||||
iterations = iterations or self.iterations
|
||||
strength = strength or self.strength
|
||||
self.seed = seed
|
||||
steps = steps or self.steps
|
||||
width = width or self.width
|
||||
height = height or self.height
|
||||
seamless = seamless or self.seamless
|
||||
cfg_scale = cfg_scale or self.cfg_scale
|
||||
ddim_eta = ddim_eta or self.ddim_eta
|
||||
iterations = iterations or self.iterations
|
||||
strength = strength or self.strength
|
||||
self.seed = seed
|
||||
self.log_tokenization = log_tokenization
|
||||
self.step_callback = step_callback
|
||||
with_variations = [] if with_variations is None else with_variations
|
||||
|
||||
# will instantiate the model or return it from cache
|
||||
@@ -287,16 +346,17 @@ class Generate:
|
||||
for m in model.modules():
|
||||
if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
|
||||
m.padding_mode = 'circular' if seamless else m._orig_padding_mode
|
||||
|
||||
|
||||
assert cfg_scale > 1.0, 'CFG_Scale (-C) must be >1.0'
|
||||
assert (
|
||||
0.0 < strength < 1.0
|
||||
), 'img2img and inpaint strength can only work with 0.0 < strength < 1.0'
|
||||
assert (
|
||||
0.0 <= variation_amount <= 1.0
|
||||
0.0 <= variation_amount <= 1.0
|
||||
), '-v --variation_amount must be in [0.0, 1.0]'
|
||||
assert (
|
||||
(embiggen == None and embiggen_tiles == None) or ((embiggen != None or embiggen_tiles != None) and init_img != None)
|
||||
(embiggen == None and embiggen_tiles == None) or (
|
||||
(embiggen != None or embiggen_tiles != None) and init_img != None)
|
||||
), 'Embiggen requires an init/input image to be specified'
|
||||
|
||||
if len(with_variations) > 0 or variation_amount > 1.0:
|
||||
@@ -318,9 +378,9 @@ class Generate:
|
||||
if self._has_cuda():
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
|
||||
results = list()
|
||||
init_image = None
|
||||
mask_image = None
|
||||
results = list()
|
||||
init_image = None
|
||||
mask_image = None
|
||||
|
||||
try:
|
||||
uc, c = get_uc_and_c(
|
||||
@@ -329,8 +389,14 @@ class Generate:
|
||||
log_tokens =self.log_tokenization
|
||||
)
|
||||
|
||||
(init_image,mask_image) = self._make_images(init_img,init_mask, width, height, fit)
|
||||
|
||||
init_image,mask_image = self._make_images(
|
||||
init_img,
|
||||
init_mask,
|
||||
width,
|
||||
height,
|
||||
fit=fit,
|
||||
out_direction=out_direction,
|
||||
)
|
||||
if (init_image is not None) and (mask_image is not None):
|
||||
generator = self._make_inpaint()
|
||||
elif (embiggen != None or embiggen_tiles != None):
|
||||
@@ -340,32 +406,40 @@ class Generate:
|
||||
else:
|
||||
generator = self._make_txt2img()
|
||||
|
||||
generator.set_variation(self.seed, variation_amount, with_variations)
|
||||
generator.set_variation(
|
||||
self.seed, variation_amount, with_variations)
|
||||
results = generator.generate(
|
||||
prompt,
|
||||
iterations = iterations,
|
||||
seed = self.seed,
|
||||
sampler = self.sampler,
|
||||
steps = steps,
|
||||
cfg_scale = cfg_scale,
|
||||
conditioning = (uc,c),
|
||||
ddim_eta = ddim_eta,
|
||||
image_callback = image_callback, # called after the final image is generated
|
||||
step_callback = step_callback, # called after each intermediate image is generated
|
||||
width = width,
|
||||
height = height,
|
||||
init_img = init_img, # embiggen needs to manipulate from the unmodified init_img
|
||||
init_image = init_image, # notice that init_image is different from init_img
|
||||
mask_image = mask_image,
|
||||
strength = strength,
|
||||
embiggen = embiggen,
|
||||
embiggen_tiles = embiggen_tiles,
|
||||
iterations=iterations,
|
||||
seed=self.seed,
|
||||
sampler=self.sampler,
|
||||
steps=steps,
|
||||
cfg_scale=cfg_scale,
|
||||
conditioning=(uc, c),
|
||||
ddim_eta=ddim_eta,
|
||||
image_callback=image_callback, # called after the final image is generated
|
||||
step_callback=step_callback, # called after each intermediate image is generated
|
||||
width=width,
|
||||
height=height,
|
||||
init_img=init_img, # embiggen needs to manipulate from the unmodified init_img
|
||||
init_image=init_image, # notice that init_image is different from init_img
|
||||
mask_image=mask_image,
|
||||
strength=strength,
|
||||
embiggen=embiggen,
|
||||
embiggen_tiles=embiggen_tiles,
|
||||
)
|
||||
|
||||
if init_color:
|
||||
self.correct_colors(image_list = results,
|
||||
reference_image_path = init_color,
|
||||
image_callback = image_callback)
|
||||
|
||||
if upscale is not None or gfpgan_strength > 0:
|
||||
self.upscale_and_reconstruct(results,
|
||||
upscale = upscale,
|
||||
facetool = facetool,
|
||||
strength = gfpgan_strength,
|
||||
codeformer_fidelity = codeformer_fidelity,
|
||||
save_original = save_original,
|
||||
image_callback = image_callback)
|
||||
|
||||
@@ -381,7 +455,8 @@ class Generate:
|
||||
toc = time.time()
|
||||
print('>> Usage stats:')
|
||||
print(
|
||||
f'>> {len(results)} image(s) generated in', '%4.2fs' % (toc - tic)
|
||||
f'>> {len(results)} image(s) generated in', '%4.2fs' % (
|
||||
toc - tic)
|
||||
)
|
||||
if self._has_cuda():
|
||||
print(
|
||||
@@ -400,53 +475,193 @@ class Generate:
|
||||
)
|
||||
return results
|
||||
|
||||
def _make_images(self, img_path, mask_path, width, height, fit=False):
|
||||
# this needs to be generalized to all sorts of postprocessors, which should be wrapped
|
||||
# in a nice harmonized call signature. For now we have a bunch of if/elses!
|
||||
def apply_postprocessor(
        self,
        image_path,
        tool='gfpgan',  # one of 'upscale', 'gfpgan', 'codeformer', 'outpaint', or 'embiggen'
        gfpgan_strength=0.0,
        codeformer_fidelity=0.75,
        upscale=None,
        out_direction=None,
        save_original=True,  # to get new name
        callback=None,
        opt=None,
):
    """Run one postprocessing tool on an already-generated image file.

    The seed and prompt are recovered from the PNG metadata when possible;
    if that fails, the seed is taken from a trailing ``<digits>.png`` in the
    file name, and finally falls back to 42.

    Parameters:
        image_path: path of the image to postprocess.
        tool: 'gfpgan', 'codeformer' and 'upscale' are routed to
            upscale_and_reconstruct(); 'embiggen' runs the embiggen
            generator; 'outpaint' re-runs prompt2image() on the image.
        gfpgan_strength, codeformer_fidelity, upscale, save_original:
            forwarded to upscale_and_reconstruct().
        out_direction: accepted for interface compatibility; the outpaint
            branch reads ``opt.out_direction`` instead.
        callback: forwarded as ``image_callback`` to the selected tool.
        opt: options object; its attributes are read (and ``strength`` /
            ``init_img`` overwritten) by the 'embiggen' and 'outpaint'
            branches, so it must be supplied for those tools.

    Returns:
        Whatever the delegated call returns for the face/upscale and
        outpaint tools; None for 'embiggen', for a grid image, for an
        unsupported tool, or when outpainting without a recoverable prompt.
    """
    seed = None
    prompt = None
    try:
        args = metadata_from_png(image_path)
        # The module docs describe a list of option objects, but other call
        # sites treat the result as a single object -- accept both shapes.
        if isinstance(args, (list, tuple)):
            if len(args) > 1:
                print("* Can't postprocess a grid")
                return None
            args = args[0]
        seed = args.seed
        prompt = args.prompt
        print(f'>> retrieved seed {seed} and prompt "{prompt}" from {image_path}')
    except Exception:
        # no usable metadata: fall back to the seed embedded in the file name
        m = re.search(r'(\d+)\.png$', image_path)
        if m:
            seed = int(m.group(1))

    # compare against None so that a legitimate seed of 0 is kept
    if seed is None:
        print('* Could not recover seed for image. Replacing with 42. This will not affect image quality')
        seed = 42

    # face fixers and esrgan take an Image, but embiggen takes a path
    image = Image.open(image_path)

    # Note that we need to adopt a uniform API for the postprocessors.
    if tool in ('gfpgan', 'codeformer', 'upscale'):
        if tool == 'upscale':
            facetool = 'gfpgan'  # but won't be run, because strength is 0
            gfpgan_strength = 0
        else:
            facetool = tool
        return self.upscale_and_reconstruct(
            [[image, seed]],
            facetool=facetool,
            strength=gfpgan_strength,
            codeformer_fidelity=codeformer_fidelity,
            save_original=save_original,
            upscale=upscale,
            image_callback=callback,
        )

    elif tool == 'embiggen':
        generator = self._make_embiggen()
        uc, c = get_uc_and_c(
            prompt, model=self.model,
            skip_normalize=opt.skip_normalize,
            log_tokens=opt.log_tokenization
        )
        opt.strength = 0.40
        print(f'>> Setting img2img strength to {opt.strength} for happy embiggening')
        # embiggen takes a image path (sigh)
        generator.generate(
            prompt,
            sampler=self.sampler,
            steps=opt.steps,
            cfg_scale=opt.cfg_scale,
            ddim_eta=self.ddim_eta,
            conditioning=(uc, c),
            init_img=image_path,  # not the Image! (sigh)
            init_image=image,  # embiggen wants both! (sigh)
            strength=opt.strength,
            width=opt.width,
            height=opt.height,
            embiggen=opt.embiggen,
            embiggen_tiles=opt.embiggen_tiles,
            image_callback=callback,
        )

    elif tool == 'outpaint':
        # reuse the prompt recovered above instead of re-reading the metadata
        if prompt is None:
            print(f'* Could not recover prompt from {image_path}; outpainting skipped')
            return None
        opt.strength = 0.83
        opt.init_img = image_path
        return self.prompt2image(
            prompt,
            out_direction=opt.out_direction,
            sampler=self.sampler,
            steps=opt.steps,
            cfg_scale=opt.cfg_scale,
            ddim_eta=self.ddim_eta,
            conditioning=get_uc_and_c(
                prompt, model=self.model,
                skip_normalize=opt.skip_normalize,
                log_tokens=opt.log_tokenization
            ),
            width=opt.width,
            height=opt.height,
            init_img=image_path,  # not the Image! (sigh)
            strength=opt.strength,
            image_callback=callback,
        )
    else:
        print(f'* postprocessing tool {tool} is not yet supported')
        return None
|
||||
|
||||
|
||||
def _make_images(
|
||||
self,
|
||||
img,
|
||||
mask,
|
||||
width,
|
||||
height,
|
||||
fit=False,
|
||||
out_direction=None,
|
||||
):
|
||||
init_image = None
|
||||
init_mask = None
|
||||
if not img_path:
|
||||
return None,None
|
||||
if not img:
|
||||
return None, None
|
||||
|
||||
image = self._load_img(img_path, width, height, fit=fit) # this returns an Image
|
||||
image = self._load_img(
|
||||
img,
|
||||
width,
|
||||
height,
|
||||
fit=fit
|
||||
) # this returns an Image
|
||||
if out_direction:
|
||||
image = self._create_outpaint_image(image, out_direction)
|
||||
init_image = self._create_init_image(image) # this returns a torch tensor
|
||||
|
||||
if self._has_transparency(image) and not mask_path: # if image has a transparent area and no mask was provided, then try to generate mask
|
||||
print('>> Initial image has transparent areas. Will inpaint in these regions.')
|
||||
# if image has a transparent area and no mask was provided, then try to generate mask
|
||||
if self._has_transparency(image) and not mask:
|
||||
print(
|
||||
'>> Initial image has transparent areas. Will inpaint in these regions.')
|
||||
if self._check_for_erasure(image):
|
||||
print(
|
||||
'>> WARNING: Colors underneath the transparent region seem to have been erased.\n',
|
||||
'>> Inpainting will be suboptimal. Please preserve the colors when making\n',
|
||||
'>> a transparency mask, or provide mask explicitly using --init_mask (-M).'
|
||||
)
|
||||
init_mask = self._create_init_mask(image) # this returns a torch tensor
|
||||
# this returns a torch tensor
|
||||
init_mask = self._create_init_mask(image)
|
||||
|
||||
if mask_path:
|
||||
mask_image = self._load_img(mask_path, width, height, fit=fit) # this returns an Image
|
||||
init_mask = self._create_init_mask(mask_image)
|
||||
if mask:
|
||||
mask_image = self._load_img(
|
||||
mask, width, height, fit=fit) # this returns an Image
|
||||
init_mask = self._create_init_mask(mask_image)
|
||||
|
||||
return init_image,init_mask
|
||||
return init_image, init_mask
|
||||
|
||||
def _make_base(self):
|
||||
if not self.generators.get('base'):
|
||||
from ldm.dream.generator import Generator
|
||||
self.generators['base'] = Generator(self.model, self.precision)
|
||||
return self.generators['base']
|
||||
|
||||
def _make_img2img(self):
|
||||
if not self.generators.get('img2img'):
|
||||
from ldm.dream.generator.img2img import Img2Img
|
||||
self.generators['img2img'] = Img2Img(self.model)
|
||||
self.generators['img2img'] = Img2Img(self.model, self.precision)
|
||||
return self.generators['img2img']
|
||||
|
||||
|
||||
def _make_embiggen(self):
|
||||
if not self.generators.get('embiggen'):
|
||||
from ldm.dream.generator.embiggen import Embiggen
|
||||
self.generators['embiggen'] = Embiggen(self.model)
|
||||
self.generators['embiggen'] = Embiggen(self.model, self.precision)
|
||||
return self.generators['embiggen']
|
||||
|
||||
def _make_txt2img(self):
|
||||
if not self.generators.get('txt2img'):
|
||||
from ldm.dream.generator.txt2img import Txt2Img
|
||||
self.generators['txt2img'] = Txt2Img(self.model)
|
||||
self.generators['txt2img'] = Txt2Img(self.model, self.precision)
|
||||
self.generators['txt2img'].free_gpu_mem = self.free_gpu_mem
|
||||
return self.generators['txt2img']
|
||||
|
||||
def _make_inpaint(self):
|
||||
if not self.generators.get('inpaint'):
|
||||
from ldm.dream.generator.inpaint import Inpaint
|
||||
self.generators['inpaint'] = Inpaint(self.model)
|
||||
self.generators['inpaint'] = Inpaint(self.model, self.precision)
|
||||
return self.generators['inpaint']
|
||||
|
||||
def load_model(self):
|
||||
@@ -457,7 +672,7 @@ class Generate:
|
||||
model = self._load_model_from_config(self.config, self.weights)
|
||||
if self.embedding_path is not None:
|
||||
model.embedding_manager.load(
|
||||
self.embedding_path, self.full_precision
|
||||
self.embedding_path, self.precision == 'float32' or self.precision == 'autocast'
|
||||
)
|
||||
self.model = model.to(self.device)
|
||||
# model.to doesn't change the cond_stage_model.device used to move the tokenizer output, so set it here
|
||||
@@ -475,38 +690,63 @@ class Generate:
|
||||
|
||||
return self.model
|
||||
|
||||
def correct_colors(self,
                   image_list,
                   reference_image_path,
                   image_callback=None):
    """Match the colour histogram of each image to a reference image.

    Both the reference and each image are converted to LAB, the image's
    histograms are matched to the reference per channel, and the result is
    converted back to an RGB ``Image``.

    Parameters:
        image_list: iterable of ``[image, seed]`` entries.
        reference_image_path: file opened with ``Image.open`` and used as
            the colour reference.
        image_callback: when given, called as ``image_callback(image, seed)``
            for every corrected image; otherwise the corrected image
            replaces element 0 of its entry in ``image_list``.
    """
    # Import the submodule explicitly rather than relying on
    # ``skimage.exposure`` being loaded by a bare ``import skimage``.
    from skimage.exposure import match_histograms

    # the context manager releases the file handle once the pixels are read
    with Image.open(reference_image_path) as reference_image:
        correction_target = cv2.cvtColor(np.asarray(reference_image),
                                         cv2.COLOR_RGB2LAB)
    for r in image_list:
        image, seed = r
        image = cv2.cvtColor(np.asarray(image),
                             cv2.COLOR_RGB2LAB)
        image = match_histograms(image,
                                 correction_target,
                                 channel_axis=2)
        image = Image.fromarray(
            cv2.cvtColor(image, cv2.COLOR_LAB2RGB).astype("uint8")
        )
        if image_callback is not None:
            image_callback(image, seed)
        else:
            r[0] = image
|
||||
|
||||
def upscale_and_reconstruct(self,
|
||||
image_list,
|
||||
facetool = 'gfpgan',
|
||||
upscale = None,
|
||||
strength = 0.0,
|
||||
codeformer_fidelity = 0.75,
|
||||
save_original = False,
|
||||
image_callback = None):
|
||||
try:
|
||||
if upscale is not None:
|
||||
from ldm.gfpgan.gfpgan_tools import real_esrgan_upscale
|
||||
if strength > 0:
|
||||
from ldm.gfpgan.gfpgan_tools import run_gfpgan
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
print(traceback.format_exc(), file=sys.stderr)
|
||||
print('>> You may need to install the ESRGAN and/or GFPGAN modules')
|
||||
return
|
||||
|
||||
for r in image_list:
|
||||
image, seed = r
|
||||
try:
|
||||
if upscale is not None:
|
||||
if len(upscale) < 2:
|
||||
upscale.append(0.75)
|
||||
image = real_esrgan_upscale(
|
||||
image,
|
||||
upscale[1],
|
||||
int(upscale[0]),
|
||||
seed,
|
||||
)
|
||||
if self.esrgan is not None:
|
||||
if len(upscale) < 2:
|
||||
upscale.append(0.75)
|
||||
image = self.esrgan.process(
|
||||
image, upscale[1], seed, int(upscale[0]))
|
||||
else:
|
||||
print(">> ESRGAN is disabled. Image not upscaled.")
|
||||
if strength > 0:
|
||||
image = run_gfpgan(
|
||||
image, strength, seed, 1
|
||||
)
|
||||
if self.gfpgan is not None or self.codeformer is not None:
|
||||
if facetool == 'gfpgan':
|
||||
if self.gfpgan is None:
|
||||
print('>> GFPGAN not found. Face restoration is disabled.')
|
||||
else:
|
||||
image = self.gfpgan.process(image, strength, seed)
|
||||
if facetool == 'codeformer':
|
||||
if self.codeformer is None:
|
||||
print('>> CodeFormer not found. Face restoration is disabled.')
|
||||
else:
|
||||
cf_device = 'cpu' if str(self.device) == 'mps' else self.device
|
||||
image = self.codeformer.process(image=image, strength=strength, device=cf_device, seed=seed, fidelity=codeformer_fidelity)
|
||||
else:
|
||||
print(">> Face Restoration is disabled.")
|
||||
except Exception as e:
|
||||
print(
|
||||
f'>> Error running RealESRGAN or GFPGAN. Your image was not upscaled.\n{e}'
|
||||
@@ -518,14 +758,8 @@ class Generate:
|
||||
r[0] = image
|
||||
|
||||
# to help WebGUI - front end to generator util function
|
||||
def sample_to_image(self,samples):
|
||||
return self._sample_to_image(samples)
|
||||
|
||||
def _sample_to_image(self,samples):
|
||||
if not self.base_generator:
|
||||
from ldm.dream.generator import Generator
|
||||
self.base_generator = Generator(self.model)
|
||||
return self.base_generator.sample_to_image(samples)
|
||||
def sample_to_image(self, samples):
|
||||
return self._make_base().sample_to_image(samples)
|
||||
|
||||
def _set_sampler(self):
|
||||
msg = f'>> Setting Sampler to {self.sampler_name}'
|
||||
@@ -564,7 +798,7 @@ class Generate:
|
||||
# for usage statistics
|
||||
device_type = choose_torch_device()
|
||||
if device_type == 'cuda':
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
tic = time.time()
|
||||
|
||||
# this does the work
|
||||
@@ -577,16 +811,13 @@ class Generate:
|
||||
sd = pl_sd['state_dict']
|
||||
model = instantiate_from_config(c.model)
|
||||
m, u = model.load_state_dict(sd, strict=False)
|
||||
|
||||
if self.full_precision:
|
||||
print(
|
||||
'>> Using slower but more accurate full-precision math (--full_precision)'
|
||||
)
|
||||
|
||||
if self.precision == 'float16':
|
||||
print('>> Using faster float16 precision')
|
||||
model.to(torch.float16)
|
||||
else:
|
||||
print(
|
||||
'>> Using half precision math. Call with --full_precision to use more accurate but VRAM-intensive full precision.'
|
||||
)
|
||||
model.half()
|
||||
print('>> Using more accurate float32 precision')
|
||||
|
||||
model.to(self.device)
|
||||
model.eval()
|
||||
|
||||
@@ -605,22 +836,32 @@ class Generate:
|
||||
|
||||
return model
|
||||
|
||||
def _load_img(self, img, width, height, fit=False):
    """Obtain an ``Image`` from ``img`` and size it for generation.

    Parameters:
        img: an ``Image.Image`` (used as-is), a ``str`` path (must exist),
            or anything else ``Image.open`` accepts.
        width, height: the bounding box used when ``fit`` is true.
        fit: when true the image goes through _fit_image(); otherwise
            through _squeeze_image().

    Returns:
        The image returned by _fit_image() or _squeeze_image().

    Raises:
        AssertionError: if ``img`` is a string naming a missing file.
    """
    if isinstance(img, Image.Image):
        image = img
        print(
            f'>> using provided input image of size {image.width}x{image.height}'
        )
    elif isinstance(img, str):
        # kept as an assert so the failure type seen by callers is unchanged
        assert os.path.exists(img), f'>> {img}: File not found'

        image = Image.open(img)
        print(
            f'>> loaded input image of size {image.width}x{image.height} from {img}'
        )
    else:
        image = Image.open(img)
        print(
            f'>> loaded input image of size {image.width}x{image.height}'
        )

    if fit:
        image = self._fit_image(image, (width, height))
    else:
        image = self._squeeze_image(image)
    return image
|
||||
|
||||
def _create_init_image(self,image):
|
||||
def _create_init_image(self, image):
|
||||
image = image.convert('RGB')
|
||||
# print(
|
||||
# f'>> DEBUG: writing the image to img.png'
|
||||
@@ -629,16 +870,77 @@ class Generate:
|
||||
image = np.array(image).astype(np.float32) / 255.0
|
||||
image = image[None].transpose(0, 3, 1, 2)
|
||||
image = torch.from_numpy(image)
|
||||
image = 2.0 * image - 1.0
|
||||
image = 2.0 * image - 1.0
|
||||
return image.to(self.device)
|
||||
|
||||
# TODO: outpainting is a post-processing application and should be made to behave
|
||||
# like the other ones.
|
||||
def _create_outpaint_image(self, image, direction_args):
    """Shift ``image`` away from one edge and open a transparent band there.

    The image is rotated so that the requested side is on top, a mirrored
    copy of the top ``pixels`` rows is pasted there with alpha 0, the
    original content is pasted ``pixels`` rows lower, every second column
    around the seam is made transparent as a dither, and the image is
    rotated back. The output has the same size as the (RGBA) input.

    Parameters:
        image: the source ``Image``; it is converted to RGBA.
        direction_args: ``[direction]`` or ``[direction, pixels]`` where
            direction is 'top', 'left', 'bottom' or 'right'. When pixels
            is omitted, half of the (rotated) image height is used.

    Raises:
        AssertionError: on a bad argument count, an unknown direction, or a
            pixel count outside ``0 < pixels < height``.
    """
    assert len(direction_args) in [1, 2], 'Direction (-D) must have exactly one or two arguments.'

    direction = direction_args[0]
    pixels = int(direction_args[1]) if len(direction_args) == 2 else None

    assert direction in ['top', 'left', 'bottom', 'right'], 'Direction (-D) must be one of "top", "left", "bottom", "right"'

    image = image.convert("RGBA")
    # we always extend top, but rotate to extend along the requested side
    if direction == 'left':
        image = image.transpose(Image.Transpose.ROTATE_270)
    elif direction == 'bottom':
        image = image.transpose(Image.Transpose.ROTATE_180)
    elif direction == 'right':
        image = image.transpose(Image.Transpose.ROTATE_90)

    pixels = image.height//2 if pixels is None else int(pixels)
    assert 0 < pixels < image.height, 'Direction (-D) pixels length must be in the range 0 - image.size'

    # the top part of the image is taken from the source image mirrored
    # coordinates (0,0) are the upper left corner of an image
    top = image.transpose(Image.Transpose.FLIP_TOP_BOTTOM).convert("RGBA")
    top = top.crop((0, top.height - pixels, top.width, top.height))

    # setting all alpha of the top part to 0
    alpha = top.getchannel("A")
    alpha.paste(0, (0, 0, top.width, top.height))
    top.putalpha(alpha)

    # taking the bottom from the original image
    bottom = image.crop((0, 0, image.width, image.height - pixels))

    new_img = image.copy()
    new_img.paste(top, (0, 0))
    new_img.paste(bottom, (0, pixels))

    # create a 10% dither in the middle
    dither = min(image.height//10, pixels)
    # clamp the lower edge of the dither band to the image: pixels + dither
    # can exceed the height when pixels is close to it
    dither_end = min(pixels + dither, image.height)
    for x in range(0, image.width, 2):
        for y in range(pixels - dither, dither_end):
            (r, g, b, a) = new_img.getpixel((x, y))
            new_img.putpixel((x, y), (r, g, b, 0))

    # let's rotate back again
    if direction == 'left':
        new_img = new_img.transpose(Image.Transpose.ROTATE_90)
    elif direction == 'bottom':
        new_img = new_img.transpose(Image.Transpose.ROTATE_180)
    elif direction == 'right':
        new_img = new_img.transpose(Image.Transpose.ROTATE_270)

    return new_img
|
||||
|
||||
def _create_init_mask(self, image):
|
||||
# convert into a black/white mask
|
||||
image = self._image_to_mask(image)
|
||||
image = image.convert('RGB')
|
||||
# BUG: We need to use the model's downsample factor rather than hardcoding "8"
|
||||
from ldm.dream.generator.base import downsampling
|
||||
image = image.resize((image.width//downsampling, image.height//downsampling), resample=Image.Resampling.LANCZOS)
|
||||
image = image.resize((image.width//downsampling, image.height //
|
||||
downsampling), resample=Image.Resampling.NEAREST)
|
||||
# print(
|
||||
# f'>> DEBUG: writing the mask to mask.png'
|
||||
# )
|
||||
@@ -660,7 +962,7 @@ class Generate:
|
||||
mask = ImageOps.invert(mask)
|
||||
return mask
|
||||
|
||||
def _has_transparency(self,image):
|
||||
def _has_transparency(self, image):
|
||||
if image.info.get("transparency", None) is not None:
|
||||
return True
|
||||
if image.mode == "P":
|
||||
@@ -674,11 +976,10 @@ class Generate:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _check_for_erasure(self,image):
|
||||
def _check_for_erasure(self, image):
|
||||
width, height = image.size
|
||||
pixdata = image.load()
|
||||
colored = 0
|
||||
pixdata = image.load()
|
||||
colored = 0
|
||||
for y in range(height):
|
||||
for x in range(width):
|
||||
if pixdata[x, y][3] == 0:
|
||||
@@ -688,28 +989,28 @@ class Generate:
|
||||
colored += 1
|
||||
return colored == 0
|
||||
|
||||
def _squeeze_image(self,image):
|
||||
x,y,resize_needed = self._resolution_check(image.width,image.height)
|
||||
def _squeeze_image(self, image):
|
||||
x, y, resize_needed = self._resolution_check(image.width, image.height)
|
||||
if resize_needed:
|
||||
return InitImageResizer(image).resize(x,y)
|
||||
return InitImageResizer(image).resize(x, y)
|
||||
return image
|
||||
|
||||
|
||||
def _fit_image(self, image, max_dimensions):
    """Resize ``image`` to fit inside the ``(width, height)`` box given.

    The shorter side's limit is dropped (set to None) so that
    InitImageResizer fits the longer side and derives the other one; a
    square image keeps both limits.
    """
    w, h = max_dimensions
    print(
        f'>> image will be resized to fit inside a box {w}x{h} in size.'
    )
    landscape = image.width > image.height
    portrait = image.height > image.width
    if landscape:
        # by setting h to none, we tell InitImageResizer to fit into the width and calculate height
        h = None
    elif portrait:
        # ditto for w
        w = None
    # note that InitImageResizer does the multiple of 64 truncation internally
    resized = InitImageResizer(image).resize(w, h)
    print(
        f'>> after adjusting image dimensions to be multiples of 64, init image is {resized.width}x{resized.height}'
    )
    return resized
|
||||
|
||||
def _resolution_check(self, width, height, log=False):
|
||||
@@ -723,7 +1024,7 @@ class Generate:
|
||||
f'>> Provided width and height must be multiples of 64. Auto-resizing to {w}x{h}'
|
||||
)
|
||||
height = h
|
||||
width = w
|
||||
width = w
|
||||
resize_needed = True
|
||||
|
||||
if (width * height) > (self.width * self.height):
|
||||
|
||||
@@ -1,168 +0,0 @@
|
||||
import torch
|
||||
import warnings
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
|
||||
from PIL import Image
|
||||
#from scripts.dream import create_argv_parser
|
||||
from ldm.dream.args import Args
|
||||
|
||||
opt = Args()
|
||||
opt.parse_args()
|
||||
model_path = os.path.join(opt.gfpgan_dir, opt.gfpgan_model_path)
|
||||
gfpgan_model_exists = os.path.isfile(model_path)
|
||||
|
||||
def run_gfpgan(image, strength, seed, upsampler_scale=4):
    """Restore faces in ``image`` with GFPGAN and return the result.

    Parameters:
        image: PIL image to process (it is converted to RGB before enhancement).
        strength: blend factor between the original and the restored image;
            values below 1.0 blend the two via ``Image.blend``, otherwise the
            restored image is returned as-is.
        seed: only used in the progress message.
        upsampler_scale: passed to ``GFPGANer`` as ``upscale`` and to the
            background-upsampler loader.

    Returns:
        The restored PIL image, or the unmodified input ``image`` when GFPGAN
        could not be initialized (missing model file, import error, ...).
    """
    print(f'>> GFPGAN - Restoring Faces for image seed:{seed}')
    gfpgan = None
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        warnings.filterwarnings('ignore', category=UserWarning)

        try:
            if not gfpgan_model_exists:
                raise Exception('GFPGAN model not found at path ' + model_path)

            # Make the GFPGAN checkout importable; only add the entry once so
            # repeated calls do not grow sys.path without bound.
            gfpgan_path = os.path.abspath(opt.gfpgan_dir)
            if gfpgan_path not in sys.path:
                sys.path.append(gfpgan_path)
            from gfpgan import GFPGANer

            bg_upsampler = _load_gfpgan_bg_upsampler(
                opt.gfpgan_bg_upsampler, upsampler_scale, opt.gfpgan_bg_tile
            )

            gfpgan = GFPGANer(
                model_path=model_path,
                upscale=upsampler_scale,
                arch='clean',
                channel_multiplier=2,
                bg_upsampler=bg_upsampler,
            )
        except Exception:
            # Best effort: report the failure and fall through to the
            # "not initialized" path below instead of aborting generation.
            import traceback

            print('>> Error loading GFPGAN:', file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)

    if gfpgan is None:
        print('>> WARNING: GFPGAN not initialized.')
        print(
            f'>> Download https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth to {model_path}, \nor change GFPGAN directory with --gfpgan_dir.'
        )
        return image

    image = image.convert('RGB')

    _cropped_faces, _restored_faces, restored_img = gfpgan.enhance(
        np.array(image, dtype=np.uint8),
        has_aligned=False,
        only_center_face=False,
        paste_back=True,
    )
    res = Image.fromarray(restored_img)

    if strength < 1.0:
        # Image.blend needs both images to have the same dimensions. Compare
        # the PIL sizes: the ndarray's ``.size`` is an element count, not a
        # (width, height) tuple, so comparing it to ``image.size`` is always
        # unequal.
        if res.size != image.size:
            image = image.resize(res.size)
        res = Image.blend(image, res, strength)

    # Drop the model reference first so that emptying the CUDA cache can
    # actually hand its memory back.
    gfpgan = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return res
|
||||
|
||||
|
||||
def _load_gfpgan_bg_upsampler(bg_upsampler, upsampler_scale, bg_tile=400):
|
||||
if bg_upsampler == 'realesrgan':
|
||||
if not torch.cuda.is_available(): # CPU or MPS on M1
|
||||
use_half_precision = False
|
||||
else:
|
||||
use_half_precision = True
|
||||
|
||||
model_path = {
|
||||
2: 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth',
|
||||
4: 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth',
|
||||
}
|
||||
|
||||
if upsampler_scale not in model_path:
|
||||
return None
|
||||
|
||||
from basicsr.archs.rrdbnet_arch import RRDBNet
|
||||
from realesrgan import RealESRGANer
|
||||
|
||||
if upsampler_scale == 4:
|
||||
model = RRDBNet(
|
||||
num_in_ch=3,
|
||||
num_out_ch=3,
|
||||
num_feat=64,
|
||||
num_block=23,
|
||||
num_grow_ch=32,
|
||||
scale=4,
|
||||
)
|
||||
if upsampler_scale == 2:
|
||||
model = RRDBNet(
|
||||
num_in_ch=3,
|
||||
num_out_ch=3,
|
||||
num_feat=64,
|
||||
num_block=23,
|
||||
num_grow_ch=32,
|
||||
scale=2,
|
||||
)
|
||||
|
||||
bg_upsampler = RealESRGANer(
|
||||
scale=upsampler_scale,
|
||||
model_path=model_path[upsampler_scale],
|
||||
model=model,
|
||||
tile=bg_tile,
|
||||
tile_pad=10,
|
||||
pre_pad=0,
|
||||
half=use_half_precision,
|
||||
)
|
||||
else:
|
||||
bg_upsampler = None
|
||||
|
||||
return bg_upsampler
|
||||
|
||||
|
||||
def real_esrgan_upscale(image, strength, upsampler_scale, seed):
    """Upscale ``image`` with Real-ESRGAN and return the result.

    Parameters:
        image: PIL image to upscale.
        strength: blend factor between the (resized) original and the
            upscaled image; values below 1.0 blend the two via ``Image.blend``.
        upsampler_scale: upscaling factor passed to the loader and to
            ``enhance`` as ``outscale``.
        seed: only used in the progress message.

    Returns:
        The upscaled PIL image, or the unmodified input ``image`` when the
        upsampler could not be loaded or is not available for the requested
        configuration.
    """
    print(
        f'>> Real-ESRGAN Upscaling seed:{seed} : scale:{upsampler_scale}x'
    )

    # Stays None if loading raises, or if the loader reports that no
    # upsampler is available for this backend/scale.
    upsampler = None
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        warnings.filterwarnings('ignore', category=UserWarning)

        try:
            upsampler = _load_gfpgan_bg_upsampler(
                opt.gfpgan_bg_upsampler, upsampler_scale, opt.gfpgan_bg_tile
            )
        except Exception:
            import traceback

            print('>> Error loading Real-ESRGAN:', file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)

    if upsampler is None:
        # Without this guard the call below fails with NameError (load
        # raised) or AttributeError (loader returned None).
        print('>> WARNING: Real-ESRGAN not initialized; returning the original image.')
        return image

    output, _img_mode = upsampler.enhance(
        np.array(image, dtype=np.uint8),
        outscale=upsampler_scale,
        alpha_upsampler=opt.gfpgan_bg_upsampler,
    )

    res = Image.fromarray(output)

    if strength < 1.0:
        # Image.blend needs both images to have the same dimensions. Compare
        # the PIL sizes: the ndarray's ``.size`` is an element count, not a
        # (width, height) tuple.
        if res.size != image.size:
            image = image.resize(res.size)
        res = Image.blend(image, res, strength)

    # Drop the model reference first so that emptying the CUDA cache can
    # actually hand its memory back.
    upsampler = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return res
|
||||
@@ -90,7 +90,7 @@ class LinearAttention(nn.Module):
|
||||
b, c, h, w = x.shape
|
||||
qkv = self.to_qkv(x)
|
||||
q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
|
||||
k = k.softmax(dim=-1)
|
||||
k = k.softmax(dim=-1)
|
||||
context = torch.einsum('bhdn,bhen->bhde', k, v)
|
||||
out = torch.einsum('bhde,bhdn->bhen', context, q)
|
||||
out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
|
||||
@@ -167,101 +167,85 @@ class CrossAttention(nn.Module):
|
||||
nn.Linear(inner_dim, query_dim),
|
||||
nn.Dropout(dropout)
|
||||
)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
self.einsum_op = self.einsum_op_cuda
|
||||
else:
|
||||
self.mem_total = psutil.virtual_memory().total / (1024**3)
|
||||
self.einsum_op = self.einsum_op_mps_v1 if self.mem_total >= 32 else self.einsum_op_mps_v2
|
||||
|
||||
def einsum_op_compvis(self, q, k, v, r1):
|
||||
s1 = einsum('b i d, b j d -> b i j', q, k) * self.scale # faster
|
||||
s2 = s1.softmax(dim=-1, dtype=q.dtype)
|
||||
del s1
|
||||
r1 = einsum('b i j, b j d -> b i d', s2, v)
|
||||
del s2
|
||||
return r1
|
||||
self.mem_total_gb = psutil.virtual_memory().total // (1 << 30)
|
||||
|
||||
def einsum_op_mps_v1(self, q, k, v, r1):
|
||||
def einsum_op_compvis(self, q, k, v):
|
||||
s = einsum('b i d, b j d -> b i j', q, k)
|
||||
s = s.softmax(dim=-1, dtype=s.dtype)
|
||||
return einsum('b i j, b j d -> b i d', s, v)
|
||||
|
||||
def einsum_op_slice_0(self, q, k, v, slice_size):
|
||||
r = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device, dtype=q.dtype)
|
||||
for i in range(0, q.shape[0], slice_size):
|
||||
end = i + slice_size
|
||||
r[i:end] = self.einsum_op_compvis(q[i:end], k[i:end], v[i:end])
|
||||
return r
|
||||
|
||||
def einsum_op_slice_1(self, q, k, v, slice_size):
|
||||
r = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device, dtype=q.dtype)
|
||||
for i in range(0, q.shape[1], slice_size):
|
||||
end = i + slice_size
|
||||
r[:, i:end] = self.einsum_op_compvis(q[:, i:end], k, v)
|
||||
return r
|
||||
|
||||
def einsum_op_mps_v1(self, q, k, v):
|
||||
if q.shape[1] <= 4096: # (512x512) max q.shape[1]: 4096
|
||||
r1 = self.einsum_op_compvis(q, k, v, r1)
|
||||
return self.einsum_op_compvis(q, k, v)
|
||||
else:
|
||||
slice_size = math.floor(2**30 / (q.shape[0] * q.shape[1]))
|
||||
for i in range(0, q.shape[1], slice_size):
|
||||
end = i + slice_size
|
||||
s1 = einsum('b i d, b j d -> b i j', q[:, i:end], k) * self.scale
|
||||
s2 = s1.softmax(dim=-1, dtype=r1.dtype)
|
||||
del s1
|
||||
r1[:, i:end] = einsum('b i j, b j d -> b i d', s2, v)
|
||||
del s2
|
||||
return r1
|
||||
return self.einsum_op_slice_1(q, k, v, slice_size)
|
||||
|
||||
def einsum_op_mps_v2(self, q, k, v, r1):
|
||||
if self.mem_total >= 8 and q.shape[1] <= 4096:
|
||||
r1 = self.einsum_op_compvis(q, k, v, r1)
|
||||
def einsum_op_mps_v2(self, q, k, v):
|
||||
if self.mem_total_gb > 8 and q.shape[1] <= 4096:
|
||||
return self.einsum_op_compvis(q, k, v)
|
||||
else:
|
||||
slice_size = 1
|
||||
for i in range(0, q.shape[0], slice_size):
|
||||
end = min(q.shape[0], i + slice_size)
|
||||
s1 = einsum('b i d, b j d -> b i j', q[i:end], k[i:end])
|
||||
s1 *= self.scale
|
||||
s2 = s1.softmax(dim=-1, dtype=r1.dtype)
|
||||
del s1
|
||||
r1[i:end] = einsum('b i j, b j d -> b i d', s2, v[i:end])
|
||||
del s2
|
||||
return r1
|
||||
|
||||
def einsum_op_cuda(self, q, k, v, r1):
|
||||
return self.einsum_op_slice_0(q, k, v, 1)
|
||||
|
||||
def einsum_op_tensor_mem(self, q, k, v, max_tensor_mb):
|
||||
size_mb = q.shape[0] * q.shape[1] * k.shape[1] * q.element_size() // (1 << 20)
|
||||
if size_mb <= max_tensor_mb:
|
||||
return self.einsum_op_compvis(q, k, v)
|
||||
div = 1 << int((size_mb - 1) / max_tensor_mb).bit_length()
|
||||
if div <= q.shape[0]:
|
||||
return self.einsum_op_slice_0(q, k, v, q.shape[0] // div)
|
||||
return self.einsum_op_slice_1(q, k, v, max(q.shape[1] // div, 1))
|
||||
|
||||
def einsum_op_cuda(self, q, k, v):
|
||||
stats = torch.cuda.memory_stats(q.device)
|
||||
mem_active = stats['active_bytes.all.current']
|
||||
mem_reserved = stats['reserved_bytes.all.current']
|
||||
mem_free_cuda, _ = torch.cuda.mem_get_info(torch.cuda.current_device())
|
||||
mem_free_cuda, _ = torch.cuda.mem_get_info(q.device)
|
||||
mem_free_torch = mem_reserved - mem_active
|
||||
mem_free_total = mem_free_cuda + mem_free_torch
|
||||
# Divide factor of safety as there's copying and fragmentation
|
||||
return self.einsum_op_tensor_mem(q, k, v, mem_free_total / 3.3 / (1 << 20))
|
||||
|
||||
gb = 1024 ** 3
|
||||
tensor_size = q.shape[0] * q.shape[1] * k.shape[1] * 4
|
||||
mem_required = tensor_size * 2.5
|
||||
steps = 1
|
||||
def einsum_op(self, q, k, v):
|
||||
if q.device.type == 'cuda':
|
||||
return self.einsum_op_cuda(q, k, v)
|
||||
|
||||
if mem_required > mem_free_total:
|
||||
steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
|
||||
if q.device.type == 'mps':
|
||||
if self.mem_total_gb >= 32:
|
||||
return self.einsum_op_mps_v1(q, k, v)
|
||||
return self.einsum_op_mps_v2(q, k, v)
|
||||
|
||||
if steps > 64:
|
||||
max_res = math.floor(math.sqrt(math.sqrt(mem_free_total / 2.5)) / 8) * 64
|
||||
raise RuntimeError(f'Not enough memory, use lower resolution (max approx. {max_res}x{max_res}). '
|
||||
f'Need: {mem_required/64/gb:0.1f}GB free, Have:{mem_free_total/gb:0.1f}GB free')
|
||||
|
||||
slice_size = q.shape[1] // steps if (q.shape[1] % steps) == 0 else q.shape[1]
|
||||
for i in range(0, q.shape[1], slice_size):
|
||||
end = min(q.shape[1], i + slice_size)
|
||||
s1 = einsum('b i d, b j d -> b i j', q[:, i:end], k) * self.scale
|
||||
s2 = s1.softmax(dim=-1, dtype=r1.dtype)
|
||||
del s1
|
||||
r1[:, i:end] = einsum('b i j, b j d -> b i d', s2, v)
|
||||
del s2
|
||||
return r1
|
||||
# Smaller slices are faster due to L2/L3/SLC caches.
|
||||
# Tested on i7 with 8MB L3 cache.
|
||||
return self.einsum_op_tensor_mem(q, k, v, 32)
|
||||
|
||||
def forward(self, x, context=None, mask=None):
|
||||
h = self.heads
|
||||
|
||||
q_in = self.to_q(x)
|
||||
q = self.to_q(x)
|
||||
context = default(context, x)
|
||||
k_in = self.to_k(context)
|
||||
v_in = self.to_v(context)
|
||||
device_type = 'mps' if x.device.type == 'mps' else 'cuda'
|
||||
k = self.to_k(context) * self.scale
|
||||
v = self.to_v(context)
|
||||
del context, x
|
||||
|
||||
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q_in, k_in, v_in))
|
||||
del q_in, k_in, v_in
|
||||
r1 = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device, dtype=q.dtype)
|
||||
r1 = self.einsum_op(q, k, v, r1)
|
||||
del q, k, v
|
||||
|
||||
r2 = rearrange(r1, '(b h) n d -> b n (h d)', h=h)
|
||||
del r1
|
||||
|
||||
return self.to_out(r2)
|
||||
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
|
||||
r = self.einsum_op(q, k, v)
|
||||
return self.to_out(rearrange(r, '(b h) n d -> b n (h d)', h=h))
|
||||
|
||||
|
||||
class BasicTransformerBlock(nn.Module):
|
||||
|
||||
@@ -3,6 +3,7 @@ import gc
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn.functional import silu
|
||||
import numpy as np
|
||||
from einops import rearrange
|
||||
|
||||
@@ -32,11 +33,6 @@ def get_timestep_embedding(timesteps, embedding_dim):
|
||||
return emb
|
||||
|
||||
|
||||
def nonlinearity(x):
    """Swish activation: multiply ``x`` elementwise by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate
|
||||
|
||||
|
||||
def Normalize(in_channels, num_groups=32):
    """Return an affine ``GroupNorm`` over ``in_channels`` channels (eps=1e-6)."""
    norm_layer = torch.nn.GroupNorm(
        num_groups,
        in_channels,
        eps=1e-6,
        affine=True,
    )
    return norm_layer
|
||||
|
||||
@@ -122,14 +118,14 @@ class ResnetBlock(nn.Module):
|
||||
|
||||
def forward(self, x, temb):
|
||||
h = self.norm1(x)
|
||||
h = nonlinearity(h)
|
||||
h = silu(h)
|
||||
h = self.conv1(h)
|
||||
|
||||
if temb is not None:
|
||||
h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
|
||||
h = h + self.temb_proj(silu(temb))[:,:,None,None]
|
||||
|
||||
h = self.norm2(h)
|
||||
h = nonlinearity(h)
|
||||
h = silu(h)
|
||||
h = self.dropout(h)
|
||||
h = self.conv2(h)
|
||||
|
||||
@@ -368,7 +364,7 @@ class Model(nn.Module):
|
||||
assert t is not None
|
||||
temb = get_timestep_embedding(t, self.ch)
|
||||
temb = self.temb.dense[0](temb)
|
||||
temb = nonlinearity(temb)
|
||||
temb = silu(temb)
|
||||
temb = self.temb.dense[1](temb)
|
||||
else:
|
||||
temb = None
|
||||
@@ -402,7 +398,7 @@ class Model(nn.Module):
|
||||
|
||||
# end
|
||||
h = self.norm_out(h)
|
||||
h = nonlinearity(h)
|
||||
h = silu(h)
|
||||
h = self.conv_out(h)
|
||||
return h
|
||||
|
||||
@@ -499,7 +495,7 @@ class Encoder(nn.Module):
|
||||
|
||||
# end
|
||||
h = self.norm_out(h)
|
||||
h = nonlinearity(h)
|
||||
h = silu(h)
|
||||
h = self.conv_out(h)
|
||||
return h
|
||||
|
||||
@@ -611,7 +607,7 @@ class Decoder(nn.Module):
|
||||
return h
|
||||
|
||||
h = self.norm_out(h)
|
||||
h = nonlinearity(h)
|
||||
h = silu(h)
|
||||
h = self.conv_out(h)
|
||||
if self.tanh_out:
|
||||
h = torch.tanh(h)
|
||||
@@ -649,7 +645,7 @@ class SimpleDecoder(nn.Module):
|
||||
x = layer(x)
|
||||
|
||||
h = self.norm_out(x)
|
||||
h = nonlinearity(h)
|
||||
h = silu(h)
|
||||
x = self.conv_out(h)
|
||||
return x
|
||||
|
||||
@@ -697,7 +693,7 @@ class UpsampleDecoder(nn.Module):
|
||||
if i_level != self.num_resolutions - 1:
|
||||
h = self.upsample_blocks[k](h)
|
||||
h = self.norm_out(h)
|
||||
h = nonlinearity(h)
|
||||
h = silu(h)
|
||||
h = self.conv_out(h)
|
||||
return h
|
||||
|
||||
@@ -873,7 +869,7 @@ class FirstStagePostProcessor(nn.Module):
|
||||
z_fs = self.encode_with_pretrained(x)
|
||||
z = self.proj_norm(z_fs)
|
||||
z = self.proj(z)
|
||||
z = nonlinearity(z)
|
||||
z = silu(z)
|
||||
|
||||
for submodel, downmodel in zip(self.model,self.downsampler):
|
||||
z = submodel(z,temb=None)
|
||||
|
||||
@@ -252,12 +252,6 @@ def normalization(channels):
|
||||
return GroupNorm32(32, channels)
|
||||
|
||||
|
||||
# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
|
||||
class SiLU(nn.Module):
    """Module form of SiLU: ``x * sigmoid(x)`` applied elementwise."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate
|
||||
|
||||
|
||||
class GroupNorm32(nn.GroupNorm):
    """GroupNorm that runs on a float32 copy of the input and casts back."""

    def forward(self, x):
        input_dtype = x.dtype
        # Normalize in float32, then restore the caller's dtype.
        normalized = super().forward(x.float())
        return normalized.type(input_dtype)
|
||||
|
||||
@@ -82,7 +82,9 @@ class EmbeddingManager(nn.Module):
|
||||
get_embedding_for_clip_token,
|
||||
embedder.transformer.text_model.embeddings,
|
||||
)
|
||||
token_dim = 1280
|
||||
# per bug report #572
|
||||
#token_dim = 1280
|
||||
token_dim = 768
|
||||
else: # using LDM's BERT encoder
|
||||
self.is_clip = False
|
||||
get_token_for_string = partial(
|
||||
|
||||
Reference in New Issue
Block a user