Compare commits


79 Commits

Author SHA1 Message Date
Lincoln Stein
2114c386ad moved index.js .html and .css files into static/dream_web/; changed batch to iterations again 2022-08-25 15:27:43 -04:00
Lincoln Stein
6d2b4cbda1 Merge branch 'main' of github.com:lstein/stable-diffusion into main 2022-08-25 15:15:07 -04:00
Lincoln Stein
562831fc4b Merge branch 'TesseractCat-main' into main 2022-08-25 15:14:50 -04:00
Lincoln Stein
d04518e65e resolved conflicts in use of batch vs iterations 2022-08-25 15:14:38 -04:00
Lincoln Stein
d598b6c79d Update README.md 2022-08-25 15:11:06 -04:00
Lincoln Stein
4ec21a5423 resolved conflicts 2022-08-25 15:09:55 -04:00
Lincoln Stein
b64c902354 added missing image 2022-08-25 15:06:10 -04:00
Lincoln Stein
2ada3288e7 Small cleanups.
- Quenched tokenizer warnings during model initialization.
- Changed "batch" to "iterations" for generating multiple images in
  order to conserve vram.
- Updated README.
- Moved static folder from under scripts to top level. Can store other
  static content there in future.
- Added screenshot of web server in action (to static folder).
2022-08-25 15:03:40 -04:00
tesseractcat
91966e9ffa Fix appearance on mobile 2022-08-25 15:01:08 -04:00
tesseractcat
2ad73246f9 Normalize working directory 2022-08-25 14:27:33 -04:00
tesseractcat
d3a802db69 Fix horizontal divider 2022-08-25 14:18:29 -04:00
tesseractcat
b95908daec Move style and script to individual files 2022-08-25 14:15:08 -04:00
Lincoln Stein
79add5f0b6 Merge branch 'main' of https://github.com/TesseractCat/stable-diffusion into TesseractCat-main 2022-08-25 13:52:44 -04:00
Lincoln Stein
650ae3eb13 Merge pull request #89 from BlueAmulet/remove-accelerate
Remove accelerate library
2022-08-25 13:48:48 -04:00
Lincoln Stein
0e3059728c Merge pull request #85 from JigenD/VRAMutilizationFix
fix VRAM utilization
2022-08-25 13:47:49 -04:00
BlueAmulet
b7735b3788 Fix attribution 2022-08-25 11:13:12 -06:00
BlueAmulet
39b55ae016 Remove accelerate library
This library is not required to use k-diffusion
Make k-diffusion wrapper closer to the other samplers
2022-08-25 11:04:57 -06:00
JigenD
e82c5eba18 PR revision: replace cuda call with dynamic type 2022-08-25 12:18:35 -04:00
Lincoln Stein
1c8ecacddf remove src directory, which is gumming up conda installs; addresses issue #77 2022-08-25 10:43:05 -04:00
JigenD
eb58276a2c fix VRAM utilization 2022-08-25 08:34:51 -04:00
tesseractcat
72a9d75330 404 on missing file 2022-08-25 01:25:22 -04:00
Lincoln Stein
1a7743f3c2 Merge pull request #79 from BaristaLabs/update-readme-with-variant-disc
Update readme with variant disc
2022-08-25 00:44:45 -04:00
Sean McLellan
c521ac08ee Another update 2022-08-25 00:00:39 -04:00
Sean McLellan
29727f3e12 Another update 2022-08-24 23:59:37 -04:00
Sean McLellan
51b9a1d8d3 Update readme.md 2022-08-24 23:55:31 -04:00
tesseractcat
ab131cb55e Add img2img support, fix naming conventions 2022-08-24 23:03:02 -04:00
tesseractcat
269fcf92d9 Reapply prompt config on image click 2022-08-24 21:38:47 -04:00
Lincoln Stein
8b682ac83b Merge pull request #75 from tildebyte/docs-readme-update-109-notes
DOCS: update release features for v1.09 in README - add k_diffusion samplers note
2022-08-24 19:55:10 -04:00
Lincoln Stein
36e4130f1c Merge pull request #72 from BaristaLabs/fix-dependencies
Various fixes in requirements and variant counting.
2022-08-24 19:54:38 -04:00
tesseractcat
0a7fe6f2d9 Switch to ThreadingHTTPServer 2022-08-24 18:19:50 -04:00
Ben Alkov
25fa0ad1f2 docs(readme): update release features for v1.09 2022-08-24 17:50:29 -04:00
tesseractcat
df9f088eb4 Preserve prompt across generations 2022-08-24 17:28:59 -04:00
tesseractcat
b1600d4ca3 Update seed on click 2022-08-24 17:26:22 -04:00
tesseractcat
0efc3bf780 Add bare bones web UI 2022-08-24 17:04:30 -04:00
Sean McLellan
dd16fe16bb Fix issue where more than the expected number of variants are generated 2022-08-24 16:26:58 -04:00
Sean McLellan
4d72644db4 Housekeeping 2022-08-24 15:54:49 -04:00
Lincoln Stein
7ea168227c Update README.md
Added a few features that were missed in initial 1.09 commit.
2022-08-24 15:35:10 -04:00
Lincoln Stein
ef8ddffe46 updated README 2022-08-24 15:28:19 -04:00
Lincoln Stein
81cbcb919e add ability to generate variants on an img2img; warnings quieted 2022-08-24 15:26:59 -04:00
Lincoln Stein
1eec6b776b tweaked documentation and comments slightly 2022-08-24 15:25:52 -04:00
Lincoln Stein
776c747978 added warning message when width/height specified along with init img 2022-08-24 14:04:27 -04:00
Lincoln Stein
caf4dd4155 added a TODO list to keep track of user requests 2022-08-24 13:54:59 -04:00
Sean McLellan
ee10021ea2 bikeshedding 2022-08-24 13:36:27 -04:00
Sean McLellan
ca82acfd3b Remove unnecessary print, small optmi 2022-08-24 13:33:19 -04:00
Sean McLellan
feea5fb063 Merge branch 'main' of https://github.com/BaristaLabs/stable-diffusion-dream into add-simple-variant-mechanism 2022-08-24 13:16:15 -04:00
Sean McLellan
b5cdbd3b0b Fixes issue with cuda/current mismatch 2022-08-24 13:14:08 -04:00
Lincoln Stein
e043f238af backed out change from PR #44 that was causing ddim sampler to fail with the message 'sqrt _vml_cpu not implemented for 'Half' 2022-08-24 13:10:42 -04:00
Lincoln Stein
47a5da25b7 runtime errors now produce a stack trace 2022-08-24 12:57:04 -04:00
Lincoln Stein
f55f4d7156 squelch warnings about modified untracked content due to the src/* submodules 2022-08-24 12:31:48 -04:00
Lincoln Stein
5055e9e1d5 corrected double clip reference from requirements.txt 2022-08-24 12:09:22 -04:00
Sean McLellan
c6b5e930dc Merge branch 'main' of https://github.com/BaristaLabs/stable-diffusion-dream into add-simple-variant-mechanism 2022-08-24 12:06:29 -04:00
Sean McLellan
d33e1bf563 Add simple way to make variants 2022-08-24 12:02:36 -04:00
Lincoln Stein
923466387f minor tweak to .gitignore 2022-08-24 11:52:13 -04:00
Lincoln Stein
56f7b0f434 Merge branch 'warner-benjamin-small-improvements' into main 2022-08-24 11:51:16 -04:00
Lincoln Stein
c24a16ccb0 resolved merge conflicts 2022-08-24 11:50:48 -04:00
Lincoln Stein
ab8ee9bbb6 Merge branch 'BaristaLabs-ensure-image-exists' into main
This catches the case in which user specifies an img2img init img that
doesn't exist.
2022-08-24 11:42:55 -04:00
Lincoln Stein
37609d6e53 resolved merge conflicts 2022-08-24 11:42:44 -04:00
Lincoln Stein
fb9b845fda Merge branch 'BaristaLabs-add-textual-inversion' into main
This allows the use of a custom dataset to fine tune model outputs.
2022-08-24 11:33:39 -04:00
Lincoln Stein
9050ce152b Fixed up a few merge conflicts, looks good so far 2022-08-24 11:29:32 -04:00
Lincoln Stein
73901a2777 Merge pull request #58 from nicolai256/main
init img didn't work in textual inversion, now it does :)
2022-08-24 11:24:30 -04:00
Lincoln Stein
decd1a58d2 Merge branch 'escape-single-quotes' into main
This prevents single quotes in the prompt from generating a parse error.
2022-08-24 11:21:09 -04:00
Lincoln Stein
7f4a5e946d Merge branch 'tildebyte-feat-samplers-add-remaining-k' into main
This adds the remaining k_* samplers to the dream.py script.
2022-08-24 11:19:45 -04:00
Lincoln Stein
4bc64a6aff sampler now written to PNG metadata 2022-08-24 11:18:51 -04:00
Lincoln Stein
02cf5879a1 Merge branch 'feat-samplers-add-remaining-k' of https://github.com/tildebyte/stable-diffusion into tildebyte-feat-samplers-add-remaining-k
This adds all the remaining k_* sampling algorithms.
2022-08-24 10:56:24 -04:00
Benjamin Warner
886f1c0138 Undo more 'cuda' hardcoding 2022-08-24 00:39:25 -05:00
nicolai256
9588444f0e textual inversion + init img fix 2022-08-24 05:16:01 +02:00
Sean McLellan
24b11ecf9f Fix silent SystemExit if embedding_path is not specified 2022-08-23 22:45:02 -04:00
Sean McLellan
84989f0d05 Remote token output on startup 2022-08-23 22:39:10 -04:00
Sean McLellan
a93a79568d Tweak save_top_k setting 2022-08-23 21:56:05 -04:00
Sean McLellan
7081a84600 Fix GPU parameter in readme, add SD style finetune config 2022-08-23 20:14:40 -04:00
Sean McLellan
1df1e5c38b Test for the presence of the specified img2img 2022-08-23 19:22:35 -04:00
Sean McLellan
5a513426bd Remove test .ps1 file 2022-08-23 18:32:38 -04:00
Sean McLellan
611ccb991e Remove another duplicate file 2022-08-23 18:31:41 -04:00
Sean McLellan
bde956647f Remove duplicate t2i file 2022-08-23 18:29:50 -04:00
Sean McLellan
8952196bbf Add personalization 2022-08-23 18:26:28 -04:00
Ben Alkov
050dffd269 feat(samplers): add ability use all k_* samplers
Signed-off-by: Ben Alkov <ben.alkov@gmail.com>
2022-08-23 17:26:22 -04:00
Sean McLellan
0cdf5e61b0 Merge pull request #1 from lstein/main
Upstream changes
2022-08-23 15:44:36 -04:00
Benjamin Warner
de1cea92ce Small QoL imporvements 2022-08-23 12:49:17 -05:00
Lincoln Stein
3a58988e4a escape single quotes in the command stream so as not to confuse the shlex parser 2022-08-23 13:46:50 -04:00
31 changed files with 1939 additions and 211 deletions

175
.gitignore vendored Normal file

@@ -0,0 +1,175 @@
# ignore default image save location and model symbolic link
outputs/
models/ldm/stable-diffusion-v1/model.ckpt
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# emacs autosave and recovery files
*~
.#*
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
src
logs/
**/__pycache__/
outputs

0
.gitmodules vendored Normal file

110
README.md

@@ -31,13 +31,7 @@ runs from the command-line (CMD or Terminal window), and does not have a GUI.
(ldm) ~/stable-diffusion$ python3 ./scripts/dream.py
* Initializing, be patient...
Loading model from models/ldm/text2img-large/model.ckpt
LatentDiffusion: Running in eps-prediction mode
DiffusionWrapper has 872.30 M params.
making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 32, 32) = 4096 dimensions.
making attention of type 'vanilla' with 512 in_channels
Loading Bert tokenizer from "models/bert"
setting sampler to plms
(...more initialization messages...)
* Initialization done! Awaiting your command...
dream> ashley judd riding a camel -n2 -s150
@@ -84,6 +78,27 @@ The --init_img (-I) option gives the path to the seed picture. --strength (-f) c
the original will be modified, ranging from 0.0 (keep the original intact), to 1.0 (ignore the original
completely). The default is 0.75, and ranges from 0.25-0.75 give interesting results.
You may also pass a -v<count> option to generate <count> variants of the original image. This is done by
passing the first generated image back into img2img the requested number of times, producing a series of
interesting variations.
## Barebones Web Server
As of version 1.10, this distribution comes with a bare bones web server (see screenshot). To use it,
run the command:
~~~~
(ldm) ~/stable-diffusion$ python3 scripts/dream_web.py
~~~~
You can then connect to the server by pointing your web browser at
http://localhost:9090, or to the network name or IP address of the server.
Kudos to [Tesseract Cat](https://github.com/TesseractCat) for
contributing this code.
![Dream Web Server](static/dream_web_server.png)
## Weighted Prompts
You may weight different sections of the prompt to tell the sampler to attach different levels of
@@ -100,8 +115,79 @@ cat aspect of the image and 75% on the white duck aspect
use any combination of integers and floating point numbers, and they
do not need to add up to 1.
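For orientation (the README's own example sits just above this hunk and is elided here), each weighted section of the prompt takes a `:weight` suffix. A hypothetical dream> invocation consistent with the 25% cat / 75% duck split described above might look like:
~~~~
dream> tabby cat:0.25 white duck:0.75 hiding in the forest
~~~~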
## Personalizing Text-to-Image Generation
You may personalize the generated images to provide your own styles or objects by training a new LDM checkpoint
and introducing a new vocabulary to the fixed model.
To train, prepare a folder that contains images sized at 512x512 and execute the following:
~~~~
# As the default backend is not available on Windows, if you're using that platform, execute SET PL_TORCH_DISTRIBUTED_BACKEND=gloo
(ldm) ~/stable-diffusion$ python3 ./main.py --base ./configs/stable-diffusion/v1-finetune.yaml \
-t \
--actual_resume ./models/ldm/stable-diffusion-v1/model.ckpt \
-n my_cat \
--gpus 0, \
--data_root D:/textual-inversion/my_cat \
--init_word 'cat'
~~~~
During the training process, files will be created in /logs/[project][time][project]/,
where you can watch the training progress:

* conditioning* contains the training prompts
* inputs, reconstruction contain the input images for the training epoch
* samples, samples scaled contain a sample of the prompt and one generated with the provided init word

On an RTX3090, the process for SD will take ~1h at ~1.6 iterations/sec.

Note: According to the associated paper, the optimal number of images
is 3-5. Your model may not converge if you use more images than that.

Training will run indefinitely, but you may wish to stop it before the
heat death of the universe, when you find a low-loss epoch or at around
~5000 iterations.
Once the model is trained, specify the trained .pt file when starting
dream using
~~~~
(ldm) ~/stable-diffusion$ python3 ./scripts/dream.py --embedding_path /path/to/embedding.pt --full_precision
~~~~
Then, to utilize your subject at the dream prompt
~~~
dream> "a photo of *"
~~~
this also works with image2image
~~~~
dream> "waterfall and rainbow in the style of *" --init_img=./init-images/crude_drawing.png --strength=0.5 -s100 -n4
~~~~
It's also possible to train multiple tokens (modify the placeholder string in configs/stable-diffusion/v1-finetune.yaml) and combine LDM checkpoints using:
~~~~
(ldm) ~/stable-diffusion$ python3 ./scripts/merge_embeddings.py \
--manager_ckpts /path/to/first/embedding.pt /path/to/second/embedding.pt [...] \
--output_path /path/to/output/embedding.pt
~~~~
Credit goes to @rinongal and the repository located at
https://github.com/rinongal/textual_inversion Please see the
repository and associated paper for details and limitations.
## Changes
* v1.09 (24 August 2022)
* A barebone web server for interactive online generation of txt2img and img2img.
* A new -v option allows you to generate multiple variants of an initial image
in img2img mode. (kudos to [Oceanswave](https://github.com/Oceanswave). [See this discussion in the PR for examples and details on use](https://github.com/lstein/stable-diffusion/pull/71#issuecomment-1226700810))
* Added ability to personalize text to image generation (kudos to [Oceanswave](https://github.com/Oceanswave) and [nicolai256](https://github.com/nicolai256))
* Enabled all of the samplers from k_diffusion
* v1.08 (24 August 2022)
* Escape single quotes on the dream> command before trying to parse. This avoids
parse errors.
@@ -381,11 +467,17 @@ to send me an email if you use and like the script.
*Original Author:* Lincoln D. Stein <lincoln.stein@gmail.com>
- *Contributions by:* [Peter Kowalczyk](https://github.com/slix), [Henry Harrison](https://github.com/hwharrison), [xraxra](https://github.com/xraxra), and [bmaltais](https://github.com/bmaltais)
+ *Contributions by:*
+ [Peter Kowalczyk](https://github.com/slix), [Henry Harrison](https://github.com/hwharrison),
+ [xraxra](https://github.com/xraxra), [bmaltais](https://github.com/bmaltais), [Sean McLellan](https://github.com/Oceanswave),
+ [nicolai256](https://github.com/nicolai256), [Benjamin Warner](https://github.com/warner-benjamin),
+ [tildebyte](https://github.com/tildebyte),
+ and [Tesseract Cat](https://github.com/TesseractCat)
Original portions of the software are Copyright (c) 2020 Lincoln D. Stein (https://github.com/lstein)
#Further Reading
Please see the original README for more information on this software
- and underlying algorithm, located in the file README-CompViz.md.
\ No newline at end of file
+ and underlying algorithm, located in the file README-CompViz.md.

31
TODO.txt Normal file

@@ -0,0 +1,31 @@
Feature requests:
1. "gobig" mode - split image into strips, scale up, add detail using
img2img and reassemble with feathering. Issue #66.
2. Port basujindal low VRAM optimizations. Issue #62
3. Store images under folders named after the prompt. Issue #27.
4. Some sort of automation for generating variations. Issues #32 and #47.
5. Support for inpainting masks #68.
6. Support for loading variations of the stable-diffusion
weights #49
7. Support for klms and other non-ddim samplers in img2img() #36
8. Pass a shell command to open up an image viewer on the last
batch of images generated #29.
Code Refactorization:
1. Move the PNG file generation code out of simplet2i and into a
separate module. txt2img() and img2img() should return Image
objects, and the parent code is responsible for file-naming logic.
2. Refactor redundant code that is shared between txt2img() and
img2img().
3. Experiment with replacing CompViz code with HuggingFace.

configs/stable-diffusion/v1-finetune.yaml Normal file

@@ -0,0 +1,105 @@
model:
base_learning_rate: 5.0e-03
target: ldm.models.diffusion.ddpm.LatentDiffusion
params:
linear_start: 0.00085
linear_end: 0.0120
num_timesteps_cond: 1
log_every_t: 200
timesteps: 1000
first_stage_key: image
cond_stage_key: caption
image_size: 64
channels: 4
cond_stage_trainable: true # Note: different from the one we trained before
conditioning_key: crossattn
monitor: val/loss_simple_ema
scale_factor: 0.18215
use_ema: False
embedding_reg_weight: 0.0
personalization_config:
target: ldm.modules.embedding_manager.EmbeddingManager
params:
placeholder_strings: ["*"]
initializer_words: ["sculpture"]
per_image_tokens: false
num_vectors_per_token: 1
progressive_words: False
unet_config:
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
params:
image_size: 32 # unused
in_channels: 4
out_channels: 4
model_channels: 320
attention_resolutions: [ 4, 2, 1 ]
num_res_blocks: 2
channel_mult: [ 1, 2, 4, 4 ]
num_heads: 8
use_spatial_transformer: True
transformer_depth: 1
context_dim: 768
use_checkpoint: True
legacy: False
first_stage_config:
target: ldm.models.autoencoder.AutoencoderKL
params:
embed_dim: 4
monitor: val/rec_loss
ddconfig:
double_z: true
z_channels: 4
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult:
- 1
- 2
- 4
- 4
num_res_blocks: 2
attn_resolutions: []
dropout: 0.0
lossconfig:
target: torch.nn.Identity
cond_stage_config:
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
data:
target: main.DataModuleFromConfig
params:
batch_size: 2
num_workers: 16
wrap: false
train:
target: ldm.data.personalized.PersonalizedBase
params:
size: 512
set: train
per_image_tokens: false
repeats: 100
validation:
target: ldm.data.personalized.PersonalizedBase
params:
size: 512
set: val
per_image_tokens: false
repeats: 10
lightning:
callbacks:
image_logger:
target: main.ImageLogger
params:
batch_frequency: 500
max_images: 8
increase_log_steps: False
trainer:
benchmark: True
max_steps: 6100

configs/stable-diffusion/v1-finetune_style.yaml Normal file

@@ -0,0 +1,103 @@
model:
base_learning_rate: 5.0e-03
target: ldm.models.diffusion.ddpm.LatentDiffusion
params:
linear_start: 0.00085
linear_end: 0.0120
num_timesteps_cond: 1
log_every_t: 200
timesteps: 1000
first_stage_key: image
cond_stage_key: caption
image_size: 64
channels: 4
cond_stage_trainable: true # Note: different from the one we trained before
conditioning_key: crossattn
monitor: val/loss_simple_ema
scale_factor: 0.18215
use_ema: False
embedding_reg_weight: 0.0
personalization_config:
target: ldm.modules.embedding_manager.EmbeddingManager
params:
placeholder_strings: ["*"]
initializer_words: ["painting"]
per_image_tokens: false
num_vectors_per_token: 1
unet_config:
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
params:
image_size: 32 # unused
in_channels: 4
out_channels: 4
model_channels: 320
attention_resolutions: [ 4, 2, 1 ]
num_res_blocks: 2
channel_mult: [ 1, 2, 4, 4 ]
num_heads: 8
use_spatial_transformer: True
transformer_depth: 1
context_dim: 768
use_checkpoint: True
legacy: False
first_stage_config:
target: ldm.models.autoencoder.AutoencoderKL
params:
embed_dim: 4
monitor: val/rec_loss
ddconfig:
double_z: true
z_channels: 4
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult:
- 1
- 2
- 4
- 4
num_res_blocks: 2
attn_resolutions: []
dropout: 0.0
lossconfig:
target: torch.nn.Identity
cond_stage_config:
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
data:
target: main.DataModuleFromConfig
params:
batch_size: 2
num_workers: 16
wrap: false
train:
target: ldm.data.personalized_style.PersonalizedBase
params:
size: 512
set: train
per_image_tokens: false
repeats: 100
validation:
target: ldm.data.personalized_style.PersonalizedBase
params:
size: 512
set: val
per_image_tokens: false
repeats: 10
lightning:
callbacks:
image_logger:
target: main.ImageLogger
params:
batch_frequency: 500
max_images: 8
increase_log_steps: False
trainer:
benchmark: True


@@ -26,6 +26,15 @@ model:
f_max: [ 1. ]
f_min: [ 1. ]
personalization_config:
target: ldm.modules.embedding_manager.EmbeddingManager
params:
placeholder_strings: ["*"]
initializer_words: ["sculpture"]
per_image_tokens: false
num_vectors_per_token: 1
progressive_words: False
unet_config:
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
params:

environment.yaml

@@ -18,13 +18,13 @@ dependencies:
- pytorch-lightning==1.4.2
- omegaconf==2.1.1
- test-tube>=0.7.5
- streamlit>=0.73.1
- streamlit==1.12.0
- pillow==9.2.0
- einops==0.3.0
- torch-fidelity==0.3.0
- transformers==4.19.2
- torchmetrics==0.6.0
- kornia==0.6
- accelerate==0.12.0
- kornia==0.6.0
- -e git+https://github.com/openai/CLIP.git@main#egg=clip
- -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
- -e git+https://github.com/lstein/k-diffusion.git@master#egg=k-diffusion

160
ldm/data/personalized.py Normal file

@@ -0,0 +1,160 @@
import os
import numpy as np
import PIL
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
import random
imagenet_templates_smallest = [
'a photo of a {}',
]
imagenet_templates_small = [
'a photo of a {}',
'a rendering of a {}',
'a cropped photo of the {}',
'the photo of a {}',
'a photo of a clean {}',
'a photo of a dirty {}',
'a dark photo of the {}',
'a photo of my {}',
'a photo of the cool {}',
'a close-up photo of a {}',
'a bright photo of the {}',
'a cropped photo of a {}',
'a photo of the {}',
'a good photo of the {}',
'a photo of one {}',
'a close-up photo of the {}',
'a rendition of the {}',
'a photo of the clean {}',
'a rendition of a {}',
'a photo of a nice {}',
'a good photo of a {}',
'a photo of the nice {}',
'a photo of the small {}',
'a photo of the weird {}',
'a photo of the large {}',
'a photo of a cool {}',
'a photo of a small {}',
]
imagenet_dual_templates_small = [
'a photo of a {} with {}',
'a rendering of a {} with {}',
'a cropped photo of the {} with {}',
'the photo of a {} with {}',
'a photo of a clean {} with {}',
'a photo of a dirty {} with {}',
'a dark photo of the {} with {}',
'a photo of my {} with {}',
'a photo of the cool {} with {}',
'a close-up photo of a {} with {}',
'a bright photo of the {} with {}',
'a cropped photo of a {} with {}',
'a photo of the {} with {}',
'a good photo of the {} with {}',
'a photo of one {} with {}',
'a close-up photo of the {} with {}',
'a rendition of the {} with {}',
'a photo of the clean {} with {}',
'a rendition of a {} with {}',
'a photo of a nice {} with {}',
'a good photo of a {} with {}',
'a photo of the nice {} with {}',
'a photo of the small {} with {}',
'a photo of the weird {} with {}',
'a photo of the large {} with {}',
'a photo of a cool {} with {}',
'a photo of a small {} with {}',
]
per_img_token_list = [
'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'כ', 'ל', 'מ', 'נ', 'ס', 'ע', 'פ', 'צ', 'ק', 'ר', 'ש', 'ת',
]
class PersonalizedBase(Dataset):
def __init__(self,
data_root,
size=None,
repeats=100,
interpolation="bicubic",
flip_p=0.5,
set="train",
placeholder_token="*",
per_image_tokens=False,
center_crop=False,
mixing_prob=0.25,
coarse_class_text=None,
):
self.data_root = data_root
self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
# self._length = len(self.image_paths)
self.num_images = len(self.image_paths)
self._length = self.num_images
self.placeholder_token = placeholder_token
self.per_image_tokens = per_image_tokens
self.center_crop = center_crop
self.mixing_prob = mixing_prob
self.coarse_class_text = coarse_class_text
if per_image_tokens:
assert self.num_images < len(per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'."
if set == "train":
self._length = self.num_images * repeats
self.size = size
self.interpolation = {"linear": PIL.Image.LINEAR,
"bilinear": PIL.Image.BILINEAR,
"bicubic": PIL.Image.BICUBIC,
"lanczos": PIL.Image.LANCZOS,
}[interpolation]
self.flip = transforms.RandomHorizontalFlip(p=flip_p)
def __len__(self):
return self._length
def __getitem__(self, i):
example = {}
image = Image.open(self.image_paths[i % self.num_images])
if not image.mode == "RGB":
image = image.convert("RGB")
placeholder_string = self.placeholder_token
if self.coarse_class_text:
placeholder_string = f"{self.coarse_class_text} {placeholder_string}"
if self.per_image_tokens and np.random.uniform() < self.mixing_prob:
text = random.choice(imagenet_dual_templates_small).format(placeholder_string, per_img_token_list[i % self.num_images])
else:
text = random.choice(imagenet_templates_small).format(placeholder_string)
example["caption"] = text
# default to score-sde preprocessing
img = np.array(image).astype(np.uint8)
if self.center_crop:
crop = min(img.shape[0], img.shape[1])
h, w, = img.shape[0], img.shape[1]
img = img[(h - crop) // 2:(h + crop) // 2,
(w - crop) // 2:(w + crop) // 2]
image = Image.fromarray(img)
if self.size is not None:
image = image.resize((self.size, self.size), resample=self.interpolation)
image = self.flip(image)
image = np.array(image).astype(np.uint8)
example["image"] = (image / 127.5 - 1.0).astype(np.float32)
return example
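For reference, a minimal usage sketch of the dataset class added above (not part of the diff; the data_root value is the hypothetical path reused from the README's training example):
~~~~
from ldm.data.personalized import PersonalizedBase

# Builds (caption, image) training examples from a folder of 512x512 images.
ds = PersonalizedBase(data_root='D:/textual-inversion/my_cat', size=512, set='train', repeats=100)
sample = ds[0]
# sample["caption"] -> e.g. 'a photo of a *' (the placeholder token to be learned)
# sample["image"]   -> HxWx3 float32 array scaled to [-1.0, 1.0]
~~~~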

ldm/data/personalized_style.py Normal file

@@ -0,0 +1,129 @@
import os
import numpy as np
import PIL
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
import random
imagenet_templates_small = [
'a painting in the style of {}',
'a rendering in the style of {}',
'a cropped painting in the style of {}',
'the painting in the style of {}',
'a clean painting in the style of {}',
'a dirty painting in the style of {}',
'a dark painting in the style of {}',
'a picture in the style of {}',
'a cool painting in the style of {}',
'a close-up painting in the style of {}',
'a bright painting in the style of {}',
'a cropped painting in the style of {}',
'a good painting in the style of {}',
'a close-up painting in the style of {}',
'a rendition in the style of {}',
'a nice painting in the style of {}',
'a small painting in the style of {}',
'a weird painting in the style of {}',
'a large painting in the style of {}',
]
imagenet_dual_templates_small = [
'a painting in the style of {} with {}',
'a rendering in the style of {} with {}',
'a cropped painting in the style of {} with {}',
'the painting in the style of {} with {}',
'a clean painting in the style of {} with {}',
'a dirty painting in the style of {} with {}',
'a dark painting in the style of {} with {}',
'a cool painting in the style of {} with {}',
'a close-up painting in the style of {} with {}',
'a bright painting in the style of {} with {}',
'a cropped painting in the style of {} with {}',
'a good painting in the style of {} with {}',
'a painting of one {} in the style of {}',
'a nice painting in the style of {} with {}',
'a small painting in the style of {} with {}',
'a weird painting in the style of {} with {}',
'a large painting in the style of {} with {}',
]
per_img_token_list = [
'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'כ', 'ל', 'מ', 'נ', 'ס', 'ע', 'פ', 'צ', 'ק', 'ר', 'ש', 'ת',
]
class PersonalizedBase(Dataset):
def __init__(self,
data_root,
size=None,
repeats=100,
interpolation="bicubic",
flip_p=0.5,
set="train",
placeholder_token="*",
per_image_tokens=False,
center_crop=False,
):
self.data_root = data_root
self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
# self._length = len(self.image_paths)
self.num_images = len(self.image_paths)
self._length = self.num_images
self.placeholder_token = placeholder_token
self.per_image_tokens = per_image_tokens
self.center_crop = center_crop
if per_image_tokens:
assert self.num_images < len(per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'."
if set == "train":
self._length = self.num_images * repeats
self.size = size
self.interpolation = {"linear": PIL.Image.LINEAR,
"bilinear": PIL.Image.BILINEAR,
"bicubic": PIL.Image.BICUBIC,
"lanczos": PIL.Image.LANCZOS,
}[interpolation]
self.flip = transforms.RandomHorizontalFlip(p=flip_p)
def __len__(self):
return self._length
def __getitem__(self, i):
example = {}
image = Image.open(self.image_paths[i % self.num_images])
if not image.mode == "RGB":
image = image.convert("RGB")
if self.per_image_tokens and np.random.uniform() < 0.25:
text = random.choice(imagenet_dual_templates_small).format(self.placeholder_token, per_img_token_list[i % self.num_images])
else:
text = random.choice(imagenet_templates_small).format(self.placeholder_token)
example["caption"] = text
# default to score-sde preprocessing
img = np.array(image).astype(np.uint8)
if self.center_crop:
crop = min(img.shape[0], img.shape[1])
h, w, = img.shape[0], img.shape[1]
img = img[(h - crop) // 2:(h + crop) // 2,
(w - crop) // 2:(w + crop) // 2]
image = Image.fromarray(img)
if self.size is not None:
image = image.resize((self.size, self.size), resample=self.interpolation)
image = self.flip(image)
image = np.array(image).astype(np.uint8)
example["image"] = (image / 127.5 - 1.0).astype(np.float32)
return example

ldm/models/diffusion/ddim.py

@@ -10,16 +10,17 @@ from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, mak
class DDIMSampler(object):
- def __init__(self, model, schedule="linear", **kwargs):
+ def __init__(self, model, schedule="linear", device="cuda", **kwargs):
super().__init__()
self.model = model
self.ddpm_num_timesteps = model.num_timesteps
self.schedule = schedule
+ self.device = device
def register_buffer(self, name, attr):
if type(attr) == torch.Tensor:
- if attr.device != torch.device("cuda"):
- attr = attr.to(torch.device("cuda"))
+ if attr.device != torch.device(self.device):
+ attr = attr.to(torch.device(self.device))
setattr(self, name, attr)
def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
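The point of this change is that the sampler no longer hard-codes CUDA; a brief sketch of the assumed usage (not part of the diff):
~~~~
from ldm.models.diffusion.ddim import DDIMSampler

# Previously the registered buffers were always moved to "cuda"; the target device is now injectable.
sampler = DDIMSampler(model, schedule='linear', device='cpu')  # model: a loaded LatentDiffusion instance
~~~~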

ldm/models/diffusion/ddpm.py

@@ -7,7 +7,9 @@ https://github.com/CompVis/taming-transformers
"""
import torch
import torch.nn as nn
import os
import numpy as np
import pytorch_lightning as pl
from torch.optim.lr_scheduler import LambdaLR
@@ -64,6 +66,7 @@ class DDPM(pl.LightningModule):
cosine_s=8e-3,
given_betas=None,
original_elbo_weight=0.,
embedding_reg_weight=0.,
v_posterior=0., # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
l_simple_weight=1.,
conditioning_key=None,
@@ -98,6 +101,7 @@ class DDPM(pl.LightningModule):
self.v_posterior = v_posterior
self.original_elbo_weight = original_elbo_weight
self.l_simple_weight = l_simple_weight
self.embedding_reg_weight = embedding_reg_weight
if monitor is not None:
self.monitor = monitor
@@ -427,6 +431,7 @@ class LatentDiffusion(DDPM):
def __init__(self,
first_stage_config,
cond_stage_config,
personalization_config,
num_timesteps_cond=None,
cond_stage_key="image",
cond_stage_trainable=False,
@@ -436,6 +441,7 @@ class LatentDiffusion(DDPM):
scale_factor=1.0,
scale_by_std=False,
*args, **kwargs):
self.num_timesteps_cond = default(num_timesteps_cond, 1)
self.scale_by_std = scale_by_std
assert self.num_timesteps_cond <= kwargs['timesteps']
@@ -450,6 +456,7 @@ class LatentDiffusion(DDPM):
self.concat_mode = concat_mode
self.cond_stage_trainable = cond_stage_trainable
self.cond_stage_key = cond_stage_key
try:
self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
except:
@@ -460,6 +467,7 @@ class LatentDiffusion(DDPM):
self.register_buffer('scale_factor', torch.tensor(scale_factor))
self.instantiate_first_stage(first_stage_config)
self.instantiate_cond_stage(cond_stage_config)
self.cond_stage_forward = cond_stage_forward
self.clip_denoised = False
self.bbox_tokenizer = None
@@ -469,6 +477,25 @@ class LatentDiffusion(DDPM):
self.init_from_ckpt(ckpt_path, ignore_keys)
self.restarted_from_ckpt = True
self.cond_stage_model.train = disabled_train
for param in self.cond_stage_model.parameters():
param.requires_grad = False
self.model.eval()
self.model.train = disabled_train
for param in self.model.parameters():
param.requires_grad = False
self.embedding_manager = self.instantiate_embedding_manager(personalization_config, self.cond_stage_model)
self.emb_ckpt_counter = 0
# if self.embedding_manager.is_clip:
# self.cond_stage_model.update_embedding_func(self.embedding_manager)
for param in self.embedding_manager.embedding_parameters():
param.requires_grad = True
def make_cond_schedule(self, ):
self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()
@@ -530,6 +557,15 @@ class LatentDiffusion(DDPM):
except urllib.error.URLError:
raise SystemExit("* Couldn't load a dependency. Try running scripts/preload_models.py from an internet-conected machine.")
self.cond_stage_model = model
def instantiate_embedding_manager(self, config, embedder):
model = instantiate_from_config(config, embedder=embedder)
if config.params.get("embedding_manager_ckpt", None): # do not load if missing OR empty string
model.load(config.params.embedding_manager_ckpt)
return model
def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False):
denoise_row = []
@@ -555,7 +591,7 @@ class LatentDiffusion(DDPM):
def get_learned_conditioning(self, c):
if self.cond_stage_forward is None:
if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode):
c = self.cond_stage_model.encode(c)
c = self.cond_stage_model.encode(c, embedding_manager=self.embedding_manager)
if isinstance(c, DiagonalGaussianDistribution):
c = c.mode()
else:
@@ -880,6 +916,7 @@ class LatentDiffusion(DDPM):
if self.shorten_cond_schedule: # TODO: drop this option
tc = self.cond_ids[t].to(self.device)
c = self.q_sample(x_start=c, t=tc, noise=torch.randn_like(c.float()))
return self.p_losses(x, c, t, *args, **kwargs)
def _rescale_annotations(self, bboxes, crop_coordinates): # TODO: move to dataset
@@ -1046,6 +1083,14 @@ class LatentDiffusion(DDPM):
loss += (self.original_elbo_weight * loss_vlb)
loss_dict.update({f'{prefix}/loss': loss})
if self.embedding_reg_weight > 0:
loss_embedding_reg = self.embedding_manager.embedding_to_coarse_loss().mean()
loss_dict.update({f'{prefix}/loss_emb_reg': loss_embedding_reg})
loss += (self.embedding_reg_weight * loss_embedding_reg)
loss_dict.update({f'{prefix}/loss': loss})
return loss, loss_dict
def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codebook_ids=False, quantize_denoised=False,
@@ -1250,11 +1295,10 @@ class LatentDiffusion(DDPM):
return samples, intermediates
@torch.no_grad()
def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
- quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
- plot_diffusion_rows=True, **kwargs):
+ quantize_denoised=True, inpaint=False, plot_denoise_rows=False, plot_progressive_rows=False,
+ plot_diffusion_rows=False, **kwargs):
use_ddim = ddim_steps is not None
@@ -1312,6 +1356,16 @@ class LatentDiffusion(DDPM):
if plot_denoise_rows:
denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
log["denoise_row"] = denoise_grid
uc = self.get_learned_conditioning(len(c) * [""])
sample_scaled, _ = self.sample_log(cond=c,
batch_size=N,
ddim=use_ddim,
ddim_steps=ddim_steps,
eta=ddim_eta,
unconditional_guidance_scale=5.0,
unconditional_conditioning=uc)
log["samples_scaled"] = self.decode_first_stage(sample_scaled)
if quantize_denoised and not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance(
self.first_stage_model, IdentityFirstStage):
@@ -1364,13 +1418,18 @@ class LatentDiffusion(DDPM):
def configure_optimizers(self):
lr = self.learning_rate
- params = list(self.model.parameters())
- if self.cond_stage_trainable:
- print(f"{self.__class__.__name__}: Also optimizing conditioner params!")
- params = params + list(self.cond_stage_model.parameters())
- if self.learn_logvar:
- print('Diffusion model optimizing logvar')
- params.append(self.logvar)
+ if self.embedding_manager is not None:
+ params = list(self.embedding_manager.embedding_parameters())
+ # params = list(self.cond_stage_model.transformer.text_model.embeddings.embedding_manager.embedding_parameters())
+ else:
+ params = list(self.model.parameters())
+ if self.cond_stage_trainable:
+ print(f"{self.__class__.__name__}: Also optimizing conditioner params!")
+ params = params + list(self.cond_stage_model.parameters())
+ if self.learn_logvar:
+ print('Diffusion model optimizing logvar')
+ params.append(self.logvar)
opt = torch.optim.AdamW(params, lr=lr)
if self.use_scheduler:
assert 'target' in self.scheduler_config
@@ -1395,6 +1454,18 @@ class LatentDiffusion(DDPM):
x = 2. * (x - x.min()) / (x.max() - x.min()) - 1.
return x
@rank_zero_only
def on_save_checkpoint(self, checkpoint):
checkpoint.clear()
if os.path.isdir(self.trainer.checkpoint_callback.dirpath):
self.embedding_manager.save(os.path.join(self.trainer.checkpoint_callback.dirpath, "embeddings.pt"))
if (self.global_step - self.emb_ckpt_counter) > 500:
self.embedding_manager.save(os.path.join(self.trainer.checkpoint_callback.dirpath, f"embeddings_gs-{self.global_step}.pt"))
self.emb_ckpt_counter += 500
class DiffusionWrapper(pl.LightningModule):
def __init__(self, diff_model_config, conditioning_key):

ldm/models/diffusion/ksampler.py

@@ -1,8 +1,7 @@
- '''wrapper around part of Karen Crownson's k-duffsion library, making it call compatible with other Samplers'''
+ '''wrapper around part of Katherine Crowson's k-diffusion library, making it call compatible with other Samplers'''
import k_diffusion as K
import torch
import torch.nn as nn
- import accelerate
class CFGDenoiser(nn.Module):
def __init__(self, model):
@@ -17,12 +16,11 @@ class CFGDenoiser(nn.Module):
return uncond + (cond - uncond) * cond_scale
class KSampler(object):
- def __init__(self,model,schedule="lms", **kwargs):
+ def __init__(self, model, schedule="lms", device="cuda", **kwargs):
super().__init__()
- self.model = K.external.CompVisDenoiser(model)
- self.accelerator = accelerate.Accelerator()
- self.device = self.accelerator.device
+ self.model = K.external.CompVisDenoiser(model)
self.schedule = schedule
+ self.device = device
def forward(self, x, sigma, uncond, cond, cond_scale):
x_in = torch.cat([x] * 2)
@@ -67,8 +65,5 @@ class KSampler(object):
x = torch.randn([batch_size, *shape], device=self.device) * sigmas[0] # for GPU draw
model_wrap_cfg = CFGDenoiser(self.model)
extra_args = {'cond': conditioning, 'uncond': unconditional_conditioning, 'cond_scale': unconditional_guidance_scale}
- return (K.sampling.sample_lms(model_wrap_cfg, x, sigmas, extra_args=extra_args, disable=not self.accelerator.is_main_process),
+ return (K.sampling.__dict__[f'sample_{self.schedule}'](model_wrap_cfg, x, sigmas, extra_args=extra_args),
None)
- def gather(samples_ddim):
- return self.accelerator.gather(samples_ddim)
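The replacement return statement above selects the sampling function by name instead of hard-coding sample_lms. A sketch of that lookup (illustrative only; the exact mapping from dream.py's k_* sampler names to schedule strings is an assumption):
~~~~
import k_diffusion as K

schedule = 'lms'   # e.g. derived from the user-facing sampler name 'k_lms'
sample_fn = K.sampling.__dict__[f'sample_{schedule}']   # resolves to K.sampling.sample_lms
# sample_fn(model_wrap_cfg, x, sigmas, extra_args=extra_args) then runs the chosen sampler
~~~~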

ldm/models/diffusion/plms.py

@@ -9,16 +9,18 @@ from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, mak
class PLMSSampler(object):
- def __init__(self, model, schedule="linear", **kwargs):
+ def __init__(self, model, schedule="linear", device="cuda", **kwargs):
super().__init__()
self.model = model
self.ddpm_num_timesteps = model.num_timesteps
self.schedule = schedule
+ self.device = device
def register_buffer(self, name, attr):
if type(attr) == torch.Tensor:
- if attr.device != torch.device("cuda"):
- attr = attr.to(torch.device("cuda"))
+ if attr.device != torch.device(self.device):
+ attr = attr.to(torch.device(self.device))
setattr(self, name, attr)
def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):

ldm/modules/diffusionmodules/util.py

@@ -109,7 +109,7 @@ def checkpoint(func, inputs, params, flag):
explicitly take as arguments.
:param flag: if False, disable gradient checkpointing.
"""
- if flag:
+ if False: # disabled checkpointing to allow requires_grad = False for main model
args = tuple(inputs) + tuple(params)
return CheckpointFunction.apply(func, len(inputs), *args)
else:

ldm/modules/embedding_manager.py Normal file

@@ -0,0 +1,164 @@
from cmath import log
import torch
from torch import nn
import sys
from ldm.data.personalized import per_img_token_list
from transformers import CLIPTokenizer
from functools import partial
DEFAULT_PLACEHOLDER_TOKEN = ["*"]
PROGRESSIVE_SCALE = 2000
def get_clip_token_for_string(tokenizer, string):
batch_encoding = tokenizer(string, truncation=True, max_length=77, return_length=True,
return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
tokens = batch_encoding["input_ids"]
assert torch.count_nonzero(tokens - 49407) == 2, f"String '{string}' maps to more than a single token. Please use another string"
return tokens[0, 1]
def get_bert_token_for_string(tokenizer, string):
token = tokenizer(string)
# assert torch.count_nonzero(token) == 3, f"String '{string}' maps to more than a single token. Please use another string"
token = token[0, 1]
return token
def get_embedding_for_clip_token(embedder, token):
return embedder(token.unsqueeze(0))[0, 0]
class EmbeddingManager(nn.Module):
def __init__(
self,
embedder,
placeholder_strings=None,
initializer_words=None,
per_image_tokens=False,
num_vectors_per_token=1,
progressive_words=False,
**kwargs
):
super().__init__()
self.string_to_token_dict = {}
self.string_to_param_dict = nn.ParameterDict()
self.initial_embeddings = nn.ParameterDict() # These should not be optimized
self.progressive_words = progressive_words
self.progressive_counter = 0
self.max_vectors_per_token = num_vectors_per_token
if hasattr(embedder, 'tokenizer'): # using Stable Diffusion's CLIP encoder
self.is_clip = True
get_token_for_string = partial(get_clip_token_for_string, embedder.tokenizer)
get_embedding_for_tkn = partial(get_embedding_for_clip_token, embedder.transformer.text_model.embeddings)
token_dim = 1280
else: # using LDM's BERT encoder
self.is_clip = False
get_token_for_string = partial(get_bert_token_for_string, embedder.tknz_fn)
get_embedding_for_tkn = embedder.transformer.token_emb
token_dim = 1280
if per_image_tokens:
placeholder_strings.extend(per_img_token_list)
for idx, placeholder_string in enumerate(placeholder_strings):
token = get_token_for_string(placeholder_string)
if initializer_words and idx < len(initializer_words):
init_word_token = get_token_for_string(initializer_words[idx])
with torch.no_grad():
init_word_embedding = get_embedding_for_tkn(init_word_token.cpu())
token_params = torch.nn.Parameter(init_word_embedding.unsqueeze(0).repeat(num_vectors_per_token, 1), requires_grad=True)
self.initial_embeddings[placeholder_string] = torch.nn.Parameter(init_word_embedding.unsqueeze(0).repeat(num_vectors_per_token, 1), requires_grad=False)
else:
token_params = torch.nn.Parameter(torch.rand(size=(num_vectors_per_token, token_dim), requires_grad=True))
self.string_to_token_dict[placeholder_string] = token
self.string_to_param_dict[placeholder_string] = token_params
def forward(
self,
tokenized_text,
embedded_text,
):
b, n, device = *tokenized_text.shape, tokenized_text.device
for placeholder_string, placeholder_token in self.string_to_token_dict.items():
placeholder_embedding = self.string_to_param_dict[placeholder_string].to(device)
if self.max_vectors_per_token == 1: # If there's only one vector per token, we can do a simple replacement
placeholder_idx = torch.where(tokenized_text == placeholder_token.to(device))
embedded_text[placeholder_idx] = placeholder_embedding
else: # otherwise, need to insert and keep track of changing indices
if self.progressive_words:
self.progressive_counter += 1
max_step_tokens = 1 + self.progressive_counter // PROGRESSIVE_SCALE
else:
max_step_tokens = self.max_vectors_per_token
num_vectors_for_token = min(placeholder_embedding.shape[0], max_step_tokens)
placeholder_rows, placeholder_cols = torch.where(tokenized_text == placeholder_token.to(device))
if placeholder_rows.nelement() == 0:
continue
sorted_cols, sort_idx = torch.sort(placeholder_cols, descending=True)
sorted_rows = placeholder_rows[sort_idx]
for idx in range(len(sorted_rows)):
row = sorted_rows[idx]
col = sorted_cols[idx]
new_token_row = torch.cat([tokenized_text[row][:col], placeholder_token.repeat(num_vectors_for_token).to(device), tokenized_text[row][col + 1:]], axis=0)[:n]
new_embed_row = torch.cat([embedded_text[row][:col], placeholder_embedding[:num_vectors_for_token], embedded_text[row][col + 1:]], axis=0)[:n]
embedded_text[row] = new_embed_row
tokenized_text[row] = new_token_row
return embedded_text
def save(self, ckpt_path):
torch.save({"string_to_token": self.string_to_token_dict,
"string_to_param": self.string_to_param_dict}, ckpt_path)
def load(self, ckpt_path):
ckpt = torch.load(ckpt_path, map_location='cpu')
self.string_to_token_dict = ckpt["string_to_token"]
self.string_to_param_dict = ckpt["string_to_param"]
def get_embedding_norms_squared(self):
all_params = torch.cat(list(self.string_to_param_dict.values()), axis=0) # num_placeholders x embedding_dim
param_norm_squared = (all_params * all_params).sum(axis=-1) # num_placeholders
return param_norm_squared
def embedding_parameters(self):
return self.string_to_param_dict.parameters()
def embedding_to_coarse_loss(self):
loss = 0.
num_embeddings = len(self.initial_embeddings)
for key in self.initial_embeddings:
optimized = self.string_to_param_dict[key]
coarse = self.initial_embeddings[key].clone().to(optimized.device)
loss = loss + (optimized - coarse) @ (optimized - coarse).T / num_embeddings
return loss
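Putting the pieces together, a hedged sketch of how this manager is meant to be used with the patched text encoder shown further below (names taken from this diff; the checkpoint path is a placeholder):
~~~~
from ldm.modules.embedding_manager import EmbeddingManager

# embedder: the model's frozen text encoder, e.g. FrozenCLIPEmbedder
manager = EmbeddingManager(embedder, placeholder_strings=['*'], initializer_words=['cat'])
manager.load('/path/to/embedding.pt')   # restores string_to_token / string_to_param

# During encoding, the learned vector is spliced in wherever '*' appears in the prompt.
c = embedder.encode(['a photo of *'], embedding_manager=manager)
~~~~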

ldm/modules/encoders/modules.py

@@ -8,6 +8,27 @@ import kornia
from ldm.modules.x_transformer import Encoder, TransformerWrapper # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
def _expand_mask(mask, dtype, tgt_len = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
bsz, src_len = mask.size()
tgt_len = tgt_len if tgt_len is not None else src_len
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
def _build_causal_attention_mask(bsz, seq_len, dtype):
# lazily create causal attention mask, with full attention between the vision tokens
# pytorch uses additive attention mask; fill with -inf
mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
mask.fill_(torch.tensor(torch.finfo(dtype).min))
mask.triu_(1) # zero out the lower diagonal
mask = mask.unsqueeze(1) # expand mask
return mask
class AbstractEncoder(nn.Module):
def __init__(self):
@@ -98,18 +119,17 @@ class BERTEmbedder(AbstractEncoder):
attn_layers=Encoder(dim=n_embed, depth=n_layer),
emb_dropout=embedding_dropout)
- def forward(self, text):
+ def forward(self, text, embedding_manager=None):
if self.use_tknz_fn:
tokens = self.tknz_fn(text)#.to(self.device)
else:
tokens = text
- z = self.transformer(tokens, return_embeddings=True)
+ z = self.transformer(tokens, return_embeddings=True, embedding_manager=embedding_manager)
return z
- def encode(self, text):
+ def encode(self, text, **kwargs):
# output of length 77
- return self(text)
+ return self(text, **kwargs)
class SpatialRescaler(nn.Module):
def __init__(self,
@@ -152,22 +172,165 @@ class FrozenCLIPEmbedder(AbstractEncoder):
self.max_length = max_length
self.freeze()
def embedding_forward(
self,
input_ids = None,
position_ids = None,
inputs_embeds = None,
embedding_manager = None,
) -> torch.Tensor:
seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if inputs_embeds is None:
inputs_embeds = self.token_embedding(input_ids)
if embedding_manager is not None:
inputs_embeds = embedding_manager(input_ids, inputs_embeds)
position_embeddings = self.position_embedding(position_ids)
embeddings = inputs_embeds + position_embeddings
return embeddings
self.transformer.text_model.embeddings.forward = embedding_forward.__get__(self.transformer.text_model.embeddings)
def encoder_forward(
self,
inputs_embeds,
attention_mask = None,
causal_attention_mask = None,
output_attentions = None,
output_hidden_states = None,
return_dict = None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_states = inputs_embeds
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
causal_attention_mask,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
return hidden_states
self.transformer.text_model.encoder.forward = encoder_forward.__get__(self.transformer.text_model.encoder)
def text_encoder_forward(
self,
input_ids = None,
attention_mask = None,
position_ids = None,
output_attentions = None,
output_hidden_states = None,
return_dict = None,
embedding_manager = None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is None:
raise ValueError("You have to specify either input_ids")
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids, embedding_manager=embedding_manager)
bsz, seq_len = input_shape
# CLIP's text model uses causal mask, prepare it here.
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
causal_attention_mask = _build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
hidden_states.device
)
# expand attention_mask
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
last_hidden_state = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = self.final_layer_norm(last_hidden_state)
return last_hidden_state
self.transformer.text_model.forward = text_encoder_forward.__get__(self.transformer.text_model)
def transformer_forward(
self,
input_ids = None,
attention_mask = None,
position_ids = None,
output_attentions = None,
output_hidden_states = None,
return_dict = None,
embedding_manager = None,
):
return self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
embedding_manager = embedding_manager
)
self.transformer.forward = transformer_forward.__get__(self.transformer)
def freeze(self):
self.transformer = self.transformer.eval()
for param in self.parameters():
param.requires_grad = False
- def forward(self, text):
+ def forward(self, text, **kwargs):
batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
tokens = batch_encoding["input_ids"].to(self.device)
- outputs = self.transformer(input_ids=tokens)
- z = outputs.last_hidden_state
+ z = self.transformer(input_ids=tokens, **kwargs)
return z
- def encode(self, text):
- return self(text)
+ def encode(self, text, **kwargs):
+ return self(text, **kwargs)
class FrozenCLIPTextEmbedder(nn.Module):
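An aside on the mechanics used throughout the FrozenCLIPEmbedder changes above: each replacement forward is attached with function.__get__(instance), which binds a plain function to an existing object so it can override that object's method without subclassing. A self-contained illustration with hypothetical names:
~~~~
class Greeter:
    def greet(self):
        return 'hello'

def shouty_greet(self):
    # call the original implementation through the class to avoid recursing into the override
    return Greeter.greet(self).upper() + '!'

g = Greeter()
g.greet = shouty_greet.__get__(g)   # bind to this instance only
print(g.greet())                    # -> HELLO!
~~~~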

ldm/modules/x_transformer.py

@@ -485,7 +485,8 @@ class AttentionLayers(nn.Module):
mask=None,
context_mask=None,
mems=None,
- return_hiddens=False
+ return_hiddens=False,
+ **kwargs
):
hiddens = []
intermediates = []
@@ -603,11 +604,19 @@ class TransformerWrapper(nn.Module):
return_mems=False,
return_attn=False,
mems=None,
embedding_manager=None,
**kwargs
):
b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens
- x = self.token_emb(x)
- x += self.pos_emb(x)
+ embedded_x = self.token_emb(x)
+ if embedding_manager:
+ x = embedding_manager(x, embedded_x)
+ else:
+ x = embedded_x
+ x = x + self.pos_emb(x)
x = self.emb_dropout(x)
x = self.project_emb(x)

ldm/simplet2i.py

@@ -18,7 +18,7 @@ t2i = T2I(outdir = <path> // outputs/txt2img-samples
batch_size = <integer> // how many images to generate per sampling (1)
steps = <integer> // 50
seed = <integer> // current system time
- sampler_name= ['ddim','plms','klms'] // klms
+ sampler_name= ['ddim', 'k_dpm_2_a', 'k_dpm_2', 'k_euler_a', 'k_euler', 'k_heun', 'k_lms', 'plms'] // k_lms
grid = <boolean> // false
width = <integer> // image width, multiple of 64 (512)
height = <integer> // image height, multiple of 64 (512)
@@ -68,6 +68,7 @@ from contextlib import contextmanager, nullcontext
import time
import math
import re
import traceback
from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
@@ -96,6 +97,7 @@ class T2I:
downsampling_factor
precision
strength
embedding_path
The vast majority of these arguments default to reasonable values.
"""
@@ -120,7 +122,9 @@ The vast majority of these arguments default to reasonable values.
precision='autocast',
full_precision=False,
strength=0.75, # default in scripts/img2img.py
latent_diffusion_weights=False # just to keep track of this parameter when regenerating prompt
embedding_path=None,
latent_diffusion_weights=False, # just to keep track of this parameter when regenerating prompt
device='cuda'
):
self.outdir = outdir
self.batch_size = batch_size
@@ -140,17 +144,21 @@ The vast majority of these arguments default to reasonable values.
self.precision = precision
self.full_precision = full_precision
self.strength = strength
self.embedding_path = embedding_path
self.model = None # empty for now
self.sampler = None
self.latent_diffusion_weights=latent_diffusion_weights
self.device = device
if seed is None:
self.seed = self._new_seed()
else:
self.seed = seed
@torch.no_grad()
def txt2img(self,prompt,outdir=None,batch_size=None,iterations=None,
steps=None,seed=None,grid=None,individual=None,width=None,height=None,
cfg_scale=None,ddim_eta=None,strength=None,init_img=None,skip_normalize=False):
cfg_scale=None,ddim_eta=None,strength=None,embedding_path=None,init_img=None,
skip_normalize=False,variants=None): # note the "variants" option is an unused hack caused by how options are passed
"""
Generate an image from the prompt, writing iteration images into the outdir
The output is a list of lists in the format: [[filename1,seed1], [filename2,seed2],...]
@@ -165,6 +173,7 @@ The vast majority of these arguments default to reasonable values.
batch_size = batch_size or self.batch_size
iterations = iterations or self.iterations
strength = strength or self.strength # not actually used here, but preserved for code refactoring
embedding_path = embedding_path or self.embedding_path
model = self.load_model() # will instantiate the model or return it from cache
@@ -201,69 +210,67 @@ The vast majority of these arguments default to reasonable values.
# Gawd. Too many levels of indent here. Need to refactor into smaller routines!
try:
with torch.no_grad():
with precision_scope("cuda"):
with model.ema_scope():
with precision_scope(self.device.type), model.ema_scope():
all_samples = list()
for n in trange(iterations, desc="Sampling"):
seed_everything(seed)
for prompts in tqdm(data, desc="data", dynamic_ncols=True):
uc = None
if cfg_scale != 1.0:
uc = model.get_learned_conditioning(batch_size * [""])
if isinstance(prompts, tuple):
prompts = list(prompts)
# weighted sub-prompts
subprompts,weights = T2I._split_weighted_subprompts(prompts[0])
if len(subprompts) > 1:
# i dont know if this is correct.. but it works
c = torch.zeros_like(uc)
# get total weight for normalizing
totalWeight = sum(weights)
# normalize each "sub prompt" and add it
for i in range(0,len(subprompts)):
weight = weights[i]
if not skip_normalize:
weight = weight / totalWeight
c = torch.add(c,model.get_learned_conditioning(subprompts[i]), alpha=weight)
else: # just standard 1 prompt
c = model.get_learned_conditioning(prompts)
shape = [self.latent_channels, height // self.downsampling_factor, width // self.downsampling_factor]
samples_ddim, _ = sampler.sample(S=steps,
conditioning=c,
batch_size=batch_size,
shape=shape,
verbose=False,
unconditional_guidance_scale=cfg_scale,
unconditional_conditioning=uc,
eta=ddim_eta,
x_T=start_code)
x_samples_ddim = model.decode_first_stage(samples_ddim)
x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
if not grid:
for x_sample in x_samples_ddim:
x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
filename = self._unique_filename(outdir,previousname=filename,
seed=seed,isbatch=(batch_size>1))
assert not os.path.exists(filename)
Image.fromarray(x_sample.astype(np.uint8)).save(filename)
images.append([filename,seed])
else:
all_samples.append(x_samples_ddim)
seeds.append(seed)
image_count += 1
seed = self._new_seed()
if grid:
images = self._make_grid(samples=all_samples,
seeds=seeds,
batch_size=batch_size,
iterations=iterations,
outdir=outdir)
except KeyboardInterrupt:
print('*interrupted*')
print('Partial results will be returned; if --grid was requested, nothing will be returned.')
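The weighted sub-prompt block retained in the loop above normalizes the weights to sum to 1 (unless -x/--skip_normalize is given) and accumulates each sub-prompt's conditioning with torch.add(..., alpha=weight). A condensed sketch of that arithmetic, with plain tensors standing in for get_learned_conditioning outputs (function name hypothetical):
import torch

def combine_weighted(conds, weights, skip_normalize=False):
    # conds: one conditioning tensor per sub-prompt; weights: matching list of floats
    total = sum(weights)
    c = torch.zeros_like(conds[0])
    for cond, w in zip(conds, weights):
        if not skip_normalize:
            w = w / total
        c = torch.add(c, cond, alpha=w)   # accumulate the weighted conditioning
    return c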
@@ -276,9 +283,11 @@ The vast majority of these arguments default to reasonable values.
return images
# There is lots of shared code between this and txt2img and should be refactored.
@torch.no_grad()
def img2img(self,prompt,outdir=None,init_img=None,batch_size=None,iterations=None,
steps=None,seed=None,grid=None,individual=None,width=None,height=None,
cfg_scale=None,ddim_eta=None,strength=None,skip_normalize=False):
cfg_scale=None,ddim_eta=None,strength=None,embedding_path=None,
skip_normalize=False,variants=None): # note the "variants" option is an unused hack caused by how options are passed
"""
Generate an image from the prompt and the initial image, writing iteration images into the outdir
The output is a list of lists in the format: [[filename1,seed1], [filename2,seed2],...]
@@ -291,6 +300,7 @@ The vast majority of these arguments default to reasonable values.
batch_size = batch_size or self.batch_size
iterations = iterations or self.iterations
strength = strength or self.strength
embedding_path = embedding_path or self.embedding_path
assert strength<1.0 and strength>=0.0, "strength (-f) must be >=0.0 and <1.0"
assert cfg_scale>1.0, "CFG_Scale (-C) must be >1.0"
@@ -315,7 +325,7 @@ The vast majority of these arguments default to reasonable values.
# PLMS sampler not supported yet, so ignore previous sampler
if self.sampler_name!='ddim':
print(f"sampler '{self.sampler_name}' is not yet supported. Using DDM sampler")
sampler = DDIMSampler(model)
sampler = DDIMSampler(model, device=self.device)
else:
sampler = self.sampler
@@ -325,7 +335,7 @@ The vast majority of these arguments default to reasonable values.
assert os.path.isfile(init_img)
init_image = self._load_img(init_img).to(self.device)
init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
with precision_scope("cuda"):
with precision_scope(self.device.type):
init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image)) # move to latent space
sampler.make_schedule(ddim_num_steps=steps, ddim_eta=ddim_eta, verbose=False)
@@ -347,69 +357,68 @@ The vast majority of these arguments default to reasonable values.
# Gawd. Too many levels of indent here. Need to refactor into smaller routines!
try:
with torch.no_grad():
with precision_scope("cuda"):
with model.ema_scope():
with precision_scope(self.device.type), model.ema_scope():
all_samples = list()
for n in trange(iterations, desc="Sampling"):
seed_everything(seed)
for prompts in tqdm(data, desc="data", dynamic_ncols=True):
uc = None
if cfg_scale != 1.0:
uc = model.get_learned_conditioning(batch_size * [""])
if isinstance(prompts, tuple):
prompts = list(prompts)
# weighted sub-prompts
subprompts,weights = T2I._split_weighted_subprompts(prompts[0])
if len(subprompts) > 1:
# i dont know if this is correct.. but it works
c = torch.zeros_like(uc)
# get total weight for normalizing
totalWeight = sum(weights)
# normalize each "sub prompt" and add it
for i in range(0,len(subprompts)):
weight = weights[i]
if not skip_normalize:
weight = weight / totalWeight
c = torch.add(c,model.get_learned_conditioning(subprompts[i]), alpha=weight)
else: # just standard 1 prompt
c = model.get_learned_conditioning(prompts)
# encode (scaled latent)
z_enc = sampler.stochastic_encode(init_latent, torch.tensor([t_enc]*batch_size).to(self.device))
# decode it
samples = sampler.decode(z_enc, c, t_enc, unconditional_guidance_scale=cfg_scale,
unconditional_conditioning=uc,)
x_samples = model.decode_first_stage(samples)
x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)
if not grid:
for x_sample in x_samples:
x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
filename = self._unique_filename(outdir,previousname=filename,
seed=seed,isbatch=(batch_size>1))
assert not os.path.exists(filename)
Image.fromarray(x_sample.astype(np.uint8)).save(filename)
images.append([filename,seed])
else:
all_samples.append(x_samples)
seeds.append(seed)
image_count +=1
seed = self._new_seed()
if grid:
images = self._make_grid(samples=all_samples,
seeds=seeds,
batch_size=batch_size,
iterations=iterations,
outdir=outdir)
except KeyboardInterrupt:
print('*interrupted*')
print('Partial results will be returned; if --grid was requested, nothing will be returned.')
except RuntimeError as e:
print(str(e))
print("Oops! A runtime error has occurred. If this is unexpected, please copy-and-paste this stack trace and post it as an Issue to http://github.com/lstein/stable-diffusion")
traceback.print_exc()
toc = time.time()
print(f'{image_count} images generated in',"%4.2fs"% (toc-tic))
@@ -442,24 +451,38 @@ The vast majority of these arguments default to reasonable values.
seed_everything(self.seed)
try:
config = OmegaConf.load(self.config)
self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
self.device = torch.device(self.device) if torch.cuda.is_available() else torch.device("cpu")
model = self._load_model_from_config(config,self.weights)
if self.embedding_path is not None:
model.embedding_manager.load(self.embedding_path)
self.model = model.to(self.device)
# model.to doesn't change the cond_stage_model.device used to move the tokenizer output, so set it here
self.model.cond_stage_model.device = self.device
except AttributeError:
raise SystemExit
msg = f'setting sampler to {self.sampler_name}'
if self.sampler_name=='plms':
print("setting sampler to plms")
self.sampler = PLMSSampler(self.model)
self.sampler = PLMSSampler(self.model, device=self.device)
elif self.sampler_name == 'ddim':
print("setting sampler to ddim")
self.sampler = DDIMSampler(self.model)
elif self.sampler_name == 'klms':
print("setting sampler to klms")
self.sampler = KSampler(self.model,'lms')
self.sampler = DDIMSampler(self.model, device=self.device)
elif self.sampler_name == 'k_dpm_2_a':
self.sampler = KSampler(self.model, 'dpm_2_ancestral', device=self.device)
elif self.sampler_name == 'k_dpm_2':
self.sampler = KSampler(self.model, 'dpm_2', device=self.device)
elif self.sampler_name == 'k_euler_a':
self.sampler = KSampler(self.model, 'euler_ancestral', device=self.device)
elif self.sampler_name == 'k_euler':
self.sampler = KSampler(self.model, 'euler', device=self.device)
elif self.sampler_name == 'k_heun':
self.sampler = KSampler(self.model, 'heun', device=self.device)
elif self.sampler_name == 'k_lms':
self.sampler = KSampler(self.model, 'lms', device=self.device)
else:
print(f"unsupported sampler {self.sampler_name}, defaulting to plms")
self.sampler = PLMSSampler(self.model)
msg = f'unsupported sampler {self.sampler_name}, defaulting to plms'
self.sampler = PLMSSampler(self.model, device=self.device)
print(msg)
return self.model
@@ -471,7 +494,7 @@ The vast majority of these arguments default to reasonable values.
sd = pl_sd["state_dict"]
model = instantiate_from_config(config.model)
m, u = model.load_state_dict(sd, strict=False)
model.cuda()
model.to(self.device)
model.eval()
if self.full_precision:
print('Using slower but more accurate full-precision math (--full_precision)')
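Taken together, the changes to this file let a caller pick the device, a k-diffusion sampler, and an optional embedding checkpoint when constructing T2I. A minimal usage sketch against that API (prompt and values hypothetical; weights and config resolve as usual):
from ldm.simplet2i import T2I

t2i = T2I(sampler_name='k_lms',
          device='cuda',            # or 'cpu' on machines without CUDA
          embedding_path=None)      # optionally a trained embedding-manager checkpoint
t2i.load_model()                    # instantiates the model once and caches it
results = t2i.txt2img('a watercolor fox', iterations=2, steps=50, cfg_scale=7.5)
for filename, seed in results:
    print(filename, seed)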

View File

@@ -12,6 +12,7 @@ from queue import Queue
from inspect import isfunction
from PIL import Image, ImageDraw, ImageFont
def log_txt_as_img(wh, xc, size=10):
# wh a tuple of (width, height)
# xc a list of captions to plot
@@ -20,7 +21,7 @@ def log_txt_as_img(wh, xc, size=10):
for bi in range(b):
txt = Image.new("RGB", wh, color="white")
draw = ImageDraw.Draw(txt)
font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
font = ImageFont.load_default()
nc = int(40 * (wh[0] / 256))
lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
@@ -73,14 +74,14 @@ def count_params(model, verbose=False):
return total_params
def instantiate_from_config(config):
def instantiate_from_config(config, **kwargs):
if not "target" in config:
if config == '__is_first_stage__':
return None
elif config == "__is_unconditional__":
return None
raise KeyError("Expected key `target` to instantiate.")
return get_obj_from_str(config["target"])(**config.get("params", dict()))
return get_obj_from_str(config["target"])(**config.get("params", dict()), **kwargs)
def get_obj_from_str(string, reload=False):
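The **kwargs pass-through above lets callers forward extra constructor arguments (such as an embedding manager) to whatever class the config's target names, merged with the config's own params. A simplified, runnable sketch of that mechanism (it omits the __is_first_stage__/__is_unconditional__ sentinels and the reload flag):
import importlib

def get_obj_from_str(string):
    module, cls = string.rsplit(".", 1)
    return getattr(importlib.import_module(module), cls)

def instantiate_from_config(config, **kwargs):
    # config params and call-time keyword arguments are merged into one constructor call
    return get_obj_from_str(config["target"])(**config.get("params", dict()), **kwargs)

cfg = {"target": "collections.Counter", "params": {"a": 1}}
print(instantiate_from_config(cfg, b=2))   # Counter({'b': 2, 'a': 1})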

main.py
View File

@@ -2,6 +2,7 @@ import argparse, os, sys, datetime, glob, importlib, csv
import numpy as np
import time
import torch
import torchvision
import pytorch_lightning as pl
@@ -20,6 +21,22 @@ from pytorch_lightning.utilities import rank_zero_info
from ldm.data.base import Txt2ImgIterableBaseDataset
from ldm.util import instantiate_from_config
def load_model_from_config(config, ckpt, verbose=False):
print(f"Loading model from {ckpt}")
pl_sd = torch.load(ckpt, map_location="cpu")
sd = pl_sd["state_dict"]
config.model.params.ckpt_path = ckpt
model = instantiate_from_config(config.model)
m, u = model.load_state_dict(sd, strict=False)
if len(m) > 0 and verbose:
print("missing keys:")
print(m)
if len(u) > 0 and verbose:
print("unexpected keys:")
print(u)
model.cuda()
return model
def get_parser(**parser_kwargs):
def str2bool(v):
@@ -120,6 +137,23 @@ def get_parser(**parser_kwargs):
default=True,
help="scale base-lr by ngpu * batch_size * n_accumulate",
)
parser.add_argument(
"--datadir_in_name",
type=str2bool,
nargs="?",
const=True,
default=True,
help="Prepend the final directory in the data_root to the output directory name")
parser.add_argument("--actual_resume", type=str, default="", help="Path to model to actually resume from")
parser.add_argument("--data_root", type=str, required=True, help="Path to directory with training images")
parser.add_argument("--embedding_manager_ckpt", type=str, default="", help="Initialize embedding manager from a checkpoint")
parser.add_argument("--placeholder_tokens", type=str, nargs="+", default=["*"])
parser.add_argument("--init_word", type=str, help="Word to use as source for initial token embedding.")
return parser
@@ -502,6 +536,10 @@ if __name__ == "__main__":
name = "_" + cfg_name
else:
name = ""
if opt.datadir_in_name:
now = os.path.basename(os.path.normpath(opt.data_root)) + now
nowname = now + name + opt.postfix
logdir = os.path.join(opt.logdir, nowname)
@@ -532,7 +570,18 @@ if __name__ == "__main__":
lightning_config.trainer = trainer_config
# model
model = instantiate_from_config(config.model)
# config.model.params.personalization_config.params.init_word = opt.init_word
config.model.params.personalization_config.params.embedding_manager_ckpt = opt.embedding_manager_ckpt
config.model.params.personalization_config.params.placeholder_tokens = opt.placeholder_tokens
if opt.init_word:
config.model.params.personalization_config.params.initializer_words[0] = opt.init_word
if opt.actual_resume:
model = load_model_from_config(config, opt.actual_resume)
else:
model = instantiate_from_config(config.model)
# trainer and callbacks
trainer_kwargs = dict()
@@ -655,11 +704,16 @@ if __name__ == "__main__":
del callbacks_cfg['ignore_keys_callback']
trainer_kwargs["callbacks"] = [instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg]
trainer_kwargs["max_steps"] = opt.max_steps
trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs)
trainer.logdir = logdir ###
# data
config.data.params.train.params.data_root = opt.data_root
config.data.params.validation.params.data_root = opt.data_root
data = instantiate_from_config(config.data)
data = instantiate_from_config(config.data)
# NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
# calling these ourselves should not be necessary but it is.
@@ -710,8 +764,8 @@ if __name__ == "__main__":
import signal
signal.signal(signal.SIGUSR1, melk)
signal.signal(signal.SIGUSR2, divein)
signal.signal(signal.SIGTERM, melk)
signal.signal(signal.SIGTERM, divein)
# run
if opt.train:
@@ -737,5 +791,5 @@ if __name__ == "__main__":
dst = os.path.join(dst, "debug_runs", name)
os.makedirs(os.path.split(dst)[0], exist_ok=True)
os.rename(logdir, dst)
if trainer.global_rank == 0:
print(trainer.profiler.summary())
# if trainer.global_rank == 0:
# print(trainer.profiler.summary())
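Among the main.py changes above, --datadir_in_name prepends the last path component of --data_root to the run name, so training logs are grouped by the subject being learned. A small sketch of that naming step (values hypothetical; main.py derives now from the launch timestamp and name/postfix from the configs):
import os
import datetime

now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
data_root = "/data/textual-inversion/my_cat"                # hypothetical training-image folder
now = os.path.basename(os.path.normpath(data_root)) + now   # e.g. "my_cat2022-08-25T15-00-00"
nowname = now + "_custom"                                   # name + postfix as assembled in main.py
logdir = os.path.join("logs", nowname)
print(logdir)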

View File

@@ -1,6 +1,4 @@
accelerate==0.12.0
albumentations==0.4.3
clip==1.0
einops==0.3.0
huggingface-hub==0.8.1
imageio==2.9.0
@@ -9,14 +7,15 @@ kornia==0.6.0
numpy==1.19.2
omegaconf==2.1.1
opencv-python==4.1.2.30
pillow==9.2.0
pudb==2019.2
pytorch
torch==1.11.0
torchvision==0.12.0
pytorch-lightning==1.4.2
streamlit==1.12.0
test-tube>=0.7.5
torch-fidelity==0.3.0
torchmetrics==0.6.0
torchvision
transformers==4.19.2
-e git+https://github.com/openai/CLIP.git@main#egg=clip
-e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers

View File

@@ -6,6 +6,7 @@ import shlex
import atexit
import os
import sys
import copy
from PIL import Image,PngImagePlugin
# readline unavailable on windows systems
@@ -59,7 +60,9 @@ def main():
weights=weights,
full_precision=opt.full_precision,
config=config,
latent_diffusion_weights=opt.laion400m # this is solely for recreating the prompt
latent_diffusion_weights=opt.laion400m, # this is solely for recreating the prompt
embedding_path=opt.embedding_path,
device=opt.device
)
# make sure the output directory exists
@@ -78,8 +81,7 @@ def main():
exit(-1)
# preload the model
if not debugging:
t2i.load_model()
t2i.load_model()
print("\n* Initialization done! Awaiting your command (-h for help, 'q' to quit, 'cd' to change output dir, 'pwd' to print output dir)...")
log_path = os.path.join(opt.outdir,'dream_log.txt')
@@ -109,6 +111,10 @@ def main_loop(t2i,parser,log,infile):
if command.startswith(('#','//')):
continue
# before splitting, escape single quotes so as not to mess
# up the parser
command = command.replace("'","\\'")
try:
elements = shlex.split(command)
except ValueError as e:
@@ -164,13 +170,41 @@ def main_loop(t2i,parser,log,infile):
if opt.init_img is None:
results = t2i.txt2img(**vars(opt))
else:
assert os.path.exists(opt.init_img),f"No file found at {opt.init_img}. On Linux systems, pressing <tab> after -I will autocomplete a list of possible image files."
if None not in (opt.width,opt.height):
print('Warning: width and height options are ignored when modifying an init image')
results = t2i.img2img(**vars(opt))
except AssertionError as e:
print(e)
continue
allVariantResults = []
if opt.variants is not None:
print(f"Generating {opt.variants} variant(s)...")
newopt = copy.deepcopy(opt)
newopt.iterations = 1
newopt.variants = None
for r in results:
newopt.init_img = r[0]
print(f"\t generating variant for {newopt.init_img}")
for j in range(0, opt.variants):
try:
variantResults = t2i.img2img(**vars(newopt))
allVariantResults.append([newopt,variantResults])
except AssertionError as e:
print(e)
continue
print(f"{opt.variants} Variants generated!")
print("Outputs:")
write_log_message(t2i,opt,results,log)
if allVariantResults:
print("Variant outputs:")
for vr in allVariantResults:
write_log_message(t2i,vr[0],vr[1],log)
print("goodbye!")
@@ -226,6 +260,9 @@ def _reconstruct_switches(t2i,opt):
switches.append(f'-W{opt.width or t2i.width}')
switches.append(f'-H{opt.height or t2i.height}')
switches.append(f'-C{opt.cfg_scale or t2i.cfg_scale}')
switches.append(f'-m{t2i.sampler_name}')
if opt.variants:
switches.append(f'-v{opt.variants}')
if opt.init_img:
switches.append(f'-I{opt.init_img}')
if opt.strength and opt.init_img is not None:
@@ -266,14 +303,22 @@ def create_argv_parser():
help="number of images to produce per iteration (faster, but doesn't generate individual seeds")
parser.add_argument('--sampler','-m',
dest="sampler_name",
choices=['plms','ddim', 'klms'],
default='klms',
help="which sampler to use (klms) - can only be set on command line")
choices=['ddim', 'k_dpm_2_a', 'k_dpm_2', 'k_euler_a', 'k_euler', 'k_heun', 'k_lms', 'plms'],
default='k_lms',
help="which sampler to use (k_lms) - can only be set on command line")
parser.add_argument('--outdir',
'-o',
type=str,
default="outputs/img-samples",
help="directory in which to place generated images and a log of prompts and seeds")
parser.add_argument('--embedding_path',
type=str,
help="Path to a pre-trained embedding manager checkpoint - can only be set on command line")
parser.add_argument('--device',
'-d',
type=str,
default="cuda",
help="device to run stable diffusion on. defaults to cuda `torch.cuda.current_device()` if avalible")
return parser
@@ -289,8 +334,9 @@ def create_cmd_parser():
parser.add_argument('-C','--cfg_scale',default=7.5,type=float,help="prompt configuration scale")
parser.add_argument('-g','--grid',action='store_true',help="generate a grid")
parser.add_argument('-i','--individual',action='store_true',help="generate individual files (default)")
parser.add_argument('-I','--init_img',type=str,help="path to input image (supersedes width and height)")
parser.add_argument('-I','--init_img',type=str,help="path to input image for img2img mode (supersedes width and height)")
parser.add_argument('-f','--strength',default=0.75,type=float,help="strength for noising/unnoising. 0.0 preserves image exactly, 1.0 replaces it completely")
parser.add_argument('-v','--variants',type=int,help="in img2img mode, the first generated image will get passed back to img2img to generate the requested number of variants")
parser.add_argument('-x','--skip_normalize',action='store_true',help="skip subprompt weight normalization")
return parser
@@ -299,7 +345,7 @@ if readline_available:
readline.set_completer(Completer(['cd','pwd',
'--steps','-s','--seed','-S','--iterations','-n','--batch_size','-b',
'--width','-W','--height','-H','--cfg_scale','-C','--grid','-g',
'--individual','-i','--init_img','-I','--strength','-f']).complete)
'--individual','-i','--init_img','-I','--strength','-f','-v','--variants']).complete)
readline.set_completer_delims(" ")
readline.parse_and_bind('tab: complete')
load_history()
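The -v/--variants handling added above reruns each freshly generated image through img2img to produce variations. A condensed sketch of the same flow driven directly from the T2I API rather than the dream> prompt (prompt and strength hypothetical):
from ldm.simplet2i import T2I

t2i = T2I(sampler_name='k_lms')
t2i.load_model()
prompt = 'an astronaut riding a horse'
results = t2i.txt2img(prompt, iterations=2)       # [[filename, seed], ...]
variants = []
for filename, seed in results:
    # each result becomes the init image for a single img2img pass, mirroring the -v loop
    variants.append(t2i.img2img(prompt, init_img=filename, strength=0.75, iterations=1))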

scripts/dream_web.py Normal file
View File

@@ -0,0 +1,108 @@
import json
import base64
import mimetypes
import os
from pytorch_lightning import logging
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
print("Loading model...")
from ldm.simplet2i import T2I
model = T2I(sampler_name='k_lms')
# to get rid of annoying warning messages from pytorch
import transformers
transformers.logging.set_verbosity_error()
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
print("Initializing model, be patient...")
model.load_model()
class DreamServer(BaseHTTPRequestHandler):
def do_GET(self):
if self.path == "/":
self.send_response(200)
self.send_header("Content-type", "text/html")
self.end_headers()
with open("./static/dream_web/index.html", "rb") as content:
self.wfile.write(content.read())
elif os.path.exists("." + self.path):
mime_type = mimetypes.guess_type(self.path)[0]
if mime_type is not None:
self.send_response(200)
self.send_header("Content-type", mime_type)
self.end_headers()
with open("." + self.path, "rb") as content:
self.wfile.write(content.read())
else:
self.send_response(404)
else:
self.send_response(404)
def do_POST(self):
self.send_response(200)
self.send_header("Content-type", "application/json")
self.end_headers()
content_length = int(self.headers['Content-Length'])
post_data = json.loads(self.rfile.read(content_length))
prompt = post_data['prompt']
initimg = post_data['initimg']
iterations = int(post_data['iterations'])
steps = int(post_data['steps'])
width = int(post_data['width'])
height = int(post_data['height'])
cfgscale = float(post_data['cfgscale'])
seed = None if int(post_data['seed']) == -1 else int(post_data['seed'])
print(f"Request to generate with prompt: {prompt}")
outputs = []
if initimg is None:
# Run txt2img
outputs = model.txt2img(prompt,
iterations=iterations,
cfg_scale = cfgscale,
width = width,
height = height,
seed = seed,
steps = steps)
else:
# Decode initimg as base64 to temp file
with open("./img2img-tmp.png", "wb") as f:
initimg = initimg.split(",")[1] # Ignore mime type
f.write(base64.b64decode(initimg))
# Run img2img
outputs = model.img2img(prompt,
init_img = "./img2img-tmp.png",
iterations = iterations,
cfg_scale = cfgscale,
seed = seed,
steps = steps)
# Remove the temp file
os.remove("./img2img-tmp.png")
print(f"Prompt generated with output: {outputs}")
post_data['initimg'] = '' # Don't send init image back
outputs = [x + [post_data] for x in outputs] # Append config to each output
result = {'outputs': outputs}
self.wfile.write(bytes(json.dumps(result), "utf-8"))
if __name__ == "__main__":
# Change working directory to the stable-diffusion directory
os.chdir(
os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..'))
)
# Start server
dream_server = ThreadingHTTPServer(("0.0.0.0", 9090), DreamServer)
print("\n\n* Started Stable Diffusion dream server! Point your browser at http://localhost:9090 or use the host's DNS name or IP address. *")
try:
dream_server.serve_forever()
except KeyboardInterrupt:
pass
dream_server.server_close()
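The handler above accepts a JSON POST with the same field names as the web form and answers with {"outputs": [[filename, seed, config], ...]}. A minimal command-line client sketch, assuming the server is already running on localhost:9090:
import json
import urllib.request

payload = {
    "prompt": "a lighthouse at dusk",   # hypothetical prompt
    "initimg": None,                    # or a base64 data URL to trigger img2img
    "iterations": 1, "steps": 50,
    "width": 512, "height": 512,
    "cfgscale": 7.5, "seed": -1,        # -1 means "pick a random seed"
}
req = urllib.request.Request("http://localhost:9090/",
                             data=json.dumps(payload).encode("utf-8"),
                             method="POST")
with urllib.request.urlopen(req) as resp:
    for filename, seed, config in json.load(resp)["outputs"]:
        print(filename, seed)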

View File

@@ -0,0 +1,83 @@
from ldm.modules.encoders.modules import BERTTokenizer
from ldm.modules.embedding_manager import EmbeddingManager
import argparse, os
from functools import partial
import torch
def get_placeholder_loop(placeholder_string, tokenizer):
new_placeholder = None
while True:
if new_placeholder is None:
new_placeholder = input(f"Placeholder string {placeholder_string} was already used. Please enter a replacement string: ")
else:
new_placeholder = input(f"Placeholder string '{new_placeholder}' maps to more than a single token. Please enter another string: ")
token = tokenizer(new_placeholder)
if torch.count_nonzero(token) == 3:
return new_placeholder, token[0, 1]
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--manager_ckpts",
type=str,
nargs="+",
required=True,
help="Paths to a set of embedding managers to be merged."
)
parser.add_argument(
"--output_path",
type=str,
required=True,
help="Output path for the merged manager",
)
args = parser.parse_args()
tokenizer = BERTTokenizer(vq_interface=False, max_length=77)
EmbeddingManager = partial(EmbeddingManager, tokenizer, ["*"])
string_to_token_dict = {}
string_to_param_dict = torch.nn.ParameterDict()
placeholder_to_src = {}
for manager_ckpt in args.manager_ckpts:
print(f"Parsing {manager_ckpt}...")
manager = EmbeddingManager()
manager.load(manager_ckpt)
for placeholder_string in manager.string_to_token_dict:
if not placeholder_string in string_to_token_dict:
string_to_token_dict[placeholder_string] = manager.string_to_token_dict[placeholder_string]
string_to_param_dict[placeholder_string] = manager.string_to_param_dict[placeholder_string]
placeholder_to_src[placeholder_string] = manager_ckpt
else:
new_placeholder, new_token = get_placeholder_loop(placeholder_string, tokenizer)
string_to_token_dict[new_placeholder] = new_token
string_to_param_dict[new_placeholder] = manager.string_to_param_dict[placeholder_string]
placeholder_to_src[new_placeholder] = manager_ckpt
print("Saving combined manager...")
merged_manager = EmbeddingManager()
merged_manager.string_to_param_dict = string_to_param_dict
merged_manager.string_to_token_dict = string_to_token_dict
merged_manager.save(args.output_path)
print("Managers merged. Final list of placeholders: ")
print(placeholder_to_src)
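get_placeholder_loop above accepts a replacement string only when the tokenizer maps it to a single token: with zero-padding, that leaves exactly three non-zero entries (start token, the word itself, end token), and token[0, 1] is the usable id. A small sketch of that check in isolation (assuming the same BERTTokenizer interface this script imports):
import torch
from ldm.modules.encoders.modules import BERTTokenizer

tokenizer = BERTTokenizer(vq_interface=False, max_length=77)
tokens = tokenizer("sks")                      # hypothetical candidate placeholder; shape [1, 77]
if torch.count_nonzero(tokens) == 3:           # start + single word token + end
    placeholder_id = tokens[0, 1]              # the one non-special token id
    print("usable placeholder id:", placeholder_id)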

View File

@@ -12,7 +12,6 @@ from pytorch_lightning import seed_everything
from torch import autocast
from contextlib import contextmanager, nullcontext
import accelerate
import k_diffusion as K
import torch.nn as nn
@@ -201,8 +200,6 @@ def main():
#for klms
model_wrap = K.external.CompVisDenoiser(model)
accelerator = accelerate.Accelerator()
device = accelerator.device
class CFGDenoiser(nn.Module):
def __init__(self, model):
super().__init__()
@@ -251,8 +248,8 @@ def main():
with model.ema_scope():
tic = time.time()
all_samples = list()
for n in trange(opt.n_iter, desc="Sampling", disable =not accelerator.is_main_process):
for prompts in tqdm(data, desc="data", disable =not accelerator.is_main_process):
for n in trange(opt.n_iter, desc="Sampling"):
for prompts in tqdm(data, desc="data"):
uc = None
if opt.scale != 1.0:
uc = model.get_learned_conditioning(batch_size * [""])
@@ -279,13 +276,10 @@ def main():
x = torch.randn([opt.n_samples, *shape], device=device) * sigmas[0] # for GPU draw
model_wrap_cfg = CFGDenoiser(model_wrap)
extra_args = {'cond': c, 'uncond': uc, 'cond_scale': opt.scale}
samples_ddim = K.sampling.sample_lms(model_wrap_cfg, x, sigmas, extra_args=extra_args, disable=not accelerator.is_main_process)
samples_ddim = K.sampling.sample_lms(model_wrap_cfg, x, sigmas, extra_args=extra_args)
x_samples_ddim = model.decode_first_stage(samples_ddim)
x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
if opt.klms:
x_sample = accelerator.gather(x_samples_ddim)
if not opt.skip_save:
for x_sample in x_samples_ddim:
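(The hunk above is truncated by the diff view.) With accelerate removed, classifier-free guidance in this k-diffusion path still happens inside the CFGDenoiser wrapper whose __init__ appears near the top of the hunk. A sketch of the usual forward for such a wrapper, consistent with the extra_args dict shown (cond, uncond, cond_scale); treat it as illustrative rather than a verbatim copy of the file:
import torch
import torch.nn as nn

class CFGDenoiser(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.inner_model = model

    def forward(self, x, sigma, uncond, cond, cond_scale):
        # run the unconditional and conditional denoising passes in one batched call
        x_in = torch.cat([x] * 2)
        sigma_in = torch.cat([sigma] * 2)
        cond_in = torch.cat([uncond, cond])
        uncond_out, cond_out = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2)
        # classifier-free guidance: move the prediction away from the unconditional branch
        return uncond_out + (cond_out - uncond_out) * cond_scale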

src/k-diffusion Submodule

Submodule src/k-diffusion added at ef1bf07627

View File

@@ -0,0 +1,61 @@
* {
font-family: 'Arial';
}
#header {
text-decoration: dotted underline;
}
#search {
margin-top: 20vh;
margin-left: auto;
margin-right: auto;
max-width: 800px;
text-align: center;
}
fieldset {
border: none;
}
#fieldset-search {
display: flex;
}
#prompt {
flex-grow: 1;
border-radius: 20px 0px 0px 20px;
padding: 5px 10px 5px 10px;
border: 1px solid black;
border-right: none;
outline: none;
}
#submit {
border-radius: 0px 20px 20px 0px;
padding: 5px 10px 5px 10px;
border: 1px solid black;
}
#results {
text-align: center;
max-width: 1000px;
margin: auto;
padding-top: 10px;
}
img {
cursor: pointer;
height: 30vh;
border-radius: 5px;
margin: 10px;
}
#fieldset-config {
line-height:2em;
}
input[type="number"] {
width: 60px;
}
#seed {
width: 150px;
}
hr {
width: 200px;
}
label {
white-space: nowrap;
}

View File

@@ -0,0 +1,48 @@
<html>
<head>
<title>Stable Diffusion Dream Server</title>
<link rel="icon" href="data:,">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="static/dream_web/index.css">
<script src="static/dream_web/index.js"></script>
</head>
<body>
<div id="search">
<h2 id="header">Stable Diffusion Dream Server</h2>
<form id="generate-form" method="post" action="#">
<fieldset id="fieldset-search">
<input type="text" id="prompt" name="prompt">
<input type="submit" id="submit" value="Generate">
</fieldset>
<fieldset id="fieldset-config">
<label for="iterations">Images to generate:</label>
<input value="1" type="number" id="iterations" name="iterations">
<label for="steps">Steps:</label>
<input value="50" type="number" id="steps" name="steps">
<label for="cfgscale">Cfg Scale:</label>
<input value="7.5" type="number" id="cfgscale" name="cfgscale" step="any">
<span>&bull;</span>
<label title="Set to multiple of 64" for="width">Width:</label>
<input value="512" type="number" id="width" name="width">
<label title="Set to multiple of 64" for="height">Height:</label>
<input value="512" type="number" id="height" name="height">
<br>
<label title="Upload an image to use img2img" for="initimg">Img2Img Init:</label>
<input type="file" id="initimg" name="initimg" accept=".jpg, .jpeg, .png">
<label title="Set to -1 for random seed" for="seed">Seed:</label>
<input value="-1" type="number" id="seed" name="seed">
<button type="button" id="reset">&olarr;</button>
</fieldset>
</form>
<div id="about">For news and support for this web service, visit our <a href="http://github.com/lstein/stable-diffusion">GitHub site</a></div>
</div>
<hr>
<div id="results">
<div id="no-results-message">
<i><p>No results...</p></i>
</div>
</div>
</body>
</html>

static/dream_web/index.js Normal file
View File

@@ -0,0 +1,101 @@
function toBase64(file) {
return new Promise((resolve, reject) => {
const r = new FileReader();
r.readAsDataURL(file);
r.onload = () => resolve(r.result);
r.onerror = (error) => reject(error);
});
}
function appendOutput(output) {
let outputNode = document.createElement("img");
outputNode.src = output[0];
let outputConfig = output[2];
let altText = output[1].toString() + " | " + outputConfig.prompt;
outputNode.alt = altText;
outputNode.title = altText;
// Reload image config
outputNode.addEventListener('click', () => {
let form = document.querySelector("#generate-form");
for (const [k, v] of new FormData(form)) {
form.querySelector(`*[name=${k}]`).value = outputConfig[k];
}
document.querySelector("#seed").value = output[1];
saveFields(document.querySelector("#generate-form"));
});
document.querySelector("#results").prepend(outputNode);
}
function appendOutputs(outputs) {
for (const output of outputs) {
appendOutput(output);
}
}
function saveFields(form) {
for (const [k, v] of new FormData(form)) {
if (typeof v !== 'object') { // Don't save 'file' type
localStorage.setItem(k, v);
}
}
}
function loadFields(form) {
for (const [k, v] of new FormData(form)) {
const item = localStorage.getItem(k);
if (item != null) {
form.querySelector(`*[name=${k}]`).value = item;
}
}
}
async function generateSubmit(form) {
const prompt = document.querySelector("#prompt").value;
// Convert file data to base64
let formData = Object.fromEntries(new FormData(form));
formData.initimg = formData.initimg.name !== '' ? await toBase64(formData.initimg) : null;
// Post as JSON
fetch(form.action, {
method: form.method,
body: JSON.stringify(formData),
}).then(async (result) => {
let data = await result.json();
// Re-enable form, remove no-results-message
form.querySelector('fieldset').removeAttribute('disabled');
document.querySelector("#prompt").value = prompt;
if (data.outputs.length != 0) {
document.querySelector("#no-results-message")?.remove();
appendOutputs(data.outputs);
} else {
alert("Error occurred while generating.");
}
});
// Disable form while generating
form.querySelector('fieldset').setAttribute('disabled','');
document.querySelector("#prompt").value = `Generating: "${prompt}"`;
}
window.onload = () => {
document.querySelector("#generate-form").addEventListener('submit', (e) => {
e.preventDefault();
const form = e.target;
generateSubmit(form);
});
document.querySelector("#generate-form").addEventListener('change', (e) => {
saveFields(e.target.form);
});
document.querySelector("#reset").addEventListener('click', (e) => {
document.querySelector("#seed").value = -1;
saveFields(e.target.form);
});
loadFields(document.querySelector("#generate-form"));
};

static/dream_web_server.png: new binary image file (536 KiB; not shown).