Compare commits
360 Commits
release-1.
...
release-1.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2882c2d0a6 | ||
|
|
4ffdf73412 | ||
|
|
9130ad7e08 | ||
|
|
d66010410c | ||
|
|
6566c2298c | ||
|
|
063b4a1995 | ||
|
|
18cdb556bd | ||
|
|
8d16a69b80 | ||
|
|
a406b588b4 | ||
|
|
5454a0edc2 | ||
|
|
fe5cc79249 | ||
|
|
361cc42829 | ||
|
|
91cce6b4c3 | ||
|
|
d0df894c9f | ||
|
|
f46916d521 | ||
|
|
12755c6ef6 | ||
|
|
cc4f33bf3a | ||
|
|
d8c0d020eb | ||
|
|
e918cb1a8a | ||
|
|
0163310a47 | ||
|
|
423d25716d | ||
|
|
1d999ba974 | ||
|
|
27d4bb5624 | ||
|
|
c78b496da6 | ||
|
|
dd2af3f93c | ||
|
|
2d65b03f05 | ||
|
|
2288412ef2 | ||
|
|
6bff985496 | ||
|
|
918ade12ed | ||
|
|
68f62c8352 | ||
|
|
33936430d0 | ||
|
|
81b3de9c65 | ||
|
|
ad6cf6f2f7 | ||
|
|
ecef72ca39 | ||
|
|
92d1ed744a | ||
|
|
da4bf95fbc | ||
|
|
d43c5c01e3 | ||
|
|
51278c7a10 | ||
|
|
6ef7c1ad4e | ||
|
|
33cc16473f | ||
|
|
1701c2ea94 | ||
|
|
2e299a1daf | ||
|
|
0b582a40d0 | ||
|
|
1306457b27 | ||
|
|
f4a19af04f | ||
|
|
58545ba057 | ||
|
|
4fe265735a | ||
|
|
2b7f32502c | ||
|
|
3ee82d8a3b | ||
|
|
629ca09fda | ||
|
|
833de06299 | ||
|
|
68eabab2af | ||
|
|
a4f69e62d7 | ||
|
|
7db51d0171 | ||
|
|
1b3c7acce3 | ||
|
|
e6b2c15fc5 | ||
|
|
d319b8a762 | ||
|
|
db580ccefd | ||
|
|
9e99fcbc16 | ||
|
|
346c9b66ec | ||
|
|
a52870684a | ||
|
|
2455bb38a4 | ||
|
|
01e05a98de | ||
|
|
2cac4697aa | ||
|
|
c5e95adb49 | ||
|
|
91565970c2 | ||
|
|
09bd9fa47e | ||
|
|
dc30adfbb4 | ||
|
|
fa98601bfb | ||
|
|
66fe110148 | ||
|
|
bf50ab9dd6 | ||
|
|
70119602a0 | ||
|
|
28fe84177e | ||
|
|
35d3f0ed90 | ||
|
|
0433b3d625 | ||
|
|
4b560b50c2 | ||
|
|
9ad79207c2 | ||
|
|
0be2351c97 | ||
|
|
ed513397b2 | ||
|
|
c52ba1b022 | ||
|
|
d022d0dd11 | ||
|
|
a14fd69a5a | ||
|
|
0d2e6f90c8 | ||
|
|
58e3562652 | ||
|
|
b622819051 | ||
|
|
a547c33327 | ||
|
|
31b77dbaf8 | ||
|
|
4280788c18 | ||
|
|
146e75a1de | ||
|
|
8a2b849620 | ||
|
|
462a1961e4 | ||
|
|
84c10346fb | ||
|
|
2aa8393272 | ||
|
|
c83d01b369 | ||
|
|
5354122094 | ||
|
|
64444025a9 | ||
|
|
d566ee092a | ||
|
|
b983d61e93 | ||
|
|
153c93bdd4 | ||
|
|
3be1cee17c | ||
|
|
bdb0651eb2 | ||
|
|
1480ef84dc | ||
|
|
1714816fe2 | ||
|
|
b5565d2c82 | ||
|
|
4fad71cd8c | ||
|
|
d126db2413 | ||
|
|
7811d20f21 | ||
|
|
d524e5797d | ||
|
|
8ca4d6542d | ||
|
|
a51e18ea98 | ||
|
|
8bf321f6ae | ||
|
|
5d13207aa6 | ||
|
|
dae2b26765 | ||
|
|
713b2a03dc | ||
|
|
186d0f9d10 | ||
|
|
55b448818e | ||
|
|
b4babf7680 | ||
|
|
85f32752fe | ||
|
|
b757384aba | ||
|
|
a5d21d7c94 | ||
|
|
8f3520e2d5 | ||
|
|
19e4298cf9 | ||
|
|
42ffcd7204 | ||
|
|
d48299e56c | ||
|
|
2e22d9ecf1 | ||
|
|
18597ad1d9 | ||
|
|
0173d3a8fc | ||
|
|
e7658b941e | ||
|
|
a7a62d39d4 | ||
|
|
24ce56b3db | ||
|
|
3220f73f0a | ||
|
|
27a1044e65 | ||
|
|
39c56f20be | ||
|
|
f6b2ec61b2 | ||
|
|
e57d6fd1a6 | ||
|
|
1b40a31a89 | ||
|
|
4fce1063c4 | ||
|
|
f9862a3d88 | ||
|
|
81ad239197 | ||
|
|
ed38c97ed8 | ||
|
|
4f8e7356b3 | ||
|
|
c363f033e8 | ||
|
|
22c25b3615 | ||
|
|
7fe7cdc8c9 | ||
|
|
e26fee78b5 | ||
|
|
63178c6a8c | ||
|
|
6fb2f1ed6e | ||
|
|
38701a6d7b | ||
|
|
31fa92a83f | ||
|
|
0abfc3cac6 | ||
|
|
d483fcb53a | ||
|
|
c7db038c96 | ||
|
|
132d23e55d | ||
|
|
90cbc6362c | ||
|
|
f33ae1bdf4 | ||
|
|
754525be82 | ||
|
|
d9eab7f383 | ||
|
|
f695988915 | ||
|
|
5d19294810 | ||
|
|
77803cf233 | ||
|
|
4acfb76be6 | ||
|
|
fd13526454 | ||
|
|
7718af041c | ||
|
|
30dbf0e589 | ||
|
|
070795a3b4 | ||
|
|
e351d6ffe5 | ||
|
|
46464ac677 | ||
|
|
03d8eb19e0 | ||
|
|
fef632e0e1 | ||
|
|
05061a70b3 | ||
|
|
617a029ae7 | ||
|
|
7ae79b350e | ||
|
|
9a8cd9684e | ||
|
|
18899be4ae | ||
|
|
3ea505bc2d | ||
|
|
e2ae6d288d | ||
|
|
41b26e0520 | ||
|
|
b6053108c1 | ||
|
|
22365a3f12 | ||
|
|
594c0eeb8c | ||
|
|
529040708b | ||
|
|
f0e2fa781f | ||
|
|
87b7446228 | ||
|
|
8a517fdc17 | ||
|
|
373a2d9c32 | ||
|
|
1f8bc9482a | ||
|
|
b85773f332 | ||
|
|
ddc0e9b4d8 | ||
|
|
44a48d0981 | ||
|
|
8bbe7936bd | ||
|
|
9e7865704a | ||
|
|
ac02a775e4 | ||
|
|
7c485a1a4a | ||
|
|
36bc989a27 | ||
|
|
ea2ee33be8 | ||
|
|
5d67986997 | ||
|
|
7dfca3dcb5 | ||
|
|
e0de42bd03 | ||
|
|
614974a8e8 | ||
|
|
6e49c070bb | ||
|
|
08a9702b73 | ||
|
|
042a9043d1 | ||
|
|
a7ac93a899 | ||
|
|
3b2569ebdd | ||
|
|
8b9a520c5c | ||
|
|
ba03289c14 | ||
|
|
d1551b1bd4 | ||
|
|
fab9e1a423 | ||
|
|
59be6c815d | ||
|
|
ff6c11406b | ||
|
|
6f90c7daf6 | ||
|
|
38ed6393fa | ||
|
|
a5a3300fc6 | ||
|
|
0ab03a5fde | ||
|
|
800132970e | ||
|
|
555f13e469 | ||
|
|
9b5101cd8d | ||
|
|
7040995ceb | ||
|
|
5129f256a3 | ||
|
|
b0b4ccf521 | ||
|
|
ed72ff3268 | ||
|
|
89805a5239 | ||
|
|
e00397f9ca | ||
|
|
12f59e1daa | ||
|
|
cf750f62db | ||
|
|
0f28663805 | ||
|
|
f3fad22cb6 | ||
|
|
7bf0bc5208 | ||
|
|
4e5aa7e714 | ||
|
|
46a223f229 | ||
|
|
eb9f0be91a | ||
|
|
4f02b72c9c | ||
|
|
dd670200bb | ||
|
|
8f89a2456a | ||
|
|
407d70a987 | ||
|
|
f1ffb5b51b | ||
|
|
4f1664ec4f | ||
|
|
fcdd95b652 | ||
|
|
470a62dbbe | ||
|
|
2c08cf7175 | ||
|
|
539c15966d | ||
|
|
5f844807cb | ||
|
|
cb86b9ae6e | ||
|
|
3a30a8f2d2 | ||
|
|
60ed004328 | ||
|
|
dbb9132f4d | ||
|
|
5711b6d611 | ||
|
|
f1bed52530 | ||
|
|
23fb4a72bb | ||
|
|
c38b6964b4 | ||
|
|
e202441f0c | ||
|
|
d051d86df6 | ||
|
|
b49475a54f | ||
|
|
797de3257c | ||
|
|
31b22e057d | ||
|
|
078859207d | ||
|
|
a10baf5808 | ||
|
|
0eba55ddbc | ||
|
|
19fa222810 | ||
|
|
b3e3b0e861 | ||
|
|
dde2994d10 | ||
|
|
888ca39ce2 | ||
|
|
f4c95bfec0 | ||
|
|
91d3e4605e | ||
|
|
652c67c90e | ||
|
|
2114c386ad | ||
|
|
6d2b4cbda1 | ||
|
|
562831fc4b | ||
|
|
d04518e65e | ||
|
|
d598b6c79d | ||
|
|
4ec21a5423 | ||
|
|
b64c902354 | ||
|
|
2ada3288e7 | ||
|
|
91966e9ffa | ||
|
|
2ad73246f9 | ||
|
|
d3a802db69 | ||
|
|
b95908daec | ||
|
|
79add5f0b6 | ||
|
|
650ae3eb13 | ||
|
|
0e3059728c | ||
|
|
b7735b3788 | ||
|
|
39b55ae016 | ||
|
|
e82c5eba18 | ||
|
|
1c8ecacddf | ||
|
|
26dc05e0e0 | ||
|
|
49247b4aa4 | ||
|
|
eb58276a2c | ||
|
|
72a9d75330 | ||
|
|
1a7743f3c2 | ||
|
|
0b4459b707 | ||
|
|
c521ac08ee | ||
|
|
29727f3e12 | ||
|
|
51b9a1d8d3 | ||
|
|
ab131cb55e | ||
|
|
269fcf92d9 | ||
|
|
8b682ac83b | ||
|
|
36e4130f1c | ||
|
|
b978536385 | ||
|
|
0a7fe6f2d9 | ||
|
|
b12955c963 | ||
|
|
9133087850 | ||
|
|
25fa0ad1f2 | ||
|
|
df9f088eb4 | ||
|
|
b1600d4ca3 | ||
|
|
0efc3bf780 | ||
|
|
dd16fe16bb | ||
|
|
4d72644db4 | ||
|
|
7ea168227c | ||
|
|
ef8ddffe46 | ||
|
|
81cbcb919e | ||
|
|
1eec6b776b | ||
|
|
776c747978 | ||
|
|
caf4dd4155 | ||
|
|
ee10021ea2 | ||
|
|
ca82acfd3b | ||
|
|
feea5fb063 | ||
|
|
b5cdbd3b0b | ||
|
|
e043f238af | ||
|
|
47a5da25b7 | ||
|
|
f55f4d7156 | ||
|
|
5055e9e1d5 | ||
|
|
c6b5e930dc | ||
|
|
d33e1bf563 | ||
|
|
923466387f | ||
|
|
56f7b0f434 | ||
|
|
c24a16ccb0 | ||
|
|
ab8ee9bbb6 | ||
|
|
37609d6e53 | ||
|
|
fb9b845fda | ||
|
|
9050ce152b | ||
|
|
73901a2777 | ||
|
|
decd1a58d2 | ||
|
|
7f4a5e946d | ||
|
|
4bc64a6aff | ||
|
|
02cf5879a1 | ||
|
|
d495bac307 | ||
|
|
3393b8cad1 | ||
|
|
886f1c0138 | ||
|
|
9588444f0e | ||
|
|
24b11ecf9f | ||
|
|
84989f0d05 | ||
|
|
a93a79568d | ||
|
|
7081a84600 | ||
|
|
1df1e5c38b | ||
|
|
5a513426bd | ||
|
|
611ccb991e | ||
|
|
bde956647f | ||
|
|
8952196bbf | ||
|
|
050dffd269 | ||
|
|
0cdf5e61b0 | ||
|
|
de1cea92ce | ||
|
|
3a58988e4a | ||
|
|
7a67d3d837 | ||
|
|
9050f3d399 | ||
|
|
a21156e3e3 | ||
|
|
716dbbdf8c | ||
|
|
1f2e52a1d6 | ||
|
|
dc788f92b3 | ||
|
|
13774912f4 | ||
|
|
cb9e6d544a |
32
.dev_scripts/diff_images.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import argparse
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
|
||||
# Open the image file at image_path with PIL and return its pixel data as a
# NumPy array cast to int16.
# NOTE(review): int16 is presumably chosen so that the subtraction in
# calc_images_mean_L1 can go negative instead of wrapping, as it would for an
# unsigned 8-bit array — confirm the inputs are 8-bit images.
def read_image_int16(image_path):
|
||||
image = Image.open(image_path)
|
||||
return np.array(image).astype(np.int16)
|
||||
|
||||
|
||||
# Return the mean absolute element-wise difference (mean L1 distance) between
# the images stored at image1_path and image2_path.
# Both files are loaded through read_image_int16; an AssertionError is raised
# if the two resulting arrays do not have the same shape.
def calc_images_mean_L1(image1_path, image2_path):
|
||||
image1 = read_image_int16(image1_path)
|
||||
image2 = read_image_int16(image2_path)
|
||||
assert image1.shape == image2.shape
|
||||
|
||||
mean_L1 = np.abs(image1 - image2).mean()
|
||||
return mean_L1
|
||||
|
||||
|
||||
# Parse the command line and return the argparse namespace.
# Two required positional arguments are defined: image1_path and image2_path.
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('image1_path')
|
||||
parser.add_argument('image2_path')
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
# Script entry point: read the two image paths from the command line and print
# their mean L1 difference to stdout. No threshold or exit status is derived
# from the value here; interpreting it is left to the caller.
if __name__ == '__main__':
|
||||
args = parse_args()
|
||||
mean_L1 = calc_images_mean_L1(args.image1_path, args.image2_path)
|
||||
print(mean_L1)
|
||||
|
After Width: | Height: | Size: 416 KiB |
1
.dev_scripts/sample_command.txt
Normal file
@@ -0,0 +1 @@
|
||||
"a photograph of an astronaut riding a horse" -s50 -S42
|
||||
20
.dev_scripts/test_regression_txt2img_dream_v1_4.sh
Normal file
@@ -0,0 +1,20 @@
|
||||
# Regression check for scripts/dream.py: generate an image from the command
# stored in PROMPT_FILE, then print its mean L1 difference against a stored
# reference image. The script only prints the value; it applies no pass/fail
# threshold.
# Step 1: generate an image with the PLMS sampler at full precision.
|
||||
PROMPT_FILE=".dev_scripts/sample_command.txt"
|
||||
OUT_DIR="outputs/img-samples/test_regression_txt2img_v1_4"
|
||||
SAMPLES_DIR=${OUT_DIR}
|
||||
python scripts/dream.py \
|
||||
--from_file ${PROMPT_FILE} \
|
||||
--outdir ${OUT_DIR} \
|
||||
--sampler plms \
|
||||
--full_precision
|
||||
|
||||
# Step 2: reference image, the original output by CompVis/stable-diffusion
|
||||
IMAGE1=".dev_scripts/images/v1_4_astronaut_rides_horse_plms_step50_seed42.png"
|
||||
# new output: the .png in SAMPLES_DIR whose name sorts last
|
||||
IMAGE2=`ls -A ${SAMPLES_DIR}/*.png | sort | tail -n 1`
|
||||
|
||||
echo ""
|
||||
echo "comparing the following two images"
|
||||
echo "IMAGE1: ${IMAGE1}"
|
||||
echo "IMAGE2: ${IMAGE2}"
|
||||
python .dev_scripts/diff_images.py ${IMAGE1} ${IMAGE2}
|
||||
23
.dev_scripts/test_regression_txt2img_v1_4.sh
Normal file
@@ -0,0 +1,23 @@
|
||||
# Regression check for scripts/orig_scripts/txt2img.py: generate one image for
# a fixed prompt, then print its mean L1 difference against a stored reference
# image. The script only prints the value; it applies no pass/fail threshold.
# Step 1: generate an image (PLMS, 50 steps, one sample, one iteration, seed 42).
|
||||
PROMPT="a photograph of an astronaut riding a horse"
|
||||
OUT_DIR="outputs/txt2img-samples/test_regression_txt2img_v1_4"
|
||||
SAMPLES_DIR="outputs/txt2img-samples/test_regression_txt2img_v1_4/samples"
|
||||
python scripts/orig_scripts/txt2img.py \
|
||||
--prompt "${PROMPT}" \
|
||||
--outdir ${OUT_DIR} \
|
||||
--plms \
|
||||
--ddim_steps 50 \
|
||||
--n_samples 1 \
|
||||
--n_iter 1 \
|
||||
--seed 42
|
||||
|
||||
# Step 2: reference image, the original output by CompVis/stable-diffusion
|
||||
IMAGE1=".dev_scripts/images/v1_4_astronaut_rides_horse_plms_step50_seed42.png"
|
||||
# new output: the .png in SAMPLES_DIR whose name sorts last
|
||||
IMAGE2=`ls -A ${SAMPLES_DIR}/*.png | sort | tail -n 1`
|
||||
|
||||
echo ""
|
||||
echo "comparing the following two images"
|
||||
echo "IMAGE1: ${IMAGE1}"
|
||||
echo "IMAGE2: ${IMAGE2}"
|
||||
python .dev_scripts/diff_images.py ${IMAGE1} ${IMAGE2}
|
||||
4
.gitattributes
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
# Auto normalizes line endings on commit so devs don't need to change local settings.
|
||||
# Only affects text files and ignores other file types.
|
||||
# For more info see: https://www.aleksandrhovhannisyan.com/blog/crlf-vs-lf-normalizing-line-endings-in-git/
|
||||
* text=auto
|
||||
36
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
---
|
||||
name: Bug report
|
||||
about: Create a report to help us improve
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Describe your environment**
|
||||
- GPU: [cuda/amd/mps/cpu]
|
||||
- VRAM: [if known]
|
||||
- CPU arch: [x86/arm]
|
||||
- OS: [Linux/Windows/macOS]
|
||||
- Python: [Anaconda/miniconda/miniforge/pyenv/other (explain)]
|
||||
- Branch: [if `git status` says anything other than "On branch main" paste it here]
|
||||
- Commit: [run `git show` and paste the line that starts with "Merge" here]
|
||||
|
||||
**Describe the bug**
|
||||
A clear and concise description of what the bug is.
|
||||
|
||||
**To Reproduce**
|
||||
Steps to reproduce the behavior:
|
||||
1. Go to '...'
|
||||
2. Click on '....'
|
||||
3. Scroll down to '....'
|
||||
4. See error
|
||||
|
||||
**Expected behavior**
|
||||
A clear and concise description of what you expected to happen.
|
||||
|
||||
**Screenshots**
|
||||
If applicable, add screenshots to help explain your problem.
|
||||
|
||||
**Additional context**
|
||||
Add any other context about the problem here.
|
||||
20
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
---
|
||||
name: Feature request
|
||||
about: Suggest an idea for this project
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Is your feature request related to a problem? Please describe.**
|
||||
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
|
||||
|
||||
**Describe the solution you'd like**
|
||||
A clear and concise description of what you want to happen.
|
||||
|
||||
**Describe alternatives you've considered**
|
||||
A clear and concise description of any alternative solutions or features you've considered.
|
||||
|
||||
**Additional context**
|
||||
Add any other context or screenshots about the feature request here.
|
||||
185
.gitignore
vendored
Normal file
@@ -0,0 +1,185 @@
|
||||
# ignore default image save location and model symbolic link
|
||||
outputs/
|
||||
models/ldm/stable-diffusion-v1/model.ckpt
|
||||
|
||||
# ignore a directory which serves as a place for initial images
|
||||
inputs/
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# emacs autosave and recovery files
|
||||
*~
|
||||
.#*
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
pip-wheel-metadata/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
.python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
src
|
||||
**/__pycache__/
|
||||
outputs
|
||||
|
||||
# Logs and associated folders
|
||||
# created from generated embeddings.
|
||||
logs
|
||||
testtube
|
||||
checkpoints
|
||||
# If it's a Mac
|
||||
.DS_Store
|
||||
0
.gitmodules
vendored
Normal file
137
CHANGELOG.md
Normal file
@@ -0,0 +1,137 @@
|
||||
# **Changelog**
|
||||
|
||||
## v1.13 (in process)
|
||||
|
||||
- Supports a Google Colab notebook for a standalone server running on Google hardware [Arturo Mendivil](https://github.com/artmen1516)
|
||||
- WebUI supports GFPGAN/ESRGAN facial reconstruction and upscaling [Kevin Gibbons](https://github.com/bakkot)
|
||||
- WebUI supports incremental display of in-progress images during generation [Kevin Gibbons](https://github.com/bakkot)
|
||||
- Output directory can be specified on the dream> command line.
|
||||
- Fixed the grid displaying duplicated images when there were not enough images to fill the final row [Muhammad Usama](https://github.com/SMUsamaShah)
|
||||
- Can specify --grid on dream.py command line as the default.
|
||||
- Miscellaneous internal bug and stability fixes.
|
||||
|
||||
---
|
||||
|
||||
## v1.12 (28 August 2022)
|
||||
|
||||
- Improved file handling, including ability to read prompts from standard input.
|
||||
(kudos to [Yunsaki](https://github.com/yunsaki))
|
||||
- The web server is now integrated with the dream.py script. Invoke by adding --web to
|
||||
the dream.py command arguments.
|
||||
- Face restoration and upscaling via GFPGAN and Real-ESRGAN are now automatically
|
||||
enabled if the GFPGAN directory is located as a sibling to Stable Diffusion.
|
||||
VRAM requirements are modestly reduced. Thanks to both [Blessedcoolant](https://github.com/blessedcoolant) and
|
||||
[Oceanswave](https://github.com/oceanswave) for their work on this.
|
||||
- You can now swap samplers on the dream> command line. [Blessedcoolant](https://github.com/blessedcoolant)
|
||||
|
||||
---
|
||||
|
||||
## v1.11 (26 August 2022)
|
||||
|
||||
- NEW FEATURE: Support upscaling and face enhancement using the GFPGAN module. (kudos to [Oceanswave](https://github.com/Oceanswave))
|
||||
- You now can specify a seed of -1 to use the previous image's seed, -2 to use the seed for the image generated before that, etc.
|
||||
Seed memory only extends back to the previous command, but will work on all images generated with the -n# switch.
|
||||
- Variant generation support temporarily disabled pending more general solution.
|
||||
- Created a feature branch named **yunsaki-morphing-dream** which adds experimental support for
|
||||
iteratively modifying the prompt and its parameters. Please see [Pull Request #86](https://github.com/lstein/stable-diffusion/pull/86)
|
||||
for a synopsis of how this works. Note that when this feature is eventually added to the main branch, it may be modified
|
||||
significantly.
|
||||
|
||||
---
|
||||
|
||||
## v1.10 (25 August 2022)
|
||||
|
||||
- A barebones but fully functional interactive web server for online generation of txt2img and img2img.
|
||||
|
||||
---
|
||||
|
||||
## v1.09 (24 August 2022)
|
||||
|
||||
- A new -v option allows you to generate multiple variants of an initial image
|
||||
in img2img mode. (kudos to [Oceanswave](https://github.com/Oceanswave). [
|
||||
See this discussion in the PR for examples and details on use](https://github.com/lstein/stable-diffusion/pull/71#issuecomment-1226700810))
|
||||
- Added ability to personalize text to image generation (kudos to [Oceanswave](https://github.com/Oceanswave) and [nicolai256](https://github.com/nicolai256))
|
||||
- Enabled all of the samplers from k_diffusion
|
||||
|
||||
---
|
||||
|
||||
## v1.08 (24 August 2022)
|
||||
|
||||
- Escape single quotes on the dream> command before trying to parse. This avoids
|
||||
parse errors.
|
||||
- Removed instruction to get Python3.8 as first step in Windows install.
|
||||
Anaconda3 does it for you.
|
||||
- Added bounds checks for numeric arguments that could cause crashes.
|
||||
- Cleaned up the copyright and license agreement files.
|
||||
|
||||
---
|
||||
|
||||
## v1.07 (23 August 2022)
|
||||
|
||||
- Image filenames will now never fill gaps in the sequence, but will be assigned the
|
||||
next higher name in the chosen directory. This ensures that the alphabetic and chronological
|
||||
sort orders are the same.
|
||||
|
||||
---
|
||||
|
||||
## v1.06 (23 August 2022)
|
||||
|
||||
- Added weighted prompt support contributed by [xraxra](https://github.com/xraxra)
|
||||
- Example of using weighted prompts to tweak a demonic figure contributed by [bmaltais](https://github.com/bmaltais)
|
||||
|
||||
---
|
||||
|
||||
## v1.05 (22 August 2022 - after the drop)
|
||||
|
||||
- Filenames now use the following formats:
|
||||
000010.95183149.png -- Two files produced by the same command (e.g. -n2),
|
||||
000010.26742632.png -- distinguished by a different seed.
|
||||
|
||||
000011.455191342.01.png -- Two files produced by the same command using
|
||||
000011.455191342.02.png -- a batch size>1 (e.g. -b2). They have the same seed.
|
||||
|
||||
000011.4160627868.grid#1-4.png -- a grid of four images (-g); the whole grid can
|
||||
be regenerated with the indicated key
|
||||
|
||||
- It should no longer be possible for one image to overwrite another
|
||||
- You can use the "cd" and "pwd" commands at the dream> prompt to set and retrieve
|
||||
the path of the output directory.
|
||||
|
||||
---
|
||||
|
||||
## v1.04 (22 August 2022 - after the drop)
|
||||
|
||||
- Updated README to reflect installation of the released weights.
|
||||
- Suppressed very noisy and inconsequential warning when loading the frozen CLIP
|
||||
tokenizer.
|
||||
|
||||
---
|
||||
|
||||
## v1.03 (22 August 2022)
|
||||
|
||||
- The original txt2img and img2img scripts from the CompVis repository have been moved into
|
||||
a subfolder named "orig_scripts", to reduce confusion.
|
||||
|
||||
---
|
||||
|
||||
## v1.02 (21 August 2022)
|
||||
|
||||
- A copy of the prompt and all of its switches and options is now stored in the corresponding
|
||||
image in a tEXt metadata field named "Dream". You can read the prompt using scripts/images2prompt.py,
|
||||
or an image editor that allows you to explore the full metadata.
|
||||
**Please run "conda env update -f environment.yaml" to load the k_lms dependencies!!**
|
||||
|
||||
---
|
||||
|
||||
## v1.01 (21 August 2022)
|
||||
|
||||
- added k_lms sampling.
|
||||
**Please run "conda env update -f environment.yaml" to load the k_lms dependencies!!**
|
||||
- use half precision arithmetic by default, resulting in faster execution and lower memory requirements
|
||||
Pass argument --full_precision to dream.py to get slower but more accurate image generation
|
||||
|
||||
---
|
||||
|
||||
## Links
|
||||
|
||||
- **[Read Me](readme.md)**
|
||||
30
LICENSE
@@ -1,9 +1,27 @@
|
||||
All rights reserved by the authors.
|
||||
You must not distribute the weights provided to you directly or indirectly without explicit consent of the authors.
|
||||
You must not distribute harmful, offensive, dehumanizing content or otherwise harmful representations of people or their environments, cultures, religions, etc. produced with the model weights
|
||||
or other generated content described in the "Misuse and Malicious Use" section in the model card.
|
||||
The model weights are provided for research purposes only.
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
|
||||
|
||||
This software is derived from a fork of the source code available from
|
||||
https://github.com/pesser/stable-diffusion and
|
||||
https://github.com/CompVis/stable-diffusion. They carry the following
|
||||
copyrights:
|
||||
|
||||
Copyright (c) 2022 Machine Vision and Learning Group, LMU Munich
|
||||
Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors
|
||||
|
||||
Please see individual source code files for copyright and authorship
|
||||
attributions.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
@@ -11,4 +29,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
SOFTWARE.
|
||||
|
||||
294
LICENSE-ModelWeights.txt
Normal file
@@ -0,0 +1,294 @@
|
||||
Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors
|
||||
|
||||
CreativeML Open RAIL-M
|
||||
dated August 22, 2022
|
||||
|
||||
Section I: PREAMBLE
|
||||
|
||||
Multimodal generative models are being widely adopted and used, and
|
||||
have the potential to transform the way artists, among other
|
||||
individuals, conceive and benefit from AI or ML technologies as a tool
|
||||
for content creation.
|
||||
|
||||
Notwithstanding the current and potential benefits that these
|
||||
artifacts can bring to society at large, there are also concerns about
|
||||
potential misuses of them, either due to their technical limitations
|
||||
or ethical considerations.
|
||||
|
||||
In short, this license strives for both the open and responsible
|
||||
downstream use of the accompanying model. When it comes to the open
|
||||
character, we took inspiration from open source permissive licenses
|
||||
regarding the grant of IP rights. Referring to the downstream
|
||||
responsible use, we added use-based restrictions not permitting the
|
||||
use of the Model in very specific scenarios, in order for the licensor
|
||||
to be able to enforce the license in case potential misuses of the
|
||||
Model may occur. At the same time, we strive to promote open and
|
||||
responsible research on generative models for art and content
|
||||
generation.
|
||||
|
||||
Even though downstream derivative versions of the model could be
|
||||
released under different licensing terms, the latter will always have
|
||||
to include - at minimum - the same use-based restrictions as the ones
|
||||
in the original license (this license). We believe in the intersection
|
||||
between open and responsible AI development; thus, this License aims
|
||||
to strike a balance between both in order to enable responsible
|
||||
open-science in the field of AI.
|
||||
|
||||
This License governs the use of the model (and its derivatives) and is
|
||||
informed by the model card associated with the model.
|
||||
|
||||
NOW THEREFORE, You and Licensor agree as follows:
|
||||
|
||||
1. Definitions
|
||||
|
||||
- "License" means the terms and conditions for use, reproduction, and
|
||||
Distribution as defined in this document.
|
||||
|
||||
- "Data" means a collection of information and/or content extracted
|
||||
from the dataset used with the Model, including to train, pretrain,
|
||||
or otherwise evaluate the Model. The Data is not licensed under this
|
||||
License.
|
||||
|
||||
- "Output" means the results of operating a Model as embodied in
|
||||
informational content resulting therefrom.
|
||||
|
||||
- "Model" means any accompanying machine-learning based assemblies
|
||||
(including checkpoints), consisting of learnt weights, parameters
|
||||
(including optimizer states), corresponding to the model
|
||||
architecture as embodied in the Complementary Material, that have
|
||||
been trained or tuned, in whole or in part on the Data, using the
|
||||
Complementary Material.
|
||||
|
||||
- "Derivatives of the Model" means all modifications to the Model,
|
||||
works based on the Model, or any other model which is created or
|
||||
initialized by transfer of patterns of the weights, parameters,
|
||||
activations or output of the Model, to the other model, in order to
|
||||
cause the other model to perform similarly to the Model, including -
|
||||
but not limited to - distillation methods entailing the use of
|
||||
intermediate data representations or methods based on the generation
|
||||
of synthetic data by the Model for training the other model.
|
||||
|
||||
- "Complementary Material" means the accompanying source code and
|
||||
scripts used to define, run, load, benchmark or evaluate the Model,
|
||||
and used to prepare data for training or evaluation, if any. This
|
||||
includes any accompanying documentation, tutorials, examples, etc,
|
||||
if any.
|
||||
|
||||
- "Distribution" means any transmission, reproduction, publication or
|
||||
other sharing of the Model or Derivatives of the Model to a third
|
||||
party, including providing the Model as a hosted service made
|
||||
available by electronic or other remote means - e.g. API-based or
|
||||
web access.
|
||||
|
||||
- "Licensor" means the copyright owner or entity authorized by the
|
||||
copyright owner that is granting the License, including the persons
|
||||
or entities that may have rights in the Model and/or distributing
|
||||
the Model.
|
||||
|
||||
- "You" (or "Your") means an individual or Legal Entity exercising
|
||||
permissions granted by this License and/or making use of the Model
|
||||
for whichever purpose and in any field of use, including usage of
|
||||
the Model in an end-use application - e.g. chatbot, translator,
|
||||
image generator.
|
||||
|
||||
- "Third Parties" means individuals or legal entities that are not
|
||||
under common control with Licensor or You.
|
||||
|
||||
- "Contribution" means any work of authorship, including the original
|
||||
version of the Model and any modifications or additions to that
|
||||
Model or Derivatives of the Model thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Model by the copyright
|
||||
owner or by an individual or Legal Entity authorized to submit on
|
||||
behalf of the copyright owner. For the purposes of this definition,
|
||||
"submitted" means any form of electronic, verbal, or written
|
||||
communication sent to the Licensor or its representatives, including
|
||||
but not limited to communication on electronic mailing lists, source
|
||||
code control systems, and issue tracking systems that are managed
|
||||
by, or on behalf of, the Licensor for the purpose of discussing and
|
||||
improving the Model, but excluding communication that is
|
||||
conspicuously marked or otherwise designated in writing by the
|
||||
copyright owner as "Not a Contribution."
|
||||
|
||||
- "Contributor" means Licensor and any individual or Legal Entity on
|
||||
behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Model.
|
||||
|
||||
Section II: INTELLECTUAL PROPERTY RIGHTS
|
||||
|
||||
Both copyright and patent grants apply to the Model, Derivatives of
|
||||
the Model and Complementary Material. The Model and Derivatives of the
|
||||
Model are subject to additional terms as described in Section III.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare, publicly display, publicly
|
||||
perform, sublicense, and distribute the Complementary Material, the
|
||||
Model, and Derivatives of the Model.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License and where and as applicable, each Contributor hereby
|
||||
grants to You a perpetual, worldwide, non-exclusive, no-charge,
|
||||
royalty-free, irrevocable (except as stated in this paragraph) patent
|
||||
license to make, have made, use, offer to sell, sell, import, and
|
||||
otherwise transfer the Model and the Complementary Material, where
|
||||
such license applies only to those patent claims licensable by such
|
||||
Contributor that are necessarily infringed by their Contribution(s)
|
||||
alone or by combination of their Contribution(s) with the Model to
|
||||
which such Contribution(s) was submitted. If You institute patent
|
||||
litigation against any entity (including a cross-claim or counterclaim
|
||||
in a lawsuit) alleging that the Model and/or Complementary Material or
|
||||
a Contribution incorporated within the Model and/or Complementary
|
||||
Material constitutes direct or contributory patent infringement, then
|
||||
any patent licenses granted to You under this License for the Model
|
||||
and/or Work shall terminate as of the date such litigation is asserted
|
||||
or filed.
|
||||
|
||||
Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
|
||||
|
||||
4. Distribution and Redistribution. You may host for Third Party
|
||||
remote access purposes (e.g. software-as-a-service), reproduce and
|
||||
distribute copies of the Model or Derivatives of the Model thereof in
|
||||
any medium, with or without modifications, provided that You meet the
|
||||
following conditions: Use-based restrictions as referenced in
|
||||
paragraph 5 MUST be included as an enforceable provision by You in any
|
||||
type of legal agreement (e.g. a license) governing the use and/or
|
||||
distribution of the Model or Derivatives of the Model, and You shall
|
||||
give notice to subsequent users You Distribute to, that the Model or
|
||||
Derivatives of the Model are subject to paragraph 5. This provision
|
||||
does not apply to the use of Complementary Material. You must give
|
||||
any Third Party recipients of the Model or Derivatives of the Model a
|
||||
copy of this License; You must cause any modified files to carry
|
||||
prominent notices stating that You changed the files; You must retain
|
||||
all copyright, patent, trademark, and attribution notices excluding
|
||||
those notices that do not pertain to any part of the Model,
|
||||
Derivatives of the Model. You may add Your own copyright statement to
|
||||
Your modifications and may provide additional or different license
|
||||
terms and conditions - respecting paragraph 4.a. - for use,
|
||||
reproduction, or Distribution of Your modifications, or for any such
|
||||
Derivatives of the Model as a whole, provided Your use, reproduction,
|
||||
and Distribution of the Model otherwise complies with the conditions
|
||||
stated in this License.
|
||||
|
||||
5. Use-based restrictions. The restrictions set forth in Attachment A
|
||||
are considered Use-based restrictions. Therefore You cannot use the
|
||||
Model and the Derivatives of the Model for the specified restricted
|
||||
uses. You may use the Model subject to this License, including only
|
||||
for lawful purposes and in accordance with the License. Use may
|
||||
include creating any content with, finetuning, updating, running,
|
||||
training, evaluating and/or reparametrizing the Model. You shall
|
||||
require all of Your users who use the Model or a Derivative of the
|
||||
Model to comply with the terms of this paragraph (paragraph 5).
|
||||
|
||||
6. The Output You Generate. Except as set forth herein, Licensor
|
||||
claims no rights in the Output You generate using the Model. You are
|
||||
accountable for the Output you generate and its subsequent uses. No
|
||||
use of the output can contravene any provision as stated in the
|
||||
License.
|
||||
|
||||
Section IV: OTHER PROVISIONS
|
||||
|
||||
7. Updates and Runtime Restrictions. To the maximum extent permitted
|
||||
by law, Licensor reserves the right to restrict (remotely or
|
||||
otherwise) usage of the Model in violation of this License, update the
|
||||
Model through electronic means, or modify the Output of the Model
|
||||
based on updates. You shall undertake reasonable efforts to use the
|
||||
latest version of the Model.
|
||||
|
||||
8. Trademarks and related. Nothing in this License permits You to make
|
||||
use of Licensors’ trademarks, trade names, logos or to otherwise
|
||||
suggest endorsement or misrepresent the relationship between the
|
||||
parties; and any rights not expressly granted herein are reserved by
|
||||
the Licensors.
|
||||
|
||||
9. Disclaimer of Warranty. Unless required by applicable law or agreed
|
||||
to in writing, Licensor provides the Model and the Complementary
|
||||
Material (and each Contributor provides its Contributions) on an "AS
|
||||
IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
|
||||
express or implied, including, without limitation, any warranties or
|
||||
conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR
|
||||
A PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Model, Derivatives of
|
||||
the Model, and the Complementary Material and assume any risks
|
||||
associated with Your exercise of permissions under this License.
|
||||
|
||||
10. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise, unless
|
||||
required by applicable law (such as deliberate and grossly negligent
|
||||
acts) or agreed to in writing, shall any Contributor be liable to You
|
||||
for damages, including any direct, indirect, special, incidental, or
|
||||
consequential damages of any character arising as a result of this
|
||||
License or out of the use or inability to use the Model and the
|
||||
Complementary Material (including but not limited to damages for loss
|
||||
of goodwill, work stoppage, computer failure or malfunction, or any
|
||||
and all other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
11. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Model, Derivatives of the Model and the Complementary Material
|
||||
thereof, You may choose to offer, and charge a fee for, acceptance of
|
||||
support, warranty, indemnity, or other liability obligations and/or
|
||||
rights consistent with this License. However, in accepting such
|
||||
obligations, You may act only on Your own behalf and on Your sole
|
||||
responsibility, not on behalf of any other Contributor, and only if
|
||||
You agree to indemnify, defend, and hold each Contributor harmless for
|
||||
any liability incurred by, or claims asserted against, such
|
||||
Contributor by reason of your accepting any such warranty or
|
||||
additional liability.
|
||||
|
||||
12. If any provision of this License is held to be invalid, illegal or
|
||||
unenforceable, the remaining provisions shall be unaffected thereby
|
||||
and remain valid as if such provision had not been set forth herein.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
|
||||
|
||||
|
||||
Attachment A
|
||||
|
||||
Use Restrictions
|
||||
|
||||
You agree not to use the Model or Derivatives of the Model:
|
||||
|
||||
- In any way that violates any applicable national, federal, state,
|
||||
local or international law or regulation;
|
||||
|
||||
- For the purpose of exploiting, harming or attempting to exploit or
|
||||
harm minors in any way;
|
||||
|
||||
- To generate or disseminate verifiably false information and/or
|
||||
content with the purpose of harming others;
|
||||
|
||||
- To generate or disseminate personal identifiable information that
|
||||
can be used to harm an individual;
|
||||
|
||||
- To defame, disparage or otherwise harass others;
|
||||
|
||||
- For fully automated decision making that adversely impacts an
|
||||
individual’s legal rights or otherwise creates or modifies a
|
||||
binding, enforceable obligation;
|
||||
|
||||
- For any use intended to or which has the effect of discriminating
|
||||
against or harming individuals or groups based on online or offline
|
||||
social behavior or known or predicted personal or personality
|
||||
characteristics;
|
||||
|
||||
- To exploit any of the vulnerabilities of a specific group of persons
|
||||
based on their age, social, physical or mental characteristics, in
|
||||
order to materially distort the behavior of a person pertaining to
|
||||
that group in a manner that causes or is likely to cause that person
|
||||
or another person physical or psychological harm;
|
||||
|
||||
- For any use intended to or which has the effect of discriminating
|
||||
against individuals or groups based on legally protected
|
||||
characteristics or categories;
|
||||
|
||||
- To provide medical advice and medical results interpretation;
|
||||
|
||||
- To generate or disseminate information for the purpose to be used
|
||||
for administration of justice, law enforcement, immigration or
|
||||
asylum processes, such as predicting an individual will commit
|
||||
fraud/crime commitment (e.g. by text profiling, drawing causal
|
||||
relationships between assertions made in documents, indiscriminate
|
||||
and arbitrarily-targeted use).
|
||||
210
README-CompViz.md
Normal file
@@ -0,0 +1,210 @@
|
||||
# Original README from CompVis/stable-diffusion
|
||||
*Stable Diffusion was made possible thanks to a collaboration with [Stability AI](https://stability.ai/) and [Runway](https://runwayml.com/) and builds upon our previous work:*
|
||||
|
||||
[**High-Resolution Image Synthesis with Latent Diffusion Models**](https://ommer-lab.com/research/latent-diffusion-models/)<br/>
|
||||
[Robin Rombach](https://github.com/rromb)\*,
|
||||
[Andreas Blattmann](https://github.com/ablattmann)\*,
|
||||
[Dominik Lorenz](https://github.com/qp-qp)\,
|
||||
[Patrick Esser](https://github.com/pesser),
|
||||
[Björn Ommer](https://hci.iwr.uni-heidelberg.de/Staff/bommer)<br/>
|
||||
|
||||
**CVPR '22 Oral**
|
||||
|
||||
which is available on [GitHub](https://github.com/CompVis/latent-diffusion). PDF at [arXiv](https://arxiv.org/abs/2112.10752). Please also visit our [Project page](https://ommer-lab.com/research/latent-diffusion-models/).
|
||||
|
||||

|
||||
[Stable Diffusion](#stable-diffusion-v1) is a latent text-to-image diffusion
|
||||
model.
|
||||
Thanks to a generous compute donation from [Stability AI](https://stability.ai/) and support from [LAION](https://laion.ai/), we were able to train a Latent Diffusion Model on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database.
|
||||
Similar to Google's [Imagen](https://arxiv.org/abs/2205.11487),
|
||||
this model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts.
|
||||
With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 10GB VRAM.
|
||||
See [this section](#stable-diffusion-v1) below and the [model card](https://huggingface.co/CompVis/stable-diffusion).
|
||||
|
||||
|
||||
## Requirements
|
||||
|
||||
A suitable [conda](https://conda.io/) environment named `ldm` can be created
|
||||
and activated with:
|
||||
|
||||
```
|
||||
conda env create -f environment.yaml
|
||||
conda activate ldm
|
||||
```
|
||||
|
||||
You can also update an existing [latent diffusion](https://github.com/CompVis/latent-diffusion) environment by running
|
||||
|
||||
```
|
||||
conda install pytorch torchvision -c pytorch
|
||||
pip install transformers==4.19.2
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
## Stable Diffusion v1
|
||||
|
||||
Stable Diffusion v1 refers to a specific configuration of the model
|
||||
architecture that uses a downsampling-factor 8 autoencoder with an 860M UNet
|
||||
and CLIP ViT-L/14 text encoder for the diffusion model. The model was pretrained on 256x256 images and
|
||||
then finetuned on 512x512 images.
|
||||
|
||||
*Note: Stable Diffusion v1 is a general text-to-image diffusion model and therefore mirrors biases and (mis-)conceptions that are present
|
||||
in its training data.
|
||||
Details on the training procedure and data, as well as the intended use of the model can be found in the corresponding [model card](https://huggingface.co/CompVis/stable-diffusion).
|
||||
Research into the safe deployment of general text-to-image models is an ongoing effort. To prevent misuse and harm, we currently provide access to the checkpoints only for [academic research purposes upon request](https://stability.ai/academia-access-form).
|
||||
**This is an experiment in safe and community-driven publication of a capable and general text-to-image model. We are working on a public release with a more permissive license that also incorporates ethical considerations.***
|
||||
|
||||
[Request access to Stable Diffusion v1 checkpoints for academic research](https://stability.ai/academia-access-form)
|
||||
|
||||
### Weights
|
||||
|
||||
We currently provide three checkpoints, `sd-v1-1.ckpt`, `sd-v1-2.ckpt` and `sd-v1-3.ckpt`,
|
||||
which were trained as follows,
|
||||
|
||||
- `sd-v1-1.ckpt`: 237k steps at resolution `256x256` on [laion2B-en](https://huggingface.co/datasets/laion/laion2B-en).
|
||||
194k steps at resolution `512x512` on [laion-high-resolution](https://huggingface.co/datasets/laion/laion-high-resolution) (170M examples from LAION-5B with resolution `>= 1024x1024`).
|
||||
- `sd-v1-2.ckpt`: Resumed from `sd-v1-1.ckpt`.
|
||||
515k steps at resolution `512x512` on "laion-improved-aesthetics" (a subset of laion2B-en,
|
||||
filtered to images with an original size `>= 512x512`, estimated aesthetics score `> 5.0`, and an estimated watermark probability `< 0.5`. The watermark estimate is from the LAION-5B metadata, the aesthetics score is estimated using an [improved aesthetics estimator](https://github.com/christophschuhmann/improved-aesthetic-predictor)).
|
||||
- `sd-v1-3.ckpt`: Resumed from `sd-v1-2.ckpt`. 195k steps at resolution `512x512` on "laion-improved-aesthetics" and 10\% dropping of the text-conditioning to improve [classifier-free guidance sampling](https://arxiv.org/abs/2207.12598).
|
||||
|
||||
Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
|
||||
5.0, 6.0, 7.0, 8.0) and 50 PLMS sampling
|
||||
steps show the relative improvements of the checkpoints:
|
||||

|
||||
|
||||
|
||||
|
||||
### Text-to-Image with Stable Diffusion
|
||||

|
||||

|
||||
|
||||
Stable Diffusion is a latent diffusion model conditioned on the (non-pooled) text embeddings of a CLIP ViT-L/14 text encoder.
|
||||
|
||||
|
||||
#### Sampling Script
|
||||
|
||||
After [obtaining the weights](#weights), link them
|
||||
```
|
||||
mkdir -p models/ldm/stable-diffusion-v1/
|
||||
ln -s <path/to/model.ckpt> models/ldm/stable-diffusion-v1/model.ckpt
|
||||
```
|
||||
and sample with
|
||||
```
|
||||
python scripts/txt2img.py --prompt "a photograph of an astronaut riding a horse" --plms
|
||||
```
|
||||
|
||||
By default, this uses a guidance scale of `--scale 7.5`, [Katherine Crowson's implementation](https://github.com/CompVis/latent-diffusion/pull/51) of the [PLMS](https://arxiv.org/abs/2202.09778) sampler,
|
||||
and renders images of size 512x512 (which it was trained on) in 50 steps. All supported arguments are listed below (type `python scripts/txt2img.py --help`).
|
||||
|
||||
```commandline
|
||||
usage: txt2img.py [-h] [--prompt [PROMPT]] [--outdir [OUTDIR]] [--skip_grid] [--skip_save] [--ddim_steps DDIM_STEPS] [--plms] [--laion400m] [--fixed_code] [--ddim_eta DDIM_ETA] [--n_iter N_ITER] [--H H] [--W W] [--C C] [--f F] [--n_samples N_SAMPLES] [--n_rows N_ROWS]
|
||||
[--scale SCALE] [--from-file FROM_FILE] [--config CONFIG] [--ckpt CKPT] [--seed SEED] [--precision {full,autocast}]
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--prompt [PROMPT] the prompt to render
|
||||
--outdir [OUTDIR] dir to write results to
|
||||
--skip_grid do not save a grid, only individual samples. Helpful when evaluating lots of samples
|
||||
--skip_save do not save individual samples. For speed measurements.
|
||||
--ddim_steps DDIM_STEPS
|
||||
number of ddim sampling steps
|
||||
--plms use plms sampling
|
||||
--laion400m uses the LAION400M model
|
||||
--fixed_code if enabled, uses the same starting code across samples
|
||||
--ddim_eta DDIM_ETA ddim eta (eta=0.0 corresponds to deterministic sampling
|
||||
--n_iter N_ITER sample this often
|
||||
--H H image height, in pixel space
|
||||
--W W image width, in pixel space
|
||||
--C C latent channels
|
||||
--f F downsampling factor
|
||||
--n_samples N_SAMPLES
|
||||
how many samples to produce for each given prompt. A.k.a. batch size
|
||||
(note that the seeds for each image in the batch will be unavailable)
|
||||
--n_rows N_ROWS rows in the grid (default: n_samples)
|
||||
--scale SCALE unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))
|
||||
--from-file FROM_FILE
|
||||
if specified, load prompts from this file
|
||||
--config CONFIG path to config which constructs model
|
||||
--ckpt CKPT path to checkpoint of model
|
||||
--seed SEED the seed (for reproducible sampling)
|
||||
--precision {full,autocast}
|
||||
evaluate at this precision
|
||||
|
||||
```
|
||||
Note: The inference config for all v1 versions is designed to be used with EMA-only checkpoints.
|
||||
For this reason `use_ema=False` is set in the configuration, otherwise the code will try to switch from
|
||||
non-EMA to EMA weights. If you want to examine the effect of EMA vs no EMA, we provide "full" checkpoints
|
||||
which contain both types of weights. For these, `use_ema=False` will load and use the non-EMA weights.
|
||||
|
||||
|
||||
#### Diffusers Integration
|
||||
|
||||
Another way to download and sample Stable Diffusion is by using the [diffusers library](https://github.com/huggingface/diffusers/tree/main#new--stable-diffusion-is-now-fully-compatible-with-diffusers)
|
||||
```py
|
||||
# make sure you're logged in with `huggingface-cli login`
|
||||
from torch import autocast
|
||||
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-3-diffusers",
|
||||
use_auth_token=True
|
||||
)
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
with autocast("cuda"):
|
||||
image = pipe(prompt)["sample"][0]
|
||||
|
||||
image.save("astronaut_rides_horse.png")
|
||||
```
|
||||
|
||||
|
||||
|
||||
### Image Modification with Stable Diffusion
|
||||
|
||||
By using a diffusion-denoising mechanism as first proposed by [SDEdit](https://arxiv.org/abs/2108.01073), the model can be used for different
|
||||
tasks such as text-guided image-to-image translation and upscaling. Similar to the txt2img sampling script,
|
||||
we provide a script to perform image modification with Stable Diffusion.
|
||||
|
||||
The following describes an example where a rough sketch made in [Pinta](https://www.pinta-project.com/) is converted into a detailed artwork.
|
||||
```
|
||||
python scripts/img2img.py --prompt "A fantasy landscape, trending on artstation" --init-img <path-to-img.jpg> --strength 0.8
|
||||
```
|
||||
Here, strength is a value between 0.0 and 1.0, that controls the amount of noise that is added to the input image.
|
||||
Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input. See the following example.
|
||||
|
||||
**Input**
|
||||
|
||||

|
||||
|
||||
**Outputs**
|
||||
|
||||

|
||||

|
||||
|
||||
This procedure can, for example, also be used to upscale samples from the base model.
|
||||
|
||||
|
||||
## Comments
|
||||
|
||||
- Our codebase for the diffusion models builds heavily on [OpenAI's ADM codebase](https://github.com/openai/guided-diffusion)
|
||||
and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch).
|
||||
Thanks for open-sourcing!
|
||||
|
||||
- The implementation of the transformer encoder is from [x-transformers](https://github.com/lucidrains/x-transformers) by [lucidrains](https://github.com/lucidrains?tab=repositories).
|
||||
|
||||
|
||||
## BibTeX
|
||||
|
||||
```
|
||||
@misc{rombach2021highresolution,
|
||||
title={High-Resolution Image Synthesis with Latent Diffusion Models},
|
||||
author={Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer},
|
||||
year={2021},
|
||||
eprint={2112.10752},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.CV}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
|
||||
322
README-Mac-MPS.md
Normal file
@@ -0,0 +1,322 @@
|
||||
# macOS Instructions
|
||||
|
||||
Requirements
|
||||
|
||||
- macOS 12.3 Monterey or later
|
||||
- Python
|
||||
- Patience
|
||||
- Apple Silicon*
|
||||
|
||||
*I haven't tested any of this on Intel Macs but I have read that one person got
|
||||
it to work, so Apple Silicon might not be required.
|
||||
|
||||
Things have moved really fast and so these instructions change often and are
|
||||
often out-of-date. One of the problems is that there are so many different ways to
|
||||
run this.
|
||||
|
||||
We are trying to build a testing setup so that when we make changes it doesn't
|
||||
always break.
|
||||
|
||||
How to (this hasn't been 100% tested yet):
|
||||
|
||||
First get the weights checkpoint download started - it's big:
|
||||
|
||||
1. Sign up at https://huggingface.co
|
||||
2. Go to the [Stable Diffusion model page](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original)
|
||||
3. Accept the terms and click Access Repository:
|
||||
4. Download [sd-v1-4.ckpt (4.27 GB)](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/blob/main/sd-v1-4.ckpt) and note where you have saved it (probably the Downloads folder)
|
||||
|
||||
While that is downloading, open Terminal and run the following commands one at a time.
|
||||
|
||||
```
|
||||
# install brew (and Xcode command line tools):
|
||||
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
|
||||
|
||||
# install cmake, protobuf, and rust:
|
||||
brew install cmake protobuf rust
|
||||
|
||||
# install miniconda (M1 arm64 version):
|
||||
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o Miniconda3-latest-MacOSX-arm64.sh
|
||||
/bin/bash Miniconda3-latest-MacOSX-arm64.sh
|
||||
|
||||
# clone the repo
|
||||
git clone https://github.com/lstein/stable-diffusion.git
|
||||
cd stable-diffusion
|
||||
|
||||
#
|
||||
# wait until the checkpoint file has downloaded, then proceed
|
||||
#
|
||||
|
||||
# create symlink to checkpoint
|
||||
mkdir -p models/ldm/stable-diffusion-v1/
|
||||
|
||||
PATH_TO_CKPT="$HOME/Downloads" # or wherever you saved sd-v1-4.ckpt
|
||||
|
||||
ln -s "$PATH_TO_CKPT/sd-v1-4.ckpt" models/ldm/stable-diffusion-v1/model.ckpt
|
||||
|
||||
# install packages
|
||||
PIP_EXISTS_ACTION=w CONDA_SUBDIR=osx-arm64 conda env create -f environment-mac.yaml
|
||||
conda activate ldm
|
||||
|
||||
# only need to do this once
|
||||
python scripts/preload_models.py
|
||||
|
||||
# run SD!
|
||||
python scripts/dream.py --full_precision # half-precision requires autocast and won't work
|
||||
```
|
||||
|
||||
The original scripts should work as well.
|
||||
|
||||
```
|
||||
python scripts/orig_scripts/txt2img.py --prompt "a photograph of an astronaut riding a horse" --plms
|
||||
```
|
||||
|
||||
Note, `export PIP_EXISTS_ACTION=w` is a precaution to fix `conda env create -f environment-mac.yaml`
|
||||
never finishing in some situations. So it isn't required but won't hurt.
|
||||
|
||||
After you follow all the instructions and run dream.py you might get several
|
||||
errors. Here are the errors I've seen and found solutions for.
|
||||
|
||||
### Is it slow?
|
||||
|
||||
Be sure to specify 1 sample and 1 iteration.
|
||||
|
||||
python ./scripts/orig_scripts/txt2img.py --prompt "ocean" --ddim_steps 5 --n_samples 1 --n_iter 1
|
||||
|
||||
### Doesn't work anymore?
|
||||
|
||||
PyTorch nightly includes support for MPS. Because of this, this setup is
|
||||
inherently unstable. One morning I woke up and it no longer worked no matter
|
||||
what I did until I switched to miniforge. However, I have another Mac that works
|
||||
just fine with Anaconda. If you can't get it to work, please search a little
|
||||
first because many of the errors will get posted and solved. If you can't find
|
||||
a solution please [create an issue](https://github.com/lstein/stable-diffusion/issues).
|
||||
|
||||
One debugging step is to update to the latest version of PyTorch nightly.
|
||||
|
||||
conda install pytorch torchvision torchaudio -c pytorch-nightly
|
||||
|
||||
If `conda env create -f environment-mac.yaml` takes forever run this.
|
||||
|
||||
git clean -f
|
||||
|
||||
And run this.
|
||||
|
||||
conda clean --yes --all
|
||||
|
||||
Or you could reset Anaconda.
|
||||
|
||||
conda update --force-reinstall -y -n base -c defaults conda
|
||||
|
||||
### "No module named cv2", torch, 'ldm', 'transformers', 'taming', etc.
|
||||
|
||||
There are several causes of these errors.
|
||||
|
||||
First, did you remember to `conda activate ldm`? If your terminal prompt
|
||||
begins with "(ldm)" then you activated it. If it begins with "(base)"
|
||||
or something else you haven't.
|
||||
|
||||
Second, you might've run `./scripts/preload_models.py` or `./scripts/dream.py`
|
||||
instead of `python ./scripts/preload_models.py` or `python ./scripts/dream.py`.
|
||||
The cause of this error is long so it's below.
|
||||
|
||||
Third, if it says you're missing taming you need to rebuild your virtual
|
||||
environment.
|
||||
|
||||
conda env remove -n ldm
|
||||
conda env create -f environment-mac.yaml
|
||||
|
||||
Fourth, if you have activated the ldm virtual environment and tried rebuilding
|
||||
it, maybe the problem could be that I have something installed that
|
||||
you don't and you'll just need to manually install it. Make sure you
|
||||
activate the virtual environment so it installs there instead of
|
||||
globally.
|
||||
|
||||
conda activate ldm
|
||||
pip install *name*
|
||||
|
||||
You might also need to install Rust (I mention this again below).
|
||||
|
||||
### How many snakes are living in your computer?
|
||||
|
||||
Here's the reason why you have to specify which python to use.
|
||||
There are several versions of python on macOS and the computer is
|
||||
picking the wrong one. More specifically, preload_models.py and dream.py say to
|
||||
find the first `python3` in the path environment variable. You can see which one
|
||||
it is picking with `which python3`. These are the most likely paths you'll see.
|
||||
|
||||
% which python3
|
||||
/usr/bin/python3
|
||||
|
||||
The above path is part of the OS. However, that path is a stub that asks you if
|
||||
you want to install Xcode. If you have Xcode installed already,
|
||||
/usr/bin/python3 will execute /Library/Developer/CommandLineTools/usr/bin/python3 or
|
||||
/Applications/Xcode.app/Contents/Developer/usr/bin/python3 (depending on which
|
||||
Xcode you've selected with `xcode-select`).
|
||||
|
||||
% which python3
|
||||
/opt/homebrew/bin/python3
|
||||
|
||||
If you installed python3 with Homebrew and you've modified your path to search
|
||||
for Homebrew binaries before system ones, you'll see the above path.
|
||||
|
||||
% which python
|
||||
/opt/anaconda3/bin/python
|
||||
|
||||
If you drop the "3" you get an entirely different python. Note: starting in
|
||||
macOS 12.3, /usr/bin/python no longer exists (it was python 2 anyway).
|
||||
|
||||
If you have Anaconda installed, this is what you'll see. There is a
|
||||
/opt/anaconda3/bin/python3 also.
|
||||
|
||||
(ldm) % which python
|
||||
/Users/name/miniforge3/envs/ldm/bin/python
|
||||
|
||||
This is what you'll see if you have miniforge and you've correctly activated
|
||||
the ldm environment. This is the goal.
|
||||
|
||||
It's all a mess and you should know [how to modify the path environment variable](https://support.apple.com/guide/terminal/use-environment-variables-apd382cc5fa-4f58-4449-b20a-41c53c006f8f/mac)
|
||||
if you want to fix it. Here's a brief hint of all the ways you can modify it
|
||||
(don't really have the time to explain it all here).
|
||||
|
||||
- ~/.zshrc
|
||||
- ~/.bash_profile
|
||||
- ~/.bashrc
|
||||
- /etc/paths.d
|
||||
- /etc/paths
|
||||
|
||||
Which one you use will depend on what you have installed except putting a file
|
||||
in /etc/paths.d is what I prefer to do.
|
||||
|
||||
### Debugging?
|
||||
|
||||
Tired of waiting for your renders to finish before you can see if it
|
||||
works? Reduce the steps! The image quality will be horrible but at least you'll
|
||||
get quick feedback.
|
||||
|
||||
python ./scripts/orig_scripts/txt2img.py --prompt "ocean" --ddim_steps 5 --n_samples 1 --n_iter 1
|
||||
|
||||
### OSError: Can't load tokenizer for 'openai/clip-vit-large-patch14'...
|
||||
|
||||
python scripts/preload_models.py
|
||||
|
||||
### "The operator [name] is not current implemented for the MPS device." (sic)
|
||||
|
||||
Example error.
|
||||
|
||||
```
|
||||
...
|
||||
NotImplementedError: The operator 'aten::_index_put_impl_' is not current implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on [https://github.com/pytorch/pytorch/issues/77764](https://github.com/pytorch/pytorch/issues/77764). As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.
|
||||
```
|
||||
|
||||
The lstein branch includes this fix in [environment-mac.yaml](https://github.com/lstein/stable-diffusion/blob/main/environment-mac.yaml).
|
||||
|
||||
### "Could not build wheels for tokenizers"
|
||||
|
||||
I have not seen this error because I had Rust installed on my computer before I started playing with Stable Diffusion. The fix is to install Rust.
|
||||
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
|
||||
### How come `--seed` doesn't work?
|
||||
|
||||
First this:
|
||||
|
||||
> Completely reproducible results are not guaranteed across PyTorch
|
||||
releases, individual commits, or different platforms. Furthermore,
|
||||
results may not be reproducible between CPU and GPU executions, even
|
||||
when using identical seeds.
|
||||
|
||||
[PyTorch docs](https://pytorch.org/docs/stable/notes/randomness.html)
|
||||
|
||||
Second, we might have a fix that at least gets a consistent seed sort of. We're
|
||||
still working on it.
|
||||
|
||||
### libiomp5.dylib error?
|
||||
|
||||
OMP: Error #15: Initializing libiomp5.dylib, but found libomp.dylib already initialized.
|
||||
|
||||
You are likely using an Intel package by mistake. Be sure to run conda with
|
||||
the environment variable `CONDA_SUBDIR=osx-arm64`, like so:
|
||||
|
||||
`CONDA_SUBDIR=osx-arm64 conda install ...`
|
||||
|
||||
This error happens with Anaconda on Macs when the Intel-only `mkl` is pulled in by
|
||||
a dependency. [nomkl](https://stackoverflow.com/questions/66224879/what-is-the-nomkl-python-package-used-for)
|
||||
is a metapackage designed to prevent this, by making it impossible to install
|
||||
`mkl`, but if your environment is already broken it may not work.
|
||||
|
||||
Do *not* use `os.environ['KMP_DUPLICATE_LIB_OK']='True'` or equivalents as this
|
||||
masks the underlying issue of using Intel packages.
|
||||
|
||||
### Not enough memory.
|
||||
|
||||
This seems to be a common problem and is probably the underlying
|
||||
problem for a lot of symptoms (listed below). The fix is to lower your
|
||||
image size or to add `model.half()` right after the model is loaded. I
|
||||
should probably test it out. I've read that the reason this fixes
|
||||
problems is because it converts the model from 32-bit to 16-bit and
|
||||
that leaves more RAM for other things. I have no idea how that would
|
||||
affect the quality of the images though.
|
||||
|
||||
See [this issue](https://github.com/CompVis/stable-diffusion/issues/71).
|
||||
|
||||
### "Error: product of dimension sizes > 2**31'"
|
||||
|
||||
This error happens with img2img, which I haven't played with too much
|
||||
yet. But I know it's because your image is too big or the resolution
|
||||
isn't a multiple of 32x32. Because the stable-diffusion model was
|
||||
trained on images that were 512 x 512, it's always best to use that
|
||||
output size (which is the default). However, if you're using that size
|
||||
and you get the above error, try 256 x 256 or 512 x 256 or something
|
||||
as the source image.
|
||||
|
||||
BTW, 2**31-1 = [2,147,483,647](https://en.wikipedia.org/wiki/2,147,483,647#In_computing), which is also 32-bit signed [LONG_MAX](https://en.wikipedia.org/wiki/C_data_types) in C.
|
||||
|
||||
### I just got Rickrolled! Do I have a virus?
|
||||
|
||||
You don't have a virus. It's part of the project. Here's
|
||||
[Rick](https://github.com/lstein/stable-diffusion/blob/main/assets/rick.jpeg)
|
||||
and here's [the
|
||||
code](https://github.com/lstein/stable-diffusion/blob/69ae4b35e0a0f6ee1af8bb9a5d0016ccb27e36dc/scripts/txt2img.py#L79)
|
||||
that swaps him in. It's a NSFW filter, which IMO, doesn't work very
|
||||
well (and we call this "computer vision", sheesh).
|
||||
|
||||
Actually, this could be happening because there's not enough RAM. You could try the `model.half()` suggestion or specify smaller output images.
|
||||
|
||||
### My images come out black
|
||||
|
||||
We might have this fixed, we are still testing.
|
||||
|
||||
There's a [similar issue](https://github.com/CompVis/stable-diffusion/issues/69)
|
||||
on CUDA GPUs where the images come out green. Maybe it's the same issue?
|
||||
Someone in that issue says to use "--precision full", but this fork
|
||||
actually disables that flag. I don't know why, someone else provided
|
||||
that code and I don't know what it does. Maybe the `model.half()`
|
||||
suggestion above would fix this issue too. I should probably test it.
|
||||
|
||||
### "view size is not compatible with input tensor's size and stride"
|
||||
|
||||
```
|
||||
File "/opt/anaconda3/envs/ldm/lib/python3.10/site-packages/torch/nn/functional.py", line 2511, in layer_norm
|
||||
return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
|
||||
RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
|
||||
```
|
||||
|
||||
Update to the latest version of lstein/stable-diffusion. We were
|
||||
patching pytorch but we found a file in stable-diffusion that we could
|
||||
change instead. This is a 32-bit vs 16-bit problem.
|
||||
|
||||
### The processor must support the Intel bla bla bla
|
||||
|
||||
What? Intel? On an Apple Silicon?
|
||||
|
||||
Intel MKL FATAL ERROR: This system does not meet the minimum requirements for use of the Intel(R) Math Kernel Library.
|
||||
The processor must support the Intel(R) Supplemental Streaming SIMD Extensions 3 (Intel(R) SSSE3) instructions.
|
||||
The processor must support the Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) instructions.
|
||||
The processor must support the Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.
|
||||
|
||||
This is due to the Intel `mkl` package getting picked up when you try to install
|
||||
something that depends on it-- Rosetta can translate some Intel instructions but
|
||||
not the specialized ones here. To avoid this, make sure to use the environment
|
||||
variable `CONDA_SUBDIR=osx-arm64`, which restricts the Conda environment to only
|
||||
use ARM packages, and use `nomkl` as described above.
|
||||
259
Stable-Diffusion-local-Windows.ipynb
Normal file
@@ -0,0 +1,259 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Easy-peasy Windows install"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note that you will need NVIDIA drivers, Python 3.10, and Git installed\n",
|
||||
"beforehand - simplified\n",
|
||||
"[step-by-step instructions](https://github.com/lstein/stable-diffusion/wiki/Easy-peasy-Windows-install)\n",
|
||||
"are available in the wiki (you'll only need steps 1, 2, & 3)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Run each cell in turn. In VSCode, either hit SHIFT-ENTER, or click on the little ▶️ to the left of the cell. In Jupyter/JupyterLab, you **must** hit SHIFT-ENTER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install pew"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%cmd\n",
|
||||
"git clone https://github.com/lstein/stable-diffusion.git"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%cd stable-diffusion"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%writefile requirements.txt\n",
|
||||
"albumentations==0.4.3\n",
|
||||
"einops==0.3.0\n",
|
||||
"huggingface-hub==0.8.1\n",
|
||||
"imageio-ffmpeg==0.4.2\n",
|
||||
"imageio==2.9.0\n",
|
||||
"kornia==0.6.0\n",
|
||||
"omegaconf==2.1.1\n",
|
||||
"opencv-python==4.6.0.66\n",
|
||||
"pillow==9.2.0\n",
|
||||
"pudb==2019.2\n",
|
||||
"pytorch-lightning==1.4.2\n",
|
||||
"streamlit==1.12.0\n",
|
||||
"# Regular \"taming-transformers\" doesn't seem to work\n",
|
||||
"taming-transformers-rom1504==0.0.6\n",
|
||||
"test-tube>=0.7.5\n",
|
||||
"torch-fidelity==0.3.0\n",
|
||||
"torchmetrics==0.6.0\n",
|
||||
"torchvision==0.12.0\n",
|
||||
"transformers==4.19.2\n",
|
||||
"git+https://github.com/openai/CLIP.git@main#egg=clip\n",
|
||||
"git+https://github.com/lstein/k-diffusion.git@master#egg=k-diffusion\n",
|
||||
"# No CUDA in PyPi builds\n",
|
||||
"torch@https://download.pytorch.org/whl/cu113/torch-1.11.0%2Bcu113-cp310-cp310-win_amd64.whl\n",
|
||||
"# No MKL in PyPi builds (faster, more robust than OpenBLAS)\n",
|
||||
"numpy@https://download.lfd.uci.edu/pythonlibs/archived/numpy-1.22.4+mkl-cp310-cp310-win_amd64.whl\n",
|
||||
"-e .\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%cmd\n",
|
||||
"pew new --python 3.10 -r requirements.txt --dont-activate ldm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Switch the notebook kernel to the new 'ldm' environment!\n",
|
||||
"\n",
|
||||
"## VSCode: restart VSCode and come back to this cell\n",
|
||||
"\n",
|
||||
"1. Ctrl+Shift+P\n",
|
||||
"1. Type \"Select Interpreter\" and select \"Jupyter: Select Interpreter to Start Jupyter Server\"\n",
|
||||
"1. VSCode will say that it needs to install packages. Click the \"Install\" button.\n",
|
||||
"1. Once the install is finished, do 1 & 2 again\n",
|
||||
"1. Pick 'ldm'\n",
|
||||
"1. Run the following cell"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%cd stable-diffusion"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"## Jupyter/JupyterLab\n",
|
||||
"\n",
|
||||
"1. Run the cell below\n",
|
||||
"1. Click on the toolbar where it says \"(ipykernel)\" ↗️. You should get a pop-up asking you to \"Select Kernel\". Pick 'ldm' from the drop-down.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### DO NOT RUN THE FOLLOWING CELL IF YOU ARE USING VSCODE!!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# DO NOT RUN THIS CELL IF YOU ARE USING VSCODE!!\n",
|
||||
"%%cmd\n",
|
||||
"pew workon ldm\n",
|
||||
"pip3 install ipykernel\n",
|
||||
"python -m ipykernel install --name=ldm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### When running the next cell, Jupyter/JupyterLab users might get a warning saying \"IProgress not found\". This can be ignored."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%run \"scripts/preload_models.py\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%cmd\n",
|
||||
"mkdir \"models/ldm/stable-diffusion-v1\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Now copy the SD model you downloaded from Hugging Face into the above new directory, and (if necessary) rename it to 'model.ckpt'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Now go create some magic!\n",
|
||||
"\n",
|
||||
"VSCode\n",
|
||||
"\n",
|
||||
"- The actual input box for the 'dream' prompt will appear at the very top of the VSCode window. Type in your commands and hit 'ENTER'.\n",
|
||||
"- To quit, hit the 'Interrupt' button in the toolbar up there ⬆️ a couple of times, then hit ENTER (you'll probably see a terrifying traceback from Python - just ignore it).\n",
|
||||
"\n",
|
||||
"Jupyter/JupyterLab\n",
|
||||
"\n",
|
||||
"- The input box for the 'dream' prompt will appear below. Type in your commands and hit 'ENTER'.\n",
|
||||
"- To quit, hit the interrupt button (⏹️) in the toolbar up there ⬆️ a couple of times, then hit ENTER (you'll probably see a terrifying traceback from Python - just ignore it)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%run \"scripts/dream.py\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Once this seems to be working well, you can try opening a terminal\n",
|
||||
"\n",
|
||||
"- VSCode: type ('CTRL+`')\n",
|
||||
"- Jupyter/JupyterLab: File|New Terminal\n",
|
||||
"- Or jump out of the notebook entirely, and open Powershell/Command Prompt\n",
|
||||
"\n",
|
||||
"Now:\n",
|
||||
"\n",
|
||||
"1. `cd` to wherever the 'stable-diffusion' directory is\n",
|
||||
"1. Run `pew workon ldm`\n",
|
||||
"1. Run `winpty python scripts\\dream.py`"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.10.6 ('ldm')",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "a05e4574567b7bc2c98f7f9aa579f9ea5b8739b54844ab610ac85881c4be2659"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
256
Stable_Diffusion_AI_Notebook.ipynb
Normal file
@@ -0,0 +1,256 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"name": "Stable_Diffusion_AI_Notebook.ipynb",
|
||||
"provenance": [],
|
||||
"collapsed_sections": [],
|
||||
"private_outputs": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
},
|
||||
"accelerator": "GPU",
|
||||
"gpuClass": "standard"
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Stable Diffusion AI Notebook\n",
|
||||
"\n",
|
||||
"<img src=\"https://user-images.githubusercontent.com/60411196/186547976-d9de378a-9de8-4201-9c25-c057a9c59bad.jpeg\" alt=\"stable-diffusion-ai\" width=\"170px\"/> <br>\n",
|
||||
"#### Instructions:\n",
|
||||
"1. Execute each cell in order to mount a Dream bot and create images from text. <br>\n",
|
||||
"2. Once cells 1-8 have run correctly, you'll open a terminal in cell #9, where you'll need to enter the `pipenv run scripts/dream.py` command to run Dream bot.<br> \n",
|
||||
"3. After launching dream bot, you'll see: <br> `Dream > ` in terminal. <br> Insert a command, eg. `Dream > Astronaut floating in a distant galaxy`, or type `-h` for help.\n",
|
||||
"4. After completion you'll see your generated images in path `stable-diffusion/outputs/img-samples/`, you can also display images in cell #10.\n",
|
||||
"5. To quit Dream bot use `q` command. <br> \n",
|
||||
"---\n",
|
||||
"<font color=\"red\">Note:</font> It takes some time to load, but after installing all dependencies you can use the bot for as long as you want while the Colab instance is up. <br>\n",
|
||||
"<font color=\"red\">Requirements:</font> For this notebook to work you need to have [Stable-Diffusion-v-1-4](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original) stored in your Google Drive, it will be needed in cell #6\n",
|
||||
"##### For more details visit Github repository: [lstein/stable-diffusion](https://github.com/lstein/stable-diffusion)\n",
|
||||
"---\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ycYWcsEKc6w7"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"#@title 1. Check current GPU assigned\n",
|
||||
"!nvidia-smi -L\n",
|
||||
"!nvidia-smi"
|
||||
],
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "a2Z5Qu_o8VtQ"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "vbI9ZsQHzjqF"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#@title 2. Download stable-diffusion Repository\n",
|
||||
"from os.path import exists\n",
|
||||
"\n",
|
||||
"if exists(\"/content/stable-diffusion/\")==True:\n",
|
||||
" print(\"Already downloaded repo\")\n",
|
||||
"else:\n",
|
||||
" !git clone --quiet https://github.com/lstein/stable-diffusion.git # Original repo\n",
|
||||
" %cd stable-diffusion/\n",
|
||||
" !git checkout --quiet tags/release-1.09\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"#@title 3. Install Python 3.8 \n",
|
||||
"%%capture --no-stderr\n",
|
||||
"import gc\n",
|
||||
"!apt-get -qq install python3.8\n",
|
||||
"gc.collect()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "daHlozvwKesj",
|
||||
"cellView": "form"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"#@title 4. Install dependencies from file in a VirtualEnv\n",
|
||||
"#@markdown Be patient, it takes ~ 5 - 7min <br>\n",
|
||||
"%%capture --no-stderr\n",
|
||||
"#Virtual environment\n",
|
||||
"!pip install pipenv -q\n",
|
||||
"!pip install colab-xterm\n",
|
||||
"%load_ext colabxterm\n",
|
||||
"!pipenv --python 3.8\n",
|
||||
"!pipenv install -r requirements.txt --skip-lock\n",
|
||||
"gc.collect()\n"
|
||||
],
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "QbXcGXYEFSNB"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"#@title 5. Mount google Drive\n",
|
||||
"from google.colab import drive\n",
|
||||
"drive.mount('/content/drive')"
|
||||
],
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "YEWPV-sF1RDM"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"#@title 6. Drive Path to model\n",
|
||||
"#@markdown Path should start with /content/drive/path-to-your-file <br>\n",
|
||||
"#@markdown <font color=\"red\">Note:</font> Model should be downloaded from https://huggingface.co <br>\n",
|
||||
"#@markdown Latest release: [Stable-Diffusion-v-1-4](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original)\n",
|
||||
"from os.path import exists\n",
|
||||
"\n",
|
||||
"model_path = \"\" #@param {type:\"string\"}\n",
|
||||
"if exists(model_path)==True:\n",
|
||||
" print(\"✅ Valid directory\")\n",
|
||||
"else: \n",
|
||||
" print(\"❌ File doesn't exist\")"
|
||||
],
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "zRTJeZ461WGu"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"#@title 7. Symlink to model\n",
|
||||
"\n",
|
||||
"from os.path import exists\n",
|
||||
"import os \n",
|
||||
"\n",
|
||||
"# Folder creation if it doesn't exist\n",
|
||||
"if exists(\"/content/stable-diffusion/models/ldm/stable-diffusion-v1\")==True:\n",
|
||||
" print(\"❗ Dir stable-diffusion-v1 already exists\")\n",
|
||||
"else:\n",
|
||||
" %mkdir /content/stable-diffusion/models/ldm/stable-diffusion-v1\n",
|
||||
" print(\"✅ Dir stable-diffusion-v1 created\")\n",
|
||||
"\n",
|
||||
"# Symbolic link if it doesn't exist\n",
|
||||
"if exists(\"/content/stable-diffusion/models/ldm/stable-diffusion-v1/model.ckpt\")==True:\n",
|
||||
" print(\"❗ Symlink already created\")\n",
|
||||
"else: \n",
|
||||
" src = model_path\n",
|
||||
" dst = '/content/stable-diffusion/models/ldm/stable-diffusion-v1/model.ckpt'\n",
|
||||
" os.symlink(src, dst) \n",
|
||||
" print(\"✅ Symbolic link created successfully\")"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "UY-NNz4I8_aG",
|
||||
"cellView": "form"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"#@title 8. Load small ML models required\n",
|
||||
"%%capture --no-stderr\n",
|
||||
"!pipenv run scripts/preload_models.py\n",
|
||||
"gc.collect()"
|
||||
],
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "ChIDWxLVHGGJ"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"#@title 9. Run Terminal and Execute Dream bot\n",
|
||||
"#@markdown <font color=\"blue\">Steps:</font> <br>\n",
|
||||
"#@markdown 1. Execute command `pipenv run scripts/dream.py` to run dream bot.<br>\n",
|
||||
"#@markdown 2. After initialized you'll see `Dream>` line.<br>\n",
|
||||
"#@markdown 3. Example text: `Astronaut floating in a distant galaxy` <br>\n",
|
||||
"#@markdown 4. To quit Dream bot use: `q` command.<br>\n",
|
||||
"\n",
|
||||
"#Run from virtual env\n",
|
||||
"\n",
|
||||
"%xterm\n",
|
||||
"gc.collect()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ir4hCrMIuUpl",
|
||||
"cellView": "form"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"#@title 10. Show generated images\n",
|
||||
"\n",
|
||||
"import glob\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import matplotlib.image as mpimg\n",
|
||||
"%matplotlib inline\n",
|
||||
"\n",
|
||||
"images = []\n",
|
||||
"for img_path in glob.glob('/content/stable-diffusion/outputs/img-samples/*.png'):\n",
|
||||
" images.append(mpimg.imread(img_path))\n",
|
||||
"\n",
|
||||
"# Remove ticks and labels on x-axis and y-axis both\n",
|
||||
"\n",
|
||||
"plt.figure(figsize=(20,10))\n",
|
||||
"\n",
|
||||
"columns = 5\n",
|
||||
"for i, image in enumerate(images):\n",
|
||||
" ax = plt.subplot(len(images) / columns + 1, columns, i + 1)\n",
|
||||
" ax.axes.xaxis.set_visible(False)\n",
|
||||
" ax.axes.yaxis.set_visible(False)\n",
|
||||
" ax.axis('off')\n",
|
||||
" plt.imshow(image)\n",
|
||||
" gc.collect()\n",
|
||||
"\n"
|
||||
],
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "qnLohSHmKoGk"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
||||
113
VARIATIONS.md
Normal file
@@ -0,0 +1,113 @@
|
||||
# Cheat Sheet for Generating Variations
|
||||
|
||||
Release 1.13 of SD-Dream adds support for image variations. There are two things that you can do:
|
||||
|
||||
1. Generate a series of systematic variations of an image, given a
|
||||
prompt. The amount of variation from one image to the next can be
|
||||
controlled.
|
||||
|
||||
2. Given two or more variations that you like, you can combine them in
|
||||
a weighted fashion.
|
||||
|
||||
This cheat sheet provides a quick guide for how this works in
|
||||
practice, using variations to create the desired image of Xena,
|
||||
Warrior Princess.
|
||||
|
||||
## Step 1 -- find a base image that you like
|
||||
|
||||
The prompt we will use throughout is "lucy lawless as xena, warrior
|
||||
princess, character portrait, high resolution." This will be indicated
|
||||
as "prompt" in the examples below.
|
||||
|
||||
First we let SD create a series of images in the usual way, in this case
|
||||
requesting six iterations:
|
||||
|
||||
~~~
|
||||
dream> lucy lawless as xena, warrior princess, character portrait, high resolution -n6
|
||||
...
|
||||
Outputs:
|
||||
./outputs/Xena/000001.1579445059.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -S1579445059
|
||||
./outputs/Xena/000001.1880768722.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -S1880768722
|
||||
./outputs/Xena/000001.332057179.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -S332057179
|
||||
./outputs/Xena/000001.2224800325.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -S2224800325
|
||||
./outputs/Xena/000001.465250761.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -S465250761
|
||||
./outputs/Xena/000001.3357757885.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -S3357757885
|
||||
~~~
|
||||
|
||||
The one with seed 3357757885 looks nice:
|
||||
|
||||
<img src="static/variation_walkthru/000001.3357757885.png"/>
|
||||
|
||||
Let's try to generate some variations. Using the same seed, we pass
|
||||
the argument -v0.2 (or --variant_amount), which generates a series of
|
||||
variations each differing by a variation amount of 0.2. This number
|
||||
ranges from 0 to 1.0, with higher numbers being larger amounts of
|
||||
variation.
|
||||
|
||||
~~~
|
||||
dream> "prompt" -n6 -S3357757885 -v0.2
|
||||
...
|
||||
Outputs:
|
||||
./outputs/Xena/000002.784039624.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 784039624:0.2 -S3357757885
|
||||
./outputs/Xena/000002.3647897225.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.2 -S3357757885
|
||||
./outputs/Xena/000002.917731034.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 917731034:0.2 -S3357757885
|
||||
./outputs/Xena/000002.4116285959.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 4116285959:0.2 -S3357757885
|
||||
./outputs/Xena/000002.1614299449.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 1614299449:0.2 -S3357757885
|
||||
./outputs/Xena/000002.1335553075.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 1335553075:0.2 -S3357757885
|
||||
~~~
|
||||
|
||||
Note that the output for each image has a -V option giving the
|
||||
"variant subseed" for that image, consisting of a seed followed by the
|
||||
variation amount used to generate it.
|
||||
|
||||
This gives us a series of closely-related variations, including the
|
||||
two shown here.
|
||||
|
||||
<img src="static/variation_walkthru/000002.3647897225.png">
|
||||
<img src="static/variation_walkthru/000002.1614299449.png">
|
||||
|
||||
|
||||
I like the expression on Xena's face in the first one (subseed
|
||||
3647897225), and the armor on her shoulder in the second one (subseed
|
||||
1614299449). Can we combine them to get the best of both worlds?
|
||||
|
||||
We combine the two variations using -V (--with_variations). Again, we
|
||||
must provide the seed for the originally-chosen image in order for
|
||||
this to work.
|
||||
|
||||
~~~
|
||||
dream> "prompt" -S3357757885 -V3647897225,0.1;1614299449,0.1
|
||||
Outputs:
|
||||
./outputs/Xena/000003.1614299449.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1 -S3357757885
|
||||
~~~
|
||||
|
||||
Here we are providing equal weights (0.1 and 0.1) for both the
|
||||
subseeds. The resulting image is close, but not exactly what I
|
||||
wanted:
|
||||
|
||||
<img src="static/variation_walkthru/000003.1614299449.png">
|
||||
|
||||
We could either try combining the images with different weights, or we
|
||||
can generate more variations around the almost-but-not-quite image. We
|
||||
do the latter, using both the -V (combining) and -v (variation
|
||||
strength) options. Note that we use -n6 to generate 6 variations:
|
||||
|
||||
~~~~
|
||||
dream> "prompt" -S3357757885 -V3647897225,0.1;1614299449,0.1 -v0.05 -n6
|
||||
Outputs:
|
||||
./outputs/Xena/000004.3279757577.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,3279757577:0.05 -S3357757885
|
||||
./outputs/Xena/000004.2853129515.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,2853129515:0.05 -S3357757885
|
||||
./outputs/Xena/000004.3747154981.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,3747154981:0.05 -S3357757885
|
||||
./outputs/Xena/000004.2664260391.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,2664260391:0.05 -S3357757885
|
||||
./outputs/Xena/000004.1642517170.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,1642517170:0.05 -S3357757885
|
||||
./outputs/Xena/000004.2183375608.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,2183375608:0.05 -S3357757885
|
||||
~~~~
|
||||
|
||||
This produces six images, all slight variations on the combination of
|
||||
the chosen two images. Here's the one I like best:
|
||||
|
||||
<img src="static/variation_walkthru/000004.3747154981.png">
|
||||
|
||||
As you can see, this is a very powerful tool, which when combined with
|
||||
subprompt weighting, gives you great control over the content and
|
||||
quality of your generated images.
|
||||
18
configs/models.yaml
Normal file
@@ -0,0 +1,18 @@
|
||||
# This file describes the alternative machine learning models
|
||||
# available to the dream script.
|
||||
#
|
||||
# To add a new model, follow the examples below. Each
|
||||
# model requires a model config file, a weights file,
|
||||
# and the width and height of the images it
|
||||
# was trained on.
|
||||
|
||||
laion400m:
|
||||
config: configs/latent-diffusion/txt2img-1p4B-eval.yaml
|
||||
weights: models/ldm/text2img-large/model.ckpt
|
||||
width: 256
|
||||
height: 256
|
||||
stable-diffusion-1.4:
|
||||
config: configs/stable-diffusion/v1-inference.yaml
|
||||
weights: models/ldm/stable-diffusion-v1/model.ckpt
|
||||
width: 512
|
||||
height: 512
|
||||
109
configs/stable-diffusion/v1-finetune.yaml
Normal file
@@ -0,0 +1,109 @@
|
||||
model:
|
||||
base_learning_rate: 5.0e-03
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.0120
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: caption
|
||||
image_size: 64
|
||||
channels: 4
|
||||
cond_stage_trainable: true # Note: different from the one we trained before
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
scale_factor: 0.18215
|
||||
use_ema: False
|
||||
embedding_reg_weight: 0.0
|
||||
|
||||
personalization_config:
|
||||
target: ldm.modules.embedding_manager.EmbeddingManager
|
||||
params:
|
||||
placeholder_strings: ["*"]
|
||||
initializer_words: ["sculpture"]
|
||||
per_image_tokens: false
|
||||
num_vectors_per_token: 1
|
||||
progressive_words: False
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 32 # unused
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [ 4, 2, 1 ]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 4, 4 ]
|
||||
num_heads: 8
|
||||
use_spatial_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 768
|
||||
use_checkpoint: True
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 1
|
||||
num_workers: 2
|
||||
wrap: false
|
||||
train:
|
||||
target: ldm.data.personalized.PersonalizedBase
|
||||
params:
|
||||
size: 512
|
||||
set: train
|
||||
per_image_tokens: false
|
||||
repeats: 100
|
||||
validation:
|
||||
target: ldm.data.personalized.PersonalizedBase
|
||||
params:
|
||||
size: 512
|
||||
set: val
|
||||
per_image_tokens: false
|
||||
repeats: 10
|
||||
|
||||
lightning:
|
||||
modelcheckpoint:
|
||||
params:
|
||||
every_n_train_steps: 500
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 500
|
||||
max_images: 8
|
||||
increase_log_steps: False
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
||||
max_steps: 4000
|
||||
|
||||
103
configs/stable-diffusion/v1-finetune_style.yaml
Normal file
@@ -0,0 +1,103 @@
|
||||
model:
|
||||
base_learning_rate: 5.0e-03
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.0120
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: caption
|
||||
image_size: 64
|
||||
channels: 4
|
||||
cond_stage_trainable: true # Note: different from the one we trained before
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
scale_factor: 0.18215
|
||||
use_ema: False
|
||||
embedding_reg_weight: 0.0
|
||||
|
||||
personalization_config:
|
||||
target: ldm.modules.embedding_manager.EmbeddingManager
|
||||
params:
|
||||
placeholder_strings: ["*"]
|
||||
initializer_words: ["painting"]
|
||||
per_image_tokens: false
|
||||
num_vectors_per_token: 1
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 32 # unused
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [ 4, 2, 1 ]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 4, 4 ]
|
||||
num_heads: 8
|
||||
use_spatial_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 768
|
||||
use_checkpoint: True
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 2
|
||||
num_workers: 16
|
||||
wrap: false
|
||||
train:
|
||||
target: ldm.data.personalized_style.PersonalizedBase
|
||||
params:
|
||||
size: 512
|
||||
set: train
|
||||
per_image_tokens: false
|
||||
repeats: 100
|
||||
validation:
|
||||
target: ldm.data.personalized_style.PersonalizedBase
|
||||
params:
|
||||
size: 512
|
||||
set: val
|
||||
per_image_tokens: false
|
||||
repeats: 10
|
||||
|
||||
lightning:
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 500
|
||||
max_images: 8
|
||||
increase_log_steps: False
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
||||
@@ -26,6 +26,15 @@ model:
|
||||
f_max: [ 1. ]
|
||||
f_min: [ 1. ]
|
||||
|
||||
personalization_config:
|
||||
target: ldm.modules.embedding_manager.EmbeddingManager
|
||||
params:
|
||||
placeholder_strings: ["*"]
|
||||
initializer_words: ["sculpture"]
|
||||
per_image_tokens: false
|
||||
num_vectors_per_token: 1
|
||||
progressive_words: False
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
|
||||
58
environment-mac.yaml
Normal file
@@ -0,0 +1,58 @@
|
||||
name: ldm
|
||||
channels:
|
||||
- pytorch-nightly
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python==3.9.13
|
||||
- pip==22.2.2
|
||||
|
||||
# pytorch-nightly, left unpinned
|
||||
- pytorch
|
||||
- torchmetrics
|
||||
- torchvision
|
||||
|
||||
# I suggest keeping the other deps sorted for convenience.
|
||||
# If you wish to upgrade to 3.10, try to run this:
|
||||
#
|
||||
# ```shell
|
||||
# CONDA_CMD=conda
|
||||
# sed -E 's/python==3.9.13/python==3.10.5/;s/ldm/ldm-3.10/;21,99s/- ([^=]+)==.+/- \1/' environment-mac.yaml > /tmp/environment-mac-updated.yml
|
||||
# CONDA_SUBDIR=osx-arm64 $CONDA_CMD env create -f /tmp/environment-mac-updated.yml && $CONDA_CMD list -n ldm-3.10 | awk ' {print " - " $1 "==" $2;} '
|
||||
# ```
|
||||
#
|
||||
# Unfortunately, as of 2022-08-31, this fails at the pip stage.
|
||||
- albumentations==1.2.1
|
||||
- coloredlogs==15.0.1
|
||||
- einops==0.4.1
|
||||
- grpcio==1.46.4
|
||||
- humanfriendly
|
||||
- imageio-ffmpeg==0.4.7
|
||||
- imageio==2.21.2
|
||||
- imgaug==0.4.0
|
||||
- kornia==0.6.7
|
||||
- mpmath==1.2.1
|
||||
- nomkl
|
||||
- numpy==1.23.2
|
||||
- omegaconf==2.1.1
|
||||
- onnx==1.12.0
|
||||
- onnxruntime==1.12.1
|
||||
- opencv==4.6.0
|
||||
- pudb==2022.1
|
||||
- pytorch-lightning==1.6.5
|
||||
- scipy==1.9.1
|
||||
- streamlit==1.12.2
|
||||
- sympy==1.10.1
|
||||
- tensorboard==2.9.0
|
||||
- transformers==4.21.2
|
||||
- pip:
|
||||
- invisible-watermark
|
||||
- test-tube
|
||||
- tokenizers
|
||||
- torch-fidelity
|
||||
- -e git+https://github.com/huggingface/diffusers.git@v0.2.4#egg=diffusers
|
||||
- -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
|
||||
- -e git+https://github.com/openai/CLIP.git@main#egg=clip
|
||||
- -e git+https://github.com/Birch-san/k-diffusion.git@mps#egg=k_diffusion
|
||||
- -e .
|
||||
variables:
|
||||
PYTORCH_ENABLE_MPS_FALLBACK: 1
|
||||
@@ -18,13 +18,13 @@ dependencies:
|
||||
- pytorch-lightning==1.4.2
|
||||
- omegaconf==2.1.1
|
||||
- test-tube>=0.7.5
|
||||
- streamlit>=0.73.1
|
||||
- streamlit==1.12.0
|
||||
- pillow==9.2.0
|
||||
- einops==0.3.0
|
||||
- torch-fidelity==0.3.0
|
||||
- transformers==4.19.2
|
||||
- torchmetrics==0.6.0
|
||||
- kornia==0.6
|
||||
- accelerate==0.12.0
|
||||
- kornia==0.6.0
|
||||
- -e git+https://github.com/openai/CLIP.git@main#egg=clip
|
||||
- -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
|
||||
- -e git+https://github.com/lstein/k-diffusion.git@master#egg=k-diffusion
|
||||
|
||||
@@ -1,11 +1,17 @@
|
||||
from abc import abstractmethod
|
||||
from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset
|
||||
from torch.utils.data import (
|
||||
Dataset,
|
||||
ConcatDataset,
|
||||
ChainDataset,
|
||||
IterableDataset,
|
||||
)
|
||||
|
||||
|
||||
class Txt2ImgIterableBaseDataset(IterableDataset):
|
||||
'''
|
||||
"""
|
||||
Define an interface to make the IterableDatasets for text2img data chainable
|
||||
'''
|
||||
"""
|
||||
|
||||
def __init__(self, num_records=0, valid_ids=None, size=256):
|
||||
super().__init__()
|
||||
self.num_records = num_records
|
||||
@@ -13,11 +19,13 @@ class Txt2ImgIterableBaseDataset(IterableDataset):
|
||||
self.sample_ids = valid_ids
|
||||
self.size = size
|
||||
|
||||
print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.')
|
||||
print(
|
||||
f'{self.__class__.__name__} dataset contains {self.__len__()} examples.'
|
||||
)
|
||||
|
||||
def __len__(self):
|
||||
return self.num_records
|
||||
|
||||
@abstractmethod
|
||||
def __iter__(self):
|
||||
pass
|
||||
pass
|
||||
|
||||
@@ -11,24 +11,34 @@ from tqdm import tqdm
|
||||
from torch.utils.data import Dataset, Subset
|
||||
|
||||
import taming.data.utils as tdu
|
||||
from taming.data.imagenet import str_to_indices, give_synsets_from_indices, download, retrieve
|
||||
from taming.data.imagenet import (
|
||||
str_to_indices,
|
||||
give_synsets_from_indices,
|
||||
download,
|
||||
retrieve,
|
||||
)
|
||||
from taming.data.imagenet import ImagePaths
|
||||
|
||||
from ldm.modules.image_degradation import degradation_fn_bsr, degradation_fn_bsr_light
|
||||
from ldm.modules.image_degradation import (
|
||||
degradation_fn_bsr,
|
||||
degradation_fn_bsr_light,
|
||||
)
|
||||
|
||||
|
||||
def synset2idx(path_to_yaml="data/index_synset.yaml"):
|
||||
def synset2idx(path_to_yaml='data/index_synset.yaml'):
|
||||
with open(path_to_yaml) as f:
|
||||
di2s = yaml.load(f)
|
||||
return dict((v,k) for k,v in di2s.items())
|
||||
return dict((v, k) for k, v in di2s.items())
|
||||
|
||||
|
||||
class ImageNetBase(Dataset):
|
||||
def __init__(self, config=None):
|
||||
self.config = config or OmegaConf.create()
|
||||
if not type(self.config)==dict:
|
||||
if not type(self.config) == dict:
|
||||
self.config = OmegaConf.to_container(self.config)
|
||||
self.keep_orig_class_label = self.config.get("keep_orig_class_label", False)
|
||||
self.keep_orig_class_label = self.config.get(
|
||||
'keep_orig_class_label', False
|
||||
)
|
||||
self.process_images = True # if False we skip loading & processing images and self.data contains filepaths
|
||||
self._prepare()
|
||||
self._prepare_synset_to_human()
|
||||
@@ -46,17 +56,23 @@ class ImageNetBase(Dataset):
|
||||
raise NotImplementedError()
|
||||
|
||||
def _filter_relpaths(self, relpaths):
|
||||
ignore = set([
|
||||
"n06596364_9591.JPEG",
|
||||
])
|
||||
relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore]
|
||||
if "sub_indices" in self.config:
|
||||
indices = str_to_indices(self.config["sub_indices"])
|
||||
synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn) # returns a list of strings
|
||||
ignore = set(
|
||||
[
|
||||
'n06596364_9591.JPEG',
|
||||
]
|
||||
)
|
||||
relpaths = [
|
||||
rpath for rpath in relpaths if not rpath.split('/')[-1] in ignore
|
||||
]
|
||||
if 'sub_indices' in self.config:
|
||||
indices = str_to_indices(self.config['sub_indices'])
|
||||
synsets = give_synsets_from_indices(
|
||||
indices, path_to_yaml=self.idx2syn
|
||||
) # returns a list of strings
|
||||
self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
|
||||
files = []
|
||||
for rpath in relpaths:
|
||||
syn = rpath.split("/")[0]
|
||||
syn = rpath.split('/')[0]
|
||||
if syn in synsets:
|
||||
files.append(rpath)
|
||||
return files
|
||||
@@ -65,78 +81,89 @@ class ImageNetBase(Dataset):
|
||||
|
||||
def _prepare_synset_to_human(self):
|
||||
SIZE = 2655750
|
||||
URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
|
||||
self.human_dict = os.path.join(self.root, "synset_human.txt")
|
||||
if (not os.path.exists(self.human_dict) or
|
||||
not os.path.getsize(self.human_dict)==SIZE):
|
||||
URL = 'https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1'
|
||||
self.human_dict = os.path.join(self.root, 'synset_human.txt')
|
||||
if (
|
||||
not os.path.exists(self.human_dict)
|
||||
or not os.path.getsize(self.human_dict) == SIZE
|
||||
):
|
||||
download(URL, self.human_dict)
|
||||
|
||||
def _prepare_idx_to_synset(self):
|
||||
URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1"
|
||||
self.idx2syn = os.path.join(self.root, "index_synset.yaml")
|
||||
if (not os.path.exists(self.idx2syn)):
|
||||
URL = 'https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1'
|
||||
self.idx2syn = os.path.join(self.root, 'index_synset.yaml')
|
||||
if not os.path.exists(self.idx2syn):
|
||||
download(URL, self.idx2syn)
|
||||
|
||||
def _prepare_human_to_integer_label(self):
|
||||
URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1"
|
||||
self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt")
|
||||
if (not os.path.exists(self.human2integer)):
|
||||
URL = 'https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1'
|
||||
self.human2integer = os.path.join(
|
||||
self.root, 'imagenet1000_clsidx_to_labels.txt'
|
||||
)
|
||||
if not os.path.exists(self.human2integer):
|
||||
download(URL, self.human2integer)
|
||||
with open(self.human2integer, "r") as f:
|
||||
with open(self.human2integer, 'r') as f:
|
||||
lines = f.read().splitlines()
|
||||
assert len(lines) == 1000
|
||||
self.human2integer_dict = dict()
|
||||
for line in lines:
|
||||
value, key = line.split(":")
|
||||
value, key = line.split(':')
|
||||
self.human2integer_dict[key] = int(value)
|
||||
|
||||
def _load(self):
|
||||
with open(self.txt_filelist, "r") as f:
|
||||
with open(self.txt_filelist, 'r') as f:
|
||||
self.relpaths = f.read().splitlines()
|
||||
l1 = len(self.relpaths)
|
||||
self.relpaths = self._filter_relpaths(self.relpaths)
|
||||
print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))
|
||||
print(
|
||||
'Removed {} files from filelist during filtering.'.format(
|
||||
l1 - len(self.relpaths)
|
||||
)
|
||||
)
|
||||
|
||||
self.synsets = [p.split("/")[0] for p in self.relpaths]
|
||||
self.synsets = [p.split('/')[0] for p in self.relpaths]
|
||||
self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]
|
||||
|
||||
unique_synsets = np.unique(self.synsets)
|
||||
class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
|
||||
class_dict = dict(
|
||||
(synset, i) for i, synset in enumerate(unique_synsets)
|
||||
)
|
||||
if not self.keep_orig_class_label:
|
||||
self.class_labels = [class_dict[s] for s in self.synsets]
|
||||
else:
|
||||
self.class_labels = [self.synset2idx[s] for s in self.synsets]
|
||||
|
||||
with open(self.human_dict, "r") as f:
|
||||
with open(self.human_dict, 'r') as f:
|
||||
human_dict = f.read().splitlines()
|
||||
human_dict = dict(line.split(maxsplit=1) for line in human_dict)
|
||||
|
||||
self.human_labels = [human_dict[s] for s in self.synsets]
|
||||
|
||||
labels = {
|
||||
"relpath": np.array(self.relpaths),
|
||||
"synsets": np.array(self.synsets),
|
||||
"class_label": np.array(self.class_labels),
|
||||
"human_label": np.array(self.human_labels),
|
||||
'relpath': np.array(self.relpaths),
|
||||
'synsets': np.array(self.synsets),
|
||||
'class_label': np.array(self.class_labels),
|
||||
'human_label': np.array(self.human_labels),
|
||||
}
|
||||
|
||||
if self.process_images:
|
||||
self.size = retrieve(self.config, "size", default=256)
|
||||
self.data = ImagePaths(self.abspaths,
|
||||
labels=labels,
|
||||
size=self.size,
|
||||
random_crop=self.random_crop,
|
||||
)
|
||||
self.size = retrieve(self.config, 'size', default=256)
|
||||
self.data = ImagePaths(
|
||||
self.abspaths,
|
||||
labels=labels,
|
||||
size=self.size,
|
||||
random_crop=self.random_crop,
|
||||
)
|
||||
else:
|
||||
self.data = self.abspaths
|
||||
|
||||
|
||||
class ImageNetTrain(ImageNetBase):
|
||||
NAME = "ILSVRC2012_train"
|
||||
URL = "http://www.image-net.org/challenges/LSVRC/2012/"
|
||||
AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
|
||||
NAME = 'ILSVRC2012_train'
|
||||
URL = 'http://www.image-net.org/challenges/LSVRC/2012/'
|
||||
AT_HASH = 'a306397ccf9c2ead27155983c254227c0fd938e2'
|
||||
FILES = [
|
||||
"ILSVRC2012_img_train.tar",
|
||||
'ILSVRC2012_img_train.tar',
|
||||
]
|
||||
SIZES = [
|
||||
147897477120,
|
||||
@@ -151,57 +178,64 @@ class ImageNetTrain(ImageNetBase):
|
||||
if self.data_root:
|
||||
self.root = os.path.join(self.data_root, self.NAME)
|
||||
else:
|
||||
cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
|
||||
self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
|
||||
cachedir = os.environ.get(
|
||||
'XDG_CACHE_HOME', os.path.expanduser('~/.cache')
|
||||
)
|
||||
self.root = os.path.join(cachedir, 'autoencoders/data', self.NAME)
|
||||
|
||||
self.datadir = os.path.join(self.root, "data")
|
||||
self.txt_filelist = os.path.join(self.root, "filelist.txt")
|
||||
self.datadir = os.path.join(self.root, 'data')
|
||||
self.txt_filelist = os.path.join(self.root, 'filelist.txt')
|
||||
self.expected_length = 1281167
|
||||
self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop",
|
||||
default=True)
|
||||
self.random_crop = retrieve(
|
||||
self.config, 'ImageNetTrain/random_crop', default=True
|
||||
)
|
||||
if not tdu.is_prepared(self.root):
|
||||
# prep
|
||||
print("Preparing dataset {} in {}".format(self.NAME, self.root))
|
||||
print('Preparing dataset {} in {}'.format(self.NAME, self.root))
|
||||
|
||||
datadir = self.datadir
|
||||
if not os.path.exists(datadir):
|
||||
path = os.path.join(self.root, self.FILES[0])
|
||||
if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
|
||||
if (
|
||||
not os.path.exists(path)
|
||||
or not os.path.getsize(path) == self.SIZES[0]
|
||||
):
|
||||
import academictorrents as at
|
||||
|
||||
atpath = at.get(self.AT_HASH, datastore=self.root)
|
||||
assert atpath == path
|
||||
|
||||
print("Extracting {} to {}".format(path, datadir))
|
||||
print('Extracting {} to {}'.format(path, datadir))
|
||||
os.makedirs(datadir, exist_ok=True)
|
||||
with tarfile.open(path, "r:") as tar:
|
||||
with tarfile.open(path, 'r:') as tar:
|
||||
tar.extractall(path=datadir)
|
||||
|
||||
print("Extracting sub-tars.")
|
||||
subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
|
||||
print('Extracting sub-tars.')
|
||||
subpaths = sorted(glob.glob(os.path.join(datadir, '*.tar')))
|
||||
for subpath in tqdm(subpaths):
|
||||
subdir = subpath[:-len(".tar")]
|
||||
subdir = subpath[: -len('.tar')]
|
||||
os.makedirs(subdir, exist_ok=True)
|
||||
with tarfile.open(subpath, "r:") as tar:
|
||||
with tarfile.open(subpath, 'r:') as tar:
|
||||
tar.extractall(path=subdir)
|
||||
|
||||
filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
|
||||
filelist = glob.glob(os.path.join(datadir, '**', '*.JPEG'))
|
||||
filelist = [os.path.relpath(p, start=datadir) for p in filelist]
|
||||
filelist = sorted(filelist)
|
||||
filelist = "\n".join(filelist)+"\n"
|
||||
with open(self.txt_filelist, "w") as f:
|
||||
filelist = '\n'.join(filelist) + '\n'
|
||||
with open(self.txt_filelist, 'w') as f:
|
||||
f.write(filelist)
|
||||
|
||||
tdu.mark_prepared(self.root)
|
||||
|
||||
|
||||
class ImageNetValidation(ImageNetBase):
|
||||
NAME = "ILSVRC2012_validation"
|
||||
URL = "http://www.image-net.org/challenges/LSVRC/2012/"
|
||||
AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5"
|
||||
VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1"
|
||||
NAME = 'ILSVRC2012_validation'
|
||||
URL = 'http://www.image-net.org/challenges/LSVRC/2012/'
|
||||
AT_HASH = '5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5'
|
||||
VS_URL = 'https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1'
|
||||
FILES = [
|
||||
"ILSVRC2012_img_val.tar",
|
||||
"validation_synset.txt",
|
||||
'ILSVRC2012_img_val.tar',
|
||||
'validation_synset.txt',
|
||||
]
|
||||
SIZES = [
|
||||
6744924160,
|
||||
@@ -217,39 +251,49 @@ class ImageNetValidation(ImageNetBase):
|
||||
if self.data_root:
|
||||
self.root = os.path.join(self.data_root, self.NAME)
|
||||
else:
|
||||
cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
|
||||
self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
|
||||
self.datadir = os.path.join(self.root, "data")
|
||||
self.txt_filelist = os.path.join(self.root, "filelist.txt")
|
||||
cachedir = os.environ.get(
|
||||
'XDG_CACHE_HOME', os.path.expanduser('~/.cache')
|
||||
)
|
||||
self.root = os.path.join(cachedir, 'autoencoders/data', self.NAME)
|
||||
self.datadir = os.path.join(self.root, 'data')
|
||||
self.txt_filelist = os.path.join(self.root, 'filelist.txt')
|
||||
self.expected_length = 50000
|
||||
self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop",
|
||||
default=False)
|
||||
self.random_crop = retrieve(
|
||||
self.config, 'ImageNetValidation/random_crop', default=False
|
||||
)
|
||||
if not tdu.is_prepared(self.root):
|
||||
# prep
|
||||
print("Preparing dataset {} in {}".format(self.NAME, self.root))
|
||||
print('Preparing dataset {} in {}'.format(self.NAME, self.root))
|
||||
|
||||
datadir = self.datadir
|
||||
if not os.path.exists(datadir):
|
||||
path = os.path.join(self.root, self.FILES[0])
|
||||
if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
|
||||
if (
|
||||
not os.path.exists(path)
|
||||
or not os.path.getsize(path) == self.SIZES[0]
|
||||
):
|
||||
import academictorrents as at
|
||||
|
||||
atpath = at.get(self.AT_HASH, datastore=self.root)
|
||||
assert atpath == path
|
||||
|
||||
print("Extracting {} to {}".format(path, datadir))
|
||||
print('Extracting {} to {}'.format(path, datadir))
|
||||
os.makedirs(datadir, exist_ok=True)
|
||||
with tarfile.open(path, "r:") as tar:
|
||||
with tarfile.open(path, 'r:') as tar:
|
||||
tar.extractall(path=datadir)
|
||||
|
||||
vspath = os.path.join(self.root, self.FILES[1])
|
||||
if not os.path.exists(vspath) or not os.path.getsize(vspath)==self.SIZES[1]:
|
||||
if (
|
||||
not os.path.exists(vspath)
|
||||
or not os.path.getsize(vspath) == self.SIZES[1]
|
||||
):
|
||||
download(self.VS_URL, vspath)
|
||||
|
||||
with open(vspath, "r") as f:
|
||||
with open(vspath, 'r') as f:
|
||||
synset_dict = f.read().splitlines()
|
||||
synset_dict = dict(line.split() for line in synset_dict)
|
||||
|
||||
print("Reorganizing into synset folders")
|
||||
print('Reorganizing into synset folders')
|
||||
synsets = np.unique(list(synset_dict.values()))
|
||||
for s in synsets:
|
||||
os.makedirs(os.path.join(datadir, s), exist_ok=True)
|
||||
@@ -258,21 +302,26 @@ class ImageNetValidation(ImageNetBase):
|
||||
dst = os.path.join(datadir, v)
|
||||
shutil.move(src, dst)
|
||||
|
||||
filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
|
||||
filelist = glob.glob(os.path.join(datadir, '**', '*.JPEG'))
|
||||
filelist = [os.path.relpath(p, start=datadir) for p in filelist]
|
||||
filelist = sorted(filelist)
|
||||
filelist = "\n".join(filelist)+"\n"
|
||||
with open(self.txt_filelist, "w") as f:
|
||||
filelist = '\n'.join(filelist) + '\n'
|
||||
with open(self.txt_filelist, 'w') as f:
|
||||
f.write(filelist)
|
||||
|
||||
tdu.mark_prepared(self.root)
|
||||
|
||||
|
||||
|
||||
class ImageNetSR(Dataset):
|
||||
def __init__(self, size=None,
|
||||
degradation=None, downscale_f=4, min_crop_f=0.5, max_crop_f=1.,
|
||||
random_crop=True):
|
||||
def __init__(
|
||||
self,
|
||||
size=None,
|
||||
degradation=None,
|
||||
downscale_f=4,
|
||||
min_crop_f=0.5,
|
||||
max_crop_f=1.0,
|
||||
random_crop=True,
|
||||
):
|
||||
"""
|
||||
Imagenet Superresolution Dataloader
|
||||
Performs following ops in order:
|
||||
@@ -296,67 +345,86 @@ class ImageNetSR(Dataset):
|
||||
self.LR_size = int(size / downscale_f)
|
||||
self.min_crop_f = min_crop_f
|
||||
self.max_crop_f = max_crop_f
|
||||
assert(max_crop_f <= 1.)
|
||||
assert max_crop_f <= 1.0
|
||||
self.center_crop = not random_crop
|
||||
|
||||
self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA)
|
||||
self.image_rescaler = albumentations.SmallestMaxSize(
|
||||
max_size=size, interpolation=cv2.INTER_AREA
|
||||
)
|
||||
|
||||
self.pil_interpolation = False # gets reset later if incase interp_op is from pillow
|
||||
self.pil_interpolation = (
|
||||
False # gets reset later if incase interp_op is from pillow
|
||||
)
|
||||
|
||||
if degradation == "bsrgan":
|
||||
self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f)
|
||||
if degradation == 'bsrgan':
|
||||
self.degradation_process = partial(
|
||||
degradation_fn_bsr, sf=downscale_f
|
||||
)
|
||||
|
||||
elif degradation == "bsrgan_light":
|
||||
self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f)
|
||||
elif degradation == 'bsrgan_light':
|
||||
self.degradation_process = partial(
|
||||
degradation_fn_bsr_light, sf=downscale_f
|
||||
)
|
||||
|
||||
else:
|
||||
interpolation_fn = {
|
||||
"cv_nearest": cv2.INTER_NEAREST,
|
||||
"cv_bilinear": cv2.INTER_LINEAR,
|
||||
"cv_bicubic": cv2.INTER_CUBIC,
|
||||
"cv_area": cv2.INTER_AREA,
|
||||
"cv_lanczos": cv2.INTER_LANCZOS4,
|
||||
"pil_nearest": PIL.Image.NEAREST,
|
||||
"pil_bilinear": PIL.Image.BILINEAR,
|
||||
"pil_bicubic": PIL.Image.BICUBIC,
|
||||
"pil_box": PIL.Image.BOX,
|
||||
"pil_hamming": PIL.Image.HAMMING,
|
||||
"pil_lanczos": PIL.Image.LANCZOS,
|
||||
'cv_nearest': cv2.INTER_NEAREST,
|
||||
'cv_bilinear': cv2.INTER_LINEAR,
|
||||
'cv_bicubic': cv2.INTER_CUBIC,
|
||||
'cv_area': cv2.INTER_AREA,
|
||||
'cv_lanczos': cv2.INTER_LANCZOS4,
|
||||
'pil_nearest': PIL.Image.NEAREST,
|
||||
'pil_bilinear': PIL.Image.BILINEAR,
|
||||
'pil_bicubic': PIL.Image.BICUBIC,
|
||||
'pil_box': PIL.Image.BOX,
|
||||
'pil_hamming': PIL.Image.HAMMING,
|
||||
'pil_lanczos': PIL.Image.LANCZOS,
|
||||
}[degradation]
|
||||
|
||||
self.pil_interpolation = degradation.startswith("pil_")
|
||||
self.pil_interpolation = degradation.startswith('pil_')
|
||||
|
||||
if self.pil_interpolation:
|
||||
self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn)
|
||||
self.degradation_process = partial(
|
||||
TF.resize,
|
||||
size=self.LR_size,
|
||||
interpolation=interpolation_fn,
|
||||
)
|
||||
|
||||
else:
|
||||
self.degradation_process = albumentations.SmallestMaxSize(max_size=self.LR_size,
|
||||
interpolation=interpolation_fn)
|
||||
self.degradation_process = albumentations.SmallestMaxSize(
|
||||
max_size=self.LR_size, interpolation=interpolation_fn
|
||||
)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.base)
|
||||
|
||||
def __getitem__(self, i):
|
||||
example = self.base[i]
|
||||
image = Image.open(example["file_path_"])
|
||||
image = Image.open(example['file_path_'])
|
||||
|
||||
if not image.mode == "RGB":
|
||||
image = image.convert("RGB")
|
||||
if not image.mode == 'RGB':
|
||||
image = image.convert('RGB')
|
||||
|
||||
image = np.array(image).astype(np.uint8)
|
||||
|
||||
min_side_len = min(image.shape[:2])
|
||||
crop_side_len = min_side_len * np.random.uniform(self.min_crop_f, self.max_crop_f, size=None)
|
||||
crop_side_len = min_side_len * np.random.uniform(
|
||||
self.min_crop_f, self.max_crop_f, size=None
|
||||
)
|
||||
crop_side_len = int(crop_side_len)
|
||||
|
||||
if self.center_crop:
|
||||
self.cropper = albumentations.CenterCrop(height=crop_side_len, width=crop_side_len)
|
||||
self.cropper = albumentations.CenterCrop(
|
||||
height=crop_side_len, width=crop_side_len
|
||||
)
|
||||
|
||||
else:
|
||||
self.cropper = albumentations.RandomCrop(height=crop_side_len, width=crop_side_len)
|
||||
self.cropper = albumentations.RandomCrop(
|
||||
height=crop_side_len, width=crop_side_len
|
||||
)
|
||||
|
||||
image = self.cropper(image=image)["image"]
|
||||
image = self.image_rescaler(image=image)["image"]
|
||||
image = self.cropper(image=image)['image']
|
||||
image = self.image_rescaler(image=image)['image']
|
||||
|
||||
if self.pil_interpolation:
|
||||
image_pil = PIL.Image.fromarray(image)
|
||||
@@ -364,10 +432,10 @@ class ImageNetSR(Dataset):
|
||||
LR_image = np.array(LR_image).astype(np.uint8)
|
||||
|
||||
else:
|
||||
LR_image = self.degradation_process(image=image)["image"]
|
||||
LR_image = self.degradation_process(image=image)['image']
|
||||
|
||||
example["image"] = (image/127.5 - 1.0).astype(np.float32)
|
||||
example["LR_image"] = (LR_image/127.5 - 1.0).astype(np.float32)
|
||||
example['image'] = (image / 127.5 - 1.0).astype(np.float32)
|
||||
example['LR_image'] = (LR_image / 127.5 - 1.0).astype(np.float32)
|
||||
|
||||
return example
|
||||
|
||||
@@ -377,9 +445,11 @@ class ImageNetSRTrain(ImageNetSR):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def get_base(self):
|
||||
with open("data/imagenet_train_hr_indices.p", "rb") as f:
|
||||
with open('data/imagenet_train_hr_indices.p', 'rb') as f:
|
||||
indices = pickle.load(f)
|
||||
dset = ImageNetTrain(process_images=False,)
|
||||
dset = ImageNetTrain(
|
||||
process_images=False,
|
||||
)
|
||||
return Subset(dset, indices)
|
||||
|
||||
|
||||
@@ -388,7 +458,9 @@ class ImageNetSRValidation(ImageNetSR):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def get_base(self):
|
||||
with open("data/imagenet_val_hr_indices.p", "rb") as f:
|
||||
with open('data/imagenet_val_hr_indices.p', 'rb') as f:
|
||||
indices = pickle.load(f)
|
||||
dset = ImageNetValidation(process_images=False,)
|
||||
dset = ImageNetValidation(
|
||||
process_images=False,
|
||||
)
|
||||
return Subset(dset, indices)
|
||||
|
||||
104
ldm/data/lsun.py
@@ -7,30 +7,33 @@ from torchvision import transforms
|
||||
|
||||
|
||||
class LSUNBase(Dataset):
|
||||
def __init__(self,
|
||||
txt_file,
|
||||
data_root,
|
||||
size=None,
|
||||
interpolation="bicubic",
|
||||
flip_p=0.5
|
||||
):
|
||||
def __init__(
|
||||
self,
|
||||
txt_file,
|
||||
data_root,
|
||||
size=None,
|
||||
interpolation='bicubic',
|
||||
flip_p=0.5,
|
||||
):
|
||||
self.data_paths = txt_file
|
||||
self.data_root = data_root
|
||||
with open(self.data_paths, "r") as f:
|
||||
with open(self.data_paths, 'r') as f:
|
||||
self.image_paths = f.read().splitlines()
|
||||
self._length = len(self.image_paths)
|
||||
self.labels = {
|
||||
"relative_file_path_": [l for l in self.image_paths],
|
||||
"file_path_": [os.path.join(self.data_root, l)
|
||||
for l in self.image_paths],
|
||||
'relative_file_path_': [l for l in self.image_paths],
|
||||
'file_path_': [
|
||||
os.path.join(self.data_root, l) for l in self.image_paths
|
||||
],
|
||||
}
|
||||
|
||||
self.size = size
|
||||
self.interpolation = {"linear": PIL.Image.LINEAR,
|
||||
"bilinear": PIL.Image.BILINEAR,
|
||||
"bicubic": PIL.Image.BICUBIC,
|
||||
"lanczos": PIL.Image.LANCZOS,
|
||||
}[interpolation]
|
||||
self.interpolation = {
|
||||
'linear': PIL.Image.LINEAR,
|
||||
'bilinear': PIL.Image.BILINEAR,
|
||||
'bicubic': PIL.Image.BICUBIC,
|
||||
'lanczos': PIL.Image.LANCZOS,
|
||||
}[interpolation]
|
||||
self.flip = transforms.RandomHorizontalFlip(p=flip_p)
|
||||
|
||||
def __len__(self):
|
||||
@@ -38,55 +41,86 @@ class LSUNBase(Dataset):
|
||||
|
||||
def __getitem__(self, i):
|
||||
example = dict((k, self.labels[k][i]) for k in self.labels)
|
||||
image = Image.open(example["file_path_"])
|
||||
if not image.mode == "RGB":
|
||||
image = image.convert("RGB")
|
||||
image = Image.open(example['file_path_'])
|
||||
if not image.mode == 'RGB':
|
||||
image = image.convert('RGB')
|
||||
|
||||
# default to score-sde preprocessing
|
||||
img = np.array(image).astype(np.uint8)
|
||||
crop = min(img.shape[0], img.shape[1])
|
||||
h, w, = img.shape[0], img.shape[1]
|
||||
img = img[(h - crop) // 2:(h + crop) // 2,
|
||||
(w - crop) // 2:(w + crop) // 2]
|
||||
h, w, = (
|
||||
img.shape[0],
|
||||
img.shape[1],
|
||||
)
|
||||
img = img[
|
||||
(h - crop) // 2 : (h + crop) // 2,
|
||||
(w - crop) // 2 : (w + crop) // 2,
|
||||
]
|
||||
|
||||
image = Image.fromarray(img)
|
||||
if self.size is not None:
|
||||
image = image.resize((self.size, self.size), resample=self.interpolation)
|
||||
image = image.resize(
|
||||
(self.size, self.size), resample=self.interpolation
|
||||
)
|
||||
|
||||
image = self.flip(image)
|
||||
image = np.array(image).astype(np.uint8)
|
||||
example["image"] = (image / 127.5 - 1.0).astype(np.float32)
|
||||
example['image'] = (image / 127.5 - 1.0).astype(np.float32)
|
||||
return example
|
||||
|
||||
|
||||
class LSUNChurchesTrain(LSUNBase):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(txt_file="data/lsun/church_outdoor_train.txt", data_root="data/lsun/churches", **kwargs)
|
||||
super().__init__(
|
||||
txt_file='data/lsun/church_outdoor_train.txt',
|
||||
data_root='data/lsun/churches',
|
||||
**kwargs
|
||||
)
|
||||
|
||||
|
||||
class LSUNChurchesValidation(LSUNBase):
|
||||
def __init__(self, flip_p=0., **kwargs):
|
||||
super().__init__(txt_file="data/lsun/church_outdoor_val.txt", data_root="data/lsun/churches",
|
||||
flip_p=flip_p, **kwargs)
|
||||
def __init__(self, flip_p=0.0, **kwargs):
|
||||
super().__init__(
|
||||
txt_file='data/lsun/church_outdoor_val.txt',
|
||||
data_root='data/lsun/churches',
|
||||
flip_p=flip_p,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
|
||||
class LSUNBedroomsTrain(LSUNBase):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(txt_file="data/lsun/bedrooms_train.txt", data_root="data/lsun/bedrooms", **kwargs)
|
||||
super().__init__(
|
||||
txt_file='data/lsun/bedrooms_train.txt',
|
||||
data_root='data/lsun/bedrooms',
|
||||
**kwargs
|
||||
)
|
||||
|
||||
|
||||
class LSUNBedroomsValidation(LSUNBase):
|
||||
def __init__(self, flip_p=0.0, **kwargs):
|
||||
super().__init__(txt_file="data/lsun/bedrooms_val.txt", data_root="data/lsun/bedrooms",
|
||||
flip_p=flip_p, **kwargs)
|
||||
super().__init__(
|
||||
txt_file='data/lsun/bedrooms_val.txt',
|
||||
data_root='data/lsun/bedrooms',
|
||||
flip_p=flip_p,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
|
||||
class LSUNCatsTrain(LSUNBase):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(txt_file="data/lsun/cat_train.txt", data_root="data/lsun/cats", **kwargs)
|
||||
super().__init__(
|
||||
txt_file='data/lsun/cat_train.txt',
|
||||
data_root='data/lsun/cats',
|
||||
**kwargs
|
||||
)
|
||||
|
||||
|
||||
class LSUNCatsValidation(LSUNBase):
|
||||
def __init__(self, flip_p=0., **kwargs):
|
||||
super().__init__(txt_file="data/lsun/cat_val.txt", data_root="data/lsun/cats",
|
||||
flip_p=flip_p, **kwargs)
|
||||
def __init__(self, flip_p=0.0, **kwargs):
|
||||
super().__init__(
|
||||
txt_file='data/lsun/cat_val.txt',
|
||||
data_root='data/lsun/cats',
|
||||
flip_p=flip_p,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
202
ldm/data/personalized.py
Normal file
@@ -0,0 +1,202 @@
|
||||
import os
|
||||
import numpy as np
|
||||
import PIL
|
||||
from PIL import Image
|
||||
from torch.utils.data import Dataset
|
||||
from torchvision import transforms
|
||||
|
||||
import random
|
||||
|
||||
# Minimal template set: a single one-slot caption.
imagenet_templates_smallest = ['a photo of a {}']

# One-slot caption templates; the slot is filled via str.format().
imagenet_templates_small = [
    'a photo of a {}',
    'a rendering of a {}',
    'a cropped photo of the {}',
    'the photo of a {}',
    'a photo of a clean {}',
    'a photo of a dirty {}',
    'a dark photo of the {}',
    'a photo of my {}',
    'a photo of the cool {}',
    'a close-up photo of a {}',
    'a bright photo of the {}',
    'a cropped photo of a {}',
    'a photo of the {}',
    'a good photo of the {}',
    'a photo of one {}',
    'a close-up photo of the {}',
    'a rendition of the {}',
    'a photo of the clean {}',
    'a rendition of a {}',
    'a photo of a nice {}',
    'a good photo of a {}',
    'a photo of the nice {}',
    'a photo of the small {}',
    'a photo of the weird {}',
    'a photo of the large {}',
    'a photo of a cool {}',
    'a photo of a small {}',
]

# Two-slot caption templates: each one-slot template above, in the same
# order, followed by ' with {}'.
imagenet_dual_templates_small = [
    single_slot + ' with {}' for single_slot in imagenet_templates_small
]

# Tokens available for tagging individual images (one Hebrew letter each).
per_img_token_list = [
    'א',
    'ב',
    'ג',
    'ד',
    'ה',
    'ו',
    'ז',
    'ח',
    'ט',
    'י',
    'כ',
    'ל',
    'מ',
    'נ',
    'ס',
    'ע',
    'פ',
    'צ',
    'ק',
    'ר',
    'ש',
    'ת',
]
|
||||
|
||||
|
||||
class PersonalizedBase(Dataset):
    """Images from a single folder paired with randomly templated captions.

    Every regular file directly inside ``data_root`` is treated as one image.
    Indexing returns a dict with two entries:

    * ``'caption'``: a template picked at random from
      ``imagenet_templates_small`` (or, for per-image tokens, from
      ``imagenet_dual_templates_small``) and formatted with the placeholder.
    * ``'image'``: the RGB image as a float32 array scaled to ``[-1, 1]``.

    Args:
        data_root: Directory whose files are the images.
        size: If not ``None``, images are resized to ``(size, size)``.
        repeats: When ``set == 'train'`` the reported length is the number
            of images multiplied by this value.
        interpolation: Resampling filter name: ``'linear'``, ``'bilinear'``,
            ``'bicubic'`` or ``'lanczos'``. Any other value raises ``KeyError``.
        flip_p: Probability passed to ``transforms.RandomHorizontalFlip``.
        set: ``'train'`` enables ``repeats``; any other value leaves the
            length equal to the number of images.
        placeholder_token: String inserted into the caption template.
        per_image_tokens: If true, a caption may also include the token from
            ``per_img_token_list`` that belongs to the image.
        center_crop: If true, crop to a centred square before resizing.
        mixing_prob: Probability of using a dual template when
            ``per_image_tokens`` is enabled.
        coarse_class_text: Optional text placed in front of the placeholder
            token, separated by a space.
    """

    def __init__(
        self,
        data_root,
        size=None,
        repeats=100,
        interpolation='bicubic',
        flip_p=0.5,
        set='train',
        placeholder_token='*',
        per_image_tokens=False,
        center_crop=False,
        mixing_prob=0.25,
        coarse_class_text=None,
    ):
        self.data_root = data_root

        # os.listdir() yields entries in arbitrary order. Sorting keeps the
        # image <-> per-image-token pairing reproducible between runs, and
        # sub-directories are skipped because Image.open() cannot read them.
        candidate_paths = (
            os.path.join(self.data_root, file_name)
            for file_name in sorted(os.listdir(self.data_root))
        )
        self.image_paths = [
            path for path in candidate_paths if os.path.isfile(path)
        ]

        self.num_images = len(self.image_paths)
        self._length = self.num_images

        self.placeholder_token = placeholder_token

        self.per_image_tokens = per_image_tokens
        self.center_crop = center_crop
        self.mixing_prob = mixing_prob

        self.coarse_class_text = coarse_class_text

        if per_image_tokens:
            # One token per image is enough, so a set with exactly as many
            # images as tokens is still valid.
            assert self.num_images <= len(per_img_token_list), (
                f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} images. To enable larger sets, add more tokens to 'per_img_token_list'."
            )

        if set == 'train':
            self._length = self.num_images * repeats

        self.size = size
        self.interpolation = {
            # PIL.Image.LINEAR was a deprecated alias of BILINEAR and is gone
            # in Pillow >= 10; referencing it here would break construction
            # for every interpolation choice.
            'linear': PIL.Image.BILINEAR,
            'bilinear': PIL.Image.BILINEAR,
            'bicubic': PIL.Image.BICUBIC,
            'lanczos': PIL.Image.LANCZOS,
        }[interpolation]
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)

    def __len__(self):
        """Return the number of images, times ``repeats`` for the train set."""
        return self._length

    def __getitem__(self, i):
        """Build the caption/image pair for index ``i`` (wraps around the image list)."""
        example = {}
        image_index = i % self.num_images

        # Read the pixels inside a context manager so the file handle is
        # released immediately instead of whenever the object is collected.
        with Image.open(self.image_paths[image_index]) as source:
            rgb = source if source.mode == 'RGB' else source.convert('RGB')
            img = np.array(rgb).astype(np.uint8)

        placeholder_string = self.placeholder_token
        if self.coarse_class_text:
            placeholder_string = (
                f'{self.coarse_class_text} {placeholder_string}'
            )

        if self.per_image_tokens and np.random.uniform() < self.mixing_prob:
            text = random.choice(imagenet_dual_templates_small).format(
                placeholder_string, per_img_token_list[image_index]
            )
        else:
            text = random.choice(imagenet_templates_small).format(
                placeholder_string
            )

        example['caption'] = text

        # default to score-sde preprocessing
        if self.center_crop:
            h, w = img.shape[0], img.shape[1]
            crop = min(h, w)
            img = img[
                (h - crop) // 2 : (h + crop) // 2,
                (w - crop) // 2 : (w + crop) // 2,
            ]

        image = Image.fromarray(img)
        if self.size is not None:
            image = image.resize(
                (self.size, self.size), resample=self.interpolation
            )

        image = self.flip(image)
        image = np.array(image).astype(np.uint8)
        # Map uint8 [0, 255] to float32 [-1, 1].
        example['image'] = (image / 127.5 - 1.0).astype(np.float32)
        return example
|
||||
169
ldm/data/personalized_style.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import os
|
||||
import numpy as np
|
||||
import PIL
|
||||
from PIL import Image
|
||||
from torch.utils.data import Dataset
|
||||
from torchvision import transforms
|
||||
|
||||
import random
|
||||
|
||||
# Single-placeholder caption templates for style training. Every entry has the
# form "<lead> in the style of {}". Repeated leads are intentional duplicates
# from the original list and are kept so random.choice() weights are unchanged.
imagenet_templates_small = [
    f'{lead} in the style of {{}}'
    for lead in (
        'a painting',
        'a rendering',
        'a cropped painting',
        'the painting',
        'a clean painting',
        'a dirty painting',
        'a dark painting',
        'a picture',
        'a cool painting',
        'a close-up painting',
        'a bright painting',
        'a cropped painting',
        'a good painting',
        'a close-up painting',
        'a rendition',
        'a nice painting',
        'a small painting',
        'a weird painting',
        'a large painting',
    )
]
|
||||
|
||||
# Two-placeholder caption templates: the first {} receives the placeholder
# token, the second the per-image token. One entry deliberately keeps its
# original, differently ordered wording.
imagenet_dual_templates_small = [
    *(
        f'{lead} in the style of {{}} with {{}}'
        for lead in (
            'a painting',
            'a rendering',
            'a cropped painting',
            'the painting',
            'a clean painting',
            'a dirty painting',
            'a dark painting',
            'a cool painting',
            'a close-up painting',
            'a bright painting',
            'a cropped painting',
            'a good painting',
        )
    ),
    # NOTE(review): here the placeholder lands in the "one {}" slot and the
    # per-image token in the style slot -- confirm this is intended.
    'a painting of one {} in the style of {}',
    *(
        f'{lead} in the style of {{}} with {{}}'
        for lead in (
            'a nice painting',
            'a small painting',
            'a weird painting',
            'a large painting',
        )
    ),
]
|
||||
|
||||
# Pseudo-words handed out one per training image when per-image tokens are
# enabled: 22 single Hebrew letters, kept in their original order.
per_img_token_list = list('אבגדהוזחטיכלמנסעפצקרשת')
|
||||
|
||||
|
||||
class PersonalizedBase(Dataset):
    """Dataset that pairs every file in a directory with a style caption.

    Each item is a dict with:
      * ``'caption'`` -- a randomly chosen template formatted with the
        placeholder token (and, with probability 0.25 when per-image tokens
        are enabled, additionally with a per-image token);
      * ``'image'``   -- the image as a float32 array scaled to [-1, 1].

    Args:
        data_root: directory whose entries are all treated as image files.
        size: if not None, images are resized to ``size x size``.
        repeats: when ``set == 'train'`` the reported length is
            ``num_images * repeats``.
        interpolation: one of 'linear', 'bilinear', 'bicubic', 'lanczos'.
        flip_p: probability of a random horizontal flip.
        set: 'train' enables the ``repeats`` multiplier (name kept for
            backward compatibility even though it shadows the builtin).
        placeholder_token: string substituted into the caption templates.
        per_image_tokens: enable the dual (placeholder + per-image) templates.
        center_crop: crop the largest centred square before resizing.
    """

    def __init__(
        self,
        data_root,
        size=None,
        repeats=100,
        interpolation='bicubic',
        flip_p=0.5,
        set='train',
        placeholder_token='*',
        per_image_tokens=False,
        center_crop=False,
    ):
        self.data_root = data_root

        self.image_paths = [
            os.path.join(self.data_root, file_path)
            for file_path in os.listdir(self.data_root)
        ]

        self.num_images = len(self.image_paths)
        self._length = self.num_images

        self.placeholder_token = placeholder_token

        self.per_image_tokens = per_image_tokens
        self.center_crop = center_crop

        if per_image_tokens:
            # Image k uses per_img_token_list[k], so exactly as many images as
            # tokens is still valid; only a larger set must be rejected.
            assert self.num_images <= len(
                per_img_token_list
            ), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} images. To enable larger sets, add more tokens to 'per_img_token_list'."

        if set == 'train':
            self._length = self.num_images * repeats

        self.size = size
        self.interpolation = {
            # PIL.Image.LINEAR was only an alias of BILINEAR and was removed
            # in Pillow 10; looking it up made construction fail for every
            # interpolation mode, so map 'linear' to BILINEAR directly.
            'linear': PIL.Image.BILINEAR,
            'bilinear': PIL.Image.BILINEAR,
            'bicubic': PIL.Image.BICUBIC,
            'lanczos': PIL.Image.LANCZOS,
        }[interpolation]
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)

    def __len__(self):
        """Number of items, including the ``repeats`` multiplier for training."""
        return self._length

    def __getitem__(self, i):
        """Return ``{'caption': str, 'image': float32 array in [-1, 1]}`` for item ``i``."""
        example = {}

        # Indices wrap around so that `repeats` re-visits the same files.
        # The context manager closes the file handle; convert() returns an
        # independent, fully loaded RGB image.
        with Image.open(self.image_paths[i % self.num_images]) as source:
            image = source.convert('RGB')

        if self.per_image_tokens and np.random.uniform() < 0.25:
            text = random.choice(imagenet_dual_templates_small).format(
                self.placeholder_token, per_img_token_list[i % self.num_images]
            )
        else:
            text = random.choice(imagenet_templates_small).format(
                self.placeholder_token
            )

        example['caption'] = text

        # default to score-sde preprocessing
        img = np.array(image).astype(np.uint8)

        if self.center_crop:
            # Keep the largest centred square.
            crop = min(img.shape[0], img.shape[1])
            h, w = img.shape[0], img.shape[1]
            img = img[
                (h - crop) // 2 : (h + crop) // 2,
                (w - crop) // 2 : (w + crop) // 2,
            ]

        image = Image.fromarray(img)
        if self.size is not None:
            image = image.resize(
                (self.size, self.size), resample=self.interpolation
            )

        image = self.flip(image)
        image = np.array(image).astype(np.uint8)
        # Map 0..255 to -1..1.
        example['image'] = (image / 127.5 - 1.0).astype(np.float32)
        return example
|
||||
17
ldm/dream/devices.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import torch
|
||||
|
||||
def choose_torch_device() -> str:
    '''Return the name of the preferred torch device: 'cuda', else 'mps', else 'cpu'.'''
    # torch.backends.mps is absent on older torch builds, hence the getattr.
    mps_backend = getattr(torch.backends, 'mps', None)
    probes = (
        ('cuda', torch.cuda.is_available),
        ('mps', lambda: mps_backend is not None and mps_backend.is_available()),
    )
    for device_name, is_usable in probes:
        if is_usable():
            return device_name
    return 'cpu'
|
||||
|
||||
def choose_autocast_device(device) -> str:
    '''Map a torch device to a device type that autocast accepts.

    Only 'cuda' and 'cpu' are passed through; any other ``device.type``
    (for example 'mps' on Apple silicon) falls back to 'cpu'.
    '''
    kind = device.type
    return kind if kind in ('cuda', 'cpu') else 'cpu'
|
||||
70
ldm/dream/image_util.py
Normal file
@@ -0,0 +1,70 @@
|
||||
from math import sqrt, floor, ceil
|
||||
from PIL import Image
|
||||
|
||||
class InitImageResizer():
    """Simple class to create resized copies of an Image while preserving the aspect ratio."""
    def __init__(self,Image):
        # NOTE: the parameter name shadows the PIL ``Image`` module inside
        # __init__ only; it is kept because callers may pass it by keyword.
        self.image = Image

    def resize(self,width=None,height=None) -> 'Image.Image':
        """
        Return a copy of the image resized to fit within
        a box width x height. The aspect ratio is
        maintained. If neither width nor height are provided,
        then returns a copy of the original image. If one or the other is
        provided, then the other will be calculated from the
        aspect ratio.

        Everything is floored to the nearest multiple of 64 so
        that it can be passed to img2img()
        """
        im = self.image

        ar = im.width/float(im.height)

        # Infer missing values from aspect ratio
        if not(width or height): # both missing
            width = im.width
            height = im.height
        elif not height:           # height missing
            height = int(width/ar)
        elif not width:            # width missing
            width = int(height*ar)

        # rw and rh are the resizing width and height for the image
        # they maintain the aspect ratio, but may not completely fill up
        # the requested destination size.
        #
        # The orientation-based choice is tried first; if it would overflow
        # the box (e.g. a nearly square landscape image in a wide, short box)
        # the other dimension is used as the constraint instead, so the
        # result always fits inside width x height.
        by_width = (width, int(width/ar))
        by_height = (int(height*ar), height)
        if im.width >= im.height:
            preferred, fallback = by_width, by_height
        else:
            preferred, fallback = by_height, by_width
        fits = preferred[0] <= width and preferred[1] <= height
        (rw,rh) = preferred if fits else fallback

        #round everything to multiples of 64
        width,height,rw,rh = map(
            lambda x: x-x%64, (width,height,rw,rh)
        )

        # no resize necessary, but return a copy
        if im.width == width and im.height == height:
            return im.copy()

        # otherwise resize the original image so that it fits inside the bounding box
        resized_image = self.image.resize((rw,rh),resample=Image.Resampling.LANCZOS)
        return resized_image
|
||||
|
||||
def make_grid(image_list, rows=None, cols=None):
    """Paste the images row by row into a single RGB grid image.

    Args:
        image_list: non-empty sequence of images; the cell size is taken
            from the first image's ``width`` and ``height``.
        rows, cols: grid dimensions. If both are omitted a roughly square
            layout is chosen; if only one is given the other is derived
            from the number of images.

    Returns:
        A new image of size ``(width * cols, height * rows)``. Images that do
        not fit into ``rows * cols`` cells are left out.

    Raises:
        ValueError: if ``image_list`` is empty.
    """
    image_cnt = len(image_list)
    if image_cnt == 0:
        raise ValueError('make_grid() needs at least one image')

    if rows is None and cols is None:
        rows = floor(sqrt(image_cnt))  # try to make it square
        cols = ceil(image_cnt / rows)
    elif rows is None:
        # honour the caller's column count instead of overwriting it
        rows = ceil(image_cnt / cols)
    elif cols is None:
        cols = ceil(image_cnt / rows)

    width = image_list[0].width
    height = image_list[0].height

    grid_img = Image.new('RGB', (width * cols, height * rows))
    for index, image in enumerate(image_list[: rows * cols]):
        r, c = divmod(index, cols)
        grid_img.paste(image, (c * width, r * height))

    return grid_img
|
||||
|
||||
79
ldm/dream/pngwriter.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""
|
||||
Two helper classes for dealing with PNG images and their path names.
|
||||
PngWriter -- Converts Images generated by T2I into PNGs, finds
|
||||
appropriate names for them, and writes prompt metadata
|
||||
into the PNG.
|
||||
PromptFormatter -- Utility for converting a Namespace of prompt parameters
|
||||
back into a formatted prompt string with command-line switches.
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
from PIL import PngImagePlugin
|
||||
|
||||
# -------------------image generation utils-----
|
||||
|
||||
|
||||
class PngWriter:
    """Writes images as PNGs into ``outdir`` and hands out numeric name prefixes."""

    def __init__(self, outdir):
        """Remember ``outdir`` and create it if it does not exist yet."""
        self.outdir = outdir
        os.makedirs(outdir, exist_ok=True)

    def unique_prefix(self):
        """Return the next unused numeric prefix in ``outdir``.

        Files named ``<digits>.<anything>.png`` are considered; the result is
        the largest such number plus one, zero-padded to at least six digits
        ('000001' for a directory without matching files).
        """
        # Raw string: '\d' and '\.' are not valid escapes in a normal literal.
        pattern = re.compile(r'^(\d+)\..*\.png')
        # Compare the prefixes numerically. A reverse lexicographic sort ranks
        # '999999' above '1000000' and would hand out a used prefix again.
        counts = [
            int(found.group(1))
            for found in map(pattern.match, os.listdir(self.outdir))
            if found
        ]
        basecount = max(counts, default=0) + 1
        return f'{basecount:06}'

    def save_image_and_prompt_to_png(self, image, prompt, name):
        """Save ``image`` as ``outdir/name`` with ``prompt`` stored in the
        PNG text chunk 'Dream'; return the full output path."""
        path = os.path.join(self.outdir, name)
        info = PngImagePlugin.PngInfo()
        info.add_text('Dream', prompt)
        image.save(path, 'PNG', pnginfo=info)
        return path
|
||||
|
||||
|
||||
class PromptFormatter:
    """Rebuilds a command-line style prompt string from parsed options."""

    def __init__(self, t2i, opt):
        self.t2i = t2i
        self.opt = opt

    # note: the t2i object should provide all these values.
    # there should be no need to "or" against opt values
    def normalize_prompt(self):
        """Normalize the prompt and switches"""
        generator, request = self.t2i, self.opt

        # Switches that are always present; falsy request values fall back
        # to the generator's settings.
        parts = [
            f'"{request.prompt}"',
            f'-s{request.steps or generator.steps}',
            f'-W{request.width or generator.width}',
            f'-H{request.height or generator.height}',
            f'-C{request.cfg_scale or generator.cfg_scale}',
            f'-A{request.sampler_name or generator.sampler_name}',
        ]

        # Optional switches, appended only when the option is set.
        if request.init_img:
            parts.append(f'-I{request.init_img}')
        if request.fit:
            parts.append('--fit')
        if request.strength and request.init_img is not None:
            parts.append(f'-f{request.strength or generator.strength}')
        if request.gfpgan_strength:
            parts.append(f'-G{request.gfpgan_strength}')
        if request.upscale:
            upscale_args = ' '.join([str(u) for u in request.upscale])
            parts.append(f'-U {upscale_args}')
        if request.variation_amount > 0:
            parts.append(f'-v{request.variation_amount}')
        if request.with_variations:
            pairs = ','.join(
                f'{seed}:{weight}' for seed, weight in request.with_variations
            )
            parts.append(f'-V{pairs}')
        if generator.full_precision:
            parts.append('-F')

        return ' '.join(parts)
|
||||
121
ldm/dream/readline.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""
|
||||
Readline helper functions for dream.py (linux and mac only).
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import atexit
|
||||
|
||||
# ---------------readline utilities---------------------
|
||||
# readline is optional: record whether it could be imported so the rest of
# the module can skip completion/history setup when it is missing.
try:
    import readline

    readline_available = True
except Exception:
    # Deliberately broad (a broken readline replacement may fail with
    # something other than ImportError), but unlike a bare ``except:`` this
    # no longer swallows KeyboardInterrupt or SystemExit.
    readline_available = False
|
||||
|
||||
|
||||
class Completer:
    """readline completer for command switches and file-system paths."""

    def __init__(self, options):
        """``options`` is the iterable of switch strings offered for completion."""
        self.options = sorted(options)
        return

    def complete(self, text, state):
        """readline completer entry point.

        Returns the ``state``-th completion for ``text`` or None when there
        are no more. Path completion is used after ``-I`` / ``--init_img``
        (image files only), after a trailing ``cd`` in the line buffer, and
        for text starting with '.' or '/'; otherwise ``text`` is matched
        against the known switches.
        """
        buffer = readline.get_line_buffer()

        if text.startswith(('-I', '--init_img')):
            return self._path_completions(text, state, ('.png','.jpg','.jpeg'))

        if buffer.strip().endswith('cd') or text.startswith(('.', '/')):
            return self._path_completions(text, state, ())

        response = None
        if state == 0:
            # This is the first time for this text, so build a match list.
            if text:
                self.matches = [
                    s for s in self.options if s and s.startswith(text)
                ]
            else:
                self.matches = self.options[:]

        # Return the state'th item from the match list,
        # if we have that many.
        try:
            response = self.matches[state]
        except IndexError:
            response = None
        return response

    def _path_completions(self, text, state, extensions):
        """Return the ``state``-th path completion for ``text`` (or None).

        Directories always match (with a trailing '/'); files match only if
        their name ends with one of ``extensions``. A leading ``-I`` or
        ``--init_img=`` switch is preserved in the returned completion.
        """
        # get the path so far
        if text.startswith('-I'):
            raw_path = text.replace('-I', '', 1).lstrip()
        elif text.startswith('--init_img='):
            raw_path = text.replace('--init_img=', '', 1).lstrip()
        else:
            raw_path = text

        matches = list()

        path = os.path.expanduser(raw_path)
        if len(path) == 0:
            matches.append(text + './')
        else:
            # Whatever preceded the path (the switch) is re-attached to each
            # completion, also when the path has no directory component.
            switch = text[: len(text) - len(raw_path)]
            shown_dir = os.path.dirname(raw_path)
            search_dir = os.path.dirname(path)
            try:
                # '' means "current directory"; os.listdir('') would raise.
                dir_list = sorted(os.listdir(search_dir or '.'))
            except OSError:
                # missing or unreadable directory: simply no completions
                dir_list = []
            for n in dir_list:
                if n.startswith('.') and len(n) > 1:
                    continue
                full_path = os.path.join(search_dir, n)
                if full_path.startswith(path):
                    if os.path.isdir(full_path):
                        matches.append(
                            switch + os.path.join(shown_dir, n) + '/'
                        )
                    elif n.endswith(extensions):
                        matches.append(switch + os.path.join(shown_dir, n))

        try:
            response = matches[state]
        except IndexError:
            response = None
        return response
|
||||
|
||||
|
||||
if readline_available:
    # Install tab completion for the known command switches.
    readline.set_completer(
        Completer(
            [
                '--steps','-s',
                '--seed','-S',
                '--iterations','-n',
                '--width','-W','--height','-H',
                '--cfg_scale','-C',
                '--grid','-g',
                '--individual','-i',
                '--init_img','-I',
                '--strength','-f',
                '--variants','-v',
                '--outdir','-o',
                '--sampler','-A','-m',
                '--embedding_path',
                '--device',
                '--gfpgan_strength','-G',
                '--upscale','-U',
                '-save_orig','--save_original',
                '--skip_normalize','-x',
                # NOTE(review): was the bare string 't'; every other short
                # switch carries a dash, so this is assumed to be a typo for
                # '-t' -- confirm against the argument parser.
                '--log_tokenization','-t',
            ]
        ).complete
    )
    readline.set_completer_delims(' ')
    readline.parse_and_bind('tab: complete')

    # Persist command history between sessions.
    histfile = os.path.join(os.path.expanduser('~'), '.dream_history')
    try:
        readline.read_history_file(histfile)
    except FileNotFoundError:
        # first run: there is no history file yet
        pass
    # Set outside the try block so the limit also applies on the first run,
    # when reading the (missing) history file fails.
    readline.set_history_length(1000)
    atexit.register(readline.write_history_file, histfile)
|
||||
202
ldm/dream/server.py
Normal file
@@ -0,0 +1,202 @@
|
||||
import json
|
||||
import base64
|
||||
import mimetypes
|
||||
import os
|
||||
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
||||
from ldm.dream.pngwriter import PngWriter
|
||||
from threading import Event
|
||||
|
||||
class CanceledException(Exception):
    """Raised from the step callback to abort a generation that was canceled."""
|
||||
|
||||
class DreamServer(BaseHTTPRequestHandler):
    """HTTP handler for the dream web UI.

    GET serves the UI page, a small generated ``config.js``, the ``/cancel``
    endpoint and static files below the current working directory.
    POST runs an image generation and streams one JSON object per line
    ('step', 'result', 'upscaling-*', 'canceled' events) to the client.
    """

    # Both are class attributes and therefore shared by all handler instances.
    model = None
    canceled = Event()

    def _send_not_found(self):
        """Send a complete, body-less 404 response."""
        self.send_response(404)
        # Without end_headers() the buffered status line is never flushed
        # and the client receives no response at all.
        self.end_headers()

    def do_GET(self):
        """Serve the UI, config.js, /cancel and static files."""
        if self.path == "/":
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            with open("./static/dream_web/index.html", "rb") as content:
                self.wfile.write(content.read())
        elif self.path == "/config.js":
            # unfortunately this import can't be at the top level, since that would cause a circular import
            from ldm.gfpgan.gfpgan_tools import gfpgan_model_exists
            self.send_response(200)
            self.send_header("Content-type", "application/javascript")
            self.end_headers()
            config = {
                'gfpgan_model_exists': gfpgan_model_exists
            }
            self.wfile.write(bytes("let config = " + json.dumps(config) + ";\n", "utf-8"))
        elif self.path == "/cancel":
            self.canceled.set()
            self.send_response(200)
            self.send_header("Content-type", "application/json")
            self.end_headers()
            self.wfile.write(bytes('{}', 'utf8'))
        else:
            path = "." + self.path
            cwd = os.path.realpath(os.getcwd())
            try:
                # Compare whole path components. A plain string prefix
                # (os.path.commonprefix) would also accept sibling
                # directories such as "<cwd>-secret".
                is_in_cwd = os.path.commonpath((os.path.realpath(path), cwd)) == cwd
            except ValueError:
                # e.g. paths on different drives
                is_in_cwd = False
            if not (is_in_cwd and os.path.isfile(path)):
                self._send_not_found()
                return
            mime_type = mimetypes.guess_type(path)[0]
            if mime_type is not None:
                self.send_response(200)
                self.send_header("Content-type", mime_type)
                self.end_headers()
                with open(path, "rb") as content:
                    self.wfile.write(content.read())
            else:
                self._send_not_found()

    def do_POST(self):
        """Run one generation request described by the JSON request body.

        Reads the keys 'prompt', 'initimg', 'strength', 'iterations',
        'steps', 'width', 'height', 'cfgscale', 'sampler',
        'gfpgan_strength', 'upscale_level', 'upscale_strength' and 'seed';
        the mere presence of 'fit' / 'progress_images' enables those options.
        """
        self.send_response(200)
        self.send_header("Content-type", "application/json")
        self.end_headers()

        # unfortunately this import can't be at the top level, since that would cause a circular import
        from ldm.gfpgan.gfpgan_tools import gfpgan_model_exists

        content_length = int(self.headers['Content-Length'])
        post_data = json.loads(self.rfile.read(content_length))
        prompt = post_data['prompt']
        initimg = post_data['initimg']
        strength = float(post_data['strength'])
        iterations = int(post_data['iterations'])
        steps = int(post_data['steps'])
        width = int(post_data['width'])
        height = int(post_data['height'])
        fit = 'fit' in post_data
        cfgscale = float(post_data['cfgscale'])
        sampler_name = post_data['sampler']
        gfpgan_strength = float(post_data['gfpgan_strength']) if gfpgan_model_exists else 0
        upscale_level = post_data['upscale_level']
        upscale_strength = post_data['upscale_strength']
        upscale = [int(upscale_level),float(upscale_strength)] if upscale_level != '' else None
        progress_images = 'progress_images' in post_data
        # a seed of -1 means "use the model's current seed"
        seed = self.model.seed if int(post_data['seed']) == -1 else int(post_data['seed'])

        self.canceled.clear()
        print(f">> Request to generate with prompt: {prompt}")
        # In order to handle upscaled images, the PngWriter needs to maintain state
        # across images generated by each call to prompt2img(), so we define it in
        # the outer scope of image_done()
        config = post_data.copy() # Shallow copy
        # the (potentially large) base64 image is not echoed back or logged
        config['initimg'] = ''

        images_generated = 0    # helps keep track of when upscaling is started
        images_upscaled = 0     # helps keep track of when upscaling is completed
        pngwriter = PngWriter("./outputs/img-samples/")

        prefix = pngwriter.unique_prefix()
        # if upscaling is requested, then this will be called twice, once when
        # the images are first generated, and then again when after upscaling
        # is complete. The upscaling replaces the original file, so the second
        # entry should not be inserted into the image list.
        def image_done(image, seed, upscaled=False):
            name = f'{prefix}.{seed}.png'
            path = pngwriter.save_image_and_prompt_to_png(image, f'{prompt} -S{seed}', name)

            # Append post_data to log, but only once!
            if not upscaled:
                with open("./outputs/img-samples/dream_web_log.txt", "a") as log:
                    log.write(f"{path}: {json.dumps(config)}\n")

                self.wfile.write(bytes(json.dumps(
                    {'event': 'result', 'url': path, 'seed': seed, 'config': config}
                ) + '\n',"utf-8"))

            # control state of the "postprocessing..." message
            upscaling_requested = upscale or gfpgan_strength>0
            nonlocal images_generated
            nonlocal images_upscaled
            if upscaled:
                images_upscaled += 1
            else:
                images_generated +=1
            if upscaling_requested:
                action = None
                if images_generated >= iterations:
                    if images_upscaled < iterations:
                        action = 'upscaling-started'
                    else:
                        action = 'upscaling-done'
                if action:
                    x = images_upscaled+1
                    self.wfile.write(bytes(json.dumps(
                        {'event':action,'processed_file_cnt':f'{x}/{iterations}'}
                    ) + '\n',"utf-8"))

        step_writer = PngWriter('./outputs/intermediates/')
        step_index = 1
        def image_progress(sample, step):
            if self.canceled.is_set():
                self.wfile.write(bytes(json.dumps({'event':'canceled'}) + '\n', 'utf-8'))
                raise CanceledException
            path = None
            # since rendering images is moderately expensive, only render every 5th image
            # and don't bother with the last one, since it'll render anyway
            nonlocal step_index
            if progress_images and step % 5 == 0 and step < steps - 1:
                image = self.model._sample_to_image(sample)
                name = f'{prefix}.{seed}.{step_index}.png'
                metadata = f'{prompt} -S{seed} [intermediate]'
                path = step_writer.save_image_and_prompt_to_png(image, metadata, name)
                step_index += 1
            self.wfile.write(bytes(json.dumps(
                {'event': 'step', 'step': step + 1, 'url': path}
            ) + '\n',"utf-8"))

        try:
            if initimg is None:
                # Run txt2img
                self.model.prompt2image(prompt,
                                        iterations=iterations,
                                        cfg_scale = cfgscale,
                                        width = width,
                                        height = height,
                                        seed = seed,
                                        steps = steps,
                                        gfpgan_strength = gfpgan_strength,
                                        upscale         = upscale,
                                        sampler_name    = sampler_name,
                                        step_callback=image_progress,
                                        image_callback=image_done)
            else:
                # Decode initimg as base64 to temp file
                with open("./img2img-tmp.png", "wb") as f:
                    initimg = initimg.split(",")[1] # Ignore mime type
                    f.write(base64.b64decode(initimg))

                try:
                    # Run img2img
                    self.model.prompt2image(prompt,
                                            init_img = "./img2img-tmp.png",
                                            strength = strength,
                                            iterations = iterations,
                                            cfg_scale = cfgscale,
                                            seed = seed,
                                            steps = steps,
                                            sampler_name    = sampler_name,
                                            width = width,
                                            height = height,
                                            fit = fit,
                                            gfpgan_strength=gfpgan_strength,
                                            upscale         = upscale,
                                            step_callback=image_progress,
                                            image_callback=image_done)
                finally:
                    # Remove the temp file
                    os.remove("./img2img-tmp.png")
        except CanceledException:
            print(f"Canceled.")
            return
|
||||
|
||||
|
||||
class ThreadingDreamServer(ThreadingHTTPServer):
    """Threading HTTP server that always uses DreamServer as request handler."""

    def __init__(self, server_address):
        super().__init__(server_address, DreamServer)
|
||||
167
ldm/gfpgan/gfpgan_tools.py
Normal file
@@ -0,0 +1,167 @@
|
||||
import torch
|
||||
import warnings
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
|
||||
from PIL import Image
|
||||
from scripts.dream import create_argv_parser
|
||||
|
||||
# The command line is parsed once, when this module is imported; the
# functions below read their GFPGAN / Real-ESRGAN settings from ``opt``.
arg_parser = create_argv_parser()
opt = arg_parser.parse_args()

# Location of the GFPGAN weights, and whether that file is present.
model_path = os.path.join(opt.gfpgan_dir, opt.gfpgan_model_path)
gfpgan_model_exists = os.path.isfile(model_path)
|
||||
|
||||
def _run_gfpgan(image, strength, prompt, seed, upsampler_scale=4):
|
||||
print(f'>> GFPGAN - Restoring Faces: {prompt} : seed:{seed}')
|
||||
gfpgan = None
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings('ignore', category=DeprecationWarning)
|
||||
warnings.filterwarnings('ignore', category=UserWarning)
|
||||
|
||||
try:
|
||||
if not gfpgan_model_exists:
|
||||
raise Exception('GFPGAN model not found at path ' + model_path)
|
||||
|
||||
sys.path.append(os.path.abspath(opt.gfpgan_dir))
|
||||
from gfpgan import GFPGANer
|
||||
|
||||
bg_upsampler = _load_gfpgan_bg_upsampler(
|
||||
opt.gfpgan_bg_upsampler, upsampler_scale, opt.gfpgan_bg_tile
|
||||
)
|
||||
|
||||
gfpgan = GFPGANer(
|
||||
model_path=model_path,
|
||||
upscale=upsampler_scale,
|
||||
arch='clean',
|
||||
channel_multiplier=2,
|
||||
bg_upsampler=bg_upsampler,
|
||||
)
|
||||
except Exception:
|
||||
import traceback
|
||||
|
||||
print('>> Error loading GFPGAN:', file=sys.stderr)
|
||||
print(traceback.format_exc(), file=sys.stderr)
|
||||
|
||||
if gfpgan is None:
|
||||
print(
|
||||
f'>> GFPGAN not initialized, it must be loaded via the --gfpgan argument'
|
||||
)
|
||||
return image
|
||||
|
||||
image = image.convert('RGB')
|
||||
|
||||
cropped_faces, restored_faces, restored_img = gfpgan.enhance(
|
||||
np.array(image, dtype=np.uint8),
|
||||
has_aligned=False,
|
||||
only_center_face=False,
|
||||
paste_back=True,
|
||||
)
|
||||
res = Image.fromarray(restored_img)
|
||||
|
||||
if strength < 1.0:
|
||||
# Resize the image to the new image if the sizes have changed
|
||||
if restored_img.size != image.size:
|
||||
image = image.resize(res.size)
|
||||
res = Image.blend(image, res, strength)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
gfpgan = None
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def _load_gfpgan_bg_upsampler(bg_upsampler, upsampler_scale, bg_tile=400):
|
||||
if bg_upsampler == 'realesrgan':
|
||||
if not torch.cuda.is_available(): # CPU
|
||||
warnings.warn(
|
||||
'The unoptimized RealESRGAN is slow on CPU. We do not use it. '
|
||||
'If you really want to use it, please modify the corresponding codes.'
|
||||
)
|
||||
bg_upsampler = None
|
||||
else:
|
||||
model_path = {
|
||||
2: 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth',
|
||||
4: 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth',
|
||||
}
|
||||
|
||||
if upsampler_scale not in model_path:
|
||||
return None
|
||||
|
||||
from basicsr.archs.rrdbnet_arch import RRDBNet
|
||||
from realesrgan import RealESRGANer
|
||||
|
||||
if upsampler_scale == 4:
|
||||
model = RRDBNet(
|
||||
num_in_ch=3,
|
||||
num_out_ch=3,
|
||||
num_feat=64,
|
||||
num_block=23,
|
||||
num_grow_ch=32,
|
||||
scale=4,
|
||||
)
|
||||
if upsampler_scale == 2:
|
||||
model = RRDBNet(
|
||||
num_in_ch=3,
|
||||
num_out_ch=3,
|
||||
num_feat=64,
|
||||
num_block=23,
|
||||
num_grow_ch=32,
|
||||
scale=2,
|
||||
)
|
||||
|
||||
bg_upsampler = RealESRGANer(
|
||||
scale=upsampler_scale,
|
||||
model_path=model_path[upsampler_scale],
|
||||
model=model,
|
||||
tile=bg_tile,
|
||||
tile_pad=10,
|
||||
pre_pad=0,
|
||||
half=True,
|
||||
) # need to set False in CPU mode
|
||||
else:
|
||||
bg_upsampler = None
|
||||
|
||||
return bg_upsampler
|
||||
|
||||
|
||||
def real_esrgan_upscale(image, strength, upsampler_scale, prompt, seed):
    """Upscale ``image`` with Real-ESRGAN.

    Args:
        image: PIL image to upscale.
        strength: blend factor; below 1.0 the upscaled image is blended with
            the (resized) input via ``Image.blend(image, upscaled, strength)``.
        upsampler_scale: upscale factor handed to the upsampler.
        prompt, seed: used only for the log message.

    Returns:
        The upscaled PIL image, or ``image`` unchanged when no upsampler is
        available (loading failed, CUDA missing, or unsupported scale).
    """
    print(
        f'>> Real-ESRGAN Upscaling: {prompt} : seed:{seed} : scale:{upsampler_scale}x'
    )

    # Stays None if loading fails, so the check below can fall back cleanly
    # instead of hitting an unbound name.
    upsampler = None
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        warnings.filterwarnings('ignore', category=UserWarning)

        try:
            upsampler = _load_gfpgan_bg_upsampler(
                opt.gfpgan_bg_upsampler, upsampler_scale, opt.gfpgan_bg_tile
            )
        except Exception:
            import traceback

            print('>> Error loading Real-ESRGAN:', file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)

    if upsampler is None:
        # The loader returns None on CPU-only systems, for unsupported
        # scales, or when another upsampler is configured.
        print('>> Real-ESRGAN upsampler not available, returning the image unchanged')
        return image

    output, img_mode = upsampler.enhance(
        np.array(image, dtype=np.uint8),
        outscale=upsampler_scale,
        alpha_upsampler=opt.gfpgan_bg_upsampler,
    )

    res = Image.fromarray(output)

    if strength < 1.0:
        # Resize the image to the new image if the sizes have changed.
        # Compare the two PIL sizes: the array's ``.size`` is an element
        # count and never equals a (width, height) tuple.
        if res.size != image.size:
            image = image.resize(res.size)
        res = Image.blend(image, res, strength)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    upsampler = None

    return res
|
||||
@@ -5,32 +5,49 @@ class LambdaWarmUpCosineScheduler:
|
||||
"""
|
||||
note: use with a base_lr of 1.0
|
||||
"""
|
||||
def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
warm_up_steps,
|
||||
lr_min,
|
||||
lr_max,
|
||||
lr_start,
|
||||
max_decay_steps,
|
||||
verbosity_interval=0,
|
||||
):
|
||||
self.lr_warm_up_steps = warm_up_steps
|
||||
self.lr_start = lr_start
|
||||
self.lr_min = lr_min
|
||||
self.lr_max = lr_max
|
||||
self.lr_max_decay_steps = max_decay_steps
|
||||
self.last_lr = 0.
|
||||
self.last_lr = 0.0
|
||||
self.verbosity_interval = verbosity_interval
|
||||
|
||||
def schedule(self, n, **kwargs):
|
||||
if self.verbosity_interval > 0:
|
||||
if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
|
||||
if n % self.verbosity_interval == 0:
|
||||
print(
|
||||
f'current step: {n}, recent lr-multiplier: {self.last_lr}'
|
||||
)
|
||||
if n < self.lr_warm_up_steps:
|
||||
lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start
|
||||
lr = (
|
||||
self.lr_max - self.lr_start
|
||||
) / self.lr_warm_up_steps * n + self.lr_start
|
||||
self.last_lr = lr
|
||||
return lr
|
||||
else:
|
||||
t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps)
|
||||
t = (n - self.lr_warm_up_steps) / (
|
||||
self.lr_max_decay_steps - self.lr_warm_up_steps
|
||||
)
|
||||
t = min(t, 1.0)
|
||||
lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
|
||||
1 + np.cos(t * np.pi))
|
||||
1 + np.cos(t * np.pi)
|
||||
)
|
||||
self.last_lr = lr
|
||||
return lr
|
||||
|
||||
def __call__(self, n, **kwargs):
|
||||
return self.schedule(n,**kwargs)
|
||||
return self.schedule(n, **kwargs)
|
||||
|
||||
|
||||
class LambdaWarmUpCosineScheduler2:
|
||||
@@ -38,15 +55,30 @@ class LambdaWarmUpCosineScheduler2:
|
||||
supports repeated iterations, configurable via lists
|
||||
note: use with a base_lr of 1.0.
|
||||
"""
|
||||
def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
|
||||
assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
warm_up_steps,
|
||||
f_min,
|
||||
f_max,
|
||||
f_start,
|
||||
cycle_lengths,
|
||||
verbosity_interval=0,
|
||||
):
|
||||
assert (
|
||||
len(warm_up_steps)
|
||||
== len(f_min)
|
||||
== len(f_max)
|
||||
== len(f_start)
|
||||
== len(cycle_lengths)
|
||||
)
|
||||
self.lr_warm_up_steps = warm_up_steps
|
||||
self.f_start = f_start
|
||||
self.f_min = f_min
|
||||
self.f_max = f_max
|
||||
self.cycle_lengths = cycle_lengths
|
||||
self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
|
||||
self.last_f = 0.
|
||||
self.last_f = 0.0
|
||||
self.verbosity_interval = verbosity_interval
|
||||
|
||||
def find_in_interval(self, n):
|
||||
@@ -60,17 +92,25 @@ class LambdaWarmUpCosineScheduler2:
|
||||
cycle = self.find_in_interval(n)
|
||||
n = n - self.cum_cycles[cycle]
|
||||
if self.verbosity_interval > 0:
|
||||
if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
|
||||
f"current cycle {cycle}")
|
||||
if n % self.verbosity_interval == 0:
|
||||
print(
|
||||
f'current step: {n}, recent lr-multiplier: {self.last_f}, '
|
||||
f'current cycle {cycle}'
|
||||
)
|
||||
if n < self.lr_warm_up_steps[cycle]:
|
||||
f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
|
||||
f = (
|
||||
self.f_max[cycle] - self.f_start[cycle]
|
||||
) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
|
||||
self.last_f = f
|
||||
return f
|
||||
else:
|
||||
t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
|
||||
t = (n - self.lr_warm_up_steps[cycle]) / (
|
||||
self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]
|
||||
)
|
||||
t = min(t, 1.0)
|
||||
f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
|
||||
1 + np.cos(t * np.pi))
|
||||
f = self.f_min[cycle] + 0.5 * (
|
||||
self.f_max[cycle] - self.f_min[cycle]
|
||||
) * (1 + np.cos(t * np.pi))
|
||||
self.last_f = f
|
||||
return f
|
||||
|
||||
@@ -79,20 +119,25 @@ class LambdaWarmUpCosineScheduler2:
|
||||
|
||||
|
||||
class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
|
||||
|
||||
def schedule(self, n, **kwargs):
|
||||
cycle = self.find_in_interval(n)
|
||||
n = n - self.cum_cycles[cycle]
|
||||
if self.verbosity_interval > 0:
|
||||
if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
|
||||
f"current cycle {cycle}")
|
||||
if n % self.verbosity_interval == 0:
|
||||
print(
|
||||
f'current step: {n}, recent lr-multiplier: {self.last_f}, '
|
||||
f'current cycle {cycle}'
|
||||
)
|
||||
|
||||
if n < self.lr_warm_up_steps[cycle]:
|
||||
f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
|
||||
f = (
|
||||
self.f_max[cycle] - self.f_start[cycle]
|
||||
) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
|
||||
self.last_f = f
|
||||
return f
|
||||
else:
|
||||
f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle])
|
||||
f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (
|
||||
self.cycle_lengths[cycle] - n
|
||||
) / (self.cycle_lengths[cycle])
|
||||
self.last_f = f
|
||||
return f
|
||||
|
||||
|
||||
@@ -6,29 +6,32 @@ from contextlib import contextmanager
|
||||
from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
|
||||
|
||||
from ldm.modules.diffusionmodules.model import Encoder, Decoder
|
||||
from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
|
||||
from ldm.modules.distributions.distributions import (
|
||||
DiagonalGaussianDistribution,
|
||||
)
|
||||
|
||||
from ldm.util import instantiate_from_config
|
||||
|
||||
|
||||
class VQModel(pl.LightningModule):
|
||||
def __init__(self,
|
||||
ddconfig,
|
||||
lossconfig,
|
||||
n_embed,
|
||||
embed_dim,
|
||||
ckpt_path=None,
|
||||
ignore_keys=[],
|
||||
image_key="image",
|
||||
colorize_nlabels=None,
|
||||
monitor=None,
|
||||
batch_resize_range=None,
|
||||
scheduler_config=None,
|
||||
lr_g_factor=1.0,
|
||||
remap=None,
|
||||
sane_index_shape=False, # tell vector quantizer to return indices as bhw
|
||||
use_ema=False
|
||||
):
|
||||
def __init__(
|
||||
self,
|
||||
ddconfig,
|
||||
lossconfig,
|
||||
n_embed,
|
||||
embed_dim,
|
||||
ckpt_path=None,
|
||||
ignore_keys=[],
|
||||
image_key='image',
|
||||
colorize_nlabels=None,
|
||||
monitor=None,
|
||||
batch_resize_range=None,
|
||||
scheduler_config=None,
|
||||
lr_g_factor=1.0,
|
||||
remap=None,
|
||||
sane_index_shape=False, # tell vector quantizer to return indices as bhw
|
||||
use_ema=False,
|
||||
):
|
||||
super().__init__()
|
||||
self.embed_dim = embed_dim
|
||||
self.n_embed = n_embed
|
||||
@@ -36,24 +39,34 @@ class VQModel(pl.LightningModule):
|
||||
self.encoder = Encoder(**ddconfig)
|
||||
self.decoder = Decoder(**ddconfig)
|
||||
self.loss = instantiate_from_config(lossconfig)
|
||||
self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
|
||||
remap=remap,
|
||||
sane_index_shape=sane_index_shape)
|
||||
self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
|
||||
self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
|
||||
self.quantize = VectorQuantizer(
|
||||
n_embed,
|
||||
embed_dim,
|
||||
beta=0.25,
|
||||
remap=remap,
|
||||
sane_index_shape=sane_index_shape,
|
||||
)
|
||||
self.quant_conv = torch.nn.Conv2d(ddconfig['z_channels'], embed_dim, 1)
|
||||
self.post_quant_conv = torch.nn.Conv2d(
|
||||
embed_dim, ddconfig['z_channels'], 1
|
||||
)
|
||||
if colorize_nlabels is not None:
|
||||
assert type(colorize_nlabels)==int
|
||||
self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
|
||||
assert type(colorize_nlabels) == int
|
||||
self.register_buffer(
|
||||
'colorize', torch.randn(3, colorize_nlabels, 1, 1)
|
||||
)
|
||||
if monitor is not None:
|
||||
self.monitor = monitor
|
||||
self.batch_resize_range = batch_resize_range
|
||||
if self.batch_resize_range is not None:
|
||||
print(f"{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.")
|
||||
print(
|
||||
f'{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.'
|
||||
)
|
||||
|
||||
self.use_ema = use_ema
|
||||
if self.use_ema:
|
||||
self.model_ema = LitEma(self)
|
||||
print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
|
||||
print(f'Keeping EMAs of {len(list(self.model_ema.buffers()))}.')
|
||||
|
||||
if ckpt_path is not None:
|
||||
self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
|
||||
@@ -66,28 +79,30 @@ class VQModel(pl.LightningModule):
|
||||
self.model_ema.store(self.parameters())
|
||||
self.model_ema.copy_to(self)
|
||||
if context is not None:
|
||||
print(f"{context}: Switched to EMA weights")
|
||||
print(f'{context}: Switched to EMA weights')
|
||||
try:
|
||||
yield None
|
||||
finally:
|
||||
if self.use_ema:
|
||||
self.model_ema.restore(self.parameters())
|
||||
if context is not None:
|
||||
print(f"{context}: Restored training weights")
|
||||
print(f'{context}: Restored training weights')
|
||||
|
||||
def init_from_ckpt(self, path, ignore_keys=list()):
|
||||
sd = torch.load(path, map_location="cpu")["state_dict"]
|
||||
sd = torch.load(path, map_location='cpu')['state_dict']
|
||||
keys = list(sd.keys())
|
||||
for k in keys:
|
||||
for ik in ignore_keys:
|
||||
if k.startswith(ik):
|
||||
print("Deleting key {} from state_dict.".format(k))
|
||||
print('Deleting key {} from state_dict.'.format(k))
|
||||
del sd[k]
|
||||
missing, unexpected = self.load_state_dict(sd, strict=False)
|
||||
print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
|
||||
print(
|
||||
f'Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys'
|
||||
)
|
||||
if len(missing) > 0:
|
||||
print(f"Missing Keys: {missing}")
|
||||
print(f"Unexpected Keys: {unexpected}")
|
||||
print(f'Missing Keys: {missing}')
|
||||
print(f'Unexpected Keys: {unexpected}')
|
||||
|
||||
def on_train_batch_end(self, *args, **kwargs):
|
||||
if self.use_ema:
|
||||
@@ -115,7 +130,7 @@ class VQModel(pl.LightningModule):
|
||||
return dec
|
||||
|
||||
def forward(self, input, return_pred_indices=False):
|
||||
quant, diff, (_,_,ind) = self.encode(input)
|
||||
quant, diff, (_, _, ind) = self.encode(input)
|
||||
dec = self.decode(quant)
|
||||
if return_pred_indices:
|
||||
return dec, diff, ind
|
||||
@@ -125,7 +140,11 @@ class VQModel(pl.LightningModule):
|
||||
x = batch[k]
|
||||
if len(x.shape) == 3:
|
||||
x = x[..., None]
|
||||
x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
|
||||
x = (
|
||||
x.permute(0, 3, 1, 2)
|
||||
.to(memory_format=torch.contiguous_format)
|
||||
.float()
|
||||
)
|
||||
if self.batch_resize_range is not None:
|
||||
lower_size = self.batch_resize_range[0]
|
||||
upper_size = self.batch_resize_range[1]
|
||||
@@ -133,9 +152,11 @@ class VQModel(pl.LightningModule):
|
||||
# do the first few batches with max size to avoid later oom
|
||||
new_resize = upper_size
|
||||
else:
|
||||
new_resize = np.random.choice(np.arange(lower_size, upper_size+16, 16))
|
||||
new_resize = np.random.choice(
|
||||
np.arange(lower_size, upper_size + 16, 16)
|
||||
)
|
||||
if new_resize != x.shape[2]:
|
||||
x = F.interpolate(x, size=new_resize, mode="bicubic")
|
||||
x = F.interpolate(x, size=new_resize, mode='bicubic')
|
||||
x = x.detach()
|
||||
return x
|
||||
|
||||
@@ -147,81 +168,139 @@ class VQModel(pl.LightningModule):
|
||||
|
||||
if optimizer_idx == 0:
|
||||
# autoencode
|
||||
aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
|
||||
last_layer=self.get_last_layer(), split="train",
|
||||
predicted_indices=ind)
|
||||
aeloss, log_dict_ae = self.loss(
|
||||
qloss,
|
||||
x,
|
||||
xrec,
|
||||
optimizer_idx,
|
||||
self.global_step,
|
||||
last_layer=self.get_last_layer(),
|
||||
split='train',
|
||||
predicted_indices=ind,
|
||||
)
|
||||
|
||||
self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
|
||||
self.log_dict(
|
||||
log_dict_ae,
|
||||
prog_bar=False,
|
||||
logger=True,
|
||||
on_step=True,
|
||||
on_epoch=True,
|
||||
)
|
||||
return aeloss
|
||||
|
||||
if optimizer_idx == 1:
|
||||
# discriminator
|
||||
discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
|
||||
last_layer=self.get_last_layer(), split="train")
|
||||
self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
|
||||
discloss, log_dict_disc = self.loss(
|
||||
qloss,
|
||||
x,
|
||||
xrec,
|
||||
optimizer_idx,
|
||||
self.global_step,
|
||||
last_layer=self.get_last_layer(),
|
||||
split='train',
|
||||
)
|
||||
self.log_dict(
|
||||
log_dict_disc,
|
||||
prog_bar=False,
|
||||
logger=True,
|
||||
on_step=True,
|
||||
on_epoch=True,
|
||||
)
|
||||
return discloss
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
log_dict = self._validation_step(batch, batch_idx)
|
||||
with self.ema_scope():
|
||||
log_dict_ema = self._validation_step(batch, batch_idx, suffix="_ema")
|
||||
log_dict_ema = self._validation_step(
|
||||
batch, batch_idx, suffix='_ema'
|
||||
)
|
||||
return log_dict
|
||||
|
||||
def _validation_step(self, batch, batch_idx, suffix=""):
|
||||
def _validation_step(self, batch, batch_idx, suffix=''):
|
||||
x = self.get_input(batch, self.image_key)
|
||||
xrec, qloss, ind = self(x, return_pred_indices=True)
|
||||
aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0,
|
||||
self.global_step,
|
||||
last_layer=self.get_last_layer(),
|
||||
split="val"+suffix,
|
||||
predicted_indices=ind
|
||||
)
|
||||
aeloss, log_dict_ae = self.loss(
|
||||
qloss,
|
||||
x,
|
||||
xrec,
|
||||
0,
|
||||
self.global_step,
|
||||
last_layer=self.get_last_layer(),
|
||||
split='val' + suffix,
|
||||
predicted_indices=ind,
|
||||
)
|
||||
|
||||
discloss, log_dict_disc = self.loss(qloss, x, xrec, 1,
|
||||
self.global_step,
|
||||
last_layer=self.get_last_layer(),
|
||||
split="val"+suffix,
|
||||
predicted_indices=ind
|
||||
)
|
||||
rec_loss = log_dict_ae[f"val{suffix}/rec_loss"]
|
||||
self.log(f"val{suffix}/rec_loss", rec_loss,
|
||||
prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
|
||||
self.log(f"val{suffix}/aeloss", aeloss,
|
||||
prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
|
||||
discloss, log_dict_disc = self.loss(
|
||||
qloss,
|
||||
x,
|
||||
xrec,
|
||||
1,
|
||||
self.global_step,
|
||||
last_layer=self.get_last_layer(),
|
||||
split='val' + suffix,
|
||||
predicted_indices=ind,
|
||||
)
|
||||
rec_loss = log_dict_ae[f'val{suffix}/rec_loss']
|
||||
self.log(
|
||||
f'val{suffix}/rec_loss',
|
||||
rec_loss,
|
||||
prog_bar=True,
|
||||
logger=True,
|
||||
on_step=False,
|
||||
on_epoch=True,
|
||||
sync_dist=True,
|
||||
)
|
||||
self.log(
|
||||
f'val{suffix}/aeloss',
|
||||
aeloss,
|
||||
prog_bar=True,
|
||||
logger=True,
|
||||
on_step=False,
|
||||
on_epoch=True,
|
||||
sync_dist=True,
|
||||
)
|
||||
if version.parse(pl.__version__) >= version.parse('1.4.0'):
|
||||
del log_dict_ae[f"val{suffix}/rec_loss"]
|
||||
del log_dict_ae[f'val{suffix}/rec_loss']
|
||||
self.log_dict(log_dict_ae)
|
||||
self.log_dict(log_dict_disc)
|
||||
return self.log_dict
|
||||
|
||||
def configure_optimizers(self):
|
||||
lr_d = self.learning_rate
|
||||
lr_g = self.lr_g_factor*self.learning_rate
|
||||
print("lr_d", lr_d)
|
||||
print("lr_g", lr_g)
|
||||
opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
|
||||
list(self.decoder.parameters())+
|
||||
list(self.quantize.parameters())+
|
||||
list(self.quant_conv.parameters())+
|
||||
list(self.post_quant_conv.parameters()),
|
||||
lr=lr_g, betas=(0.5, 0.9))
|
||||
opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
|
||||
lr=lr_d, betas=(0.5, 0.9))
|
||||
lr_g = self.lr_g_factor * self.learning_rate
|
||||
print('lr_d', lr_d)
|
||||
print('lr_g', lr_g)
|
||||
opt_ae = torch.optim.Adam(
|
||||
list(self.encoder.parameters())
|
||||
+ list(self.decoder.parameters())
|
||||
+ list(self.quantize.parameters())
|
||||
+ list(self.quant_conv.parameters())
|
||||
+ list(self.post_quant_conv.parameters()),
|
||||
lr=lr_g,
|
||||
betas=(0.5, 0.9),
|
||||
)
|
||||
opt_disc = torch.optim.Adam(
|
||||
self.loss.discriminator.parameters(), lr=lr_d, betas=(0.5, 0.9)
|
||||
)
|
||||
|
||||
if self.scheduler_config is not None:
|
||||
scheduler = instantiate_from_config(self.scheduler_config)
|
||||
|
||||
print("Setting up LambdaLR scheduler...")
|
||||
print('Setting up LambdaLR scheduler...')
|
||||
scheduler = [
|
||||
{
|
||||
'scheduler': LambdaLR(opt_ae, lr_lambda=scheduler.schedule),
|
||||
'scheduler': LambdaLR(
|
||||
opt_ae, lr_lambda=scheduler.schedule
|
||||
),
|
||||
'interval': 'step',
|
||||
'frequency': 1
|
||||
'frequency': 1,
|
||||
},
|
||||
{
|
||||
'scheduler': LambdaLR(opt_disc, lr_lambda=scheduler.schedule),
|
||||
'scheduler': LambdaLR(
|
||||
opt_disc, lr_lambda=scheduler.schedule
|
||||
),
|
||||
'interval': 'step',
|
||||
'frequency': 1
|
||||
'frequency': 1,
|
||||
},
|
||||
]
|
||||
return [opt_ae, opt_disc], scheduler
|
||||
@@ -235,7 +314,7 @@ class VQModel(pl.LightningModule):
|
||||
x = self.get_input(batch, self.image_key)
|
||||
x = x.to(self.device)
|
||||
if only_inputs:
|
||||
log["inputs"] = x
|
||||
log['inputs'] = x
|
||||
return log
|
||||
xrec, _ = self(x)
|
||||
if x.shape[1] > 3:
|
||||
@@ -243,21 +322,24 @@ class VQModel(pl.LightningModule):
|
||||
assert xrec.shape[1] > 3
|
||||
x = self.to_rgb(x)
|
||||
xrec = self.to_rgb(xrec)
|
||||
log["inputs"] = x
|
||||
log["reconstructions"] = xrec
|
||||
log['inputs'] = x
|
||||
log['reconstructions'] = xrec
|
||||
if plot_ema:
|
||||
with self.ema_scope():
|
||||
xrec_ema, _ = self(x)
|
||||
if x.shape[1] > 3: xrec_ema = self.to_rgb(xrec_ema)
|
||||
log["reconstructions_ema"] = xrec_ema
|
||||
if x.shape[1] > 3:
|
||||
xrec_ema = self.to_rgb(xrec_ema)
|
||||
log['reconstructions_ema'] = xrec_ema
|
||||
return log
|
||||
|
||||
def to_rgb(self, x):
|
||||
assert self.image_key == "segmentation"
|
||||
if not hasattr(self, "colorize"):
|
||||
self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
|
||||
assert self.image_key == 'segmentation'
|
||||
if not hasattr(self, 'colorize'):
|
||||
self.register_buffer(
|
||||
'colorize', torch.randn(3, x.shape[1], 1, 1).to(x)
|
||||
)
|
||||
x = F.conv2d(x, weight=self.colorize)
|
||||
x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
|
||||
x = 2.0 * (x - x.min()) / (x.max() - x.min()) - 1.0
|
||||
return x
|
||||
|
||||
|
||||
@@ -283,43 +365,50 @@ class VQModelInterface(VQModel):
|
||||
|
||||
|
||||
class AutoencoderKL(pl.LightningModule):
|
||||
def __init__(self,
|
||||
ddconfig,
|
||||
lossconfig,
|
||||
embed_dim,
|
||||
ckpt_path=None,
|
||||
ignore_keys=[],
|
||||
image_key="image",
|
||||
colorize_nlabels=None,
|
||||
monitor=None,
|
||||
):
|
||||
def __init__(
|
||||
self,
|
||||
ddconfig,
|
||||
lossconfig,
|
||||
embed_dim,
|
||||
ckpt_path=None,
|
||||
ignore_keys=[],
|
||||
image_key='image',
|
||||
colorize_nlabels=None,
|
||||
monitor=None,
|
||||
):
|
||||
super().__init__()
|
||||
self.image_key = image_key
|
||||
self.encoder = Encoder(**ddconfig)
|
||||
self.decoder = Decoder(**ddconfig)
|
||||
self.loss = instantiate_from_config(lossconfig)
|
||||
assert ddconfig["double_z"]
|
||||
self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
|
||||
self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
|
||||
assert ddconfig['double_z']
|
||||
self.quant_conv = torch.nn.Conv2d(
|
||||
2 * ddconfig['z_channels'], 2 * embed_dim, 1
|
||||
)
|
||||
self.post_quant_conv = torch.nn.Conv2d(
|
||||
embed_dim, ddconfig['z_channels'], 1
|
||||
)
|
||||
self.embed_dim = embed_dim
|
||||
if colorize_nlabels is not None:
|
||||
assert type(colorize_nlabels)==int
|
||||
self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
|
||||
assert type(colorize_nlabels) == int
|
||||
self.register_buffer(
|
||||
'colorize', torch.randn(3, colorize_nlabels, 1, 1)
|
||||
)
|
||||
if monitor is not None:
|
||||
self.monitor = monitor
|
||||
if ckpt_path is not None:
|
||||
self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
|
||||
|
||||
def init_from_ckpt(self, path, ignore_keys=list()):
|
||||
sd = torch.load(path, map_location="cpu")["state_dict"]
|
||||
sd = torch.load(path, map_location='cpu')['state_dict']
|
||||
keys = list(sd.keys())
|
||||
for k in keys:
|
||||
for ik in ignore_keys:
|
||||
if k.startswith(ik):
|
||||
print("Deleting key {} from state_dict.".format(k))
|
||||
print('Deleting key {} from state_dict.'.format(k))
|
||||
del sd[k]
|
||||
self.load_state_dict(sd, strict=False)
|
||||
print(f"Restored from {path}")
|
||||
print(f'Restored from {path}')
|
||||
|
||||
def encode(self, x):
|
||||
h = self.encoder(x)
|
||||
@@ -345,7 +434,11 @@ class AutoencoderKL(pl.LightningModule):
|
||||
x = batch[k]
|
||||
if len(x.shape) == 3:
|
||||
x = x[..., None]
|
||||
x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
|
||||
x = (
|
||||
x.permute(0, 3, 1, 2)
|
||||
.to(memory_format=torch.contiguous_format)
|
||||
.float()
|
||||
)
|
||||
return x
|
||||
|
||||
def training_step(self, batch, batch_idx, optimizer_idx):
|
||||
@@ -354,44 +447,102 @@ class AutoencoderKL(pl.LightningModule):
|
||||
|
||||
if optimizer_idx == 0:
|
||||
# train encoder+decoder+logvar
|
||||
aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
|
||||
last_layer=self.get_last_layer(), split="train")
|
||||
self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
|
||||
self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
|
||||
aeloss, log_dict_ae = self.loss(
|
||||
inputs,
|
||||
reconstructions,
|
||||
posterior,
|
||||
optimizer_idx,
|
||||
self.global_step,
|
||||
last_layer=self.get_last_layer(),
|
||||
split='train',
|
||||
)
|
||||
self.log(
|
||||
'aeloss',
|
||||
aeloss,
|
||||
prog_bar=True,
|
||||
logger=True,
|
||||
on_step=True,
|
||||
on_epoch=True,
|
||||
)
|
||||
self.log_dict(
|
||||
log_dict_ae,
|
||||
prog_bar=False,
|
||||
logger=True,
|
||||
on_step=True,
|
||||
on_epoch=False,
|
||||
)
|
||||
return aeloss
|
||||
|
||||
if optimizer_idx == 1:
|
||||
# train the discriminator
|
||||
discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
|
||||
last_layer=self.get_last_layer(), split="train")
|
||||
discloss, log_dict_disc = self.loss(
|
||||
inputs,
|
||||
reconstructions,
|
||||
posterior,
|
||||
optimizer_idx,
|
||||
self.global_step,
|
||||
last_layer=self.get_last_layer(),
|
||||
split='train',
|
||||
)
|
||||
|
||||
self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
|
||||
self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
|
||||
self.log(
|
||||
'discloss',
|
||||
discloss,
|
||||
prog_bar=True,
|
||||
logger=True,
|
||||
on_step=True,
|
||||
on_epoch=True,
|
||||
)
|
||||
self.log_dict(
|
||||
log_dict_disc,
|
||||
prog_bar=False,
|
||||
logger=True,
|
||||
on_step=True,
|
||||
on_epoch=False,
|
||||
)
|
||||
return discloss
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
inputs = self.get_input(batch, self.image_key)
|
||||
reconstructions, posterior = self(inputs)
|
||||
aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
|
||||
last_layer=self.get_last_layer(), split="val")
|
||||
aeloss, log_dict_ae = self.loss(
|
||||
inputs,
|
||||
reconstructions,
|
||||
posterior,
|
||||
0,
|
||||
self.global_step,
|
||||
last_layer=self.get_last_layer(),
|
||||
split='val',
|
||||
)
|
||||
|
||||
discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
|
||||
last_layer=self.get_last_layer(), split="val")
|
||||
discloss, log_dict_disc = self.loss(
|
||||
inputs,
|
||||
reconstructions,
|
||||
posterior,
|
||||
1,
|
||||
self.global_step,
|
||||
last_layer=self.get_last_layer(),
|
||||
split='val',
|
||||
)
|
||||
|
||||
self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
|
||||
self.log('val/rec_loss', log_dict_ae['val/rec_loss'])
|
||||
self.log_dict(log_dict_ae)
|
||||
self.log_dict(log_dict_disc)
|
||||
return self.log_dict
|
||||
|
||||
def configure_optimizers(self):
|
||||
lr = self.learning_rate
|
||||
opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
|
||||
list(self.decoder.parameters())+
|
||||
list(self.quant_conv.parameters())+
|
||||
list(self.post_quant_conv.parameters()),
|
||||
lr=lr, betas=(0.5, 0.9))
|
||||
opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
|
||||
lr=lr, betas=(0.5, 0.9))
|
||||
opt_ae = torch.optim.Adam(
|
||||
list(self.encoder.parameters())
|
||||
+ list(self.decoder.parameters())
|
||||
+ list(self.quant_conv.parameters())
|
||||
+ list(self.post_quant_conv.parameters()),
|
||||
lr=lr,
|
||||
betas=(0.5, 0.9),
|
||||
)
|
||||
opt_disc = torch.optim.Adam(
|
||||
self.loss.discriminator.parameters(), lr=lr, betas=(0.5, 0.9)
|
||||
)
|
||||
return [opt_ae, opt_disc], []
|
||||
|
||||
def get_last_layer(self):
|
||||
@@ -409,17 +560,19 @@ class AutoencoderKL(pl.LightningModule):
|
||||
assert xrec.shape[1] > 3
|
||||
x = self.to_rgb(x)
|
||||
xrec = self.to_rgb(xrec)
|
||||
log["samples"] = self.decode(torch.randn_like(posterior.sample()))
|
||||
log["reconstructions"] = xrec
|
||||
log["inputs"] = x
|
||||
log['samples'] = self.decode(torch.randn_like(posterior.sample()))
|
||||
log['reconstructions'] = xrec
|
||||
log['inputs'] = x
|
||||
return log
|
||||
|
||||
def to_rgb(self, x):
|
||||
assert self.image_key == "segmentation"
|
||||
if not hasattr(self, "colorize"):
|
||||
self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
|
||||
assert self.image_key == 'segmentation'
|
||||
if not hasattr(self, 'colorize'):
|
||||
self.register_buffer(
|
||||
'colorize', torch.randn(3, x.shape[1], 1, 1).to(x)
|
||||
)
|
||||
x = F.conv2d(x, weight=self.colorize)
|
||||
x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
|
||||
x = 2.0 * (x - x.min()) / (x.max() - x.min()) - 1.0
|
||||
return x
|
||||
|
||||
|
||||
|
||||
@@ -10,13 +10,13 @@ from einops import rearrange
|
||||
from glob import glob
|
||||
from natsort import natsorted
|
||||
|
||||
from ldm.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel
|
||||
from ldm.modules.diffusionmodules.openaimodel import (
|
||||
EncoderUNetModel,
|
||||
UNetModel,
|
||||
)
|
||||
from ldm.util import log_txt_as_img, default, ismap, instantiate_from_config
|
||||
|
||||
__models__ = {
|
||||
'class_label': EncoderUNetModel,
|
||||
'segmentation': UNetModel
|
||||
}
|
||||
__models__ = {'class_label': EncoderUNetModel, 'segmentation': UNetModel}
|
||||
|
||||
|
||||
def disabled_train(self, mode=True):
|
||||
@@ -26,37 +26,49 @@ def disabled_train(self, mode=True):
|
||||
|
||||
|
||||
class NoisyLatentImageClassifier(pl.LightningModule):
|
||||
|
||||
def __init__(self,
|
||||
diffusion_path,
|
||||
num_classes,
|
||||
ckpt_path=None,
|
||||
pool='attention',
|
||||
label_key=None,
|
||||
diffusion_ckpt_path=None,
|
||||
scheduler_config=None,
|
||||
weight_decay=1.e-2,
|
||||
log_steps=10,
|
||||
monitor='val/loss',
|
||||
*args,
|
||||
**kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
diffusion_path,
|
||||
num_classes,
|
||||
ckpt_path=None,
|
||||
pool='attention',
|
||||
label_key=None,
|
||||
diffusion_ckpt_path=None,
|
||||
scheduler_config=None,
|
||||
weight_decay=1.0e-2,
|
||||
log_steps=10,
|
||||
monitor='val/loss',
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.num_classes = num_classes
|
||||
# get latest config of diffusion model
|
||||
diffusion_config = natsorted(glob(os.path.join(diffusion_path, 'configs', '*-project.yaml')))[-1]
|
||||
diffusion_config = natsorted(
|
||||
glob(os.path.join(diffusion_path, 'configs', '*-project.yaml'))
|
||||
)[-1]
|
||||
self.diffusion_config = OmegaConf.load(diffusion_config).model
|
||||
self.diffusion_config.params.ckpt_path = diffusion_ckpt_path
|
||||
self.load_diffusion()
|
||||
|
||||
self.monitor = monitor
|
||||
self.numd = self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
|
||||
self.log_time_interval = self.diffusion_model.num_timesteps // log_steps
|
||||
self.numd = (
|
||||
self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
|
||||
)
|
||||
self.log_time_interval = (
|
||||
self.diffusion_model.num_timesteps // log_steps
|
||||
)
|
||||
self.log_steps = log_steps
|
||||
|
||||
self.label_key = label_key if not hasattr(self.diffusion_model, 'cond_stage_key') \
|
||||
self.label_key = (
|
||||
label_key
|
||||
if not hasattr(self.diffusion_model, 'cond_stage_key')
|
||||
else self.diffusion_model.cond_stage_key
|
||||
)
|
||||
|
||||
assert self.label_key is not None, 'label_key neither in diffusion model nor in model.params'
|
||||
assert (
|
||||
self.label_key is not None
|
||||
), 'label_key neither in diffusion model nor in model.params'
|
||||
|
||||
if self.label_key not in __models__:
|
||||
raise NotImplementedError()
|
||||
@@ -68,22 +80,27 @@ class NoisyLatentImageClassifier(pl.LightningModule):
|
||||
self.weight_decay = weight_decay
|
||||
|
||||
def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
|
||||
sd = torch.load(path, map_location="cpu")
|
||||
if "state_dict" in list(sd.keys()):
|
||||
sd = sd["state_dict"]
|
||||
sd = torch.load(path, map_location='cpu')
|
||||
if 'state_dict' in list(sd.keys()):
|
||||
sd = sd['state_dict']
|
||||
keys = list(sd.keys())
|
||||
for k in keys:
|
||||
for ik in ignore_keys:
|
||||
if k.startswith(ik):
|
||||
print("Deleting key {} from state_dict.".format(k))
|
||||
print('Deleting key {} from state_dict.'.format(k))
|
||||
del sd[k]
|
||||
missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
|
||||
sd, strict=False)
|
||||
print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
|
||||
missing, unexpected = (
|
||||
self.load_state_dict(sd, strict=False)
|
||||
if not only_model
|
||||
else self.model.load_state_dict(sd, strict=False)
|
||||
)
|
||||
print(
|
||||
f'Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys'
|
||||
)
|
||||
if len(missing) > 0:
|
||||
print(f"Missing Keys: {missing}")
|
||||
print(f'Missing Keys: {missing}')
|
||||
if len(unexpected) > 0:
|
||||
print(f"Unexpected Keys: {unexpected}")
|
||||
print(f'Unexpected Keys: {unexpected}')
|
||||
|
||||
def load_diffusion(self):
|
||||
model = instantiate_from_config(self.diffusion_config)
|
||||
@@ -93,17 +110,25 @@ class NoisyLatentImageClassifier(pl.LightningModule):
|
||||
param.requires_grad = False
|
||||
|
||||
def load_classifier(self, ckpt_path, pool):
|
||||
model_config = deepcopy(self.diffusion_config.params.unet_config.params)
|
||||
model_config.in_channels = self.diffusion_config.params.unet_config.params.out_channels
|
||||
model_config = deepcopy(
|
||||
self.diffusion_config.params.unet_config.params
|
||||
)
|
||||
model_config.in_channels = (
|
||||
self.diffusion_config.params.unet_config.params.out_channels
|
||||
)
|
||||
model_config.out_channels = self.num_classes
|
||||
if self.label_key == 'class_label':
|
||||
model_config.pool = pool
|
||||
|
||||
self.model = __models__[self.label_key](**model_config)
|
||||
if ckpt_path is not None:
|
||||
print('#####################################################################')
|
||||
print(
|
||||
'#####################################################################'
|
||||
)
|
||||
print(f'load from ckpt "{ckpt_path}"')
|
||||
print('#####################################################################')
|
||||
print(
|
||||
'#####################################################################'
|
||||
)
|
||||
self.init_from_ckpt(ckpt_path)
|
||||
|
||||
@torch.no_grad()
|
||||
@@ -111,11 +136,19 @@ class NoisyLatentImageClassifier(pl.LightningModule):
|
||||
noise = default(noise, lambda: torch.randn_like(x))
|
||||
continuous_sqrt_alpha_cumprod = None
|
||||
if self.diffusion_model.use_continuous_noise:
|
||||
continuous_sqrt_alpha_cumprod = self.diffusion_model.sample_continuous_noise_level(x.shape[0], t + 1)
|
||||
continuous_sqrt_alpha_cumprod = (
|
||||
self.diffusion_model.sample_continuous_noise_level(
|
||||
x.shape[0], t + 1
|
||||
)
|
||||
)
|
||||
# todo: make sure t+1 is correct here
|
||||
|
||||
return self.diffusion_model.q_sample(x_start=x, t=t, noise=noise,
|
||||
continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod)
|
||||
return self.diffusion_model.q_sample(
|
||||
x_start=x,
|
||||
t=t,
|
||||
noise=noise,
|
||||
continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod,
|
||||
)
|
||||
|
||||
def forward(self, x_noisy, t, *args, **kwargs):
|
||||
return self.model(x_noisy, t)
|
||||
@@ -141,17 +174,21 @@ class NoisyLatentImageClassifier(pl.LightningModule):
|
||||
targets = rearrange(targets, 'b h w c -> b c h w')
|
||||
for down in range(self.numd):
|
||||
h, w = targets.shape[-2:]
|
||||
targets = F.interpolate(targets, size=(h // 2, w // 2), mode='nearest')
|
||||
targets = F.interpolate(
|
||||
targets, size=(h // 2, w // 2), mode='nearest'
|
||||
)
|
||||
|
||||
# targets = rearrange(targets,'b c h w -> b h w c')
|
||||
|
||||
return targets
|
||||
|
||||
def compute_top_k(self, logits, labels, k, reduction="mean"):
|
||||
def compute_top_k(self, logits, labels, k, reduction='mean'):
|
||||
_, top_ks = torch.topk(logits, k, dim=1)
|
||||
if reduction == "mean":
|
||||
return (top_ks == labels[:, None]).float().sum(dim=-1).mean().item()
|
||||
elif reduction == "none":
|
||||
if reduction == 'mean':
|
||||
return (
|
||||
(top_ks == labels[:, None]).float().sum(dim=-1).mean().item()
|
||||
)
|
||||
elif reduction == 'none':
|
||||
return (top_ks == labels[:, None]).float().sum(dim=-1)
|
||||
|
||||
def on_train_epoch_start(self):
|
||||
@@ -162,29 +199,59 @@ class NoisyLatentImageClassifier(pl.LightningModule):
|
||||
def write_logs(self, loss, logits, targets):
|
||||
log_prefix = 'train' if self.training else 'val'
|
||||
log = {}
|
||||
log[f"{log_prefix}/loss"] = loss.mean()
|
||||
log[f"{log_prefix}/acc@1"] = self.compute_top_k(
|
||||
logits, targets, k=1, reduction="mean"
|
||||
log[f'{log_prefix}/loss'] = loss.mean()
|
||||
log[f'{log_prefix}/acc@1'] = self.compute_top_k(
|
||||
logits, targets, k=1, reduction='mean'
|
||||
)
|
||||
log[f"{log_prefix}/acc@5"] = self.compute_top_k(
|
||||
logits, targets, k=5, reduction="mean"
|
||||
log[f'{log_prefix}/acc@5'] = self.compute_top_k(
|
||||
logits, targets, k=5, reduction='mean'
|
||||
)
|
||||
|
||||
self.log_dict(log, prog_bar=False, logger=True, on_step=self.training, on_epoch=True)
|
||||
self.log('loss', log[f"{log_prefix}/loss"], prog_bar=True, logger=False)
|
||||
self.log('global_step', self.global_step, logger=False, on_epoch=False, prog_bar=True)
|
||||
self.log_dict(
|
||||
log,
|
||||
prog_bar=False,
|
||||
logger=True,
|
||||
on_step=self.training,
|
||||
on_epoch=True,
|
||||
)
|
||||
self.log(
|
||||
'loss', log[f'{log_prefix}/loss'], prog_bar=True, logger=False
|
||||
)
|
||||
self.log(
|
||||
'global_step',
|
||||
self.global_step,
|
||||
logger=False,
|
||||
on_epoch=False,
|
||||
prog_bar=True,
|
||||
)
|
||||
lr = self.optimizers().param_groups[0]['lr']
|
||||
self.log('lr_abs', lr, on_step=True, logger=True, on_epoch=False, prog_bar=True)
|
||||
self.log(
|
||||
'lr_abs',
|
||||
lr,
|
||||
on_step=True,
|
||||
logger=True,
|
||||
on_epoch=False,
|
||||
prog_bar=True,
|
||||
)
|
||||
|
||||
def shared_step(self, batch, t=None):
|
||||
x, *_ = self.diffusion_model.get_input(batch, k=self.diffusion_model.first_stage_key)
|
||||
x, *_ = self.diffusion_model.get_input(
|
||||
batch, k=self.diffusion_model.first_stage_key
|
||||
)
|
||||
targets = self.get_conditioning(batch)
|
||||
if targets.dim() == 4:
|
||||
targets = targets.argmax(dim=1)
|
||||
if t is None:
|
||||
t = torch.randint(0, self.diffusion_model.num_timesteps, (x.shape[0],), device=self.device).long()
|
||||
t = torch.randint(
|
||||
0,
|
||||
self.diffusion_model.num_timesteps,
|
||||
(x.shape[0],),
|
||||
device=self.device,
|
||||
).long()
|
||||
else:
|
||||
t = torch.full(size=(x.shape[0],), fill_value=t, device=self.device).long()
|
||||
t = torch.full(
|
||||
size=(x.shape[0],), fill_value=t, device=self.device
|
||||
).long()
|
||||
x_noisy = self.get_x_noisy(x, t)
|
||||
logits = self(x_noisy, t)
|
||||
|
||||
@@ -200,8 +267,14 @@ class NoisyLatentImageClassifier(pl.LightningModule):
|
||||
return loss
|
||||
|
||||
def reset_noise_accs(self):
|
||||
self.noisy_acc = {t: {'acc@1': [], 'acc@5': []} for t in
|
||||
range(0, self.diffusion_model.num_timesteps, self.diffusion_model.log_every_t)}
|
||||
self.noisy_acc = {
|
||||
t: {'acc@1': [], 'acc@5': []}
|
||||
for t in range(
|
||||
0,
|
||||
self.diffusion_model.num_timesteps,
|
||||
self.diffusion_model.log_every_t,
|
||||
)
|
||||
}
|
||||
|
||||
def on_validation_start(self):
|
||||
self.reset_noise_accs()
|
||||
@@ -212,24 +285,35 @@ class NoisyLatentImageClassifier(pl.LightningModule):
|
||||
|
||||
for t in self.noisy_acc:
|
||||
_, logits, _, targets = self.shared_step(batch, t)
|
||||
self.noisy_acc[t]['acc@1'].append(self.compute_top_k(logits, targets, k=1, reduction='mean'))
|
||||
self.noisy_acc[t]['acc@5'].append(self.compute_top_k(logits, targets, k=5, reduction='mean'))
|
||||
self.noisy_acc[t]['acc@1'].append(
|
||||
self.compute_top_k(logits, targets, k=1, reduction='mean')
|
||||
)
|
||||
self.noisy_acc[t]['acc@5'].append(
|
||||
self.compute_top_k(logits, targets, k=5, reduction='mean')
|
||||
)
|
||||
|
||||
return loss
|
||||
|
||||
def configure_optimizers(self):
|
||||
optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
|
||||
optimizer = AdamW(
|
||||
self.model.parameters(),
|
||||
lr=self.learning_rate,
|
||||
weight_decay=self.weight_decay,
|
||||
)
|
||||
|
||||
if self.use_scheduler:
|
||||
scheduler = instantiate_from_config(self.scheduler_config)
|
||||
|
||||
print("Setting up LambdaLR scheduler...")
|
||||
print('Setting up LambdaLR scheduler...')
|
||||
scheduler = [
|
||||
{
|
||||
'scheduler': LambdaLR(optimizer, lr_lambda=scheduler.schedule),
|
||||
'scheduler': LambdaLR(
|
||||
optimizer, lr_lambda=scheduler.schedule
|
||||
),
|
||||
'interval': 'step',
|
||||
'frequency': 1
|
||||
}]
|
||||
'frequency': 1,
|
||||
}
|
||||
]
|
||||
return [optimizer], scheduler
|
||||
|
||||
return optimizer
|
||||
@@ -243,7 +327,7 @@ class NoisyLatentImageClassifier(pl.LightningModule):
|
||||
y = self.get_conditioning(batch)
|
||||
|
||||
if self.label_key == 'class_label':
|
||||
y = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
|
||||
y = log_txt_as_img((x.shape[2], x.shape[3]), batch['human_label'])
|
||||
log['labels'] = y
|
||||
|
||||
if ismap(y):
|
||||
@@ -256,10 +340,14 @@ class NoisyLatentImageClassifier(pl.LightningModule):
|
||||
|
||||
log[f'inputs@t{current_time}'] = x_noisy
|
||||
|
||||
pred = F.one_hot(logits.argmax(dim=1), num_classes=self.num_classes)
|
||||
pred = F.one_hot(
|
||||
logits.argmax(dim=1), num_classes=self.num_classes
|
||||
)
|
||||
pred = rearrange(pred, 'b h w c -> b c h w')
|
||||
|
||||
log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(pred)
|
||||
log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(
|
||||
pred
|
||||
)
|
||||
|
||||
for key in log:
|
||||
log[key] = log[key][:N]
|
||||
|
||||
@@ -4,88 +4,146 @@ import torch
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from functools import partial
|
||||
from ldm.dream.devices import choose_torch_device
|
||||
|
||||
from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, \
|
||||
extract_into_tensor
|
||||
from ldm.modules.diffusionmodules.util import (
|
||||
make_ddim_sampling_parameters,
|
||||
make_ddim_timesteps,
|
||||
noise_like,
|
||||
extract_into_tensor,
|
||||
)
|
||||
|
||||
|
||||
class DDIMSampler(object):
|
||||
def __init__(self, model, schedule="linear", **kwargs):
|
||||
def __init__(self, model, schedule='linear', device=None, **kwargs):
|
||||
super().__init__()
|
||||
self.model = model
|
||||
self.ddpm_num_timesteps = model.num_timesteps
|
||||
self.schedule = schedule
|
||||
self.device = device or choose_torch_device()
|
||||
|
||||
def register_buffer(self, name, attr):
|
||||
if type(attr) == torch.Tensor:
|
||||
if attr.device != torch.device("cuda"):
|
||||
attr = attr.to(torch.device("cuda"))
|
||||
if attr.device != torch.device(self.device):
|
||||
attr = attr.to(dtype=torch.float32, device=self.device)
|
||||
setattr(self, name, attr)
|
||||
|
||||
def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
|
||||
self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
|
||||
num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
|
||||
def make_schedule(
|
||||
self,
|
||||
ddim_num_steps,
|
||||
ddim_discretize='uniform',
|
||||
ddim_eta=0.0,
|
||||
verbose=True,
|
||||
):
|
||||
self.ddim_timesteps = make_ddim_timesteps(
|
||||
ddim_discr_method=ddim_discretize,
|
||||
num_ddim_timesteps=ddim_num_steps,
|
||||
num_ddpm_timesteps=self.ddpm_num_timesteps,
|
||||
verbose=verbose,
|
||||
)
|
||||
alphas_cumprod = self.model.alphas_cumprod
|
||||
assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
|
||||
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
|
||||
assert (
|
||||
alphas_cumprod.shape[0] == self.ddpm_num_timesteps
|
||||
), 'alphas have to be defined for each timestep'
|
||||
to_torch = (
|
||||
lambda x: x.clone()
|
||||
.detach()
|
||||
.to(torch.float32)
|
||||
.to(self.model.device)
|
||||
)
|
||||
|
||||
self.register_buffer('betas', to_torch(self.model.betas))
|
||||
self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
|
||||
self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
|
||||
self.register_buffer(
|
||||
'alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev)
|
||||
)
|
||||
|
||||
# calculations for diffusion q(x_t | x_{t-1}) and others
|
||||
self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
|
||||
self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
|
||||
self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
|
||||
self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
|
||||
self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
|
||||
self.register_buffer(
|
||||
'sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))
|
||||
)
|
||||
self.register_buffer(
|
||||
'sqrt_one_minus_alphas_cumprod',
|
||||
to_torch(np.sqrt(1.0 - alphas_cumprod.cpu())),
|
||||
)
|
||||
self.register_buffer(
|
||||
'log_one_minus_alphas_cumprod',
|
||||
to_torch(np.log(1.0 - alphas_cumprod.cpu())),
|
||||
)
|
||||
self.register_buffer(
|
||||
'sqrt_recip_alphas_cumprod',
|
||||
to_torch(np.sqrt(1.0 / alphas_cumprod.cpu())),
|
||||
)
|
||||
self.register_buffer(
|
||||
'sqrt_recipm1_alphas_cumprod',
|
||||
to_torch(np.sqrt(1.0 / alphas_cumprod.cpu() - 1)),
|
||||
)
|
||||
|
||||
# ddim sampling parameters
|
||||
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
|
||||
ddim_timesteps=self.ddim_timesteps,
|
||||
eta=ddim_eta,verbose=verbose)
|
||||
(
|
||||
ddim_sigmas,
|
||||
ddim_alphas,
|
||||
ddim_alphas_prev,
|
||||
) = make_ddim_sampling_parameters(
|
||||
alphacums=alphas_cumprod.cpu(),
|
||||
ddim_timesteps=self.ddim_timesteps,
|
||||
eta=ddim_eta,
|
||||
verbose=verbose,
|
||||
)
|
||||
self.register_buffer('ddim_sigmas', ddim_sigmas)
|
||||
self.register_buffer('ddim_alphas', ddim_alphas)
|
||||
self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
|
||||
self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
|
||||
self.register_buffer(
|
||||
'ddim_sqrt_one_minus_alphas', np.sqrt(1.0 - ddim_alphas)
|
||||
)
|
||||
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
|
||||
(1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
|
||||
1 - self.alphas_cumprod / self.alphas_cumprod_prev))
|
||||
self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
|
||||
(1 - self.alphas_cumprod_prev)
|
||||
/ (1 - self.alphas_cumprod)
|
||||
* (1 - self.alphas_cumprod / self.alphas_cumprod_prev)
|
||||
)
|
||||
self.register_buffer(
|
||||
'ddim_sigmas_for_original_num_steps',
|
||||
sigmas_for_original_sampling_steps,
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def sample(self,
|
||||
S,
|
||||
batch_size,
|
||||
shape,
|
||||
conditioning=None,
|
||||
callback=None,
|
||||
normals_sequence=None,
|
||||
img_callback=None,
|
||||
quantize_x0=False,
|
||||
eta=0.,
|
||||
mask=None,
|
||||
x0=None,
|
||||
temperature=1.,
|
||||
noise_dropout=0.,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
verbose=True,
|
||||
x_T=None,
|
||||
log_every_t=100,
|
||||
unconditional_guidance_scale=1.,
|
||||
unconditional_conditioning=None,
|
||||
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
||||
**kwargs
|
||||
):
|
||||
def sample(
|
||||
self,
|
||||
S,
|
||||
batch_size,
|
||||
shape,
|
||||
conditioning=None,
|
||||
callback=None,
|
||||
normals_sequence=None,
|
||||
img_callback=None,
|
||||
quantize_x0=False,
|
||||
eta=0.0,
|
||||
mask=None,
|
||||
x0=None,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
verbose=True,
|
||||
x_T=None,
|
||||
log_every_t=100,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
||||
**kwargs,
|
||||
):
|
||||
if conditioning is not None:
|
||||
if isinstance(conditioning, dict):
|
||||
cbs = conditioning[list(conditioning.keys())[0]].shape[0]
|
||||
if cbs != batch_size:
|
||||
print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
|
||||
print(
|
||||
f'Warning: Got {cbs} conditionings but batch-size is {batch_size}'
|
||||
)
|
||||
else:
|
||||
if conditioning.shape[0] != batch_size:
|
||||
print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
|
||||
print(
|
||||
f'Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}'
|
||||
)
|
||||
|
||||
self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
|
||||
# sampling
|
||||
@@ -93,30 +151,47 @@ class DDIMSampler(object):
|
||||
size = (batch_size, C, H, W)
|
||||
print(f'Data shape for DDIM sampling is {size}, eta {eta}')
|
||||
|
||||
samples, intermediates = self.ddim_sampling(conditioning, size,
|
||||
callback=callback,
|
||||
img_callback=img_callback,
|
||||
quantize_denoised=quantize_x0,
|
||||
mask=mask, x0=x0,
|
||||
ddim_use_original_steps=False,
|
||||
noise_dropout=noise_dropout,
|
||||
temperature=temperature,
|
||||
score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
x_T=x_T,
|
||||
log_every_t=log_every_t,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
)
|
||||
samples, intermediates = self.ddim_sampling(
|
||||
conditioning,
|
||||
size,
|
||||
callback=callback,
|
||||
img_callback=img_callback,
|
||||
quantize_denoised=quantize_x0,
|
||||
mask=mask,
|
||||
x0=x0,
|
||||
ddim_use_original_steps=False,
|
||||
noise_dropout=noise_dropout,
|
||||
temperature=temperature,
|
||||
score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
x_T=x_T,
|
||||
log_every_t=log_every_t,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
)
|
||||
return samples, intermediates
|
||||
|
||||
@torch.no_grad()
|
||||
def ddim_sampling(self, cond, shape,
|
||||
x_T=None, ddim_use_original_steps=False,
|
||||
callback=None, timesteps=None, quantize_denoised=False,
|
||||
mask=None, x0=None, img_callback=None, log_every_t=100,
|
||||
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
||||
unconditional_guidance_scale=1., unconditional_conditioning=None,):
|
||||
def ddim_sampling(
|
||||
self,
|
||||
cond,
|
||||
shape,
|
||||
x_T=None,
|
||||
ddim_use_original_steps=False,
|
||||
callback=None,
|
||||
timesteps=None,
|
||||
quantize_denoised=False,
|
||||
mask=None,
|
||||
x0=None,
|
||||
img_callback=None,
|
||||
log_every_t=100,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
):
|
||||
device = self.model.betas.device
|
||||
b = shape[0]
|
||||
if x_T is None:
|
||||
@@ -125,17 +200,38 @@ class DDIMSampler(object):
|
||||
img = x_T
|
||||
|
||||
if timesteps is None:
|
||||
timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
|
||||
timesteps = (
|
||||
self.ddpm_num_timesteps
|
||||
if ddim_use_original_steps
|
||||
else self.ddim_timesteps
|
||||
)
|
||||
elif timesteps is not None and not ddim_use_original_steps:
|
||||
subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
|
||||
subset_end = (
|
||||
int(
|
||||
min(timesteps / self.ddim_timesteps.shape[0], 1)
|
||||
* self.ddim_timesteps.shape[0]
|
||||
)
|
||||
- 1
|
||||
)
|
||||
timesteps = self.ddim_timesteps[:subset_end]
|
||||
|
||||
intermediates = {'x_inter': [img], 'pred_x0': [img]}
|
||||
time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
|
||||
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
||||
print(f"Running DDIM Sampling with {total_steps} timesteps")
|
||||
time_range = (
|
||||
reversed(range(0, timesteps))
|
||||
if ddim_use_original_steps
|
||||
else np.flip(timesteps)
|
||||
)
|
||||
total_steps = (
|
||||
timesteps if ddim_use_original_steps else timesteps.shape[0]
|
||||
)
|
||||
print(f'Running DDIM Sampling with {total_steps} timesteps')
|
||||
|
||||
iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps, dynamic_ncols=True)
|
||||
iterator = tqdm(
|
||||
time_range,
|
||||
desc='DDIM Sampler',
|
||||
total=total_steps,
|
||||
dynamic_ncols=True,
|
||||
)
|
||||
|
||||
for i, step in enumerate(iterator):
|
||||
index = total_steps - i - 1
|
||||
@@ -143,18 +239,30 @@ class DDIMSampler(object):
|
||||
|
||||
if mask is not None:
|
||||
assert x0 is not None
|
||||
img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass?
|
||||
img = img_orig * mask + (1. - mask) * img
|
||||
img_orig = self.model.q_sample(
|
||||
x0, ts
|
||||
) # TODO: deterministic forward pass?
|
||||
img = img_orig * mask + (1.0 - mask) * img
|
||||
|
||||
outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
|
||||
quantize_denoised=quantize_denoised, temperature=temperature,
|
||||
noise_dropout=noise_dropout, score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning)
|
||||
outs = self.p_sample_ddim(
|
||||
img,
|
||||
cond,
|
||||
ts,
|
||||
index=index,
|
||||
use_original_steps=ddim_use_original_steps,
|
||||
quantize_denoised=quantize_denoised,
|
||||
temperature=temperature,
|
||||
noise_dropout=noise_dropout,
|
||||
score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
)
|
||||
img, pred_x0 = outs
|
||||
if callback: callback(i)
|
||||
if img_callback: img_callback(pred_x0, i)
|
||||
if callback:
|
||||
callback(i)
|
||||
if img_callback:
|
||||
img_callback(pred_x0, i)
|
||||
|
||||
if index % log_every_t == 0 or index == total_steps - 1:
|
||||
intermediates['x_inter'].append(img)
|
||||
@@ -163,42 +271,82 @@ class DDIMSampler(object):
|
||||
return img, intermediates
|
||||
|
||||
@torch.no_grad()
|
||||
def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
|
||||
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
||||
unconditional_guidance_scale=1., unconditional_conditioning=None):
|
||||
def p_sample_ddim(
|
||||
self,
|
||||
x,
|
||||
c,
|
||||
t,
|
||||
index,
|
||||
repeat_noise=False,
|
||||
use_original_steps=False,
|
||||
quantize_denoised=False,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
):
|
||||
b, *_, device = *x.shape, x.device
|
||||
|
||||
if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
|
||||
if (
|
||||
unconditional_conditioning is None
|
||||
or unconditional_guidance_scale == 1.0
|
||||
):
|
||||
e_t = self.model.apply_model(x, t, c)
|
||||
else:
|
||||
x_in = torch.cat([x] * 2)
|
||||
t_in = torch.cat([t] * 2)
|
||||
c_in = torch.cat([unconditional_conditioning, c])
|
||||
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
|
||||
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
|
||||
e_t = e_t_uncond + unconditional_guidance_scale * (
|
||||
e_t - e_t_uncond
|
||||
)
|
||||
|
||||
if score_corrector is not None:
|
||||
assert self.model.parameterization == "eps"
|
||||
e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
|
||||
assert self.model.parameterization == 'eps'
|
||||
e_t = score_corrector.modify_score(
|
||||
self.model, e_t, x, t, c, **corrector_kwargs
|
||||
)
|
||||
|
||||
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
||||
alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
|
||||
sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
|
||||
sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
|
||||
alphas = (
|
||||
self.model.alphas_cumprod
|
||||
if use_original_steps
|
||||
else self.ddim_alphas
|
||||
)
|
||||
alphas_prev = (
|
||||
self.model.alphas_cumprod_prev
|
||||
if use_original_steps
|
||||
else self.ddim_alphas_prev
|
||||
)
|
||||
sqrt_one_minus_alphas = (
|
||||
self.model.sqrt_one_minus_alphas_cumprod
|
||||
if use_original_steps
|
||||
else self.ddim_sqrt_one_minus_alphas
|
||||
)
|
||||
sigmas = (
|
||||
self.model.ddim_sigmas_for_original_num_steps
|
||||
if use_original_steps
|
||||
else self.ddim_sigmas
|
||||
)
|
||||
# select parameters corresponding to the currently considered timestep
|
||||
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
||||
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
||||
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
||||
sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
|
||||
sqrt_one_minus_at = torch.full(
|
||||
(b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
|
||||
)
|
||||
|
||||
# current prediction for x_0
|
||||
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
||||
if quantize_denoised:
|
||||
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
||||
# direction pointing to x_t
|
||||
dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
|
||||
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
|
||||
if noise_dropout > 0.:
|
||||
dir_xt = (1.0 - a_prev - sigma_t**2).sqrt() * e_t
|
||||
noise = (
|
||||
sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
|
||||
)
|
||||
if noise_dropout > 0.0:
|
||||
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
|
||||
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
||||
return x_prev, pred_x0
|
||||
@@ -216,26 +364,55 @@ class DDIMSampler(object):
|
||||
|
||||
if noise is None:
|
||||
noise = torch.randn_like(x0)
|
||||
return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
|
||||
extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)
|
||||
return (
|
||||
extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0
|
||||
+ extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape)
|
||||
* noise
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
|
||||
use_original_steps=False):
|
||||
def decode(
|
||||
self,
|
||||
x_latent,
|
||||
cond,
|
||||
t_start,
|
||||
img_callback=None,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
use_original_steps=False,
|
||||
):
|
||||
|
||||
timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
|
||||
timesteps = (
|
||||
np.arange(self.ddpm_num_timesteps)
|
||||
if use_original_steps
|
||||
else self.ddim_timesteps
|
||||
)
|
||||
timesteps = timesteps[:t_start]
|
||||
|
||||
time_range = np.flip(timesteps)
|
||||
total_steps = timesteps.shape[0]
|
||||
print(f"Running DDIM Sampling with {total_steps} timesteps")
|
||||
print(f'Running DDIM Sampling with {total_steps} timesteps')
|
||||
|
||||
iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
|
||||
x_dec = x_latent
|
||||
for i, step in enumerate(iterator):
|
||||
index = total_steps - i - 1
|
||||
ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
|
||||
x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning)
|
||||
ts = torch.full(
|
||||
(x_latent.shape[0],),
|
||||
step,
|
||||
device=x_latent.device,
|
||||
dtype=torch.long,
|
||||
)
|
||||
x_dec, _ = self.p_sample_ddim(
|
||||
x_dec,
|
||||
cond,
|
||||
ts,
|
||||
index=index,
|
||||
use_original_steps=use_original_steps,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
)
|
||||
if img_callback:
|
||||
img_callback(x_dec, i)
|
||||
|
||||
return x_dec
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
'''wrapper around part of Karen Crownson's k-duffsion library, making it call compatible with other Samplers'''
|
||||
"""wrapper around part of Katherine Crowson's k-diffusion library, making it call compatible with other Samplers"""
|
||||
import k_diffusion as K
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import accelerate
|
||||
from ldm.dream.devices import choose_torch_device
|
||||
|
||||
class CFGDenoiser(nn.Module):
|
||||
def __init__(self, model):
|
||||
@@ -16,59 +16,73 @@ class CFGDenoiser(nn.Module):
|
||||
uncond, cond = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2)
|
||||
return uncond + (cond - uncond) * cond_scale
|
||||
|
||||
|
||||
class KSampler(object):
|
||||
def __init__(self,model,schedule="lms", **kwargs):
|
||||
def __init__(self, model, schedule='lms', device=None, **kwargs):
|
||||
super().__init__()
|
||||
self.model = K.external.CompVisDenoiser(model)
|
||||
self.accelerator = accelerate.Accelerator()
|
||||
self.device = self.accelerator.device
|
||||
self.model = K.external.CompVisDenoiser(model)
|
||||
self.schedule = schedule
|
||||
self.device = device or choose_torch_device()
|
||||
|
||||
def forward(self, x, sigma, uncond, cond, cond_scale):
|
||||
x_in = torch.cat([x] * 2)
|
||||
sigma_in = torch.cat([sigma] * 2)
|
||||
cond_in = torch.cat([uncond, cond])
|
||||
uncond, cond = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2)
|
||||
uncond, cond = self.inner_model(
|
||||
x_in, sigma_in, cond=cond_in
|
||||
).chunk(2)
|
||||
return uncond + (cond - uncond) * cond_scale
|
||||
|
||||
|
||||
# most of these arguments are ignored and are only present for compatibility with
|
||||
# other samples
|
||||
@torch.no_grad()
|
||||
def sample(self,
|
||||
S,
|
||||
batch_size,
|
||||
shape,
|
||||
conditioning=None,
|
||||
callback=None,
|
||||
normals_sequence=None,
|
||||
img_callback=None,
|
||||
quantize_x0=False,
|
||||
eta=0.,
|
||||
mask=None,
|
||||
x0=None,
|
||||
temperature=1.,
|
||||
noise_dropout=0.,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
verbose=True,
|
||||
x_T=None,
|
||||
log_every_t=100,
|
||||
unconditional_guidance_scale=1.,
|
||||
unconditional_conditioning=None,
|
||||
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
||||
**kwargs
|
||||
):
|
||||
def sample(
|
||||
self,
|
||||
S,
|
||||
batch_size,
|
||||
shape,
|
||||
conditioning=None,
|
||||
callback=None,
|
||||
normals_sequence=None,
|
||||
img_callback=None,
|
||||
quantize_x0=False,
|
||||
eta=0.0,
|
||||
mask=None,
|
||||
x0=None,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
verbose=True,
|
||||
x_T=None,
|
||||
log_every_t=100,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
||||
**kwargs,
|
||||
):
|
||||
def route_callback(k_callback_values):
|
||||
if img_callback is not None:
|
||||
img_callback(k_callback_values['x'], k_callback_values['i'])
|
||||
|
||||
sigmas = self.model.get_sigmas(S)
|
||||
if x_T:
|
||||
x = x_T
|
||||
if x_T is not None:
|
||||
x = x_T * sigmas[0]
|
||||
else:
|
||||
x = torch.randn([batch_size, *shape], device=self.device) * sigmas[0] # for GPU draw
|
||||
x = (
|
||||
torch.randn([batch_size, *shape], device=self.device)
|
||||
* sigmas[0]
|
||||
) # for GPU draw
|
||||
model_wrap_cfg = CFGDenoiser(self.model)
|
||||
extra_args = {'cond': conditioning, 'uncond': unconditional_conditioning, 'cond_scale': unconditional_guidance_scale}
|
||||
return (K.sampling.sample_lms(model_wrap_cfg, x, sigmas, extra_args=extra_args, disable=not self.accelerator.is_main_process),
|
||||
None)
|
||||
|
||||
def gather(samples_ddim):
|
||||
return self.accelerator.gather(samples_ddim)
|
||||
extra_args = {
|
||||
'cond': conditioning,
|
||||
'uncond': unconditional_conditioning,
|
||||
'cond_scale': unconditional_guidance_scale,
|
||||
}
|
||||
return (
|
||||
K.sampling.__dict__[f'sample_{self.schedule}'](
|
||||
model_wrap_cfg, x, sigmas, extra_args=extra_args,
|
||||
callback=route_callback
|
||||
),
|
||||
None,
|
||||
)
|
||||
|
||||
@@ -4,120 +4,195 @@ import torch
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from functools import partial
|
||||
from ldm.dream.devices import choose_torch_device
|
||||
|
||||
from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
|
||||
from ldm.modules.diffusionmodules.util import (
|
||||
make_ddim_sampling_parameters,
|
||||
make_ddim_timesteps,
|
||||
noise_like,
|
||||
)
|
||||
|
||||
|
||||
class PLMSSampler(object):
|
||||
def __init__(self, model, schedule="linear", **kwargs):
|
||||
def __init__(self, model, schedule='linear', device=None, **kwargs):
|
||||
super().__init__()
|
||||
self.model = model
|
||||
self.ddpm_num_timesteps = model.num_timesteps
|
||||
self.schedule = schedule
|
||||
self.device = device if device else choose_torch_device()
|
||||
|
||||
def register_buffer(self, name, attr):
|
||||
if type(attr) == torch.Tensor:
|
||||
if attr.device != torch.device("cuda"):
|
||||
attr = attr.to(torch.device("cuda"))
|
||||
if attr.device != torch.device(self.device):
|
||||
attr = attr.to(torch.float32).to(torch.device(self.device))
|
||||
setattr(self, name, attr)
|
||||
|
||||
def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
|
||||
def make_schedule(
|
||||
self,
|
||||
ddim_num_steps,
|
||||
ddim_discretize='uniform',
|
||||
ddim_eta=0.0,
|
||||
verbose=True,
|
||||
):
|
||||
if ddim_eta != 0:
|
||||
raise ValueError('ddim_eta must be 0 for PLMS')
|
||||
self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
|
||||
num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
|
||||
self.ddim_timesteps = make_ddim_timesteps(
|
||||
ddim_discr_method=ddim_discretize,
|
||||
num_ddim_timesteps=ddim_num_steps,
|
||||
num_ddpm_timesteps=self.ddpm_num_timesteps,
|
||||
verbose=verbose,
|
||||
)
|
||||
alphas_cumprod = self.model.alphas_cumprod
|
||||
assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
|
||||
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
|
||||
assert (
|
||||
alphas_cumprod.shape[0] == self.ddpm_num_timesteps
|
||||
), 'alphas have to be defined for each timestep'
|
||||
to_torch = (
|
||||
lambda x: x.clone()
|
||||
.detach()
|
||||
.to(torch.float32)
|
||||
.to(self.model.device)
|
||||
)
|
||||
|
||||
self.register_buffer('betas', to_torch(self.model.betas))
|
||||
self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
|
||||
self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
|
||||
self.register_buffer(
|
||||
'alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev)
|
||||
)
|
||||
|
||||
# calculations for diffusion q(x_t | x_{t-1}) and others
|
||||
self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
|
||||
self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
|
||||
self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
|
||||
self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
|
||||
self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
|
||||
self.register_buffer(
|
||||
'sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))
|
||||
)
|
||||
self.register_buffer(
|
||||
'sqrt_one_minus_alphas_cumprod',
|
||||
to_torch(np.sqrt(1.0 - alphas_cumprod.cpu())),
|
||||
)
|
||||
self.register_buffer(
|
||||
'log_one_minus_alphas_cumprod',
|
||||
to_torch(np.log(1.0 - alphas_cumprod.cpu())),
|
||||
)
|
||||
self.register_buffer(
|
||||
'sqrt_recip_alphas_cumprod',
|
||||
to_torch(np.sqrt(1.0 / alphas_cumprod.cpu())),
|
||||
)
|
||||
self.register_buffer(
|
||||
'sqrt_recipm1_alphas_cumprod',
|
||||
to_torch(np.sqrt(1.0 / alphas_cumprod.cpu() - 1)),
|
||||
)
|
||||
|
||||
# ddim sampling parameters
|
||||
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
|
||||
ddim_timesteps=self.ddim_timesteps,
|
||||
eta=ddim_eta,verbose=verbose)
|
||||
(
|
||||
ddim_sigmas,
|
||||
ddim_alphas,
|
||||
ddim_alphas_prev,
|
||||
) = make_ddim_sampling_parameters(
|
||||
alphacums=alphas_cumprod.cpu(),
|
||||
ddim_timesteps=self.ddim_timesteps,
|
||||
eta=ddim_eta,
|
||||
verbose=verbose,
|
||||
)
|
||||
self.register_buffer('ddim_sigmas', ddim_sigmas)
|
||||
self.register_buffer('ddim_alphas', ddim_alphas)
|
||||
self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
|
||||
self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
|
||||
self.register_buffer(
|
||||
'ddim_sqrt_one_minus_alphas', np.sqrt(1.0 - ddim_alphas)
|
||||
)
|
||||
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
|
||||
(1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
|
||||
1 - self.alphas_cumprod / self.alphas_cumprod_prev))
|
||||
self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
|
||||
(1 - self.alphas_cumprod_prev)
|
||||
/ (1 - self.alphas_cumprod)
|
||||
* (1 - self.alphas_cumprod / self.alphas_cumprod_prev)
|
||||
)
|
||||
self.register_buffer(
|
||||
'ddim_sigmas_for_original_num_steps',
|
||||
sigmas_for_original_sampling_steps,
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def sample(self,
|
||||
S,
|
||||
batch_size,
|
||||
shape,
|
||||
conditioning=None,
|
||||
callback=None,
|
||||
normals_sequence=None,
|
||||
img_callback=None,
|
||||
quantize_x0=False,
|
||||
eta=0.,
|
||||
mask=None,
|
||||
x0=None,
|
||||
temperature=1.,
|
||||
noise_dropout=0.,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
verbose=True,
|
||||
x_T=None,
|
||||
log_every_t=100,
|
||||
unconditional_guidance_scale=1.,
|
||||
unconditional_conditioning=None,
|
||||
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
||||
**kwargs
|
||||
):
|
||||
def sample(
|
||||
self,
|
||||
S,
|
||||
batch_size,
|
||||
shape,
|
||||
conditioning=None,
|
||||
callback=None,
|
||||
normals_sequence=None,
|
||||
img_callback=None,
|
||||
quantize_x0=False,
|
||||
eta=0.0,
|
||||
mask=None,
|
||||
x0=None,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
verbose=True,
|
||||
x_T=None,
|
||||
log_every_t=100,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
||||
**kwargs,
|
||||
):
|
||||
if conditioning is not None:
|
||||
if isinstance(conditioning, dict):
|
||||
cbs = conditioning[list(conditioning.keys())[0]].shape[0]
|
||||
if cbs != batch_size:
|
||||
print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
|
||||
print(
|
||||
f'Warning: Got {cbs} conditionings but batch-size is {batch_size}'
|
||||
)
|
||||
else:
|
||||
if conditioning.shape[0] != batch_size:
|
||||
print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
|
||||
print(
|
||||
f'Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}'
|
||||
)
|
||||
|
||||
self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
|
||||
# sampling
|
||||
C, H, W = shape
|
||||
size = (batch_size, C, H, W)
|
||||
# print(f'Data shape for PLMS sampling is {size}')
|
||||
# print(f'Data shape for PLMS sampling is {size}')
|
||||
|
||||
samples, intermediates = self.plms_sampling(conditioning, size,
|
||||
callback=callback,
|
||||
img_callback=img_callback,
|
||||
quantize_denoised=quantize_x0,
|
||||
mask=mask, x0=x0,
|
||||
ddim_use_original_steps=False,
|
||||
noise_dropout=noise_dropout,
|
||||
temperature=temperature,
|
||||
score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
x_T=x_T,
|
||||
log_every_t=log_every_t,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
)
|
||||
samples, intermediates = self.plms_sampling(
|
||||
conditioning,
|
||||
size,
|
||||
callback=callback,
|
||||
img_callback=img_callback,
|
||||
quantize_denoised=quantize_x0,
|
||||
mask=mask,
|
||||
x0=x0,
|
||||
ddim_use_original_steps=False,
|
||||
noise_dropout=noise_dropout,
|
||||
temperature=temperature,
|
||||
score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
x_T=x_T,
|
||||
log_every_t=log_every_t,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
)
|
||||
return samples, intermediates
|
||||
|
||||
@torch.no_grad()
|
||||
def plms_sampling(self, cond, shape,
|
||||
x_T=None, ddim_use_original_steps=False,
|
||||
callback=None, timesteps=None, quantize_denoised=False,
|
||||
mask=None, x0=None, img_callback=None, log_every_t=100,
|
||||
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
||||
unconditional_guidance_scale=1., unconditional_conditioning=None,):
|
||||
def plms_sampling(
|
||||
self,
|
||||
cond,
|
||||
shape,
|
||||
x_T=None,
|
||||
ddim_use_original_steps=False,
|
||||
callback=None,
|
||||
timesteps=None,
|
||||
quantize_denoised=False,
|
||||
mask=None,
|
||||
x0=None,
|
||||
img_callback=None,
|
||||
log_every_t=100,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
):
|
||||
device = self.model.betas.device
|
||||
b = shape[0]
|
||||
if x_T is None:
|
||||
@@ -126,42 +201,81 @@ class PLMSSampler(object):
|
||||
img = x_T
|
||||
|
||||
if timesteps is None:
|
||||
timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
|
||||
timesteps = (
|
||||
self.ddpm_num_timesteps
|
||||
if ddim_use_original_steps
|
||||
else self.ddim_timesteps
|
||||
)
|
||||
elif timesteps is not None and not ddim_use_original_steps:
|
||||
subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
|
||||
subset_end = (
|
||||
int(
|
||||
min(timesteps / self.ddim_timesteps.shape[0], 1)
|
||||
* self.ddim_timesteps.shape[0]
|
||||
)
|
||||
- 1
|
||||
)
|
||||
timesteps = self.ddim_timesteps[:subset_end]
|
||||
|
||||
intermediates = {'x_inter': [img], 'pred_x0': [img]}
|
||||
time_range = list(reversed(range(0,timesteps))) if ddim_use_original_steps else np.flip(timesteps)
|
||||
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
||||
# print(f"Running PLMS Sampling with {total_steps} timesteps")
|
||||
time_range = (
|
||||
list(reversed(range(0, timesteps)))
|
||||
if ddim_use_original_steps
|
||||
else np.flip(timesteps)
|
||||
)
|
||||
total_steps = (
|
||||
timesteps if ddim_use_original_steps else timesteps.shape[0]
|
||||
)
|
||||
# print(f"Running PLMS Sampling with {total_steps} timesteps")
|
||||
|
||||
iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps, dynamic_ncols=True)
|
||||
iterator = tqdm(
|
||||
time_range,
|
||||
desc='PLMS Sampler',
|
||||
total=total_steps,
|
||||
dynamic_ncols=True,
|
||||
)
|
||||
old_eps = []
|
||||
|
||||
for i, step in enumerate(iterator):
|
||||
index = total_steps - i - 1
|
||||
ts = torch.full((b,), step, device=device, dtype=torch.long)
|
||||
ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)
|
||||
ts_next = torch.full(
|
||||
(b,),
|
||||
time_range[min(i + 1, len(time_range) - 1)],
|
||||
device=device,
|
||||
dtype=torch.long,
|
||||
)
|
||||
|
||||
if mask is not None:
|
||||
assert x0 is not None
|
||||
img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass?
|
||||
img = img_orig * mask + (1. - mask) * img
|
||||
img_orig = self.model.q_sample(
|
||||
x0, ts
|
||||
) # TODO: deterministic forward pass?
|
||||
img = img_orig * mask + (1.0 - mask) * img
|
||||
|
||||
outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
|
||||
quantize_denoised=quantize_denoised, temperature=temperature,
|
||||
noise_dropout=noise_dropout, score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
old_eps=old_eps, t_next=ts_next)
|
||||
outs = self.p_sample_plms(
|
||||
img,
|
||||
cond,
|
||||
ts,
|
||||
index=index,
|
||||
use_original_steps=ddim_use_original_steps,
|
||||
quantize_denoised=quantize_denoised,
|
||||
temperature=temperature,
|
||||
noise_dropout=noise_dropout,
|
||||
score_corrector=score_corrector,
|
||||
corrector_kwargs=corrector_kwargs,
|
||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||
unconditional_conditioning=unconditional_conditioning,
|
||||
old_eps=old_eps,
|
||||
t_next=ts_next,
|
||||
)
|
||||
img, pred_x0, e_t = outs
|
||||
old_eps.append(e_t)
|
||||
if len(old_eps) >= 4:
|
||||
old_eps.pop(0)
|
||||
if callback: callback(i)
|
||||
if img_callback: img_callback(pred_x0, i)
|
||||
if callback:
|
||||
callback(i)
|
||||
if img_callback:
|
||||
img_callback(pred_x0, i)
|
||||
|
||||
if index % log_every_t == 0 or index == total_steps - 1:
|
||||
intermediates['x_inter'].append(img)
|
||||
@@ -170,47 +284,95 @@ class PLMSSampler(object):
|
||||
return img, intermediates
|
||||
|
||||
@torch.no_grad()
|
||||
def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
|
||||
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
||||
unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None):
|
||||
def p_sample_plms(
|
||||
self,
|
||||
x,
|
||||
c,
|
||||
t,
|
||||
index,
|
||||
repeat_noise=False,
|
||||
use_original_steps=False,
|
||||
quantize_denoised=False,
|
||||
temperature=1.0,
|
||||
noise_dropout=0.0,
|
||||
score_corrector=None,
|
||||
corrector_kwargs=None,
|
||||
unconditional_guidance_scale=1.0,
|
||||
unconditional_conditioning=None,
|
||||
old_eps=None,
|
||||
t_next=None,
|
||||
):
|
||||
b, *_, device = *x.shape, x.device
|
||||
|
||||
def get_model_output(x, t):
|
||||
if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
|
||||
if (
|
||||
unconditional_conditioning is None
|
||||
or unconditional_guidance_scale == 1.0
|
||||
):
|
||||
e_t = self.model.apply_model(x, t, c)
|
||||
else:
|
||||
x_in = torch.cat([x] * 2)
|
||||
t_in = torch.cat([t] * 2)
|
||||
c_in = torch.cat([unconditional_conditioning, c])
|
||||
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
|
||||
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
|
||||
e_t_uncond, e_t = self.model.apply_model(
|
||||
x_in, t_in, c_in
|
||||
).chunk(2)
|
||||
e_t = e_t_uncond + unconditional_guidance_scale * (
|
||||
e_t - e_t_uncond
|
||||
)
|
||||
|
||||
if score_corrector is not None:
|
||||
assert self.model.parameterization == "eps"
|
||||
e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
|
||||
assert self.model.parameterization == 'eps'
|
||||
e_t = score_corrector.modify_score(
|
||||
self.model, e_t, x, t, c, **corrector_kwargs
|
||||
)
|
||||
|
||||
return e_t
|
||||
|
||||
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
||||
alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
|
||||
sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
|
||||
sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
|
||||
alphas = (
|
||||
self.model.alphas_cumprod
|
||||
if use_original_steps
|
||||
else self.ddim_alphas
|
||||
)
|
||||
alphas_prev = (
|
||||
self.model.alphas_cumprod_prev
|
||||
if use_original_steps
|
||||
else self.ddim_alphas_prev
|
||||
)
|
||||
sqrt_one_minus_alphas = (
|
||||
self.model.sqrt_one_minus_alphas_cumprod
|
||||
if use_original_steps
|
||||
else self.ddim_sqrt_one_minus_alphas
|
||||
)
|
||||
sigmas = (
|
||||
self.model.ddim_sigmas_for_original_num_steps
|
||||
if use_original_steps
|
||||
else self.ddim_sigmas
|
||||
)
|
||||
|
||||
def get_x_prev_and_pred_x0(e_t, index):
|
||||
# select parameters corresponding to the currently considered timestep
|
||||
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
||||
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
||||
a_prev = torch.full(
|
||||
(b, 1, 1, 1), alphas_prev[index], device=device
|
||||
)
|
||||
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
||||
sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
|
||||
sqrt_one_minus_at = torch.full(
|
||||
(b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
|
||||
)
|
||||
|
||||
# current prediction for x_0
|
||||
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
||||
if quantize_denoised:
|
||||
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
||||
# direction pointing to x_t
|
||||
dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
|
||||
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
|
||||
if noise_dropout > 0.:
|
||||
dir_xt = (1.0 - a_prev - sigma_t**2).sqrt() * e_t
|
||||
noise = (
|
||||
sigma_t
|
||||
* noise_like(x.shape, device, repeat_noise)
|
||||
* temperature
|
||||
)
|
||||
if noise_dropout > 0.0:
|
||||
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
|
||||
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
||||
return x_prev, pred_x0
|
||||
@@ -229,7 +391,12 @@ class PLMSSampler(object):
|
||||
e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
|
||||
elif len(old_eps) >= 3:
|
||||
# 4nd order Pseudo Linear Multistep (Adams-Bashforth)
|
||||
e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24
|
||||
e_t_prime = (
|
||||
55 * e_t
|
||||
- 59 * old_eps[-1]
|
||||
+ 37 * old_eps[-2]
|
||||
- 9 * old_eps[-3]
|
||||
) / 24
|
||||
|
||||
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ def exists(val):
|
||||
|
||||
|
||||
def uniq(arr):
|
||||
return{el: True for el in arr}.keys()
|
||||
return {el: True for el in arr}.keys()
|
||||
|
||||
|
||||
def default(val, d):
|
||||
@@ -45,19 +45,18 @@ class GEGLU(nn.Module):
|
||||
|
||||
|
||||
class FeedForward(nn.Module):
|
||||
def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
|
||||
def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
|
||||
super().__init__()
|
||||
inner_dim = int(dim * mult)
|
||||
dim_out = default(dim_out, dim)
|
||||
project_in = nn.Sequential(
|
||||
nn.Linear(dim, inner_dim),
|
||||
nn.GELU()
|
||||
) if not glu else GEGLU(dim, inner_dim)
|
||||
project_in = (
|
||||
nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
|
||||
if not glu
|
||||
else GEGLU(dim, inner_dim)
|
||||
)
|
||||
|
||||
self.net = nn.Sequential(
|
||||
project_in,
|
||||
nn.Dropout(dropout),
|
||||
nn.Linear(inner_dim, dim_out)
|
||||
project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
@@ -74,7 +73,9 @@ def zero_module(module):
|
||||
|
||||
|
||||
def Normalize(in_channels):
|
||||
return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
|
||||
return torch.nn.GroupNorm(
|
||||
num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
|
||||
)
|
||||
|
||||
|
||||
class LinearAttention(nn.Module):
|
||||
@@ -82,17 +83,28 @@ class LinearAttention(nn.Module):
|
||||
super().__init__()
|
||||
self.heads = heads
|
||||
hidden_dim = dim_head * heads
|
||||
self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
|
||||
self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
|
||||
self.to_out = nn.Conv2d(hidden_dim, dim, 1)
|
||||
|
||||
def forward(self, x):
|
||||
b, c, h, w = x.shape
|
||||
qkv = self.to_qkv(x)
|
||||
q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
|
||||
k = k.softmax(dim=-1)
|
||||
q, k, v = rearrange(
|
||||
qkv,
|
||||
'b (qkv heads c) h w -> qkv b heads c (h w)',
|
||||
heads=self.heads,
|
||||
qkv=3,
|
||||
)
|
||||
k = k.softmax(dim=-1)
|
||||
context = torch.einsum('bhdn,bhen->bhde', k, v)
|
||||
out = torch.einsum('bhde,bhdn->bhen', context, q)
|
||||
out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
|
||||
out = rearrange(
|
||||
out,
|
||||
'b heads c (h w) -> b (heads c) h w',
|
||||
heads=self.heads,
|
||||
h=h,
|
||||
w=w,
|
||||
)
|
||||
return self.to_out(out)
|
||||
|
||||
|
||||
@@ -102,26 +114,18 @@ class SpatialSelfAttention(nn.Module):
|
||||
self.in_channels = in_channels
|
||||
|
||||
self.norm = Normalize(in_channels)
|
||||
self.q = torch.nn.Conv2d(in_channels,
|
||||
in_channels,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0)
|
||||
self.k = torch.nn.Conv2d(in_channels,
|
||||
in_channels,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0)
|
||||
self.v = torch.nn.Conv2d(in_channels,
|
||||
in_channels,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0)
|
||||
self.proj_out = torch.nn.Conv2d(in_channels,
|
||||
in_channels,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0)
|
||||
self.q = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
self.k = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
self.v = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
self.proj_out = torch.nn.Conv2d(
|
||||
in_channels, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
h_ = x
|
||||
@@ -131,12 +135,12 @@ class SpatialSelfAttention(nn.Module):
|
||||
v = self.v(h_)
|
||||
|
||||
# compute attention
|
||||
b,c,h,w = q.shape
|
||||
b, c, h, w = q.shape
|
||||
q = rearrange(q, 'b c h w -> b (h w) c')
|
||||
k = rearrange(k, 'b c h w -> b c (h w)')
|
||||
w_ = torch.einsum('bij,bjk->bik', q, k)
|
||||
|
||||
w_ = w_ * (int(c)**(-0.5))
|
||||
w_ = w_ * (int(c) ** (-0.5))
|
||||
w_ = torch.nn.functional.softmax(w_, dim=2)
|
||||
|
||||
# attend to values
|
||||
@@ -146,16 +150,18 @@ class SpatialSelfAttention(nn.Module):
|
||||
h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
|
||||
h_ = self.proj_out(h_)
|
||||
|
||||
return x+h_
|
||||
return x + h_
|
||||
|
||||
|
||||
class CrossAttention(nn.Module):
|
||||
def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
|
||||
def __init__(
|
||||
self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0
|
||||
):
|
||||
super().__init__()
|
||||
inner_dim = dim_head * heads
|
||||
context_dim = default(context_dim, query_dim)
|
||||
|
||||
self.scale = dim_head ** -0.5
|
||||
self.scale = dim_head**-0.5
|
||||
self.heads = heads
|
||||
|
||||
self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
|
||||
@@ -163,8 +169,7 @@ class CrossAttention(nn.Module):
|
||||
self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
|
||||
|
||||
self.to_out = nn.Sequential(
|
||||
nn.Linear(inner_dim, query_dim),
|
||||
nn.Dropout(dropout)
|
||||
nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
|
||||
)
|
||||
|
||||
def forward(self, x, context=None, mask=None):
|
||||
@@ -175,7 +180,9 @@ class CrossAttention(nn.Module):
|
||||
k = self.to_k(context)
|
||||
v = self.to_v(context)
|
||||
|
||||
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
|
||||
q, k, v = map(
|
||||
lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v)
|
||||
)
|
||||
|
||||
sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
|
||||
|
||||
@@ -194,21 +201,40 @@ class CrossAttention(nn.Module):
|
||||
|
||||
|
||||
class BasicTransformerBlock(nn.Module):
|
||||
def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True):
|
||||
def __init__(
|
||||
self,
|
||||
dim,
|
||||
n_heads,
|
||||
d_head,
|
||||
dropout=0.0,
|
||||
context_dim=None,
|
||||
gated_ff=True,
|
||||
checkpoint=True,
|
||||
):
|
||||
super().__init__()
|
||||
self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout) # is a self-attention
|
||||
self.attn1 = CrossAttention(
|
||||
query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
|
||||
) # is a self-attention
|
||||
self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
|
||||
self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim,
|
||||
heads=n_heads, dim_head=d_head, dropout=dropout) # is self-attn if context is none
|
||||
self.attn2 = CrossAttention(
|
||||
query_dim=dim,
|
||||
context_dim=context_dim,
|
||||
heads=n_heads,
|
||||
dim_head=d_head,
|
||||
dropout=dropout,
|
||||
) # is self-attn if context is none
|
||||
self.norm1 = nn.LayerNorm(dim)
|
||||
self.norm2 = nn.LayerNorm(dim)
|
||||
self.norm3 = nn.LayerNorm(dim)
|
||||
self.checkpoint = checkpoint
|
||||
|
||||
def forward(self, x, context=None):
|
||||
return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
|
||||
return checkpoint(
|
||||
self._forward, (x, context), self.parameters(), self.checkpoint
|
||||
)
|
||||
|
||||
def _forward(self, x, context=None):
|
||||
x = x.contiguous() if x.device.type == 'mps' else x
|
||||
x = self.attn1(self.norm1(x)) + x
|
||||
x = self.attn2(self.norm2(x), context=context) + x
|
||||
x = self.ff(self.norm3(x)) + x
|
||||
@@ -223,29 +249,43 @@ class SpatialTransformer(nn.Module):
|
||||
Then apply standard transformer action.
|
||||
Finally, reshape to image
|
||||
"""
|
||||
def __init__(self, in_channels, n_heads, d_head,
|
||||
depth=1, dropout=0., context_dim=None):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_channels,
|
||||
n_heads,
|
||||
d_head,
|
||||
depth=1,
|
||||
dropout=0.0,
|
||||
context_dim=None,
|
||||
):
|
||||
super().__init__()
|
||||
self.in_channels = in_channels
|
||||
inner_dim = n_heads * d_head
|
||||
self.norm = Normalize(in_channels)
|
||||
|
||||
self.proj_in = nn.Conv2d(in_channels,
|
||||
inner_dim,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0)
|
||||
|
||||
self.transformer_blocks = nn.ModuleList(
|
||||
[BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
|
||||
for d in range(depth)]
|
||||
self.proj_in = nn.Conv2d(
|
||||
in_channels, inner_dim, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
|
||||
self.proj_out = zero_module(nn.Conv2d(inner_dim,
|
||||
in_channels,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0))
|
||||
self.transformer_blocks = nn.ModuleList(
|
||||
[
|
||||
BasicTransformerBlock(
|
||||
inner_dim,
|
||||
n_heads,
|
||||
d_head,
|
||||
dropout=dropout,
|
||||
context_dim=context_dim,
|
||||
)
|
||||
for d in range(depth)
|
||||
]
|
||||
)
|
||||
|
||||
self.proj_out = zero_module(
|
||||
nn.Conv2d(
|
||||
inner_dim, in_channels, kernel_size=1, stride=1, padding=0
|
||||
)
|
||||
)
|
||||
|
||||
def forward(self, x, context=None):
|
||||
# note: if no context is given, cross-attention defaults to self-attention
|
||||
@@ -258,4 +298,4 @@ class SpatialTransformer(nn.Module):
|
||||
x = block(x, context=context)
|
||||
x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
|
||||
x = self.proj_out(x)
|
||||
return x + x_in
|
||||
return x + x_in
|
||||
|
||||
@@ -24,6 +24,7 @@ from ldm.modules.attention import SpatialTransformer
|
||||
def convert_module_to_f16(x):
|
||||
pass
|
||||
|
||||
|
||||
def convert_module_to_f32(x):
|
||||
pass
|
||||
|
||||
@@ -42,7 +43,9 @@ class AttentionPool2d(nn.Module):
|
||||
output_dim: int = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5)
|
||||
self.positional_embedding = nn.Parameter(
|
||||
th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5
|
||||
)
|
||||
self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
|
||||
self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
|
||||
self.num_heads = embed_dim // num_heads_channels
|
||||
@@ -97,37 +100,45 @@ class Upsample(nn.Module):
|
||||
upsampling occurs in the inner-two dimensions.
|
||||
"""
|
||||
|
||||
def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
|
||||
def __init__(
|
||||
self, channels, use_conv, dims=2, out_channels=None, padding=1
|
||||
):
|
||||
super().__init__()
|
||||
self.channels = channels
|
||||
self.out_channels = out_channels or channels
|
||||
self.use_conv = use_conv
|
||||
self.dims = dims
|
||||
if use_conv:
|
||||
self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)
|
||||
self.conv = conv_nd(
|
||||
dims, self.channels, self.out_channels, 3, padding=padding
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
assert x.shape[1] == self.channels
|
||||
if self.dims == 3:
|
||||
x = F.interpolate(
|
||||
x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
|
||||
x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode='nearest'
|
||||
)
|
||||
else:
|
||||
x = F.interpolate(x, scale_factor=2, mode="nearest")
|
||||
x = F.interpolate(x, scale_factor=2, mode='nearest')
|
||||
if self.use_conv:
|
||||
x = self.conv(x)
|
||||
return x
|
||||
|
||||
|
||||
class TransposedUpsample(nn.Module):
|
||||
'Learned 2x upsampling without padding'
|
||||
"""Learned 2x upsampling without padding"""
|
||||
|
||||
def __init__(self, channels, out_channels=None, ks=5):
|
||||
super().__init__()
|
||||
self.channels = channels
|
||||
self.out_channels = out_channels or channels
|
||||
|
||||
self.up = nn.ConvTranspose2d(self.channels,self.out_channels,kernel_size=ks,stride=2)
|
||||
self.up = nn.ConvTranspose2d(
|
||||
self.channels, self.out_channels, kernel_size=ks, stride=2
|
||||
)
|
||||
|
||||
def forward(self,x):
|
||||
def forward(self, x):
|
||||
return self.up(x)
|
||||
|
||||
|
||||
@@ -140,7 +151,9 @@ class Downsample(nn.Module):
|
||||
downsampling occurs in the inner-two dimensions.
|
||||
"""
|
||||
|
||||
def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1):
|
||||
def __init__(
|
||||
self, channels, use_conv, dims=2, out_channels=None, padding=1
|
||||
):
|
||||
super().__init__()
|
||||
self.channels = channels
|
||||
self.out_channels = out_channels or channels
|
||||
@@ -149,7 +162,12 @@ class Downsample(nn.Module):
|
||||
stride = 2 if dims != 3 else (1, 2, 2)
|
||||
if use_conv:
|
||||
self.op = conv_nd(
|
||||
dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
|
||||
dims,
|
||||
self.channels,
|
||||
self.out_channels,
|
||||
3,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
)
|
||||
else:
|
||||
assert self.channels == self.out_channels
|
||||
@@ -219,7 +237,9 @@ class ResBlock(TimestepBlock):
|
||||
nn.SiLU(),
|
||||
linear(
|
||||
emb_channels,
|
||||
2 * self.out_channels if use_scale_shift_norm else self.out_channels,
|
||||
2 * self.out_channels
|
||||
if use_scale_shift_norm
|
||||
else self.out_channels,
|
||||
),
|
||||
)
|
||||
self.out_layers = nn.Sequential(
|
||||
@@ -227,7 +247,9 @@ class ResBlock(TimestepBlock):
|
||||
nn.SiLU(),
|
||||
nn.Dropout(p=dropout),
|
||||
zero_module(
|
||||
conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
|
||||
conv_nd(
|
||||
dims, self.out_channels, self.out_channels, 3, padding=1
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
@@ -238,7 +260,9 @@ class ResBlock(TimestepBlock):
|
||||
dims, channels, self.out_channels, 3, padding=1
|
||||
)
|
||||
else:
|
||||
self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
|
||||
self.skip_connection = conv_nd(
|
||||
dims, channels, self.out_channels, 1
|
||||
)
|
||||
|
||||
def forward(self, x, emb):
|
||||
"""
|
||||
@@ -251,7 +275,6 @@ class ResBlock(TimestepBlock):
|
||||
self._forward, (x, emb), self.parameters(), self.use_checkpoint
|
||||
)
|
||||
|
||||
|
||||
def _forward(self, x, emb):
|
||||
if self.updown:
|
||||
in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
|
||||
@@ -297,7 +320,7 @@ class AttentionBlock(nn.Module):
|
||||
else:
|
||||
assert (
|
||||
channels % num_head_channels == 0
|
||||
), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
|
||||
), f'q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}'
|
||||
self.num_heads = channels // num_head_channels
|
||||
self.use_checkpoint = use_checkpoint
|
||||
self.norm = normalization(channels)
|
||||
@@ -312,8 +335,10 @@ class AttentionBlock(nn.Module):
|
||||
self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
|
||||
|
||||
def forward(self, x):
|
||||
return checkpoint(self._forward, (x,), self.parameters(), True) # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
|
||||
#return pt_checkpoint(self._forward, x) # pytorch
|
||||
return checkpoint(
|
||||
self._forward, (x,), self.parameters(), True
|
||||
) # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
|
||||
# return pt_checkpoint(self._forward, x) # pytorch
|
||||
|
||||
def _forward(self, x):
|
||||
b, c, *spatial = x.shape
|
||||
@@ -340,7 +365,7 @@ def count_flops_attn(model, _x, y):
|
||||
# We perform two matmuls with the same number of ops.
|
||||
# The first computes the weight matrix, the second computes
|
||||
# the combination of the value vectors.
|
||||
matmul_ops = 2 * b * (num_spatial ** 2) * c
|
||||
matmul_ops = 2 * b * (num_spatial**2) * c
|
||||
model.total_ops += th.DoubleTensor([matmul_ops])
|
||||
|
||||
|
||||
@@ -362,13 +387,15 @@ class QKVAttentionLegacy(nn.Module):
|
||||
bs, width, length = qkv.shape
|
||||
assert width % (3 * self.n_heads) == 0
|
||||
ch = width // (3 * self.n_heads)
|
||||
q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
|
||||
q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(
|
||||
ch, dim=1
|
||||
)
|
||||
scale = 1 / math.sqrt(math.sqrt(ch))
|
||||
weight = th.einsum(
|
||||
"bct,bcs->bts", q * scale, k * scale
|
||||
'bct,bcs->bts', q * scale, k * scale
|
||||
) # More stable with f16 than dividing afterwards
|
||||
weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
|
||||
a = th.einsum("bts,bcs->bct", weight, v)
|
||||
a = th.einsum('bts,bcs->bct', weight, v)
|
||||
return a.reshape(bs, -1, length)
|
||||
|
||||
@staticmethod
|
||||
@@ -397,12 +424,14 @@ class QKVAttention(nn.Module):
|
||||
q, k, v = qkv.chunk(3, dim=1)
|
||||
scale = 1 / math.sqrt(math.sqrt(ch))
|
||||
weight = th.einsum(
|
||||
"bct,bcs->bts",
|
||||
'bct,bcs->bts',
|
||||
(q * scale).view(bs * self.n_heads, ch, length),
|
||||
(k * scale).view(bs * self.n_heads, ch, length),
|
||||
) # More stable with f16 than dividing afterwards
|
||||
weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
|
||||
a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
|
||||
a = th.einsum(
|
||||
'bts,bcs->bct', weight, v.reshape(bs * self.n_heads, ch, length)
|
||||
)
|
||||
return a.reshape(bs, -1, length)
|
||||
|
||||
@staticmethod
|
||||
@@ -461,19 +490,24 @@ class UNetModel(nn.Module):
|
||||
use_scale_shift_norm=False,
|
||||
resblock_updown=False,
|
||||
use_new_attention_order=False,
|
||||
use_spatial_transformer=False, # custom transformer support
|
||||
transformer_depth=1, # custom transformer support
|
||||
context_dim=None, # custom transformer support
|
||||
n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
|
||||
use_spatial_transformer=False, # custom transformer support
|
||||
transformer_depth=1, # custom transformer support
|
||||
context_dim=None, # custom transformer support
|
||||
n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
|
||||
legacy=True,
|
||||
):
|
||||
super().__init__()
|
||||
if use_spatial_transformer:
|
||||
assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
|
||||
assert (
|
||||
context_dim is not None
|
||||
), 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
|
||||
|
||||
if context_dim is not None:
|
||||
assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
|
||||
assert (
|
||||
use_spatial_transformer
|
||||
), 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
|
||||
from omegaconf.listconfig import ListConfig
|
||||
|
||||
if type(context_dim) == ListConfig:
|
||||
context_dim = list(context_dim)
|
||||
|
||||
@@ -481,10 +515,14 @@ class UNetModel(nn.Module):
|
||||
num_heads_upsample = num_heads
|
||||
|
||||
if num_heads == -1:
|
||||
assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
|
||||
assert (
|
||||
num_head_channels != -1
|
||||
), 'Either num_heads or num_head_channels has to be set'
|
||||
|
||||
if num_head_channels == -1:
|
||||
assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
|
||||
assert (
|
||||
num_heads != -1
|
||||
), 'Either num_heads or num_head_channels has to be set'
|
||||
|
||||
self.image_size = image_size
|
||||
self.in_channels = in_channels
|
||||
@@ -545,8 +583,12 @@ class UNetModel(nn.Module):
|
||||
num_heads = ch // num_head_channels
|
||||
dim_head = num_head_channels
|
||||
if legacy:
|
||||
#num_heads = 1
|
||||
dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
|
||||
# num_heads = 1
|
||||
dim_head = (
|
||||
ch // num_heads
|
||||
if use_spatial_transformer
|
||||
else num_head_channels
|
||||
)
|
||||
layers.append(
|
||||
AttentionBlock(
|
||||
ch,
|
||||
@@ -554,8 +596,14 @@ class UNetModel(nn.Module):
|
||||
num_heads=num_heads,
|
||||
num_head_channels=dim_head,
|
||||
use_new_attention_order=use_new_attention_order,
|
||||
) if not use_spatial_transformer else SpatialTransformer(
|
||||
ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
|
||||
)
|
||||
if not use_spatial_transformer
|
||||
else SpatialTransformer(
|
||||
ch,
|
||||
num_heads,
|
||||
dim_head,
|
||||
depth=transformer_depth,
|
||||
context_dim=context_dim,
|
||||
)
|
||||
)
|
||||
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
||||
@@ -592,8 +640,12 @@ class UNetModel(nn.Module):
|
||||
num_heads = ch // num_head_channels
|
||||
dim_head = num_head_channels
|
||||
if legacy:
|
||||
#num_heads = 1
|
||||
dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
|
||||
# num_heads = 1
|
||||
dim_head = (
|
||||
ch // num_heads
|
||||
if use_spatial_transformer
|
||||
else num_head_channels
|
||||
)
|
||||
self.middle_block = TimestepEmbedSequential(
|
||||
ResBlock(
|
||||
ch,
|
||||
@@ -609,9 +661,15 @@ class UNetModel(nn.Module):
|
||||
num_heads=num_heads,
|
||||
num_head_channels=dim_head,
|
||||
use_new_attention_order=use_new_attention_order,
|
||||
) if not use_spatial_transformer else SpatialTransformer(
|
||||
ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
|
||||
),
|
||||
)
|
||||
if not use_spatial_transformer
|
||||
else SpatialTransformer(
|
||||
ch,
|
||||
num_heads,
|
||||
dim_head,
|
||||
depth=transformer_depth,
|
||||
context_dim=context_dim,
|
||||
),
|
||||
ResBlock(
|
||||
ch,
|
||||
time_embed_dim,
|
||||
@@ -646,8 +704,12 @@ class UNetModel(nn.Module):
|
||||
num_heads = ch // num_head_channels
|
||||
dim_head = num_head_channels
|
||||
if legacy:
|
||||
#num_heads = 1
|
||||
dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
|
||||
# num_heads = 1
|
||||
dim_head = (
|
||||
ch // num_heads
|
||||
if use_spatial_transformer
|
||||
else num_head_channels
|
||||
)
|
||||
layers.append(
|
||||
AttentionBlock(
|
||||
ch,
|
||||
@@ -655,8 +717,14 @@ class UNetModel(nn.Module):
|
||||
num_heads=num_heads_upsample,
|
||||
num_head_channels=dim_head,
|
||||
use_new_attention_order=use_new_attention_order,
|
||||
) if not use_spatial_transformer else SpatialTransformer(
|
||||
ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
|
||||
)
|
||||
if not use_spatial_transformer
|
||||
else SpatialTransformer(
|
||||
ch,
|
||||
num_heads,
|
||||
dim_head,
|
||||
depth=transformer_depth,
|
||||
context_dim=context_dim,
|
||||
)
|
||||
)
|
||||
if level and i == num_res_blocks:
|
||||
@@ -673,7 +741,9 @@ class UNetModel(nn.Module):
|
||||
up=True,
|
||||
)
|
||||
if resblock_updown
|
||||
else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
|
||||
else Upsample(
|
||||
ch, conv_resample, dims=dims, out_channels=out_ch
|
||||
)
|
||||
)
|
||||
ds //= 2
|
||||
self.output_blocks.append(TimestepEmbedSequential(*layers))
|
||||
@@ -682,14 +752,16 @@ class UNetModel(nn.Module):
|
||||
self.out = nn.Sequential(
|
||||
normalization(ch),
|
||||
nn.SiLU(),
|
||||
zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
|
||||
zero_module(
|
||||
conv_nd(dims, model_channels, out_channels, 3, padding=1)
|
||||
),
|
||||
)
|
||||
if self.predict_codebook_ids:
|
||||
self.id_predictor = nn.Sequential(
|
||||
normalization(ch),
|
||||
conv_nd(dims, model_channels, n_embed, 1),
|
||||
#nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits
|
||||
)
|
||||
normalization(ch),
|
||||
conv_nd(dims, model_channels, n_embed, 1),
|
||||
# nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits
|
||||
)
|
||||
|
||||
def convert_to_fp16(self):
|
||||
"""
|
||||
@@ -707,7 +779,7 @@ class UNetModel(nn.Module):
|
||||
self.middle_block.apply(convert_module_to_f32)
|
||||
self.output_blocks.apply(convert_module_to_f32)
|
||||
|
||||
def forward(self, x, timesteps=None, context=None, y=None,**kwargs):
|
||||
def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
|
||||
"""
|
||||
Apply the model to an input batch.
|
||||
:param x: an [N x C x ...] Tensor of inputs.
|
||||
@@ -718,9 +790,11 @@ class UNetModel(nn.Module):
|
||||
"""
|
||||
assert (y is not None) == (
|
||||
self.num_classes is not None
|
||||
), "must specify y if and only if the model is class-conditional"
|
||||
), 'must specify y if and only if the model is class-conditional'
|
||||
hs = []
|
||||
t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
|
||||
t_emb = timestep_embedding(
|
||||
timesteps, self.model_channels, repeat_only=False
|
||||
)
|
||||
emb = self.time_embed(t_emb)
|
||||
|
||||
if self.num_classes is not None:
|
||||
@@ -768,9 +842,9 @@ class EncoderUNetModel(nn.Module):
|
||||
use_scale_shift_norm=False,
|
||||
resblock_updown=False,
|
||||
use_new_attention_order=False,
|
||||
pool="adaptive",
|
||||
pool='adaptive',
|
||||
*args,
|
||||
**kwargs
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
@@ -888,7 +962,7 @@ class EncoderUNetModel(nn.Module):
|
||||
)
|
||||
self._feature_size += ch
|
||||
self.pool = pool
|
||||
if pool == "adaptive":
|
||||
if pool == 'adaptive':
|
||||
self.out = nn.Sequential(
|
||||
normalization(ch),
|
||||
nn.SiLU(),
|
||||
@@ -896,7 +970,7 @@ class EncoderUNetModel(nn.Module):
|
||||
zero_module(conv_nd(dims, ch, out_channels, 1)),
|
||||
nn.Flatten(),
|
||||
)
|
||||
elif pool == "attention":
|
||||
elif pool == 'attention':
|
||||
assert num_head_channels != -1
|
||||
self.out = nn.Sequential(
|
||||
normalization(ch),
|
||||
@@ -905,13 +979,13 @@ class EncoderUNetModel(nn.Module):
|
||||
(image_size // ds), ch, num_head_channels, out_channels
|
||||
),
|
||||
)
|
||||
elif pool == "spatial":
|
||||
elif pool == 'spatial':
|
||||
self.out = nn.Sequential(
|
||||
nn.Linear(self._feature_size, 2048),
|
||||
nn.ReLU(),
|
||||
nn.Linear(2048, self.out_channels),
|
||||
)
|
||||
elif pool == "spatial_v2":
|
||||
elif pool == 'spatial_v2':
|
||||
self.out = nn.Sequential(
|
||||
nn.Linear(self._feature_size, 2048),
|
||||
normalization(2048),
|
||||
@@ -919,7 +993,7 @@ class EncoderUNetModel(nn.Module):
|
||||
nn.Linear(2048, self.out_channels),
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError(f"Unexpected {pool} pooling")
|
||||
raise NotImplementedError(f'Unexpected {pool} pooling')
|
||||
|
||||
def convert_to_fp16(self):
|
||||
"""
|
||||
@@ -942,20 +1016,21 @@ class EncoderUNetModel(nn.Module):
|
||||
:param timesteps: a 1-D batch of timesteps.
|
||||
:return: an [N x K] Tensor of outputs.
|
||||
"""
|
||||
emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
|
||||
emb = self.time_embed(
|
||||
timestep_embedding(timesteps, self.model_channels)
|
||||
)
|
||||
|
||||
results = []
|
||||
h = x.type(self.dtype)
|
||||
for module in self.input_blocks:
|
||||
h = module(h, emb)
|
||||
if self.pool.startswith("spatial"):
|
||||
if self.pool.startswith('spatial'):
|
||||
results.append(h.type(x.dtype).mean(dim=(2, 3)))
|
||||
h = self.middle_block(h, emb)
|
||||
if self.pool.startswith("spatial"):
|
||||
if self.pool.startswith('spatial'):
|
||||
results.append(h.type(x.dtype).mean(dim=(2, 3)))
|
||||
h = th.cat(results, axis=-1)
|
||||
return self.out(h)
|
||||
else:
|
||||
h = h.type(x.dtype)
|
||||
return self.out(h)
|
||||
|
||||
|
||||
@@ -18,15 +18,24 @@ from einops import repeat
|
||||
from ldm.util import instantiate_from_config
|
||||
|
||||
|
||||
def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
|
||||
if schedule == "linear":
|
||||
def make_beta_schedule(
|
||||
schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3
|
||||
):
|
||||
if schedule == 'linear':
|
||||
betas = (
|
||||
torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
|
||||
torch.linspace(
|
||||
linear_start**0.5,
|
||||
linear_end**0.5,
|
||||
n_timestep,
|
||||
dtype=torch.float64,
|
||||
)
|
||||
** 2
|
||||
)
|
||||
|
||||
elif schedule == "cosine":
|
||||
elif schedule == 'cosine':
|
||||
timesteps = (
|
||||
torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
|
||||
torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep
|
||||
+ cosine_s
|
||||
)
|
||||
alphas = timesteps / (1 + cosine_s) * np.pi / 2
|
||||
alphas = torch.cos(alphas).pow(2)
|
||||
@@ -34,23 +43,41 @@ def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2,
|
||||
betas = 1 - alphas[1:] / alphas[:-1]
|
||||
betas = np.clip(betas, a_min=0, a_max=0.999)
|
||||
|
||||
elif schedule == "sqrt_linear":
|
||||
betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
|
||||
elif schedule == "sqrt":
|
||||
betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
|
||||
elif schedule == 'sqrt_linear':
|
||||
betas = torch.linspace(
|
||||
linear_start, linear_end, n_timestep, dtype=torch.float64
|
||||
)
|
||||
elif schedule == 'sqrt':
|
||||
betas = (
|
||||
torch.linspace(
|
||||
linear_start, linear_end, n_timestep, dtype=torch.float64
|
||||
)
|
||||
** 0.5
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"schedule '{schedule}' unknown.")
|
||||
return betas.numpy()
|
||||
|
||||
|
||||
def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
|
||||
def make_ddim_timesteps(
|
||||
ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True
|
||||
):
|
||||
if ddim_discr_method == 'uniform':
|
||||
c = num_ddpm_timesteps // num_ddim_timesteps
|
||||
ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
|
||||
elif ddim_discr_method == 'quad':
|
||||
ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int)
|
||||
ddim_timesteps = (
|
||||
(
|
||||
np.linspace(
|
||||
0, np.sqrt(num_ddpm_timesteps * 0.8), num_ddim_timesteps
|
||||
)
|
||||
)
|
||||
** 2
|
||||
).astype(int)
|
||||
else:
|
||||
raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')
|
||||
raise NotImplementedError(
|
||||
f'There is no ddim discretization method called "{ddim_discr_method}"'
|
||||
)
|
||||
|
||||
# assert ddim_timesteps.shape[0] == num_ddim_timesteps
|
||||
# add one to get the final alpha values right (the ones from first scale to data during sampling)
|
||||
@@ -60,17 +87,27 @@ def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timestep
|
||||
return steps_out
|
||||
|
||||
|
||||
def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
|
||||
def make_ddim_sampling_parameters(
|
||||
alphacums, ddim_timesteps, eta, verbose=True
|
||||
):
|
||||
# select alphas for computing the variance schedule
|
||||
alphas = alphacums[ddim_timesteps]
|
||||
alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
|
||||
alphas_prev = np.asarray(
|
||||
[alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist()
|
||||
)
|
||||
|
||||
# according the the formula provided in https://arxiv.org/abs/2010.02502
|
||||
sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
|
||||
sigmas = eta * np.sqrt(
|
||||
(1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)
|
||||
)
|
||||
if verbose:
|
||||
print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
|
||||
print(f'For the chosen value of eta, which is {eta}, '
|
||||
f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
|
||||
print(
|
||||
f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}'
|
||||
)
|
||||
print(
|
||||
f'For the chosen value of eta, which is {eta}, '
|
||||
f'this results in the following sigma_t schedule for ddim sampler {sigmas}'
|
||||
)
|
||||
return sigmas, alphas, alphas_prev
|
||||
|
||||
|
||||
@@ -109,7 +146,9 @@ def checkpoint(func, inputs, params, flag):
|
||||
explicitly take as arguments.
|
||||
:param flag: if False, disable gradient checkpointing.
|
||||
"""
|
||||
if flag:
|
||||
if (
|
||||
False
|
||||
): # disabled checkpointing to allow requires_grad = False for main model
|
||||
args = tuple(inputs) + tuple(params)
|
||||
return CheckpointFunction.apply(func, len(inputs), *args)
|
||||
else:
|
||||
@@ -129,7 +168,9 @@ class CheckpointFunction(torch.autograd.Function):
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, *output_grads):
|
||||
ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
|
||||
ctx.input_tensors = [
|
||||
x.detach().requires_grad_(True) for x in ctx.input_tensors
|
||||
]
|
||||
with torch.enable_grad():
|
||||
# Fixes a bug where the first op in run_function modifies the
|
||||
# Tensor storage in place, which is not allowed for detach()'d
|
||||
@@ -160,12 +201,16 @@ def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
|
||||
if not repeat_only:
|
||||
half = dim // 2
|
||||
freqs = torch.exp(
|
||||
-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
|
||||
-math.log(max_period)
|
||||
* torch.arange(start=0, end=half, dtype=torch.float32)
|
||||
/ half
|
||||
).to(device=timesteps.device)
|
||||
args = timesteps[:, None].float() * freqs[None]
|
||||
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
|
||||
if dim % 2:
|
||||
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
|
||||
embedding = torch.cat(
|
||||
[embedding, torch.zeros_like(embedding[:, :1])], dim=-1
|
||||
)
|
||||
else:
|
||||
embedding = repeat(timesteps, 'b -> b d', d=dim)
|
||||
return embedding
|
||||
@@ -215,6 +260,7 @@ class GroupNorm32(nn.GroupNorm):
|
||||
def forward(self, x):
|
||||
return super().forward(x.float()).type(x.dtype)
|
||||
|
||||
|
||||
def conv_nd(dims, *args, **kwargs):
|
||||
"""
|
||||
Create a 1D, 2D, or 3D convolution module.
|
||||
@@ -225,7 +271,7 @@ def conv_nd(dims, *args, **kwargs):
|
||||
return nn.Conv2d(*args, **kwargs)
|
||||
elif dims == 3:
|
||||
return nn.Conv3d(*args, **kwargs)
|
||||
raise ValueError(f"unsupported dimensions: {dims}")
|
||||
raise ValueError(f'unsupported dimensions: {dims}')
|
||||
|
||||
|
||||
def linear(*args, **kwargs):
|
||||
@@ -245,15 +291,16 @@ def avg_pool_nd(dims, *args, **kwargs):
|
||||
return nn.AvgPool2d(*args, **kwargs)
|
||||
elif dims == 3:
|
||||
return nn.AvgPool3d(*args, **kwargs)
|
||||
raise ValueError(f"unsupported dimensions: {dims}")
|
||||
raise ValueError(f'unsupported dimensions: {dims}')
|
||||
|
||||
|
||||
class HybridConditioner(nn.Module):
|
||||
|
||||
def __init__(self, c_concat_config, c_crossattn_config):
|
||||
super().__init__()
|
||||
self.concat_conditioner = instantiate_from_config(c_concat_config)
|
||||
self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
|
||||
self.crossattn_conditioner = instantiate_from_config(
|
||||
c_crossattn_config
|
||||
)
|
||||
|
||||
def forward(self, c_concat, c_crossattn):
|
||||
c_concat = self.concat_conditioner(c_concat)
|
||||
@@ -262,6 +309,8 @@ class HybridConditioner(nn.Module):
|
||||
|
||||
|
||||
def noise_like(shape, device, repeat=False):
|
||||
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
|
||||
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(
|
||||
shape[0], *((1,) * (len(shape) - 1))
|
||||
)
|
||||
noise = lambda: torch.randn(shape, device=device)
|
||||
return repeat_noise() if repeat else noise()
|
||||
return repeat_noise() if repeat else noise()
|
||||
|
||||
@@ -30,33 +30,45 @@ class DiagonalGaussianDistribution(object):
|
||||
self.std = torch.exp(0.5 * self.logvar)
|
||||
self.var = torch.exp(self.logvar)
|
||||
if self.deterministic:
|
||||
self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)
|
||||
self.var = self.std = torch.zeros_like(self.mean).to(
|
||||
device=self.parameters.device
|
||||
)
|
||||
|
||||
def sample(self):
|
||||
x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
|
||||
x = self.mean + self.std * torch.randn(self.mean.shape).to(
|
||||
device=self.parameters.device
|
||||
)
|
||||
return x
|
||||
|
||||
def kl(self, other=None):
|
||||
if self.deterministic:
|
||||
return torch.Tensor([0.])
|
||||
return torch.Tensor([0.0])
|
||||
else:
|
||||
if other is None:
|
||||
return 0.5 * torch.sum(torch.pow(self.mean, 2)
|
||||
+ self.var - 1.0 - self.logvar,
|
||||
dim=[1, 2, 3])
|
||||
return 0.5 * torch.sum(
|
||||
torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
|
||||
dim=[1, 2, 3],
|
||||
)
|
||||
else:
|
||||
return 0.5 * torch.sum(
|
||||
torch.pow(self.mean - other.mean, 2) / other.var
|
||||
+ self.var / other.var - 1.0 - self.logvar + other.logvar,
|
||||
dim=[1, 2, 3])
|
||||
+ self.var / other.var
|
||||
- 1.0
|
||||
- self.logvar
|
||||
+ other.logvar,
|
||||
dim=[1, 2, 3],
|
||||
)
|
||||
|
||||
def nll(self, sample, dims=[1,2,3]):
|
||||
def nll(self, sample, dims=[1, 2, 3]):
|
||||
if self.deterministic:
|
||||
return torch.Tensor([0.])
|
||||
return torch.Tensor([0.0])
|
||||
logtwopi = np.log(2.0 * np.pi)
|
||||
return 0.5 * torch.sum(
|
||||
logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
|
||||
dim=dims)
|
||||
logtwopi
|
||||
+ self.logvar
|
||||
+ torch.pow(sample - self.mean, 2) / self.var,
|
||||
dim=dims,
|
||||
)
|
||||
|
||||
def mode(self):
|
||||
return self.mean
|
||||
@@ -74,7 +86,7 @@ def normal_kl(mean1, logvar1, mean2, logvar2):
|
||||
if isinstance(obj, torch.Tensor):
|
||||
tensor = obj
|
||||
break
|
||||
assert tensor is not None, "at least one argument must be a Tensor"
|
||||
assert tensor is not None, 'at least one argument must be a Tensor'
|
||||
|
||||
# Force variances to be Tensors. Broadcasting helps convert scalars to
|
||||
# Tensors, but it does not work for torch.exp().
|
||||
|
||||
@@ -10,24 +10,30 @@ class LitEma(nn.Module):
|
||||
|
||||
self.m_name2s_name = {}
|
||||
self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
|
||||
self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates
|
||||
else torch.tensor(-1,dtype=torch.int))
|
||||
self.register_buffer(
|
||||
'num_updates',
|
||||
torch.tensor(0, dtype=torch.int)
|
||||
if use_num_upates
|
||||
else torch.tensor(-1, dtype=torch.int),
|
||||
)
|
||||
|
||||
for name, p in model.named_parameters():
|
||||
if p.requires_grad:
|
||||
#remove as '.'-character is not allowed in buffers
|
||||
s_name = name.replace('.','')
|
||||
self.m_name2s_name.update({name:s_name})
|
||||
self.register_buffer(s_name,p.clone().detach().data)
|
||||
# remove as '.'-character is not allowed in buffers
|
||||
s_name = name.replace('.', '')
|
||||
self.m_name2s_name.update({name: s_name})
|
||||
self.register_buffer(s_name, p.clone().detach().data)
|
||||
|
||||
self.collected_params = []
|
||||
|
||||
def forward(self,model):
|
||||
def forward(self, model):
|
||||
decay = self.decay
|
||||
|
||||
if self.num_updates >= 0:
|
||||
self.num_updates += 1
|
||||
decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates))
|
||||
decay = min(
|
||||
self.decay, (1 + self.num_updates) / (10 + self.num_updates)
|
||||
)
|
||||
|
||||
one_minus_decay = 1.0 - decay
|
||||
|
||||
@@ -38,8 +44,12 @@ class LitEma(nn.Module):
|
||||
for key in m_param:
|
||||
if m_param[key].requires_grad:
|
||||
sname = self.m_name2s_name[key]
|
||||
shadow_params[sname] = shadow_params[sname].type_as(m_param[key])
|
||||
shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key]))
|
||||
shadow_params[sname] = shadow_params[sname].type_as(
|
||||
m_param[key]
|
||||
)
|
||||
shadow_params[sname].sub_(
|
||||
one_minus_decay * (shadow_params[sname] - m_param[key])
|
||||
)
|
||||
else:
|
||||
assert not key in self.m_name2s_name
|
||||
|
||||
@@ -48,7 +58,9 @@ class LitEma(nn.Module):
|
||||
shadow_params = dict(self.named_buffers())
|
||||
for key in m_param:
|
||||
if m_param[key].requires_grad:
|
||||
m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data)
|
||||
m_param[key].data.copy_(
|
||||
shadow_params[self.m_name2s_name[key]].data
|
||||
)
|
||||
else:
|
||||
assert not key in self.m_name2s_name
|
||||
|
||||
|
||||
255
ldm/modules/embedding_manager.py
Normal file
@@ -0,0 +1,255 @@
|
||||
from cmath import log
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
import sys
|
||||
|
||||
from ldm.data.personalized import per_img_token_list
|
||||
from transformers import CLIPTokenizer
|
||||
from functools import partial
|
||||
|
||||
DEFAULT_PLACEHOLDER_TOKEN = ['*']
|
||||
|
||||
PROGRESSIVE_SCALE = 2000
|
||||
|
||||
|
||||
def get_clip_token_for_string(
    tokenizer, string, max_length=77, end_token_id=49407
):
    """
    Return the single token id that `string` maps to under a CLIP-style tokenizer.

    The tokenizer is called with truncation and padding to `max_length`, and the
    function then checks that exactly two positions of the encoded batch differ
    from `end_token_id`. With the default id this corresponds to a start token
    followed by one content token, the remainder being end-of-text/padding.

    :param tokenizer: callable returning a mapping with an 'input_ids' tensor.
    :param string: the text to encode; must encode to a single token.
    :param max_length: padded/truncated sequence length (default 77, the value
                       that was previously hard-coded).
    :param end_token_id: id used for end-of-text and padding (default 49407,
                         the value that was previously hard-coded).
    :return: a 0-d tensor holding the id found at position [0, 1].
    :raises AssertionError: if the string does not encode to exactly one token.
    """
    batch_encoding = tokenizer(
        string,
        truncation=True,
        max_length=max_length,
        return_length=True,
        return_overflowing_tokens=False,
        padding='max_length',
        return_tensors='pt',
    )
    tokens = batch_encoding['input_ids']

    # Everything except the start token and the single content token must be
    # the end/padding id, so exactly two entries survive the subtraction.
    assert (
        torch.count_nonzero(tokens - end_token_id) == 2
    ), f"String '{string}' maps to more than a single token. Please use another string"

    # Position 0 is the start token; position 1 is the content token.
    return tokens[0, 1]
|
||||
|
||||
|
||||
def get_bert_token_for_string(tokenizer, string):
    """
    Encode `string` with `tokenizer` and return the entry at index [0, 1].

    No check is made that the string encodes to a single token; whatever sits
    at row 0, column 1 of the tokenizer output is returned as-is.
    """
    encoded = tokenizer(string)
    return encoded[0, 1]
|
||||
|
||||
|
||||
def get_embedding_for_clip_token(embedder, token):
    """
    Embed one token and return the entry at index [0, 0] of the embedder output.

    A leading dimension is added to `token` before it is handed to `embedder`.
    """
    batched_token = token.unsqueeze(0)
    embedded = embedder(batched_token)
    return embedded[0, 0]
|
||||
|
||||
|
||||
class EmbeddingManager(nn.Module):
    """
    Holds learnable embedding vectors for placeholder strings and splices them
    into already-embedded text wherever the placeholder's token id occurs.

    Attributes set in `__init__`:
        string_to_token_dict: placeholder string -> token id tensor.
        string_to_param_dict: placeholder string -> trainable embedding rows.
        initial_embeddings:   placeholder string -> frozen copy of the rows a
                              placeholder was initialised from (only present
                              when an initializer word was supplied).
    """

    def __init__(
        self,
        embedder,
        placeholder_strings=None,
        initializer_words=None,
        per_image_tokens=False,
        num_vectors_per_token=1,
        progressive_words=False,
        **kwargs,
    ):
        """
        :param embedder: text encoder. If it has a `tokenizer` attribute it is
                         treated as the CLIP encoder, otherwise as the BERT one.
        :param placeholder_strings: strings to learn embeddings for. When None,
                         the module-level DEFAULT_PLACEHOLDER_TOKEN is used
                         (NOTE(review): assumed to be the intended default).
        :param initializer_words: optional words whose embeddings seed the
                         placeholders, matched to placeholders by position.
        :param per_image_tokens: if True, `per_img_token_list` is appended to
                         the placeholder strings.
        :param num_vectors_per_token: embedding rows learned per placeholder.
        :param progressive_words: if True, the number of rows used in `forward`
                         grows by one every PROGRESSIVE_SCALE counter steps.
        :param kwargs: accepted and ignored.
        """
        super().__init__()

        self.string_to_token_dict = {}
        self.string_to_param_dict = nn.ParameterDict()
        # Frozen reference copies; these should not be optimized.
        self.initial_embeddings = nn.ParameterDict()

        self.progressive_words = progressive_words
        self.progressive_counter = 0
        self.max_vectors_per_token = num_vectors_per_token

        if hasattr(embedder, 'tokenizer'):
            # using Stable Diffusion's CLIP encoder
            self.is_clip = True
            get_token_for_string = partial(
                get_clip_token_for_string, embedder.tokenizer
            )
            get_embedding_for_tkn = partial(
                get_embedding_for_clip_token,
                embedder.transformer.text_model.embeddings,
            )
            # Fix: this was 1280 (the BERT width). The CLIP ViT-L/14 text
            # encoder used by Stable Diffusion is 768 wide, so a randomly
            # initialised 1280-wide placeholder could not be written into the
            # embedded text in forward().
            token_dim = 768
        else:
            # using LDM's BERT encoder
            self.is_clip = False
            get_token_for_string = partial(
                get_bert_token_for_string, embedder.tknz_fn
            )
            get_embedding_for_tkn = embedder.transformer.token_emb
            token_dim = 1280

        # Fix: the None default used to crash on .extend()/enumerate(), and
        # .extend() used to modify the caller's list. Work on a private copy.
        if placeholder_strings is None:
            placeholder_strings = DEFAULT_PLACEHOLDER_TOKEN
        placeholder_strings = list(placeholder_strings)

        if per_image_tokens:
            placeholder_strings.extend(per_img_token_list)

        for idx, placeholder_string in enumerate(placeholder_strings):
            token = get_token_for_string(placeholder_string)

            if initializer_words and idx < len(initializer_words):
                init_word_token = get_token_for_string(initializer_words[idx])

                with torch.no_grad():
                    init_word_embedding = get_embedding_for_tkn(
                        init_word_token.cpu()
                    )

                init_rows = init_word_embedding.unsqueeze(0).repeat(
                    num_vectors_per_token, 1
                )
                token_params = torch.nn.Parameter(
                    init_rows, requires_grad=True
                )
                # Separate storage so training token_params never alters the
                # frozen reference copy.
                self.initial_embeddings[
                    placeholder_string
                ] = torch.nn.Parameter(init_rows.clone(), requires_grad=False)
            else:
                token_params = torch.nn.Parameter(
                    torch.rand(size=(num_vectors_per_token, token_dim)),
                    requires_grad=True,
                )

            self.string_to_token_dict[placeholder_string] = token
            self.string_to_param_dict[placeholder_string] = token_params

    def forward(
        self,
        tokenized_text,
        embedded_text,
    ):
        """
        Overwrite placeholder positions of `embedded_text` with learned rows.

        :param tokenized_text: 2-D tensor of token ids (unpacked as rows x n).
        :param embedded_text: embeddings indexed by the same (row, column)
                              positions as `tokenized_text`.
        :return: `embedded_text`, modified in place. In the multi-vector case
                 `tokenized_text` is also modified in place.
        """
        _, n = tokenized_text.shape
        device = tokenized_text.device

        for (
            placeholder_string,
            placeholder_token,
        ) in self.string_to_token_dict.items():
            placeholder_embedding = self.string_to_param_dict[
                placeholder_string
            ].to(device)

            if self.max_vectors_per_token == 1:
                # Only one vector per token: a simple replacement suffices.
                placeholder_idx = torch.where(
                    tokenized_text == placeholder_token.to(device)
                )
                embedded_text[placeholder_idx] = placeholder_embedding
                continue

            # Otherwise rows must be inserted, which shifts later positions.
            if self.progressive_words:
                self.progressive_counter += 1
                max_step_tokens = (
                    1 + self.progressive_counter // PROGRESSIVE_SCALE
                )
            else:
                max_step_tokens = self.max_vectors_per_token

            num_vectors_for_token = min(
                placeholder_embedding.shape[0], max_step_tokens
            )

            placeholder_rows, placeholder_cols = torch.where(
                tokenized_text == placeholder_token.to(device)
            )

            if placeholder_rows.nelement() == 0:
                continue

            # Process right-most occurrences first so that an insertion never
            # shifts a column index that is still waiting to be handled.
            sorted_cols, sort_idx = torch.sort(
                placeholder_cols, descending=True
            )
            sorted_rows = placeholder_rows[sort_idx]

            for idx in range(len(sorted_rows)):
                row = sorted_rows[idx]
                col = sorted_cols[idx]

                # Replace the single placeholder position by
                # num_vectors_for_token entries, then cut back to length n.
                new_token_row = torch.cat(
                    [
                        tokenized_text[row][:col],
                        placeholder_token.repeat(num_vectors_for_token).to(
                            device
                        ),
                        tokenized_text[row][col + 1 :],
                    ],
                    dim=0,
                )[:n]
                new_embed_row = torch.cat(
                    [
                        embedded_text[row][:col],
                        placeholder_embedding[:num_vectors_for_token],
                        embedded_text[row][col + 1 :],
                    ],
                    dim=0,
                )[:n]

                embedded_text[row] = new_embed_row
                tokenized_text[row] = new_token_row

        return embedded_text

    def save(self, ckpt_path):
        """Write the token and parameter dictionaries to `ckpt_path`."""
        torch.save(
            {
                'string_to_token': self.string_to_token_dict,
                'string_to_param': self.string_to_param_dict,
            },
            ckpt_path,
        )

    def load(self, ckpt_path, full=True):
        """
        Replace the token and parameter dictionaries with those stored in
        `ckpt_path`.

        :param ckpt_path: file previously written by `save`.
        :param full: if False, every loaded parameter is converted with
                     `.half()`.
        """
        # NOTE(security): torch.load unpickles the file; only load embedding
        # files that come from a trusted source.
        ckpt = torch.load(ckpt_path, map_location='cpu')
        self.string_to_token_dict = ckpt['string_to_token']
        self.string_to_param_dict = ckpt['string_to_param']
        if not full:
            # Snapshot the items first so the container is not reassigned
            # while it is being iterated.
            for key, value in list(self.string_to_param_dict.items()):
                self.string_to_param_dict[key] = torch.nn.Parameter(
                    value.half()
                )

    def get_embedding_norms_squared(self):
        """Return the squared L2 norm of every learned embedding row."""
        all_params = torch.cat(
            list(self.string_to_param_dict.values()), dim=0
        )  # num_placeholders x embedding_dim
        param_norm_squared = (all_params * all_params).sum(
            dim=-1
        )  # num_placeholders

        return param_norm_squared

    def embedding_parameters(self):
        """Return an iterator over the trainable placeholder parameters."""
        return self.string_to_param_dict.parameters()

    def embedding_to_coarse_loss(self):
        """
        Accumulate, over every placeholder that has an initial embedding,
        `(optimized - coarse) @ (optimized - coarse).T / num_embeddings`.

        :return: 0.0 when there are no initial embeddings, otherwise the
                 accumulated (rows x rows) tensor.
        """
        loss = 0.0
        num_embeddings = len(self.initial_embeddings)

        for key in self.initial_embeddings:
            optimized = self.string_to_param_dict[key]
            coarse = self.initial_embeddings[key].clone().to(optimized.device)
            delta = optimized - coarse

            loss = loss + delta @ delta.T / num_embeddings

        return loss
|
||||
@@ -5,8 +5,40 @@ import clip
|
||||
from einops import rearrange, repeat
|
||||
from transformers import CLIPTokenizer, CLIPTextModel
|
||||
import kornia
|
||||
from ldm.dream.devices import choose_torch_device
|
||||
|
||||
from ldm.modules.x_transformer import Encoder, TransformerWrapper # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
|
||||
from ldm.modules.x_transformer import (
|
||||
Encoder,
|
||||
TransformerWrapper,
|
||||
) # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
|
||||
|
||||
|
||||
def _expand_mask(mask, dtype, tgt_len=None):
|
||||
"""
|
||||
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
|
||||
"""
|
||||
bsz, src_len = mask.size()
|
||||
tgt_len = tgt_len if tgt_len is not None else src_len
|
||||
|
||||
expanded_mask = (
|
||||
mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
|
||||
)
|
||||
|
||||
inverted_mask = 1.0 - expanded_mask
|
||||
|
||||
return inverted_mask.masked_fill(
|
||||
inverted_mask.to(torch.bool), torch.finfo(dtype).min
|
||||
)
|
||||
|
||||
|
||||
def _build_causal_attention_mask(bsz, seq_len, dtype):
|
||||
# lazily create causal attention mask, with full attention between the vision tokens
|
||||
# pytorch uses additive attention mask; fill with -inf
|
||||
mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
|
||||
mask.fill_(torch.tensor(torch.finfo(dtype).min))
|
||||
mask.triu_(1) # zero out the lower diagonal
|
||||
mask = mask.unsqueeze(1) # expand mask
|
||||
return mask
|
||||
|
||||
|
||||
class AbstractEncoder(nn.Module):
|
||||
@@ -17,7 +49,6 @@ class AbstractEncoder(nn.Module):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
|
||||
class ClassEmbedder(nn.Module):
|
||||
def __init__(self, embed_dim, n_classes=1000, key='class'):
|
||||
super().__init__()
|
||||
@@ -35,11 +66,22 @@ class ClassEmbedder(nn.Module):
|
||||
|
||||
class TransformerEmbedder(AbstractEncoder):
|
||||
"""Some transformer encoder layers"""
|
||||
def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77, device="cuda"):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
n_embed,
|
||||
n_layer,
|
||||
vocab_size,
|
||||
max_seq_len=77,
|
||||
device=choose_torch_device(),
|
||||
):
|
||||
super().__init__()
|
||||
self.device = device
|
||||
self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
|
||||
attn_layers=Encoder(dim=n_embed, depth=n_layer))
|
||||
self.transformer = TransformerWrapper(
|
||||
num_tokens=vocab_size,
|
||||
max_seq_len=max_seq_len,
|
||||
attn_layers=Encoder(dim=n_embed, depth=n_layer),
|
||||
)
|
||||
|
||||
def forward(self, tokens):
|
||||
tokens = tokens.to(self.device) # meh
|
||||
@@ -51,27 +93,44 @@ class TransformerEmbedder(AbstractEncoder):
|
||||
|
||||
|
||||
class BERTTokenizer(AbstractEncoder):
|
||||
""" Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""
|
||||
def __init__(self, device="cuda", vq_interface=True, max_length=77):
|
||||
"""Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""
|
||||
|
||||
def __init__(
|
||||
self, device=choose_torch_device(), vq_interface=True, max_length=77
|
||||
):
|
||||
super().__init__()
|
||||
from transformers import BertTokenizerFast # TODO: add to reuquirements
|
||||
from transformers import (
|
||||
BertTokenizerFast,
|
||||
) # TODO: add to reuquirements
|
||||
|
||||
# Modified to allow to run on non-internet connected compute nodes.
|
||||
# Model needs to be loaded into cache from an internet-connected machine
|
||||
# by running:
|
||||
# from transformers import BertTokenizerFast
|
||||
# BertTokenizerFast.from_pretrained("bert-base-uncased")
|
||||
try:
|
||||
self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased",local_files_only=True)
|
||||
self.tokenizer = BertTokenizerFast.from_pretrained(
|
||||
'bert-base-uncased', local_files_only=True
|
||||
)
|
||||
except OSError:
|
||||
raise SystemExit("* Couldn't load Bert tokenizer files. Try running scripts/preload_models.py from an internet-conected machine.")
|
||||
raise SystemExit(
|
||||
"* Couldn't load Bert tokenizer files. Try running scripts/preload_models.py from an internet-conected machine."
|
||||
)
|
||||
self.device = device
|
||||
self.vq_interface = vq_interface
|
||||
self.max_length = max_length
|
||||
|
||||
def forward(self, text):
|
||||
batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
|
||||
return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
|
||||
tokens = batch_encoding["input_ids"].to(self.device)
|
||||
batch_encoding = self.tokenizer(
|
||||
text,
|
||||
truncation=True,
|
||||
max_length=self.max_length,
|
||||
return_length=True,
|
||||
return_overflowing_tokens=False,
|
||||
padding='max_length',
|
||||
return_tensors='pt',
|
||||
)
|
||||
tokens = batch_encoding['input_ids'].to(self.device)
|
||||
return tokens
|
||||
|
||||
@torch.no_grad()
|
||||
@@ -87,54 +146,84 @@ class BERTTokenizer(AbstractEncoder):
|
||||
|
||||
class BERTEmbedder(AbstractEncoder):
|
||||
"""Uses the BERT tokenizr model and add some transformer encoder layers"""
|
||||
def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77,
|
||||
device="cuda",use_tokenizer=True, embedding_dropout=0.0):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
n_embed,
|
||||
n_layer,
|
||||
vocab_size=30522,
|
||||
max_seq_len=77,
|
||||
device=choose_torch_device(),
|
||||
use_tokenizer=True,
|
||||
embedding_dropout=0.0,
|
||||
):
|
||||
super().__init__()
|
||||
self.use_tknz_fn = use_tokenizer
|
||||
if self.use_tknz_fn:
|
||||
self.tknz_fn = BERTTokenizer(vq_interface=False, max_length=max_seq_len)
|
||||
self.tknz_fn = BERTTokenizer(
|
||||
vq_interface=False, max_length=max_seq_len
|
||||
)
|
||||
self.device = device
|
||||
self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
|
||||
attn_layers=Encoder(dim=n_embed, depth=n_layer),
|
||||
emb_dropout=embedding_dropout)
|
||||
self.transformer = TransformerWrapper(
|
||||
num_tokens=vocab_size,
|
||||
max_seq_len=max_seq_len,
|
||||
attn_layers=Encoder(dim=n_embed, depth=n_layer),
|
||||
emb_dropout=embedding_dropout,
|
||||
)
|
||||
|
||||
def forward(self, text):
|
||||
def forward(self, text, embedding_manager=None):
|
||||
if self.use_tknz_fn:
|
||||
tokens = self.tknz_fn(text)#.to(self.device)
|
||||
tokens = self.tknz_fn(text) # .to(self.device)
|
||||
else:
|
||||
tokens = text
|
||||
z = self.transformer(tokens, return_embeddings=True)
|
||||
z = self.transformer(
|
||||
tokens, return_embeddings=True, embedding_manager=embedding_manager
|
||||
)
|
||||
return z
|
||||
|
||||
def encode(self, text):
|
||||
def encode(self, text, **kwargs):
|
||||
# output of length 77
|
||||
return self(text)
|
||||
return self(text, **kwargs)
|
||||
|
||||
|
||||
class SpatialRescaler(nn.Module):
|
||||
def __init__(self,
|
||||
n_stages=1,
|
||||
method='bilinear',
|
||||
multiplier=0.5,
|
||||
in_channels=3,
|
||||
out_channels=None,
|
||||
bias=False):
|
||||
def __init__(
|
||||
self,
|
||||
n_stages=1,
|
||||
method='bilinear',
|
||||
multiplier=0.5,
|
||||
in_channels=3,
|
||||
out_channels=None,
|
||||
bias=False,
|
||||
):
|
||||
super().__init__()
|
||||
self.n_stages = n_stages
|
||||
assert self.n_stages >= 0
|
||||
assert method in ['nearest','linear','bilinear','trilinear','bicubic','area']
|
||||
assert method in [
|
||||
'nearest',
|
||||
'linear',
|
||||
'bilinear',
|
||||
'trilinear',
|
||||
'bicubic',
|
||||
'area',
|
||||
]
|
||||
self.multiplier = multiplier
|
||||
self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
|
||||
self.interpolator = partial(
|
||||
torch.nn.functional.interpolate, mode=method
|
||||
)
|
||||
self.remap_output = out_channels is not None
|
||||
if self.remap_output:
|
||||
print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.')
|
||||
self.channel_mapper = nn.Conv2d(in_channels,out_channels,1,bias=bias)
|
||||
print(
|
||||
f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.'
|
||||
)
|
||||
self.channel_mapper = nn.Conv2d(
|
||||
in_channels, out_channels, 1, bias=bias
|
||||
)
|
||||
|
||||
def forward(self,x):
|
||||
def forward(self, x):
|
||||
for stage in range(self.n_stages):
|
||||
x = self.interpolator(x, scale_factor=self.multiplier)
|
||||
|
||||
|
||||
if self.remap_output:
|
||||
x = self.channel_mapper(x)
|
||||
return x
|
||||
@@ -142,41 +231,245 @@ class SpatialRescaler(nn.Module):
|
||||
def encode(self, x):
|
||||
return self(x)
|
||||
|
||||
|
||||
class FrozenCLIPEmbedder(AbstractEncoder):
|
||||
"""Uses the CLIP transformer encoder for text (from Hugging Face)"""
|
||||
def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
version='openai/clip-vit-large-patch14',
|
||||
device=choose_torch_device(),
|
||||
max_length=77,
|
||||
):
|
||||
super().__init__()
|
||||
self.tokenizer = CLIPTokenizer.from_pretrained(version,local_files_only=True)
|
||||
self.transformer = CLIPTextModel.from_pretrained(version,local_files_only=True)
|
||||
self.tokenizer = CLIPTokenizer.from_pretrained(
|
||||
version, local_files_only=True
|
||||
)
|
||||
self.transformer = CLIPTextModel.from_pretrained(
|
||||
version, local_files_only=True
|
||||
)
|
||||
self.device = device
|
||||
self.max_length = max_length
|
||||
self.freeze()
|
||||
|
||||
def embedding_forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
position_ids=None,
|
||||
inputs_embeds=None,
|
||||
embedding_manager=None,
|
||||
) -> torch.Tensor:
|
||||
|
||||
seq_length = (
|
||||
input_ids.shape[-1]
|
||||
if input_ids is not None
|
||||
else inputs_embeds.shape[-2]
|
||||
)
|
||||
|
||||
if position_ids is None:
|
||||
position_ids = self.position_ids[:, :seq_length]
|
||||
|
||||
if inputs_embeds is None:
|
||||
inputs_embeds = self.token_embedding(input_ids)
|
||||
|
||||
if embedding_manager is not None:
|
||||
inputs_embeds = embedding_manager(input_ids, inputs_embeds)
|
||||
|
||||
position_embeddings = self.position_embedding(position_ids)
|
||||
embeddings = inputs_embeds + position_embeddings
|
||||
|
||||
return embeddings
|
||||
|
||||
self.transformer.text_model.embeddings.forward = (
|
||||
embedding_forward.__get__(self.transformer.text_model.embeddings)
|
||||
)
|
||||
|
||||
def encoder_forward(
|
||||
self,
|
||||
inputs_embeds,
|
||||
attention_mask=None,
|
||||
causal_attention_mask=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
):
|
||||
output_attentions = (
|
||||
output_attentions
|
||||
if output_attentions is not None
|
||||
else self.config.output_attentions
|
||||
)
|
||||
output_hidden_states = (
|
||||
output_hidden_states
|
||||
if output_hidden_states is not None
|
||||
else self.config.output_hidden_states
|
||||
)
|
||||
return_dict = (
|
||||
return_dict
|
||||
if return_dict is not None
|
||||
else self.config.use_return_dict
|
||||
)
|
||||
|
||||
encoder_states = () if output_hidden_states else None
|
||||
all_attentions = () if output_attentions else None
|
||||
|
||||
hidden_states = inputs_embeds
|
||||
for idx, encoder_layer in enumerate(self.layers):
|
||||
if output_hidden_states:
|
||||
encoder_states = encoder_states + (hidden_states,)
|
||||
|
||||
layer_outputs = encoder_layer(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
causal_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if output_attentions:
|
||||
all_attentions = all_attentions + (layer_outputs[1],)
|
||||
|
||||
if output_hidden_states:
|
||||
encoder_states = encoder_states + (hidden_states,)
|
||||
|
||||
return hidden_states
|
||||
|
||||
self.transformer.text_model.encoder.forward = encoder_forward.__get__(
|
||||
self.transformer.text_model.encoder
|
||||
)
|
||||
|
||||
def text_encoder_forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
position_ids=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
embedding_manager=None,
|
||||
):
|
||||
output_attentions = (
|
||||
output_attentions
|
||||
if output_attentions is not None
|
||||
else self.config.output_attentions
|
||||
)
|
||||
output_hidden_states = (
|
||||
output_hidden_states
|
||||
if output_hidden_states is not None
|
||||
else self.config.output_hidden_states
|
||||
)
|
||||
return_dict = (
|
||||
return_dict
|
||||
if return_dict is not None
|
||||
else self.config.use_return_dict
|
||||
)
|
||||
|
||||
if input_ids is None:
|
||||
raise ValueError('You have to specify either input_ids')
|
||||
|
||||
input_shape = input_ids.size()
|
||||
input_ids = input_ids.view(-1, input_shape[-1])
|
||||
|
||||
hidden_states = self.embeddings(
|
||||
input_ids=input_ids,
|
||||
position_ids=position_ids,
|
||||
embedding_manager=embedding_manager,
|
||||
)
|
||||
|
||||
bsz, seq_len = input_shape
|
||||
# CLIP's text model uses causal mask, prepare it here.
|
||||
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
|
||||
causal_attention_mask = _build_causal_attention_mask(
|
||||
bsz, seq_len, hidden_states.dtype
|
||||
).to(hidden_states.device)
|
||||
|
||||
# expand attention_mask
|
||||
if attention_mask is not None:
|
||||
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
||||
attention_mask = _expand_mask(
|
||||
attention_mask, hidden_states.dtype
|
||||
)
|
||||
|
||||
last_hidden_state = self.encoder(
|
||||
inputs_embeds=hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
causal_attention_mask=causal_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
last_hidden_state = self.final_layer_norm(last_hidden_state)
|
||||
|
||||
return last_hidden_state
|
||||
|
||||
self.transformer.text_model.forward = text_encoder_forward.__get__(
|
||||
self.transformer.text_model
|
||||
)
|
||||
|
||||
def transformer_forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
position_ids=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
embedding_manager=None,
|
||||
):
|
||||
return self.text_model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
embedding_manager=embedding_manager,
|
||||
)
|
||||
|
||||
self.transformer.forward = transformer_forward.__get__(
|
||||
self.transformer
|
||||
)
|
||||
|
||||
def freeze(self):
|
||||
self.transformer = self.transformer.eval()
|
||||
for param in self.parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
def forward(self, text):
|
||||
batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
|
||||
return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
|
||||
tokens = batch_encoding["input_ids"].to(self.device)
|
||||
outputs = self.transformer(input_ids=tokens)
|
||||
def forward(self, text, **kwargs):
|
||||
batch_encoding = self.tokenizer(
|
||||
text,
|
||||
truncation=True,
|
||||
max_length=self.max_length,
|
||||
return_length=True,
|
||||
return_overflowing_tokens=False,
|
||||
padding='max_length',
|
||||
return_tensors='pt',
|
||||
)
|
||||
tokens = batch_encoding['input_ids'].to(self.device)
|
||||
z = self.transformer(input_ids=tokens, **kwargs)
|
||||
|
||||
z = outputs.last_hidden_state
|
||||
return z
|
||||
|
||||
def encode(self, text):
|
||||
return self(text)
|
||||
def encode(self, text, **kwargs):
|
||||
return self(text, **kwargs)
|
||||
|
||||
|
||||
class FrozenCLIPTextEmbedder(nn.Module):
|
||||
"""
|
||||
Uses the CLIP transformer encoder for text.
|
||||
"""
|
||||
def __init__(self, version='ViT-L/14', device="cuda", max_length=77, n_repeat=1, normalize=True):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
version='ViT-L/14',
|
||||
device=choose_torch_device(),
|
||||
max_length=77,
|
||||
n_repeat=1,
|
||||
normalize=True,
|
||||
):
|
||||
super().__init__()
|
||||
self.model, _ = clip.load(version, jit=False, device="cpu")
|
||||
self.model, _ = clip.load(version, jit=False, device=device)
|
||||
self.device = device
|
||||
self.max_length = max_length
|
||||
self.n_repeat = n_repeat
|
||||
@@ -196,7 +489,7 @@ class FrozenCLIPTextEmbedder(nn.Module):
|
||||
|
||||
def encode(self, text):
|
||||
z = self(text)
|
||||
if z.ndim==2:
|
||||
if z.ndim == 2:
|
||||
z = z[:, None, :]
|
||||
z = repeat(z, 'b 1 d -> b k d', k=self.n_repeat)
|
||||
return z
|
||||
@@ -204,29 +497,42 @@ class FrozenCLIPTextEmbedder(nn.Module):
|
||||
|
||||
class FrozenClipImageEmbedder(nn.Module):
|
||||
"""
|
||||
Uses the CLIP image encoder.
|
||||
"""
|
||||
Uses the CLIP image encoder.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model,
|
||||
jit=False,
|
||||
device='cuda' if torch.cuda.is_available() else 'cpu',
|
||||
antialias=False,
|
||||
):
|
||||
self,
|
||||
model,
|
||||
jit=False,
|
||||
device=choose_torch_device(),
|
||||
antialias=False,
|
||||
):
|
||||
super().__init__()
|
||||
self.model, _ = clip.load(name=model, device=device, jit=jit)
|
||||
|
||||
self.antialias = antialias
|
||||
|
||||
self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
|
||||
self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
|
||||
self.register_buffer(
|
||||
'mean',
|
||||
torch.Tensor([0.48145466, 0.4578275, 0.40821073]),
|
||||
persistent=False,
|
||||
)
|
||||
self.register_buffer(
|
||||
'std',
|
||||
torch.Tensor([0.26862954, 0.26130258, 0.27577711]),
|
||||
persistent=False,
|
||||
)
|
||||
|
||||
def preprocess(self, x):
|
||||
# normalize to [0,1]
|
||||
x = kornia.geometry.resize(x, (224, 224),
|
||||
interpolation='bicubic',align_corners=True,
|
||||
antialias=self.antialias)
|
||||
x = (x + 1.) / 2.
|
||||
x = kornia.geometry.resize(
|
||||
x,
|
||||
(224, 224),
|
||||
interpolation='bicubic',
|
||||
align_corners=True,
|
||||
antialias=self.antialias,
|
||||
)
|
||||
x = (x + 1.0) / 2.0
|
||||
# renormalize according to clip
|
||||
x = kornia.enhance.normalize(x, self.mean, self.std)
|
||||
return x
|
||||
@@ -236,7 +542,8 @@ class FrozenClipImageEmbedder(nn.Module):
|
||||
return self.model.encode_image(self.preprocess(x))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if __name__ == '__main__':
|
||||
from ldm.util import count_params
|
||||
|
||||
model = FrozenCLIPEmbedder()
|
||||
count_params(model, verbose=True)
|
||||
|
||||
@@ -1,2 +1,6 @@
|
||||
from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr
|
||||
from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light
|
||||
from ldm.modules.image_degradation.bsrgan import (
|
||||
degradation_bsrgan_variant as degradation_fn_bsr,
|
||||
)
|
||||
from ldm.modules.image_degradation.bsrgan_light import (
|
||||
degradation_bsrgan_variant as degradation_fn_bsr_light,
|
||||
)
|
||||
|
||||
@@ -27,16 +27,16 @@ import ldm.modules.image_degradation.utils_image as util
|
||||
|
||||
|
||||
def modcrop_np(img, sf):
|
||||
'''
|
||||
"""
|
||||
Args:
|
||||
img: numpy image, WxH or WxHxC
|
||||
sf: scale factor
|
||||
Return:
|
||||
cropped image
|
||||
'''
|
||||
"""
|
||||
w, h = img.shape[:2]
|
||||
im = np.copy(img)
|
||||
return im[:w - w % sf, :h - h % sf, ...]
|
||||
return im[: w - w % sf, : h - h % sf, ...]
|
||||
|
||||
|
||||
"""
|
||||
@@ -54,7 +54,9 @@ def analytic_kernel(k):
|
||||
# Loop over the small kernel to fill the big one
|
||||
for r in range(k_size):
|
||||
for c in range(k_size):
|
||||
big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
|
||||
big_k[2 * r : 2 * r + k_size, 2 * c : 2 * c + k_size] += (
|
||||
k[r, c] * k
|
||||
)
|
||||
# Crop the edges of the big kernel to ignore very small values and increase run time of SR
|
||||
crop = k_size // 2
|
||||
cropped_big_k = big_k[crop:-crop, crop:-crop]
|
||||
@@ -63,7 +65,7 @@ def analytic_kernel(k):
|
||||
|
||||
|
||||
def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
|
||||
""" generate an anisotropic Gaussian kernel
|
||||
"""generate an anisotropic Gaussian kernel
|
||||
Args:
|
||||
ksize : e.g., 15, kernel size
|
||||
theta : [0, pi], rotation angle range
|
||||
@@ -74,7 +76,12 @@ def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
|
||||
k : kernel
|
||||
"""
|
||||
|
||||
v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.]))
|
||||
v = np.dot(
|
||||
np.array(
|
||||
[[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]
|
||||
),
|
||||
np.array([1.0, 0.0]),
|
||||
)
|
||||
V = np.array([[v[0], v[1]], [v[1], -v[0]]])
|
||||
D = np.array([[l1, 0], [0, l2]])
|
||||
Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
|
||||
@@ -126,24 +133,32 @@ def shift_pixel(x, sf, upper_left=True):
|
||||
|
||||
|
||||
def blur(x, k):
|
||||
'''
|
||||
"""
|
||||
x: image, NxcxHxW
|
||||
k: kernel, Nx1xhxw
|
||||
'''
|
||||
"""
|
||||
n, c = x.shape[:2]
|
||||
p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
|
||||
x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate')
|
||||
k = k.repeat(1, c, 1, 1)
|
||||
k = k.view(-1, 1, k.shape[2], k.shape[3])
|
||||
x = x.view(1, -1, x.shape[2], x.shape[3])
|
||||
x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
|
||||
x = torch.nn.functional.conv2d(
|
||||
x, k, bias=None, stride=1, padding=0, groups=n * c
|
||||
)
|
||||
x = x.view(n, c, x.shape[2], x.shape[3])
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
|
||||
""""
|
||||
def gen_kernel(
|
||||
k_size=np.array([15, 15]),
|
||||
scale_factor=np.array([4, 4]),
|
||||
min_var=0.6,
|
||||
max_var=10.0,
|
||||
noise_level=0,
|
||||
):
|
||||
""" "
|
||||
# modified version of https://github.com/assafshocher/BlindSR_dataset_generator
|
||||
# Kai Zhang
|
||||
# min_var = 0.175 * sf # variance of the gaussian kernel will be sampled between min_var and max_var
|
||||
@@ -157,13 +172,16 @@ def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var
|
||||
|
||||
# Set COV matrix using Lambdas and Theta
|
||||
LAMBDA = np.diag([lambda_1, lambda_2])
|
||||
Q = np.array([[np.cos(theta), -np.sin(theta)],
|
||||
[np.sin(theta), np.cos(theta)]])
|
||||
Q = np.array(
|
||||
[[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]
|
||||
)
|
||||
SIGMA = Q @ LAMBDA @ Q.T
|
||||
INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :]
|
||||
|
||||
# Set expectation position (shifting kernel for aligned image)
|
||||
MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2)
|
||||
MU = k_size // 2 - 0.5 * (
|
||||
scale_factor - 1
|
||||
) # - 0.5 * (scale_factor - k_size % 2)
|
||||
MU = MU[None, None, :, None]
|
||||
|
||||
# Create meshgrid for Gaussian
|
||||
@@ -188,7 +206,9 @@ def fspecial_gaussian(hsize, sigma):
|
||||
hsize = [hsize, hsize]
|
||||
siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
|
||||
std = sigma
|
||||
[x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
|
||||
[x, y] = np.meshgrid(
|
||||
np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)
|
||||
)
|
||||
arg = -(x * x + y * y) / (2 * std * std)
|
||||
h = np.exp(arg)
|
||||
h[h < scipy.finfo(float).eps * h.max()] = 0
|
||||
@@ -208,10 +228,10 @@ def fspecial_laplacian(alpha):
|
||||
|
||||
|
||||
def fspecial(filter_type, *args, **kwargs):
|
||||
'''
|
||||
"""
|
||||
python code from:
|
||||
https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
|
||||
'''
|
||||
"""
|
||||
if filter_type == 'gaussian':
|
||||
return fspecial_gaussian(*args, **kwargs)
|
||||
if filter_type == 'laplacian':
|
||||
@@ -226,19 +246,19 @@ def fspecial(filter_type, *args, **kwargs):
|
||||
|
||||
|
||||
def bicubic_degradation(x, sf=3):
|
||||
'''
|
||||
"""
|
||||
Args:
|
||||
x: HxWxC image, [0, 1]
|
||||
sf: down-scale factor
|
||||
Return:
|
||||
bicubicly downsampled LR image
|
||||
'''
|
||||
"""
|
||||
x = util.imresize_np(x, scale=1 / sf)
|
||||
return x
|
||||
|
||||
|
||||
def srmd_degradation(x, k, sf=3):
|
||||
''' blur + bicubic downsampling
|
||||
"""blur + bicubic downsampling
|
||||
Args:
|
||||
x: HxWxC image, [0, 1]
|
||||
k: hxw, double
|
||||
@@ -253,14 +273,16 @@ def srmd_degradation(x, k, sf=3):
|
||||
pages={3262--3271},
|
||||
year={2018}
|
||||
}
|
||||
'''
|
||||
x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') # 'nearest' | 'mirror'
|
||||
"""
|
||||
x = ndimage.filters.convolve(
|
||||
x, np.expand_dims(k, axis=2), mode='wrap'
|
||||
) # 'nearest' | 'mirror'
|
||||
x = bicubic_degradation(x, sf=sf)
|
||||
return x
|
||||
|
||||
|
||||
def dpsr_degradation(x, k, sf=3):
|
||||
''' bicubic downsampling + blur
|
||||
"""bicubic downsampling + blur
|
||||
Args:
|
||||
x: HxWxC image, [0, 1]
|
||||
k: hxw, double
|
||||
@@ -275,21 +297,21 @@ def dpsr_degradation(x, k, sf=3):
|
||||
pages={1671--1681},
|
||||
year={2019}
|
||||
}
|
||||
'''
|
||||
"""
|
||||
x = bicubic_degradation(x, sf=sf)
|
||||
x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
|
||||
return x
|
||||
|
||||
|
||||
def classical_degradation(x, k, sf=3):
|
||||
''' blur + downsampling
|
||||
"""blur + downsampling
|
||||
Args:
|
||||
x: HxWxC image, [0, 1]/[0, 255]
|
||||
k: hxw, double
|
||||
sf: down-scale factor
|
||||
Return:
|
||||
downsampled LR image
|
||||
'''
|
||||
"""
|
||||
x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
|
||||
# x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
|
||||
st = 0
|
||||
@@ -328,10 +350,19 @@ def add_blur(img, sf=4):
|
||||
if random.random() < 0.5:
|
||||
l1 = wd2 * random.random()
|
||||
l2 = wd2 * random.random()
|
||||
k = anisotropic_Gaussian(ksize=2 * random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
|
||||
k = anisotropic_Gaussian(
|
||||
ksize=2 * random.randint(2, 11) + 3,
|
||||
theta=random.random() * np.pi,
|
||||
l1=l1,
|
||||
l2=l2,
|
||||
)
|
||||
else:
|
||||
k = fspecial('gaussian', 2 * random.randint(2, 11) + 3, wd * random.random())
|
||||
img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror')
|
||||
k = fspecial(
|
||||
'gaussian', 2 * random.randint(2, 11) + 3, wd * random.random()
|
||||
)
|
||||
img = ndimage.filters.convolve(
|
||||
img, np.expand_dims(k, axis=2), mode='mirror'
|
||||
)
|
||||
|
||||
return img
|
||||
|
||||
@@ -344,7 +375,11 @@ def add_resize(img, sf=4):
|
||||
sf1 = random.uniform(0.5 / sf, 1)
|
||||
else:
|
||||
sf1 = 1.0
|
||||
img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3]))
|
||||
img = cv2.resize(
|
||||
img,
|
||||
(int(sf1 * img.shape[1]), int(sf1 * img.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
|
||||
return img
|
||||
@@ -366,19 +401,26 @@ def add_resize(img, sf=4):
|
||||
# img = np.clip(img, 0.0, 1.0)
|
||||
# return img
|
||||
|
||||
|
||||
def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
|
||||
noise_level = random.randint(noise_level1, noise_level2)
|
||||
rnum = np.random.rand()
|
||||
if rnum > 0.6: # add color Gaussian noise
|
||||
img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
|
||||
img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(
|
||||
np.float32
|
||||
)
|
||||
elif rnum < 0.4: # add grayscale Gaussian noise
|
||||
img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
|
||||
img = img + np.random.normal(
|
||||
0, noise_level / 255.0, (*img.shape[:2], 1)
|
||||
).astype(np.float32)
|
||||
else: # add noise
|
||||
L = noise_level2 / 255.
|
||||
L = noise_level2 / 255.0
|
||||
D = np.diag(np.random.rand(3))
|
||||
U = orth(np.random.rand(3, 3))
|
||||
conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||
img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||
img = img + np.random.multivariate_normal(
|
||||
[0, 0, 0], np.abs(L**2 * conv), img.shape[:2]
|
||||
).astype(np.float32)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
return img
|
||||
|
||||
@@ -388,28 +430,37 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25):
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
rnum = random.random()
|
||||
if rnum > 0.6:
|
||||
img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
|
||||
img += img * np.random.normal(
|
||||
0, noise_level / 255.0, img.shape
|
||||
).astype(np.float32)
|
||||
elif rnum < 0.4:
|
||||
img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
|
||||
img += img * np.random.normal(
|
||||
0, noise_level / 255.0, (*img.shape[:2], 1)
|
||||
).astype(np.float32)
|
||||
else:
|
||||
L = noise_level2 / 255.
|
||||
L = noise_level2 / 255.0
|
||||
D = np.diag(np.random.rand(3))
|
||||
U = orth(np.random.rand(3, 3))
|
||||
conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||
img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||
img += img * np.random.multivariate_normal(
|
||||
[0, 0, 0], np.abs(L**2 * conv), img.shape[:2]
|
||||
).astype(np.float32)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
return img
|
||||
|
||||
|
||||
def add_Poisson_noise(img):
|
||||
img = np.clip((img * 255.0).round(), 0, 255) / 255.
|
||||
img = np.clip((img * 255.0).round(), 0, 255) / 255.0
|
||||
vals = 10 ** (2 * random.random() + 2.0) # [2, 4]
|
||||
if random.random() < 0.5:
|
||||
img = np.random.poisson(img * vals).astype(np.float32) / vals
|
||||
else:
|
||||
img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
|
||||
img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.
|
||||
noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray
|
||||
img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.0
|
||||
noise_gray = (
|
||||
np.random.poisson(img_gray * vals).astype(np.float32) / vals
|
||||
- img_gray
|
||||
)
|
||||
img += noise_gray[:, :, np.newaxis]
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
return img
|
||||
@@ -418,7 +469,9 @@ def add_Poisson_noise(img):
|
||||
def add_JPEG_noise(img):
|
||||
quality_factor = random.randint(30, 95)
|
||||
img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
|
||||
result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
|
||||
result, encimg = cv2.imencode(
|
||||
'.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]
|
||||
)
|
||||
img = cv2.imdecode(encimg, 1)
|
||||
img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB)
|
||||
return img
|
||||
@@ -428,10 +481,14 @@ def random_crop(lq, hq, sf=4, lq_patchsize=64):
|
||||
h, w = lq.shape[:2]
|
||||
rnd_h = random.randint(0, h - lq_patchsize)
|
||||
rnd_w = random.randint(0, w - lq_patchsize)
|
||||
lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :]
|
||||
lq = lq[rnd_h : rnd_h + lq_patchsize, rnd_w : rnd_w + lq_patchsize, :]
|
||||
|
||||
rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf)
|
||||
hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :]
|
||||
hq = hq[
|
||||
rnd_h_H : rnd_h_H + lq_patchsize * sf,
|
||||
rnd_w_H : rnd_w_H + lq_patchsize * sf,
|
||||
:,
|
||||
]
|
||||
return lq, hq
|
||||
|
||||
|
||||
@@ -452,7 +509,7 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
|
||||
sf_ori = sf
|
||||
|
||||
h1, w1 = img.shape[:2]
|
||||
img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
|
||||
img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop
|
||||
h, w = img.shape[:2]
|
||||
|
||||
if h < lq_patchsize * sf or w < lq_patchsize * sf:
|
||||
@@ -462,8 +519,11 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
|
||||
|
||||
if sf == 4 and random.random() < scale2_prob: # downsample1
|
||||
if np.random.rand() < 0.5:
|
||||
img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]))
|
||||
img = cv2.resize(
|
||||
img,
|
||||
(int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
else:
|
||||
img = util.imresize_np(img, 1 / 2, True)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
@@ -472,7 +532,10 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
|
||||
shuffle_order = random.sample(range(7), 7)
|
||||
idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
|
||||
if idx1 > idx2: # keep downsample3 last
|
||||
shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
|
||||
shuffle_order[idx1], shuffle_order[idx2] = (
|
||||
shuffle_order[idx2],
|
||||
shuffle_order[idx1],
|
||||
)
|
||||
|
||||
for i in shuffle_order:
|
||||
|
||||
@@ -487,19 +550,30 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
|
||||
# downsample2
|
||||
if random.random() < 0.75:
|
||||
sf1 = random.uniform(1, 2 * sf)
|
||||
img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]))
|
||||
img = cv2.resize(
|
||||
img,
|
||||
(int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
else:
|
||||
k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
|
||||
k_shifted = shift_pixel(k, sf)
|
||||
k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
|
||||
img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
|
||||
k_shifted = (
|
||||
k_shifted / k_shifted.sum()
|
||||
) # blur with shifted kernel
|
||||
img = ndimage.filters.convolve(
|
||||
img, np.expand_dims(k_shifted, axis=2), mode='mirror'
|
||||
)
|
||||
img = img[0::sf, 0::sf, ...] # nearest downsampling
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
|
||||
elif i == 3:
|
||||
# downsample3
|
||||
img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
|
||||
img = cv2.resize(
|
||||
img,
|
||||
(int(1 / sf * a), int(1 / sf * b)),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
|
||||
elif i == 4:
|
||||
@@ -544,15 +618,18 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
|
||||
sf_ori = sf
|
||||
|
||||
h1, w1 = image.shape[:2]
|
||||
image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
|
||||
image = image.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop
|
||||
h, w = image.shape[:2]
|
||||
|
||||
hq = image.copy()
|
||||
|
||||
if sf == 4 and random.random() < scale2_prob: # downsample1
|
||||
if np.random.rand() < 0.5:
|
||||
image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]))
|
||||
image = cv2.resize(
|
||||
image,
|
||||
(int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
else:
|
||||
image = util.imresize_np(image, 1 / 2, True)
|
||||
image = np.clip(image, 0.0, 1.0)
|
||||
@@ -561,7 +638,10 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
|
||||
shuffle_order = random.sample(range(7), 7)
|
||||
idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
|
||||
if idx1 > idx2: # keep downsample3 last
|
||||
shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
|
||||
shuffle_order[idx1], shuffle_order[idx2] = (
|
||||
shuffle_order[idx2],
|
||||
shuffle_order[idx1],
|
||||
)
|
||||
|
||||
for i in shuffle_order:
|
||||
|
||||
@@ -576,19 +656,33 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
|
||||
# downsample2
|
||||
if random.random() < 0.75:
|
||||
sf1 = random.uniform(1, 2 * sf)
|
||||
image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]))
|
||||
image = cv2.resize(
|
||||
image,
|
||||
(
|
||||
int(1 / sf1 * image.shape[1]),
|
||||
int(1 / sf1 * image.shape[0]),
|
||||
),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
else:
|
||||
k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
|
||||
k_shifted = shift_pixel(k, sf)
|
||||
k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
|
||||
image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
|
||||
k_shifted = (
|
||||
k_shifted / k_shifted.sum()
|
||||
) # blur with shifted kernel
|
||||
image = ndimage.filters.convolve(
|
||||
image, np.expand_dims(k_shifted, axis=2), mode='mirror'
|
||||
)
|
||||
image = image[0::sf, 0::sf, ...] # nearest downsampling
|
||||
image = np.clip(image, 0.0, 1.0)
|
||||
|
||||
elif i == 3:
|
||||
# downsample3
|
||||
image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
|
||||
image = cv2.resize(
|
||||
image,
|
||||
(int(1 / sf * a), int(1 / sf * b)),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
image = np.clip(image, 0.0, 1.0)
|
||||
|
||||
elif i == 4:
|
||||
@@ -609,12 +703,19 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
|
||||
# add final JPEG compression noise
|
||||
image = add_JPEG_noise(image)
|
||||
image = util.single2uint(image)
|
||||
example = {"image":image}
|
||||
example = {'image': image}
|
||||
return example
|
||||
|
||||
|
||||
# TODO incase there is a pickle error one needs to replace a += x with a = a + x in add_speckle_noise etc...
|
||||
def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patchsize=64, isp_model=None):
|
||||
def degradation_bsrgan_plus(
|
||||
img,
|
||||
sf=4,
|
||||
shuffle_prob=0.5,
|
||||
use_sharp=True,
|
||||
lq_patchsize=64,
|
||||
isp_model=None,
|
||||
):
|
||||
"""
|
||||
This is an extended degradation model by combining
|
||||
the degradation models of BSRGAN and Real-ESRGAN
|
||||
@@ -630,7 +731,7 @@ def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patc
|
||||
"""
|
||||
|
||||
h1, w1 = img.shape[:2]
|
||||
img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
|
||||
img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop
|
||||
h, w = img.shape[:2]
|
||||
|
||||
if h < lq_patchsize * sf or w < lq_patchsize * sf:
|
||||
@@ -645,8 +746,12 @@ def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patc
|
||||
else:
|
||||
shuffle_order = list(range(13))
|
||||
# local shuffle for noise, JPEG is always the last one
|
||||
shuffle_order[2:6] = random.sample(shuffle_order[2:6], len(range(2, 6)))
|
||||
shuffle_order[9:13] = random.sample(shuffle_order[9:13], len(range(9, 13)))
|
||||
shuffle_order[2:6] = random.sample(
|
||||
shuffle_order[2:6], len(range(2, 6))
|
||||
)
|
||||
shuffle_order[9:13] = random.sample(
|
||||
shuffle_order[9:13], len(range(9, 13))
|
||||
)
|
||||
|
||||
poisson_prob, speckle_prob, isp_prob = 0.1, 0.1, 0.1
|
||||
|
||||
@@ -689,8 +794,11 @@ def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patc
|
||||
print('check the shuffle!')
|
||||
|
||||
# resize to desired size
|
||||
img = cv2.resize(img, (int(1 / sf * hq.shape[1]), int(1 / sf * hq.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]))
|
||||
img = cv2.resize(
|
||||
img,
|
||||
(int(1 / sf * hq.shape[1]), int(1 / sf * hq.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
|
||||
# add final JPEG compression noise
|
||||
img = add_JPEG_noise(img)
|
||||
@@ -702,29 +810,37 @@ def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patc
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print("hey")
|
||||
img = util.imread_uint('utils/test.png', 3)
|
||||
print(img)
|
||||
img = util.uint2single(img)
|
||||
print(img)
|
||||
img = img[:448, :448]
|
||||
h = img.shape[0] // 4
|
||||
print("resizing to", h)
|
||||
sf = 4
|
||||
deg_fn = partial(degradation_bsrgan_variant, sf=sf)
|
||||
for i in range(20):
|
||||
print(i)
|
||||
img_lq = deg_fn(img)
|
||||
print(img_lq)
|
||||
img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img)["image"]
|
||||
print(img_lq.shape)
|
||||
print("bicubic", img_lq_bicubic.shape)
|
||||
print(img_hq.shape)
|
||||
lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
|
||||
interpolation=0)
|
||||
lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
|
||||
interpolation=0)
|
||||
img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
|
||||
util.imsave(img_concat, str(i) + '.png')
|
||||
|
||||
|
||||
print('hey')
|
||||
img = util.imread_uint('utils/test.png', 3)
|
||||
print(img)
|
||||
img = util.uint2single(img)
|
||||
print(img)
|
||||
img = img[:448, :448]
|
||||
h = img.shape[0] // 4
|
||||
print('resizing to', h)
|
||||
sf = 4
|
||||
deg_fn = partial(degradation_bsrgan_variant, sf=sf)
|
||||
for i in range(20):
|
||||
print(i)
|
||||
img_lq = deg_fn(img)
|
||||
print(img_lq)
|
||||
img_lq_bicubic = albumentations.SmallestMaxSize(
|
||||
max_size=h, interpolation=cv2.INTER_CUBIC
|
||||
)(image=img)['image']
|
||||
print(img_lq.shape)
|
||||
print('bicubic', img_lq_bicubic.shape)
|
||||
print(img_hq.shape)
|
||||
lq_nearest = cv2.resize(
|
||||
util.single2uint(img_lq),
|
||||
(int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
|
||||
interpolation=0,
|
||||
)
|
||||
lq_bicubic_nearest = cv2.resize(
|
||||
util.single2uint(img_lq_bicubic),
|
||||
(int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
|
||||
interpolation=0,
|
||||
)
|
||||
img_concat = np.concatenate(
|
||||
[lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1
|
||||
)
|
||||
util.imsave(img_concat, str(i) + '.png')
|
||||
|
||||
@@ -27,16 +27,16 @@ import ldm.modules.image_degradation.utils_image as util
|
||||
|
||||
|
||||
def modcrop_np(img, sf):
|
||||
'''
|
||||
"""
|
||||
Args:
|
||||
img: numpy image, WxH or WxHxC
|
||||
sf: scale factor
|
||||
Return:
|
||||
cropped image
|
||||
'''
|
||||
"""
|
||||
w, h = img.shape[:2]
|
||||
im = np.copy(img)
|
||||
return im[:w - w % sf, :h - h % sf, ...]
|
||||
return im[: w - w % sf, : h - h % sf, ...]
|
||||
|
||||
|
||||
"""
|
||||
@@ -54,7 +54,9 @@ def analytic_kernel(k):
|
||||
# Loop over the small kernel to fill the big one
|
||||
for r in range(k_size):
|
||||
for c in range(k_size):
|
||||
big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
|
||||
big_k[2 * r : 2 * r + k_size, 2 * c : 2 * c + k_size] += (
|
||||
k[r, c] * k
|
||||
)
|
||||
# Crop the edges of the big kernel to ignore very small values and increase run time of SR
|
||||
crop = k_size // 2
|
||||
cropped_big_k = big_k[crop:-crop, crop:-crop]
|
||||
@@ -63,7 +65,7 @@ def analytic_kernel(k):
|
||||
|
||||
|
||||
def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
|
||||
""" generate an anisotropic Gaussian kernel
|
||||
"""generate an anisotropic Gaussian kernel
|
||||
Args:
|
||||
ksize : e.g., 15, kernel size
|
||||
theta : [0, pi], rotation angle range
|
||||
@@ -74,7 +76,12 @@ def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
|
||||
k : kernel
|
||||
"""
|
||||
|
||||
v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.]))
|
||||
v = np.dot(
|
||||
np.array(
|
||||
[[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]
|
||||
),
|
||||
np.array([1.0, 0.0]),
|
||||
)
|
||||
V = np.array([[v[0], v[1]], [v[1], -v[0]]])
|
||||
D = np.array([[l1, 0], [0, l2]])
|
||||
Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
|
||||
@@ -126,24 +133,32 @@ def shift_pixel(x, sf, upper_left=True):
|
||||
|
||||
|
||||
def blur(x, k):
|
||||
'''
|
||||
"""
|
||||
x: image, NxcxHxW
|
||||
k: kernel, Nx1xhxw
|
||||
'''
|
||||
"""
|
||||
n, c = x.shape[:2]
|
||||
p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
|
||||
x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate')
|
||||
k = k.repeat(1, c, 1, 1)
|
||||
k = k.view(-1, 1, k.shape[2], k.shape[3])
|
||||
x = x.view(1, -1, x.shape[2], x.shape[3])
|
||||
x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
|
||||
x = torch.nn.functional.conv2d(
|
||||
x, k, bias=None, stride=1, padding=0, groups=n * c
|
||||
)
|
||||
x = x.view(n, c, x.shape[2], x.shape[3])
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
|
||||
""""
|
||||
def gen_kernel(
|
||||
k_size=np.array([15, 15]),
|
||||
scale_factor=np.array([4, 4]),
|
||||
min_var=0.6,
|
||||
max_var=10.0,
|
||||
noise_level=0,
|
||||
):
|
||||
""" "
|
||||
# modified version of https://github.com/assafshocher/BlindSR_dataset_generator
|
||||
# Kai Zhang
|
||||
# min_var = 0.175 * sf # variance of the gaussian kernel will be sampled between min_var and max_var
|
||||
@@ -157,13 +172,16 @@ def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var
|
||||
|
||||
# Set COV matrix using Lambdas and Theta
|
||||
LAMBDA = np.diag([lambda_1, lambda_2])
|
||||
Q = np.array([[np.cos(theta), -np.sin(theta)],
|
||||
[np.sin(theta), np.cos(theta)]])
|
||||
Q = np.array(
|
||||
[[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]
|
||||
)
|
||||
SIGMA = Q @ LAMBDA @ Q.T
|
||||
INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :]
|
||||
|
||||
# Set expectation position (shifting kernel for aligned image)
|
||||
MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2)
|
||||
MU = k_size // 2 - 0.5 * (
|
||||
scale_factor - 1
|
||||
) # - 0.5 * (scale_factor - k_size % 2)
|
||||
MU = MU[None, None, :, None]
|
||||
|
||||
# Create meshgrid for Gaussian
|
||||
@@ -188,7 +206,9 @@ def fspecial_gaussian(hsize, sigma):
|
||||
hsize = [hsize, hsize]
|
||||
siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
|
||||
std = sigma
|
||||
[x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
|
||||
[x, y] = np.meshgrid(
|
||||
np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)
|
||||
)
|
||||
arg = -(x * x + y * y) / (2 * std * std)
|
||||
h = np.exp(arg)
|
||||
h[h < scipy.finfo(float).eps * h.max()] = 0
|
||||
@@ -208,10 +228,10 @@ def fspecial_laplacian(alpha):
|
||||
|
||||
|
||||
def fspecial(filter_type, *args, **kwargs):
|
||||
'''
|
||||
"""
|
||||
python code from:
|
||||
https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
|
||||
'''
|
||||
"""
|
||||
if filter_type == 'gaussian':
|
||||
return fspecial_gaussian(*args, **kwargs)
|
||||
if filter_type == 'laplacian':
|
||||
@@ -226,19 +246,19 @@ def fspecial(filter_type, *args, **kwargs):
|
||||
|
||||
|
||||
def bicubic_degradation(x, sf=3):
|
||||
'''
|
||||
"""
|
||||
Args:
|
||||
x: HxWxC image, [0, 1]
|
||||
sf: down-scale factor
|
||||
Return:
|
||||
bicubicly downsampled LR image
|
||||
'''
|
||||
"""
|
||||
x = util.imresize_np(x, scale=1 / sf)
|
||||
return x
|
||||
|
||||
|
||||
def srmd_degradation(x, k, sf=3):
|
||||
''' blur + bicubic downsampling
|
||||
"""blur + bicubic downsampling
|
||||
Args:
|
||||
x: HxWxC image, [0, 1]
|
||||
k: hxw, double
|
||||
@@ -253,14 +273,16 @@ def srmd_degradation(x, k, sf=3):
|
||||
pages={3262--3271},
|
||||
year={2018}
|
||||
}
|
||||
'''
|
||||
x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') # 'nearest' | 'mirror'
|
||||
"""
|
||||
x = ndimage.filters.convolve(
|
||||
x, np.expand_dims(k, axis=2), mode='wrap'
|
||||
) # 'nearest' | 'mirror'
|
||||
x = bicubic_degradation(x, sf=sf)
|
||||
return x
|
||||
|
||||
|
||||
def dpsr_degradation(x, k, sf=3):
|
||||
''' bicubic downsampling + blur
|
||||
"""bicubic downsampling + blur
|
||||
Args:
|
||||
x: HxWxC image, [0, 1]
|
||||
k: hxw, double
|
||||
@@ -275,21 +297,21 @@ def dpsr_degradation(x, k, sf=3):
|
||||
pages={1671--1681},
|
||||
year={2019}
|
||||
}
|
||||
'''
|
||||
"""
|
||||
x = bicubic_degradation(x, sf=sf)
|
||||
x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
|
||||
return x
|
||||
|
||||
|
||||
def classical_degradation(x, k, sf=3):
|
||||
''' blur + downsampling
|
||||
"""blur + downsampling
|
||||
Args:
|
||||
x: HxWxC image, [0, 1]/[0, 255]
|
||||
k: hxw, double
|
||||
sf: down-scale factor
|
||||
Return:
|
||||
downsampled LR image
|
||||
'''
|
||||
"""
|
||||
x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
|
||||
# x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
|
||||
st = 0
|
||||
@@ -326,16 +348,25 @@ def add_blur(img, sf=4):
|
||||
wd2 = 4.0 + sf
|
||||
wd = 2.0 + 0.2 * sf
|
||||
|
||||
wd2 = wd2/4
|
||||
wd = wd/4
|
||||
wd2 = wd2 / 4
|
||||
wd = wd / 4
|
||||
|
||||
if random.random() < 0.5:
|
||||
l1 = wd2 * random.random()
|
||||
l2 = wd2 * random.random()
|
||||
k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
|
||||
k = anisotropic_Gaussian(
|
||||
ksize=random.randint(2, 11) + 3,
|
||||
theta=random.random() * np.pi,
|
||||
l1=l1,
|
||||
l2=l2,
|
||||
)
|
||||
else:
|
||||
k = fspecial('gaussian', random.randint(2, 4) + 3, wd * random.random())
|
||||
img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror')
|
||||
k = fspecial(
|
||||
'gaussian', random.randint(2, 4) + 3, wd * random.random()
|
||||
)
|
||||
img = ndimage.filters.convolve(
|
||||
img, np.expand_dims(k, axis=2), mode='mirror'
|
||||
)
|
||||
|
||||
return img
|
||||
|
||||
@@ -348,7 +379,11 @@ def add_resize(img, sf=4):
|
||||
sf1 = random.uniform(0.5 / sf, 1)
|
||||
else:
|
||||
sf1 = 1.0
|
||||
img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3]))
|
||||
img = cv2.resize(
|
||||
img,
|
||||
(int(sf1 * img.shape[1]), int(sf1 * img.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
|
||||
return img
|
||||
@@ -370,19 +405,26 @@ def add_resize(img, sf=4):
|
||||
# img = np.clip(img, 0.0, 1.0)
|
||||
# return img
|
||||
|
||||
|
||||
def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
|
||||
noise_level = random.randint(noise_level1, noise_level2)
|
||||
rnum = np.random.rand()
|
||||
if rnum > 0.6: # add color Gaussian noise
|
||||
img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
|
||||
img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(
|
||||
np.float32
|
||||
)
|
||||
elif rnum < 0.4: # add grayscale Gaussian noise
|
||||
img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
|
||||
img = img + np.random.normal(
|
||||
0, noise_level / 255.0, (*img.shape[:2], 1)
|
||||
).astype(np.float32)
|
||||
else: # add noise
|
||||
L = noise_level2 / 255.
|
||||
L = noise_level2 / 255.0
|
||||
D = np.diag(np.random.rand(3))
|
||||
U = orth(np.random.rand(3, 3))
|
||||
conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||
img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||
img = img + np.random.multivariate_normal(
|
||||
[0, 0, 0], np.abs(L**2 * conv), img.shape[:2]
|
||||
).astype(np.float32)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
return img
|
||||
|
||||
@@ -392,28 +434,37 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25):
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
rnum = random.random()
|
||||
if rnum > 0.6:
|
||||
img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
|
||||
img += img * np.random.normal(
|
||||
0, noise_level / 255.0, img.shape
|
||||
).astype(np.float32)
|
||||
elif rnum < 0.4:
|
||||
img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
|
||||
img += img * np.random.normal(
|
||||
0, noise_level / 255.0, (*img.shape[:2], 1)
|
||||
).astype(np.float32)
|
||||
else:
|
||||
L = noise_level2 / 255.
|
||||
L = noise_level2 / 255.0
|
||||
D = np.diag(np.random.rand(3))
|
||||
U = orth(np.random.rand(3, 3))
|
||||
conv = np.dot(np.dot(np.transpose(U), D), U)
|
||||
img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
|
||||
img += img * np.random.multivariate_normal(
|
||||
[0, 0, 0], np.abs(L**2 * conv), img.shape[:2]
|
||||
).astype(np.float32)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
return img
|
||||
|
||||
|
||||
def add_Poisson_noise(img):
|
||||
img = np.clip((img * 255.0).round(), 0, 255) / 255.
|
||||
img = np.clip((img * 255.0).round(), 0, 255) / 255.0
|
||||
vals = 10 ** (2 * random.random() + 2.0) # [2, 4]
|
||||
if random.random() < 0.5:
|
||||
img = np.random.poisson(img * vals).astype(np.float32) / vals
|
||||
else:
|
||||
img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
|
||||
img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.
|
||||
noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray
|
||||
img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.0
|
||||
noise_gray = (
|
||||
np.random.poisson(img_gray * vals).astype(np.float32) / vals
|
||||
- img_gray
|
||||
)
|
||||
img += noise_gray[:, :, np.newaxis]
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
return img
|
||||
@@ -422,7 +473,9 @@ def add_Poisson_noise(img):
|
||||
def add_JPEG_noise(img):
|
||||
quality_factor = random.randint(80, 95)
|
||||
img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
|
||||
result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
|
||||
result, encimg = cv2.imencode(
|
||||
'.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]
|
||||
)
|
||||
img = cv2.imdecode(encimg, 1)
|
||||
img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB)
|
||||
return img
|
||||
@@ -432,10 +485,14 @@ def random_crop(lq, hq, sf=4, lq_patchsize=64):
|
||||
h, w = lq.shape[:2]
|
||||
rnd_h = random.randint(0, h - lq_patchsize)
|
||||
rnd_w = random.randint(0, w - lq_patchsize)
|
||||
lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :]
|
||||
lq = lq[rnd_h : rnd_h + lq_patchsize, rnd_w : rnd_w + lq_patchsize, :]
|
||||
|
||||
rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf)
|
||||
hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :]
|
||||
hq = hq[
|
||||
rnd_h_H : rnd_h_H + lq_patchsize * sf,
|
||||
rnd_w_H : rnd_w_H + lq_patchsize * sf,
|
||||
:,
|
||||
]
|
||||
return lq, hq
|
||||
|
||||
|
||||
@@ -456,7 +513,7 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
|
||||
sf_ori = sf
|
||||
|
||||
h1, w1 = img.shape[:2]
|
||||
img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
|
||||
img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop
|
||||
h, w = img.shape[:2]
|
||||
|
||||
if h < lq_patchsize * sf or w < lq_patchsize * sf:
|
||||
@@ -466,8 +523,11 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
|
||||
|
||||
if sf == 4 and random.random() < scale2_prob: # downsample1
|
||||
if np.random.rand() < 0.5:
|
||||
img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]))
|
||||
img = cv2.resize(
|
||||
img,
|
||||
(int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
else:
|
||||
img = util.imresize_np(img, 1 / 2, True)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
@@ -476,7 +536,10 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
|
||||
shuffle_order = random.sample(range(7), 7)
|
||||
idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
|
||||
if idx1 > idx2: # keep downsample3 last
|
||||
shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
|
||||
shuffle_order[idx1], shuffle_order[idx2] = (
|
||||
shuffle_order[idx2],
|
||||
shuffle_order[idx1],
|
||||
)
|
||||
|
||||
for i in shuffle_order:
|
||||
|
||||
@@ -491,19 +554,30 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
|
||||
# downsample2
|
||||
if random.random() < 0.75:
|
||||
sf1 = random.uniform(1, 2 * sf)
|
||||
img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]))
|
||||
img = cv2.resize(
|
||||
img,
|
||||
(int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
else:
|
||||
k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
|
||||
k_shifted = shift_pixel(k, sf)
|
||||
k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
|
||||
img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
|
||||
k_shifted = (
|
||||
k_shifted / k_shifted.sum()
|
||||
) # blur with shifted kernel
|
||||
img = ndimage.filters.convolve(
|
||||
img, np.expand_dims(k_shifted, axis=2), mode='mirror'
|
||||
)
|
||||
img = img[0::sf, 0::sf, ...] # nearest downsampling
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
|
||||
elif i == 3:
|
||||
# downsample3
|
||||
img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
|
||||
img = cv2.resize(
|
||||
img,
|
||||
(int(1 / sf * a), int(1 / sf * b)),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
img = np.clip(img, 0.0, 1.0)
|
||||
|
||||
elif i == 4:
|
||||
@@ -548,15 +622,18 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
|
||||
sf_ori = sf
|
||||
|
||||
h1, w1 = image.shape[:2]
|
||||
image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop
|
||||
image = image.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop
|
||||
h, w = image.shape[:2]
|
||||
|
||||
hq = image.copy()
|
||||
|
||||
if sf == 4 and random.random() < scale2_prob: # downsample1
|
||||
if np.random.rand() < 0.5:
|
||||
image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]))
|
||||
image = cv2.resize(
|
||||
image,
|
||||
(int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
else:
|
||||
image = util.imresize_np(image, 1 / 2, True)
|
||||
image = np.clip(image, 0.0, 1.0)
|
||||
@@ -565,7 +642,10 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
|
||||
shuffle_order = random.sample(range(7), 7)
|
||||
idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
|
||||
if idx1 > idx2: # keep downsample3 last
|
||||
shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
|
||||
shuffle_order[idx1], shuffle_order[idx2] = (
|
||||
shuffle_order[idx2],
|
||||
shuffle_order[idx1],
|
||||
)
|
||||
|
||||
for i in shuffle_order:
|
||||
|
||||
@@ -583,20 +663,34 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
|
||||
# downsample2
|
||||
if random.random() < 0.8:
|
||||
sf1 = random.uniform(1, 2 * sf)
|
||||
image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
|
||||
interpolation=random.choice([1, 2, 3]))
|
||||
image = cv2.resize(
|
||||
image,
|
||||
(
|
||||
int(1 / sf1 * image.shape[1]),
|
||||
int(1 / sf1 * image.shape[0]),
|
||||
),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
else:
|
||||
k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
|
||||
k_shifted = shift_pixel(k, sf)
|
||||
k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel
|
||||
image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
|
||||
k_shifted = (
|
||||
k_shifted / k_shifted.sum()
|
||||
) # blur with shifted kernel
|
||||
image = ndimage.filters.convolve(
|
||||
image, np.expand_dims(k_shifted, axis=2), mode='mirror'
|
||||
)
|
||||
image = image[0::sf, 0::sf, ...] # nearest downsampling
|
||||
|
||||
image = np.clip(image, 0.0, 1.0)
|
||||
|
||||
elif i == 3:
|
||||
# downsample3
|
||||
image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
|
||||
image = cv2.resize(
|
||||
image,
|
||||
(int(1 / sf * a), int(1 / sf * b)),
|
||||
interpolation=random.choice([1, 2, 3]),
|
||||
)
|
||||
image = np.clip(image, 0.0, 1.0)
|
||||
|
||||
elif i == 4:
|
||||
@@ -617,34 +711,41 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
|
||||
# add final JPEG compression noise
|
||||
image = add_JPEG_noise(image)
|
||||
image = util.single2uint(image)
|
||||
example = {"image": image}
|
||||
example = {'image': image}
|
||||
return example
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print("hey")
|
||||
print('hey')
|
||||
img = util.imread_uint('utils/test.png', 3)
|
||||
img = img[:448, :448]
|
||||
h = img.shape[0] // 4
|
||||
print("resizing to", h)
|
||||
print('resizing to', h)
|
||||
sf = 4
|
||||
deg_fn = partial(degradation_bsrgan_variant, sf=sf)
|
||||
for i in range(20):
|
||||
print(i)
|
||||
img_hq = img
|
||||
img_lq = deg_fn(img)["image"]
|
||||
img_lq = deg_fn(img)['image']
|
||||
img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq)
|
||||
print(img_lq)
|
||||
img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"]
|
||||
img_lq_bicubic = albumentations.SmallestMaxSize(
|
||||
max_size=h, interpolation=cv2.INTER_CUBIC
|
||||
)(image=img_hq)['image']
|
||||
print(img_lq.shape)
|
||||
print("bicubic", img_lq_bicubic.shape)
|
||||
print('bicubic', img_lq_bicubic.shape)
|
||||
print(img_hq.shape)
|
||||
lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
|
||||
interpolation=0)
|
||||
lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic),
|
||||
(int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
|
||||
interpolation=0)
|
||||
img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
|
||||
lq_nearest = cv2.resize(
|
||||
util.single2uint(img_lq),
|
||||
(int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
|
||||
interpolation=0,
|
||||
)
|
||||
lq_bicubic_nearest = cv2.resize(
|
||||
util.single2uint(img_lq_bicubic),
|
||||
(int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
|
||||
interpolation=0,
|
||||
)
|
||||
img_concat = np.concatenate(
|
||||
[lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1
|
||||
)
|
||||
util.imsave(img_concat, str(i) + '.png')
|
||||
|
||||
@@ -6,13 +6,14 @@ import torch
|
||||
import cv2
|
||||
from torchvision.utils import make_grid
|
||||
from datetime import datetime
|
||||
#import matplotlib.pyplot as plt # TODO: check with Dominik, also bsrgan.py vs bsrgan_light.py
|
||||
|
||||
# import matplotlib.pyplot as plt # TODO: check with Dominik, also bsrgan.py vs bsrgan_light.py
|
||||
|
||||
|
||||
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
|
||||
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
|
||||
|
||||
|
||||
'''
|
||||
"""
|
||||
# --------------------------------------------
|
||||
# Kai Zhang (github: https://github.com/cszn)
|
||||
# 03/Mar/2019
|
||||
@@ -20,10 +21,22 @@ os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
|
||||
# https://github.com/twhui/SRGAN-pyTorch
|
||||
# https://github.com/xinntao/BasicSR
|
||||
# --------------------------------------------
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', '.tif']
|
||||
IMG_EXTENSIONS = [
|
||||
'.jpg',
|
||||
'.JPG',
|
||||
'.jpeg',
|
||||
'.JPEG',
|
||||
'.png',
|
||||
'.PNG',
|
||||
'.ppm',
|
||||
'.PPM',
|
||||
'.bmp',
|
||||
'.BMP',
|
||||
'.tif',
|
||||
]
|
||||
|
||||
|
||||
def is_image_file(filename):
|
||||
@@ -49,19 +62,19 @@ def surf(Z, cmap='rainbow', figsize=None):
|
||||
ax3 = plt.axes(projection='3d')
|
||||
|
||||
w, h = Z.shape[:2]
|
||||
xx = np.arange(0,w,1)
|
||||
yy = np.arange(0,h,1)
|
||||
xx = np.arange(0, w, 1)
|
||||
yy = np.arange(0, h, 1)
|
||||
X, Y = np.meshgrid(xx, yy)
|
||||
ax3.plot_surface(X,Y,Z,cmap=cmap)
|
||||
#ax3.contour(X,Y,Z, zdim='z',offset=-2,cmap=cmap)
|
||||
ax3.plot_surface(X, Y, Z, cmap=cmap)
|
||||
# ax3.contour(X,Y,Z, zdim='z',offset=-2,cmap=cmap)
|
||||
plt.show()
|
||||
|
||||
|
||||
'''
|
||||
"""
|
||||
# --------------------------------------------
|
||||
# get image pathes
|
||||
# --------------------------------------------
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def get_image_paths(dataroot):
|
||||
@@ -83,26 +96,26 @@ def _get_paths_from_images(path):
|
||||
return images
|
||||
|
||||
|
||||
'''
|
||||
"""
|
||||
# --------------------------------------------
|
||||
# split large images into small images
|
||||
# --------------------------------------------
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def patches_from_image(img, p_size=512, p_overlap=64, p_max=800):
|
||||
w, h = img.shape[:2]
|
||||
patches = []
|
||||
if w > p_max and h > p_max:
|
||||
w1 = list(np.arange(0, w-p_size, p_size-p_overlap, dtype=np.int))
|
||||
h1 = list(np.arange(0, h-p_size, p_size-p_overlap, dtype=np.int))
|
||||
w1.append(w-p_size)
|
||||
h1.append(h-p_size)
|
||||
# print(w1)
|
||||
# print(h1)
|
||||
w1 = list(np.arange(0, w - p_size, p_size - p_overlap, dtype=np.int))
|
||||
h1 = list(np.arange(0, h - p_size, p_size - p_overlap, dtype=np.int))
|
||||
w1.append(w - p_size)
|
||||
h1.append(h - p_size)
|
||||
# print(w1)
|
||||
# print(h1)
|
||||
for i in w1:
|
||||
for j in h1:
|
||||
patches.append(img[i:i+p_size, j:j+p_size,:])
|
||||
patches.append(img[i : i + p_size, j : j + p_size, :])
|
||||
else:
|
||||
patches.append(img)
|
||||
|
||||
@@ -118,11 +131,21 @@ def imssave(imgs, img_path):
|
||||
for i, img in enumerate(imgs):
|
||||
if img.ndim == 3:
|
||||
img = img[:, :, [2, 1, 0]]
|
||||
new_path = os.path.join(os.path.dirname(img_path), img_name+str('_s{:04d}'.format(i))+'.png')
|
||||
new_path = os.path.join(
|
||||
os.path.dirname(img_path),
|
||||
img_name + str('_s{:04d}'.format(i)) + '.png',
|
||||
)
|
||||
cv2.imwrite(new_path, img)
|
||||
|
||||
|
||||
def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800, p_overlap=96, p_max=1000):
|
||||
def split_imageset(
|
||||
original_dataroot,
|
||||
taget_dataroot,
|
||||
n_channels=3,
|
||||
p_size=800,
|
||||
p_overlap=96,
|
||||
p_max=1000,
|
||||
):
|
||||
"""
|
||||
split the large images from original_dataroot into small overlapped images with size (p_size)x(p_size),
|
||||
and save them into taget_dataroot; only the images with larger size than (p_max)x(p_max)
|
||||
@@ -139,15 +162,18 @@ def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800,
|
||||
# img_name, ext = os.path.splitext(os.path.basename(img_path))
|
||||
img = imread_uint(img_path, n_channels=n_channels)
|
||||
patches = patches_from_image(img, p_size, p_overlap, p_max)
|
||||
imssave(patches, os.path.join(taget_dataroot,os.path.basename(img_path)))
|
||||
#if original_dataroot == taget_dataroot:
|
||||
#del img_path
|
||||
imssave(
|
||||
patches, os.path.join(taget_dataroot, os.path.basename(img_path))
|
||||
)
|
||||
# if original_dataroot == taget_dataroot:
|
||||
# del img_path
|
||||
|
||||
'''
|
||||
|
||||
"""
|
||||
# --------------------------------------------
|
||||
# makedir
|
||||
# --------------------------------------------
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def mkdir(path):
|
||||
@@ -171,12 +197,12 @@ def mkdir_and_rename(path):
|
||||
os.makedirs(path)
|
||||
|
||||
|
||||
'''
|
||||
"""
|
||||
# --------------------------------------------
|
||||
# read image from path
|
||||
# opencv is fast, but read BGR numpy image
|
||||
# --------------------------------------------
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
# --------------------------------------------
|
||||
@@ -206,6 +232,7 @@ def imsave(img, img_path):
|
||||
img = img[:, :, [2, 1, 0]]
|
||||
cv2.imwrite(img_path, img)
|
||||
|
||||
|
||||
def imwrite(img, img_path):
|
||||
img = np.squeeze(img)
|
||||
if img.ndim == 3:
|
||||
@@ -213,7 +240,6 @@ def imwrite(img, img_path):
|
||||
cv2.imwrite(img_path, img)
|
||||
|
||||
|
||||
|
||||
# --------------------------------------------
|
||||
# get single image of size HxWxn_channles (BGR)
|
||||
# --------------------------------------------
|
||||
@@ -221,7 +247,7 @@ def read_img(path):
|
||||
# read image by cv2
|
||||
# return: Numpy float32, HWC, BGR, [0,1]
|
||||
img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # cv2.IMREAD_GRAYSCALE
|
||||
img = img.astype(np.float32) / 255.
|
||||
img = img.astype(np.float32) / 255.0
|
||||
if img.ndim == 2:
|
||||
img = np.expand_dims(img, axis=2)
|
||||
# some images have 4 channels
|
||||
@@ -230,7 +256,7 @@ def read_img(path):
|
||||
return img
|
||||
|
||||
|
||||
'''
|
||||
"""
|
||||
# --------------------------------------------
|
||||
# image format conversion
|
||||
# --------------------------------------------
|
||||
@@ -238,7 +264,7 @@ def read_img(path):
|
||||
# numpy(single) <---> tensor
|
||||
# numpy(unit) <---> tensor
|
||||
# --------------------------------------------
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
# --------------------------------------------
|
||||
@@ -248,22 +274,22 @@ def read_img(path):
|
||||
|
||||
def uint2single(img):
|
||||
|
||||
return np.float32(img/255.)
|
||||
return np.float32(img / 255.0)
|
||||
|
||||
|
||||
def single2uint(img):
|
||||
|
||||
return np.uint8((img.clip(0, 1)*255.).round())
|
||||
return np.uint8((img.clip(0, 1) * 255.0).round())
|
||||
|
||||
|
||||
def uint162single(img):
|
||||
|
||||
return np.float32(img/65535.)
|
||||
return np.float32(img / 65535.0)
|
||||
|
||||
|
||||
def single2uint16(img):
|
||||
|
||||
return np.uint16((img.clip(0, 1)*65535.).round())
|
||||
return np.uint16((img.clip(0, 1) * 65535.0).round())
|
||||
|
||||
|
||||
# --------------------------------------------
|
||||
@@ -275,14 +301,25 @@ def single2uint16(img):
|
||||
def uint2tensor4(img):
|
||||
if img.ndim == 2:
|
||||
img = np.expand_dims(img, axis=2)
|
||||
return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.).unsqueeze(0)
|
||||
return (
|
||||
torch.from_numpy(np.ascontiguousarray(img))
|
||||
.permute(2, 0, 1)
|
||||
.float()
|
||||
.div(255.0)
|
||||
.unsqueeze(0)
|
||||
)
|
||||
|
||||
|
||||
# convert uint to 3-dimensional torch tensor
|
||||
def uint2tensor3(img):
|
||||
if img.ndim == 2:
|
||||
img = np.expand_dims(img, axis=2)
|
||||
return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.)
|
||||
return (
|
||||
torch.from_numpy(np.ascontiguousarray(img))
|
||||
.permute(2, 0, 1)
|
||||
.float()
|
||||
.div(255.0)
|
||||
)
|
||||
|
||||
|
||||
# convert 2/3/4-dimensional torch tensor to uint
|
||||
@@ -290,7 +327,7 @@ def tensor2uint(img):
|
||||
img = img.data.squeeze().float().clamp_(0, 1).cpu().numpy()
|
||||
if img.ndim == 3:
|
||||
img = np.transpose(img, (1, 2, 0))
|
||||
return np.uint8((img*255.0).round())
|
||||
return np.uint8((img * 255.0).round())
|
||||
|
||||
|
||||
# --------------------------------------------
|
||||
@@ -305,7 +342,12 @@ def single2tensor3(img):
|
||||
|
||||
# convert single (HxWxC) to 4-dimensional torch tensor
|
||||
def single2tensor4(img):
|
||||
return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().unsqueeze(0)
|
||||
return (
|
||||
torch.from_numpy(np.ascontiguousarray(img))
|
||||
.permute(2, 0, 1)
|
||||
.float()
|
||||
.unsqueeze(0)
|
||||
)
|
||||
|
||||
|
||||
# convert torch tensor to single
|
||||
@@ -316,6 +358,7 @@ def tensor2single(img):
|
||||
|
||||
return img
|
||||
|
||||
|
||||
# convert torch tensor to single
|
||||
def tensor2single3(img):
|
||||
img = img.data.squeeze().float().cpu().numpy()
|
||||
@@ -327,30 +370,48 @@ def tensor2single3(img):
|
||||
|
||||
|
||||
def single2tensor5(img):
|
||||
return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float().unsqueeze(0)
|
||||
return (
|
||||
torch.from_numpy(np.ascontiguousarray(img))
|
||||
.permute(2, 0, 1, 3)
|
||||
.float()
|
||||
.unsqueeze(0)
|
||||
)
|
||||
|
||||
|
||||
def single32tensor5(img):
|
||||
return torch.from_numpy(np.ascontiguousarray(img)).float().unsqueeze(0).unsqueeze(0)
|
||||
return (
|
||||
torch.from_numpy(np.ascontiguousarray(img))
|
||||
.float()
|
||||
.unsqueeze(0)
|
||||
.unsqueeze(0)
|
||||
)
|
||||
|
||||
|
||||
def single42tensor4(img):
|
||||
return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float()
|
||||
return (
|
||||
torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float()
|
||||
)
|
||||
|
||||
|
||||
# from skimage.io import imread, imsave
|
||||
def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
|
||||
'''
|
||||
"""
|
||||
Converts a torch Tensor into an image Numpy array of BGR channel order
|
||||
Input: 4D(B,(3/1),H,W), 3D(C,H,W), or 2D(H,W), any range, RGB channel order
|
||||
Output: 3D(H,W,C) or 2D(H,W), [0,255], np.uint8 (default)
|
||||
'''
|
||||
tensor = tensor.squeeze().float().cpu().clamp_(*min_max) # squeeze first, then clamp
|
||||
tensor = (tensor - min_max[0]) / (min_max[1] - min_max[0]) # to range [0,1]
|
||||
"""
|
||||
tensor = (
|
||||
tensor.squeeze().float().cpu().clamp_(*min_max)
|
||||
) # squeeze first, then clamp
|
||||
tensor = (tensor - min_max[0]) / (
|
||||
min_max[1] - min_max[0]
|
||||
) # to range [0,1]
|
||||
n_dim = tensor.dim()
|
||||
if n_dim == 4:
|
||||
n_img = len(tensor)
|
||||
img_np = make_grid(tensor, nrow=int(math.sqrt(n_img)), normalize=False).numpy()
|
||||
img_np = make_grid(
|
||||
tensor, nrow=int(math.sqrt(n_img)), normalize=False
|
||||
).numpy()
|
||||
img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0)) # HWC, BGR
|
||||
elif n_dim == 3:
|
||||
img_np = tensor.numpy()
|
||||
@@ -359,14 +420,17 @@ def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
|
||||
img_np = tensor.numpy()
|
||||
else:
|
||||
raise TypeError(
|
||||
'Only support 4D, 3D and 2D tensor. But received with dimension: {:d}'.format(n_dim))
|
||||
'Only support 4D, 3D and 2D tensor. But received with dimension: {:d}'.format(
|
||||
n_dim
|
||||
)
|
||||
)
|
||||
if out_type == np.uint8:
|
||||
img_np = (img_np * 255.0).round()
|
||||
# Important. Unlike matlab, numpy.unit8() WILL NOT round by default.
|
||||
return img_np.astype(out_type)
|
||||
|
||||
|
||||
'''
|
||||
"""
|
||||
# --------------------------------------------
|
||||
# Augmentation, flipe and/or rotate
|
||||
# --------------------------------------------
|
||||
@@ -374,12 +438,11 @@ def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
|
||||
# (1) augmet_img: numpy image of WxHxC or WxH
|
||||
# (2) augment_img_tensor4: tensor image 1xCxWxH
|
||||
# --------------------------------------------
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def augment_img(img, mode=0):
|
||||
'''Kai Zhang (github: https://github.com/cszn)
|
||||
'''
|
||||
"""Kai Zhang (github: https://github.com/cszn)"""
|
||||
if mode == 0:
|
||||
return img
|
||||
elif mode == 1:
|
||||
@@ -399,8 +462,7 @@ def augment_img(img, mode=0):
|
||||
|
||||
|
||||
def augment_img_tensor4(img, mode=0):
|
||||
'''Kai Zhang (github: https://github.com/cszn)
|
||||
'''
|
||||
"""Kai Zhang (github: https://github.com/cszn)"""
|
||||
if mode == 0:
|
||||
return img
|
||||
elif mode == 1:
|
||||
@@ -420,8 +482,7 @@ def augment_img_tensor4(img, mode=0):
|
||||
|
||||
|
||||
def augment_img_tensor(img, mode=0):
|
||||
'''Kai Zhang (github: https://github.com/cszn)
|
||||
'''
|
||||
"""Kai Zhang (github: https://github.com/cszn)"""
|
||||
img_size = img.size()
|
||||
img_np = img.data.cpu().numpy()
|
||||
if len(img_size) == 3:
|
||||
@@ -484,11 +545,11 @@ def augment_imgs(img_list, hflip=True, rot=True):
|
||||
return [_augment(img) for img in img_list]
|
||||
|
||||
|
||||
'''
|
||||
"""
|
||||
# --------------------------------------------
|
||||
# modcrop and shave
|
||||
# --------------------------------------------
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def modcrop(img_in, scale):
|
||||
@@ -497,11 +558,11 @@ def modcrop(img_in, scale):
|
||||
if img.ndim == 2:
|
||||
H, W = img.shape
|
||||
H_r, W_r = H % scale, W % scale
|
||||
img = img[:H - H_r, :W - W_r]
|
||||
img = img[: H - H_r, : W - W_r]
|
||||
elif img.ndim == 3:
|
||||
H, W, C = img.shape
|
||||
H_r, W_r = H % scale, W % scale
|
||||
img = img[:H - H_r, :W - W_r, :]
|
||||
img = img[: H - H_r, : W - W_r, :]
|
||||
else:
|
||||
raise ValueError('Wrong img ndim: [{:d}].'.format(img.ndim))
|
||||
return img
|
||||
@@ -511,11 +572,11 @@ def shave(img_in, border=0):
|
||||
# img_in: Numpy, HWC or HW
|
||||
img = np.copy(img_in)
|
||||
h, w = img.shape[:2]
|
||||
img = img[border:h-border, border:w-border]
|
||||
img = img[border : h - border, border : w - border]
|
||||
return img
|
||||
|
||||
|
||||
'''
|
||||
"""
|
||||
# --------------------------------------------
|
||||
# image processing process on numpy image
|
||||
# channel_convert(in_c, tar_type, img_list):
|
||||
@@ -523,74 +584,92 @@ def shave(img_in, border=0):
|
||||
# bgr2ycbcr(img, only_y=True):
|
||||
# ycbcr2rgb(img):
|
||||
# --------------------------------------------
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def rgb2ycbcr(img, only_y=True):
|
||||
'''same as matlab rgb2ycbcr
|
||||
"""same as matlab rgb2ycbcr
|
||||
only_y: only return Y channel
|
||||
Input:
|
||||
uint8, [0, 255]
|
||||
float, [0, 1]
|
||||
'''
|
||||
"""
|
||||
in_img_type = img.dtype
|
||||
img.astype(np.float32)
|
||||
if in_img_type != np.uint8:
|
||||
img *= 255.
|
||||
img *= 255.0
|
||||
# convert
|
||||
if only_y:
|
||||
rlt = np.dot(img, [65.481, 128.553, 24.966]) / 255.0 + 16.0
|
||||
else:
|
||||
rlt = np.matmul(img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
|
||||
[24.966, 112.0, -18.214]]) / 255.0 + [16, 128, 128]
|
||||
rlt = np.matmul(
|
||||
img,
|
||||
[
|
||||
[65.481, -37.797, 112.0],
|
||||
[128.553, -74.203, -93.786],
|
||||
[24.966, 112.0, -18.214],
|
||||
],
|
||||
) / 255.0 + [16, 128, 128]
|
||||
if in_img_type == np.uint8:
|
||||
rlt = rlt.round()
|
||||
else:
|
||||
rlt /= 255.
|
||||
rlt /= 255.0
|
||||
return rlt.astype(in_img_type)
|
||||
|
||||
|
||||
def ycbcr2rgb(img):
|
||||
'''same as matlab ycbcr2rgb
|
||||
"""same as matlab ycbcr2rgb
|
||||
Input:
|
||||
uint8, [0, 255]
|
||||
float, [0, 1]
|
||||
'''
|
||||
"""
|
||||
in_img_type = img.dtype
|
||||
img.astype(np.float32)
|
||||
if in_img_type != np.uint8:
|
||||
img *= 255.
|
||||
img *= 255.0
|
||||
# convert
|
||||
rlt = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], [0, -0.00153632, 0.00791071],
|
||||
[0.00625893, -0.00318811, 0]]) * 255.0 + [-222.921, 135.576, -276.836]
|
||||
rlt = np.matmul(
|
||||
img,
|
||||
[
|
||||
[0.00456621, 0.00456621, 0.00456621],
|
||||
[0, -0.00153632, 0.00791071],
|
||||
[0.00625893, -0.00318811, 0],
|
||||
],
|
||||
) * 255.0 + [-222.921, 135.576, -276.836]
|
||||
if in_img_type == np.uint8:
|
||||
rlt = rlt.round()
|
||||
else:
|
||||
rlt /= 255.
|
||||
rlt /= 255.0
|
||||
return rlt.astype(in_img_type)
|
||||
|
||||
|
||||
def bgr2ycbcr(img, only_y=True):
|
||||
'''bgr version of rgb2ycbcr
|
||||
"""bgr version of rgb2ycbcr
|
||||
only_y: only return Y channel
|
||||
Input:
|
||||
uint8, [0, 255]
|
||||
float, [0, 1]
|
||||
'''
|
||||
"""
|
||||
in_img_type = img.dtype
|
||||
img.astype(np.float32)
|
||||
if in_img_type != np.uint8:
|
||||
img *= 255.
|
||||
img *= 255.0
|
||||
# convert
|
||||
if only_y:
|
||||
rlt = np.dot(img, [24.966, 128.553, 65.481]) / 255.0 + 16.0
|
||||
else:
|
||||
rlt = np.matmul(img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
|
||||
[65.481, -37.797, 112.0]]) / 255.0 + [16, 128, 128]
|
||||
rlt = np.matmul(
|
||||
img,
|
||||
[
|
||||
[24.966, 112.0, -18.214],
|
||||
[128.553, -74.203, -93.786],
|
||||
[65.481, -37.797, 112.0],
|
||||
],
|
||||
) / 255.0 + [16, 128, 128]
|
||||
if in_img_type == np.uint8:
|
||||
rlt = rlt.round()
|
||||
else:
|
||||
rlt /= 255.
|
||||
rlt /= 255.0
|
||||
return rlt.astype(in_img_type)
|
||||
|
||||
|
||||
@@ -608,11 +687,11 @@ def channel_convert(in_c, tar_type, img_list):
|
||||
return img_list
|
||||
|
||||
|
||||
'''
|
||||
"""
|
||||
# --------------------------------------------
|
||||
# metric, PSNR and SSIM
|
||||
# --------------------------------------------
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
# --------------------------------------------
|
||||
@@ -620,17 +699,17 @@ def channel_convert(in_c, tar_type, img_list):
|
||||
# --------------------------------------------
|
||||
def calculate_psnr(img1, img2, border=0):
|
||||
# img1 and img2 have range [0, 255]
|
||||
#img1 = img1.squeeze()
|
||||
#img2 = img2.squeeze()
|
||||
# img1 = img1.squeeze()
|
||||
# img2 = img2.squeeze()
|
||||
if not img1.shape == img2.shape:
|
||||
raise ValueError('Input images must have the same dimensions.')
|
||||
h, w = img1.shape[:2]
|
||||
img1 = img1[border:h-border, border:w-border]
|
||||
img2 = img2[border:h-border, border:w-border]
|
||||
img1 = img1[border : h - border, border : w - border]
|
||||
img2 = img2[border : h - border, border : w - border]
|
||||
|
||||
img1 = img1.astype(np.float64)
|
||||
img2 = img2.astype(np.float64)
|
||||
mse = np.mean((img1 - img2)**2)
|
||||
mse = np.mean((img1 - img2) ** 2)
|
||||
if mse == 0:
|
||||
return float('inf')
|
||||
return 20 * math.log10(255.0 / math.sqrt(mse))
|
||||
@@ -640,17 +719,17 @@ def calculate_psnr(img1, img2, border=0):
|
||||
# SSIM
|
||||
# --------------------------------------------
|
||||
def calculate_ssim(img1, img2, border=0):
|
||||
'''calculate SSIM
|
||||
"""calculate SSIM
|
||||
the same outputs as MATLAB's
|
||||
img1, img2: [0, 255]
|
||||
'''
|
||||
#img1 = img1.squeeze()
|
||||
#img2 = img2.squeeze()
|
||||
"""
|
||||
# img1 = img1.squeeze()
|
||||
# img2 = img2.squeeze()
|
||||
if not img1.shape == img2.shape:
|
||||
raise ValueError('Input images must have the same dimensions.')
|
||||
h, w = img1.shape[:2]
|
||||
img1 = img1[border:h-border, border:w-border]
|
||||
img2 = img2[border:h-border, border:w-border]
|
||||
img1 = img1[border : h - border, border : w - border]
|
||||
img2 = img2[border : h - border, border : w - border]
|
||||
|
||||
if img1.ndim == 2:
|
||||
return ssim(img1, img2)
|
||||
@@ -658,7 +737,7 @@ def calculate_ssim(img1, img2, border=0):
|
||||
if img1.shape[2] == 3:
|
||||
ssims = []
|
||||
for i in range(3):
|
||||
ssims.append(ssim(img1[:,:,i], img2[:,:,i]))
|
||||
ssims.append(ssim(img1[:, :, i], img2[:, :, i]))
|
||||
return np.array(ssims).mean()
|
||||
elif img1.shape[2] == 1:
|
||||
return ssim(np.squeeze(img1), np.squeeze(img2))
|
||||
@@ -667,8 +746,8 @@ def calculate_ssim(img1, img2, border=0):
|
||||
|
||||
|
||||
def ssim(img1, img2):
|
||||
C1 = (0.01 * 255)**2
|
||||
C2 = (0.03 * 255)**2
|
||||
C1 = (0.01 * 255) ** 2
|
||||
C2 = (0.03 * 255) ** 2
|
||||
|
||||
img1 = img1.astype(np.float64)
|
||||
img2 = img2.astype(np.float64)
|
||||
@@ -684,16 +763,17 @@ def ssim(img1, img2):
|
||||
sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
|
||||
sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
|
||||
|
||||
ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) *
|
||||
(sigma1_sq + sigma2_sq + C2))
|
||||
ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / (
|
||||
(mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
|
||||
)
|
||||
return ssim_map.mean()
|
||||
|
||||
|
||||
'''
|
||||
"""
|
||||
# --------------------------------------------
|
||||
# matlab's bicubic imresize (numpy and torch) [0, 1]
|
||||
# --------------------------------------------
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
# matlab 'imresize' function, now only support 'bicubic'
|
||||
@@ -701,11 +781,14 @@ def cubic(x):
|
||||
absx = torch.abs(x)
|
||||
absx2 = absx**2
|
||||
absx3 = absx**3
|
||||
return (1.5*absx3 - 2.5*absx2 + 1) * ((absx <= 1).type_as(absx)) + \
|
||||
(-0.5*absx3 + 2.5*absx2 - 4*absx + 2) * (((absx > 1)*(absx <= 2)).type_as(absx))
|
||||
return (1.5 * absx3 - 2.5 * absx2 + 1) * ((absx <= 1).type_as(absx)) + (
|
||||
-0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2
|
||||
) * (((absx > 1) * (absx <= 2)).type_as(absx))
|
||||
|
||||
|
||||
def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing):
|
||||
def calculate_weights_indices(
|
||||
in_length, out_length, scale, kernel, kernel_width, antialiasing
|
||||
):
|
||||
if (scale < 1) and (antialiasing):
|
||||
# Use a modified kernel to simultaneously interpolate and antialias- larger kernel width
|
||||
kernel_width = kernel_width / scale
|
||||
@@ -729,8 +812,9 @@ def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width
|
||||
|
||||
# The indices of the input pixels involved in computing the k-th output
|
||||
# pixel are in row k of the indices matrix.
|
||||
indices = left.view(out_length, 1).expand(out_length, P) + torch.linspace(0, P - 1, P).view(
|
||||
1, P).expand(out_length, P)
|
||||
indices = left.view(out_length, 1).expand(out_length, P) + torch.linspace(
|
||||
0, P - 1, P
|
||||
).view(1, P).expand(out_length, P)
|
||||
|
||||
# The weights used to compute the k-th output pixel are in row k of the
|
||||
# weights matrix.
|
||||
@@ -771,7 +855,11 @@ def imresize(img, scale, antialiasing=True):
|
||||
if need_squeeze:
|
||||
img.unsqueeze_(0)
|
||||
in_C, in_H, in_W = img.size()
|
||||
out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
|
||||
out_C, out_H, out_W = (
|
||||
in_C,
|
||||
math.ceil(in_H * scale),
|
||||
math.ceil(in_W * scale),
|
||||
)
|
||||
kernel_width = 4
|
||||
kernel = 'cubic'
|
||||
|
||||
@@ -782,9 +870,11 @@ def imresize(img, scale, antialiasing=True):
|
||||
|
||||
# get weights and indices
|
||||
weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
|
||||
in_H, out_H, scale, kernel, kernel_width, antialiasing)
|
||||
in_H, out_H, scale, kernel, kernel_width, antialiasing
|
||||
)
|
||||
weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
|
||||
in_W, out_W, scale, kernel, kernel_width, antialiasing)
|
||||
in_W, out_W, scale, kernel, kernel_width, antialiasing
|
||||
)
|
||||
# process H dimension
|
||||
# symmetric copying
|
||||
img_aug = torch.FloatTensor(in_C, in_H + sym_len_Hs + sym_len_He, in_W)
|
||||
@@ -805,7 +895,11 @@ def imresize(img, scale, antialiasing=True):
|
||||
for i in range(out_H):
|
||||
idx = int(indices_H[i][0])
|
||||
for j in range(out_C):
|
||||
out_1[j, i, :] = img_aug[j, idx:idx + kernel_width, :].transpose(0, 1).mv(weights_H[i])
|
||||
out_1[j, i, :] = (
|
||||
img_aug[j, idx : idx + kernel_width, :]
|
||||
.transpose(0, 1)
|
||||
.mv(weights_H[i])
|
||||
)
|
||||
|
||||
# process W dimension
|
||||
# symmetric copying
|
||||
@@ -827,7 +921,9 @@ def imresize(img, scale, antialiasing=True):
|
||||
for i in range(out_W):
|
||||
idx = int(indices_W[i][0])
|
||||
for j in range(out_C):
|
||||
out_2[j, :, i] = out_1_aug[j, :, idx:idx + kernel_width].mv(weights_W[i])
|
||||
out_2[j, :, i] = out_1_aug[j, :, idx : idx + kernel_width].mv(
|
||||
weights_W[i]
|
||||
)
|
||||
if need_squeeze:
|
||||
out_2.squeeze_()
|
||||
return out_2
|
||||
@@ -846,7 +942,11 @@ def imresize_np(img, scale, antialiasing=True):
|
||||
img.unsqueeze_(2)
|
||||
|
||||
in_H, in_W, in_C = img.size()
|
||||
out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
|
||||
out_C, out_H, out_W = (
|
||||
in_C,
|
||||
math.ceil(in_H * scale),
|
||||
math.ceil(in_W * scale),
|
||||
)
|
||||
kernel_width = 4
|
||||
kernel = 'cubic'
|
||||
|
||||
@@ -857,9 +957,11 @@ def imresize_np(img, scale, antialiasing=True):
|
||||
|
||||
# get weights and indices
|
||||
weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
|
||||
in_H, out_H, scale, kernel, kernel_width, antialiasing)
|
||||
in_H, out_H, scale, kernel, kernel_width, antialiasing
|
||||
)
|
||||
weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
|
||||
in_W, out_W, scale, kernel, kernel_width, antialiasing)
|
||||
in_W, out_W, scale, kernel, kernel_width, antialiasing
|
||||
)
|
||||
# process H dimension
|
||||
# symmetric copying
|
||||
img_aug = torch.FloatTensor(in_H + sym_len_Hs + sym_len_He, in_W, in_C)
|
||||
@@ -880,7 +982,11 @@ def imresize_np(img, scale, antialiasing=True):
|
||||
for i in range(out_H):
|
||||
idx = int(indices_H[i][0])
|
||||
for j in range(out_C):
|
||||
out_1[i, :, j] = img_aug[idx:idx + kernel_width, :, j].transpose(0, 1).mv(weights_H[i])
|
||||
out_1[i, :, j] = (
|
||||
img_aug[idx : idx + kernel_width, :, j]
|
||||
.transpose(0, 1)
|
||||
.mv(weights_H[i])
|
||||
)
|
||||
|
||||
# process W dimension
|
||||
# symmetric copying
|
||||
@@ -902,7 +1008,9 @@ def imresize_np(img, scale, antialiasing=True):
|
||||
for i in range(out_W):
|
||||
idx = int(indices_W[i][0])
|
||||
for j in range(out_C):
|
||||
out_2[:, i, j] = out_1_aug[:, idx:idx + kernel_width, j].mv(weights_W[i])
|
||||
out_2[:, i, j] = out_1_aug[:, idx : idx + kernel_width, j].mv(
|
||||
weights_W[i]
|
||||
)
|
||||
if need_squeeze:
|
||||
out_2.squeeze_()
|
||||
|
||||
@@ -913,4 +1021,4 @@ if __name__ == '__main__':
|
||||
print('---')
|
||||
# img = imread_uint('test.bmp', 3)
|
||||
# img = uint2single(img)
|
||||
# img_bicubic = imresize_np(img, 1/4)
|
||||
# img_bicubic = imresize_np(img, 1/4)
|
||||
|
||||
@@ -1 +1 @@
|
||||
from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator
|
||||
from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator
|
||||
|
||||
@@ -5,13 +5,24 @@ from taming.modules.losses.vqperceptual import * # TODO: taming dependency yes/
|
||||
|
||||
|
||||
class LPIPSWithDiscriminator(nn.Module):
|
||||
def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0,
|
||||
disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
|
||||
perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
|
||||
disc_loss="hinge"):
|
||||
def __init__(
|
||||
self,
|
||||
disc_start,
|
||||
logvar_init=0.0,
|
||||
kl_weight=1.0,
|
||||
pixelloss_weight=1.0,
|
||||
disc_num_layers=3,
|
||||
disc_in_channels=3,
|
||||
disc_factor=1.0,
|
||||
disc_weight=1.0,
|
||||
perceptual_weight=1.0,
|
||||
use_actnorm=False,
|
||||
disc_conditional=False,
|
||||
disc_loss='hinge',
|
||||
):
|
||||
|
||||
super().__init__()
|
||||
assert disc_loss in ["hinge", "vanilla"]
|
||||
assert disc_loss in ['hinge', 'vanilla']
|
||||
self.kl_weight = kl_weight
|
||||
self.pixel_weight = pixelloss_weight
|
||||
self.perceptual_loss = LPIPS().eval()
|
||||
@@ -19,42 +30,68 @@ class LPIPSWithDiscriminator(nn.Module):
|
||||
# output log variance
|
||||
self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)
|
||||
|
||||
self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
|
||||
n_layers=disc_num_layers,
|
||||
use_actnorm=use_actnorm
|
||||
).apply(weights_init)
|
||||
self.discriminator = NLayerDiscriminator(
|
||||
input_nc=disc_in_channels,
|
||||
n_layers=disc_num_layers,
|
||||
use_actnorm=use_actnorm,
|
||||
).apply(weights_init)
|
||||
self.discriminator_iter_start = disc_start
|
||||
self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
|
||||
self.disc_loss = (
|
||||
hinge_d_loss if disc_loss == 'hinge' else vanilla_d_loss
|
||||
)
|
||||
self.disc_factor = disc_factor
|
||||
self.discriminator_weight = disc_weight
|
||||
self.disc_conditional = disc_conditional
|
||||
|
||||
def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
|
||||
if last_layer is not None:
|
||||
nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
|
||||
g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
|
||||
nll_grads = torch.autograd.grad(
|
||||
nll_loss, last_layer, retain_graph=True
|
||||
)[0]
|
||||
g_grads = torch.autograd.grad(
|
||||
g_loss, last_layer, retain_graph=True
|
||||
)[0]
|
||||
else:
|
||||
nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
|
||||
g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
|
||||
nll_grads = torch.autograd.grad(
|
||||
nll_loss, self.last_layer[0], retain_graph=True
|
||||
)[0]
|
||||
g_grads = torch.autograd.grad(
|
||||
g_loss, self.last_layer[0], retain_graph=True
|
||||
)[0]
|
||||
|
||||
d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
|
||||
d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
|
||||
d_weight = d_weight * self.discriminator_weight
|
||||
return d_weight
|
||||
|
||||
def forward(self, inputs, reconstructions, posteriors, optimizer_idx,
|
||||
global_step, last_layer=None, cond=None, split="train",
|
||||
weights=None):
|
||||
rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
|
||||
def forward(
|
||||
self,
|
||||
inputs,
|
||||
reconstructions,
|
||||
posteriors,
|
||||
optimizer_idx,
|
||||
global_step,
|
||||
last_layer=None,
|
||||
cond=None,
|
||||
split='train',
|
||||
weights=None,
|
||||
):
|
||||
rec_loss = torch.abs(
|
||||
inputs.contiguous() - reconstructions.contiguous()
|
||||
)
|
||||
if self.perceptual_weight > 0:
|
||||
p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
|
||||
p_loss = self.perceptual_loss(
|
||||
inputs.contiguous(), reconstructions.contiguous()
|
||||
)
|
||||
rec_loss = rec_loss + self.perceptual_weight * p_loss
|
||||
|
||||
nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
|
||||
weighted_nll_loss = nll_loss
|
||||
if weights is not None:
|
||||
weighted_nll_loss = weights*nll_loss
|
||||
weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
|
||||
weighted_nll_loss = weights * nll_loss
|
||||
weighted_nll_loss = (
|
||||
torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
|
||||
)
|
||||
nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
|
||||
kl_loss = posteriors.kl()
|
||||
kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
|
||||
@@ -67,45 +104,72 @@ class LPIPSWithDiscriminator(nn.Module):
|
||||
logits_fake = self.discriminator(reconstructions.contiguous())
|
||||
else:
|
||||
assert self.disc_conditional
|
||||
logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
|
||||
logits_fake = self.discriminator(
|
||||
torch.cat((reconstructions.contiguous(), cond), dim=1)
|
||||
)
|
||||
g_loss = -torch.mean(logits_fake)
|
||||
|
||||
if self.disc_factor > 0.0:
|
||||
try:
|
||||
d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
|
||||
d_weight = self.calculate_adaptive_weight(
|
||||
nll_loss, g_loss, last_layer=last_layer
|
||||
)
|
||||
except RuntimeError:
|
||||
assert not self.training
|
||||
d_weight = torch.tensor(0.0)
|
||||
else:
|
||||
d_weight = torch.tensor(0.0)
|
||||
|
||||
disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
|
||||
loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss
|
||||
disc_factor = adopt_weight(
|
||||
self.disc_factor,
|
||||
global_step,
|
||||
threshold=self.discriminator_iter_start,
|
||||
)
|
||||
loss = (
|
||||
weighted_nll_loss
|
||||
+ self.kl_weight * kl_loss
|
||||
+ d_weight * disc_factor * g_loss
|
||||
)
|
||||
|
||||
log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/logvar".format(split): self.logvar.detach(),
|
||||
"{}/kl_loss".format(split): kl_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(),
|
||||
"{}/rec_loss".format(split): rec_loss.detach().mean(),
|
||||
"{}/d_weight".format(split): d_weight.detach(),
|
||||
"{}/disc_factor".format(split): torch.tensor(disc_factor),
|
||||
"{}/g_loss".format(split): g_loss.detach().mean(),
|
||||
}
|
||||
log = {
|
||||
'{}/total_loss'.format(split): loss.clone().detach().mean(),
|
||||
'{}/logvar'.format(split): self.logvar.detach(),
|
||||
'{}/kl_loss'.format(split): kl_loss.detach().mean(),
|
||||
'{}/nll_loss'.format(split): nll_loss.detach().mean(),
|
||||
'{}/rec_loss'.format(split): rec_loss.detach().mean(),
|
||||
'{}/d_weight'.format(split): d_weight.detach(),
|
||||
'{}/disc_factor'.format(split): torch.tensor(disc_factor),
|
||||
'{}/g_loss'.format(split): g_loss.detach().mean(),
|
||||
}
|
||||
return loss, log
|
||||
|
||||
if optimizer_idx == 1:
|
||||
# second pass for discriminator update
|
||||
if cond is None:
|
||||
logits_real = self.discriminator(inputs.contiguous().detach())
|
||||
logits_fake = self.discriminator(reconstructions.contiguous().detach())
|
||||
logits_fake = self.discriminator(
|
||||
reconstructions.contiguous().detach()
|
||||
)
|
||||
else:
|
||||
logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
|
||||
logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
|
||||
logits_real = self.discriminator(
|
||||
torch.cat((inputs.contiguous().detach(), cond), dim=1)
|
||||
)
|
||||
logits_fake = self.discriminator(
|
||||
torch.cat(
|
||||
(reconstructions.contiguous().detach(), cond), dim=1
|
||||
)
|
||||
)
|
||||
|
||||
disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
|
||||
disc_factor = adopt_weight(
|
||||
self.disc_factor,
|
||||
global_step,
|
||||
threshold=self.discriminator_iter_start,
|
||||
)
|
||||
d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
|
||||
|
||||
log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
|
||||
"{}/logits_real".format(split): logits_real.detach().mean(),
|
||||
"{}/logits_fake".format(split): logits_fake.detach().mean()
|
||||
}
|
||||
log = {
|
||||
'{}/disc_loss'.format(split): d_loss.clone().detach().mean(),
|
||||
'{}/logits_real'.format(split): logits_real.detach().mean(),
|
||||
'{}/logits_fake'.format(split): logits_fake.detach().mean(),
|
||||
}
|
||||
return d_loss, log
|
||||
|
||||
|
||||
@@ -3,21 +3,25 @@ from torch import nn
|
||||
import torch.nn.functional as F
|
||||
from einops import repeat
|
||||
|
||||
from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
|
||||
from taming.modules.discriminator.model import (
|
||||
NLayerDiscriminator,
|
||||
weights_init,
|
||||
)
|
||||
from taming.modules.losses.lpips import LPIPS
|
||||
from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss
|
||||
|
||||
|
||||
def hinge_d_loss_with_exemplar_weights(logits_real, logits_fake, weights):
|
||||
assert weights.shape[0] == logits_real.shape[0] == logits_fake.shape[0]
|
||||
loss_real = torch.mean(F.relu(1. - logits_real), dim=[1,2,3])
|
||||
loss_fake = torch.mean(F.relu(1. + logits_fake), dim=[1,2,3])
|
||||
loss_real = torch.mean(F.relu(1.0 - logits_real), dim=[1, 2, 3])
|
||||
loss_fake = torch.mean(F.relu(1.0 + logits_fake), dim=[1, 2, 3])
|
||||
loss_real = (weights * loss_real).sum() / weights.sum()
|
||||
loss_fake = (weights * loss_fake).sum() / weights.sum()
|
||||
d_loss = 0.5 * (loss_real + loss_fake)
|
||||
return d_loss
|
||||
|
||||
def adopt_weight(weight, global_step, threshold=0, value=0.):
|
||||
|
||||
def adopt_weight(weight, global_step, threshold=0, value=0.0):
|
||||
if global_step < threshold:
|
||||
weight = value
|
||||
return weight
|
||||
@@ -26,57 +30,76 @@ def adopt_weight(weight, global_step, threshold=0, value=0.):
|
||||
def measure_perplexity(predicted_indices, n_embed):
|
||||
# src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py
|
||||
# eval cluster perplexity. when perplexity == num_embeddings then all clusters are used exactly equally
|
||||
encodings = F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed)
|
||||
encodings = (
|
||||
F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed)
|
||||
)
|
||||
avg_probs = encodings.mean(0)
|
||||
perplexity = (-(avg_probs * torch.log(avg_probs + 1e-10)).sum()).exp()
|
||||
cluster_use = torch.sum(avg_probs > 0)
|
||||
return perplexity, cluster_use
|
||||
|
||||
|
||||
def l1(x, y):
|
||||
return torch.abs(x-y)
|
||||
return torch.abs(x - y)
|
||||
|
||||
|
||||
def l2(x, y):
|
||||
return torch.pow((x-y), 2)
|
||||
return torch.pow((x - y), 2)
|
||||
|
||||
|
||||
class VQLPIPSWithDiscriminator(nn.Module):
|
||||
def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
|
||||
disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
|
||||
perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
|
||||
disc_ndf=64, disc_loss="hinge", n_classes=None, perceptual_loss="lpips",
|
||||
pixel_loss="l1"):
|
||||
def __init__(
|
||||
self,
|
||||
disc_start,
|
||||
codebook_weight=1.0,
|
||||
pixelloss_weight=1.0,
|
||||
disc_num_layers=3,
|
||||
disc_in_channels=3,
|
||||
disc_factor=1.0,
|
||||
disc_weight=1.0,
|
||||
perceptual_weight=1.0,
|
||||
use_actnorm=False,
|
||||
disc_conditional=False,
|
||||
disc_ndf=64,
|
||||
disc_loss='hinge',
|
||||
n_classes=None,
|
||||
perceptual_loss='lpips',
|
||||
pixel_loss='l1',
|
||||
):
|
||||
super().__init__()
|
||||
assert disc_loss in ["hinge", "vanilla"]
|
||||
assert perceptual_loss in ["lpips", "clips", "dists"]
|
||||
assert pixel_loss in ["l1", "l2"]
|
||||
assert disc_loss in ['hinge', 'vanilla']
|
||||
assert perceptual_loss in ['lpips', 'clips', 'dists']
|
||||
assert pixel_loss in ['l1', 'l2']
|
||||
self.codebook_weight = codebook_weight
|
||||
self.pixel_weight = pixelloss_weight
|
||||
if perceptual_loss == "lpips":
|
||||
print(f"{self.__class__.__name__}: Running with LPIPS.")
|
||||
if perceptual_loss == 'lpips':
|
||||
print(f'{self.__class__.__name__}: Running with LPIPS.')
|
||||
self.perceptual_loss = LPIPS().eval()
|
||||
else:
|
||||
raise ValueError(f"Unknown perceptual loss: >> {perceptual_loss} <<")
|
||||
raise ValueError(
|
||||
f'Unknown perceptual loss: >> {perceptual_loss} <<'
|
||||
)
|
||||
self.perceptual_weight = perceptual_weight
|
||||
|
||||
if pixel_loss == "l1":
|
||||
if pixel_loss == 'l1':
|
||||
self.pixel_loss = l1
|
||||
else:
|
||||
self.pixel_loss = l2
|
||||
|
||||
self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
|
||||
n_layers=disc_num_layers,
|
||||
use_actnorm=use_actnorm,
|
||||
ndf=disc_ndf
|
||||
).apply(weights_init)
|
||||
self.discriminator = NLayerDiscriminator(
|
||||
input_nc=disc_in_channels,
|
||||
n_layers=disc_num_layers,
|
||||
use_actnorm=use_actnorm,
|
||||
ndf=disc_ndf,
|
||||
).apply(weights_init)
|
||||
self.discriminator_iter_start = disc_start
|
||||
if disc_loss == "hinge":
|
||||
if disc_loss == 'hinge':
|
||||
self.disc_loss = hinge_d_loss
|
||||
elif disc_loss == "vanilla":
|
||||
elif disc_loss == 'vanilla':
|
||||
self.disc_loss = vanilla_d_loss
|
||||
else:
|
||||
raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
|
||||
print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.")
|
||||
print(f'VQLPIPSWithDiscriminator running with {disc_loss} loss.')
|
||||
self.disc_factor = disc_factor
|
||||
self.discriminator_weight = disc_weight
|
||||
self.disc_conditional = disc_conditional
|
||||
@@ -84,31 +107,53 @@ class VQLPIPSWithDiscriminator(nn.Module):
|
||||
|
||||
def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
|
||||
if last_layer is not None:
|
||||
nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
|
||||
g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
|
||||
nll_grads = torch.autograd.grad(
|
||||
nll_loss, last_layer, retain_graph=True
|
||||
)[0]
|
||||
g_grads = torch.autograd.grad(
|
||||
g_loss, last_layer, retain_graph=True
|
||||
)[0]
|
||||
else:
|
||||
nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
|
||||
g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
|
||||
nll_grads = torch.autograd.grad(
|
||||
nll_loss, self.last_layer[0], retain_graph=True
|
||||
)[0]
|
||||
g_grads = torch.autograd.grad(
|
||||
g_loss, self.last_layer[0], retain_graph=True
|
||||
)[0]
|
||||
|
||||
d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
|
||||
d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
|
||||
d_weight = d_weight * self.discriminator_weight
|
||||
return d_weight
|
||||
|
||||
def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx,
|
||||
global_step, last_layer=None, cond=None, split="train", predicted_indices=None):
|
||||
def forward(
|
||||
self,
|
||||
codebook_loss,
|
||||
inputs,
|
||||
reconstructions,
|
||||
optimizer_idx,
|
||||
global_step,
|
||||
last_layer=None,
|
||||
cond=None,
|
||||
split='train',
|
||||
predicted_indices=None,
|
||||
):
|
||||
if not exists(codebook_loss):
|
||||
codebook_loss = torch.tensor([0.]).to(inputs.device)
|
||||
#rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
|
||||
rec_loss = self.pixel_loss(inputs.contiguous(), reconstructions.contiguous())
|
||||
codebook_loss = torch.tensor([0.0]).to(inputs.device)
|
||||
# rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
|
||||
rec_loss = self.pixel_loss(
|
||||
inputs.contiguous(), reconstructions.contiguous()
|
||||
)
|
||||
if self.perceptual_weight > 0:
|
||||
p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
|
||||
p_loss = self.perceptual_loss(
|
||||
inputs.contiguous(), reconstructions.contiguous()
|
||||
)
|
||||
rec_loss = rec_loss + self.perceptual_weight * p_loss
|
||||
else:
|
||||
p_loss = torch.tensor([0.0])
|
||||
|
||||
nll_loss = rec_loss
|
||||
#nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
|
||||
# nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
|
||||
nll_loss = torch.mean(nll_loss)
|
||||
|
||||
# now the GAN part
|
||||
@@ -119,49 +164,77 @@ class VQLPIPSWithDiscriminator(nn.Module):
|
||||
logits_fake = self.discriminator(reconstructions.contiguous())
|
||||
else:
|
||||
assert self.disc_conditional
|
||||
logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
|
||||
logits_fake = self.discriminator(
|
||||
torch.cat((reconstructions.contiguous(), cond), dim=1)
|
||||
)
|
||||
g_loss = -torch.mean(logits_fake)
|
||||
|
||||
try:
|
||||
d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
|
||||
d_weight = self.calculate_adaptive_weight(
|
||||
nll_loss, g_loss, last_layer=last_layer
|
||||
)
|
||||
except RuntimeError:
|
||||
assert not self.training
|
||||
d_weight = torch.tensor(0.0)
|
||||
|
||||
disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
|
||||
loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean()
|
||||
disc_factor = adopt_weight(
|
||||
self.disc_factor,
|
||||
global_step,
|
||||
threshold=self.discriminator_iter_start,
|
||||
)
|
||||
loss = (
|
||||
nll_loss
|
||||
+ d_weight * disc_factor * g_loss
|
||||
+ self.codebook_weight * codebook_loss.mean()
|
||||
)
|
||||
|
||||
log = {"{}/total_loss".format(split): loss.clone().detach().mean(),
|
||||
"{}/quant_loss".format(split): codebook_loss.detach().mean(),
|
||||
"{}/nll_loss".format(split): nll_loss.detach().mean(),
|
||||
"{}/rec_loss".format(split): rec_loss.detach().mean(),
|
||||
"{}/p_loss".format(split): p_loss.detach().mean(),
|
||||
"{}/d_weight".format(split): d_weight.detach(),
|
||||
"{}/disc_factor".format(split): torch.tensor(disc_factor),
|
||||
"{}/g_loss".format(split): g_loss.detach().mean(),
|
||||
}
|
||||
log = {
|
||||
'{}/total_loss'.format(split): loss.clone().detach().mean(),
|
||||
'{}/quant_loss'.format(split): codebook_loss.detach().mean(),
|
||||
'{}/nll_loss'.format(split): nll_loss.detach().mean(),
|
||||
'{}/rec_loss'.format(split): rec_loss.detach().mean(),
|
||||
'{}/p_loss'.format(split): p_loss.detach().mean(),
|
||||
'{}/d_weight'.format(split): d_weight.detach(),
|
||||
'{}/disc_factor'.format(split): torch.tensor(disc_factor),
|
||||
'{}/g_loss'.format(split): g_loss.detach().mean(),
|
||||
}
|
||||
if predicted_indices is not None:
|
||||
assert self.n_classes is not None
|
||||
with torch.no_grad():
|
||||
perplexity, cluster_usage = measure_perplexity(predicted_indices, self.n_classes)
|
||||
log[f"{split}/perplexity"] = perplexity
|
||||
log[f"{split}/cluster_usage"] = cluster_usage
|
||||
perplexity, cluster_usage = measure_perplexity(
|
||||
predicted_indices, self.n_classes
|
||||
)
|
||||
log[f'{split}/perplexity'] = perplexity
|
||||
log[f'{split}/cluster_usage'] = cluster_usage
|
||||
return loss, log
|
||||
|
||||
if optimizer_idx == 1:
|
||||
# second pass for discriminator update
|
||||
if cond is None:
|
||||
logits_real = self.discriminator(inputs.contiguous().detach())
|
||||
logits_fake = self.discriminator(reconstructions.contiguous().detach())
|
||||
logits_fake = self.discriminator(
|
||||
reconstructions.contiguous().detach()
|
||||
)
|
||||
else:
|
||||
logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
|
||||
logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
|
||||
logits_real = self.discriminator(
|
||||
torch.cat((inputs.contiguous().detach(), cond), dim=1)
|
||||
)
|
||||
logits_fake = self.discriminator(
|
||||
torch.cat(
|
||||
(reconstructions.contiguous().detach(), cond), dim=1
|
||||
)
|
||||
)
|
||||
|
||||
disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
|
||||
disc_factor = adopt_weight(
|
||||
self.disc_factor,
|
||||
global_step,
|
||||
threshold=self.discriminator_iter_start,
|
||||
)
|
||||
d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
|
||||
|
||||
log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
|
||||
"{}/logits_real".format(split): logits_real.detach().mean(),
|
||||
"{}/logits_fake".format(split): logits_fake.detach().mean()
|
||||
}
|
||||
log = {
|
||||
'{}/disc_loss'.format(split): d_loss.clone().detach().mean(),
|
||||
'{}/logits_real'.format(split): logits_real.detach().mean(),
|
||||
'{}/logits_fake'.format(split): logits_fake.detach().mean(),
|
||||
}
|
||||
return d_loss, log
|
||||
|
||||
@@ -11,15 +11,13 @@ from einops import rearrange, repeat, reduce
|
||||
|
||||
DEFAULT_DIM_HEAD = 64
|
||||
|
||||
Intermediates = namedtuple('Intermediates', [
|
||||
'pre_softmax_attn',
|
||||
'post_softmax_attn'
|
||||
])
|
||||
Intermediates = namedtuple(
|
||||
'Intermediates', ['pre_softmax_attn', 'post_softmax_attn']
|
||||
)
|
||||
|
||||
LayerIntermediates = namedtuple('Intermediates', [
|
||||
'hiddens',
|
||||
'attn_intermediates'
|
||||
])
|
||||
LayerIntermediates = namedtuple(
|
||||
'Intermediates', ['hiddens', 'attn_intermediates']
|
||||
)
|
||||
|
||||
|
||||
class AbsolutePositionalEmbedding(nn.Module):
|
||||
@@ -39,11 +37,16 @@ class AbsolutePositionalEmbedding(nn.Module):
|
||||
class FixedPositionalEmbedding(nn.Module):
|
||||
def __init__(self, dim):
|
||||
super().__init__()
|
||||
inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim))
|
||||
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
|
||||
self.register_buffer('inv_freq', inv_freq)
|
||||
|
||||
def forward(self, x, seq_dim=1, offset=0):
|
||||
t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset
|
||||
t = (
|
||||
torch.arange(x.shape[seq_dim], device=x.device).type_as(
|
||||
self.inv_freq
|
||||
)
|
||||
+ offset
|
||||
)
|
||||
sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq)
|
||||
emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1)
|
||||
return emb[None, :, :]
|
||||
@@ -51,6 +54,7 @@ class FixedPositionalEmbedding(nn.Module):
|
||||
|
||||
# helpers
|
||||
|
||||
|
||||
def exists(val):
|
||||
return val is not None
|
||||
|
||||
@@ -64,18 +68,21 @@ def default(val, d):
|
||||
def always(val):
|
||||
def inner(*args, **kwargs):
|
||||
return val
|
||||
|
||||
return inner
|
||||
|
||||
|
||||
def not_equals(val):
|
||||
def inner(x):
|
||||
return x != val
|
||||
|
||||
return inner
|
||||
|
||||
|
||||
def equals(val):
|
||||
def inner(x):
|
||||
return x == val
|
||||
|
||||
return inner
|
||||
|
||||
|
||||
@@ -85,6 +92,7 @@ def max_neg_value(tensor):
|
||||
|
||||
# keyword argument helpers
|
||||
|
||||
|
||||
def pick_and_pop(keys, d):
|
||||
values = list(map(lambda key: d.pop(key), keys))
|
||||
return dict(zip(keys, values))
|
||||
@@ -108,8 +116,15 @@ def group_by_key_prefix(prefix, d):
|
||||
|
||||
|
||||
def groupby_prefix_and_trim(prefix, d):
|
||||
kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
|
||||
kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items())))
|
||||
kwargs_with_prefix, kwargs = group_dict_by_key(
|
||||
partial(string_begins_with, prefix), d
|
||||
)
|
||||
kwargs_without_prefix = dict(
|
||||
map(
|
||||
lambda x: (x[0][len(prefix) :], x[1]),
|
||||
tuple(kwargs_with_prefix.items()),
|
||||
)
|
||||
)
|
||||
return kwargs_without_prefix, kwargs
|
||||
|
||||
|
||||
@@ -139,7 +154,7 @@ class Rezero(nn.Module):
|
||||
class ScaleNorm(nn.Module):
|
||||
def __init__(self, dim, eps=1e-5):
|
||||
super().__init__()
|
||||
self.scale = dim ** -0.5
|
||||
self.scale = dim**-0.5
|
||||
self.eps = eps
|
||||
self.g = nn.Parameter(torch.ones(1))
|
||||
|
||||
@@ -151,7 +166,7 @@ class ScaleNorm(nn.Module):
|
||||
class RMSNorm(nn.Module):
|
||||
def __init__(self, dim, eps=1e-8):
|
||||
super().__init__()
|
||||
self.scale = dim ** -0.5
|
||||
self.scale = dim**-0.5
|
||||
self.eps = eps
|
||||
self.g = nn.Parameter(torch.ones(dim))
|
||||
|
||||
@@ -173,7 +188,7 @@ class GRUGating(nn.Module):
|
||||
def forward(self, x, residual):
|
||||
gated_output = self.gru(
|
||||
rearrange(x, 'b n d -> (b n) d'),
|
||||
rearrange(residual, 'b n d -> (b n) d')
|
||||
rearrange(residual, 'b n d -> (b n) d'),
|
||||
)
|
||||
|
||||
return gated_output.reshape_as(x)
|
||||
@@ -181,6 +196,7 @@ class GRUGating(nn.Module):
|
||||
|
||||
# feedforward
|
||||
|
||||
|
||||
class GEGLU(nn.Module):
|
||||
def __init__(self, dim_in, dim_out):
|
||||
super().__init__()
|
||||
@@ -192,19 +208,18 @@ class GEGLU(nn.Module):
|
||||
|
||||
|
||||
class FeedForward(nn.Module):
|
||||
def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
|
||||
def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
|
||||
super().__init__()
|
||||
inner_dim = int(dim * mult)
|
||||
dim_out = default(dim_out, dim)
|
||||
project_in = nn.Sequential(
|
||||
nn.Linear(dim, inner_dim),
|
||||
nn.GELU()
|
||||
) if not glu else GEGLU(dim, inner_dim)
|
||||
project_in = (
|
||||
nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
|
||||
if not glu
|
||||
else GEGLU(dim, inner_dim)
|
||||
)
|
||||
|
||||
self.net = nn.Sequential(
|
||||
project_in,
|
||||
nn.Dropout(dropout),
|
||||
nn.Linear(inner_dim, dim_out)
|
||||
project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
@@ -214,23 +229,25 @@ class FeedForward(nn.Module):
|
||||
# attention.
|
||||
class Attention(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
dim,
|
||||
dim_head=DEFAULT_DIM_HEAD,
|
||||
heads=8,
|
||||
causal=False,
|
||||
mask=None,
|
||||
talking_heads=False,
|
||||
sparse_topk=None,
|
||||
use_entmax15=False,
|
||||
num_mem_kv=0,
|
||||
dropout=0.,
|
||||
on_attn=False
|
||||
self,
|
||||
dim,
|
||||
dim_head=DEFAULT_DIM_HEAD,
|
||||
heads=8,
|
||||
causal=False,
|
||||
mask=None,
|
||||
talking_heads=False,
|
||||
sparse_topk=None,
|
||||
use_entmax15=False,
|
||||
num_mem_kv=0,
|
||||
dropout=0.0,
|
||||
on_attn=False,
|
||||
):
|
||||
super().__init__()
|
||||
if use_entmax15:
|
||||
raise NotImplementedError("Check out entmax activation instead of softmax activation!")
|
||||
self.scale = dim_head ** -0.5
|
||||
raise NotImplementedError(
|
||||
'Check out entmax activation instead of softmax activation!'
|
||||
)
|
||||
self.scale = dim_head**-0.5
|
||||
self.heads = heads
|
||||
self.causal = causal
|
||||
self.mask = mask
|
||||
@@ -252,7 +269,7 @@ class Attention(nn.Module):
|
||||
self.sparse_topk = sparse_topk
|
||||
|
||||
# entmax
|
||||
#self.attn_fn = entmax15 if use_entmax15 else F.softmax
|
||||
# self.attn_fn = entmax15 if use_entmax15 else F.softmax
|
||||
self.attn_fn = F.softmax
|
||||
|
||||
# add memory key / values
|
||||
@@ -263,20 +280,29 @@ class Attention(nn.Module):
|
||||
|
||||
# attention on attention
|
||||
self.attn_on_attn = on_attn
|
||||
self.to_out = nn.Sequential(nn.Linear(inner_dim, dim * 2), nn.GLU()) if on_attn else nn.Linear(inner_dim, dim)
|
||||
self.to_out = (
|
||||
nn.Sequential(nn.Linear(inner_dim, dim * 2), nn.GLU())
|
||||
if on_attn
|
||||
else nn.Linear(inner_dim, dim)
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x,
|
||||
context=None,
|
||||
mask=None,
|
||||
context_mask=None,
|
||||
rel_pos=None,
|
||||
sinusoidal_emb=None,
|
||||
prev_attn=None,
|
||||
mem=None
|
||||
self,
|
||||
x,
|
||||
context=None,
|
||||
mask=None,
|
||||
context_mask=None,
|
||||
rel_pos=None,
|
||||
sinusoidal_emb=None,
|
||||
prev_attn=None,
|
||||
mem=None,
|
||||
):
|
||||
b, n, _, h, talking_heads, device = *x.shape, self.heads, self.talking_heads, x.device
|
||||
b, n, _, h, talking_heads, device = (
|
||||
*x.shape,
|
||||
self.heads,
|
||||
self.talking_heads,
|
||||
x.device,
|
||||
)
|
||||
kv_input = default(context, x)
|
||||
|
||||
q_input = x
|
||||
@@ -297,23 +323,35 @@ class Attention(nn.Module):
|
||||
k = self.to_k(k_input)
|
||||
v = self.to_v(v_input)
|
||||
|
||||
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v))
|
||||
q, k, v = map(
|
||||
lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v)
|
||||
)
|
||||
|
||||
input_mask = None
|
||||
if any(map(exists, (mask, context_mask))):
|
||||
q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool())
|
||||
q_mask = default(
|
||||
mask, lambda: torch.ones((b, n), device=device).bool()
|
||||
)
|
||||
k_mask = q_mask if not exists(context) else context_mask
|
||||
k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool())
|
||||
k_mask = default(
|
||||
k_mask,
|
||||
lambda: torch.ones((b, k.shape[-2]), device=device).bool(),
|
||||
)
|
||||
q_mask = rearrange(q_mask, 'b i -> b () i ()')
|
||||
k_mask = rearrange(k_mask, 'b j -> b () () j')
|
||||
input_mask = q_mask * k_mask
|
||||
|
||||
if self.num_mem_kv > 0:
|
||||
mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), (self.mem_k, self.mem_v))
|
||||
mem_k, mem_v = map(
|
||||
lambda t: repeat(t, 'h n d -> b h n d', b=b),
|
||||
(self.mem_k, self.mem_v),
|
||||
)
|
||||
k = torch.cat((mem_k, k), dim=-2)
|
||||
v = torch.cat((mem_v, v), dim=-2)
|
||||
if exists(input_mask):
|
||||
input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True)
|
||||
input_mask = F.pad(
|
||||
input_mask, (self.num_mem_kv, 0), value=True
|
||||
)
|
||||
|
||||
dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
|
||||
mask_value = max_neg_value(dots)
|
||||
@@ -324,7 +362,9 @@ class Attention(nn.Module):
|
||||
pre_softmax_attn = dots
|
||||
|
||||
if talking_heads:
|
||||
dots = einsum('b h i j, h k -> b k i j', dots, self.pre_softmax_proj).contiguous()
|
||||
dots = einsum(
|
||||
'b h i j, h k -> b k i j', dots, self.pre_softmax_proj
|
||||
).contiguous()
|
||||
|
||||
if exists(rel_pos):
|
||||
dots = rel_pos(dots)
|
||||
@@ -336,7 +376,9 @@ class Attention(nn.Module):
|
||||
if self.causal:
|
||||
i, j = dots.shape[-2:]
|
||||
r = torch.arange(i, device=device)
|
||||
mask = rearrange(r, 'i -> () () i ()') < rearrange(r, 'j -> () () () j')
|
||||
mask = rearrange(r, 'i -> () () i ()') < rearrange(
|
||||
r, 'j -> () () () j'
|
||||
)
|
||||
mask = F.pad(mask, (j - i, 0), value=False)
|
||||
dots.masked_fill_(mask, mask_value)
|
||||
del mask
|
||||
@@ -354,14 +396,16 @@ class Attention(nn.Module):
|
||||
attn = self.dropout(attn)
|
||||
|
||||
if talking_heads:
|
||||
attn = einsum('b h i j, h k -> b k i j', attn, self.post_softmax_proj).contiguous()
|
||||
attn = einsum(
|
||||
'b h i j, h k -> b k i j', attn, self.post_softmax_proj
|
||||
).contiguous()
|
||||
|
||||
out = einsum('b h i j, b h j d -> b h i d', attn, v)
|
||||
out = rearrange(out, 'b h n d -> b n (h d)')
|
||||
|
||||
intermediates = Intermediates(
|
||||
pre_softmax_attn=pre_softmax_attn,
|
||||
post_softmax_attn=post_softmax_attn
|
||||
post_softmax_attn=post_softmax_attn,
|
||||
)
|
||||
|
||||
return self.to_out(out), intermediates
|
||||
@@ -369,28 +413,28 @@ class Attention(nn.Module):
|
||||
|
||||
class AttentionLayers(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
dim,
|
||||
depth,
|
||||
heads=8,
|
||||
causal=False,
|
||||
cross_attend=False,
|
||||
only_cross=False,
|
||||
use_scalenorm=False,
|
||||
use_rmsnorm=False,
|
||||
use_rezero=False,
|
||||
rel_pos_num_buckets=32,
|
||||
rel_pos_max_distance=128,
|
||||
position_infused_attn=False,
|
||||
custom_layers=None,
|
||||
sandwich_coef=None,
|
||||
par_ratio=None,
|
||||
residual_attn=False,
|
||||
cross_residual_attn=False,
|
||||
macaron=False,
|
||||
pre_norm=True,
|
||||
gate_residual=False,
|
||||
**kwargs
|
||||
self,
|
||||
dim,
|
||||
depth,
|
||||
heads=8,
|
||||
causal=False,
|
||||
cross_attend=False,
|
||||
only_cross=False,
|
||||
use_scalenorm=False,
|
||||
use_rmsnorm=False,
|
||||
use_rezero=False,
|
||||
rel_pos_num_buckets=32,
|
||||
rel_pos_max_distance=128,
|
||||
position_infused_attn=False,
|
||||
custom_layers=None,
|
||||
sandwich_coef=None,
|
||||
par_ratio=None,
|
||||
residual_attn=False,
|
||||
cross_residual_attn=False,
|
||||
macaron=False,
|
||||
pre_norm=True,
|
||||
gate_residual=False,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__()
|
||||
ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs)
|
||||
@@ -403,10 +447,14 @@ class AttentionLayers(nn.Module):
|
||||
self.layers = nn.ModuleList([])
|
||||
|
||||
self.has_pos_emb = position_infused_attn
|
||||
self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None
|
||||
self.pia_pos_emb = (
|
||||
FixedPositionalEmbedding(dim) if position_infused_attn else None
|
||||
)
|
||||
self.rotary_pos_emb = always(None)
|
||||
|
||||
assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance'
|
||||
assert (
|
||||
rel_pos_num_buckets <= rel_pos_max_distance
|
||||
), 'number of relative position buckets must be less than the relative position max distance'
|
||||
self.rel_pos = None
|
||||
|
||||
self.pre_norm = pre_norm
|
||||
@@ -438,15 +486,27 @@ class AttentionLayers(nn.Module):
|
||||
assert 1 < par_ratio <= par_depth, 'par ratio out of range'
|
||||
default_block = tuple(filter(not_equals('f'), default_block))
|
||||
par_attn = par_depth // par_ratio
|
||||
depth_cut = par_depth * 2 // 3 # 2 / 3 attention layer cutoff suggested by PAR paper
|
||||
depth_cut = (
|
||||
par_depth * 2 // 3
|
||||
) # 2 / 3 attention layer cutoff suggested by PAR paper
|
||||
par_width = (depth_cut + depth_cut // par_attn) // par_attn
|
||||
assert len(default_block) <= par_width, 'default block is too large for par_ratio'
|
||||
par_block = default_block + ('f',) * (par_width - len(default_block))
|
||||
assert (
|
||||
len(default_block) <= par_width
|
||||
), 'default block is too large for par_ratio'
|
||||
par_block = default_block + ('f',) * (
|
||||
par_width - len(default_block)
|
||||
)
|
||||
par_head = par_block * par_attn
|
||||
layer_types = par_head + ('f',) * (par_depth - len(par_head))
|
||||
elif exists(sandwich_coef):
|
||||
assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth'
|
||||
layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef
|
||||
assert (
|
||||
sandwich_coef > 0 and sandwich_coef <= depth
|
||||
), 'sandwich coefficient should be less than the depth'
|
||||
layer_types = (
|
||||
('a',) * sandwich_coef
|
||||
+ default_block * (depth - sandwich_coef)
|
||||
+ ('f',) * sandwich_coef
|
||||
)
|
||||
else:
|
||||
layer_types = default_block * depth
|
||||
|
||||
@@ -455,7 +515,9 @@ class AttentionLayers(nn.Module):
|
||||
|
||||
for layer_type in self.layer_types:
|
||||
if layer_type == 'a':
|
||||
layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs)
|
||||
layer = Attention(
|
||||
dim, heads=heads, causal=causal, **attn_kwargs
|
||||
)
|
||||
elif layer_type == 'c':
|
||||
layer = Attention(dim, heads=heads, **attn_kwargs)
|
||||
elif layer_type == 'f':
|
||||
@@ -472,20 +534,17 @@ class AttentionLayers(nn.Module):
|
||||
else:
|
||||
residual_fn = Residual()
|
||||
|
||||
self.layers.append(nn.ModuleList([
|
||||
norm_fn(),
|
||||
layer,
|
||||
residual_fn
|
||||
]))
|
||||
self.layers.append(nn.ModuleList([norm_fn(), layer, residual_fn]))
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x,
|
||||
context=None,
|
||||
mask=None,
|
||||
context_mask=None,
|
||||
mems=None,
|
||||
return_hiddens=False
|
||||
self,
|
||||
x,
|
||||
context=None,
|
||||
mask=None,
|
||||
context_mask=None,
|
||||
mems=None,
|
||||
return_hiddens=False,
|
||||
**kwargs,
|
||||
):
|
||||
hiddens = []
|
||||
intermediates = []
|
||||
@@ -494,7 +553,9 @@ class AttentionLayers(nn.Module):
|
||||
|
||||
mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers
|
||||
|
||||
for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)):
|
||||
for ind, (layer_type, (norm, block, residual_fn)) in enumerate(
|
||||
zip(self.layer_types, self.layers)
|
||||
):
|
||||
is_last = ind == (len(self.layers) - 1)
|
||||
|
||||
if layer_type == 'a':
|
||||
@@ -507,10 +568,22 @@ class AttentionLayers(nn.Module):
|
||||
x = norm(x)
|
||||
|
||||
if layer_type == 'a':
|
||||
out, inter = block(x, mask=mask, sinusoidal_emb=self.pia_pos_emb, rel_pos=self.rel_pos,
|
||||
prev_attn=prev_attn, mem=layer_mem)
|
||||
out, inter = block(
|
||||
x,
|
||||
mask=mask,
|
||||
sinusoidal_emb=self.pia_pos_emb,
|
||||
rel_pos=self.rel_pos,
|
||||
prev_attn=prev_attn,
|
||||
mem=layer_mem,
|
||||
)
|
||||
elif layer_type == 'c':
|
||||
out, inter = block(x, context=context, mask=mask, context_mask=context_mask, prev_attn=prev_cross_attn)
|
||||
out, inter = block(
|
||||
x,
|
||||
context=context,
|
||||
mask=mask,
|
||||
context_mask=context_mask,
|
||||
prev_attn=prev_cross_attn,
|
||||
)
|
||||
elif layer_type == 'f':
|
||||
out = block(x)
|
||||
|
||||
@@ -529,8 +602,7 @@ class AttentionLayers(nn.Module):
|
||||
|
||||
if return_hiddens:
|
||||
intermediates = LayerIntermediates(
|
||||
hiddens=hiddens,
|
||||
attn_intermediates=intermediates
|
||||
hiddens=hiddens, attn_intermediates=intermediates
|
||||
)
|
||||
|
||||
return x, intermediates
|
||||
@@ -544,23 +616,24 @@ class Encoder(AttentionLayers):
|
||||
super().__init__(causal=False, **kwargs)
|
||||
|
||||
|
||||
|
||||
class TransformerWrapper(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
num_tokens,
|
||||
max_seq_len,
|
||||
attn_layers,
|
||||
emb_dim=None,
|
||||
max_mem_len=0.,
|
||||
emb_dropout=0.,
|
||||
num_memory_tokens=None,
|
||||
tie_embedding=False,
|
||||
use_pos_emb=True
|
||||
self,
|
||||
*,
|
||||
num_tokens,
|
||||
max_seq_len,
|
||||
attn_layers,
|
||||
emb_dim=None,
|
||||
max_mem_len=0.0,
|
||||
emb_dropout=0.0,
|
||||
num_memory_tokens=None,
|
||||
tie_embedding=False,
|
||||
use_pos_emb=True,
|
||||
):
|
||||
super().__init__()
|
||||
assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder'
|
||||
assert isinstance(
|
||||
attn_layers, AttentionLayers
|
||||
), 'attention layers must be one of Encoder or Decoder'
|
||||
|
||||
dim = attn_layers.dim
|
||||
emb_dim = default(emb_dim, dim)
|
||||
@@ -570,23 +643,34 @@ class TransformerWrapper(nn.Module):
|
||||
self.num_tokens = num_tokens
|
||||
|
||||
self.token_emb = nn.Embedding(num_tokens, emb_dim)
|
||||
self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if (
|
||||
use_pos_emb and not attn_layers.has_pos_emb) else always(0)
|
||||
self.pos_emb = (
|
||||
AbsolutePositionalEmbedding(emb_dim, max_seq_len)
|
||||
if (use_pos_emb and not attn_layers.has_pos_emb)
|
||||
else always(0)
|
||||
)
|
||||
self.emb_dropout = nn.Dropout(emb_dropout)
|
||||
|
||||
self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
|
||||
self.project_emb = (
|
||||
nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
|
||||
)
|
||||
self.attn_layers = attn_layers
|
||||
self.norm = nn.LayerNorm(dim)
|
||||
|
||||
self.init_()
|
||||
|
||||
self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t()
|
||||
self.to_logits = (
|
||||
nn.Linear(dim, num_tokens)
|
||||
if not tie_embedding
|
||||
else lambda t: t @ self.token_emb.weight.t()
|
||||
)
|
||||
|
||||
# memory tokens (like [cls]) from Memory Transformers paper
|
||||
num_memory_tokens = default(num_memory_tokens, 0)
|
||||
self.num_memory_tokens = num_memory_tokens
|
||||
if num_memory_tokens > 0:
|
||||
self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim))
|
||||
self.memory_tokens = nn.Parameter(
|
||||
torch.randn(num_memory_tokens, dim)
|
||||
)
|
||||
|
||||
# let funnel encoder know number of memory tokens, if specified
|
||||
if hasattr(attn_layers, 'num_memory_tokens'):
|
||||
@@ -596,18 +680,26 @@ class TransformerWrapper(nn.Module):
|
||||
nn.init.normal_(self.token_emb.weight, std=0.02)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x,
|
||||
return_embeddings=False,
|
||||
mask=None,
|
||||
return_mems=False,
|
||||
return_attn=False,
|
||||
mems=None,
|
||||
**kwargs
|
||||
self,
|
||||
x,
|
||||
return_embeddings=False,
|
||||
mask=None,
|
||||
return_mems=False,
|
||||
return_attn=False,
|
||||
mems=None,
|
||||
embedding_manager=None,
|
||||
**kwargs,
|
||||
):
|
||||
b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens
|
||||
x = self.token_emb(x)
|
||||
x += self.pos_emb(x)
|
||||
|
||||
embedded_x = self.token_emb(x)
|
||||
|
||||
if embedding_manager:
|
||||
x = embedding_manager(x, embedded_x)
|
||||
else:
|
||||
x = embedded_x
|
||||
|
||||
x = x + self.pos_emb(x)
|
||||
x = self.emb_dropout(x)
|
||||
|
||||
x = self.project_emb(x)
|
||||
@@ -620,7 +712,9 @@ class TransformerWrapper(nn.Module):
|
||||
if exists(mask):
|
||||
mask = F.pad(mask, (num_mem, 0), value=True)
|
||||
|
||||
x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs)
|
||||
x, intermediates = self.attn_layers(
|
||||
x, mask=mask, mems=mems, return_hiddens=True, **kwargs
|
||||
)
|
||||
x = self.norm(x)
|
||||
|
||||
mem, x = x[:, :num_mem], x[:, num_mem:]
|
||||
@@ -629,13 +723,30 @@ class TransformerWrapper(nn.Module):
|
||||
|
||||
if return_mems:
|
||||
hiddens = intermediates.hiddens
|
||||
new_mems = list(map(lambda pair: torch.cat(pair, dim=-2), zip(mems, hiddens))) if exists(mems) else hiddens
|
||||
new_mems = list(map(lambda t: t[..., -self.max_mem_len:, :].detach(), new_mems))
|
||||
new_mems = (
|
||||
list(
|
||||
map(
|
||||
lambda pair: torch.cat(pair, dim=-2),
|
||||
zip(mems, hiddens),
|
||||
)
|
||||
)
|
||||
if exists(mems)
|
||||
else hiddens
|
||||
)
|
||||
new_mems = list(
|
||||
map(
|
||||
lambda t: t[..., -self.max_mem_len :, :].detach(), new_mems
|
||||
)
|
||||
)
|
||||
return out, new_mems
|
||||
|
||||
if return_attn:
|
||||
attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates))
|
||||
attn_maps = list(
|
||||
map(
|
||||
lambda t: t.post_softmax_attn,
|
||||
intermediates.attn_intermediates,
|
||||
)
|
||||
)
|
||||
return out, attn_maps
|
||||
|
||||
return out
|
||||
|
||||
|
||||
1190
ldm/simplet2i.py
61
ldm/util.py
@@ -12,22 +12,26 @@ from queue import Queue
|
||||
|
||||
from inspect import isfunction
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
|
||||
def log_txt_as_img(wh, xc, size=10):
|
||||
# wh a tuple of (width, height)
|
||||
# xc a list of captions to plot
|
||||
b = len(xc)
|
||||
txts = list()
|
||||
for bi in range(b):
|
||||
txt = Image.new("RGB", wh, color="white")
|
||||
txt = Image.new('RGB', wh, color='white')
|
||||
draw = ImageDraw.Draw(txt)
|
||||
font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
|
||||
font = ImageFont.load_default()
|
||||
nc = int(40 * (wh[0] / 256))
|
||||
lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
|
||||
lines = '\n'.join(
|
||||
xc[bi][start : start + nc] for start in range(0, len(xc[bi]), nc)
|
||||
)
|
||||
|
||||
try:
|
||||
draw.text((0, 0), lines, fill="black", font=font)
|
||||
draw.text((0, 0), lines, fill='black', font=font)
|
||||
except UnicodeEncodeError:
|
||||
print("Cant encode string for logging. Skipping.")
|
||||
print('Cant encode string for logging. Skipping.')
|
||||
|
||||
txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
|
||||
txts.append(txt)
|
||||
@@ -69,22 +73,26 @@ def mean_flat(tensor):
|
||||
def count_params(model, verbose=False):
|
||||
total_params = sum(p.numel() for p in model.parameters())
|
||||
if verbose:
|
||||
print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
|
||||
print(
|
||||
f'{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.'
|
||||
)
|
||||
return total_params
|
||||
|
||||
|
||||
def instantiate_from_config(config):
|
||||
if not "target" in config:
|
||||
def instantiate_from_config(config, **kwargs):
|
||||
if not 'target' in config:
|
||||
if config == '__is_first_stage__':
|
||||
return None
|
||||
elif config == "__is_unconditional__":
|
||||
elif config == '__is_unconditional__':
|
||||
return None
|
||||
raise KeyError("Expected key `target` to instantiate.")
|
||||
return get_obj_from_str(config["target"])(**config.get("params", dict()))
|
||||
raise KeyError('Expected key `target` to instantiate.')
|
||||
return get_obj_from_str(config['target'])(
|
||||
**config.get('params', dict()), **kwargs
|
||||
)
|
||||
|
||||
|
||||
def get_obj_from_str(string, reload=False):
|
||||
module, cls = string.rsplit(".", 1)
|
||||
module, cls = string.rsplit('.', 1)
|
||||
if reload:
|
||||
module_imp = importlib.import_module(module)
|
||||
importlib.reload(module_imp)
|
||||
@@ -100,31 +108,36 @@ def _do_parallel_data_prefetch(func, Q, data, idx, idx_to_fn=False):
|
||||
else:
|
||||
res = func(data)
|
||||
Q.put([idx, res])
|
||||
Q.put("Done")
|
||||
Q.put('Done')
|
||||
|
||||
|
||||
def parallel_data_prefetch(
|
||||
func: callable, data, n_proc, target_data_type="ndarray", cpu_intensive=True, use_worker_id=False
|
||||
func: callable,
|
||||
data,
|
||||
n_proc,
|
||||
target_data_type='ndarray',
|
||||
cpu_intensive=True,
|
||||
use_worker_id=False,
|
||||
):
|
||||
# if target_data_type not in ["ndarray", "list"]:
|
||||
# raise ValueError(
|
||||
# "Data, which is passed to parallel_data_prefetch has to be either of type list or ndarray."
|
||||
# )
|
||||
if isinstance(data, np.ndarray) and target_data_type == "list":
|
||||
raise ValueError("list expected but function got ndarray.")
|
||||
if isinstance(data, np.ndarray) and target_data_type == 'list':
|
||||
raise ValueError('list expected but function got ndarray.')
|
||||
elif isinstance(data, abc.Iterable):
|
||||
if isinstance(data, dict):
|
||||
print(
|
||||
f'WARNING:"data" argument passed to parallel_data_prefetch is a dict: Using only its values and disregarding keys.'
|
||||
)
|
||||
data = list(data.values())
|
||||
if target_data_type == "ndarray":
|
||||
if target_data_type == 'ndarray':
|
||||
data = np.asarray(data)
|
||||
else:
|
||||
data = list(data)
|
||||
else:
|
||||
raise TypeError(
|
||||
f"The data, that shall be processed parallel has to be either an np.ndarray or an Iterable, but is actually {type(data)}."
|
||||
f'The data, that shall be processed parallel has to be either an np.ndarray or an Iterable, but is actually {type(data)}.'
|
||||
)
|
||||
|
||||
if cpu_intensive:
|
||||
@@ -134,7 +147,7 @@ def parallel_data_prefetch(
|
||||
Q = Queue(1000)
|
||||
proc = Thread
|
||||
# spawn processes
|
||||
if target_data_type == "ndarray":
|
||||
if target_data_type == 'ndarray':
|
||||
arguments = [
|
||||
[func, Q, part, i, use_worker_id]
|
||||
for i, part in enumerate(np.array_split(data, n_proc))
|
||||
@@ -148,7 +161,7 @@ def parallel_data_prefetch(
|
||||
arguments = [
|
||||
[func, Q, part, i, use_worker_id]
|
||||
for i, part in enumerate(
|
||||
[data[i: i + step] for i in range(0, len(data), step)]
|
||||
[data[i : i + step] for i in range(0, len(data), step)]
|
||||
)
|
||||
]
|
||||
processes = []
|
||||
@@ -157,7 +170,7 @@ def parallel_data_prefetch(
|
||||
processes += [p]
|
||||
|
||||
# start processes
|
||||
print(f"Start prefetching...")
|
||||
print(f'Start prefetching...')
|
||||
import time
|
||||
|
||||
start = time.time()
|
||||
@@ -170,13 +183,13 @@ def parallel_data_prefetch(
|
||||
while k < n_proc:
|
||||
# get result
|
||||
res = Q.get()
|
||||
if res == "Done":
|
||||
if res == 'Done':
|
||||
k += 1
|
||||
else:
|
||||
gather_res[res[0]] = res[1]
|
||||
|
||||
except Exception as e:
|
||||
print("Exception: ", e)
|
||||
print('Exception: ', e)
|
||||
for p in processes:
|
||||
p.terminate()
|
||||
|
||||
@@ -184,7 +197,7 @@ def parallel_data_prefetch(
|
||||
finally:
|
||||
for p in processes:
|
||||
p.join()
|
||||
print(f"Prefetching complete. [{time.time() - start} sec.]")
|
||||
print(f'Prefetching complete. [{time.time() - start} sec.]')
|
||||
|
||||
if target_data_type == 'ndarray':
|
||||
if not isinstance(gather_res[0], np.ndarray):
|
||||
|
||||
@@ -14,7 +14,7 @@ from ldm.models.diffusion.ddim import DDIMSampler
|
||||
from ldm.util import ismap
|
||||
import time
|
||||
from omegaconf import OmegaConf
|
||||
|
||||
from ldm.dream.devices import choose_torch_device
|
||||
|
||||
def download_models(mode):
|
||||
|
||||
@@ -117,7 +117,8 @@ def get_cond(mode, selected_path):
|
||||
c = rearrange(c, '1 c h w -> 1 h w c')
|
||||
c = 2. * c - 1.
|
||||
|
||||
c = c.to(torch.device("cuda"))
|
||||
device = choose_torch_device()
|
||||
c = c.to(device)
|
||||
example["LR_image"] = c
|
||||
example["image"] = c_up
|
||||
|
||||
@@ -267,4 +268,4 @@ def make_convolutional_sample(batch, model, mode="vanilla", custom_steps=None, e
|
||||
log["sample"] = x_sample
|
||||
log["time"] = t1 - t0
|
||||
|
||||
return log
|
||||
return log
|
||||
|
||||
23
requirements.txt
Normal file
@@ -0,0 +1,23 @@
|
||||
albumentations==0.4.3
|
||||
einops==0.3.0
|
||||
huggingface-hub==0.8.1
|
||||
imageio==2.9.0
|
||||
imageio-ffmpeg==0.4.2
|
||||
kornia==0.6.0
|
||||
numpy==1.23.1
|
||||
--pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
||||
omegaconf==2.1.1
|
||||
opencv-python==4.6.0.66
|
||||
pillow==9.2.0
|
||||
pudb==2019.2
|
||||
torch==1.12.1
|
||||
torchvision==0.12.0
|
||||
pytorch-lightning==1.4.2
|
||||
streamlit==1.12.0
|
||||
test-tube>=0.7.5
|
||||
torch-fidelity==0.3.0
|
||||
torchmetrics==0.6.0
|
||||
transformers==4.19.2
|
||||
-e git+https://github.com/openai/CLIP.git@main#egg=clip
|
||||
-e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
|
||||
-e git+https://github.com/Birch-san/k-diffusion.git@mps#egg=k-diffusion
|
||||
837
scripts/dream.py
@@ -1,376 +1,633 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
|
||||
|
||||
import argparse
|
||||
import shlex
|
||||
import atexit
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from PIL import Image,PngImagePlugin
|
||||
|
||||
# readline unavailable on windows systems
|
||||
try:
|
||||
import readline
|
||||
readline_available = True
|
||||
except:
|
||||
readline_available = False
|
||||
|
||||
debugging = False
|
||||
import copy
|
||||
import warnings
|
||||
import time
|
||||
import ldm.dream.readline
|
||||
from ldm.dream.pngwriter import PngWriter, PromptFormatter
|
||||
from ldm.dream.server import DreamServer, ThreadingDreamServer
|
||||
from ldm.dream.image_util import make_grid
|
||||
from omegaconf import OmegaConf
|
||||
|
||||
def main():
|
||||
''' Initialize command-line parsers and the diffusion model '''
|
||||
"""Initialize command-line parsers and the diffusion model"""
|
||||
arg_parser = create_argv_parser()
|
||||
opt = arg_parser.parse_args()
|
||||
opt = arg_parser.parse_args()
|
||||
|
||||
if opt.laion400m:
|
||||
# defaults suitable to the older latent diffusion weights
|
||||
width = 256
|
||||
height = 256
|
||||
config = "configs/latent-diffusion/txt2img-1p4B-eval.yaml"
|
||||
weights = "models/ldm/text2img-large/model.ckpt"
|
||||
else:
|
||||
# some defaults suitable for stable diffusion weights
|
||||
width = 512
|
||||
height = 512
|
||||
config = "configs/stable-diffusion/v1-inference.yaml"
|
||||
weights = "models/ldm/stable-diffusion-v1/model.ckpt"
|
||||
print('--laion400m flag has been deprecated. Please use --model laion400m instead.')
|
||||
sys.exit(-1)
|
||||
if opt.weights != 'model':
|
||||
print('--weights argument has been deprecated. Please configure ./configs/models.yaml, and call it using --model instead.')
|
||||
sys.exit(-1)
|
||||
|
||||
try:
|
||||
models = OmegaConf.load(opt.config)
|
||||
width = models[opt.model].width
|
||||
height = models[opt.model].height
|
||||
config = models[opt.model].config
|
||||
weights = models[opt.model].weights
|
||||
except (FileNotFoundError, IOError, KeyError) as e:
|
||||
print(f'{e}. Aborting.')
|
||||
sys.exit(-1)
|
||||
|
||||
# command line history will be stored in a file called "~/.dream_history"
|
||||
if readline_available:
|
||||
setup_readline()
|
||||
|
||||
print("* Initializing, be patient...\n")
|
||||
print('* Initializing, be patient...\n')
|
||||
sys.path.append('.')
|
||||
from pytorch_lightning import logging
|
||||
from ldm.simplet2i import T2I
|
||||
|
||||
# these two lines prevent a horrible warning message from appearing
|
||||
# when the frozen CLIP tokenizer is imported
|
||||
import transformers
|
||||
|
||||
transformers.logging.set_verbosity_error()
|
||||
|
||||
|
||||
# creating a simple text2image object with a handful of
|
||||
# defaults passed on the command line.
|
||||
# additional parameters will be added (or overriden) during
|
||||
# the user input loop
|
||||
t2i = T2I(width=width,
|
||||
height=height,
|
||||
batch_size=opt.batch_size,
|
||||
outdir=opt.outdir,
|
||||
sampler_name=opt.sampler_name,
|
||||
weights=weights,
|
||||
full_precision=opt.full_precision,
|
||||
config=config,
|
||||
latent_diffusion_weights=opt.laion400m # this is solely for recreating the prompt
|
||||
t2i = T2I(
|
||||
width=width,
|
||||
height=height,
|
||||
sampler_name=opt.sampler_name,
|
||||
weights=weights,
|
||||
full_precision=opt.full_precision,
|
||||
config=config,
|
||||
grid = opt.grid,
|
||||
# this is solely for recreating the prompt
|
||||
latent_diffusion_weights=opt.laion400m,
|
||||
embedding_path=opt.embedding_path,
|
||||
device_type=opt.device
|
||||
)
|
||||
|
||||
# make sure the output directory exists
|
||||
if not os.path.exists(opt.outdir):
|
||||
os.makedirs(opt.outdir)
|
||||
|
||||
# gets rid of annoying messages about random seed
|
||||
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
|
||||
|
||||
# gets rid of annoying messages about random seed
|
||||
logging.getLogger('pytorch_lightning').setLevel(logging.ERROR)
|
||||
|
||||
# load the infile as a list of lines
|
||||
infile = None
|
||||
try:
|
||||
if opt.infile is not None:
|
||||
infile = open(opt.infile,'r')
|
||||
except FileNotFoundError as e:
|
||||
print(e)
|
||||
exit(-1)
|
||||
if opt.infile:
|
||||
try:
|
||||
if os.path.isfile(opt.infile):
|
||||
infile = open(opt.infile, 'r', encoding='utf-8')
|
||||
elif opt.infile == '-': # stdin
|
||||
infile = sys.stdin
|
||||
else:
|
||||
raise FileNotFoundError(f'{opt.infile} not found.')
|
||||
except (FileNotFoundError, IOError) as e:
|
||||
print(f'{e}. Aborting.')
|
||||
sys.exit(-1)
|
||||
|
||||
# preload the model
|
||||
if not debugging:
|
||||
t2i.load_model()
|
||||
print("\n* Initialization done! Awaiting your command (-h for help, 'q' to quit, 'cd' to change output dir, 'pwd' to print output dir)...")
|
||||
tic = time.time()
|
||||
t2i.load_model()
|
||||
print(
|
||||
f'>> model loaded in', '%4.2fs' % (time.time() - tic)
|
||||
)
|
||||
|
||||
log_path = os.path.join(opt.outdir,'dream_log.txt')
|
||||
with open(log_path,'a') as log:
|
||||
cmd_parser = create_cmd_parser()
|
||||
main_loop(t2i,cmd_parser,log,infile)
|
||||
log.close()
|
||||
if infile:
|
||||
infile.close()
|
||||
if not infile:
|
||||
print(
|
||||
"\n* Initialization done! Awaiting your command (-h for help, 'q' to quit)"
|
||||
)
|
||||
|
||||
cmd_parser = create_cmd_parser()
|
||||
if opt.web:
|
||||
dream_server_loop(t2i, opt.host, opt.port)
|
||||
else:
|
||||
main_loop(t2i, opt.outdir, opt.prompt_as_dir, cmd_parser, infile)
|
||||
|
||||
|
||||
def main_loop(t2i,parser,log,infile):
|
||||
''' prompt/read/execute loop '''
|
||||
def main_loop(t2i, outdir, prompt_as_dir, parser, infile):
|
||||
"""prompt/read/execute loop"""
|
||||
done = False
|
||||
|
||||
last_seeds = []
|
||||
path_filter = re.compile(r'[<>:"/\\|?*]')
|
||||
|
||||
# os.pathconf is not available on Windows
|
||||
if hasattr(os, 'pathconf'):
|
||||
path_max = os.pathconf(outdir, 'PC_PATH_MAX')
|
||||
name_max = os.pathconf(outdir, 'PC_NAME_MAX')
|
||||
else:
|
||||
path_max = 260
|
||||
name_max = 255
|
||||
|
||||
while not done:
|
||||
try:
|
||||
command = infile.readline() if infile else input("dream> ")
|
||||
command = get_next_command(infile)
|
||||
except EOFError:
|
||||
done = True
|
||||
break
|
||||
|
||||
if infile and len(command)==0:
|
||||
done = True
|
||||
break
|
||||
|
||||
if command.startswith(('#','//')):
|
||||
# skip empty lines
|
||||
if not command.strip():
|
||||
continue
|
||||
|
||||
if command.startswith(('#', '//')):
|
||||
continue
|
||||
|
||||
# before splitting, escape single quotes so as not to mess
|
||||
# up the parser
|
||||
command = command.replace("'", "\\'")
|
||||
|
||||
try:
|
||||
elements = shlex.split(command)
|
||||
except ValueError as e:
|
||||
print(str(e))
|
||||
continue
|
||||
|
||||
if len(elements)==0:
|
||||
continue
|
||||
|
||||
if elements[0]=='q':
|
||||
if elements[0] == 'q':
|
||||
done = True
|
||||
break
|
||||
|
||||
if elements[0]=='cd' and len(elements)>1:
|
||||
if os.path.exists(elements[1]):
|
||||
print(f"setting image output directory to {elements[1]}")
|
||||
t2i.outdir=elements[1]
|
||||
else:
|
||||
print(f"directory {elements[1]} does not exist")
|
||||
continue
|
||||
|
||||
if elements[0]=='pwd':
|
||||
print(f"current output directory is {t2i.outdir}")
|
||||
continue
|
||||
|
||||
if elements[0].startswith('!dream'): # in case a stored prompt still contains the !dream command
|
||||
if elements[0].startswith(
|
||||
'!dream'
|
||||
): # in case a stored prompt still contains the !dream command
|
||||
elements.pop(0)
|
||||
|
||||
|
||||
# rearrange the arguments to mimic how it works in the Dream bot.
|
||||
switches = ['']
|
||||
switches_started = False
|
||||
|
||||
for el in elements:
|
||||
if el[0]=='-' and not switches_started:
|
||||
if el[0] == '-' and not switches_started:
|
||||
switches_started = True
|
||||
if switches_started:
|
||||
switches.append(el)
|
||||
else:
|
||||
switches[0] += el
|
||||
switches[0] += ' '
|
||||
switches[0] = switches[0][:len(switches[0])-1]
|
||||
switches[0] = switches[0][: len(switches[0]) - 1]
|
||||
|
||||
try:
|
||||
opt = parser.parse_args(switches)
|
||||
opt = parser.parse_args(switches)
|
||||
except SystemExit:
|
||||
parser.print_help()
|
||||
continue
|
||||
if len(opt.prompt)==0:
|
||||
print("Try again with a prompt!")
|
||||
if len(opt.prompt) == 0:
|
||||
print('Try again with a prompt!')
|
||||
continue
|
||||
if opt.seed is not None and opt.seed < 0: # retrieve previous value!
|
||||
try:
|
||||
opt.seed = last_seeds[opt.seed]
|
||||
print(f'reusing previous seed {opt.seed}')
|
||||
except IndexError:
|
||||
print(f'No previous seed at position {opt.seed} found')
|
||||
opt.seed = None
|
||||
|
||||
do_grid = opt.grid or t2i.grid
|
||||
|
||||
if opt.with_variations is not None:
|
||||
# shotgun parsing, woo
|
||||
parts = []
|
||||
broken = False # python doesn't have labeled loops...
|
||||
for part in opt.with_variations.split(','):
|
||||
seed_and_weight = part.split(':')
|
||||
if len(seed_and_weight) != 2:
|
||||
print(f'could not parse with_variation part "{part}"')
|
||||
broken = True
|
||||
break
|
||||
try:
|
||||
seed = int(seed_and_weight[0])
|
||||
weight = float(seed_and_weight[1])
|
||||
except ValueError:
|
||||
print(f'could not parse with_variation part "{part}"')
|
||||
broken = True
|
||||
break
|
||||
parts.append([seed, weight])
|
||||
if broken:
|
||||
continue
|
||||
if len(parts) > 0:
|
||||
opt.with_variations = parts
|
||||
else:
|
||||
opt.with_variations = None
|
||||
|
||||
if opt.outdir:
|
||||
if not os.path.exists(opt.outdir):
|
||||
os.makedirs(opt.outdir)
|
||||
current_outdir = opt.outdir
|
||||
elif prompt_as_dir:
|
||||
# sanitize the prompt to a valid folder name
|
||||
subdir = path_filter.sub('_', opt.prompt)[:name_max].rstrip(' .')
|
||||
|
||||
# truncate path to maximum allowed length
|
||||
# 27 is the length of '######.##########.##.png', plus two separators and a NUL
|
||||
subdir = subdir[:(path_max - 27 - len(os.path.abspath(outdir)))]
|
||||
current_outdir = os.path.join(outdir, subdir)
|
||||
|
||||
print ('Writing files to directory: "' + current_outdir + '"')
|
||||
|
||||
# make sure the output directory exists
|
||||
if not os.path.exists(current_outdir):
|
||||
os.makedirs(current_outdir)
|
||||
else:
|
||||
current_outdir = outdir
|
||||
|
||||
# Here is where the images are actually generated!
|
||||
try:
|
||||
file_writer = PngWriter(current_outdir)
|
||||
prefix = file_writer.unique_prefix()
|
||||
seeds = set()
|
||||
results = [] # list of filename, prompt pairs
|
||||
grid_images = dict() # seed -> Image, only used if `do_grid`
|
||||
def image_writer(image, seed, upscaled=False):
|
||||
if do_grid:
|
||||
grid_images[seed] = image
|
||||
else:
|
||||
if upscaled and opt.save_original:
|
||||
filename = f'{prefix}.{seed}.postprocessed.png'
|
||||
else:
|
||||
filename = f'{prefix}.{seed}.png'
|
||||
if opt.variation_amount > 0:
|
||||
iter_opt = argparse.Namespace(**vars(opt)) # copy
|
||||
this_variation = [[seed, opt.variation_amount]]
|
||||
if opt.with_variations is None:
|
||||
iter_opt.with_variations = this_variation
|
||||
else:
|
||||
iter_opt.with_variations = opt.with_variations + this_variation
|
||||
iter_opt.variation_amount = 0
|
||||
normalized_prompt = PromptFormatter(t2i, iter_opt).normalize_prompt()
|
||||
metadata_prompt = f'{normalized_prompt} -S{iter_opt.seed}'
|
||||
elif opt.with_variations is not None:
|
||||
normalized_prompt = PromptFormatter(t2i, opt).normalize_prompt()
|
||||
metadata_prompt = f'{normalized_prompt} -S{opt.seed}' # use the original seed - the per-iteration value is the last variation-seed
|
||||
else:
|
||||
normalized_prompt = PromptFormatter(t2i, opt).normalize_prompt()
|
||||
metadata_prompt = f'{normalized_prompt} -S{seed}'
|
||||
path = file_writer.save_image_and_prompt_to_png(image, metadata_prompt, filename)
|
||||
if (not upscaled) or opt.save_original:
|
||||
# only append to results if we didn't overwrite an earlier output
|
||||
results.append([path, metadata_prompt])
|
||||
|
||||
seeds.add(seed)
|
||||
|
||||
t2i.prompt2image(image_callback=image_writer, **vars(opt))
|
||||
|
||||
if do_grid and len(grid_images) > 0:
|
||||
grid_img = make_grid(list(grid_images.values()))
|
||||
first_seed = next(iter(seeds))
|
||||
filename = f'{prefix}.{first_seed}.png'
|
||||
# TODO better metadata for grid images
|
||||
normalized_prompt = PromptFormatter(t2i, opt).normalize_prompt()
|
||||
metadata_prompt = f'{normalized_prompt} -S{first_seed} --grid -N{len(grid_images)}'
|
||||
path = file_writer.save_image_and_prompt_to_png(
|
||||
grid_img, metadata_prompt, filename
|
||||
)
|
||||
results = [[path, metadata_prompt]]
|
||||
|
||||
last_seeds = list(seeds)
|
||||
|
||||
except AssertionError as e:
|
||||
print(e)
|
||||
continue
|
||||
|
||||
if opt.init_img is None:
|
||||
results = t2i.txt2img(**vars(opt))
|
||||
else:
|
||||
results = t2i.img2img(**vars(opt))
|
||||
print("Outputs:")
|
||||
write_log_message(t2i,opt,results,log)
|
||||
|
||||
except OSError as e:
|
||||
print(e)
|
||||
continue
|
||||
|
||||
print("goodbye!")
|
||||
print('Outputs:')
|
||||
log_path = os.path.join(current_outdir, 'dream_log.txt')
|
||||
write_log_message(results, log_path)
|
||||
|
||||
print('goodbye!')
|
||||
|
||||
|
||||
def write_log_message(t2i,opt,results,logfile):
|
||||
''' logs the name of the output image, its prompt and seed to the terminal, log file, and a Dream text chunk in the PNG metadata '''
|
||||
switches = _reconstruct_switches(t2i,opt)
|
||||
prompt_str = ' '.join(switches)
|
||||
|
||||
# when multiple images are produced in batch, then we keep track of where each starts
|
||||
last_seed = None
|
||||
img_num = 1
|
||||
batch_size = opt.batch_size or t2i.batch_size
|
||||
seenit = {}
|
||||
|
||||
seeds = [a[1] for a in results]
|
||||
if batch_size > 1:
|
||||
seeds = f"(seeds for each batch row: {seeds})"
|
||||
def get_next_command(infile=None) -> str: #command string
|
||||
if infile is None:
|
||||
command = input('dream> ')
|
||||
else:
|
||||
seeds = f"(seeds for individual images: {seeds})"
|
||||
command = infile.readline()
|
||||
if not command:
|
||||
raise EOFError
|
||||
else:
|
||||
command = command.strip()
|
||||
print(f'#{command}')
|
||||
return command
|
||||
|
||||
for r in results:
|
||||
seed = r[1]
|
||||
log_message = (f'{r[0]}: {prompt_str} -S{seed}')
|
||||
def dream_server_loop(t2i, host, port):
|
||||
print('\n* --web was specified, starting web server...')
|
||||
# Change working directory to the stable-diffusion directory
|
||||
os.chdir(
|
||||
os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
||||
)
|
||||
|
||||
if batch_size > 1:
|
||||
if seed != last_seed:
|
||||
img_num = 1
|
||||
log_message += f' # (batch image {img_num} of {batch_size})'
|
||||
else:
|
||||
img_num += 1
|
||||
log_message += f' # (batch image {img_num} of {batch_size})'
|
||||
last_seed = seed
|
||||
print(log_message)
|
||||
logfile.write(log_message+"\n")
|
||||
logfile.flush()
|
||||
if r[0] not in seenit:
|
||||
seenit[r[0]] = True
|
||||
try:
|
||||
if opt.grid:
|
||||
_write_prompt_to_png(r[0],f'{prompt_str} -g -S{seed} {seeds}')
|
||||
else:
|
||||
_write_prompt_to_png(r[0],f'{prompt_str} -S{seed}')
|
||||
except FileNotFoundError:
|
||||
print(f"Could not open file '{r[0]}' for reading")
|
||||
# Start server
|
||||
DreamServer.model = t2i
|
||||
dream_server = ThreadingDreamServer((host, port))
|
||||
print(">> Started Stable Diffusion dream server!")
|
||||
if host == '0.0.0.0':
|
||||
print(f"Point your browser at http://localhost:{port} or use the host's DNS name or IP address.")
|
||||
else:
|
||||
print(">> Default host address now 127.0.0.1 (localhost). Use --host 0.0.0.0 to bind any address.")
|
||||
print(f">> Point your browser at http://{host}:{port}.")
|
||||
|
||||
def _reconstruct_switches(t2i,opt):
|
||||
'''Normalize the prompt and switches'''
|
||||
switches = list()
|
||||
switches.append(f'"{opt.prompt}"')
|
||||
switches.append(f'-s{opt.steps or t2i.steps}')
|
||||
switches.append(f'-b{opt.batch_size or t2i.batch_size}')
|
||||
switches.append(f'-W{opt.width or t2i.width}')
|
||||
switches.append(f'-H{opt.height or t2i.height}')
|
||||
switches.append(f'-C{opt.cfg_scale or t2i.cfg_scale}')
|
||||
if opt.init_img:
|
||||
switches.append(f'-I{opt.init_img}')
|
||||
if opt.strength and opt.init_img is not None:
|
||||
switches.append(f'-f{opt.strength or t2i.strength}')
|
||||
if t2i.full_precision:
|
||||
switches.append('-F')
|
||||
return switches
|
||||
try:
|
||||
dream_server.serve_forever()
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
|
||||
dream_server.server_close()
|
||||
|
||||
|
||||
def write_log_message(results, log_path):
|
||||
"""logs the name of the output image, prompt, and prompt args to the terminal and log file"""
|
||||
log_lines = [f'{path}: {prompt}\n' for path, prompt in results]
|
||||
print(*log_lines, sep='')
|
||||
|
||||
with open(log_path, 'a', encoding='utf-8') as file:
|
||||
file.writelines(log_lines)
|
||||
|
||||
|
||||
SAMPLER_CHOICES=[
|
||||
'ddim',
|
||||
'k_dpm_2_a',
|
||||
'k_dpm_2',
|
||||
'k_euler_a',
|
||||
'k_euler',
|
||||
'k_heun',
|
||||
'k_lms',
|
||||
'plms',
|
||||
]
|
||||
|
||||
def _write_prompt_to_png(path,prompt):
|
||||
info = PngImagePlugin.PngInfo()
|
||||
info.add_text("Dream",prompt)
|
||||
im = Image.open(path)
|
||||
im.save(path,"PNG",pnginfo=info)
|
||||
|
||||
def create_argv_parser():
    """Build and return the argparse parser for the script's launch-time arguments.

    The earlier parser that was built and then immediately replaced by a
    second ArgumentParser has been removed: none of its arguments ever
    reached the returned parser.
    """
    parser = argparse.ArgumentParser(
        description="""Generate images using Stable Diffusion.
        Use --web to launch the web interface.
        Use --from_file to load prompts from a file path or standard input ("-").
        Otherwise you will be dropped into an interactive command prompt (type -h for help.)
        Other command-line arguments are defaults that can usually be overridden
        prompt the command prompt.
        """
    )
    parser.add_argument(
        '--laion400m',
        '--latent_diffusion',
        '-l',
        dest='laion400m',
        action='store_true',
        help='Fallback to the latent diffusion (laion400m) weights and config',
    )
    parser.add_argument(
        '--from_file',
        dest='infile',
        type=str,
        help='If specified, load prompts from this file',
    )
    parser.add_argument(
        '-n',
        '--iterations',
        type=int,
        default=1,
        help='Number of images to generate',
    )
    parser.add_argument(
        '-F',
        '--full_precision',
        dest='full_precision',
        action='store_true',
        help='Use more memory-intensive full precision math for calculations',
    )
    parser.add_argument(
        '-g',
        '--grid',
        action='store_true',
        help='Generate a grid instead of individual images',
    )
    parser.add_argument(
        '-A',
        '-m',
        '--sampler',
        dest='sampler_name',
        choices=SAMPLER_CHOICES,
        metavar='SAMPLER_NAME',
        default='k_lms',
        help=f'Set the initial sampler. Default: k_lms. Supported samplers: {", ".join(SAMPLER_CHOICES)}',
    )
    parser.add_argument(
        '--outdir',
        '-o',
        type=str,
        default='outputs/img-samples',
        help='Directory to save generated images and a log of prompts and seeds. Default: outputs/img-samples',
    )
    parser.add_argument(
        '--embedding_path',
        type=str,
        help='Path to a pre-trained embedding manager checkpoint - can only be set on command line',
    )
    parser.add_argument(
        '--prompt_as_dir',
        '-p',
        action='store_true',
        help='Place images in subdirectories named after the prompt.',
    )
    # GFPGAN related args
    parser.add_argument(
        '--gfpgan_bg_upsampler',
        type=str,
        default='realesrgan',
        help='Background upsampler. Default: realesrgan. Options: realesrgan, none. Only used if --gfpgan is specified',
    )
    parser.add_argument(
        '--gfpgan_bg_tile',
        type=int,
        default=400,
        help='Tile size for background sampler, 0 for no tile during testing. Default: 400.',
    )
    parser.add_argument(
        '--gfpgan_model_path',
        type=str,
        default='experiments/pretrained_models/GFPGANv1.3.pth',
        help='Indicates the path to the GFPGAN model, relative to --gfpgan_dir.',
    )
    parser.add_argument(
        '--gfpgan_dir',
        type=str,
        default='../GFPGAN',
        help='Indicates the directory containing the GFPGAN code.',
    )
    parser.add_argument(
        '--web',
        dest='web',
        action='store_true',
        help='Start in web server mode.',
    )
    parser.add_argument(
        '--host',
        type=str,
        default='127.0.0.1',
        help='Web server: Host or IP to listen on. Set to 0.0.0.0 to accept traffic from other devices on your network.'
    )
    parser.add_argument(
        '--port',
        type=int,
        # an int default matches type=int; argparse converted the former
        # string default '9090' to the same value
        default=9090,
        help='Web server: Port to listen on'
    )
    parser.add_argument(
        '--weights',
        default='model',
        help='Indicates the Stable Diffusion model to use.',
    )
    parser.add_argument(
        '--device',
        '-d',
        type=str,
        default='cuda',
        help="device to run stable diffusion on. defaults to cuda `torch.cuda.current_device()` if available"
    )
    parser.add_argument(
        '--model',
        default='stable-diffusion-1.4',
        help='Indicates which diffusion model to load. (currently "stable-diffusion-1.4" (default) or "laion400m")',
    )
    parser.add_argument(
        '--config',
        default='configs/models.yaml',
        help='Path to configuration file for alternate models.',
    )
    return parser
|
||||
|
||||
|
||||
|
||||
|
||||
def create_cmd_parser():
    """Build and return the argparse parser for one line typed at the dream> prompt.

    Every option string is registered exactly once. The previous body added
    -s, -S, -n, -W, -H, -C, -g, -i, -I, -f and -x twice to the same parser,
    which makes argparse raise ArgumentError (conflicting option string).
    The later, more detailed definition of each option is the one kept;
    -b/--batch_size had a single definition and is retained.
    """
    parser = argparse.ArgumentParser(
        description='Example: dream> a fantastic alien landscape -W1024 -H960 -s100 -n12'
    )
    parser.add_argument('prompt')
    parser.add_argument('-s', '--steps', type=int, help='Number of steps')
    parser.add_argument(
        '-S',
        '--seed',
        type=int,
        help='Image seed; a +ve integer, or use -1 for the previous seed, -2 for the one before that, etc',
    )
    parser.add_argument(
        '-n',
        '--iterations',
        type=int,
        default=1,
        help='Number of samplings to perform (slower, but will provide seeds for individual images)',
    )
    parser.add_argument(
        '-b',
        '--batch_size',
        type=int,
        default=1,
        help="number of images to produce per sampling",
    )
    parser.add_argument(
        '-W', '--width', type=int, help='Image width, multiple of 64'
    )
    parser.add_argument(
        '-H', '--height', type=int, help='Image height, multiple of 64'
    )
    parser.add_argument(
        '-C',
        '--cfg_scale',
        default=7.5,
        type=float,
        help='Classifier free guidance (CFG) scale - higher numbers cause generator to "try" harder.',
    )
    parser.add_argument(
        '-g', '--grid', action='store_true', help='generate a grid'
    )
    parser.add_argument(
        '--outdir',
        '-o',
        type=str,
        default=None,
        help='Directory to save generated images and a log of prompts and seeds',
    )
    parser.add_argument(
        '-i',
        '--individual',
        action='store_true',
        help='Generate individual files (default)',
    )
    parser.add_argument(
        '-I',
        '--init_img',
        type=str,
        help='Path to input image for img2img mode (supersedes width and height)',
    )
    parser.add_argument(
        '-T',
        '-fit',
        '--fit',
        action='store_true',
        help='If specified, will resize the input image to fit within the dimensions of width x height (512x512 default)',
    )
    parser.add_argument(
        '-f',
        '--strength',
        default=0.75,
        type=float,
        help='Strength for noising/unnoising. 0.0 preserves image exactly, 1.0 replaces it completely',
    )
    parser.add_argument(
        '-G',
        '--gfpgan_strength',
        default=0,
        type=float,
        help='The strength at which to apply the GFPGAN model to the result, in order to improve faces.',
    )
    parser.add_argument(
        '-U',
        '--upscale',
        nargs='+',
        default=None,
        type=float,
        help='Scale factor (2, 4) for upscaling followed by upscaling strength (0-1.0). If strength not specified, defaults to 0.75'
    )
    parser.add_argument(
        '-save_orig',
        '--save_original',
        action='store_true',
        help='Save original. Use it when upscaling to save both versions.',
    )
    # variants is going to be superseded by a generalized "prompt-morph" function
    # parser.add_argument('-v','--variants',type=int,help="in img2img mode, the first generated image will get passed back to img2img to generate the requested number of variants")
    parser.add_argument(
        '-x',
        '--skip_normalize',
        action='store_true',
        help='Skip subprompt weight normalization',
    )
    parser.add_argument(
        '-A',
        '-m',
        '--sampler',
        dest='sampler_name',
        default=None,
        type=str,
        choices=SAMPLER_CHOICES,
        metavar='SAMPLER_NAME',
        help=f'Switch to a different sampler. Supported samplers: {", ".join(SAMPLER_CHOICES)}',
    )
    parser.add_argument(
        '-t',
        '--log_tokenization',
        action='store_true',
        help='shows how the prompt is split into tokens'
    )
    parser.add_argument(
        '-v',
        '--variation_amount',
        default=0.0,
        type=float,
        help='If > 0, generates variations on the initial seed instead of random seeds per iteration. Must be between 0 and 1. Higher values will be more different.'
    )
    parser.add_argument(
        '-V',
        '--with_variations',
        default=None,
        type=str,
        help='list of variations to apply, in the format `seed:weight,seed:weight,...'
    )
    return parser
|
||||
|
||||
if readline_available:
    def setup_readline():
        """Install tab completion for the prompt's commands/switches and load saved history."""
        completer = Completer(['cd','pwd',
                               '--steps','-s','--seed','-S','--iterations','-n','--batch_size','-b',
                               '--width','-W','--height','-H','--cfg_scale','-C','--grid','-g',
                               '--individual','-i','--init_img','-I','--strength','-f'])
        readline.set_completer(completer.complete)
        # only a space separates completion words, so paths containing
        # '/', '-' or '.' are handed to the completer as one piece
        readline.set_completer_delims(" ")
        readline.parse_and_bind('tab: complete')
        load_history()

    def load_history():
        """Read ~/.dream_history if it exists and register saving it at interpreter exit."""
        histfile = os.path.join(os.path.expanduser('~'),".dream_history")
        # Set the cap before reading: previously it was skipped whenever the
        # history file did not exist yet (i.e. on the very first run).
        readline.set_history_length(1000)
        try:
            readline.read_history_file(histfile)
        except FileNotFoundError:
            pass
        atexit.register(readline.write_history_file,histfile)
|
||||
|
||||
class Completer():
    """Readline completer offering a fixed list of options plus file-path completion."""

    def __init__(self,options):
        """options -- iterable of completion words; stored sorted."""
        self.options = sorted(options)
        return

    def complete(self,text,state):
        """Readline completer entry point.

        text  -- the word being completed
        state -- index of the match requested
        Returns the state'th match, or None when there are no more.
        """
        buffer = readline.get_line_buffer()

        if text.startswith(('-I','--init_img')):
            return self._path_completions(text,state,('.png',))

        # after "cd", or for anything that looks like a path, complete
        # directories only (an empty extension tuple matches no file)
        if buffer.strip().endswith('cd') or text.startswith(('.','/')):
            return self._path_completions(text,state,())

        if state == 0:
            # This is the first time for this text, so build a match list.
            if text:
                self.matches = [s
                                for s in self.options
                                if s and s.startswith(text)]
            else:
                self.matches = self.options[:]

        # Return the state'th item from the match list,
        # if we have that many.
        try:
            response = self.matches[state]
        except IndexError:
            response = None
        return response

    def _path_completions(self,text,state,extensions):
        """Return the state'th filesystem completion for `text`, or None.

        text       -- word being completed; may start with '-I' or '--init_img='
        state      -- index of the match requested
        extensions -- suffix string or tuple of suffixes a file must end with;
                      directories always match
        """
        # get the path so far
        if text.startswith('-I'):
            typed = text.replace('-I','',1).lstrip()
        elif text.startswith('--init_img='):
            typed = text.replace('--init_img=','',1).lstrip()
        else:
            typed = text
        # whatever preceded the path (the switch) is re-attached to every
        # match; it used to be lost when the path had no directory part
        prefix = text[:len(text)-len(typed)]

        matches = list()

        path = os.path.expanduser(typed)
        if len(path)==0:
            matches.append(text+'./')
        else:
            search_dir = os.path.dirname(path)
            shown_dir  = os.path.dirname(typed)
            try:
                # os.listdir('') raises, so a bare name searches the cwd
                dir_list = sorted(os.listdir(search_dir or '.'))
            except OSError:
                # missing or unreadable directory: nothing to offer
                dir_list = []
            for n in dir_list:
                if n.startswith('.') and len(n)>1:
                    continue
                full_path = os.path.join(search_dir,n)
                if full_path.startswith(path):
                    if os.path.isdir(full_path):
                        matches.append(prefix+os.path.join(shown_dir,n)+'/')
                    elif n.endswith(extensions):
                        matches.append(prefix+os.path.join(shown_dir,n))

        try:
            response = matches[state]
        except IndexError:
            response = None
        return response
|
||||
|
||||
# Run the interactive entry point only when executed as a script.
if __name__ == '__main__':
    main()
|
||||
|
||||
@@ -6,7 +6,7 @@ import numpy as np
|
||||
import torch
|
||||
from main import instantiate_from_config
|
||||
from ldm.models.diffusion.ddim import DDIMSampler
|
||||
|
||||
from ldm.dream.devices import choose_torch_device
|
||||
|
||||
def make_batch(image, mask, device):
|
||||
image = np.array(Image.open(image).convert("RGB"))
|
||||
@@ -61,8 +61,8 @@ if __name__ == "__main__":
|
||||
model.load_state_dict(torch.load("models/ldm/inpainting_big/last.ckpt")["state_dict"],
|
||||
strict=False)
|
||||
|
||||
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
||||
model = model.to(device)
|
||||
device = choose_torch_device()
|
||||
model = model.to(device)
|
||||
sampler = DDIMSampler(model)
|
||||
|
||||
os.makedirs(opt.outdir, exist_ok=True)
|
||||
|
||||
115
scripts/merge_embeddings.py
Normal file
@@ -0,0 +1,115 @@
|
||||
from ldm.modules.encoders.modules import FrozenCLIPEmbedder, BERTEmbedder
|
||||
from ldm.modules.embedding_manager import EmbeddingManager
|
||||
|
||||
import argparse, os
|
||||
from functools import partial
|
||||
|
||||
import torch
|
||||
|
||||
def get_placeholder_loop(placeholder_string, embedder, use_bert):
    """Ask the user for a replacement placeholder until one yields a token.

    placeholder_string -- the placeholder that was already in use
    embedder           -- provides `tknz_fn` (used when use_bert is true) or `tokenizer`
    use_bert           -- selects the BERT lookup helper instead of the CLIP one

    Returns (replacement_string, token), where token is the first non-None
    result of the lookup helper.
    """
    candidate = input(f"Placeholder string {placeholder_string} was already used. Please enter a replacement string: ")

    while True:
        if use_bert:
            token = get_bert_token_for_string(embedder.tknz_fn, candidate)
        else:
            token = get_clip_token_for_string(embedder.tokenizer, candidate)

        if token is not None:
            return candidate, token

        # the helper rejected the candidate; ask again, quoting the rejected text
        candidate = input(f"Placeholder string '{candidate}' maps to more than a single token. Please enter another string: ")
|
||||
|
||||
def get_clip_token_for_string(tokenizer, string):
    """Return the id at position [0, 1] of the encoded string, or None.

    tokenizer -- callable returning a mapping with an "input_ids" tensor
    string    -- text to encode (padded/truncated to 77 ids)

    A value is returned only when exactly two ids differ from 49407.
    NOTE(review): 49407 is presumably the CLIP end-of-text/padding id, which
    would leave the start marker plus one real token -- confirm.
    """
    encoded = tokenizer(
        string,
        truncation=True,
        max_length=77,
        return_length=True,
        return_overflowing_tokens=False,
        padding="max_length",
        return_tensors="pt"
    )
    input_ids = encoded["input_ids"]

    # subtracting 49407 zeroes every position holding that id
    differing = torch.count_nonzero(input_ids - 49407)
    if differing != 2:
        return None

    return input_ids[0, 1]
|
||||
|
||||
def get_bert_token_for_string(tokenizer, string):
    """Return the id at position [0, 1] of tokenizer(string), or None.

    A value is returned only when the encoded tensor has exactly three
    non-zero entries.
    NOTE(review): presumably start marker + one token + end marker with zero
    padding -- confirm against the tokenizer in use.
    """
    encoded = tokenizer(string)
    return encoded[0, 1] if torch.count_nonzero(encoded) == 3 else None
|
||||
|
||||
|
||||
# Script entry point: merge several embedding-manager checkpoints into one,
# asking the user for a new name whenever two checkpoints share a placeholder.
if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--manager_ckpts",
        type=str,
        nargs="+",
        required=True,
        help="Paths to a set of embedding managers to be merged."
    )

    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="Output path for the merged manager",
    )

    parser.add_argument(
        "-sd", "--use_bert",
        action="store_true",
        help="Flag to denote that we are not merging stable diffusion embeddings"
    )

    args = parser.parse_args()

    if args.use_bert:
        embedder = BERTEmbedder(n_embed=1280, n_layer=32).cuda()
    else:
        embedder = FrozenCLIPEmbedder().cuda()

    # From here on the name EmbeddingManager is a zero-argument factory bound
    # to this embedder and the "*" placeholder list.
    EmbeddingManager = partial(EmbeddingManager, embedder, ["*"])

    string_to_token_dict = {}
    string_to_param_dict = torch.nn.ParameterDict()

    # placeholder -> checkpoint path it came from (printed at the end)
    placeholder_to_src = {}

    for manager_ckpt in args.manager_ckpts:
        print(f"Parsing {manager_ckpt}...")

        manager = EmbeddingManager()
        manager.load(manager_ckpt)

        for placeholder_string in manager.string_to_token_dict:
            if placeholder_string not in string_to_token_dict:
                string_to_token_dict[placeholder_string] = manager.string_to_token_dict[placeholder_string]
                string_to_param_dict[placeholder_string] = manager.string_to_param_dict[placeholder_string]

                placeholder_to_src[placeholder_string] = manager_ckpt
            else:
                new_placeholder, new_token = get_placeholder_loop(placeholder_string, embedder, use_bert=args.use_bert)
                # The replacement can itself clash with a placeholder that is
                # already merged; keep asking instead of silently overwriting
                # the earlier embedding.
                while new_placeholder in string_to_token_dict:
                    new_placeholder, new_token = get_placeholder_loop(new_placeholder, embedder, use_bert=args.use_bert)
                string_to_token_dict[new_placeholder] = new_token
                string_to_param_dict[new_placeholder] = manager.string_to_param_dict[placeholder_string]

                placeholder_to_src[new_placeholder] = manager_ckpt

    print("Saving combined manager...")
    merged_manager = EmbeddingManager()
    merged_manager.string_to_param_dict = string_to_param_dict
    merged_manager.string_to_token_dict = string_to_token_dict
    merged_manager.save(args.output_path)

    print("Managers merged. Final list of placeholders: ")
    print(placeholder_to_src)
|
||||
@@ -18,6 +18,7 @@ from pytorch_lightning import seed_everything
|
||||
from ldm.util import instantiate_from_config
|
||||
from ldm.models.diffusion.ddim import DDIMSampler
|
||||
from ldm.models.diffusion.plms import PLMSSampler
|
||||
from ldm.dream.devices import choose_torch_device
|
||||
|
||||
|
||||
def chunk(it, size):
|
||||
@@ -40,7 +41,7 @@ def load_model_from_config(config, ckpt, verbose=False):
|
||||
print("unexpected keys:")
|
||||
print(u)
|
||||
|
||||
model.cuda()
|
||||
model.to(choose_torch_device())
|
||||
model.eval()
|
||||
return model
|
||||
|
||||
@@ -199,7 +200,7 @@ def main():
|
||||
config = OmegaConf.load(f"{opt.config}")
|
||||
model = load_model_from_config(config, f"{opt.ckpt}")
|
||||
|
||||
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
||||
device = torch.device(choose_torch_device())
|
||||
model = model.to(device)
|
||||
|
||||
if opt.plms:
|
||||
@@ -241,8 +242,10 @@ def main():
|
||||
print(f"target t_enc is {t_enc} steps")
|
||||
|
||||
precision_scope = autocast if opt.precision == "autocast" else nullcontext
|
||||
if device.type in ['mps', 'cpu']:
|
||||
precision_scope = nullcontext # have to use f32 on mps
|
||||
with torch.no_grad():
|
||||
with precision_scope("cuda"):
|
||||
with precision_scope(device.type):
|
||||
with model.ema_scope():
|
||||
tic = time.time()
|
||||
all_samples = list()
|
||||
|
||||
@@ -12,14 +12,13 @@ from pytorch_lightning import seed_everything
|
||||
from torch import autocast
|
||||
from contextlib import contextmanager, nullcontext
|
||||
|
||||
import accelerate
|
||||
import k_diffusion as K
|
||||
import torch.nn as nn
|
||||
|
||||
from ldm.util import instantiate_from_config
|
||||
from ldm.util import instantiate_from_config
|
||||
from ldm.models.diffusion.ddim import DDIMSampler
|
||||
from ldm.models.diffusion.plms import PLMSSampler
|
||||
|
||||
from ldm.dream.devices import choose_torch_device
|
||||
|
||||
def chunk(it, size):
|
||||
it = iter(it)
|
||||
@@ -41,7 +40,7 @@ def load_model_from_config(config, ckpt, verbose=False):
|
||||
print("unexpected keys:")
|
||||
print(u)
|
||||
|
||||
model.cuda()
|
||||
model.to(choose_torch_device())
|
||||
model.eval()
|
||||
return model
|
||||
|
||||
@@ -191,18 +190,17 @@ def main():
|
||||
opt.ckpt = "models/ldm/text2img-large/model.ckpt"
|
||||
opt.outdir = "outputs/txt2img-samples-laion400m"
|
||||
|
||||
seed_everything(opt.seed)
|
||||
|
||||
config = OmegaConf.load(f"{opt.config}")
|
||||
model = load_model_from_config(config, f"{opt.ckpt}")
|
||||
|
||||
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
||||
model = model.to(device)
|
||||
seed_everything(opt.seed)
|
||||
|
||||
device = torch.device(choose_torch_device())
|
||||
model = model.to(device)
|
||||
|
||||
#for klms
|
||||
model_wrap = K.external.CompVisDenoiser(model)
|
||||
accelerator = accelerate.Accelerator()
|
||||
device = accelerator.device
|
||||
class CFGDenoiser(nn.Module):
|
||||
def __init__(self, model):
|
||||
super().__init__()
|
||||
@@ -243,16 +241,22 @@ def main():
|
||||
|
||||
start_code = None
|
||||
if opt.fixed_code:
|
||||
start_code = torch.randn([opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f], device=device)
|
||||
shape = [opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f]
|
||||
if device.type == 'mps':
|
||||
start_code = torch.randn(shape, device='cpu').to(device)
|
||||
else:
|
||||
torch.randn(shape, device=device)
|
||||
|
||||
precision_scope = autocast if opt.precision=="autocast" else nullcontext
|
||||
if device.type in ['mps', 'cpu']:
|
||||
precision_scope = nullcontext # have to use f32 on mps
|
||||
with torch.no_grad():
|
||||
with precision_scope("cuda"):
|
||||
with precision_scope(device.type):
|
||||
with model.ema_scope():
|
||||
tic = time.time()
|
||||
all_samples = list()
|
||||
for n in trange(opt.n_iter, desc="Sampling", disable =not accelerator.is_main_process):
|
||||
for prompts in tqdm(data, desc="data", disable =not accelerator.is_main_process):
|
||||
for n in trange(opt.n_iter, desc="Sampling"):
|
||||
for prompts in tqdm(data, desc="data"):
|
||||
uc = None
|
||||
if opt.scale != 1.0:
|
||||
uc = model.get_learned_conditioning(batch_size * [""])
|
||||
@@ -279,13 +283,10 @@ def main():
|
||||
x = torch.randn([opt.n_samples, *shape], device=device) * sigmas[0] # for GPU draw
|
||||
model_wrap_cfg = CFGDenoiser(model_wrap)
|
||||
extra_args = {'cond': c, 'uncond': uc, 'cond_scale': opt.scale}
|
||||
samples_ddim = K.sampling.sample_lms(model_wrap_cfg, x, sigmas, extra_args=extra_args, disable=not accelerator.is_main_process)
|
||||
samples_ddim = K.sampling.sample_lms(model_wrap_cfg, x, sigmas, extra_args=extra_args)
|
||||
|
||||
x_samples_ddim = model.decode_first_stage(samples_ddim)
|
||||
x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
|
||||
|
||||
if opt.klms:
|
||||
x_sample = accelerator.gather(x_samples_ddim)
|
||||
|
||||
if not opt.skip_save:
|
||||
for x_sample in x_samples_ddim:
|
||||
|
||||
@@ -1,32 +1,86 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
|
||||
# Before running stable-diffusion on an internet-isolated machine,
|
||||
# run this script from one with internet connectivity. The
|
||||
# two machines must share a common .cache directory.
|
||||
from transformers import CLIPTokenizer, CLIPTextModel
|
||||
import clip
|
||||
from transformers import BertTokenizerFast
|
||||
import sys
|
||||
import transformers
|
||||
import os
|
||||
import warnings
|
||||
|
||||
transformers.logging.set_verbosity_error()

# this will preload the Bert tokenizer files
print('preloading bert tokenizer...')

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
print('...success')

# this will download requirements for Kornia
print('preloading Kornia requirements (ignore the deprecation warnings)...')
with warnings.catch_warnings():
    # silence DeprecationWarning only for the duration of the import
    warnings.filterwarnings('ignore', category=DeprecationWarning)
    import kornia
print('...success')

# doesn't work - probably wrong logger
# logging.getLogger('transformers.tokenization_utils').setLevel(logging.ERROR)
version = 'openai/clip-vit-large-patch14'

print('preloading CLIP model (Ignore the deprecation warnings)...')
# flush so the message appears before any output produced while loading
sys.stdout.flush()

tokenizer = CLIPTokenizer.from_pretrained(version)
transformer = CLIPTextModel.from_pretrained(version)
print('\n\n...success')
|
||||
|
||||
# In the event that the user has installed GFPGAN and also elected to use
# RealESRGAN, this will attempt to download the model needed by RealESRGANer
try:
    from realesrgan import RealESRGANer
except ModuleNotFoundError:
    gfpgan = False
else:
    gfpgan = True

if gfpgan:
    print('Loading models from RealESRGAN and facexlib')
    try:
        from basicsr.archs.rrdbnet_arch import RRDBNet
        from facexlib.utils.face_restoration_helper import FaceRestoreHelper

        # one RealESRGANer per scale factor, x2 first and then x4,
        # each constructed with the weights URL for that scale
        realesrgan_weights = (
            (2, 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth'),
            (4, 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth'),
        )
        for factor, weights_url in realesrgan_weights:
            RealESRGANer(
                scale=factor,
                model_path=weights_url,
                model=RRDBNet(
                    num_in_ch=3,
                    num_out_ch=3,
                    num_feat=64,
                    num_block=23,
                    num_grow_ch=32,
                    scale=factor,
                ),
            )

        FaceRestoreHelper(1, det_model='retinaface_resnet50')
        print('...success')
    except Exception:
        # best effort: report the failure and carry on
        import traceback

        print('Error loading GFPGAN:')
        print(traceback.format_exc())
|
||||
|
||||
BIN
static/colab_notebook.png
Normal file
|
After Width: | Height: | Size: 799 KiB |
BIN
static/dream-py-demo.png
Normal file
|
After Width: | Height: | Size: 499 KiB |
97
static/dream_web/index.css
Normal file
@@ -0,0 +1,97 @@
|
||||
/* Styles for the Dream Server page. */
* {
  font-family: 'Arial';
}
#header {
  text-decoration: dotted underline;
}
#search {
  margin-top: 20vh;
  margin-left: auto;
  margin-right: auto;
  max-width: 1024px;
  text-align: center;
}
fieldset {
  border: none;
}
div {
  padding: 10px 10px 10px 10px;
}
#fieldset-search {
  display: flex;
}
#scaling-inprocess-message{
  font-weight: bold;
  font-style: italic;
  display: none;
}
#prompt {
  flex-grow: 1;

  border-radius: 20px 0px 0px 20px;
  padding: 5px 10px 5px 10px;
  border: 1px solid black;
  border-right: none;
  outline: none;
}
#submit {
  border-radius: 0px 20px 20px 0px;
  padding: 5px 10px 5px 10px;
  border: 1px solid black;
}
#reset-all {
  background-color: pink;
}
#results {
  text-align: center;
  /* "//" is not a CSS comment; the disabled rule is kept as a real comment */
  /* max-width: 1024px; */
  margin: auto;
  padding-top: 10px;
}
#results img {
  cursor: pointer;
  height: 30vh;
  border-radius: 5px;
  margin: 10px;
}
#fieldset-config {
  line-height:2em;
}
input[type="number"] {
  width: 60px;
}
#seed {
  width: 150px;
}
hr {
  /* width: 200px; */
}
label {
  white-space: nowrap;
}
/* merged from two separate #progress-section rules */
#progress-section {
  display: none;
  background-color: #F5F5F5;
}
#progress-image {
  width: 30vh;
  height: 30vh;
}
#cancel-button {
  cursor: pointer;
  color: red;
}
#txt2img {
  background-color: #DCDCDC;
}
#img2img {
  background-color: #F5F5F5;
}
#gfpgan {
  background-color: #DCDCDC;
}
#about {
  background-color: #DCDCDC;
}
|
||||
111
static/dream_web/index.html
Normal file
@@ -0,0 +1,111 @@
|
||||
<html lang="en">
  <head>
    <title>Stable Diffusion Dream Server</title>
    <meta charset="utf-8">
    <link rel="icon" href="data:,">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">

    <link rel="stylesheet" href="static/dream_web/index.css">
    <!-- config.js must load before index.js: index.js reads the global `config` in its onload handler. -->
    <script src="config.js"></script>
    <script src="static/dream_web/index.js"></script>
  </head>
  <body>
    <div id="search">
      <h2 id="header">Stable Diffusion Dream Server</h2>

      <form id="generate-form" method="post" action="#">
        <!-- Text-to-image settings. The first fieldset is the one index.js disables while generating. -->
        <div id="txt2img">
          <fieldset id="fieldset-search">
            <input type="text" id="prompt" name="prompt">
            <input type="submit" id="submit" value="Generate">
          </fieldset>
          <fieldset id="fieldset-config">
            <label for="iterations">Images to generate:</label>
            <input value="1" type="number" id="iterations" name="iterations" size="4">
            <label for="steps">Steps:</label>
            <input value="50" type="number" id="steps" name="steps">
            <label for="cfgscale">Cfg Scale:</label>
            <input value="7.5" type="number" id="cfgscale" name="cfgscale" step="any">
            <label for="sampler">Sampler:</label>
            <select id="sampler" name="sampler">
              <option value="ddim">DDIM</option>
              <option value="plms">PLMS</option>
              <option value="k_lms" selected>KLMS</option>
              <option value="k_dpm_2">KDPM_2</option>
              <option value="k_dpm_2_a">KDPM_2A</option>
              <option value="k_euler">KEULER</option>
              <option value="k_euler_a">KEULER_A</option>
              <option value="k_heun">KHEUN</option>
            </select>
            <br>
            <label title="Set to multiple of 64" for="width">Width:</label>
            <select id="width" name="width">
              <option value="64">64</option> <option value="128">128</option>
              <option value="192">192</option> <option value="256">256</option>
              <option value="320">320</option> <option value="384">384</option>
              <option value="448">448</option> <option value="512" selected>512</option>
              <option value="576">576</option> <option value="640">640</option>
              <option value="704">704</option> <option value="768">768</option>
              <option value="832">832</option> <option value="896">896</option>
              <option value="960">960</option> <option value="1024">1024</option>
            </select>
            <label title="Set to multiple of 64" for="height">Height:</label>
            <select id="height" name="height">
              <option value="64">64</option> <option value="128">128</option>
              <option value="192">192</option> <option value="256">256</option>
              <option value="320">320</option> <option value="384">384</option>
              <option value="448">448</option> <option value="512" selected>512</option>
              <option value="576">576</option> <option value="640">640</option>
              <option value="704">704</option> <option value="768">768</option>
              <option value="832">832</option> <option value="896">896</option>
              <option value="960">960</option> <option value="1024">1024</option>
            </select>
            <label title="Set to -1 for random seed" for="seed">Seed:</label>
            <input value="-1" type="number" id="seed" name="seed">
            <button type="button" id="reset-seed">↺</button>
            <input type="checkbox" name="progress_images" id="progress_images">
            <label for="progress_images">Display in-progress images (slows down generation):</label>
            <button type="button" id="reset-all">Reset to Defaults</button>
          </fieldset>
        </div>
        <!-- Image-to-image settings. -->
        <div id="img2img">
          <label title="Upload an image to use img2img" for="initimg">Initial image:</label>
          <input type="file" id="initimg" name="initimg" accept=".jpg, .jpeg, .png">
          <br>
          <label for="strength">Img2Img Strength:</label>
          <input value="0.75" type="number" id="strength" name="strength" step="0.01" min="0" max="1">
          <input type="checkbox" id="fit" name="fit" checked>
          <label title="Rescale image to fit within requested width and height" for="fit">Fit to width/height:</label>
        </div>
        <!-- Face-fixing / upscaling settings; index.js hides this section when config.gfpgan_model_exists is falsy. -->
        <div id="gfpgan">
          <label title="Strength of the gfpgan (face fixing) algorithm." for="gfpgan_strength">GFPGAN Strength (0 to disable):</label>
          <input value="0.8" min="0" max="1" type="number" id="gfpgan_strength" name="gfpgan_strength" step="0.05">
          <label title="Upscaling to perform using ESRGAN." for="upscale_level">Upscaling Level</label>
          <select id="upscale_level" name="upscale_level">
            <option value="" selected>None</option>
            <option value="2">2x</option>
            <option value="4">4x</option>
          </select>
          <label title="Strength of the esrgan (upscaling) algorithm." for="upscale_strength">Upscale Strength:</label>
          <input value="0.75" min="0" max="1" type="number" id="upscale_strength" name="upscale_strength" step="0.05">
        </div>
      </form>
      <div id="about">For news and support for this web service, visit our <a href="https://github.com/lstein/stable-diffusion">GitHub site</a></div>
      <br>
      <!-- Shown by index.js only while a generation request is running. -->
      <div id="progress-section">
        <progress id="progress-bar" value="0" max="1"></progress>
        <span id="cancel-button" title="Cancel">✖</span>
        <br>
        <img id="progress-image" src='data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg"/>'>
        <div id="scaling-inprocess-message">
          <i><span>Postprocessing...</span><span id="processing_cnt">1/3</span></i>
        </div>
      </div>
    </div>
    <!-- Generated images are prepended here by index.js. -->
    <div id="results">
      <div id="no-results-message">
        <i><p>No results...</p></i>
      </div>
    </div>
  </body>
</html>
|
||||
161
static/dream_web/index.js
Normal file
@@ -0,0 +1,161 @@
|
||||
/**
 * Reads a file through FileReader and resolves with its `data:` URL.
 *
 * @param {Blob} file - The file to encode.
 * @returns {Promise<string>} Resolves with `FileReader.result`; rejects with
 *     whatever the reader passes to its `onerror` handler.
 */
function toBase64(file) {
    return new Promise((resolve, reject) => {
        const reader = new FileReader();
        // Handlers fire asynchronously, so wiring them before starting the
        // read is equivalent to wiring them after.
        reader.onerror = (error) => {
            reject(error);
        };
        reader.onload = () => {
            resolve(reader.result);
        };
        reader.readAsDataURL(file);
    });
}
|
||||
|
||||
/**
 * Prepends a generated image to the `#results` container and makes it
 * clickable: clicking copies the image's generation settings back into the
 * `#generate-form` form and persists them via `saveFields`.
 *
 * @param {string} src - URL of the generated image.
 * @param {number|string} seed - Seed reported for this image.
 * @param {Object} config - Settings reported for this image, keyed by form
 *     field name (assumed to mirror the submitted form data; verify against
 *     the server).
 */
function appendOutput(src, seed, config) {
    const outputNode = document.createElement("img");
    outputNode.src = src;

    // Template literal instead of seed.toString(): a null/undefined seed must
    // not prevent the image from being shown.
    const altText = `${seed} | ${config.prompt}`;
    outputNode.alt = altText;
    outputNode.title = altText;

    // Reload image config
    outputNode.addEventListener('click', () => {
        const form = document.querySelector("#generate-form");
        for (const field of form.elements) {
            const key = field.name;
            // Skip unnamed controls (buttons, fieldsets), settings the server
            // did not report, and file inputs: assigning a non-empty string to
            // a file input's value throws.
            if (!key || field.type === 'file'
                    || !Object.prototype.hasOwnProperty.call(config, key)) {
                continue;
            }
            if (field.type === 'checkbox') {
                // A checkbox's state lives in `checked`; writing `value` would
                // only change what gets submitted when it is ticked.
                // NOTE(review): assumes a truthy config value means "ticked".
                field.checked = Boolean(config[key]);
            } else {
                field.value = config[key] ?? '';
            }
        }
        document.querySelector("#seed").value = seed;

        saveFields(form);
    });

    document.querySelector("#results").prepend(outputNode);
}
|
||||
|
||||
/**
 * Persists the form's current settings to localStorage, one key per field
 * name.
 *
 * Text-like values are taken from FormData; uploaded files are skipped.
 * Checkboxes are stored explicitly as 'true' / 'false', because an unticked
 * checkbox is absent from FormData and would otherwise leave a stale entry.
 *
 * @param {HTMLFormElement} form - The form whose fields are saved.
 */
function saveFields(form) {
    for (const [key, value] of new FormData(form)) {
        if (typeof value !== 'object') { // Don't save 'file' type
            localStorage.setItem(key, value);
        }
    }
    for (const field of form.elements) {
        if (field.type === 'checkbox' && field.name) {
            localStorage.setItem(field.name, String(field.checked));
        }
    }
}

/**
 * Restores settings previously written by saveFields into the form.
 *
 * Fields with no stored entry keep their current value. File inputs are never
 * touched. For checkboxes the ticked state is restored: any stored value other
 * than 'false' counts as ticked, which also covers the 'on' written by earlier
 * versions of saveFields.
 *
 * @param {HTMLFormElement} form - The form whose fields are restored.
 */
function loadFields(form) {
    for (const field of form.elements) {
        if (!field.name || field.type === 'file') {
            continue;
        }
        const item = localStorage.getItem(field.name);
        if (item === null) {
            continue;
        }
        if (field.type === 'checkbox') {
            field.checked = item !== 'false';
        } else {
            field.value = item;
        }
    }
}
|
||||
|
||||
/**
 * Wipes all saved settings and resets the form to its defaults, while keeping
 * whatever is currently typed in the prompt field.
 *
 * @param {HTMLFormElement} form - The form to reset; must have a `prompt` control.
 */
function clearFields(form) {
    localStorage.clear();
    // form.reset() would blank the prompt too, so carry it across the reset.
    const { value: keptPrompt } = form.prompt;
    form.reset();
    form.prompt.value = keptPrompt;
}
|
||||
|
||||
// Empty SVG used as the in-progress preview image before any preview arrives.
const BLANK_IMAGE_URL = 'data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg"/>';

/**
 * Submits the generation form as JSON and consumes the streamed response,
 * which is read as newline-delimited JSON events.
 *
 * Events handled: `result` (image appended via appendOutput), `step`
 * (progress bar / preview image), `upscaling-started` / `upscaling-done`
 * (post-processing message) and `canceled`.
 *
 * The first fieldset of the form is disabled while the request is running and
 * is always re-enabled afterwards, including when the request or the stream
 * parsing fails.
 *
 * @param {HTMLFormElement} form - The generation form.
 * @returns {Promise<void>} Resolves once the response stream has been fully
 *     consumed (or has failed) and the form has been re-enabled.
 */
async function generateSubmit(form) {
    const promptEle = document.querySelector("#prompt");
    const prompt = promptEle.value;

    // Collect the form data BEFORE disabling the fieldset: disabled controls
    // are left out of FormData.
    const formData = Object.fromEntries(new FormData(form));
    // Convert file data to base64 (an empty file input yields a nameless File).
    formData.initimg = formData.initimg?.name ? await toBase64(formData.initimg) : null;

    const totalSteps = formData.initimg
        ? Math.floor(formData.strength * formData.steps)
        : formData.steps;

    const progressSectionEle = document.querySelector('#progress-section');
    progressSectionEle.style.display = 'initial';
    const progressEle = document.querySelector('#progress-bar');
    progressEle.setAttribute('max', totalSteps);
    const progressImageEle = document.querySelector('#progress-image');
    progressImageEle.src = BLANK_IMAGE_URL;
    progressImageEle.style.display =
        Object.prototype.hasOwnProperty.call(formData, 'progress_images') ? 'initial' : 'none';

    // Disable form while generating
    const fieldset = form.querySelector('fieldset');
    fieldset.setAttribute('disabled', '');
    promptEle.value = `Generating: "${prompt}"`;

    let noOutputs = true;
    let failed = false;

    const handleEvent = (data) => {
        if (data.event === 'result') {
            noOutputs = false;
            document.querySelector("#no-results-message")?.remove();
            appendOutput(data.url, data.seed, data.config);
            progressEle.setAttribute('value', 0);
            progressEle.setAttribute('max', totalSteps);
        } else if (data.event === 'upscaling-started') {
            document.getElementById("processing_cnt").textContent = data.processed_file_cnt;
            document.getElementById("scaling-inprocess-message").style.display = "block";
        } else if (data.event === 'upscaling-done') {
            document.getElementById("scaling-inprocess-message").style.display = "none";
        } else if (data.event === 'step') {
            progressEle.setAttribute('value', data.step);
            if (data.url) {
                progressImageEle.src = data.url;
            }
        } else if (data.event === 'canceled') {
            // avoid alerting as if this were an error case
            noOutputs = false;
        }
    };

    try {
        // Post as JSON, using Fetch streaming to get results
        const response = await fetch(form.action, {
            method: form.method,
            body: JSON.stringify(formData),
        });
        const reader = response.body.getReader();
        // One decoder for the whole stream: network chunks can end in the
        // middle of a multi-byte character.
        const decoder = new TextDecoder();
        // Text received so far that does not yet end in a newline; a chunk
        // boundary may fall in the middle of a JSON event.
        let pending = '';

        while (true) {
            const { value, done } = await reader.read();
            pending += decoder.decode(value, { stream: !done });

            const lines = pending.split('\n');
            // Hold back the trailing partial line until more data arrives; at
            // end of stream whatever is left is a complete final line.
            pending = done ? '' : lines.pop();

            for (const line of lines) {
                if (line !== '') {
                    handleEvent(JSON.parse(line));
                }
            }

            if (done) {
                break;
            }
        }
    } catch (err) {
        failed = true;
        console.error(err);
    } finally {
        // Re-enable form and restore the prompt, whatever happened above.
        progressSectionEle.style.display = 'none';
        fieldset.removeAttribute('disabled');
        promptEle.value = prompt;
        progressEle.setAttribute('value', '0');
    }

    if (noOutputs || failed) {
        alert("Error occurred while generating.");
    }
}
|
||||
|
||||
// Page start-up: wires the form and button handlers, restores saved settings,
// and hides the GFPGAN section when the global `config` reports no model.
window.onload = () => {
    const generateForm = document.querySelector("#generate-form");

    // Submit through generateSubmit instead of a normal page navigation.
    generateForm.addEventListener('submit', (event) => {
        event.preventDefault();
        generateSubmit(event.target);
    });

    // Persist settings whenever any control in the form changes.
    generateForm.addEventListener('change', (event) => {
        saveFields(event.target.form);
    });

    // -1 is the seed input's default value.
    document.querySelector("#reset-seed").addEventListener('click', (event) => {
        document.querySelector("#seed").value = -1;
        saveFields(event.target.form);
    });

    document.querySelector("#reset-all").addEventListener('click', (event) => {
        clearFields(event.target.form);
    });

    loadFields(generateForm);

    // Request cancellation; a failed request is only logged.
    document.querySelector('#cancel-button').addEventListener('click', () => {
        fetch('/cancel').catch((err) => {
            console.error(err);
        });
    });

    if (!config.gfpgan_model_exists) {
        document.querySelector("#gfpgan").style.display = 'none';
    }
};
|
||||
BIN
static/dream_web_server.png
Normal file
|
After Width: | Height: | Size: 536 KiB |
BIN
static/logo_temp.png
Normal file
|
After Width: | Height: | Size: 34 KiB |
BIN
static/variation_walkthru/000001.3357757885.png
Normal file
|
After Width: | Height: | Size: 429 KiB |
BIN
static/variation_walkthru/000002.1614299449.png
Normal file
|
After Width: | Height: | Size: 445 KiB |
BIN
static/variation_walkthru/000002.3647897225.png
Normal file
|
After Width: | Height: | Size: 426 KiB |
BIN
static/variation_walkthru/000003.1614299449.png
Normal file
|
After Width: | Height: | Size: 427 KiB |
BIN
static/variation_walkthru/000004.3747154981.png
Normal file
|
After Width: | Height: | Size: 424 KiB |