Merge branch 'development' into main

This merge adds the following major features: * Support for image variations. * Security fix for webGUI (binds to localhost by default, use --host=0.0.0.0 to allow access from external interface). * Scalable configs/models.yaml configuration file for adding more models as they become available. * More tuning and exception handling for M1 hardware running MPS. * Various documentation fixes.
make results section of webgui full width
2026-01-15 16:18:06 -05:00 · 2022-09-03 11:58:46 -04:00 · 2022-09-03 11:58:05 -04:00 · 2022-09-03 11:49:37 -04:00 · 2022-09-03 11:45:21 -04:00 · 2022-09-03 11:36:04 -04:00
89 changed files with 11724 additions and 3265 deletions
--- a/.dev_scripts/diff_images.py
+++ b/.dev_scripts/diff_images.py
@@ -0,0 +1,32 @@
+import argparse
+
+import numpy as np
+from PIL import Image
+
+
+def read_image_int16(image_path):
+    image = Image.open(image_path)
+    return np.array(image).astype(np.int16)
+
+
+def calc_images_mean_L1(image1_path, image2_path):
+    image1 = read_image_int16(image1_path)
+    image2 = read_image_int16(image2_path)
+    assert image1.shape == image2.shape
+
+    mean_L1 = np.abs(image1 - image2).mean()
+    return mean_L1
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('image1_path')
+    parser.add_argument('image2_path')
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    mean_L1 = calc_images_mean_L1(args.image1_path, args.image2_path)
+    print(mean_L1)
--- a/.dev_scripts/images/v1_4_astronaut_rides_horse_plms_step50_seed42.png
+++ b/.dev_scripts/images/v1_4_astronaut_rides_horse_plms_step50_seed42.png
--- a/.dev_scripts/sample_command.txt
+++ b/.dev_scripts/sample_command.txt
@@ -0,0 +1 @@
+"a photograph of an astronaut riding a horse" -s50 -S42
--- a/.dev_scripts/test_regression_txt2img_dream_v1_4.sh
+++ b/.dev_scripts/test_regression_txt2img_dream_v1_4.sh
@@ -0,0 +1,20 @@
+# generate an image
+PROMPT_FILE=".dev_scripts/sample_command.txt"
+OUT_DIR="outputs/img-samples/test_regression_txt2img_v1_4"
+SAMPLES_DIR=${OUT_DIR}
+python scripts/dream.py \
+    --from_file ${PROMPT_FILE} \
+    --outdir ${OUT_DIR} \
+    --sampler plms \
+    --full_precision
+
+# original output by CompVis/stable-diffusion
+IMAGE1=".dev_scripts/images/v1_4_astronaut_rides_horse_plms_step50_seed42.png"
+# new output
+IMAGE2=`ls -A ${SAMPLES_DIR}/*.png | sort | tail -n 1`
+
+echo ""
+echo "comparing the following two images"
+echo "IMAGE1: ${IMAGE1}"
+echo "IMAGE2: ${IMAGE2}"
+python .dev_scripts/diff_images.py ${IMAGE1} ${IMAGE2}
--- a/.dev_scripts/test_regression_txt2img_v1_4.sh
+++ b/.dev_scripts/test_regression_txt2img_v1_4.sh
@@ -0,0 +1,23 @@
+# generate an image
+PROMPT="a photograph of an astronaut riding a horse"
+OUT_DIR="outputs/txt2img-samples/test_regression_txt2img_v1_4"
+SAMPLES_DIR="outputs/txt2img-samples/test_regression_txt2img_v1_4/samples"
+python scripts/orig_scripts/txt2img.py \
+    --prompt "${PROMPT}" \
+    --outdir ${OUT_DIR} \
+    --plms \
+    --ddim_steps 50 \
+    --n_samples 1 \
+    --n_iter 1 \
+    --seed 42
+
+# original output by CompVis/stable-diffusion
+IMAGE1=".dev_scripts/images/v1_4_astronaut_rides_horse_plms_step50_seed42.png"
+# new output
+IMAGE2=`ls -A ${SAMPLES_DIR}/*.png | sort | tail -n 1`
+
+echo ""
+echo "comparing the following two images"
+echo "IMAGE1: ${IMAGE1}"
+echo "IMAGE2: ${IMAGE2}"
+python .dev_scripts/diff_images.py ${IMAGE1} ${IMAGE2}
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,4 @@
+# Auto normalizes line endings on commit so devs don't need to change local settings.
+# Only affects text files and ignores other file types. 
+# For more info see: https://www.aleksandrhovhannisyan.com/blog/crlf-vs-lf-normalizing-line-endings-in-git/
+* text=auto
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,36 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Describe your environment**
+- GPU: [cuda/amd/mps/cpu]
+- VRAM: [if known]
+- CPU arch: [x86/arm]
+- OS: [Linux/Windows/macOS]
+- Python: [Anaconda/miniconda/miniforge/pyenv/other (explain)]
+- Branch: [if `git status` says anything other than "On branch main" paste it here]
+- Commit: [run `git show` and paste the line that starts with "Merge" here]
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Click on '....'
+3. Scroll down to '....'
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Additional context**
+Add any other context about the problem here.
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,185 @@
+# ignore default image save location and model symbolic link
+outputs/
+models/ldm/stable-diffusion-v1/model.ckpt
+
+# ignore a directory which serves as a place for initial images
+inputs/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# emacs autosave and recovery files
+*~
+.#*
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+src
+**/__pycache__/
+outputs
+
+# Logs and associated folders 
+# created from generated embeddings.
+logs
+testtube
+checkpoints
+# If it's a Mac
+.DS_Store
--- a/.gitmodules
+++ b/.gitmodules
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -0,0 +1,137 @@
+# **Changelog**
+
+## v1.13 (in process)
+
+- Supports a Google Colab notebook for a standalone server running on Google hardware [Arturo Mendivil](https://github.com/artmen1516)
+- WebUI supports GFPGAN/ESRGAN facial reconstruction and upscaling [Kevin Gibbons](https://github.com/bakkot)
+- WebUI supports incremental display of in-progress images during generation [Kevin Gibbons](https://github.com/bakkot)
+- Output directory can be specified on the dream> command line.
+- Fixed: the grid displayed duplicated images when there were not enough images to fill the final row [Muhammad Usama](https://github.com/SMUsamaShah)
+- Can specify --grid on dream.py command line as the default.
+- Miscellaneous internal bug and stability fixes.
+
+---
+
+## v1.12 (28 August 2022)
+
+- Improved file handling, including ability to read prompts from standard input.
+  (kudos to [Yunsaki](https://github.com/yunsaki))
+- The web server is now integrated with the dream.py script. Invoke by adding --web to
+  the dream.py command arguments.
+- Face restoration and upscaling via GFPGAN and Real-ESRGAN are now automatically
+  enabled if the GFPGAN directory is located as a sibling to Stable Diffusion.
+  VRAM requirements are modestly reduced. Thanks to both [Blessedcoolant](https://github.com/blessedcoolant) and
+  [Oceanswave](https://github.com/oceanswave) for their work on this.
+- You can now swap samplers on the dream> command line. [Blessedcoolant](https://github.com/blessedcoolant)
+
+---
+
+## v1.11 (26 August 2022)
+
+- NEW FEATURE: Support upscaling and face enhancement using the GFPGAN module. (kudos to [Oceanswave](https://github.com/Oceanswave))
+- You now can specify a seed of -1 to use the previous image's seed, -2 to use the seed for the image generated before that, etc.
+  Seed memory only extends back to the previous command, but will work on all images generated with the -n# switch.
+- Variant generation support temporarily disabled pending more general solution.
+- Created a feature branch named **yunsaki-morphing-dream** which adds experimental support for
+  iteratively modifying the prompt and its parameters. Please see [Pull Request #86](https://github.com/lstein/stable-diffusion/pull/86)
+  for a synopsis of how this works. Note that when this feature is eventually added to the main branch, it may be modified
+  significantly.
+
+---
+
+## v1.10 (25 August 2022)
+
+- A barebones but fully functional interactive web server for online generation of txt2img and img2img.
+
+---
+
+## v1.09 (24 August 2022)
+
+- A new -v option allows you to generate multiple variants of an initial image
+  in img2img mode. (kudos to [Oceanswave](https://github.com/Oceanswave). [
+  See this discussion in the PR for examples and details on use](https://github.com/lstein/stable-diffusion/pull/71#issuecomment-1226700810))
+- Added ability to personalize text to image generation (kudos to [Oceanswave](https://github.com/Oceanswave) and [nicolai256](https://github.com/nicolai256))
+- Enabled all of the samplers from k_diffusion
+
+---
+
+## v1.08 (24 August 2022)
+
+- Escape single quotes on the dream> command before trying to parse. This avoids
+  parse errors.
+- Removed instruction to get Python3.8 as first step in Windows install.
+  Anaconda3 does it for you.
+- Added bounds checks for numeric arguments that could cause crashes.
+- Cleaned up the copyright and license agreement files.
+
+---
+
+## v1.07 (23 August 2022)
+
+- Image filenames will now never fill gaps in the sequence, but will be assigned the
+  next higher name in the chosen directory. This ensures that the alphabetic and chronological
+  sort orders are the same.
+
+---
+
+## v1.06 (23 August 2022)
+
+- Added weighted prompt support contributed by [xraxra](https://github.com/xraxra)
+- Example of using weighted prompts to tweak a demonic figure contributed by [bmaltais](https://github.com/bmaltais)
+
+---
+
+## v1.05 (22 August 2022 - after the drop)
+
+- Filenames now use the following formats:
+  000010.95183149.png -- Two files produced by the same command (e.g. -n2),
+  000010.26742632.png -- distinguished by a different seed.
+
+  000011.455191342.01.png -- Two files produced by the same command using
+  000011.455191342.02.png -- a batch size>1 (e.g. -b2). They have the same seed.
+
+  000011.4160627868.grid#1-4.png -- a grid of four images (-g); the whole grid can
+  be regenerated with the indicated key
+
+- It should no longer be possible for one image to overwrite another
+- You can use the "cd" and "pwd" commands at the dream> prompt to set and retrieve
+  the path of the output directory.
+
+---
+
+## v1.04 (22 August 2022 - after the drop)
+
+- Updated README to reflect installation of the released weights.
+- Suppressed very noisy and inconsequential warning when loading the frozen CLIP
+  tokenizer.
+
+---
+
+## v1.03 (22 August 2022)
+
+- The original txt2img and img2img scripts from the CompVis repository have been moved into
+  a subfolder named "orig_scripts", to reduce confusion.
+
+---
+
+## v1.02 (21 August 2022)
+
+- A copy of the prompt and all of its switches and options is now stored in the corresponding
+  image in a tEXt metadata field named "Dream". You can read the prompt using scripts/images2prompt.py,
+  or an image editor that allows you to explore the full metadata.
+  **Please run "conda env update -f environment.yaml" to load the k_lms dependencies!!**
+
+---
+
+## v1.01 (21 August 2022)
+
+- added k_lms sampling.
+  **Please run "conda env update -f environment.yaml" to load the k_lms dependencies!!**
+- use half precision arithmetic by default, resulting in faster execution and lower memory requirements
+  Pass argument --full_precision to dream.py to get slower but more accurate image generation
+
+---
+
+## Links
+
+- **[Read Me](readme.md)**
--- a/30
+++ b/30
@@ -1,9 +1,27 @@
-All rights reserved by the authors.
-You must not distribute the weights provided to you directly or indirectly without explicit consent of the authors.
-You must not distribute harmful, offensive, dehumanizing content or otherwise harmful representations of people or their environments, cultures, religions, etc. produced with the model weights
-or other generated content described in the "Misuse and Malicious Use" section in the model card.
-The model weights are provided for research purposes only.
+MIT License

+Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
+
+This software is derived from a fork of the source code available from
+https://github.com/pesser/stable-diffusion and
+https://github.com/CompViz/stable-diffusion. They carry the following
+copyrights:
+
+Copyright (c) 2022 Machine Vision and Learning Group, LMU Munich
+Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors
+
+Please see individual source code files for copyright and authorship
+attributions.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -11,4 +29,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+SOFTWARE.
--- a/LICENSE-ModelWeights.txt
+++ b/LICENSE-ModelWeights.txt
@@ -0,0 +1,294 @@
+Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors
+
+CreativeML Open RAIL-M
+dated August 22, 2022
+
+Section I: PREAMBLE
+
+Multimodal generative models are being widely adopted and used, and
+have the potential to transform the way artists, among other
+individuals, conceive and benefit from AI or ML technologies as a tool
+for content creation.
+
+Notwithstanding the current and potential benefits that these
+artifacts can bring to society at large, there are also concerns about
+potential misuses of them, either due to their technical limitations
+or ethical considerations.
+
+In short, this license strives for both the open and responsible
+downstream use of the accompanying model. When it comes to the open
+character, we took inspiration from open source permissive licenses
+regarding the grant of IP rights. Referring to the downstream
+responsible use, we added use-based restrictions not permitting the
+use of the Model in very specific scenarios, in order for the licensor
+to be able to enforce the license in case potential misuses of the
+Model may occur. At the same time, we strive to promote open and
+responsible research on generative models for art and content
+generation.
+
+Even though downstream derivative versions of the model could be
+released under different licensing terms, the latter will always have
+to include - at minimum - the same use-based restrictions as the ones
+in the original license (this license). We believe in the intersection
+between open and responsible AI development; thus, this License aims
+to strike a balance between both in order to enable responsible
+open-science in the field of AI.
+
+This License governs the use of the model (and its derivatives) and is
+informed by the model card associated with the model.
+
+NOW THEREFORE, You and Licensor agree as follows:
+
+1. Definitions
+
+- "License" means the terms and conditions for use, reproduction, and
+  Distribution as defined in this document.
+
+- "Data" means a collection of information and/or content extracted
+  from the dataset used with the Model, including to train, pretrain,
+  or otherwise evaluate the Model. The Data is not licensed under this
+  License.
+
+- "Output" means the results of operating a Model as embodied in
+  informational content resulting therefrom.
+
+- "Model" means any accompanying machine-learning based assemblies
+  (including checkpoints), consisting of learnt weights, parameters
+  (including optimizer states), corresponding to the model
+  architecture as embodied in the Complementary Material, that have
+  been trained or tuned, in whole or in part on the Data, using the
+  Complementary Material.
+
+- "Derivatives of the Model" means all modifications to the Model,
+  works based on the Model, or any other model which is created or
+  initialized by transfer of patterns of the weights, parameters,
+  activations or output of the Model, to the other model, in order to
+  cause the other model to perform similarly to the Model, including -
+  but not limited to - distillation methods entailing the use of
+  intermediate data representations or methods based on the generation
+  of synthetic data by the Model for training the other model.
+
+- "Complementary Material" means the accompanying source code and
+  scripts used to define, run, load, benchmark or evaluate the Model,
+  and used to prepare data for training or evaluation, if any. This
+  includes any accompanying documentation, tutorials, examples, etc,
+  if any.
+
+- "Distribution" means any transmission, reproduction, publication or
+  other sharing of the Model or Derivatives of the Model to a third
+  party, including providing the Model as a hosted service made
+  available by electronic or other remote means - e.g. API-based or
+  web access.
+
+- "Licensor" means the copyright owner or entity authorized by the
+  copyright owner that is granting the License, including the persons
+  or entities that may have rights in the Model and/or distributing
+  the Model.
+
+- "You" (or "Your") means an individual or Legal Entity exercising
+  permissions granted by this License and/or making use of the Model
+  for whichever purpose and in any field of use, including usage of
+  the Model in an end-use application - e.g. chatbot, translator,
+  image generator.
+
+- "Third Parties" means individuals or legal entities that are not
+  under common control with Licensor or You.
+
+- "Contribution" means any work of authorship, including the original
+  version of the Model and any modifications or additions to that
+  Model or Derivatives of the Model thereof, that is intentionally
+  submitted to Licensor for inclusion in the Model by the copyright
+  owner or by an individual or Legal Entity authorized to submit on
+  behalf of the copyright owner. For the purposes of this definition,
+  "submitted" means any form of electronic, verbal, or written
+  communication sent to the Licensor or its representatives, including
+  but not limited to communication on electronic mailing lists, source
+  code control systems, and issue tracking systems that are managed
+  by, or on behalf of, the Licensor for the purpose of discussing and
+  improving the Model, but excluding communication that is
+  conspicuously marked or otherwise designated in writing by the
+  copyright owner as "Not a Contribution."
+
+- "Contributor" means Licensor and any individual or Legal Entity on
+  behalf of whom a Contribution has been received by Licensor and
+  subsequently incorporated within the Model.
+
+Section II: INTELLECTUAL PROPERTY RIGHTS
+
+Both copyright and patent grants apply to the Model, Derivatives of
+the Model and Complementary Material. The Model and Derivatives of the
+Model are subject to additional terms as described in Section III.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+this License, each Contributor hereby grants to You a perpetual,
+worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+copyright license to reproduce, prepare, publicly display, publicly
+perform, sublicense, and distribute the Complementary Material, the
+Model, and Derivatives of the Model.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+this License and where and as applicable, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this paragraph) patent
+license to make, have made, use, offer to sell, sell, import, and
+otherwise transfer the Model and the Complementary Material, where
+such license applies only to those patent claims licensable by such
+Contributor that are necessarily infringed by their Contribution(s)
+alone or by combination of their Contribution(s) with the Model to
+which such Contribution(s) was submitted. If You institute patent
+litigation against any entity (including a cross-claim or counterclaim
+in a lawsuit) alleging that the Model and/or Complementary Material or
+a Contribution incorporated within the Model and/or Complementary
+Material constitutes direct or contributory patent infringement, then
+any patent licenses granted to You under this License for the Model
+and/or Work shall terminate as of the date such litigation is asserted
+or filed.
+
+Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
+
+4. Distribution and Redistribution. You may host for Third Party
+remote access purposes (e.g. software-as-a-service), reproduce and
+distribute copies of the Model or Derivatives of the Model thereof in
+any medium, with or without modifications, provided that You meet the
+following conditions: Use-based restrictions as referenced in
+paragraph 5 MUST be included as an enforceable provision by You in any
+type of legal agreement (e.g. a license) governing the use and/or
+distribution of the Model or Derivatives of the Model, and You shall
+give notice to subsequent users You Distribute to, that the Model or
+Derivatives of the Model are subject to paragraph 5. This provision
+does not apply to the use of Complementary Material.  You must give
+any Third Party recipients of the Model or Derivatives of the Model a
+copy of this License; You must cause any modified files to carry
+prominent notices stating that You changed the files; You must retain
+all copyright, patent, trademark, and attribution notices excluding
+those notices that do not pertain to any part of the Model,
+Derivatives of the Model.  You may add Your own copyright statement to
+Your modifications and may provide additional or different license
+terms and conditions - respecting paragraph 4.a. - for use,
+reproduction, or Distribution of Your modifications, or for any such
+Derivatives of the Model as a whole, provided Your use, reproduction,
+and Distribution of the Model otherwise complies with the conditions
+stated in this License.
+
+5. Use-based restrictions. The restrictions set forth in Attachment A
+are considered Use-based restrictions. Therefore You cannot use the
+Model and the Derivatives of the Model for the specified restricted
+uses. You may use the Model subject to this License, including only
+for lawful purposes and in accordance with the License. Use may
+include creating any content with, finetuning, updating, running,
+training, evaluating and/or reparametrizing the Model. You shall
+require all of Your users who use the Model or a Derivative of the
+Model to comply with the terms of this paragraph (paragraph 5).
+
+6. The Output You Generate. Except as set forth herein, Licensor
+claims no rights in the Output You generate using the Model. You are
+accountable for the Output you generate and its subsequent uses. No
+use of the output can contravene any provision as stated in the
+License.
+
+Section IV: OTHER PROVISIONS
+
+7. Updates and Runtime Restrictions. To the maximum extent permitted
+by law, Licensor reserves the right to restrict (remotely or
+otherwise) usage of the Model in violation of this License, update the
+Model through electronic means, or modify the Output of the Model
+based on updates. You shall undertake reasonable efforts to use the
+latest version of the Model.
+
+8. Trademarks and related. Nothing in this License permits You to make
+use of Licensors’ trademarks, trade names, logos or to otherwise
+suggest endorsement or misrepresent the relationship between the
+parties; and any rights not expressly granted herein are reserved by
+the Licensors.
+
+9. Disclaimer of Warranty. Unless required by applicable law or agreed
+to in writing, Licensor provides the Model and the Complementary
+Material (and each Contributor provides its Contributions) on an "AS
+IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+express or implied, including, without limitation, any warranties or
+conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR
+A PARTICULAR PURPOSE. You are solely responsible for determining the
+appropriateness of using or redistributing the Model, Derivatives of
+the Model, and the Complementary Material and assume any risks
+associated with Your exercise of permissions under this License.
+
+10. Limitation of Liability. In no event and under no legal theory,
+whether in tort (including negligence), contract, or otherwise, unless
+required by applicable law (such as deliberate and grossly negligent
+acts) or agreed to in writing, shall any Contributor be liable to You
+for damages, including any direct, indirect, special, incidental, or
+consequential damages of any character arising as a result of this
+License or out of the use or inability to use the Model and the
+Complementary Material (including but not limited to damages for loss
+of goodwill, work stoppage, computer failure or malfunction, or any
+and all other commercial damages or losses), even if such Contributor
+has been advised of the possibility of such damages.
+
+11. Accepting Warranty or Additional Liability. While redistributing
+the Model, Derivatives of the Model and the Complementary Material
+thereof, You may choose to offer, and charge a fee for, acceptance of
+support, warranty, indemnity, or other liability obligations and/or
+rights consistent with this License. However, in accepting such
+obligations, You may act only on Your own behalf and on Your sole
+responsibility, not on behalf of any other Contributor, and only if
+You agree to indemnify, defend, and hold each Contributor harmless for
+any liability incurred by, or claims asserted against, such
+Contributor by reason of your accepting any such warranty or
+additional liability.
+
+12. If any provision of this License is held to be invalid, illegal or
+unenforceable, the remaining provisions shall be unaffected thereby
+and remain valid as if such provision had not been set forth herein.
+
+END OF TERMS AND CONDITIONS
+
+
+
+
+Attachment A
+
+Use Restrictions
+
+You agree not to use the Model or Derivatives of the Model:
+
+- In any way that violates any applicable national, federal, state,
+  local or international law or regulation;
+
+- For the purpose of exploiting, harming or attempting to exploit or
+  harm minors in any way;
+
+- To generate or disseminate verifiably false information and/or
+  content with the purpose of harming others;
+
+- To generate or disseminate personal identifiable information that
+  can be used to harm an individual;
+
+- To defame, disparage or otherwise harass others;
+
+- For fully automated decision making that adversely impacts an
+  individual’s legal rights or otherwise creates or modifies a
+  binding, enforceable obligation;
+
+- For any use intended to or which has the effect of discriminating
+  against or harming individuals or groups based on online or offline
+  social behavior or known or predicted personal or personality
+  characteristics;
+
+- To exploit any of the vulnerabilities of a specific group of persons
+  based on their age, social, physical or mental characteristics, in
+  order to materially distort the behavior of a person pertaining to
+  that group in a manner that causes or is likely to cause that person
+  or another person physical or psychological harm;
+
+- For any use intended to or which has the effect of discriminating
+  against individuals or groups based on legally protected
+  characteristics or categories;
+
+- To provide medical advice and medical results interpretation;
+
+- To generate or disseminate information for the purpose to be used
+  for administration of justice, law enforcement, immigration or
+  asylum processes, such as predicting an individual will commit
+  fraud/crime commitment (e.g. by text profiling, drawing causal
+  relationships between assertions made in documents, indiscriminate
+  and arbitrarily-targeted use).
--- a/README-CompViz.md
+++ b/README-CompViz.md
@@ -0,0 +1,210 @@
+# Original README from CompVis/stable-diffusion
+*Stable Diffusion was made possible thanks to a collaboration with [Stability AI](https://stability.ai/) and [Runway](https://runwayml.com/) and builds upon our previous work:*
+
+[**High-Resolution Image Synthesis with Latent Diffusion Models**](https://ommer-lab.com/research/latent-diffusion-models/)<br/>
+[Robin Rombach](https://github.com/rromb)\*,
+[Andreas Blattmann](https://github.com/ablattmann)\*,
+[Dominik Lorenz](https://github.com/qp-qp)\,
+[Patrick Esser](https://github.com/pesser),
+[Björn Ommer](https://hci.iwr.uni-heidelberg.de/Staff/bommer)<br/>
+
+**CVPR '22 Oral**
+
+which is available on [GitHub](https://github.com/CompVis/latent-diffusion). PDF at [arXiv](https://arxiv.org/abs/2112.10752). Please also visit our [Project page](https://ommer-lab.com/research/latent-diffusion-models/).
+
+![txt2img-stable2](assets/stable-samples/txt2img/merged-0006.png)
+[Stable Diffusion](#stable-diffusion-v1) is a latent text-to-image diffusion
+model.
+Thanks to a generous compute donation from [Stability AI](https://stability.ai/) and support from [LAION](https://laion.ai/), we were able to train a Latent Diffusion Model on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database. 
+Similar to Google's [Imagen](https://arxiv.org/abs/2205.11487), 
+this model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts.
+With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 10GB VRAM.
+See [this section](#stable-diffusion-v1) below and the [model card](https://huggingface.co/CompVis/stable-diffusion).
+
+  
+## Requirements
+
+A suitable [conda](https://conda.io/) environment named `ldm` can be created
+and activated with:
+
+```
+conda env create -f environment.yaml
+conda activate ldm
+```
+
+You can also update an existing [latent diffusion](https://github.com/CompVis/latent-diffusion) environment by running
+
+```
+conda install pytorch torchvision -c pytorch
+pip install transformers==4.19.2
+pip install -e .
+```
+
+## Stable Diffusion v1
+
+Stable Diffusion v1 refers to a specific configuration of the model
+architecture that uses a downsampling-factor 8 autoencoder with an 860M UNet
+and CLIP ViT-L/14 text encoder for the diffusion model. The model was pretrained on 256x256 images and 
+then finetuned on 512x512 images.
+
+*Note: Stable Diffusion v1 is a general text-to-image diffusion model and therefore mirrors biases and (mis-)conceptions that are present
+in its training data. 
+Details on the training procedure and data, as well as the intended use of the model can be found in the corresponding [model card](https://huggingface.co/CompVis/stable-diffusion).
+Research into the safe deployment of general text-to-image models is an ongoing effort. To prevent misuse and harm, we currently provide access to the checkpoints only for [academic research purposes upon request](https://stability.ai/academia-access-form).
+**This is an experiment in safe and community-driven publication of a capable and general text-to-image model. We are working on a public release with a more permissive license that also incorporates ethical considerations.***
+
+[Request access to Stable Diffusion v1 checkpoints for academic research](https://stability.ai/academia-access-form) 
+
+### Weights
+
+We currently provide three checkpoints, `sd-v1-1.ckpt`, `sd-v1-2.ckpt` and `sd-v1-3.ckpt`,
+which were trained as follows,
+
+- `sd-v1-1.ckpt`: 237k steps at resolution `256x256` on [laion2B-en](https://huggingface.co/datasets/laion/laion2B-en).
+  194k steps at resolution `512x512` on [laion-high-resolution](https://huggingface.co/datasets/laion/laion-high-resolution) (170M examples from LAION-5B with resolution `>= 1024x1024`).
+- `sd-v1-2.ckpt`: Resumed from `sd-v1-1.ckpt`.
+  515k steps at resolution `512x512` on "laion-improved-aesthetics" (a subset of laion2B-en,
+filtered to images with an original size `>= 512x512`, estimated aesthetics score `> 5.0`, and an estimated watermark probability `< 0.5`. The watermark estimate is from the LAION-5B metadata, the aesthetics score is estimated using an [improved aesthetics estimator](https://github.com/christophschuhmann/improved-aesthetic-predictor)).
+- `sd-v1-3.ckpt`: Resumed from `sd-v1-2.ckpt`. 195k steps at resolution `512x512` on "laion-improved-aesthetics" and 10\% dropping of the text-conditioning to improve [classifier-free guidance sampling](https://arxiv.org/abs/2207.12598).
+
+Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
+5.0, 6.0, 7.0, 8.0) and 50 PLMS sampling
+steps show the relative improvements of the checkpoints:
+![sd evaluation results](assets/v1-variants-scores.jpg)
+
+
+
+### Text-to-Image with Stable Diffusion
+![txt2img-stable2](assets/stable-samples/txt2img/merged-0005.png)
+![txt2img-stable2](assets/stable-samples/txt2img/merged-0007.png)
+
+Stable Diffusion is a latent diffusion model conditioned on the (non-pooled) text embeddings of a CLIP ViT-L/14 text encoder.
+
+
+#### Sampling Script
+
+After [obtaining the weights](#weights), link them
+```
+mkdir -p models/ldm/stable-diffusion-v1/
+ln -s <path/to/model.ckpt> models/ldm/stable-diffusion-v1/model.ckpt 
+```
+and sample with
+```
+python scripts/txt2img.py --prompt "a photograph of an astronaut riding a horse" --plms 
+```
+
+By default, this uses a guidance scale of `--scale 7.5`, [Katherine Crowson's implementation](https://github.com/CompVis/latent-diffusion/pull/51) of the [PLMS](https://arxiv.org/abs/2202.09778) sampler, 
+and renders images of size 512x512 (which it was trained on) in 50 steps. All supported arguments are listed below (type `python scripts/txt2img.py --help`).
+
+```commandline
+usage: txt2img.py [-h] [--prompt [PROMPT]] [--outdir [OUTDIR]] [--skip_grid] [--skip_save] [--ddim_steps DDIM_STEPS] [--plms] [--laion400m] [--fixed_code] [--ddim_eta DDIM_ETA] [--n_iter N_ITER] [--H H] [--W W] [--C C] [--f F] [--n_samples N_SAMPLES] [--n_rows N_ROWS]
+                  [--scale SCALE] [--from-file FROM_FILE] [--config CONFIG] [--ckpt CKPT] [--seed SEED] [--precision {full,autocast}]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --prompt [PROMPT]     the prompt to render
+  --outdir [OUTDIR]     dir to write results to
+  --skip_grid           do not save a grid, only individual samples. Helpful when evaluating lots of samples
+  --skip_save           do not save individual samples. For speed measurements.
+  --ddim_steps DDIM_STEPS
+                        number of ddim sampling steps
+  --plms                use plms sampling
+  --laion400m           uses the LAION400M model
+  --fixed_code          if enabled, uses the same starting code across samples
+  --ddim_eta DDIM_ETA   ddim eta (eta=0.0 corresponds to deterministic sampling
+  --n_iter N_ITER       sample this often
+  --H H                 image height, in pixel space
+  --W W                 image width, in pixel space
+  --C C                 latent channels
+  --f F                 downsampling factor
+  --n_samples N_SAMPLES
+                        how many samples to produce for each given prompt. A.k.a. batch size
+                        (note that the seeds for each image in the batch will be unavailable)
+  --n_rows N_ROWS       rows in the grid (default: n_samples)
+  --scale SCALE         unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))
+  --from-file FROM_FILE
+                        if specified, load prompts from this file
+  --config CONFIG       path to config which constructs model
+  --ckpt CKPT           path to checkpoint of model
+  --seed SEED           the seed (for reproducible sampling)
+  --precision {full,autocast}
+                        evaluate at this precision
+
+```
+Note: The inference config for all v1 versions is designed to be used with EMA-only checkpoints. 
+For this reason `use_ema=False` is set in the configuration, otherwise the code will try to switch from
+non-EMA to EMA weights. If you want to examine the effect of EMA vs no EMA, we provide "full" checkpoints
+which contain both types of weights. For these, `use_ema=False` will load and use the non-EMA weights.
+
+
+#### Diffusers Integration
+
+Another way to download and sample Stable Diffusion is by using the [diffusers library](https://github.com/huggingface/diffusers/tree/main#new--stable-diffusion-is-now-fully-compatible-with-diffusers)
+```py
+# make sure you're logged in with `huggingface-cli login`
+from torch import autocast
+from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
+
+pipe = StableDiffusionPipeline.from_pretrained(
+	"CompVis/stable-diffusion-v1-3-diffusers", 
+	use_auth_token=True
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+with autocast("cuda"):
+    image = pipe(prompt)["sample"][0]  
+    
+image.save("astronaut_rides_horse.png")
+```
+
+
+
+### Image Modification with Stable Diffusion
+
+By using a diffusion-denoising mechanism as first proposed by [SDEdit](https://arxiv.org/abs/2108.01073), the model can be used for different 
+tasks such as text-guided image-to-image translation and upscaling. Similar to the txt2img sampling script, 
+we provide a script to perform image modification with Stable Diffusion.  
+
+The following describes an example where a rough sketch made in [Pinta](https://www.pinta-project.com/) is converted into a detailed artwork.
+```
+python scripts/img2img.py --prompt "A fantasy landscape, trending on artstation" --init-img <path-to-img.jpg> --strength 0.8
+```
+Here, strength is a value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. 
+Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input. See the following example.
+
+**Input**
+
+![sketch-in](assets/stable-samples/img2img/sketch-mountains-input.jpg)
+
+**Outputs**
+
+![out3](assets/stable-samples/img2img/mountains-3.png)
+![out2](assets/stable-samples/img2img/mountains-2.png)
+
+This procedure can, for example, also be used to upscale samples from the base model.
+
+
+## Comments 
+
+- Our codebase for the diffusion models builds heavily on [OpenAI's ADM codebase](https://github.com/openai/guided-diffusion)
+and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch). 
+Thanks for open-sourcing!
+
+- The implementation of the transformer encoder is from [x-transformers](https://github.com/lucidrains/x-transformers) by [lucidrains](https://github.com/lucidrains?tab=repositories). 
+
+
+## BibTeX
+
+```
+@misc{rombach2021highresolution,
+      title={High-Resolution Image Synthesis with Latent Diffusion Models}, 
+      author={Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer},
+      year={2021},
+      eprint={2112.10752},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+
+```
+
+
--- a/README-Mac-MPS.md
+++ b/README-Mac-MPS.md
@@ -0,0 +1,322 @@
+# macOS Instructions
+
+Requirements
+
+- macOS 12.3 Monterey or later
+- Python
+- Patience
+- Apple Silicon*
+
+*I haven't tested any of this on Intel Macs but I have read that one person got
+it to work, so Apple Silicon might not be required.
+
+Things have moved really fast and so these instructions change often and are
+often out-of-date. One of the problems is that there are so many different ways to
+run this.
+
+We are trying to build a testing setup so that when we make changes it doesn't
+always break.
+
+How to (this hasn't been 100% tested yet):
+
+First get the weights checkpoint download started - it's big:
+
+1. Sign up at https://huggingface.co
+2. Go to the [Stable Diffusion model page](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original)
+3. Accept the terms and click Access Repository.
+4. Download [sd-v1-4.ckpt (4.27 GB)](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/blob/main/sd-v1-4.ckpt) and note where you have saved it (probably the Downloads folder)
+
+While that is downloading, open Terminal and run the following commands one at a time.
+
+```
+# install brew (and Xcode command line tools):
+/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
+
+# install cmake, protobuf, and rust:
+brew install cmake protobuf rust
+
+# install miniconda (M1 arm64 version):
+curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o Miniconda3-latest-MacOSX-arm64.sh
+/bin/bash Miniconda3-latest-MacOSX-arm64.sh
+
+# clone the repo
+git clone https://github.com/lstein/stable-diffusion.git
+cd stable-diffusion
+
+#
+# wait until the checkpoint file has downloaded, then proceed
+#
+
+# create symlink to checkpoint
+mkdir -p models/ldm/stable-diffusion-v1/
+
+PATH_TO_CKPT="$HOME/Downloads"  # or wherever you saved sd-v1-4.ckpt
+
+ln -s "$PATH_TO_CKPT/sd-v1-4.ckpt" models/ldm/stable-diffusion-v1/model.ckpt
+
+# install packages
+PIP_EXISTS_ACTION=w CONDA_SUBDIR=osx-arm64 conda env create -f environment-mac.yaml
+conda activate ldm
+
+# only need to do this once
+python scripts/preload_models.py
+
+# run SD!
+python scripts/dream.py --full_precision  # half-precision requires autocast and won't work
+```
+
+The original scripts should work as well.
+
+```
+python scripts/orig_scripts/txt2img.py --prompt "a photograph of an astronaut riding a horse" --plms
+```
+
+Note, `export PIP_EXISTS_ACTION=w` is a precaution to fix `conda env create -f environment-mac.yaml`
+never finishing in some situations. It isn't required, but it won't hurt.
+
+After you follow all the instructions and run dream.py you might get several
+errors. Here are the errors I've seen and found solutions for.
+
+### Is it slow?
+
+Be sure to specify 1 sample and 1 iteration.
+
+	python ./scripts/orig_scripts/txt2img.py --prompt "ocean" --ddim_steps 5 --n_samples 1 --n_iter 1
+
+### Doesn't work anymore?
+
+PyTorch nightly includes support for MPS. Because of this, this setup is
+inherently unstable. One morning I woke up and it no longer worked no matter
+what I did until I switched to miniforge. However, I have another Mac that works
+just fine with Anaconda. If you can't get it to work, please search a little
+first because many of the errors will get posted and solved. If you can't find
+a solution please [create an issue](https://github.com/lstein/stable-diffusion/issues).
+
+One debugging step is to update to the latest version of PyTorch nightly.
+
+	conda install pytorch torchvision torchaudio -c pytorch-nightly
+
+If `conda env create -f environment-mac.yaml` takes forever run this.
+
+	git clean -f
+
+And run this.
+
+	conda clean --yes --all
+
+Or you could reset Anaconda.
+
+	conda update --force-reinstall -y -n base -c defaults conda
+
+### "No module named cv2", torch, 'ldm', 'transformers', 'taming', etc.
+
+There are several causes of these errors.
+
+First, did you remember to `conda activate ldm`? If your terminal prompt
+begins with "(ldm)" then you activated it. If it begins with "(base)"
+or something else you haven't.
+
+Second, you might've run `./scripts/preload_models.py` or `./scripts/dream.py`
+instead of `python ./scripts/preload_models.py` or `python ./scripts/dream.py`.
+The cause of this error is long so it's below.
+
+Third, if it says you're missing taming you need to rebuild your virtual
+environment.
+
+	conda env remove -n ldm
+	conda env create -f environment-mac.yaml
+
+Fourth, if you have activated the ldm virtual environment and tried rebuilding
+it, maybe the problem could be that I have something installed that
+you don't and you'll just need to manually install it. Make sure you
+activate the virtual environment so it installs there instead of
+globally.
+
+	conda activate ldm
+	pip install *name*
+
+You might also need to install Rust (I mention this again below).
+
+### How many snakes are living in your computer?
+
+Here's the reason why you have to specify which python to use.
+There are several versions of python on macOS and the computer is
+picking the wrong one. More specifically, preload_models.py and dream.py say to
+find the first `python3` in the path environment variable. You can see which one
+it is picking with `which python3`. These are the most likely paths you'll see.
+
+	% which python3
+	/usr/bin/python3
+
+The above path is part of the OS. However, that path is a stub that asks you if
+you want to install Xcode. If you have Xcode installed already,
+/usr/bin/python3 will execute /Library/Developer/CommandLineTools/usr/bin/python3 or
+/Applications/Xcode.app/Contents/Developer/usr/bin/python3 (depending on which
+Xcode you've selected with `xcode-select`).
+
+	% which python3
+	/opt/homebrew/bin/python3
+
+If you installed python3 with Homebrew and you've modified your path to search
+for Homebrew binaries before system ones, you'll see the above path.
+
+	% which python
+	/opt/anaconda3/bin/python
+
+If you drop the "3" you get an entirely different python. Note: starting in
+macOS 12.3, /usr/bin/python no longer exists (it was python 2 anyway).
+
+If you have Anaconda installed, this is what you'll see. There is a
+/opt/anaconda3/bin/python3 also.
+
+	(ldm) % which python
+	/Users/name/miniforge3/envs/ldm/bin/python
+
+This is what you'll see if you have miniforge and you've correctly activated
+the ldm environment. This is the goal.
+
+It's all a mess and you should know [how to modify the path environment variable](https://support.apple.com/guide/terminal/use-environment-variables-apd382cc5fa-4f58-4449-b20a-41c53c006f8f/mac)
+if you want to fix it. Here's a brief hint of all the ways you can modify it
+(don't really have the time to explain it all here).
+
+- ~/.zshrc
+- ~/.bash_profile
+- ~/.bashrc
+- /etc/paths.d
+- /etc/paths
+
+Which one you use will depend on what you have installed, although putting a file
+in /etc/paths.d is what I prefer to do.
+
+### Debugging?
+
+Tired of waiting for your renders to finish before you can see if it
+works? Reduce the steps! The image quality will be horrible but at least you'll
+get quick feedback.
+
+	python ./scripts/orig_scripts/txt2img.py --prompt "ocean" --ddim_steps 5 --n_samples 1 --n_iter 1
+
+### OSError: Can't load tokenizer for 'openai/clip-vit-large-patch14'...
+
+	python scripts/preload_models.py
+
+### "The operator [name] is not current implemented for the MPS device." (sic)
+
+Example error.
+
+```
+...
+NotImplementedError: The operator 'aten::_index_put_impl_' is not current implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on [https://github.com/pytorch/pytorch/issues/77764](https://github.com/pytorch/pytorch/issues/77764). As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.
+```
+
+The lstein branch includes this fix in [environment-mac.yaml](https://github.com/lstein/stable-diffusion/blob/main/environment-mac.yaml).
+
+### "Could not build wheels for tokenizers"
+
+I have not seen this error because I had Rust installed on my computer before I started playing with Stable Diffusion. The fix is to install Rust.
+
+	curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+
+### How come `--seed` doesn't work?
+
+First this:
+
+> Completely reproducible results are not guaranteed across PyTorch
+releases, individual commits, or different platforms. Furthermore,
+results may not be reproducible between CPU and GPU executions, even
+when using identical seeds.
+
+[PyTorch docs](https://pytorch.org/docs/stable/notes/randomness.html)
+
+Second, we might have a fix that at least gets a consistent seed sort of. We're
+still working on it.
+
+### libiomp5.dylib error?
+
+	OMP: Error #15: Initializing libiomp5.dylib, but found libomp.dylib already initialized.
+
+You are likely using an Intel package by mistake. Be sure to run conda with
+the environment variable `CONDA_SUBDIR=osx-arm64`, like so:
+
+`CONDA_SUBDIR=osx-arm64 conda install ...`
+
+This error happens with Anaconda on Macs when the Intel-only `mkl` is pulled in by
+a dependency. [nomkl](https://stackoverflow.com/questions/66224879/what-is-the-nomkl-python-package-used-for)
+is a metapackage designed to prevent this, by making it impossible to install
+`mkl`, but if your environment is already broken it may not work.
+
+Do *not* use `os.environ['KMP_DUPLICATE_LIB_OK']='True'` or equivalents as this
+masks the underlying issue of using Intel packages.
+
+### Not enough memory.
+
+This seems to be a common problem and is probably the underlying
+problem for a lot of symptoms (listed below). The fix is to lower your
+image size or to add `model.half()` right after the model is loaded. I
+should probably test it out. I've read that the reason this fixes
+problems is because it converts the model from 32-bit to 16-bit and
+that leaves more RAM for other things. I have no idea how that would
+affect the quality of the images though.
+
+See [this issue](https://github.com/CompVis/stable-diffusion/issues/71).
+
+### "Error: product of dimension sizes > 2**31'"
+
+This error happens with img2img, which I haven't played with too much
+yet. But I know it's because your image is too big or the resolution
+isn't a multiple of 32x32. Because the stable-diffusion model was
+trained on images that were 512 x 512, it's always best to use that
+output size (which is the default). However, if you're using that size
+and you get the above error, try 256 x 256 or 512 x 256 or something
+as the source image.
+
+BTW, 2**31-1 = [2,147,483,647](https://en.wikipedia.org/wiki/2,147,483,647#In_computing), which is also 32-bit signed [LONG_MAX](https://en.wikipedia.org/wiki/C_data_types) in C.
+
+### I just got Rickrolled! Do I have a virus?
+
+You don't have a virus. It's part of the project. Here's
+[Rick](https://github.com/lstein/stable-diffusion/blob/main/assets/rick.jpeg)
+and here's [the
+code](https://github.com/lstein/stable-diffusion/blob/69ae4b35e0a0f6ee1af8bb9a5d0016ccb27e36dc/scripts/txt2img.py#L79)
+that swaps him in. It's a NSFW filter, which, IMO, doesn't work very
+well (and we call this "computer vision", sheesh).
+
+Actually, this could be happening because there's not enough RAM. You could try the `model.half()` suggestion or specify smaller output images.
+
+### My images come out black
+
+We might have this fixed, we are still testing.
+
+There's a [similar issue](https://github.com/CompVis/stable-diffusion/issues/69)
+on CUDA GPUs where the images come out green. Maybe it's the same issue?
+Someone in that issue says to use "--precision full", but this fork
+actually disables that flag. I don't know why, someone else provided
+that code and I don't know what it does. Maybe the `model.half()`
+suggestion above would fix this issue too. I should probably test it.
+
+### "view size is not compatible with input tensor's size and stride"
+
+```
+  File "/opt/anaconda3/envs/ldm/lib/python3.10/site-packages/torch/nn/functional.py", line 2511, in layer_norm
+    return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
+RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
+```
+
+Update to the latest version of lstein/stable-diffusion. We were
+patching pytorch but we found a file in stable-diffusion that we could
+change instead. This is a 32-bit vs 16-bit problem.
+
+### The processor must support the Intel bla bla bla
+
+What? Intel? On an Apple Silicon?
+
+	Intel MKL FATAL ERROR: This system does not meet the minimum requirements for use of the Intel(R) Math Kernel Library.
+	The processor must support the Intel(R) Supplemental Streaming SIMD Extensions 3 (Intel(R) SSSE3) instructions.
+	The processor must support the Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) instructions.
+	The processor must support the Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.
+
+This is due to the Intel `mkl` package getting picked up when you try to install
+something that depends on it-- Rosetta can translate some Intel instructions but
+not the specialized ones here. To avoid this, make sure to use the environment
+variable `CONDA_SUBDIR=osx-arm64`, which restricts the Conda environment to only
+use ARM packages, and use `nomkl` as described above.
--- a/README.md
+++ b/README.md
--- a/Stable-Diffusion-local-Windows.ipynb
+++ b/Stable-Diffusion-local-Windows.ipynb
@@ -0,0 +1,259 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Easy-peasy Windows install"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that you will need NVIDIA drivers, Python 3.10, and Git installed\n",
+    "beforehand - simplified\n",
+    "[step-by-step instructions](https://github.com/lstein/stable-diffusion/wiki/Easy-peasy-Windows-install)\n",
+    "are available in the wiki (you'll only need steps 1, 2, & 3 )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run each cell in turn. In VSCode, either hit SHIFT-ENTER, or click on the little ▶️ to the left of the cell. In Jupyter/JupyterLab, you **must** hit SHIFT-ENTER"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install pew"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%cmd\n",
+    "git clone https://github.com/lstein/stable-diffusion.git"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%cd stable-diffusion"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile requirements.txt\n",
+    "albumentations==0.4.3\n",
+    "einops==0.3.0\n",
+    "huggingface-hub==0.8.1\n",
+    "imageio-ffmpeg==0.4.2\n",
+    "imageio==2.9.0\n",
+    "kornia==0.6.0\n",
+    "omegaconf==2.1.1\n",
+    "opencv-python==4.6.0.66\n",
+    "pillow==9.2.0\n",
+    "pudb==2019.2\n",
+    "pytorch-lightning==1.4.2\n",
+    "streamlit==1.12.0\n",
+    "# Regular \"taming-transformers\" doesn't seem to work\n",
+    "taming-transformers-rom1504==0.0.6\n",
+    "test-tube>=0.7.5\n",
+    "torch-fidelity==0.3.0\n",
+    "torchmetrics==0.6.0\n",
+    "torchvision==0.12.0\n",
+    "transformers==4.19.2\n",
+    "git+https://github.com/openai/CLIP.git@main#egg=clip\n",
+    "git+https://github.com/lstein/k-diffusion.git@master#egg=k-diffusion\n",
+    "# No CUDA in PyPi builds\n",
+    "torch@https://download.pytorch.org/whl/cu113/torch-1.11.0%2Bcu113-cp310-cp310-win_amd64.whl\n",
+    "# No MKL in PyPi builds (faster, more robust than OpenBLAS)\n",
+    "numpy@https://download.lfd.uci.edu/pythonlibs/archived/numpy-1.22.4+mkl-cp310-cp310-win_amd64.whl\n",
+    "-e .\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%cmd\n",
+    "pew new --python 3.10 -r requirements.txt --dont-activate ldm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Switch the notebook kernel to the new 'ldm' environment!\n",
+    "\n",
+    "## VSCode: restart VSCode and come back to this cell\n",
+    "\n",
+    "1. Ctrl+Shift+P\n",
+    "1. Type \"Select Interpreter\" and select \"Jupyter: Select Interpreter to Start Jupyter Server\"\n",
+    "1. VSCode will say that it needs to install packages. Click the \"Install\" button.\n",
+    "1. Once the install is finished, do 1 & 2 again\n",
+    "1. Pick 'ldm'\n",
+    "1. Run the following cell"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%cd stable-diffusion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## Jupyter/JupyterLab\n",
+    "\n",
+    "1. Run the cell below\n",
+    "1. Click on the toolbar where it says \"(ipykernel)\" ↗️. You should get a pop-up asking you to \"Select Kernel\". Pick 'ldm' from the drop-down.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### DO NOT RUN THE FOLLOWING CELL IF YOU ARE USING VSCODE!!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# DO NOT RUN THIS CELL IF YOU ARE USING VSCODE!!\n",
+    "%%cmd\n",
+    "pew workon ldm\n",
+    "pip3 install ipykernel\n",
+    "python -m ipykernel install --name=ldm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### When running the next cell, Jupyter/JupyterLab users might get a warning saying \"IProgress not found\". This can be ignored."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%run \"scripts/preload_models.py\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%cmd\n",
+    "mkdir \"models/ldm/stable-diffusion-v1\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Now copy the SD model you downloaded from Hugging Face into the above new directory, and (if necessary) rename it to 'model.ckpt'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Now go create some magic!\n",
+    "\n",
+    "VSCode\n",
+    "\n",
+    "- The actual input box for the 'dream' prompt will appear at the very top of the VSCode window. Type in your commands and hit 'ENTER'.\n",
+    "- To quit, hit the 'Interrupt' button in the toolbar up there ⬆️ a couple of times, then hit ENTER (you'll probably see a terrifying traceback from Python - just ignore it).\n",
+    "\n",
+    "Jupyter/JupyterLab\n",
+    "\n",
+    "- The input box for the 'dream' prompt will appear below. Type in your commands and hit 'ENTER'.\n",
+    "- To quit, hit the interrupt button (⏹️) in the toolbar up there ⬆️ a couple of times, then hit ENTER (you'll probably see a terrifying traceback from Python - just ignore it)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%run \"scripts/dream.py\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Once this seems to be working well, you can try opening a terminal\n",
+    "\n",
+    "- VSCode: type ('CTRL+`')\n",
+    "- Jupyter/JupyterLab: File|New Terminal\n",
+    "- Or jump out of the notebook entirely, and open Powershell/Command Prompt\n",
+    "\n",
+    "Now:\n",
+    "\n",
+    "1. `cd` to wherever the 'stable-diffusion' directory is\n",
+    "1. Run `pew workon ldm`\n",
+    "1. Run `winpty python scripts\\dream.py`"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.10.6 ('ldm')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "a05e4574567b7bc2c98f7f9aa579f9ea5b8739b54844ab610ac85881c4be2659"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/Stable_Diffusion_AI_Notebook.ipynb
+++ b/Stable_Diffusion_AI_Notebook.ipynb
@@ -0,0 +1,256 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Stable_Diffusion_AI_Notebook.ipynb",
+      "provenance": [],
+      "collapsed_sections": [],
+      "private_outputs": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU",
+    "gpuClass": "standard"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Stable Diffusion AI Notebook\n",
+        "\n",
+        "<img src=\"https://user-images.githubusercontent.com/60411196/186547976-d9de378a-9de8-4201-9c25-c057a9c59bad.jpeg\" alt=\"stable-diffusion-ai\" width=\"170px\"/> <br>\n",
+        "#### Instructions:\n",
+        "1. Execute each cell in order to mount a Dream bot and create images from text. <br>\n",
+        "2. Once cells 1-8 have run correctly, you'll open a terminal in cell #9, where you'll need to enter the `pipenv run scripts/dream.py` command to run Dream bot.<br> \n",
+        "3. After launching dream bot, you'll see: <br> `Dream > ` in terminal. <br> Insert a command, eg. `Dream > Astronaut floating in a distant galaxy`, or type `-h` for help.\n",
+        "4. After completion you'll see your generated images in the path `stable-diffusion/outputs/img-samples/`; you can also display them in cell #10.\n",
+        "5. To quit Dream bot use the `q` command. <br> \n",
+        "---\n",
+        "<font color=\"red\">Note:</font> It takes some time to load, but after installing all dependencies you can use the bot for as long as you want while the Colab instance is up. <br>\n",
+        "<font color=\"red\">Requirements:</font> For this notebook to work you need to have [Stable-Diffusion-v-1-4](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original) stored in your Google Drive, it will be needed in cell #6\n",
+        "##### For more details visit Github repository: [lstein/stable-diffusion](https://github.com/lstein/stable-diffusion)\n",
+        "---\n"
+      ],
+      "metadata": {
+        "id": "ycYWcsEKc6w7"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title 1. Check current GPU assigned\n",
+        "!nvidia-smi -L\n",
+        "!nvidia-smi"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "a2Z5Qu_o8VtQ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "vbI9ZsQHzjqF"
+      },
+      "outputs": [],
+      "source": [
+        "#@title 2. Download stable-diffusion Repository\n",
+        "from os.path import exists\n",
+        "\n",
+        "if exists(\"/content/stable-diffusion/\")==True:\n",
+        "  print(\"Already downloaded repo\")\n",
+        "else:\n",
+        "  !git clone --quiet https://github.com/lstein/stable-diffusion.git # Original repo\n",
+        "  %cd stable-diffusion/\n",
+        "  !git checkout --quiet tags/release-1.09\n",
+        "  "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title 3. Install Python 3.8 \n",
+        "%%capture --no-stderr\n",
+        "import gc\n",
+        "!apt-get -qq install python3.8\n",
+        "gc.collect()"
+      ],
+      "metadata": {
+        "id": "daHlozvwKesj",
+        "cellView": "form"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title 4. Install dependencies from file in a VirtualEnv\n",
+        "#@markdown Be patient, it takes ~ 5 - 7min <br>\n",
+        "%%capture --no-stderr\n",
+        "#Virtual environment\n",
+        "!pip install pipenv -q\n",
+        "!pip install colab-xterm\n",
+        "%load_ext colabxterm\n",
+        "!pipenv --python 3.8\n",
+        "!pipenv install -r requirements.txt --skip-lock\n",
+        "gc.collect()\n"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "QbXcGXYEFSNB"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title 5. Mount google Drive\n",
+        "from google.colab import drive\n",
+        "drive.mount('/content/drive')"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "YEWPV-sF1RDM"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title 6. Drive Path to model\n",
+        "#@markdown Path should start with /content/drive/path-to-your-file <br>\n",
+        "#@markdown <font color=\"red\">Note:</font> Model should be downloaded from https://huggingface.co <br>\n",
+        "#@markdown Latest release: [Stable-Diffusion-v-1-4](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original)\n",
+        "from os.path import exists\n",
+        "\n",
+        "model_path = \"\" #@param {type:\"string\"}\n",
+        "if exists(model_path)==True:\n",
+        "  print(\"✅ Valid directory\")\n",
+        "else: \n",
+        "  print(\"❌ File doesn't exist\")"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "zRTJeZ461WGu"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title 7. Symlink to model\n",
+        "\n",
+        "from os.path import exists\n",
+        "import os \n",
+        "\n",
+        "# Folder creation if it doesn't exist\n",
+        "if exists(\"/content/stable-diffusion/models/ldm/stable-diffusion-v1\")==True:\n",
+        "  print(\"❗ Dir stable-diffusion-v1 already exists\")\n",
+        "else:\n",
+        "  %mkdir /content/stable-diffusion/models/ldm/stable-diffusion-v1\n",
+        "  print(\"✅ Dir stable-diffusion-v1 created\")\n",
+        "\n",
+        "# Symbolic link if it doesn't exist\n",
+        "if exists(\"/content/stable-diffusion/models/ldm/stable-diffusion-v1/model.ckpt\")==True:\n",
+        "  print(\"❗ Symlink already created\")\n",
+        "else: \n",
+        "  src = model_path\n",
+        "  dst = '/content/stable-diffusion/models/ldm/stable-diffusion-v1/model.ckpt'\n",
+        "  os.symlink(src, dst) \n",
+        "  print(\"✅ Symbolic link created successfully\")"
+      ],
+      "metadata": {
+        "id": "UY-NNz4I8_aG",
+        "cellView": "form"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title 8. Load small ML models required\n",
+        "%%capture --no-stderr\n",
+        "!pipenv run scripts/preload_models.py\n",
+        "gc.collect()"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "ChIDWxLVHGGJ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title 9. Run Terminal and Execute Dream bot\n",
+        "#@markdown <font color=\"blue\">Steps:</font> <br>\n",
+        "#@markdown 1. Execute command `pipenv run scripts/dream.py` to run dream bot.<br>\n",
+        "#@markdown 2. After initialized you'll see `Dream>` line.<br>\n",
+        "#@markdown 3. Example text: `Astronaut floating in a distant galaxy` <br>\n",
+        "#@markdown 4. To quit Dream bot use: `q` command.<br>\n",
+        "\n",
+        "#Run from virtual env\n",
+        "\n",
+        "%xterm\n",
+        "gc.collect()"
+      ],
+      "metadata": {
+        "id": "ir4hCrMIuUpl",
+        "cellView": "form"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title 10. Show generated images\n",
+        "\n",
+        "import glob\n",
+        "import matplotlib.pyplot as plt\n",
+        "import matplotlib.image as mpimg\n",
+        "%matplotlib inline\n",
+        "\n",
+        "images = []\n",
+        "for img_path in glob.glob('/content/stable-diffusion/outputs/img-samples/*.png'):\n",
+        "    images.append(mpimg.imread(img_path))\n",
+        "\n",
+        "# Remove ticks and labels on x-axis and y-axis both\n",
+        "\n",
+        "plt.figure(figsize=(20,10))\n",
+        "\n",
+        "columns = 5\n",
+        "for i, image in enumerate(images):\n",
+        "    ax = plt.subplot(len(images) // columns + 1, columns, i + 1)  # floor division: subplot grid sizes must be integers\n",
+        "    ax.axes.xaxis.set_visible(False)\n",
+        "    ax.axes.yaxis.set_visible(False)\n",
+        "    ax.axis('off')\n",
+        "    plt.imshow(image)\n",
+        "    gc.collect()\n",
+        "\n"
+      ],
+      "metadata": {
+        "cellView": "form",
+        "id": "qnLohSHmKoGk"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
--- a/VARIATIONS.md
+++ b/VARIATIONS.md
@@ -0,0 +1,113 @@
+# Cheat Sheet for Generating Variations
+
+Release 1.13 of SD-Dream adds support for image variations. There are two things that you can do:
+
+1. Generate a series of systematic variations of an image, given a
+prompt. The amount of variation from one image to the next can be
+controlled.
+
+2. Given two or more variations that you like, you can combine them in
+a weighted fashion.
+
+This cheat sheet provides a quick guide for how this works in
+practice, using variations to create the desired image of Xena,
+Warrior Princess.
+
+## Step 1 -- find a base image that you like
+
+The prompt we will use throughout is "lucy lawless as xena, warrior
+princess, character portrait, high resolution." This will be indicated
+as "prompt" in the examples below.
+
+First we let SD create a series of images in the usual way, in this case
+requesting six iterations:
+
+~~~
+dream> lucy lawless as xena, warrior princess, character portrait, high resolution -n6
+...
+Outputs:
+./outputs/Xena/000001.1579445059.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -S1579445059
+./outputs/Xena/000001.1880768722.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -S1880768722
+./outputs/Xena/000001.332057179.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -S332057179
+./outputs/Xena/000001.2224800325.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -S2224800325
+./outputs/Xena/000001.465250761.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -S465250761
+./outputs/Xena/000001.3357757885.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -S3357757885
+~~~
+
+The one with seed 3357757885 looks nice:
+
+<img src="static/variation_walkthru/000001.3357757885.png"/>
+
+Let's try to generate some variations. Using the same seed, we pass
+the argument -v0.2 (or --variant_amount), which generates a series of
+variations each differing by a variation amount of 0.2. This number
+ranges from 0 to 1.0, with higher numbers being larger amounts of
+variation.
+
+~~~
+dream> "prompt" -n6 -S3357757885 -v0.2
+...
+Outputs:
+./outputs/Xena/000002.784039624.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 784039624:0.2 -S3357757885
+./outputs/Xena/000002.3647897225.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.2 -S3357757885
+./outputs/Xena/000002.917731034.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 917731034:0.2 -S3357757885
+./outputs/Xena/000002.4116285959.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 4116285959:0.2 -S3357757885
+./outputs/Xena/000002.1614299449.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 1614299449:0.2 -S3357757885
+./outputs/Xena/000002.1335553075.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 1335553075:0.2 -S3357757885
+~~~
+
+Note that the output for each image has a -V option giving the
+"variant subseed" for that image, consisting of a seed followed by the
+variation amount used to generate it.
+
+This gives us a series of closely-related variations, including the
+two shown here.
+
+<img src="static/variation_walkthru/000002.3647897225.png">
+<img src="static/variation_walkthru/000002.1614299449.png">
+
+
+I like the expression on Xena's face in the first one (subseed
+3647897225), and the armor on her shoulder in the second one (subseed
+1614299449). Can we combine them to get the best of both worlds?
+
+We combine the two variations using -V (--with_variations). Again, we
+must provide the seed for the originally-chosen image in order for
+this to work.
+
+~~~
+dream> "prompt" -S3357757885 -V3647897225:0.1,1614299449:0.1
+Outputs:
+./outputs/Xena/000003.1614299449.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1 -S3357757885
+~~~
+
+Here we are providing equal weights (0.1 and 0.1) for both the
+subseeds. The resulting image is close, but not exactly what I
+wanted:
+
+<img src="static/variation_walkthru/000003.1614299449.png">
+
+We could either try combining the images with different weights, or we
+can generate more variations around the almost-but-not-quite image. We
+do the latter, using both the -V (combining) and -v (variation
+strength) options. Note that we use -n6 to generate 6 variations:
+
+~~~~
+dream> "prompt" -S3357757885 -V3647897225:0.1,1614299449:0.1 -v0.05 -n6
+Outputs:
+./outputs/Xena/000004.3279757577.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,3279757577:0.05 -S3357757885
+./outputs/Xena/000004.2853129515.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,2853129515:0.05 -S3357757885
+./outputs/Xena/000004.3747154981.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,3747154981:0.05 -S3357757885
+./outputs/Xena/000004.2664260391.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,2664260391:0.05 -S3357757885
+./outputs/Xena/000004.1642517170.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,1642517170:0.05 -S3357757885
+./outputs/Xena/000004.2183375608.png: "prompt" -s50 -W512 -H512 -C7.5 -Ak_lms -V 3647897225:0.1,1614299449:0.1,2183375608:0.05 -S3357757885
+~~~~
+
+This produces six images, all slight variations on the combination of
+the chosen two images. Here's the one I like best:
+
+<img src="static/variation_walkthru/000004.3747154981.png">
+
+As you can see, this is a very powerful tool, which when combined with
+subprompt weighting, gives you great control over the content and
+quality of your generated images.
--- a/configs/models.yaml
+++ b/configs/models.yaml
@@ -0,0 +1,18 @@
+# This file describes the alternative machine learning models
+#  available to the dream script.
+#
+# To add a new model, follow the examples below. Each
+# model requires a model config file, a weights file,
+# and the width and height of the images it
+# was trained on.
+
+laion400m:
+    config:  configs/latent-diffusion/txt2img-1p4B-eval.yaml
+    weights: models/ldm/text2img-large/model.ckpt
+    width: 256
+    height: 256
+stable-diffusion-1.4:
+    config:  configs/stable-diffusion/v1-inference.yaml
+    weights: models/ldm/stable-diffusion-v1/model.ckpt
+    width: 512
+    height: 512
--- a/configs/stable-diffusion/v1-finetune.yaml
+++ b/configs/stable-diffusion/v1-finetune.yaml
@@ -0,0 +1,109 @@
+model:
+  base_learning_rate: 5.0e-03
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    cond_stage_key: caption
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: true   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+    embedding_reg_weight: 0.0
+
+    personalization_config:
+      target: ldm.modules.embedding_manager.EmbeddingManager
+      params:
+        placeholder_strings: ["*"]
+        initializer_words: ["sculpture"]
+        per_image_tokens: false
+        num_vectors_per_token: 1
+        progressive_words: False
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 1
+    num_workers: 2
+    wrap: false
+    train:
+      target: ldm.data.personalized.PersonalizedBase
+      params:
+        size: 512
+        set: train
+        per_image_tokens: false
+        repeats: 100
+    validation:
+      target: ldm.data.personalized.PersonalizedBase
+      params:
+        size: 512
+        set: val
+        per_image_tokens: false
+        repeats: 10
+
+lightning:
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 500
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 500
+        max_images: 8
+        increase_log_steps: False
+
+  trainer:
+    benchmark: True
+    max_steps: 4000
+    
--- a/configs/stable-diffusion/v1-finetune_style.yaml
+++ b/configs/stable-diffusion/v1-finetune_style.yaml
@@ -0,0 +1,103 @@
+model:
+  base_learning_rate: 5.0e-03
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: image
+    cond_stage_key: caption
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: true   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+    embedding_reg_weight: 0.0
+
+    personalization_config:
+      target: ldm.modules.embedding_manager.EmbeddingManager
+      params:
+        placeholder_strings: ["*"]
+        initializer_words: ["painting"]
+        per_image_tokens: false
+        num_vectors_per_token: 1
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 2
+    num_workers: 16
+    wrap: false
+    train:
+      target: ldm.data.personalized_style.PersonalizedBase
+      params:
+        size: 512
+        set: train
+        per_image_tokens: false
+        repeats: 100
+    validation:
+      target: ldm.data.personalized_style.PersonalizedBase
+      params:
+        size: 512
+        set: val
+        per_image_tokens: false
+        repeats: 10
+
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 500
+        max_images: 8
+        increase_log_steps: False
+
+  trainer:
+    benchmark: True
--- a/configs/stable-diffusion/v1-inference.yaml
+++ b/configs/stable-diffusion/v1-inference.yaml
@@ -26,6 +26,15 @@ model:
        f_max: [ 1. ]
        f_min: [ 1. ]

+    personalization_config:
+      target: ldm.modules.embedding_manager.EmbeddingManager
+      params:
+        placeholder_strings: ["*"]
+        initializer_words: ["sculpture"]
+        per_image_tokens: false
+        num_vectors_per_token: 1
+        progressive_words: False
+
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
--- a/environment-mac.yaml
+++ b/environment-mac.yaml
@@ -0,0 +1,58 @@
+name: ldm
+channels:
+  - pytorch-nightly
+  - conda-forge
+dependencies:
+  - python==3.9.13
+  - pip==22.2.2
+ 
+  # pytorch-nightly, left unpinned
+  - pytorch
+  - torchmetrics
+  - torchvision
+
+  # I suggest to keep the other deps sorted for convenience.
+  # If you wish to upgrade to 3.10, try to run this:
+  #
+  # ```shell
+  # CONDA_CMD=conda
+  # sed -E 's/python==3.9.13/python==3.10.5/;s/ldm/ldm-3.10/;21,99s/- ([^=]+)==.+/- \1/' environment-mac.yaml > /tmp/environment-mac-updated.yml
+  # CONDA_SUBDIR=osx-arm64 $CONDA_CMD env create -f /tmp/environment-mac-updated.yml && $CONDA_CMD list -n ldm-3.10 | awk ' {print "  - " $1 "==" $2;} '
+  # ```
+  #
+  # Unfortunately, as of 2022-08-31, this fails at the pip stage.
+  - albumentations==1.2.1
+  - coloredlogs==15.0.1
+  - einops==0.4.1
+  - grpcio==1.46.4
+  - humanfriendly
+  - imageio-ffmpeg==0.4.7
+  - imageio==2.21.2
+  - imgaug==0.4.0
+  - kornia==0.6.7
+  - mpmath==1.2.1
+  - nomkl
+  - numpy==1.23.2
+  - omegaconf==2.1.1
+  - onnx==1.12.0
+  - onnxruntime==1.12.1
+  - opencv==4.6.0
+  - pudb==2022.1
+  - pytorch-lightning==1.6.5
+  - scipy==1.9.1
+  - streamlit==1.12.2
+  - sympy==1.10.1
+  - tensorboard==2.9.0
+  - transformers==4.21.2
+  - pip:
+    - invisible-watermark
+    - test-tube
+    - tokenizers
+    - torch-fidelity
+    - -e git+https://github.com/huggingface/diffusers.git@v0.2.4#egg=diffusers
+    - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
+    - -e git+https://github.com/openai/CLIP.git@main#egg=clip
+    - -e git+https://github.com/Birch-san/k-diffusion.git@mps#egg=k_diffusion
+    - -e .
+variables:
+  PYTORCH_ENABLE_MPS_FALLBACK: 1
--- a/environment.yaml
+++ b/environment.yaml
@@ -18,12 +18,14 @@ dependencies:
    - pytorch-lightning==1.4.2
    - omegaconf==2.1.1
    - test-tube>=0.7.5
-    - streamlit>=0.73.1
+    - streamlit==1.12.0
+    - pillow==9.2.0
    - einops==0.3.0
    - torch-fidelity==0.3.0
    - transformers==4.19.2
    - torchmetrics==0.6.0
-    - kornia==0.6
-    - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
+    - kornia==0.6.0
    - -e git+https://github.com/openai/CLIP.git@main#egg=clip
+    - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
+    - -e git+https://github.com/lstein/k-diffusion.git@master#egg=k-diffusion
    - -e .
--- a/ldm/data/base.py
+++ b/ldm/data/base.py
@@ -1,11 +1,17 @@
 from abc import abstractmethod
-from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset
+from torch.utils.data import (
+    Dataset,
+    ConcatDataset,
+    ChainDataset,
+    IterableDataset,
+)


 class Txt2ImgIterableBaseDataset(IterableDataset):
-    '''
+    """
    Define an interface to make the IterableDatasets for text2img data chainable
-    '''
+    """
+
    def __init__(self, num_records=0, valid_ids=None, size=256):
        super().__init__()
        self.num_records = num_records
@@ -13,11 +19,13 @@ class Txt2ImgIterableBaseDataset(IterableDataset):
        self.sample_ids = valid_ids
        self.size = size

-        print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.')
+        print(
+            f'{self.__class__.__name__} dataset contains {self.__len__()} examples.'
+        )

    def __len__(self):
        return self.num_records

    @abstractmethod
    def __iter__(self):
-        pass
+        pass
--- a/ldm/data/imagenet.py
+++ b/ldm/data/imagenet.py
@@ -11,24 +11,34 @@ from tqdm import tqdm
 from torch.utils.data import Dataset, Subset

 import taming.data.utils as tdu
-from taming.data.imagenet import str_to_indices, give_synsets_from_indices, download, retrieve
+from taming.data.imagenet import (
+    str_to_indices,
+    give_synsets_from_indices,
+    download,
+    retrieve,
+)
 from taming.data.imagenet import ImagePaths

-from ldm.modules.image_degradation import degradation_fn_bsr, degradation_fn_bsr_light
+from ldm.modules.image_degradation import (
+    degradation_fn_bsr,
+    degradation_fn_bsr_light,
+)


-def synset2idx(path_to_yaml="data/index_synset.yaml"):
+def synset2idx(path_to_yaml='data/index_synset.yaml'):
    with open(path_to_yaml) as f:
        di2s = yaml.load(f)
-    return dict((v,k) for k,v in di2s.items())
+    return dict((v, k) for k, v in di2s.items())


 class ImageNetBase(Dataset):
    def __init__(self, config=None):
        self.config = config or OmegaConf.create()
-        if not type(self.config)==dict:
+        if not type(self.config) == dict:
            self.config = OmegaConf.to_container(self.config)
-        self.keep_orig_class_label = self.config.get("keep_orig_class_label", False)
+        self.keep_orig_class_label = self.config.get(
+            'keep_orig_class_label', False
+        )
        self.process_images = True  # if False we skip loading & processing images and self.data contains filepaths
        self._prepare()
        self._prepare_synset_to_human()
@@ -46,17 +56,23 @@ class ImageNetBase(Dataset):
        raise NotImplementedError()

    def _filter_relpaths(self, relpaths):
-        ignore = set([
-            "n06596364_9591.JPEG",
-        ])
-        relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore]
-        if "sub_indices" in self.config:
-            indices = str_to_indices(self.config["sub_indices"])
-            synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn)  # returns a list of strings
+        ignore = set(
+            [
+                'n06596364_9591.JPEG',
+            ]
+        )
+        relpaths = [
+            rpath for rpath in relpaths if not rpath.split('/')[-1] in ignore
+        ]
+        if 'sub_indices' in self.config:
+            indices = str_to_indices(self.config['sub_indices'])
+            synsets = give_synsets_from_indices(
+                indices, path_to_yaml=self.idx2syn
+            )  # returns a list of strings
            self.synset2idx = synset2idx(path_to_yaml=self.idx2syn)
            files = []
            for rpath in relpaths:
-                syn = rpath.split("/")[0]
+                syn = rpath.split('/')[0]
                if syn in synsets:
                    files.append(rpath)
            return files
@@ -65,78 +81,89 @@ class ImageNetBase(Dataset):

    def _prepare_synset_to_human(self):
        SIZE = 2655750
-        URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1"
-        self.human_dict = os.path.join(self.root, "synset_human.txt")
-        if (not os.path.exists(self.human_dict) or
-                not os.path.getsize(self.human_dict)==SIZE):
+        URL = 'https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1'
+        self.human_dict = os.path.join(self.root, 'synset_human.txt')
+        if (
+            not os.path.exists(self.human_dict)
+            or not os.path.getsize(self.human_dict) == SIZE
+        ):
            download(URL, self.human_dict)

    def _prepare_idx_to_synset(self):
-        URL = "https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1"
-        self.idx2syn = os.path.join(self.root, "index_synset.yaml")
-        if (not os.path.exists(self.idx2syn)):
+        URL = 'https://heibox.uni-heidelberg.de/f/d835d5b6ceda4d3aa910/?dl=1'
+        self.idx2syn = os.path.join(self.root, 'index_synset.yaml')
+        if not os.path.exists(self.idx2syn):
            download(URL, self.idx2syn)

    def _prepare_human_to_integer_label(self):
-        URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1"
-        self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt")
-        if (not os.path.exists(self.human2integer)):
+        URL = 'https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1'
+        self.human2integer = os.path.join(
+            self.root, 'imagenet1000_clsidx_to_labels.txt'
+        )
+        if not os.path.exists(self.human2integer):
            download(URL, self.human2integer)
-        with open(self.human2integer, "r") as f:
+        with open(self.human2integer, 'r') as f:
            lines = f.read().splitlines()
            assert len(lines) == 1000
            self.human2integer_dict = dict()
            for line in lines:
-                value, key = line.split(":")
+                value, key = line.split(':')
                self.human2integer_dict[key] = int(value)

    def _load(self):
-        with open(self.txt_filelist, "r") as f:
+        with open(self.txt_filelist, 'r') as f:
            self.relpaths = f.read().splitlines()
            l1 = len(self.relpaths)
            self.relpaths = self._filter_relpaths(self.relpaths)
-            print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths)))
+            print(
+                'Removed {} files from filelist during filtering.'.format(
+                    l1 - len(self.relpaths)
+                )
+            )

-        self.synsets = [p.split("/")[0] for p in self.relpaths]
+        self.synsets = [p.split('/')[0] for p in self.relpaths]
        self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths]

        unique_synsets = np.unique(self.synsets)
-        class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets))
+        class_dict = dict(
+            (synset, i) for i, synset in enumerate(unique_synsets)
+        )
        if not self.keep_orig_class_label:
            self.class_labels = [class_dict[s] for s in self.synsets]
        else:
            self.class_labels = [self.synset2idx[s] for s in self.synsets]

-        with open(self.human_dict, "r") as f:
+        with open(self.human_dict, 'r') as f:
            human_dict = f.read().splitlines()
            human_dict = dict(line.split(maxsplit=1) for line in human_dict)

        self.human_labels = [human_dict[s] for s in self.synsets]

        labels = {
-            "relpath": np.array(self.relpaths),
-            "synsets": np.array(self.synsets),
-            "class_label": np.array(self.class_labels),
-            "human_label": np.array(self.human_labels),
+            'relpath': np.array(self.relpaths),
+            'synsets': np.array(self.synsets),
+            'class_label': np.array(self.class_labels),
+            'human_label': np.array(self.human_labels),
        }

        if self.process_images:
-            self.size = retrieve(self.config, "size", default=256)
-            self.data = ImagePaths(self.abspaths,
-                                   labels=labels,
-                                   size=self.size,
-                                   random_crop=self.random_crop,
-                                   )
+            self.size = retrieve(self.config, 'size', default=256)
+            self.data = ImagePaths(
+                self.abspaths,
+                labels=labels,
+                size=self.size,
+                random_crop=self.random_crop,
+            )
        else:
            self.data = self.abspaths


 class ImageNetTrain(ImageNetBase):
-    NAME = "ILSVRC2012_train"
-    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
-    AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2"
+    NAME = 'ILSVRC2012_train'
+    URL = 'http://www.image-net.org/challenges/LSVRC/2012/'
+    AT_HASH = 'a306397ccf9c2ead27155983c254227c0fd938e2'
    FILES = [
-        "ILSVRC2012_img_train.tar",
+        'ILSVRC2012_img_train.tar',
    ]
    SIZES = [
        147897477120,
@@ -151,57 +178,64 @@ class ImageNetTrain(ImageNetBase):
        if self.data_root:
            self.root = os.path.join(self.data_root, self.NAME)
        else:
-            cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
-            self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
+            cachedir = os.environ.get(
+                'XDG_CACHE_HOME', os.path.expanduser('~/.cache')
+            )
+            self.root = os.path.join(cachedir, 'autoencoders/data', self.NAME)

-        self.datadir = os.path.join(self.root, "data")
-        self.txt_filelist = os.path.join(self.root, "filelist.txt")
+        self.datadir = os.path.join(self.root, 'data')
+        self.txt_filelist = os.path.join(self.root, 'filelist.txt')
        self.expected_length = 1281167
-        self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop",
-                                    default=True)
+        self.random_crop = retrieve(
+            self.config, 'ImageNetTrain/random_crop', default=True
+        )
        if not tdu.is_prepared(self.root):
            # prep
-            print("Preparing dataset {} in {}".format(self.NAME, self.root))
+            print('Preparing dataset {} in {}'.format(self.NAME, self.root))

            datadir = self.datadir
            if not os.path.exists(datadir):
                path = os.path.join(self.root, self.FILES[0])
-                if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
+                if (
+                    not os.path.exists(path)
+                    or not os.path.getsize(path) == self.SIZES[0]
+                ):
                    import academictorrents as at
+
                    atpath = at.get(self.AT_HASH, datastore=self.root)
                    assert atpath == path

-                print("Extracting {} to {}".format(path, datadir))
+                print('Extracting {} to {}'.format(path, datadir))
                os.makedirs(datadir, exist_ok=True)
-                with tarfile.open(path, "r:") as tar:
+                with tarfile.open(path, 'r:') as tar:
                    tar.extractall(path=datadir)

-                print("Extracting sub-tars.")
-                subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar")))
+                print('Extracting sub-tars.')
+                subpaths = sorted(glob.glob(os.path.join(datadir, '*.tar')))
                for subpath in tqdm(subpaths):
-                    subdir = subpath[:-len(".tar")]
+                    subdir = subpath[: -len('.tar')]
                    os.makedirs(subdir, exist_ok=True)
-                    with tarfile.open(subpath, "r:") as tar:
+                    with tarfile.open(subpath, 'r:') as tar:
                        tar.extractall(path=subdir)

-            filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
+            filelist = glob.glob(os.path.join(datadir, '**', '*.JPEG'))
            filelist = [os.path.relpath(p, start=datadir) for p in filelist]
            filelist = sorted(filelist)
-            filelist = "\n".join(filelist)+"\n"
-            with open(self.txt_filelist, "w") as f:
+            filelist = '\n'.join(filelist) + '\n'
+            with open(self.txt_filelist, 'w') as f:
                f.write(filelist)

            tdu.mark_prepared(self.root)


 class ImageNetValidation(ImageNetBase):
-    NAME = "ILSVRC2012_validation"
-    URL = "http://www.image-net.org/challenges/LSVRC/2012/"
-    AT_HASH = "5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5"
-    VS_URL = "https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1"
+    NAME = 'ILSVRC2012_validation'
+    URL = 'http://www.image-net.org/challenges/LSVRC/2012/'
+    AT_HASH = '5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5'
+    VS_URL = 'https://heibox.uni-heidelberg.de/f/3e0f6e9c624e45f2bd73/?dl=1'
    FILES = [
-        "ILSVRC2012_img_val.tar",
-        "validation_synset.txt",
+        'ILSVRC2012_img_val.tar',
+        'validation_synset.txt',
    ]
    SIZES = [
        6744924160,
@@ -217,39 +251,49 @@ class ImageNetValidation(ImageNetBase):
        if self.data_root:
            self.root = os.path.join(self.data_root, self.NAME)
        else:
-            cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
-            self.root = os.path.join(cachedir, "autoencoders/data", self.NAME)
-        self.datadir = os.path.join(self.root, "data")
-        self.txt_filelist = os.path.join(self.root, "filelist.txt")
+            cachedir = os.environ.get(
+                'XDG_CACHE_HOME', os.path.expanduser('~/.cache')
+            )
+            self.root = os.path.join(cachedir, 'autoencoders/data', self.NAME)
+        self.datadir = os.path.join(self.root, 'data')
+        self.txt_filelist = os.path.join(self.root, 'filelist.txt')
        self.expected_length = 50000
-        self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop",
-                                    default=False)
+        self.random_crop = retrieve(
+            self.config, 'ImageNetValidation/random_crop', default=False
+        )
        if not tdu.is_prepared(self.root):
            # prep
-            print("Preparing dataset {} in {}".format(self.NAME, self.root))
+            print('Preparing dataset {} in {}'.format(self.NAME, self.root))

            datadir = self.datadir
            if not os.path.exists(datadir):
                path = os.path.join(self.root, self.FILES[0])
-                if not os.path.exists(path) or not os.path.getsize(path)==self.SIZES[0]:
+                if (
+                    not os.path.exists(path)
+                    or not os.path.getsize(path) == self.SIZES[0]
+                ):
                    import academictorrents as at
+
                    atpath = at.get(self.AT_HASH, datastore=self.root)
                    assert atpath == path

-                print("Extracting {} to {}".format(path, datadir))
+                print('Extracting {} to {}'.format(path, datadir))
                os.makedirs(datadir, exist_ok=True)
-                with tarfile.open(path, "r:") as tar:
+                with tarfile.open(path, 'r:') as tar:
                    tar.extractall(path=datadir)

                vspath = os.path.join(self.root, self.FILES[1])
-                if not os.path.exists(vspath) or not os.path.getsize(vspath)==self.SIZES[1]:
+                if (
+                    not os.path.exists(vspath)
+                    or not os.path.getsize(vspath) == self.SIZES[1]
+                ):
                    download(self.VS_URL, vspath)

-                with open(vspath, "r") as f:
+                with open(vspath, 'r') as f:
                    synset_dict = f.read().splitlines()
                    synset_dict = dict(line.split() for line in synset_dict)

-                print("Reorganizing into synset folders")
+                print('Reorganizing into synset folders')
                synsets = np.unique(list(synset_dict.values()))
                for s in synsets:
                    os.makedirs(os.path.join(datadir, s), exist_ok=True)
@@ -258,21 +302,26 @@ class ImageNetValidation(ImageNetBase):
                    dst = os.path.join(datadir, v)
                    shutil.move(src, dst)

-            filelist = glob.glob(os.path.join(datadir, "**", "*.JPEG"))
+            filelist = glob.glob(os.path.join(datadir, '**', '*.JPEG'))
            filelist = [os.path.relpath(p, start=datadir) for p in filelist]
            filelist = sorted(filelist)
-            filelist = "\n".join(filelist)+"\n"
-            with open(self.txt_filelist, "w") as f:
+            filelist = '\n'.join(filelist) + '\n'
+            with open(self.txt_filelist, 'w') as f:
                f.write(filelist)

            tdu.mark_prepared(self.root)


-
 class ImageNetSR(Dataset):
-    def __init__(self, size=None,
-                 degradation=None, downscale_f=4, min_crop_f=0.5, max_crop_f=1.,
-                 random_crop=True):
+    def __init__(
+        self,
+        size=None,
+        degradation=None,
+        downscale_f=4,
+        min_crop_f=0.5,
+        max_crop_f=1.0,
+        random_crop=True,
+    ):
        """
        Imagenet Superresolution Dataloader
        Performs following ops in order:
@@ -296,67 +345,86 @@ class ImageNetSR(Dataset):
        self.LR_size = int(size / downscale_f)
        self.min_crop_f = min_crop_f
        self.max_crop_f = max_crop_f
-        assert(max_crop_f <= 1.)
+        assert max_crop_f <= 1.0
        self.center_crop = not random_crop

-        self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA)
+        self.image_rescaler = albumentations.SmallestMaxSize(
+            max_size=size, interpolation=cv2.INTER_AREA
+        )

-        self.pil_interpolation = False # gets reset later if incase interp_op is from pillow
+        self.pil_interpolation = (
+            False  # gets reset later in case interp_op is from pillow
+        )

-        if degradation == "bsrgan":
-            self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f)
+        if degradation == 'bsrgan':
+            self.degradation_process = partial(
+                degradation_fn_bsr, sf=downscale_f
+            )

-        elif degradation == "bsrgan_light":
-            self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f)
+        elif degradation == 'bsrgan_light':
+            self.degradation_process = partial(
+                degradation_fn_bsr_light, sf=downscale_f
+            )

        else:
            interpolation_fn = {
-            "cv_nearest": cv2.INTER_NEAREST,
-            "cv_bilinear": cv2.INTER_LINEAR,
-            "cv_bicubic": cv2.INTER_CUBIC,
-            "cv_area": cv2.INTER_AREA,
-            "cv_lanczos": cv2.INTER_LANCZOS4,
-            "pil_nearest": PIL.Image.NEAREST,
-            "pil_bilinear": PIL.Image.BILINEAR,
-            "pil_bicubic": PIL.Image.BICUBIC,
-            "pil_box": PIL.Image.BOX,
-            "pil_hamming": PIL.Image.HAMMING,
-            "pil_lanczos": PIL.Image.LANCZOS,
+                'cv_nearest': cv2.INTER_NEAREST,
+                'cv_bilinear': cv2.INTER_LINEAR,
+                'cv_bicubic': cv2.INTER_CUBIC,
+                'cv_area': cv2.INTER_AREA,
+                'cv_lanczos': cv2.INTER_LANCZOS4,
+                'pil_nearest': PIL.Image.NEAREST,
+                'pil_bilinear': PIL.Image.BILINEAR,
+                'pil_bicubic': PIL.Image.BICUBIC,
+                'pil_box': PIL.Image.BOX,
+                'pil_hamming': PIL.Image.HAMMING,
+                'pil_lanczos': PIL.Image.LANCZOS,
            }[degradation]

-            self.pil_interpolation = degradation.startswith("pil_")
+            self.pil_interpolation = degradation.startswith('pil_')

            if self.pil_interpolation:
-                self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn)
+                self.degradation_process = partial(
+                    TF.resize,
+                    size=self.LR_size,
+                    interpolation=interpolation_fn,
+                )

            else:
-                self.degradation_process = albumentations.SmallestMaxSize(max_size=self.LR_size,
-                                                                          interpolation=interpolation_fn)
+                self.degradation_process = albumentations.SmallestMaxSize(
+                    max_size=self.LR_size, interpolation=interpolation_fn
+                )

    def __len__(self):
        return len(self.base)

    def __getitem__(self, i):
        example = self.base[i]
-        image = Image.open(example["file_path_"])
+        image = Image.open(example['file_path_'])

-        if not image.mode == "RGB":
-            image = image.convert("RGB")
+        if not image.mode == 'RGB':
+            image = image.convert('RGB')

        image = np.array(image).astype(np.uint8)

        min_side_len = min(image.shape[:2])
-        crop_side_len = min_side_len * np.random.uniform(self.min_crop_f, self.max_crop_f, size=None)
+        crop_side_len = min_side_len * np.random.uniform(
+            self.min_crop_f, self.max_crop_f, size=None
+        )
        crop_side_len = int(crop_side_len)

        if self.center_crop:
-            self.cropper = albumentations.CenterCrop(height=crop_side_len, width=crop_side_len)
+            self.cropper = albumentations.CenterCrop(
+                height=crop_side_len, width=crop_side_len
+            )

        else:
-            self.cropper = albumentations.RandomCrop(height=crop_side_len, width=crop_side_len)
+            self.cropper = albumentations.RandomCrop(
+                height=crop_side_len, width=crop_side_len
+            )

-        image = self.cropper(image=image)["image"]
-        image = self.image_rescaler(image=image)["image"]
+        image = self.cropper(image=image)['image']
+        image = self.image_rescaler(image=image)['image']

        if self.pil_interpolation:
            image_pil = PIL.Image.fromarray(image)
@@ -364,10 +432,10 @@ class ImageNetSR(Dataset):
            LR_image = np.array(LR_image).astype(np.uint8)

        else:
-            LR_image = self.degradation_process(image=image)["image"]
+            LR_image = self.degradation_process(image=image)['image']

-        example["image"] = (image/127.5 - 1.0).astype(np.float32)
-        example["LR_image"] = (LR_image/127.5 - 1.0).astype(np.float32)
+        example['image'] = (image / 127.5 - 1.0).astype(np.float32)
+        example['LR_image'] = (LR_image / 127.5 - 1.0).astype(np.float32)

        return example

@@ -377,9 +445,11 @@ class ImageNetSRTrain(ImageNetSR):
        super().__init__(**kwargs)

    def get_base(self):
-        with open("data/imagenet_train_hr_indices.p", "rb") as f:
+        with open('data/imagenet_train_hr_indices.p', 'rb') as f:
            indices = pickle.load(f)
-        dset = ImageNetTrain(process_images=False,)
+        dset = ImageNetTrain(
+            process_images=False,
+        )
        return Subset(dset, indices)


@@ -388,7 +458,9 @@ class ImageNetSRValidation(ImageNetSR):
        super().__init__(**kwargs)

    def get_base(self):
-        with open("data/imagenet_val_hr_indices.p", "rb") as f:
+        with open('data/imagenet_val_hr_indices.p', 'rb') as f:
            indices = pickle.load(f)
-        dset = ImageNetValidation(process_images=False,)
+        dset = ImageNetValidation(
+            process_images=False,
+        )
        return Subset(dset, indices)
--- a/ldm/data/lsun.py
+++ b/ldm/data/lsun.py
@@ -7,30 +7,33 @@ from torchvision import transforms


 class LSUNBase(Dataset):
-    def __init__(self,
-                 txt_file,
-                 data_root,
-                 size=None,
-                 interpolation="bicubic",
-                 flip_p=0.5
-                 ):
+    def __init__(
+        self,
+        txt_file,
+        data_root,
+        size=None,
+        interpolation='bicubic',
+        flip_p=0.5,
+    ):
        self.data_paths = txt_file
        self.data_root = data_root
-        with open(self.data_paths, "r") as f:
+        with open(self.data_paths, 'r') as f:
            self.image_paths = f.read().splitlines()
        self._length = len(self.image_paths)
        self.labels = {
-            "relative_file_path_": [l for l in self.image_paths],
-            "file_path_": [os.path.join(self.data_root, l)
-                           for l in self.image_paths],
+            'relative_file_path_': [l for l in self.image_paths],
+            'file_path_': [
+                os.path.join(self.data_root, l) for l in self.image_paths
+            ],
        }

        self.size = size
-        self.interpolation = {"linear": PIL.Image.LINEAR,
-                              "bilinear": PIL.Image.BILINEAR,
-                              "bicubic": PIL.Image.BICUBIC,
-                              "lanczos": PIL.Image.LANCZOS,
-                              }[interpolation]
+        self.interpolation = {
+            'linear': PIL.Image.LINEAR,
+            'bilinear': PIL.Image.BILINEAR,
+            'bicubic': PIL.Image.BICUBIC,
+            'lanczos': PIL.Image.LANCZOS,
+        }[interpolation]
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)

    def __len__(self):
@@ -38,55 +41,86 @@ class LSUNBase(Dataset):

    def __getitem__(self, i):
        example = dict((k, self.labels[k][i]) for k in self.labels)
-        image = Image.open(example["file_path_"])
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
+        image = Image.open(example['file_path_'])
+        if not image.mode == 'RGB':
+            image = image.convert('RGB')

        # default to score-sde preprocessing
        img = np.array(image).astype(np.uint8)
        crop = min(img.shape[0], img.shape[1])
-        h, w, = img.shape[0], img.shape[1]
-        img = img[(h - crop) // 2:(h + crop) // 2,
-              (w - crop) // 2:(w + crop) // 2]
+        h, w, = (
+            img.shape[0],
+            img.shape[1],
+        )
+        img = img[
+            (h - crop) // 2 : (h + crop) // 2,
+            (w - crop) // 2 : (w + crop) // 2,
+        ]

        image = Image.fromarray(img)
        if self.size is not None:
-            image = image.resize((self.size, self.size), resample=self.interpolation)
+            image = image.resize(
+                (self.size, self.size), resample=self.interpolation
+            )

        image = self.flip(image)
        image = np.array(image).astype(np.uint8)
-        example["image"] = (image / 127.5 - 1.0).astype(np.float32)
+        example['image'] = (image / 127.5 - 1.0).astype(np.float32)
        return example


 class LSUNChurchesTrain(LSUNBase):
    def __init__(self, **kwargs):
-        super().__init__(txt_file="data/lsun/church_outdoor_train.txt", data_root="data/lsun/churches", **kwargs)
+        super().__init__(
+            txt_file='data/lsun/church_outdoor_train.txt',
+            data_root='data/lsun/churches',
+            **kwargs
+        )


 class LSUNChurchesValidation(LSUNBase):
-    def __init__(self, flip_p=0., **kwargs):
-        super().__init__(txt_file="data/lsun/church_outdoor_val.txt", data_root="data/lsun/churches",
-                         flip_p=flip_p, **kwargs)
+    def __init__(self, flip_p=0.0, **kwargs):
+        super().__init__(
+            txt_file='data/lsun/church_outdoor_val.txt',
+            data_root='data/lsun/churches',
+            flip_p=flip_p,
+            **kwargs
+        )


 class LSUNBedroomsTrain(LSUNBase):
    def __init__(self, **kwargs):
-        super().__init__(txt_file="data/lsun/bedrooms_train.txt", data_root="data/lsun/bedrooms", **kwargs)
+        super().__init__(
+            txt_file='data/lsun/bedrooms_train.txt',
+            data_root='data/lsun/bedrooms',
+            **kwargs
+        )


 class LSUNBedroomsValidation(LSUNBase):
    def __init__(self, flip_p=0.0, **kwargs):
-        super().__init__(txt_file="data/lsun/bedrooms_val.txt", data_root="data/lsun/bedrooms",
-                         flip_p=flip_p, **kwargs)
+        super().__init__(
+            txt_file='data/lsun/bedrooms_val.txt',
+            data_root='data/lsun/bedrooms',
+            flip_p=flip_p,
+            **kwargs
+        )


 class LSUNCatsTrain(LSUNBase):
    def __init__(self, **kwargs):
-        super().__init__(txt_file="data/lsun/cat_train.txt", data_root="data/lsun/cats", **kwargs)
+        super().__init__(
+            txt_file='data/lsun/cat_train.txt',
+            data_root='data/lsun/cats',
+            **kwargs
+        )


 class LSUNCatsValidation(LSUNBase):
-    def __init__(self, flip_p=0., **kwargs):
-        super().__init__(txt_file="data/lsun/cat_val.txt", data_root="data/lsun/cats",
-                         flip_p=flip_p, **kwargs)
+    def __init__(self, flip_p=0.0, **kwargs):
+        super().__init__(
+            txt_file='data/lsun/cat_val.txt',
+            data_root='data/lsun/cats',
+            flip_p=flip_p,
+            **kwargs
+        )
--- a/ldm/data/personalized.py
+++ b/ldm/data/personalized.py
@@ -0,0 +1,202 @@
+import os
+import numpy as np
+import PIL
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+
+import random
+
+imagenet_templates_smallest = [
+    'a photo of a {}',
+]
+
+imagenet_templates_small = [
+    'a photo of a {}',
+    'a rendering of a {}',
+    'a cropped photo of the {}',
+    'the photo of a {}',
+    'a photo of a clean {}',
+    'a photo of a dirty {}',
+    'a dark photo of the {}',
+    'a photo of my {}',
+    'a photo of the cool {}',
+    'a close-up photo of a {}',
+    'a bright photo of the {}',
+    'a cropped photo of a {}',
+    'a photo of the {}',
+    'a good photo of the {}',
+    'a photo of one {}',
+    'a close-up photo of the {}',
+    'a rendition of the {}',
+    'a photo of the clean {}',
+    'a rendition of a {}',
+    'a photo of a nice {}',
+    'a good photo of a {}',
+    'a photo of the nice {}',
+    'a photo of the small {}',
+    'a photo of the weird {}',
+    'a photo of the large {}',
+    'a photo of a cool {}',
+    'a photo of a small {}',
+]
+
+imagenet_dual_templates_small = [
+    'a photo of a {} with {}',
+    'a rendering of a {} with {}',
+    'a cropped photo of the {} with {}',
+    'the photo of a {} with {}',
+    'a photo of a clean {} with {}',
+    'a photo of a dirty {} with {}',
+    'a dark photo of the {} with {}',
+    'a photo of my {} with {}',
+    'a photo of the cool {} with {}',
+    'a close-up photo of a {} with {}',
+    'a bright photo of the {} with {}',
+    'a cropped photo of a {} with {}',
+    'a photo of the {} with {}',
+    'a good photo of the {} with {}',
+    'a photo of one {} with {}',
+    'a close-up photo of the {} with {}',
+    'a rendition of the {} with {}',
+    'a photo of the clean {} with {}',
+    'a rendition of a {} with {}',
+    'a photo of a nice {} with {}',
+    'a good photo of a {} with {}',
+    'a photo of the nice {} with {}',
+    'a photo of the small {} with {}',
+    'a photo of the weird {} with {}',
+    'a photo of the large {} with {}',
+    'a photo of a cool {} with {}',
+    'a photo of a small {} with {}',
+]
+
+per_img_token_list = [
+    'א',
+    'ב',
+    'ג',
+    'ד',
+    'ה',
+    'ו',
+    'ז',
+    'ח',
+    'ט',
+    'י',
+    'כ',
+    'ל',
+    'מ',
+    'נ',
+    'ס',
+    'ע',
+    'פ',
+    'צ',
+    'ק',
+    'ר',
+    'ש',
+    'ת',
+]
+
+
+class PersonalizedBase(Dataset):
+    def __init__(
+        self,
+        data_root,
+        size=None,
+        repeats=100,
+        interpolation='bicubic',
+        flip_p=0.5,
+        set='train',
+        placeholder_token='*',
+        per_image_tokens=False,
+        center_crop=False,
+        mixing_prob=0.25,
+        coarse_class_text=None,
+    ):
+
+        self.data_root = data_root
+
+        self.image_paths = [
+            os.path.join(self.data_root, file_path)
+            for file_path in os.listdir(self.data_root)
+        ]
+
+        # self._length = len(self.image_paths)
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        self.placeholder_token = placeholder_token
+
+        self.per_image_tokens = per_image_tokens
+        self.center_crop = center_crop
+        self.mixing_prob = mixing_prob
+
+        self.coarse_class_text = coarse_class_text
+
+        if per_image_tokens:
+            assert self.num_images < len(
+                per_img_token_list
+            ), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'."
+
+        if set == 'train':
+            self._length = self.num_images * repeats
+
+        self.size = size
+        self.interpolation = {
+            'linear': PIL.Image.LINEAR,
+            'bilinear': PIL.Image.BILINEAR,
+            'bicubic': PIL.Image.BICUBIC,
+            'lanczos': PIL.Image.LANCZOS,
+        }[interpolation]
+        self.flip = transforms.RandomHorizontalFlip(p=flip_p)
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, i):
+        example = {}
+        image = Image.open(self.image_paths[i % self.num_images])
+
+        if not image.mode == 'RGB':
+            image = image.convert('RGB')
+
+        placeholder_string = self.placeholder_token
+        if self.coarse_class_text:
+            placeholder_string = (
+                f'{self.coarse_class_text} {placeholder_string}'
+            )
+
+        if self.per_image_tokens and np.random.uniform() < self.mixing_prob:
+            text = random.choice(imagenet_dual_templates_small).format(
+                placeholder_string, per_img_token_list[i % self.num_images]
+            )
+        else:
+            text = random.choice(imagenet_templates_small).format(
+                placeholder_string
+            )
+
+        example['caption'] = text
+
+        # default to score-sde preprocessing
+        img = np.array(image).astype(np.uint8)
+
+        if self.center_crop:
+            crop = min(img.shape[0], img.shape[1])
+            h, w, = (
+                img.shape[0],
+                img.shape[1],
+            )
+            img = img[
+                (h - crop) // 2 : (h + crop) // 2,
+                (w - crop) // 2 : (w + crop) // 2,
+            ]
+
+        image = Image.fromarray(img)
+        if self.size is not None:
+            image = image.resize(
+                (self.size, self.size), resample=self.interpolation
+            )
+
+        image = self.flip(image)
+        image = np.array(image).astype(np.uint8)
+        example['image'] = (image / 127.5 - 1.0).astype(np.float32)
+        return example
--- a/ldm/data/personalized_style.py
+++ b/ldm/data/personalized_style.py
@@ -0,0 +1,169 @@
+import os
+import numpy as np
+import PIL
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+
+import random
+
+imagenet_templates_small = [
+    'a painting in the style of {}',
+    'a rendering in the style of {}',
+    'a cropped painting in the style of {}',
+    'the painting in the style of {}',
+    'a clean painting in the style of {}',
+    'a dirty painting in the style of {}',
+    'a dark painting in the style of {}',
+    'a picture in the style of {}',
+    'a cool painting in the style of {}',
+    'a close-up painting in the style of {}',
+    'a bright painting in the style of {}',
+    'a cropped painting in the style of {}',
+    'a good painting in the style of {}',
+    'a close-up painting in the style of {}',
+    'a rendition in the style of {}',
+    'a nice painting in the style of {}',
+    'a small painting in the style of {}',
+    'a weird painting in the style of {}',
+    'a large painting in the style of {}',
+]
+
+imagenet_dual_templates_small = [
+    'a painting in the style of {} with {}',
+    'a rendering in the style of {} with {}',
+    'a cropped painting in the style of {} with {}',
+    'the painting in the style of {} with {}',
+    'a clean painting in the style of {} with {}',
+    'a dirty painting in the style of {} with {}',
+    'a dark painting in the style of {} with {}',
+    'a cool painting in the style of {} with {}',
+    'a close-up painting in the style of {} with {}',
+    'a bright painting in the style of {} with {}',
+    'a cropped painting in the style of {} with {}',
+    'a good painting in the style of {} with {}',
+    'a painting of one {} in the style of {}',
+    'a nice painting in the style of {} with {}',
+    'a small painting in the style of {} with {}',
+    'a weird painting in the style of {} with {}',
+    'a large painting in the style of {} with {}',
+]
+
+per_img_token_list = [
+    'א',
+    'ב',
+    'ג',
+    'ד',
+    'ה',
+    'ו',
+    'ז',
+    'ח',
+    'ט',
+    'י',
+    'כ',
+    'ל',
+    'מ',
+    'נ',
+    'ס',
+    'ע',
+    'פ',
+    'צ',
+    'ק',
+    'ר',
+    'ש',
+    'ת',
+]
+
+
+class PersonalizedBase(Dataset):
+    def __init__(
+        self,
+        data_root,
+        size=None,
+        repeats=100,
+        interpolation='bicubic',
+        flip_p=0.5,
+        set='train',
+        placeholder_token='*',
+        per_image_tokens=False,
+        center_crop=False,
+    ):
+
+        self.data_root = data_root
+
+        self.image_paths = [
+            os.path.join(self.data_root, file_path)
+            for file_path in os.listdir(self.data_root)
+        ]
+
+        # self._length = len(self.image_paths)
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        self.placeholder_token = placeholder_token
+
+        self.per_image_tokens = per_image_tokens
+        self.center_crop = center_crop
+
+        if per_image_tokens:
+            assert self.num_images < len(
+                per_img_token_list
+            ), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'."
+
+        if set == 'train':
+            self._length = self.num_images * repeats
+
+        self.size = size
+        self.interpolation = {
+            'linear': PIL.Image.LINEAR,
+            'bilinear': PIL.Image.BILINEAR,
+            'bicubic': PIL.Image.BICUBIC,
+            'lanczos': PIL.Image.LANCZOS,
+        }[interpolation]
+        self.flip = transforms.RandomHorizontalFlip(p=flip_p)
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, i):
+        example = {}
+        image = Image.open(self.image_paths[i % self.num_images])
+
+        if not image.mode == 'RGB':
+            image = image.convert('RGB')
+
+        if self.per_image_tokens and np.random.uniform() < 0.25:
+            text = random.choice(imagenet_dual_templates_small).format(
+                self.placeholder_token, per_img_token_list[i % self.num_images]
+            )
+        else:
+            text = random.choice(imagenet_templates_small).format(
+                self.placeholder_token
+            )
+
+        example['caption'] = text
+
+        # default to score-sde preprocessing
+        img = np.array(image).astype(np.uint8)
+
+        if self.center_crop:
+            crop = min(img.shape[0], img.shape[1])
+            h, w, = (
+                img.shape[0],
+                img.shape[1],
+            )
+            img = img[
+                (h - crop) // 2 : (h + crop) // 2,
+                (w - crop) // 2 : (w + crop) // 2,
+            ]
+
+        image = Image.fromarray(img)
+        if self.size is not None:
+            image = image.resize(
+                (self.size, self.size), resample=self.interpolation
+            )
+
+        image = self.flip(image)
+        image = np.array(image).astype(np.uint8)
+        example['image'] = (image / 127.5 - 1.0).astype(np.float32)
+        return example
--- a/ldm/dream/devices.py
+++ b/ldm/dream/devices.py
@@ -0,0 +1,17 @@
+import torch
+
+def choose_torch_device() -> str:
+    '''Convenience routine for guessing which GPU device to run model on'''
+    if torch.cuda.is_available():
+        return 'cuda'
+    if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+        return 'mps'
+    return 'cpu'
+
+def choose_autocast_device(device) -> str:
+    '''Returns an autocast compatible device from a torch device'''
+    device_type = device.type # this returns 'mps' on M1
+    # autocast only supports cuda or cpu
+    if device_type not in ('cuda','cpu'):
+        return 'cpu'
+    return device_type
--- a/ldm/dream/image_util.py
+++ b/ldm/dream/image_util.py
@@ -0,0 +1,70 @@
+from math import sqrt, floor, ceil
+from PIL import Image
+
+class InitImageResizer():
+    """Simple class to create resized copies of an Image while preserving the aspect ratio."""
+    def __init__(self,Image):
+        # NOTE: the parameter is the image to resize; its name shadows the
+        # PIL `Image` module inside this method only and is kept unchanged
+        # so that keyword callers keep working.
+        self.image = Image
+
+    def resize(self,width=None,height=None) -> Image.Image:
+        """
+        Return a copy of the image resized to fit within
+        a box width x height. The aspect ratio is
+        maintained. If neither width nor height are provided,
+        then returns a copy of the original image. If one or the other is
+        provided, then the other will be calculated from the
+        aspect ratio.
+
+        Everything is floored to the nearest multiple of 64 so
+        that it can be passed to img2img(); a side is never allowed
+        to collapse below 64 pixels.
+        """
+        im = self.image
+
+        ar = im.width/float(im.height)
+
+        # Infer missing values from aspect ratio
+        if not(width or height): # both missing
+            width  = im.width
+            height = im.height
+        elif not height:           # height missing
+            height = int(width/ar)
+        elif not width:            # width missing
+            width  = int(height*ar)
+
+        # rw and rh are the resizing width and height for the image
+        # they maintain the aspect ratio, but may not completely fill up
+        # the requested destination size
+        if im.width >= im.height:
+            rw, rh = width, int(width/ar)
+            if rh > height:
+                # filling the width would overflow the box: fit the height
+                rw, rh = int(height*ar), height
+        else:
+            rw, rh = int(height*ar), height
+            if rw > width:
+                # filling the height would overflow the box: fit the width
+                rw, rh = width, int(width/ar)
+
+        # round everything down to multiples of 64; clamp to 64 so that
+        # small sizes do not turn into a 0-pixel side
+        width, height, rw, rh = (
+            max(64, x - x % 64) for x in (width, height, rw, rh)
+        )
+
+        # no resize necessary, but return a copy
+        if im.width == width and im.height == height:
+            return im.copy()
+
+        # otherwise resize the original image so that it fits inside the bounding box
+        return im.resize((rw, rh), resample=Image.Resampling.LANCZOS)
+
+def make_grid(image_list, rows=None, cols=None):
+    """Paste the images of image_list into one RGB grid image.
+
+    Cells are filled row by row, left to right, and every cell has the
+    size of the first image. If both rows and cols are omitted the grid
+    is made as square as possible; if only one of them is given, the
+    other is derived from the number of images.
+
+    Raises ValueError if image_list is empty.
+    """
+    image_cnt = len(image_list)
+    if image_cnt == 0:
+        raise ValueError('make_grid() requires at least one image')
+
+    if rows is None and cols is None:
+        rows = floor(sqrt(image_cnt))  # try to make it square
+        cols = ceil(image_cnt / rows)
+    elif rows is None:
+        rows = ceil(image_cnt / cols)
+    elif cols is None:
+        cols = ceil(image_cnt / rows)
+
+    width = image_list[0].width
+    height = image_list[0].height
+
+    grid_img = Image.new('RGB', (width * cols, height * rows))
+    # images that do not fit into rows * cols cells are left out
+    for i, image in enumerate(image_list[:rows * cols]):
+        r, c = divmod(i, cols)
+        grid_img.paste(image, (c * width, r * height))
+
+    return grid_img
+
--- a/ldm/dream/pngwriter.py
+++ b/ldm/dream/pngwriter.py
@@ -0,0 +1,79 @@
+"""
+Two helper classes for dealing with PNG images and their path names.
+PngWriter -- Converts Images generated by T2I into PNGs, finds
+             appropriate names for them, and writes prompt metadata
+             into the PNG.
+PromptFormatter -- Utility for converting a Namespace of prompt parameters
+             back into a formatted prompt string with command-line switches.
+"""
+import os
+import re
+from PIL import PngImagePlugin
+
+# -------------------image generation utils-----
+
+
+class PngWriter:
+    """Writes images as PNG files, with prompt metadata, into one directory."""
+
+    def __init__(self, outdir):
+        """Remember outdir and create it if it does not exist yet."""
+        self.outdir = outdir
+        os.makedirs(outdir, exist_ok=True)
+
+    # gives the next unique prefix in outdir
+    def unique_prefix(self):
+        """Return the next unused numeric file prefix as a string.
+
+        Looks at every file in outdir named '<digits>.<anything>.png',
+        takes the numerically highest <digits> and returns that value
+        plus one, zero-padded to at least six digits ('000001' when no
+        such file exists).
+        """
+        pattern = re.compile(r'^(\d+)\..*\.png')
+        highest = 0
+        for f in os.listdir(self.outdir):
+            found = pattern.match(f)
+            if found:
+                # compare numerically: a lexicographic sort would rank
+                # '999999' above '1000000'
+                highest = max(highest, int(found.group(1)))
+        return f'{highest + 1:06}'
+
+    # saves image named _image_ to outdir/name, writing metadata from prompt
+    # returns full path of output
+    def save_image_and_prompt_to_png(self, image, prompt, name):
+        """Save image as outdir/name and return that path.
+
+        The prompt is stored in the PNG as a text chunk with key 'Dream'.
+        """
+        path = os.path.join(self.outdir, name)
+        info = PngImagePlugin.PngInfo()
+        info.add_text('Dream', prompt)
+        image.save(path, 'PNG', pnginfo=info)
+        return path
+
+
+class PromptFormatter:
+    """Turns prompt options back into a prompt string with switches."""
+
+    def __init__(self, t2i, opt):
+        self.opt = opt
+        self.t2i = t2i
+
+    # note: the t2i object should provide all these values, so ideally
+    # there would be no need to `or` them against the opt values
+    def normalize_prompt(self):
+        """Return the quoted prompt followed by its switches, space separated."""
+        opt, t2i = self.opt, self.t2i
+
+        # switches that are always emitted; opt wins, t2i is the fallback
+        parts = [
+            f'"{opt.prompt}"',
+            f'-s{opt.steps or t2i.steps}',
+            f'-W{opt.width or t2i.width}',
+            f'-H{opt.height or t2i.height}',
+            f'-C{opt.cfg_scale or t2i.cfg_scale}',
+            f'-A{opt.sampler_name or t2i.sampler_name}',
+        ]
+
+        # switches that are only emitted when the option is in use
+        if opt.init_img:
+            parts.append(f'-I{opt.init_img}')
+        if opt.fit:
+            parts.append('--fit')
+        if opt.strength and opt.init_img is not None:
+            parts.append(f'-f{opt.strength or t2i.strength}')
+        if opt.gfpgan_strength:
+            parts.append(f'-G{opt.gfpgan_strength}')
+        if opt.upscale:
+            levels = ' '.join(map(str, opt.upscale))
+            parts.append(f'-U {levels}')
+        if opt.variation_amount > 0:
+            parts.append(f'-v{opt.variation_amount}')
+        if opt.with_variations:
+            pairs = [f'{seed}:{weight}' for seed, weight in opt.with_variations]
+            parts.append('-V' + ','.join(pairs))
+        if t2i.full_precision:
+            parts.append('-F')
+        return ' '.join(parts)
--- a/ldm/dream/readline.py
+++ b/ldm/dream/readline.py
@@ -0,0 +1,121 @@
+"""
+Readline helper functions for dream.py (linux and mac only).
+"""
+import os
+import re
+import atexit
+
+# ---------------readline utilities---------------------
+try:
+    import readline
+
+    readline_available = True
+except Exception:
+    # readline is optional: any failure to import it just disables
+    # completion and history. Unlike a bare `except:`, this lets
+    # KeyboardInterrupt and SystemExit propagate.
+    readline_available = False
+
+
+class Completer:
+    """Tab-completion helper for the readline prompt.
+
+    Completes switches from a fixed list of options and, for -I/--init_img
+    arguments, `cd` arguments and anything that starts with '.' or '/',
+    file system paths.
+    """
+
+    def __init__(self, options):
+        self.options = sorted(options)
+        # filled by complete() on state 0; initialised here so that a call
+        # with state > 0 cannot hit a missing attribute
+        self.matches = []
+
+    def complete(self, text, state):
+        """readline completer callback.
+
+        Returns the state'th completion for text, or None when there are
+        no more completions.
+        """
+        buffer = readline.get_line_buffer()
+
+        if text.startswith(('-I', '--init_img')):
+            return self._path_completions(text, state, ('.png','.jpg','.jpeg'))
+
+        if buffer.strip().endswith('cd') or text.startswith(('.', '/')):
+            return self._path_completions(text, state, ())
+
+        if state == 0:
+            # This is the first time for this text, so build a match list.
+            if text:
+                self.matches = [
+                    s for s in self.options if s and s.startswith(text)
+                ]
+            else:
+                self.matches = self.options[:]
+
+        # Return the state'th item from the match list,
+        # if we have that many.
+        try:
+            return self.matches[state]
+        except IndexError:
+            return None
+
+    def _path_completions(self, text, state, extensions):
+        """Return the state'th path completion for text, or None.
+
+        Directories are always offered (with a trailing '/'); files only
+        when their name ends with one of `extensions`. Hidden entries are
+        skipped. A leading '-I' or '--init_img=' is kept in the result.
+        """
+        # get the path so far
+        if text.startswith('-I'):
+            path = text.replace('-I', '', 1).lstrip()
+        elif text.startswith('--init_img='):
+            path = text.replace('--init_img=', '', 1).lstrip()
+        else:
+            path = text
+        # whatever precedes the path (the switch) must survive completion
+        prefix = text[:len(text) - len(path)]
+
+        matches = list()
+
+        expanded = os.path.expanduser(path)
+        if len(expanded) == 0:
+            matches.append(text + './')
+        else:
+            typed_dir = os.path.dirname(path)       # as typed, used for output
+            search_dir = os.path.dirname(expanded)  # '~' expanded, used for lookup
+            try:
+                # an empty dirname means "current directory"
+                dir_list = os.listdir(search_dir or '.')
+            except OSError:
+                # directory missing or unreadable: nothing to offer
+                dir_list = []
+            for n in dir_list:
+                if n.startswith('.') and len(n) > 1:
+                    continue
+                full_path = os.path.join(search_dir, n)
+                if full_path.startswith(expanded):
+                    candidate = prefix + os.path.join(typed_dir, n)
+                    if os.path.isdir(full_path):
+                        matches.append(candidate + '/')
+                    elif n.endswith(extensions):
+                        matches.append(candidate)
+
+        try:
+            return matches[state]
+        except IndexError:
+            return None
+
+
+if readline_available:
+    # register tab completion for the known command-line switches
+    readline.set_completer(
+        Completer(
+            [
+                '--steps','-s',
+                '--seed','-S',
+                '--iterations','-n',
+                '--width','-W','--height','-H',
+                '--cfg_scale','-C',
+                '--grid','-g',
+                '--individual','-i',
+                '--init_img','-I',
+                '--strength','-f',
+                '--variants','-v',
+                '--outdir','-o',
+                '--sampler','-A','-m',
+                '--embedding_path',
+                '--device',
+                '--gfpgan_strength','-G',
+                '--upscale','-U',
+                '-save_orig','--save_original',
+                '--skip_normalize','-x',
+                '--log_tokenization','-t',
+            ]
+        ).complete
+    )
+    # only a space separates completion words, so '-I/some/path' is one token
+    readline.set_completer_delims(' ')
+    readline.parse_and_bind('tab: complete')
+
+    # persistent command history in the user's home directory
+    histfile = os.path.join(os.path.expanduser('~'), '.dream_history')
+    try:
+        readline.read_history_file(histfile)
+    except FileNotFoundError:
+        # first run: the history file is created at exit
+        pass
+    # applies whether or not a history file existed already
+    readline.set_history_length(1000)
+    atexit.register(readline.write_history_file, histfile)
--- a/ldm/dream/server.py
+++ b/ldm/dream/server.py
@@ -0,0 +1,202 @@
+import json
+import base64
+import mimetypes
+import os
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from ldm.dream.pngwriter import PngWriter
+from threading import Event
+
+class CanceledException(Exception):
+    """Raised by the step callback to abort a generation that was canceled."""
+
+class DreamServer(BaseHTTPRequestHandler):
+    """HTTP request handler for the web GUI.
+
+    GET serves the index page, a generated config.js, the /cancel endpoint
+    and static files below the current working directory. POST runs an
+    image generation and streams newline-delimited JSON events back.
+    """
+    # class attributes: shared by every handler instance
+    model = None
+    canceled = Event()
+
+    def do_GET(self):
+        """Serve the index page, /config.js, /cancel, or a static file."""
+        if self.path == "/":
+            self.send_response(200)
+            self.send_header("Content-type", "text/html")
+            self.end_headers()
+            with open("./static/dream_web/index.html", "rb") as content:
+                self.wfile.write(content.read())
+        elif self.path == "/config.js":
+            # unfortunately this import can't be at the top level, since that would cause a circular import
+            from ldm.gfpgan.gfpgan_tools import gfpgan_model_exists
+            self.send_response(200)
+            self.send_header("Content-type", "application/javascript")
+            self.end_headers()
+            config = {
+                'gfpgan_model_exists': gfpgan_model_exists
+            }
+            self.wfile.write(bytes("let config = " + json.dumps(config) + ";\n", "utf-8"))
+        elif self.path == "/cancel":
+            self.canceled.set()
+            self.send_response(200)
+            self.send_header("Content-type", "application/json")
+            self.end_headers()
+            self.wfile.write(bytes('{}', 'utf8'))
+        else:
+            path = "." + self.path
+            cwd = os.path.realpath(os.getcwd())
+            try:
+                # commonpath compares whole path components; the character
+                # based commonprefix would also accept a sibling directory
+                # whose name merely starts with the cwd name
+                is_in_cwd = os.path.commonpath((os.path.realpath(path), cwd)) == cwd
+            except ValueError:
+                # e.g. paths on different drives
+                is_in_cwd = False
+            mime_type = mimetypes.guess_type(path)[0]
+            if not (is_in_cwd and os.path.isfile(path)) or mime_type is None:
+                self.send_response(404)
+                # without end_headers() the status line is never flushed
+                self.end_headers()
+                return
+            self.send_response(200)
+            self.send_header("Content-type", mime_type)
+            self.end_headers()
+            with open(path, "rb") as content:
+                self.wfile.write(content.read())
+
+    def do_POST(self):
+        """Run txt2img or img2img for the posted JSON request.
+
+        The response is a stream of JSON objects, one per line, with an
+        'event' key of 'step', 'result', 'upscaling-started',
+        'upscaling-done' or 'canceled'.
+        """
+        self.send_response(200)
+        self.send_header("Content-type", "application/json")
+        self.end_headers()
+
+        # unfortunately this import can't be at the top level, since that would cause a circular import
+        from ldm.gfpgan.gfpgan_tools import gfpgan_model_exists
+
+        content_length = int(self.headers['Content-Length'])
+        post_data = json.loads(self.rfile.read(content_length))
+        prompt = post_data['prompt']
+        initimg = post_data['initimg']
+        strength = float(post_data['strength'])
+        iterations = int(post_data['iterations'])
+        steps = int(post_data['steps'])
+        width = int(post_data['width'])
+        height = int(post_data['height'])
+        fit    = 'fit' in post_data
+        cfgscale = float(post_data['cfgscale'])
+        sampler_name  = post_data['sampler']
+        gfpgan_strength = float(post_data['gfpgan_strength']) if gfpgan_model_exists else 0
+        upscale_level    = post_data['upscale_level']
+        upscale_strength = post_data['upscale_strength']
+        upscale = [int(upscale_level),float(upscale_strength)] if upscale_level != '' else None
+        progress_images = 'progress_images' in post_data
+        # a posted seed of -1 means "use the model's current seed"
+        seed = self.model.seed if int(post_data['seed']) == -1 else int(post_data['seed'])
+
+        self.canceled.clear()
+        print(f">> Request to generate with prompt: {prompt}")
+        # In order to handle upscaled images, the PngWriter needs to maintain state
+        # across images generated by each call to prompt2img(), so we define it in
+        # the outer scope of image_done()
+        config = post_data.copy() # Shallow copy
+        # keep the (potentially large) base64 image out of the log and the replies
+        config['initimg'] = ''
+
+        images_generated = 0    # helps keep track of when upscaling is started
+        images_upscaled = 0     # helps keep track of when upscaling is completed
+        pngwriter = PngWriter("./outputs/img-samples/")
+
+        prefix = pngwriter.unique_prefix()
+        # if upscaling is requested, then this will be called twice, once when
+        # the images are first generated, and then again when after upscaling
+        # is complete. The upscaling replaces the original file, so the second
+        # entry should not be inserted into the image list.
+        def image_done(image, seed, upscaled=False):
+            name = f'{prefix}.{seed}.png'
+            path = pngwriter.save_image_and_prompt_to_png(image, f'{prompt} -S{seed}', name)
+
+            # Append post_data to log, but only once!
+            if not upscaled:
+                with open("./outputs/img-samples/dream_web_log.txt", "a") as log:
+                    log.write(f"{path}: {json.dumps(config)}\n")
+
+                self.wfile.write(bytes(json.dumps(
+                    {'event': 'result', 'url': path, 'seed': seed, 'config': config}
+                ) + '\n',"utf-8"))
+
+            # control state of the "postprocessing..." message
+            upscaling_requested = upscale or gfpgan_strength>0
+            nonlocal images_generated
+            nonlocal images_upscaled
+            if upscaled:
+                images_upscaled += 1
+            else:
+                images_generated +=1
+            if upscaling_requested:
+                action = None
+                if images_generated >= iterations:
+                    if images_upscaled < iterations:
+                        action = 'upscaling-started'
+                    else:
+                        action = 'upscaling-done'
+                if action:
+                    x = images_upscaled+1
+                    self.wfile.write(bytes(json.dumps(
+                        {'event':action,'processed_file_cnt':f'{x}/{iterations}'}
+                    ) + '\n',"utf-8"))
+
+        step_writer = PngWriter('./outputs/intermediates/')
+        step_index = 1
+        def image_progress(sample, step):
+            # a GET /cancel sets the shared event; abort from inside the sampler
+            if self.canceled.is_set():
+                self.wfile.write(bytes(json.dumps({'event':'canceled'}) + '\n', 'utf-8'))
+                raise CanceledException
+            path = None
+            # since rendering images is moderately expensive, only render every 5th image
+            # and don't bother with the last one, since it'll render anyway
+            nonlocal step_index
+            if progress_images and step % 5 == 0 and step < steps - 1:
+                image = self.model._sample_to_image(sample)
+                name = f'{prefix}.{seed}.{step_index}.png'
+                metadata = f'{prompt} -S{seed} [intermediate]'
+                path = step_writer.save_image_and_prompt_to_png(image, metadata, name)
+                step_index += 1
+            self.wfile.write(bytes(json.dumps(
+                {'event': 'step', 'step': step + 1, 'url': path}
+            ) + '\n',"utf-8"))
+
+        try:
+            if initimg is None:
+                # Run txt2img
+                self.model.prompt2image(prompt,
+                                        iterations=iterations,
+                                        cfg_scale = cfgscale,
+                                        width = width,
+                                        height = height,
+                                        seed = seed,
+                                        steps = steps,
+                                        gfpgan_strength = gfpgan_strength,
+                                        upscale         = upscale,
+                                        sampler_name    = sampler_name,
+                                        step_callback=image_progress,
+                                        image_callback=image_done)
+            else:
+                # Decode initimg as base64 to temp file
+                # NOTE(review): the fixed file name is shared by all request
+                # threads, so concurrent img2img requests can overwrite each
+                # other's input -- consider a per-request temporary file.
+                with open("./img2img-tmp.png", "wb") as f:
+                    initimg = initimg.split(",")[1] # Ignore mime type
+                    f.write(base64.b64decode(initimg))
+
+                try:
+                    # Run img2img
+                    self.model.prompt2image(prompt,
+                                            init_img = "./img2img-tmp.png",
+                                            strength = strength,
+                                            iterations = iterations,
+                                            cfg_scale  = cfgscale,
+                                            seed       = seed,
+                                            steps      = steps,
+                                            sampler_name    = sampler_name,
+                                            width      = width,
+                                            height     = height,
+                                            fit        = fit,
+                                            gfpgan_strength=gfpgan_strength,
+                                            upscale         = upscale,
+                                            step_callback=image_progress,
+                                            image_callback=image_done)
+                finally:
+                    # Remove the temp file
+                    os.remove("./img2img-tmp.png")
+        except CanceledException:
+            print("Canceled.")
+            return
+
+
+class ThreadingDreamServer(ThreadingHTTPServer):
+    """Threaded HTTP server whose requests are handled by DreamServer."""
+
+    def __init__(self, server_address):
+        super().__init__(server_address, DreamServer)
--- a/ldm/gfpgan/gfpgan_tools.py
+++ b/ldm/gfpgan/gfpgan_tools.py
@@ -0,0 +1,167 @@
+import torch
+import warnings
+import os
+import sys
+import numpy as np
+
+from PIL import Image
+from scripts.dream import create_argv_parser
+
+# NOTE: the command line is parsed here, at import time, with the argument
+# parser built by scripts.dream; parse_args() without arguments reads sys.argv.
+arg_parser = create_argv_parser()
+opt = arg_parser.parse_args()
+
+# path of the GFPGAN model file and whether that file is present on disk
+model_path = os.path.join(opt.gfpgan_dir, opt.gfpgan_model_path)
+gfpgan_model_exists = os.path.isfile(model_path)
+
+def _run_gfpgan(image, strength, prompt, seed, upsampler_scale=4):
+    """Restore faces in a PIL image with GFPGAN.
+
+    image           -- input image; converted to RGB before processing
+    strength        -- below 1.0 the restored image is blended with the
+                       input using this value as the blend factor
+    prompt, seed    -- only used for the progress message
+    upsampler_scale -- upscale factor handed to GFPGAN and to the
+                       background upsampler
+
+    Returns the restored image, or the unmodified input image when
+    GFPGAN could not be loaded (the error is printed to stderr).
+    """
+    print(f'>> GFPGAN - Restoring Faces: {prompt} : seed:{seed}')
+    gfpgan = None
+    with warnings.catch_warnings():
+        warnings.filterwarnings('ignore', category=DeprecationWarning)
+        warnings.filterwarnings('ignore', category=UserWarning)
+
+        try:
+            if not gfpgan_model_exists:
+                raise Exception('GFPGAN model not found at path ' + model_path)
+
+            # make the GFPGAN checkout importable, but do not grow sys.path
+            # by one entry on every call
+            gfpgan_dir = os.path.abspath(opt.gfpgan_dir)
+            if gfpgan_dir not in sys.path:
+                sys.path.append(gfpgan_dir)
+            from gfpgan import GFPGANer
+
+            bg_upsampler = _load_gfpgan_bg_upsampler(
+                opt.gfpgan_bg_upsampler, upsampler_scale, opt.gfpgan_bg_tile
+            )
+
+            gfpgan = GFPGANer(
+                model_path=model_path,
+                upscale=upsampler_scale,
+                arch='clean',
+                channel_multiplier=2,
+                bg_upsampler=bg_upsampler,
+            )
+        except Exception:
+            # best effort: report the problem and fall through to the
+            # "not initialized" branch below
+            import traceback
+
+            print('>> Error loading GFPGAN:', file=sys.stderr)
+            print(traceback.format_exc(), file=sys.stderr)
+
+    if gfpgan is None:
+        print(
+            '>> GFPGAN not initialized, it must be loaded via the --gfpgan argument'
+        )
+        return image
+
+    image = image.convert('RGB')
+
+    cropped_faces, restored_faces, restored_img = gfpgan.enhance(
+        np.array(image, dtype=np.uint8),
+        has_aligned=False,
+        only_center_face=False,
+        paste_back=True,
+    )
+    res = Image.fromarray(restored_img)
+
+    if strength < 1.0:
+        # Resize the image to the new image if the sizes have changed.
+        # Compare the two PIL sizes: the numpy array's .size is an element
+        # count and never equals a (width, height) tuple.
+        if res.size != image.size:
+            image = image.resize(res.size)
+        res = Image.blend(image, res, strength)
+
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    gfpgan = None
+
+    return res
+
+
+def _load_gfpgan_bg_upsampler(bg_upsampler, upsampler_scale, bg_tile=400):
+    """Build the Real-ESRGAN background upsampler, or return None.
+
+    None is returned when bg_upsampler is not 'realesrgan', when CUDA is
+    not available (a warning is issued), or when upsampler_scale is
+    neither 2 nor 4.
+    """
+    if bg_upsampler != 'realesrgan':
+        return None
+
+    if not torch.cuda.is_available():  # CPU
+        warnings.warn(
+            'The unoptimized RealESRGAN is slow on CPU. We do not use it. '
+            'If you really want to use it, please modify the corresponding codes.'
+        )
+        return None
+
+    # model weights per supported scale
+    weight_urls = {
+        2: 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth',
+        4: 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth',
+    }
+    if upsampler_scale not in weight_urls:
+        return None
+
+    # imported lazily, only once an upsampler is really needed
+    from basicsr.archs.rrdbnet_arch import RRDBNet
+    from realesrgan import RealESRGANer
+
+    # the x2 and x4 networks differ only in their scale argument
+    net_scale = 4 if upsampler_scale == 4 else 2
+    net = RRDBNet(
+        num_in_ch=3,
+        num_out_ch=3,
+        num_feat=64,
+        num_block=23,
+        num_grow_ch=32,
+        scale=net_scale,
+    )
+
+    return RealESRGANer(
+        scale=upsampler_scale,
+        model_path=weight_urls[upsampler_scale],
+        model=net,
+        tile=bg_tile,
+        tile_pad=10,
+        pre_pad=0,
+        half=True,
+    )  # need to set False in CPU mode
+
+
+def real_esrgan_upscale(image, strength, upsampler_scale, prompt, seed):
+    """Upscale a PIL image with Real-ESRGAN.
+
+    image           -- input image
+    strength        -- below 1.0 the upscaled image is blended with the
+                       (resized) input using this value as the blend factor
+    upsampler_scale -- upscale factor
+    prompt, seed    -- only used for the progress message
+
+    Returns the upscaled image, or the unmodified input image when no
+    upsampler could be loaded.
+    """
+    print(
+        f'>> Real-ESRGAN Upscaling: {prompt} : seed:{seed} : scale:{upsampler_scale}x'
+    )
+
+    # stays None when loading fails or no upsampler is available
+    upsampler = None
+    with warnings.catch_warnings():
+        warnings.filterwarnings('ignore', category=DeprecationWarning)
+        warnings.filterwarnings('ignore', category=UserWarning)
+
+        try:
+            upsampler = _load_gfpgan_bg_upsampler(
+                opt.gfpgan_bg_upsampler, upsampler_scale, opt.gfpgan_bg_tile
+            )
+        except Exception:
+            import traceback
+
+            print('>> Error loading Real-ESRGAN:', file=sys.stderr)
+            print(traceback.format_exc(), file=sys.stderr)
+
+    if upsampler is None:
+        # the loader returns None for unsupported settings; hand the image
+        # back untouched, as _run_gfpgan does when GFPGAN is unavailable
+        print('>> Real-ESRGAN upsampler not available, image left unchanged')
+        return image
+
+    output, img_mode = upsampler.enhance(
+        np.array(image, dtype=np.uint8),
+        outscale=upsampler_scale,
+        alpha_upsampler=opt.gfpgan_bg_upsampler,
+    )
+
+    res = Image.fromarray(output)
+
+    if strength < 1.0:
+        # Resize the image to the new image if the sizes have changed.
+        # Compare the two PIL sizes: the numpy array's .size is an element
+        # count and never equals a (width, height) tuple.
+        if res.size != image.size:
+            image = image.resize(res.size)
+        res = Image.blend(image, res, strength)
+
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    upsampler = None
+
+    return res
--- a/ldm/lr_scheduler.py
+++ b/ldm/lr_scheduler.py
@@ -5,32 +5,49 @@ class LambdaWarmUpCosineScheduler:
    """
    note: use with a base_lr of 1.0
    """
-    def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
+
+    def __init__(
+        self,
+        warm_up_steps,
+        lr_min,
+        lr_max,
+        lr_start,
+        max_decay_steps,
+        verbosity_interval=0,
+    ):
        self.lr_warm_up_steps = warm_up_steps
        self.lr_start = lr_start
        self.lr_min = lr_min
        self.lr_max = lr_max
        self.lr_max_decay_steps = max_decay_steps
-        self.last_lr = 0.
+        self.last_lr = 0.0
        self.verbosity_interval = verbosity_interval

    def schedule(self, n, **kwargs):
        if self.verbosity_interval > 0:
-            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
+            if n % self.verbosity_interval == 0:
+                print(
+                    f'current step: {n}, recent lr-multiplier: {self.last_lr}'
+                )
        if n < self.lr_warm_up_steps:
-            lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start
+            lr = (
+                self.lr_max - self.lr_start
+            ) / self.lr_warm_up_steps * n + self.lr_start
            self.last_lr = lr
            return lr
        else:
-            t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps)
+            t = (n - self.lr_warm_up_steps) / (
+                self.lr_max_decay_steps - self.lr_warm_up_steps
+            )
            t = min(t, 1.0)
            lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
-                    1 + np.cos(t * np.pi))
+                1 + np.cos(t * np.pi)
+            )
            self.last_lr = lr
            return lr

    def __call__(self, n, **kwargs):
-        return self.schedule(n,**kwargs)
+        return self.schedule(n, **kwargs)


 class LambdaWarmUpCosineScheduler2:
@@ -38,15 +55,30 @@ class LambdaWarmUpCosineScheduler2:
    supports repeated iterations, configurable via lists
    note: use with a base_lr of 1.0.
    """
-    def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
-        assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
+
+    def __init__(
+        self,
+        warm_up_steps,
+        f_min,
+        f_max,
+        f_start,
+        cycle_lengths,
+        verbosity_interval=0,
+    ):
+        assert (
+            len(warm_up_steps)
+            == len(f_min)
+            == len(f_max)
+            == len(f_start)
+            == len(cycle_lengths)
+        )
        self.lr_warm_up_steps = warm_up_steps
        self.f_start = f_start
        self.f_min = f_min
        self.f_max = f_max
        self.cycle_lengths = cycle_lengths
        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
-        self.last_f = 0.
+        self.last_f = 0.0
        self.verbosity_interval = verbosity_interval

    def find_in_interval(self, n):
@@ -60,17 +92,25 @@ class LambdaWarmUpCosineScheduler2:
        cycle = self.find_in_interval(n)
        n = n - self.cum_cycles[cycle]
        if self.verbosity_interval > 0:
-            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
-                                                       f"current cycle {cycle}")
+            if n % self.verbosity_interval == 0:
+                print(
+                    f'current step: {n}, recent lr-multiplier: {self.last_f}, '
+                    f'current cycle {cycle}'
+                )
        if n < self.lr_warm_up_steps[cycle]:
-            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
+            f = (
+                self.f_max[cycle] - self.f_start[cycle]
+            ) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
            self.last_f = f
            return f
        else:
-            t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
+            t = (n - self.lr_warm_up_steps[cycle]) / (
+                self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]
+            )
            t = min(t, 1.0)
-            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
-                    1 + np.cos(t * np.pi))
+            f = self.f_min[cycle] + 0.5 * (
+                self.f_max[cycle] - self.f_min[cycle]
+            ) * (1 + np.cos(t * np.pi))
            self.last_f = f
            return f

@@ -79,20 +119,25 @@ class LambdaWarmUpCosineScheduler2:


 class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
-
    def schedule(self, n, **kwargs):
        cycle = self.find_in_interval(n)
        n = n - self.cum_cycles[cycle]
        if self.verbosity_interval > 0:
-            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
-                                                       f"current cycle {cycle}")
+            if n % self.verbosity_interval == 0:
+                print(
+                    f'current step: {n}, recent lr-multiplier: {self.last_f}, '
+                    f'current cycle {cycle}'
+                )

        if n < self.lr_warm_up_steps[cycle]:
-            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
+            f = (
+                self.f_max[cycle] - self.f_start[cycle]
+            ) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
            self.last_f = f
            return f
        else:
-            f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle])
+            f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (
+                self.cycle_lengths[cycle] - n
+            ) / (self.cycle_lengths[cycle])
            self.last_f = f
            return f
-
--- a/ldm/models/autoencoder.py
+++ b/ldm/models/autoencoder.py
@@ -6,29 +6,32 @@ from contextlib import contextmanager
 from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer

 from ldm.modules.diffusionmodules.model import Encoder, Decoder
-from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
+from ldm.modules.distributions.distributions import (
+    DiagonalGaussianDistribution,
+)

 from ldm.util import instantiate_from_config


 class VQModel(pl.LightningModule):
-    def __init__(self,
-                 ddconfig,
-                 lossconfig,
-                 n_embed,
-                 embed_dim,
-                 ckpt_path=None,
-                 ignore_keys=[],
-                 image_key="image",
-                 colorize_nlabels=None,
-                 monitor=None,
-                 batch_resize_range=None,
-                 scheduler_config=None,
-                 lr_g_factor=1.0,
-                 remap=None,
-                 sane_index_shape=False, # tell vector quantizer to return indices as bhw
-                 use_ema=False
-                 ):
+    def __init__(
+        self,
+        ddconfig,
+        lossconfig,
+        n_embed,
+        embed_dim,
+        ckpt_path=None,
+        ignore_keys=[],
+        image_key='image',
+        colorize_nlabels=None,
+        monitor=None,
+        batch_resize_range=None,
+        scheduler_config=None,
+        lr_g_factor=1.0,
+        remap=None,
+        sane_index_shape=False,  # tell vector quantizer to return indices as bhw
+        use_ema=False,
+    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.n_embed = n_embed
@@ -36,24 +39,34 @@ class VQModel(pl.LightningModule):
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)
        self.loss = instantiate_from_config(lossconfig)
-        self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
-                                        remap=remap,
-                                        sane_index_shape=sane_index_shape)
-        self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
-        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        self.quantize = VectorQuantizer(
+            n_embed,
+            embed_dim,
+            beta=0.25,
+            remap=remap,
+            sane_index_shape=sane_index_shape,
+        )
+        self.quant_conv = torch.nn.Conv2d(ddconfig['z_channels'], embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(
+            embed_dim, ddconfig['z_channels'], 1
+        )
        if colorize_nlabels is not None:
-            assert type(colorize_nlabels)==int
-            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+            assert type(colorize_nlabels) == int
+            self.register_buffer(
+                'colorize', torch.randn(3, colorize_nlabels, 1, 1)
+            )
        if monitor is not None:
            self.monitor = monitor
        self.batch_resize_range = batch_resize_range
        if self.batch_resize_range is not None:
-            print(f"{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.")
+            print(
+                f'{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.'
+            )

        self.use_ema = use_ema
        if self.use_ema:
            self.model_ema = LitEma(self)
-            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+            print(f'Keeping EMAs of {len(list(self.model_ema.buffers()))}.')

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
@@ -66,28 +79,30 @@ class VQModel(pl.LightningModule):
            self.model_ema.store(self.parameters())
            self.model_ema.copy_to(self)
            if context is not None:
-                print(f"{context}: Switched to EMA weights")
+                print(f'{context}: Switched to EMA weights')
        try:
            yield None
        finally:
            if self.use_ema:
                self.model_ema.restore(self.parameters())
                if context is not None:
-                    print(f"{context}: Restored training weights")
+                    print(f'{context}: Restored training weights')

    def init_from_ckpt(self, path, ignore_keys=list()):
-        sd = torch.load(path, map_location="cpu")["state_dict"]
+        sd = torch.load(path, map_location='cpu')['state_dict']
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
+                    print('Deleting key {} from state_dict.'.format(k))
                    del sd[k]
        missing, unexpected = self.load_state_dict(sd, strict=False)
-        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+        print(
+            f'Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys'
+        )
        if len(missing) > 0:
-            print(f"Missing Keys: {missing}")
-            print(f"Unexpected Keys: {unexpected}")
+            print(f'Missing Keys: {missing}')
+            print(f'Unexpected Keys: {unexpected}')

    def on_train_batch_end(self, *args, **kwargs):
        if self.use_ema:
@@ -115,7 +130,7 @@ class VQModel(pl.LightningModule):
        return dec

    def forward(self, input, return_pred_indices=False):
-        quant, diff, (_,_,ind) = self.encode(input)
+        quant, diff, (_, _, ind) = self.encode(input)
        dec = self.decode(quant)
        if return_pred_indices:
            return dec, diff, ind
@@ -125,7 +140,11 @@ class VQModel(pl.LightningModule):
        x = batch[k]
        if len(x.shape) == 3:
            x = x[..., None]
-        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+        x = (
+            x.permute(0, 3, 1, 2)
+            .to(memory_format=torch.contiguous_format)
+            .float()
+        )
        if self.batch_resize_range is not None:
            lower_size = self.batch_resize_range[0]
            upper_size = self.batch_resize_range[1]
@@ -133,9 +152,11 @@ class VQModel(pl.LightningModule):
                # do the first few batches with max size to avoid later oom
                new_resize = upper_size
            else:
-                new_resize = np.random.choice(np.arange(lower_size, upper_size+16, 16))
+                new_resize = np.random.choice(
+                    np.arange(lower_size, upper_size + 16, 16)
+                )
            if new_resize != x.shape[2]:
-                x = F.interpolate(x, size=new_resize, mode="bicubic")
+                x = F.interpolate(x, size=new_resize, mode='bicubic')
            x = x.detach()
        return x

@@ -147,81 +168,139 @@ class VQModel(pl.LightningModule):

        if optimizer_idx == 0:
            # autoencode
-            aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
-                                            last_layer=self.get_last_layer(), split="train",
-                                            predicted_indices=ind)
+            aeloss, log_dict_ae = self.loss(
+                qloss,
+                x,
+                xrec,
+                optimizer_idx,
+                self.global_step,
+                last_layer=self.get_last_layer(),
+                split='train',
+                predicted_indices=ind,
+            )

-            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+            self.log_dict(
+                log_dict_ae,
+                prog_bar=False,
+                logger=True,
+                on_step=True,
+                on_epoch=True,
+            )
            return aeloss

        if optimizer_idx == 1:
            # discriminator
-            discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
-                                            last_layer=self.get_last_layer(), split="train")
-            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+            discloss, log_dict_disc = self.loss(
+                qloss,
+                x,
+                xrec,
+                optimizer_idx,
+                self.global_step,
+                last_layer=self.get_last_layer(),
+                split='train',
+            )
+            self.log_dict(
+                log_dict_disc,
+                prog_bar=False,
+                logger=True,
+                on_step=True,
+                on_epoch=True,
+            )
            return discloss

    def validation_step(self, batch, batch_idx):
        log_dict = self._validation_step(batch, batch_idx)
        with self.ema_scope():
-            log_dict_ema = self._validation_step(batch, batch_idx, suffix="_ema")
+            log_dict_ema = self._validation_step(
+                batch, batch_idx, suffix='_ema'
+            )
        return log_dict

-    def _validation_step(self, batch, batch_idx, suffix=""):
+    def _validation_step(self, batch, batch_idx, suffix=''):
        x = self.get_input(batch, self.image_key)
        xrec, qloss, ind = self(x, return_pred_indices=True)
-        aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0,
-                                        self.global_step,
-                                        last_layer=self.get_last_layer(),
-                                        split="val"+suffix,
-                                        predicted_indices=ind
-                                        )
+        aeloss, log_dict_ae = self.loss(
+            qloss,
+            x,
+            xrec,
+            0,
+            self.global_step,
+            last_layer=self.get_last_layer(),
+            split='val' + suffix,
+            predicted_indices=ind,
+        )

-        discloss, log_dict_disc = self.loss(qloss, x, xrec, 1,
-                                            self.global_step,
-                                            last_layer=self.get_last_layer(),
-                                            split="val"+suffix,
-                                            predicted_indices=ind
-                                            )
-        rec_loss = log_dict_ae[f"val{suffix}/rec_loss"]
-        self.log(f"val{suffix}/rec_loss", rec_loss,
-                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
-        self.log(f"val{suffix}/aeloss", aeloss,
-                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
+        discloss, log_dict_disc = self.loss(
+            qloss,
+            x,
+            xrec,
+            1,
+            self.global_step,
+            last_layer=self.get_last_layer(),
+            split='val' + suffix,
+            predicted_indices=ind,
+        )
+        rec_loss = log_dict_ae[f'val{suffix}/rec_loss']
+        self.log(
+            f'val{suffix}/rec_loss',
+            rec_loss,
+            prog_bar=True,
+            logger=True,
+            on_step=False,
+            on_epoch=True,
+            sync_dist=True,
+        )
+        self.log(
+            f'val{suffix}/aeloss',
+            aeloss,
+            prog_bar=True,
+            logger=True,
+            on_step=False,
+            on_epoch=True,
+            sync_dist=True,
+        )
        if version.parse(pl.__version__) >= version.parse('1.4.0'):
-            del log_dict_ae[f"val{suffix}/rec_loss"]
+            del log_dict_ae[f'val{suffix}/rec_loss']
        self.log_dict(log_dict_ae)
        self.log_dict(log_dict_disc)
        return self.log_dict

    def configure_optimizers(self):
        lr_d = self.learning_rate
-        lr_g = self.lr_g_factor*self.learning_rate
-        print("lr_d", lr_d)
-        print("lr_g", lr_g)
-        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
-                                  list(self.decoder.parameters())+
-                                  list(self.quantize.parameters())+
-                                  list(self.quant_conv.parameters())+
-                                  list(self.post_quant_conv.parameters()),
-                                  lr=lr_g, betas=(0.5, 0.9))
-        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
-                                    lr=lr_d, betas=(0.5, 0.9))
+        lr_g = self.lr_g_factor * self.learning_rate
+        print('lr_d', lr_d)
+        print('lr_g', lr_g)
+        opt_ae = torch.optim.Adam(
+            list(self.encoder.parameters())
+            + list(self.decoder.parameters())
+            + list(self.quantize.parameters())
+            + list(self.quant_conv.parameters())
+            + list(self.post_quant_conv.parameters()),
+            lr=lr_g,
+            betas=(0.5, 0.9),
+        )
+        opt_disc = torch.optim.Adam(
+            self.loss.discriminator.parameters(), lr=lr_d, betas=(0.5, 0.9)
+        )

        if self.scheduler_config is not None:
            scheduler = instantiate_from_config(self.scheduler_config)

-            print("Setting up LambdaLR scheduler...")
+            print('Setting up LambdaLR scheduler...')
            scheduler = [
                {
-                    'scheduler': LambdaLR(opt_ae, lr_lambda=scheduler.schedule),
+                    'scheduler': LambdaLR(
+                        opt_ae, lr_lambda=scheduler.schedule
+                    ),
                    'interval': 'step',
-                    'frequency': 1
+                    'frequency': 1,
                },
                {
-                    'scheduler': LambdaLR(opt_disc, lr_lambda=scheduler.schedule),
+                    'scheduler': LambdaLR(
+                        opt_disc, lr_lambda=scheduler.schedule
+                    ),
                    'interval': 'step',
-                    'frequency': 1
+                    'frequency': 1,
                },
            ]
            return [opt_ae, opt_disc], scheduler
@@ -235,7 +314,7 @@ class VQModel(pl.LightningModule):
        x = self.get_input(batch, self.image_key)
        x = x.to(self.device)
        if only_inputs:
-            log["inputs"] = x
+            log['inputs'] = x
            return log
        xrec, _ = self(x)
        if x.shape[1] > 3:
@@ -243,21 +322,24 @@ class VQModel(pl.LightningModule):
            assert xrec.shape[1] > 3
            x = self.to_rgb(x)
            xrec = self.to_rgb(xrec)
-        log["inputs"] = x
-        log["reconstructions"] = xrec
+        log['inputs'] = x
+        log['reconstructions'] = xrec
        if plot_ema:
            with self.ema_scope():
                xrec_ema, _ = self(x)
-                if x.shape[1] > 3: xrec_ema = self.to_rgb(xrec_ema)
-                log["reconstructions_ema"] = xrec_ema
+                if x.shape[1] > 3:
+                    xrec_ema = self.to_rgb(xrec_ema)
+                log['reconstructions_ema'] = xrec_ema
        return log

    def to_rgb(self, x):
-        assert self.image_key == "segmentation"
-        if not hasattr(self, "colorize"):
-            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+        assert self.image_key == 'segmentation'
+        if not hasattr(self, 'colorize'):
+            self.register_buffer(
+                'colorize', torch.randn(3, x.shape[1], 1, 1).to(x)
+            )
        x = F.conv2d(x, weight=self.colorize)
-        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+        x = 2.0 * (x - x.min()) / (x.max() - x.min()) - 1.0
        return x


@@ -283,43 +365,50 @@ class VQModelInterface(VQModel):


 class AutoencoderKL(pl.LightningModule):
-    def __init__(self,
-                 ddconfig,
-                 lossconfig,
-                 embed_dim,
-                 ckpt_path=None,
-                 ignore_keys=[],
-                 image_key="image",
-                 colorize_nlabels=None,
-                 monitor=None,
-                 ):
+    def __init__(
+        self,
+        ddconfig,
+        lossconfig,
+        embed_dim,
+        ckpt_path=None,
+        ignore_keys=[],
+        image_key='image',
+        colorize_nlabels=None,
+        monitor=None,
+    ):
        super().__init__()
        self.image_key = image_key
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)
        self.loss = instantiate_from_config(lossconfig)
-        assert ddconfig["double_z"]
-        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
-        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        assert ddconfig['double_z']
+        self.quant_conv = torch.nn.Conv2d(
+            2 * ddconfig['z_channels'], 2 * embed_dim, 1
+        )
+        self.post_quant_conv = torch.nn.Conv2d(
+            embed_dim, ddconfig['z_channels'], 1
+        )
        self.embed_dim = embed_dim
        if colorize_nlabels is not None:
-            assert type(colorize_nlabels)==int
-            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+            assert type(colorize_nlabels) == int
+            self.register_buffer(
+                'colorize', torch.randn(3, colorize_nlabels, 1, 1)
+            )
        if monitor is not None:
            self.monitor = monitor
        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

    def init_from_ckpt(self, path, ignore_keys=list()):
-        sd = torch.load(path, map_location="cpu")["state_dict"]
+        sd = torch.load(path, map_location='cpu')['state_dict']
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
+                    print('Deleting key {} from state_dict.'.format(k))
                    del sd[k]
        self.load_state_dict(sd, strict=False)
-        print(f"Restored from {path}")
+        print(f'Restored from {path}')

    def encode(self, x):
        h = self.encoder(x)
@@ -345,7 +434,11 @@ class AutoencoderKL(pl.LightningModule):
        x = batch[k]
        if len(x.shape) == 3:
            x = x[..., None]
-        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+        x = (
+            x.permute(0, 3, 1, 2)
+            .to(memory_format=torch.contiguous_format)
+            .float()
+        )
        return x

    def training_step(self, batch, batch_idx, optimizer_idx):
@@ -354,44 +447,102 @@ class AutoencoderKL(pl.LightningModule):

        if optimizer_idx == 0:
            # train encoder+decoder+logvar
-            aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
-                                            last_layer=self.get_last_layer(), split="train")
-            self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
-            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+            aeloss, log_dict_ae = self.loss(
+                inputs,
+                reconstructions,
+                posterior,
+                optimizer_idx,
+                self.global_step,
+                last_layer=self.get_last_layer(),
+                split='train',
+            )
+            self.log(
+                'aeloss',
+                aeloss,
+                prog_bar=True,
+                logger=True,
+                on_step=True,
+                on_epoch=True,
+            )
+            self.log_dict(
+                log_dict_ae,
+                prog_bar=False,
+                logger=True,
+                on_step=True,
+                on_epoch=False,
+            )
            return aeloss

        if optimizer_idx == 1:
            # train the discriminator
-            discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
-                                                last_layer=self.get_last_layer(), split="train")
+            discloss, log_dict_disc = self.loss(
+                inputs,
+                reconstructions,
+                posterior,
+                optimizer_idx,
+                self.global_step,
+                last_layer=self.get_last_layer(),
+                split='train',
+            )

-            self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
-            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+            self.log(
+                'discloss',
+                discloss,
+                prog_bar=True,
+                logger=True,
+                on_step=True,
+                on_epoch=True,
+            )
+            self.log_dict(
+                log_dict_disc,
+                prog_bar=False,
+                logger=True,
+                on_step=True,
+                on_epoch=False,
+            )
            return discloss

    def validation_step(self, batch, batch_idx):
        inputs = self.get_input(batch, self.image_key)
        reconstructions, posterior = self(inputs)
-        aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
-                                        last_layer=self.get_last_layer(), split="val")
+        aeloss, log_dict_ae = self.loss(
+            inputs,
+            reconstructions,
+            posterior,
+            0,
+            self.global_step,
+            last_layer=self.get_last_layer(),
+            split='val',
+        )

-        discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
-                                            last_layer=self.get_last_layer(), split="val")
+        discloss, log_dict_disc = self.loss(
+            inputs,
+            reconstructions,
+            posterior,
+            1,
+            self.global_step,
+            last_layer=self.get_last_layer(),
+            split='val',
+        )

-        self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
+        self.log('val/rec_loss', log_dict_ae['val/rec_loss'])
        self.log_dict(log_dict_ae)
        self.log_dict(log_dict_disc)
        return self.log_dict

    def configure_optimizers(self):
        lr = self.learning_rate
-        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
-                                  list(self.decoder.parameters())+
-                                  list(self.quant_conv.parameters())+
-                                  list(self.post_quant_conv.parameters()),
-                                  lr=lr, betas=(0.5, 0.9))
-        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
-                                    lr=lr, betas=(0.5, 0.9))
+        opt_ae = torch.optim.Adam(
+            list(self.encoder.parameters())
+            + list(self.decoder.parameters())
+            + list(self.quant_conv.parameters())
+            + list(self.post_quant_conv.parameters()),
+            lr=lr,
+            betas=(0.5, 0.9),
+        )
+        opt_disc = torch.optim.Adam(
+            self.loss.discriminator.parameters(), lr=lr, betas=(0.5, 0.9)
+        )
        return [opt_ae, opt_disc], []

    def get_last_layer(self):
@@ -409,17 +560,19 @@ class AutoencoderKL(pl.LightningModule):
                assert xrec.shape[1] > 3
                x = self.to_rgb(x)
                xrec = self.to_rgb(xrec)
-            log["samples"] = self.decode(torch.randn_like(posterior.sample()))
-            log["reconstructions"] = xrec
-        log["inputs"] = x
+            log['samples'] = self.decode(torch.randn_like(posterior.sample()))
+            log['reconstructions'] = xrec
+        log['inputs'] = x
        return log

    def to_rgb(self, x):
-        assert self.image_key == "segmentation"
-        if not hasattr(self, "colorize"):
-            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+        assert self.image_key == 'segmentation'
+        if not hasattr(self, 'colorize'):
+            self.register_buffer(
+                'colorize', torch.randn(3, x.shape[1], 1, 1).to(x)
+            )
        x = F.conv2d(x, weight=self.colorize)
-        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+        x = 2.0 * (x - x.min()) / (x.max() - x.min()) - 1.0
        return x


--- a/ldm/models/diffusion/classifier.py
+++ b/ldm/models/diffusion/classifier.py
@@ -10,13 +10,13 @@ from einops import rearrange
 from glob import glob
 from natsort import natsorted

-from ldm.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel
+from ldm.modules.diffusionmodules.openaimodel import (
+    EncoderUNetModel,
+    UNetModel,
+)
 from ldm.util import log_txt_as_img, default, ismap, instantiate_from_config

-__models__ = {
-    'class_label': EncoderUNetModel,
-    'segmentation': UNetModel
-}
+__models__ = {'class_label': EncoderUNetModel, 'segmentation': UNetModel}


 def disabled_train(self, mode=True):
@@ -26,37 +26,49 @@ def disabled_train(self, mode=True):


 class NoisyLatentImageClassifier(pl.LightningModule):
-
-    def __init__(self,
-                 diffusion_path,
-                 num_classes,
-                 ckpt_path=None,
-                 pool='attention',
-                 label_key=None,
-                 diffusion_ckpt_path=None,
-                 scheduler_config=None,
-                 weight_decay=1.e-2,
-                 log_steps=10,
-                 monitor='val/loss',
-                 *args,
-                 **kwargs):
+    def __init__(
+        self,
+        diffusion_path,
+        num_classes,
+        ckpt_path=None,
+        pool='attention',
+        label_key=None,
+        diffusion_ckpt_path=None,
+        scheduler_config=None,
+        weight_decay=1.0e-2,
+        log_steps=10,
+        monitor='val/loss',
+        *args,
+        **kwargs,
+    ):
        super().__init__(*args, **kwargs)
        self.num_classes = num_classes
        # get latest config of diffusion model
-        diffusion_config = natsorted(glob(os.path.join(diffusion_path, 'configs', '*-project.yaml')))[-1]
+        diffusion_config = natsorted(
+            glob(os.path.join(diffusion_path, 'configs', '*-project.yaml'))
+        )[-1]
        self.diffusion_config = OmegaConf.load(diffusion_config).model
        self.diffusion_config.params.ckpt_path = diffusion_ckpt_path
        self.load_diffusion()

        self.monitor = monitor
-        self.numd = self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
-        self.log_time_interval = self.diffusion_model.num_timesteps // log_steps
+        self.numd = (
+            self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
+        )
+        self.log_time_interval = (
+            self.diffusion_model.num_timesteps // log_steps
+        )
        self.log_steps = log_steps

-        self.label_key = label_key if not hasattr(self.diffusion_model, 'cond_stage_key') \
+        self.label_key = (
+            label_key
+            if not hasattr(self.diffusion_model, 'cond_stage_key')
            else self.diffusion_model.cond_stage_key
+        )

-        assert self.label_key is not None, 'label_key neither in diffusion model nor in model.params'
+        assert (
+            self.label_key is not None
+        ), 'label_key neither in diffusion model nor in model.params'

        if self.label_key not in __models__:
            raise NotImplementedError()
@@ -68,22 +80,27 @@ class NoisyLatentImageClassifier(pl.LightningModule):
        self.weight_decay = weight_decay

    def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
-        sd = torch.load(path, map_location="cpu")
-        if "state_dict" in list(sd.keys()):
-            sd = sd["state_dict"]
+        sd = torch.load(path, map_location='cpu')
+        if 'state_dict' in list(sd.keys()):
+            sd = sd['state_dict']
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
-                    print("Deleting key {} from state_dict.".format(k))
+                    print('Deleting key {} from state_dict.'.format(k))
                    del sd[k]
-        missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
-            sd, strict=False)
-        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+        missing, unexpected = (
+            self.load_state_dict(sd, strict=False)
+            if not only_model
+            else self.model.load_state_dict(sd, strict=False)
+        )
+        print(
+            f'Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys'
+        )
        if len(missing) > 0:
-            print(f"Missing Keys: {missing}")
+            print(f'Missing Keys: {missing}')
        if len(unexpected) > 0:
-            print(f"Unexpected Keys: {unexpected}")
+            print(f'Unexpected Keys: {unexpected}')

    def load_diffusion(self):
        model = instantiate_from_config(self.diffusion_config)
@@ -93,17 +110,25 @@ class NoisyLatentImageClassifier(pl.LightningModule):
            param.requires_grad = False

    def load_classifier(self, ckpt_path, pool):
-        model_config = deepcopy(self.diffusion_config.params.unet_config.params)
-        model_config.in_channels = self.diffusion_config.params.unet_config.params.out_channels
+        model_config = deepcopy(
+            self.diffusion_config.params.unet_config.params
+        )
+        model_config.in_channels = (
+            self.diffusion_config.params.unet_config.params.out_channels
+        )
        model_config.out_channels = self.num_classes
        if self.label_key == 'class_label':
            model_config.pool = pool

        self.model = __models__[self.label_key](**model_config)
        if ckpt_path is not None:
-            print('#####################################################################')
+            print(
+                '#####################################################################'
+            )
            print(f'load from ckpt "{ckpt_path}"')
-            print('#####################################################################')
+            print(
+                '#####################################################################'
+            )
            self.init_from_ckpt(ckpt_path)

    @torch.no_grad()
@@ -111,11 +136,19 @@ class NoisyLatentImageClassifier(pl.LightningModule):
        noise = default(noise, lambda: torch.randn_like(x))
        continuous_sqrt_alpha_cumprod = None
        if self.diffusion_model.use_continuous_noise:
-            continuous_sqrt_alpha_cumprod = self.diffusion_model.sample_continuous_noise_level(x.shape[0], t + 1)
+            continuous_sqrt_alpha_cumprod = (
+                self.diffusion_model.sample_continuous_noise_level(
+                    x.shape[0], t + 1
+                )
+            )
            # todo: make sure t+1 is correct here

-        return self.diffusion_model.q_sample(x_start=x, t=t, noise=noise,
-                                             continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod)
+        return self.diffusion_model.q_sample(
+            x_start=x,
+            t=t,
+            noise=noise,
+            continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod,
+        )

    def forward(self, x_noisy, t, *args, **kwargs):
        return self.model(x_noisy, t)
@@ -141,17 +174,21 @@ class NoisyLatentImageClassifier(pl.LightningModule):
            targets = rearrange(targets, 'b h w c -> b c h w')
            for down in range(self.numd):
                h, w = targets.shape[-2:]
-                targets = F.interpolate(targets, size=(h // 2, w // 2), mode='nearest')
+                targets = F.interpolate(
+                    targets, size=(h // 2, w // 2), mode='nearest'
+                )

            # targets = rearrange(targets,'b c h w -> b h w c')

        return targets

-    def compute_top_k(self, logits, labels, k, reduction="mean"):
+    def compute_top_k(self, logits, labels, k, reduction='mean'):
        _, top_ks = torch.topk(logits, k, dim=1)
-        if reduction == "mean":
-            return (top_ks == labels[:, None]).float().sum(dim=-1).mean().item()
-        elif reduction == "none":
+        if reduction == 'mean':
+            return (
+                (top_ks == labels[:, None]).float().sum(dim=-1).mean().item()
+            )
+        elif reduction == 'none':
            return (top_ks == labels[:, None]).float().sum(dim=-1)

    def on_train_epoch_start(self):
@@ -162,29 +199,59 @@ class NoisyLatentImageClassifier(pl.LightningModule):
    def write_logs(self, loss, logits, targets):
        log_prefix = 'train' if self.training else 'val'
        log = {}
-        log[f"{log_prefix}/loss"] = loss.mean()
-        log[f"{log_prefix}/acc@1"] = self.compute_top_k(
-            logits, targets, k=1, reduction="mean"
+        log[f'{log_prefix}/loss'] = loss.mean()
+        log[f'{log_prefix}/acc@1'] = self.compute_top_k(
+            logits, targets, k=1, reduction='mean'
        )
-        log[f"{log_prefix}/acc@5"] = self.compute_top_k(
-            logits, targets, k=5, reduction="mean"
+        log[f'{log_prefix}/acc@5'] = self.compute_top_k(
+            logits, targets, k=5, reduction='mean'
        )

-        self.log_dict(log, prog_bar=False, logger=True, on_step=self.training, on_epoch=True)
-        self.log('loss', log[f"{log_prefix}/loss"], prog_bar=True, logger=False)
-        self.log('global_step', self.global_step, logger=False, on_epoch=False, prog_bar=True)
+        self.log_dict(
+            log,
+            prog_bar=False,
+            logger=True,
+            on_step=self.training,
+            on_epoch=True,
+        )
+        self.log(
+            'loss', log[f'{log_prefix}/loss'], prog_bar=True, logger=False
+        )
+        self.log(
+            'global_step',
+            self.global_step,
+            logger=False,
+            on_epoch=False,
+            prog_bar=True,
+        )
        lr = self.optimizers().param_groups[0]['lr']
-        self.log('lr_abs', lr, on_step=True, logger=True, on_epoch=False, prog_bar=True)
+        self.log(
+            'lr_abs',
+            lr,
+            on_step=True,
+            logger=True,
+            on_epoch=False,
+            prog_bar=True,
+        )

    def shared_step(self, batch, t=None):
-        x, *_ = self.diffusion_model.get_input(batch, k=self.diffusion_model.first_stage_key)
+        x, *_ = self.diffusion_model.get_input(
+            batch, k=self.diffusion_model.first_stage_key
+        )
        targets = self.get_conditioning(batch)
        if targets.dim() == 4:
            targets = targets.argmax(dim=1)
        if t is None:
-            t = torch.randint(0, self.diffusion_model.num_timesteps, (x.shape[0],), device=self.device).long()
+            t = torch.randint(
+                0,
+                self.diffusion_model.num_timesteps,
+                (x.shape[0],),
+                device=self.device,
+            ).long()
        else:
-            t = torch.full(size=(x.shape[0],), fill_value=t, device=self.device).long()
+            t = torch.full(
+                size=(x.shape[0],), fill_value=t, device=self.device
+            ).long()
        x_noisy = self.get_x_noisy(x, t)
        logits = self(x_noisy, t)

@@ -200,8 +267,14 @@ class NoisyLatentImageClassifier(pl.LightningModule):
        return loss

    def reset_noise_accs(self):
-        self.noisy_acc = {t: {'acc@1': [], 'acc@5': []} for t in
-                          range(0, self.diffusion_model.num_timesteps, self.diffusion_model.log_every_t)}
+        self.noisy_acc = {
+            t: {'acc@1': [], 'acc@5': []}
+            for t in range(
+                0,
+                self.diffusion_model.num_timesteps,
+                self.diffusion_model.log_every_t,
+            )
+        }

    def on_validation_start(self):
        self.reset_noise_accs()
@@ -212,24 +285,35 @@ class NoisyLatentImageClassifier(pl.LightningModule):

        for t in self.noisy_acc:
            _, logits, _, targets = self.shared_step(batch, t)
-            self.noisy_acc[t]['acc@1'].append(self.compute_top_k(logits, targets, k=1, reduction='mean'))
-            self.noisy_acc[t]['acc@5'].append(self.compute_top_k(logits, targets, k=5, reduction='mean'))
+            self.noisy_acc[t]['acc@1'].append(
+                self.compute_top_k(logits, targets, k=1, reduction='mean')
+            )
+            self.noisy_acc[t]['acc@5'].append(
+                self.compute_top_k(logits, targets, k=5, reduction='mean')
+            )

        return loss

    def configure_optimizers(self):
-        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
+        optimizer = AdamW(
+            self.model.parameters(),
+            lr=self.learning_rate,
+            weight_decay=self.weight_decay,
+        )

        if self.use_scheduler:
            scheduler = instantiate_from_config(self.scheduler_config)

-            print("Setting up LambdaLR scheduler...")
+            print('Setting up LambdaLR scheduler...')
            scheduler = [
                {
-                    'scheduler': LambdaLR(optimizer, lr_lambda=scheduler.schedule),
+                    'scheduler': LambdaLR(
+                        optimizer, lr_lambda=scheduler.schedule
+                    ),
                    'interval': 'step',
-                    'frequency': 1
-                }]
+                    'frequency': 1,
+                }
+            ]
            return [optimizer], scheduler

        return optimizer
@@ -243,7 +327,7 @@ class NoisyLatentImageClassifier(pl.LightningModule):
        y = self.get_conditioning(batch)

        if self.label_key == 'class_label':
-            y = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
+            y = log_txt_as_img((x.shape[2], x.shape[3]), batch['human_label'])
            log['labels'] = y

        if ismap(y):
@@ -256,10 +340,14 @@ class NoisyLatentImageClassifier(pl.LightningModule):

                log[f'inputs@t{current_time}'] = x_noisy

-                pred = F.one_hot(logits.argmax(dim=1), num_classes=self.num_classes)
+                pred = F.one_hot(
+                    logits.argmax(dim=1), num_classes=self.num_classes
+                )
                pred = rearrange(pred, 'b h w c -> b c h w')

-                log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(pred)
+                log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(
+                    pred
+                )

        for key in log:
            log[key] = log[key][:N]
--- a/ldm/models/diffusion/ddim.py
+++ b/ldm/models/diffusion/ddim.py
@@ -4,88 +4,146 @@ import torch
 import numpy as np
 from tqdm import tqdm
 from functools import partial
+from ldm.dream.devices import choose_torch_device

-from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, \
-    extract_into_tensor
+from ldm.modules.diffusionmodules.util import (
+    make_ddim_sampling_parameters,
+    make_ddim_timesteps,
+    noise_like,
+    extract_into_tensor,
+)


 class DDIMSampler(object):
-    def __init__(self, model, schedule="linear", **kwargs):
+    def __init__(self, model, schedule='linear', device=None, **kwargs):
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule
+        self.device   = device or choose_torch_device()

    def register_buffer(self, name, attr):
        if type(attr) == torch.Tensor:
-            if attr.device != torch.device("cuda"):
-                attr = attr.to(torch.device("cuda"))
+            if attr.device != torch.device(self.device):
+                attr = attr.to(dtype=torch.float32, device=self.device)
        setattr(self, name, attr)

-    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
-        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
-                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
+    def make_schedule(
+        self,
+        ddim_num_steps,
+        ddim_discretize='uniform',
+        ddim_eta=0.0,
+        verbose=True,
+    ):
+        self.ddim_timesteps = make_ddim_timesteps(
+            ddim_discr_method=ddim_discretize,
+            num_ddim_timesteps=ddim_num_steps,
+            num_ddpm_timesteps=self.ddpm_num_timesteps,
+            verbose=verbose,
+        )
        alphas_cumprod = self.model.alphas_cumprod
-        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
-        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
+        assert (
+            alphas_cumprod.shape[0] == self.ddpm_num_timesteps
+        ), 'alphas have to be defined for each timestep'
+        to_torch = (
+            lambda x: x.clone()
+            .detach()
+            .to(torch.float32)
+            .to(self.model.device)
+        )

        self.register_buffer('betas', to_torch(self.model.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
-        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
+        self.register_buffer(
+            'alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev)
+        )

        # calculations for diffusion q(x_t | x_{t-1}) and others
-        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
-        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
+        self.register_buffer(
+            'sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))
+        )
+        self.register_buffer(
+            'sqrt_one_minus_alphas_cumprod',
+            to_torch(np.sqrt(1.0 - alphas_cumprod.cpu())),
+        )
+        self.register_buffer(
+            'log_one_minus_alphas_cumprod',
+            to_torch(np.log(1.0 - alphas_cumprod.cpu())),
+        )
+        self.register_buffer(
+            'sqrt_recip_alphas_cumprod',
+            to_torch(np.sqrt(1.0 / alphas_cumprod.cpu())),
+        )
+        self.register_buffer(
+            'sqrt_recipm1_alphas_cumprod',
+            to_torch(np.sqrt(1.0 / alphas_cumprod.cpu() - 1)),
+        )

        # ddim sampling parameters
-        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
-                                                                                   ddim_timesteps=self.ddim_timesteps,
-                                                                                   eta=ddim_eta,verbose=verbose)
+        (
+            ddim_sigmas,
+            ddim_alphas,
+            ddim_alphas_prev,
+        ) = make_ddim_sampling_parameters(
+            alphacums=alphas_cumprod.cpu(),
+            ddim_timesteps=self.ddim_timesteps,
+            eta=ddim_eta,
+            verbose=verbose,
+        )
        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
-        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
+        self.register_buffer(
+            'ddim_sqrt_one_minus_alphas', np.sqrt(1.0 - ddim_alphas)
+        )
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
-            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
-                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
-        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
+            (1 - self.alphas_cumprod_prev)
+            / (1 - self.alphas_cumprod)
+            * (1 - self.alphas_cumprod / self.alphas_cumprod_prev)
+        )
+        self.register_buffer(
+            'ddim_sigmas_for_original_num_steps',
+            sigmas_for_original_sampling_steps,
+        )

    @torch.no_grad()
-    def sample(self,
-               S,
-               batch_size,
-               shape,
-               conditioning=None,
-               callback=None,
-               normals_sequence=None,
-               img_callback=None,
-               quantize_x0=False,
-               eta=0.,
-               mask=None,
-               x0=None,
-               temperature=1.,
-               noise_dropout=0.,
-               score_corrector=None,
-               corrector_kwargs=None,
-               verbose=True,
-               x_T=None,
-               log_every_t=100,
-               unconditional_guidance_scale=1.,
-               unconditional_conditioning=None,
-               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
-               **kwargs
-               ):
+    def sample(
+        self,
+        S,
+        batch_size,
+        shape,
+        conditioning=None,
+        callback=None,
+        normals_sequence=None,
+        img_callback=None,
+        quantize_x0=False,
+        eta=0.0,
+        mask=None,
+        x0=None,
+        temperature=1.0,
+        noise_dropout=0.0,
+        score_corrector=None,
+        corrector_kwargs=None,
+        verbose=True,
+        x_T=None,
+        log_every_t=100,
+        unconditional_guidance_scale=1.0,
+        unconditional_conditioning=None,
+        # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
+        **kwargs,
+    ):
        if conditioning is not None:
            if isinstance(conditioning, dict):
                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
                if cbs != batch_size:
-                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
+                    print(
+                        f'Warning: Got {cbs} conditionings but batch-size is {batch_size}'
+                    )
            else:
                if conditioning.shape[0] != batch_size:
-                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
+                    print(
+                        f'Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}'
+                    )

        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
        # sampling
@@ -93,30 +151,47 @@ class DDIMSampler(object):
        size = (batch_size, C, H, W)
        print(f'Data shape for DDIM sampling is {size}, eta {eta}')

-        samples, intermediates = self.ddim_sampling(conditioning, size,
-                                                    callback=callback,
-                                                    img_callback=img_callback,
-                                                    quantize_denoised=quantize_x0,
-                                                    mask=mask, x0=x0,
-                                                    ddim_use_original_steps=False,
-                                                    noise_dropout=noise_dropout,
-                                                    temperature=temperature,
-                                                    score_corrector=score_corrector,
-                                                    corrector_kwargs=corrector_kwargs,
-                                                    x_T=x_T,
-                                                    log_every_t=log_every_t,
-                                                    unconditional_guidance_scale=unconditional_guidance_scale,
-                                                    unconditional_conditioning=unconditional_conditioning,
-                                                    )
+        samples, intermediates = self.ddim_sampling(
+            conditioning,
+            size,
+            callback=callback,
+            img_callback=img_callback,
+            quantize_denoised=quantize_x0,
+            mask=mask,
+            x0=x0,
+            ddim_use_original_steps=False,
+            noise_dropout=noise_dropout,
+            temperature=temperature,
+            score_corrector=score_corrector,
+            corrector_kwargs=corrector_kwargs,
+            x_T=x_T,
+            log_every_t=log_every_t,
+            unconditional_guidance_scale=unconditional_guidance_scale,
+            unconditional_conditioning=unconditional_conditioning,
+        )
        return samples, intermediates

    @torch.no_grad()
-    def ddim_sampling(self, cond, shape,
-                      x_T=None, ddim_use_original_steps=False,
-                      callback=None, timesteps=None, quantize_denoised=False,
-                      mask=None, x0=None, img_callback=None, log_every_t=100,
-                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None,):
+    def ddim_sampling(
+        self,
+        cond,
+        shape,
+        x_T=None,
+        ddim_use_original_steps=False,
+        callback=None,
+        timesteps=None,
+        quantize_denoised=False,
+        mask=None,
+        x0=None,
+        img_callback=None,
+        log_every_t=100,
+        temperature=1.0,
+        noise_dropout=0.0,
+        score_corrector=None,
+        corrector_kwargs=None,
+        unconditional_guidance_scale=1.0,
+        unconditional_conditioning=None,
+    ):
        device = self.model.betas.device
        b = shape[0]
        if x_T is None:
@@ -125,17 +200,38 @@ class DDIMSampler(object):
            img = x_T

        if timesteps is None:
-            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
+            timesteps = (
+                self.ddpm_num_timesteps
+                if ddim_use_original_steps
+                else self.ddim_timesteps
+            )
        elif timesteps is not None and not ddim_use_original_steps:
-            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
+            subset_end = (
+                int(
+                    min(timesteps / self.ddim_timesteps.shape[0], 1)
+                    * self.ddim_timesteps.shape[0]
+                )
+                - 1
+            )
            timesteps = self.ddim_timesteps[:subset_end]

        intermediates = {'x_inter': [img], 'pred_x0': [img]}
-        time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
-        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
-        print(f"Running DDIM Sampling with {total_steps} timesteps")
+        time_range = (
+            reversed(range(0, timesteps))
+            if ddim_use_original_steps
+            else np.flip(timesteps)
+        )
+        total_steps = (
+            timesteps if ddim_use_original_steps else timesteps.shape[0]
+        )
+        print(f'Running DDIM Sampling with {total_steps} timesteps')

-        iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps, dynamic_ncols=True)
+        iterator = tqdm(
+            time_range,
+            desc='DDIM Sampler',
+            total=total_steps,
+            dynamic_ncols=True,
+        )

        for i, step in enumerate(iterator):
            index = total_steps - i - 1
@@ -143,18 +239,30 @@ class DDIMSampler(object):

            if mask is not None:
                assert x0 is not None
-                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
-                img = img_orig * mask + (1. - mask) * img
+                img_orig = self.model.q_sample(
+                    x0, ts
+                )  # TODO: deterministic forward pass?
+                img = img_orig * mask + (1.0 - mask) * img

-            outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
-                                      quantize_denoised=quantize_denoised, temperature=temperature,
-                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
-                                      corrector_kwargs=corrector_kwargs,
-                                      unconditional_guidance_scale=unconditional_guidance_scale,
-                                      unconditional_conditioning=unconditional_conditioning)
+            outs = self.p_sample_ddim(
+                img,
+                cond,
+                ts,
+                index=index,
+                use_original_steps=ddim_use_original_steps,
+                quantize_denoised=quantize_denoised,
+                temperature=temperature,
+                noise_dropout=noise_dropout,
+                score_corrector=score_corrector,
+                corrector_kwargs=corrector_kwargs,
+                unconditional_guidance_scale=unconditional_guidance_scale,
+                unconditional_conditioning=unconditional_conditioning,
+            )
            img, pred_x0 = outs
-            if callback: callback(i)
-            if img_callback: img_callback(pred_x0, i)
+            if callback:
+                callback(i)
+            if img_callback:
+                img_callback(pred_x0, i)

            if index % log_every_t == 0 or index == total_steps - 1:
                intermediates['x_inter'].append(img)
@@ -163,42 +271,82 @@ class DDIMSampler(object):
        return img, intermediates

    @torch.no_grad()
-    def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
-                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None):
+    def p_sample_ddim(
+        self,
+        x,
+        c,
+        t,
+        index,
+        repeat_noise=False,
+        use_original_steps=False,
+        quantize_denoised=False,
+        temperature=1.0,
+        noise_dropout=0.0,
+        score_corrector=None,
+        corrector_kwargs=None,
+        unconditional_guidance_scale=1.0,
+        unconditional_conditioning=None,
+    ):
        b, *_, device = *x.shape, x.device

-        if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
+        if (
+            unconditional_conditioning is None
+            or unconditional_guidance_scale == 1.0
+        ):
            e_t = self.model.apply_model(x, t, c)
        else:
            x_in = torch.cat([x] * 2)
            t_in = torch.cat([t] * 2)
            c_in = torch.cat([unconditional_conditioning, c])
            e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
-            e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+            e_t = e_t_uncond + unconditional_guidance_scale * (
+                e_t - e_t_uncond
+            )

        if score_corrector is not None:
-            assert self.model.parameterization == "eps"
-            e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
+            assert self.model.parameterization == 'eps'
+            e_t = score_corrector.modify_score(
+                self.model, e_t, x, t, c, **corrector_kwargs
+            )

-        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
-        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
-        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
-        sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
+        alphas = (
+            self.model.alphas_cumprod
+            if use_original_steps
+            else self.ddim_alphas
+        )
+        alphas_prev = (
+            self.model.alphas_cumprod_prev
+            if use_original_steps
+            else self.ddim_alphas_prev
+        )
+        sqrt_one_minus_alphas = (
+            self.model.sqrt_one_minus_alphas_cumprod
+            if use_original_steps
+            else self.ddim_sqrt_one_minus_alphas
+        )
+        sigmas = (
+            self.model.ddim_sigmas_for_original_num_steps
+            if use_original_steps
+            else self.ddim_sigmas
+        )
        # select parameters corresponding to the currently considered timestep
        a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
        a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
        sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
-        sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
+        sqrt_one_minus_at = torch.full(
+            (b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
+        )

        # current prediction for x_0
        pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
        if quantize_denoised:
            pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
        # direction pointing to x_t
-        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
-        noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
-        if noise_dropout > 0.:
+        dir_xt = (1.0 - a_prev - sigma_t**2).sqrt() * e_t
+        noise = (
+            sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
+        )
+        if noise_dropout > 0.0:
            noise = torch.nn.functional.dropout(noise, p=noise_dropout)
        x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
        return x_prev, pred_x0
@@ -216,26 +364,55 @@ class DDIMSampler(object):

        if noise is None:
            noise = torch.randn_like(x0)
-        return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
-                extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)
+        return (
+            extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0
+            + extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape)
+            * noise
+        )

    @torch.no_grad()
-    def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
-               use_original_steps=False):
+    def decode(
+        self,
+        x_latent,
+        cond,
+        t_start,
+        img_callback=None,
+        unconditional_guidance_scale=1.0,
+        unconditional_conditioning=None,
+        use_original_steps=False,
+    ):

-        timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
+        timesteps = (
+            np.arange(self.ddpm_num_timesteps)
+            if use_original_steps
+            else self.ddim_timesteps
+        )
        timesteps = timesteps[:t_start]

        time_range = np.flip(timesteps)
        total_steps = timesteps.shape[0]
-        print(f"Running DDIM Sampling with {total_steps} timesteps")
+        print(f'Running DDIM Sampling with {total_steps} timesteps')

        iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
        x_dec = x_latent
        for i, step in enumerate(iterator):
            index = total_steps - i - 1
-            ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
-            x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
-                                          unconditional_guidance_scale=unconditional_guidance_scale,
-                                          unconditional_conditioning=unconditional_conditioning)
+            ts = torch.full(
+                (x_latent.shape[0],),
+                step,
+                device=x_latent.device,
+                dtype=torch.long,
+            )
+            x_dec, _ = self.p_sample_ddim(
+                x_dec,
+                cond,
+                ts,
+                index=index,
+                use_original_steps=use_original_steps,
+                unconditional_guidance_scale=unconditional_guidance_scale,
+                unconditional_conditioning=unconditional_conditioning,
+            )
+            if img_callback:
+                img_callback(x_dec, i)
+
        return x_dec
--- a/ldm/models/diffusion/ddpm.py
+++ b/ldm/models/diffusion/ddpm.py
--- a/ldm/models/diffusion/ksampler.py
+++ b/ldm/models/diffusion/ksampler.py
@@ -0,0 +1,104 @@
+"""wrapper around part of Katherine Crowson's k-diffusion library, making it call compatible with other Samplers"""
+import k_diffusion as K
+import torch
+import torch.nn as nn
+from ldm.dream.devices import choose_torch_device
+
+class CFGDenoiser(nn.Module):
+    """Classifier-free-guidance wrapper around a denoiser model.
+
+    forward() evaluates the wrapped model once on a doubled batch
+    (unconditional + conditional) and blends the two halves as
+    uncond + (cond - uncond) * cond_scale.
+    """
+
+    def __init__(self, model):
+        super().__init__()
+        self.inner_model = model
+
+    def forward(self, x, sigma, uncond, cond, cond_scale):
+        # duplicate the inputs so both passes run in a single model call
+        x_in = torch.cat([x] * 2)
+        sigma_in = torch.cat([sigma] * 2)
+        cond_in = torch.cat([uncond, cond])
+        uncond, cond = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2)
+        return uncond + (cond - uncond) * cond_scale
+
+
+class KSampler(object):
+    """Sampler exposing k-diffusion samplers through the sample() signature
+    used by the other samplers in this package.
+
+    schedule selects the k-diffusion routine: sample() looks up
+    K.sampling.sample_<schedule>.
+    """
+
+    def __init__(self, model, schedule='lms', device=None, **kwargs):
+        super().__init__()
+        self.model = K.external.CompVisDenoiser(model)
+        self.schedule = schedule
+        # fall back to choose_torch_device() when no device is given
+        self.device   = device or choose_torch_device()
+
+    # most of these arguments are ignored and are only present for compatibility with
+    # other samplers
+    @torch.no_grad()
+    def sample(
+        self,
+        S,
+        batch_size,
+        shape,
+        conditioning=None,
+        callback=None,
+        normals_sequence=None,
+        img_callback=None,
+        quantize_x0=False,
+        eta=0.0,
+        mask=None,
+        x0=None,
+        temperature=1.0,
+        noise_dropout=0.0,
+        score_corrector=None,
+        corrector_kwargs=None,
+        verbose=True,
+        x_T=None,
+        log_every_t=100,
+        unconditional_guidance_scale=1.0,
+        unconditional_conditioning=None,
+        # this has to come in the same format as the conditioning, e.g. as encoded tokens, ...
+        **kwargs,
+    ):
+        """Run the configured k-diffusion sampler.
+
+        Only S, batch_size, shape, conditioning, img_callback, x_T,
+        unconditional_guidance_scale and unconditional_conditioning are
+        used; every other argument is accepted and ignored.
+
+        Returns a (samples, None) tuple; the second slot is always None.
+        """
+        def route_callback(k_callback_values):
+            # forward the sampler's callback dict to img_callback(x, i)
+            if img_callback is not None:
+                img_callback(k_callback_values['x'], k_callback_values['i'])
+
+        sigmas = self.model.get_sigmas(S)
+        if x_T is not None:
+            x = x_T * sigmas[0]
+        else:
+            x = (
+                torch.randn([batch_size, *shape], device=self.device)
+                * sigmas[0]
+            )   # for GPU draw
+        model_wrap_cfg = CFGDenoiser(self.model)
+        extra_args = {
+            'cond': conditioning,
+            'uncond': unconditional_conditioning,
+            'cond_scale': unconditional_guidance_scale,
+        }
+        return (
+            K.sampling.__dict__[f'sample_{self.schedule}'](
+                model_wrap_cfg, x, sigmas, extra_args=extra_args,
+                callback=route_callback
+            ),
+            None,
+        )
--- a/ldm/models/diffusion/plms.py
+++ b/ldm/models/diffusion/plms.py
@@ -4,120 +4,195 @@ import torch
 import numpy as np
 from tqdm import tqdm
 from functools import partial
+from ldm.dream.devices import choose_torch_device

-from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
+from ldm.modules.diffusionmodules.util import (
+    make_ddim_sampling_parameters,
+    make_ddim_timesteps,
+    noise_like,
+)


 class PLMSSampler(object):
-    def __init__(self, model, schedule="linear", **kwargs):
+    def __init__(self, model, schedule='linear', device=None, **kwargs):
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule
+        self.device   = device if device else choose_torch_device()

    def register_buffer(self, name, attr):
        if type(attr) == torch.Tensor:
-            if attr.device != torch.device("cuda"):
-                attr = attr.to(torch.device("cuda"))
+            if attr.device != torch.device(self.device):
+                attr = attr.to(torch.float32).to(torch.device(self.device))
        setattr(self, name, attr)

-    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
+    def make_schedule(
+        self,
+        ddim_num_steps,
+        ddim_discretize='uniform',
+        ddim_eta=0.0,
+        verbose=True,
+    ):
        if ddim_eta != 0:
            raise ValueError('ddim_eta must be 0 for PLMS')
-        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
-                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
+        self.ddim_timesteps = make_ddim_timesteps(
+            ddim_discr_method=ddim_discretize,
+            num_ddim_timesteps=ddim_num_steps,
+            num_ddpm_timesteps=self.ddpm_num_timesteps,
+            verbose=verbose,
+        )
        alphas_cumprod = self.model.alphas_cumprod
-        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
-        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
+        assert (
+            alphas_cumprod.shape[0] == self.ddpm_num_timesteps
+        ), 'alphas have to be defined for each timestep'
+        to_torch = (
+            lambda x: x.clone()
+            .detach()
+            .to(torch.float32)
+            .to(self.model.device)
+        )

        self.register_buffer('betas', to_torch(self.model.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
-        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
+        self.register_buffer(
+            'alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev)
+        )

        # calculations for diffusion q(x_t | x_{t-1}) and others
-        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
-        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
+        self.register_buffer(
+            'sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))
+        )
+        self.register_buffer(
+            'sqrt_one_minus_alphas_cumprod',
+            to_torch(np.sqrt(1.0 - alphas_cumprod.cpu())),
+        )
+        self.register_buffer(
+            'log_one_minus_alphas_cumprod',
+            to_torch(np.log(1.0 - alphas_cumprod.cpu())),
+        )
+        self.register_buffer(
+            'sqrt_recip_alphas_cumprod',
+            to_torch(np.sqrt(1.0 / alphas_cumprod.cpu())),
+        )
+        self.register_buffer(
+            'sqrt_recipm1_alphas_cumprod',
+            to_torch(np.sqrt(1.0 / alphas_cumprod.cpu() - 1)),
+        )

        # ddim sampling parameters
-        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
-                                                                                   ddim_timesteps=self.ddim_timesteps,
-                                                                                   eta=ddim_eta,verbose=verbose)
+        (
+            ddim_sigmas,
+            ddim_alphas,
+            ddim_alphas_prev,
+        ) = make_ddim_sampling_parameters(
+            alphacums=alphas_cumprod.cpu(),
+            ddim_timesteps=self.ddim_timesteps,
+            eta=ddim_eta,
+            verbose=verbose,
+        )
        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
-        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
+        self.register_buffer(
+            'ddim_sqrt_one_minus_alphas', np.sqrt(1.0 - ddim_alphas)
+        )
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
-            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
-                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
-        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
+            (1 - self.alphas_cumprod_prev)
+            / (1 - self.alphas_cumprod)
+            * (1 - self.alphas_cumprod / self.alphas_cumprod_prev)
+        )
+        self.register_buffer(
+            'ddim_sigmas_for_original_num_steps',
+            sigmas_for_original_sampling_steps,
+        )

    @torch.no_grad()
-    def sample(self,
-               S,
-               batch_size,
-               shape,
-               conditioning=None,
-               callback=None,
-               normals_sequence=None,
-               img_callback=None,
-               quantize_x0=False,
-               eta=0.,
-               mask=None,
-               x0=None,
-               temperature=1.,
-               noise_dropout=0.,
-               score_corrector=None,
-               corrector_kwargs=None,
-               verbose=True,
-               x_T=None,
-               log_every_t=100,
-               unconditional_guidance_scale=1.,
-               unconditional_conditioning=None,
-               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
-               **kwargs
-               ):
+    def sample(
+        self,
+        S,
+        batch_size,
+        shape,
+        conditioning=None,
+        callback=None,
+        normals_sequence=None,
+        img_callback=None,
+        quantize_x0=False,
+        eta=0.0,
+        mask=None,
+        x0=None,
+        temperature=1.0,
+        noise_dropout=0.0,
+        score_corrector=None,
+        corrector_kwargs=None,
+        verbose=True,
+        x_T=None,
+        log_every_t=100,
+        unconditional_guidance_scale=1.0,
+        unconditional_conditioning=None,
+        # this has to come in the same format as the conditioning, e.g. as encoded tokens, ...
+        **kwargs,
+    ):
        if conditioning is not None:
            if isinstance(conditioning, dict):
                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
                if cbs != batch_size:
-                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
+                    print(
+                        f'Warning: Got {cbs} conditionings but batch-size is {batch_size}'
+                    )
            else:
                if conditioning.shape[0] != batch_size:
-                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
+                    print(
+                        f'Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}'
+                    )

        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
-#        print(f'Data shape for PLMS sampling is {size}')
+        #        print(f'Data shape for PLMS sampling is {size}')

-        samples, intermediates = self.plms_sampling(conditioning, size,
-                                                    callback=callback,
-                                                    img_callback=img_callback,
-                                                    quantize_denoised=quantize_x0,
-                                                    mask=mask, x0=x0,
-                                                    ddim_use_original_steps=False,
-                                                    noise_dropout=noise_dropout,
-                                                    temperature=temperature,
-                                                    score_corrector=score_corrector,
-                                                    corrector_kwargs=corrector_kwargs,
-                                                    x_T=x_T,
-                                                    log_every_t=log_every_t,
-                                                    unconditional_guidance_scale=unconditional_guidance_scale,
-                                                    unconditional_conditioning=unconditional_conditioning,
-                                                    )
+        samples, intermediates = self.plms_sampling(
+            conditioning,
+            size,
+            callback=callback,
+            img_callback=img_callback,
+            quantize_denoised=quantize_x0,
+            mask=mask,
+            x0=x0,
+            ddim_use_original_steps=False,
+            noise_dropout=noise_dropout,
+            temperature=temperature,
+            score_corrector=score_corrector,
+            corrector_kwargs=corrector_kwargs,
+            x_T=x_T,
+            log_every_t=log_every_t,
+            unconditional_guidance_scale=unconditional_guidance_scale,
+            unconditional_conditioning=unconditional_conditioning,
+        )
        return samples, intermediates

    @torch.no_grad()
-    def plms_sampling(self, cond, shape,
-                      x_T=None, ddim_use_original_steps=False,
-                      callback=None, timesteps=None, quantize_denoised=False,
-                      mask=None, x0=None, img_callback=None, log_every_t=100,
-                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None,):
+    def plms_sampling(
+        self,
+        cond,
+        shape,
+        x_T=None,
+        ddim_use_original_steps=False,
+        callback=None,
+        timesteps=None,
+        quantize_denoised=False,
+        mask=None,
+        x0=None,
+        img_callback=None,
+        log_every_t=100,
+        temperature=1.0,
+        noise_dropout=0.0,
+        score_corrector=None,
+        corrector_kwargs=None,
+        unconditional_guidance_scale=1.0,
+        unconditional_conditioning=None,
+    ):
        device = self.model.betas.device
        b = shape[0]
        if x_T is None:
@@ -126,42 +201,81 @@ class PLMSSampler(object):
            img = x_T

        if timesteps is None:
-            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
+            timesteps = (
+                self.ddpm_num_timesteps
+                if ddim_use_original_steps
+                else self.ddim_timesteps
+            )
        elif timesteps is not None and not ddim_use_original_steps:
-            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
+            subset_end = (
+                int(
+                    min(timesteps / self.ddim_timesteps.shape[0], 1)
+                    * self.ddim_timesteps.shape[0]
+                )
+                - 1
+            )
            timesteps = self.ddim_timesteps[:subset_end]

        intermediates = {'x_inter': [img], 'pred_x0': [img]}
-        time_range = list(reversed(range(0,timesteps))) if ddim_use_original_steps else np.flip(timesteps)
-        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
-#        print(f"Running PLMS Sampling with {total_steps} timesteps")
+        time_range = (
+            list(reversed(range(0, timesteps)))
+            if ddim_use_original_steps
+            else np.flip(timesteps)
+        )
+        total_steps = (
+            timesteps if ddim_use_original_steps else timesteps.shape[0]
+        )
+        #        print(f"Running PLMS Sampling with {total_steps} timesteps")

-        iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps, dynamic_ncols=True)
+        iterator = tqdm(
+            time_range,
+            desc='PLMS Sampler',
+            total=total_steps,
+            dynamic_ncols=True,
+        )
        old_eps = []

        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((b,), step, device=device, dtype=torch.long)
-            ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)
+            ts_next = torch.full(
+                (b,),
+                time_range[min(i + 1, len(time_range) - 1)],
+                device=device,
+                dtype=torch.long,
+            )

            if mask is not None:
                assert x0 is not None
-                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
-                img = img_orig * mask + (1. - mask) * img
+                img_orig = self.model.q_sample(
+                    x0, ts
+                )  # TODO: deterministic forward pass?
+                img = img_orig * mask + (1.0 - mask) * img

-            outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
-                                      quantize_denoised=quantize_denoised, temperature=temperature,
-                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
-                                      corrector_kwargs=corrector_kwargs,
-                                      unconditional_guidance_scale=unconditional_guidance_scale,
-                                      unconditional_conditioning=unconditional_conditioning,
-                                      old_eps=old_eps, t_next=ts_next)
+            outs = self.p_sample_plms(
+                img,
+                cond,
+                ts,
+                index=index,
+                use_original_steps=ddim_use_original_steps,
+                quantize_denoised=quantize_denoised,
+                temperature=temperature,
+                noise_dropout=noise_dropout,
+                score_corrector=score_corrector,
+                corrector_kwargs=corrector_kwargs,
+                unconditional_guidance_scale=unconditional_guidance_scale,
+                unconditional_conditioning=unconditional_conditioning,
+                old_eps=old_eps,
+                t_next=ts_next,
+            )
            img, pred_x0, e_t = outs
            old_eps.append(e_t)
            if len(old_eps) >= 4:
                old_eps.pop(0)
-            if callback: callback(i)
-            if img_callback: img_callback(pred_x0, i)
+            if callback:
+                callback(i)
+            if img_callback:
+                img_callback(pred_x0, i)

            if index % log_every_t == 0 or index == total_steps - 1:
                intermediates['x_inter'].append(img)
@@ -170,47 +284,95 @@ class PLMSSampler(object):
        return img, intermediates

    @torch.no_grad()
-    def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
-                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None):
+    def p_sample_plms(
+        self,
+        x,
+        c,
+        t,
+        index,
+        repeat_noise=False,
+        use_original_steps=False,
+        quantize_denoised=False,
+        temperature=1.0,
+        noise_dropout=0.0,
+        score_corrector=None,
+        corrector_kwargs=None,
+        unconditional_guidance_scale=1.0,
+        unconditional_conditioning=None,
+        old_eps=None,
+        t_next=None,
+    ):
        b, *_, device = *x.shape, x.device

        def get_model_output(x, t):
-            if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
+            if (
+                unconditional_conditioning is None
+                or unconditional_guidance_scale == 1.0
+            ):
                e_t = self.model.apply_model(x, t, c)
            else:
                x_in = torch.cat([x] * 2)
                t_in = torch.cat([t] * 2)
                c_in = torch.cat([unconditional_conditioning, c])
-                e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
-                e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+                e_t_uncond, e_t = self.model.apply_model(
+                    x_in, t_in, c_in
+                ).chunk(2)
+                e_t = e_t_uncond + unconditional_guidance_scale * (
+                    e_t - e_t_uncond
+                )

            if score_corrector is not None:
-                assert self.model.parameterization == "eps"
-                e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
+                assert self.model.parameterization == 'eps'
+                e_t = score_corrector.modify_score(
+                    self.model, e_t, x, t, c, **corrector_kwargs
+                )

            return e_t

-        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
-        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
-        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
-        sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
+        alphas = (
+            self.model.alphas_cumprod
+            if use_original_steps
+            else self.ddim_alphas
+        )
+        alphas_prev = (
+            self.model.alphas_cumprod_prev
+            if use_original_steps
+            else self.ddim_alphas_prev
+        )
+        sqrt_one_minus_alphas = (
+            self.model.sqrt_one_minus_alphas_cumprod
+            if use_original_steps
+            else self.ddim_sqrt_one_minus_alphas
+        )
+        sigmas = (
+            self.model.ddim_sigmas_for_original_num_steps
+            if use_original_steps
+            else self.ddim_sigmas
+        )

        def get_x_prev_and_pred_x0(e_t, index):
            # select parameters corresponding to the currently considered timestep
            a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
-            a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
+            a_prev = torch.full(
+                (b, 1, 1, 1), alphas_prev[index], device=device
+            )
            sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
-            sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
+            sqrt_one_minus_at = torch.full(
+                (b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
+            )

            # current prediction for x_0
            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
            if quantize_denoised:
                pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
            # direction pointing to x_t
-            dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
-            noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
-            if noise_dropout > 0.:
+            dir_xt = (1.0 - a_prev - sigma_t**2).sqrt() * e_t
+            noise = (
+                sigma_t
+                * noise_like(x.shape, device, repeat_noise)
+                * temperature
+            )
+            if noise_dropout > 0.0:
                noise = torch.nn.functional.dropout(noise, p=noise_dropout)
            x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
            return x_prev, pred_x0
@@ -229,7 +391,12 @@ class PLMSSampler(object):
            e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
        elif len(old_eps) >= 3:
            # 4nd order Pseudo Linear Multistep (Adams-Bashforth)
-            e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24
+            e_t_prime = (
+                55 * e_t
+                - 59 * old_eps[-1]
+                + 37 * old_eps[-2]
+                - 9 * old_eps[-3]
+            ) / 24

        x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)

--- a/ldm/modules/attention.py
+++ b/ldm/modules/attention.py
@@ -13,7 +13,7 @@ def exists(val):


 def uniq(arr):
-    return{el: True for el in arr}.keys()
+    return {el: True for el in arr}.keys()


 def default(val, d):
@@ -45,19 +45,18 @@ class GEGLU(nn.Module):


 class FeedForward(nn.Module):
-    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
+    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = default(dim_out, dim)
-        project_in = nn.Sequential(
-            nn.Linear(dim, inner_dim),
-            nn.GELU()
-        ) if not glu else GEGLU(dim, inner_dim)
+        project_in = (
+            nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
+            if not glu
+            else GEGLU(dim, inner_dim)
+        )

        self.net = nn.Sequential(
-            project_in,
-            nn.Dropout(dropout),
-            nn.Linear(inner_dim, dim_out)
+            project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
        )

    def forward(self, x):
@@ -74,7 +73,9 @@ def zero_module(module):


 def Normalize(in_channels):
-    return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+    return torch.nn.GroupNorm(
+        num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
+    )


 class LinearAttention(nn.Module):
@@ -82,17 +83,28 @@ class LinearAttention(nn.Module):
        super().__init__()
        self.heads = heads
        hidden_dim = dim_head * heads
-        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
+        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
        self.to_out = nn.Conv2d(hidden_dim, dim, 1)

    def forward(self, x):
        b, c, h, w = x.shape
        qkv = self.to_qkv(x)
-        q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
-        k = k.softmax(dim=-1)  
+        q, k, v = rearrange(
+            qkv,
+            'b (qkv heads c) h w -> qkv b heads c (h w)',
+            heads=self.heads,
+            qkv=3,
+        )
+        k = k.softmax(dim=-1)
        context = torch.einsum('bhdn,bhen->bhde', k, v)
        out = torch.einsum('bhde,bhdn->bhen', context, q)
-        out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
+        out = rearrange(
+            out,
+            'b heads c (h w) -> b (heads c) h w',
+            heads=self.heads,
+            h=h,
+            w=w,
+        )
        return self.to_out(out)


@@ -102,26 +114,18 @@ class SpatialSelfAttention(nn.Module):
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
-        self.q = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.k = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.v = torch.nn.Conv2d(in_channels,
-                                 in_channels,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-        self.proj_out = torch.nn.Conv2d(in_channels,
-                                        in_channels,
-                                        kernel_size=1,
-                                        stride=1,
-                                        padding=0)
+        self.q = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.k = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.v = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+        self.proj_out = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )

    def forward(self, x):
        h_ = x
@@ -131,12 +135,12 @@ class SpatialSelfAttention(nn.Module):
        v = self.v(h_)

        # compute attention
-        b,c,h,w = q.shape
+        b, c, h, w = q.shape
        q = rearrange(q, 'b c h w -> b (h w) c')
        k = rearrange(k, 'b c h w -> b c (h w)')
        w_ = torch.einsum('bij,bjk->bik', q, k)

-        w_ = w_ * (int(c)**(-0.5))
+        w_ = w_ * (int(c) ** (-0.5))
        w_ = torch.nn.functional.softmax(w_, dim=2)

        # attend to values
@@ -146,16 +150,18 @@ class SpatialSelfAttention(nn.Module):
        h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
        h_ = self.proj_out(h_)

-        return x+h_
+        return x + h_


 class CrossAttention(nn.Module):
-    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
+    def __init__(
+        self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0
+    ):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)

-        self.scale = dim_head ** -0.5
+        self.scale = dim_head**-0.5
        self.heads = heads

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
@@ -163,8 +169,7 @@ class CrossAttention(nn.Module):
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(
-            nn.Linear(inner_dim, query_dim),
-            nn.Dropout(dropout)
+            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
        )

    def forward(self, x, context=None, mask=None):
@@ -175,7 +180,9 @@ class CrossAttention(nn.Module):
        k = self.to_k(context)
        v = self.to_v(context)

-        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+        q, k, v = map(
+            lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v)
+        )

        sim = einsum('b i d, b j d -> b i j', q, k) * self.scale

@@ -194,21 +201,40 @@ class CrossAttention(nn.Module):


 class BasicTransformerBlock(nn.Module):
-    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True):
+    def __init__(
+        self,
+        dim,
+        n_heads,
+        d_head,
+        dropout=0.0,
+        context_dim=None,
+        gated_ff=True,
+        checkpoint=True,
+    ):
        super().__init__()
-        self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout)  # is a self-attention
+        self.attn1 = CrossAttention(
+            query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
+        )  # is a self-attention
        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
-        self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim,
-                                    heads=n_heads, dim_head=d_head, dropout=dropout)  # is self-attn if context is none
+        self.attn2 = CrossAttention(
+            query_dim=dim,
+            context_dim=context_dim,
+            heads=n_heads,
+            dim_head=d_head,
+            dropout=dropout,
+        )  # is self-attn if context is none
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)
        self.checkpoint = checkpoint

    def forward(self, x, context=None):
-        return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
+        return checkpoint(
+            self._forward, (x, context), self.parameters(), self.checkpoint
+        )

    def _forward(self, x, context=None):
+        x = x.contiguous() if x.device.type == 'mps' else x
        x = self.attn1(self.norm1(x)) + x
        x = self.attn2(self.norm2(x), context=context) + x
        x = self.ff(self.norm3(x)) + x
@@ -223,29 +249,43 @@ class SpatialTransformer(nn.Module):
    Then apply standard transformer action.
    Finally, reshape to image
    """
-    def __init__(self, in_channels, n_heads, d_head,
-                 depth=1, dropout=0., context_dim=None):
+
+    def __init__(
+        self,
+        in_channels,
+        n_heads,
+        d_head,
+        depth=1,
+        dropout=0.0,
+        context_dim=None,
+    ):
        super().__init__()
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = Normalize(in_channels)

-        self.proj_in = nn.Conv2d(in_channels,
-                                 inner_dim,
-                                 kernel_size=1,
-                                 stride=1,
-                                 padding=0)
-
-        self.transformer_blocks = nn.ModuleList(
-            [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
-                for d in range(depth)]
+        self.proj_in = nn.Conv2d(
+            in_channels, inner_dim, kernel_size=1, stride=1, padding=0
        )

-        self.proj_out = zero_module(nn.Conv2d(inner_dim,
-                                              in_channels,
-                                              kernel_size=1,
-                                              stride=1,
-                                              padding=0))
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    n_heads,
+                    d_head,
+                    dropout=dropout,
+                    context_dim=context_dim,
+                )
+                for d in range(depth)
+            ]
+        )
+
+        self.proj_out = zero_module(
+            nn.Conv2d(
+                inner_dim, in_channels, kernel_size=1, stride=1, padding=0
+            )
+        )

    def forward(self, x, context=None):
        # note: if no context is given, cross-attention defaults to self-attention
@@ -258,4 +298,4 @@ class SpatialTransformer(nn.Module):
            x = block(x, context=context)
        x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
        x = self.proj_out(x)
-        return x + x_in
+        return x + x_in
--- a/ldm/modules/diffusionmodules/model.py
+++ b/ldm/modules/diffusionmodules/model.py
--- a/ldm/modules/diffusionmodules/openaimodel.py
+++ b/ldm/modules/diffusionmodules/openaimodel.py
@@ -24,6 +24,7 @@ from ldm.modules.attention import SpatialTransformer
 def convert_module_to_f16(x):
    pass

+
 def convert_module_to_f32(x):
    pass

@@ -42,7 +43,9 @@ class AttentionPool2d(nn.Module):
        output_dim: int = None,
    ):
        super().__init__()
-        self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5)
+        self.positional_embedding = nn.Parameter(
+            th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5
+        )
        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
        self.num_heads = embed_dim // num_heads_channels
@@ -97,37 +100,45 @@ class Upsample(nn.Module):
                 upsampling occurs in the inner-two dimensions.
    """

-    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
+    def __init__(
+        self, channels, use_conv, dims=2, out_channels=None, padding=1
+    ):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        if use_conv:
-            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)
+            self.conv = conv_nd(
+                dims, self.channels, self.out_channels, 3, padding=padding
+            )

    def forward(self, x):
        assert x.shape[1] == self.channels
        if self.dims == 3:
            x = F.interpolate(
-                x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
+                x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode='nearest'
            )
        else:
-            x = F.interpolate(x, scale_factor=2, mode="nearest")
+            x = F.interpolate(x, scale_factor=2, mode='nearest')
        if self.use_conv:
            x = self.conv(x)
        return x

+
 class TransposedUpsample(nn.Module):
-    'Learned 2x upsampling without padding'
+    """Learned 2x upsampling without padding"""
+
    def __init__(self, channels, out_channels=None, ks=5):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels

-        self.up = nn.ConvTranspose2d(self.channels,self.out_channels,kernel_size=ks,stride=2)
+        self.up = nn.ConvTranspose2d(
+            self.channels, self.out_channels, kernel_size=ks, stride=2
+        )

-    def forward(self,x):
+    def forward(self, x):
        return self.up(x)


@@ -140,7 +151,9 @@ class Downsample(nn.Module):
                 downsampling occurs in the inner-two dimensions.
    """

-    def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1):
+    def __init__(
+        self, channels, use_conv, dims=2, out_channels=None, padding=1
+    ):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
@@ -149,7 +162,12 @@ class Downsample(nn.Module):
        stride = 2 if dims != 3 else (1, 2, 2)
        if use_conv:
            self.op = conv_nd(
-                dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
+                dims,
+                self.channels,
+                self.out_channels,
+                3,
+                stride=stride,
+                padding=padding,
            )
        else:
            assert self.channels == self.out_channels
@@ -219,7 +237,9 @@ class ResBlock(TimestepBlock):
            nn.SiLU(),
            linear(
                emb_channels,
-                2 * self.out_channels if use_scale_shift_norm else self.out_channels,
+                2 * self.out_channels
+                if use_scale_shift_norm
+                else self.out_channels,
            ),
        )
        self.out_layers = nn.Sequential(
@@ -227,7 +247,9 @@ class ResBlock(TimestepBlock):
            nn.SiLU(),
            nn.Dropout(p=dropout),
            zero_module(
-                conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
+                conv_nd(
+                    dims, self.out_channels, self.out_channels, 3, padding=1
+                )
            ),
        )

@@ -238,7 +260,9 @@ class ResBlock(TimestepBlock):
                dims, channels, self.out_channels, 3, padding=1
            )
        else:
-            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
+            self.skip_connection = conv_nd(
+                dims, channels, self.out_channels, 1
+            )

    def forward(self, x, emb):
        """
@@ -251,7 +275,6 @@ class ResBlock(TimestepBlock):
            self._forward, (x, emb), self.parameters(), self.use_checkpoint
        )

-
    def _forward(self, x, emb):
        if self.updown:
            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
@@ -297,7 +320,7 @@ class AttentionBlock(nn.Module):
        else:
            assert (
                channels % num_head_channels == 0
-            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
+            ), f'q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}'
            self.num_heads = channels // num_head_channels
        self.use_checkpoint = use_checkpoint
        self.norm = normalization(channels)
@@ -312,8 +335,10 @@ class AttentionBlock(nn.Module):
        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))

    def forward(self, x):
-        return checkpoint(self._forward, (x,), self.parameters(), True)   # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
-        #return pt_checkpoint(self._forward, x)  # pytorch
+        return checkpoint(
+            self._forward, (x,), self.parameters(), True
+        )   # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
+        # return pt_checkpoint(self._forward, x)  # pytorch

    def _forward(self, x):
        b, c, *spatial = x.shape
@@ -340,7 +365,7 @@ def count_flops_attn(model, _x, y):
    # We perform two matmuls with the same number of ops.
    # The first computes the weight matrix, the second computes
    # the combination of the value vectors.
-    matmul_ops = 2 * b * (num_spatial ** 2) * c
+    matmul_ops = 2 * b * (num_spatial**2) * c
    model.total_ops += th.DoubleTensor([matmul_ops])


@@ -362,13 +387,15 @@ class QKVAttentionLegacy(nn.Module):
        bs, width, length = qkv.shape
        assert width % (3 * self.n_heads) == 0
        ch = width // (3 * self.n_heads)
-        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
+        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(
+            ch, dim=1
+        )
        scale = 1 / math.sqrt(math.sqrt(ch))
        weight = th.einsum(
-            "bct,bcs->bts", q * scale, k * scale
+            'bct,bcs->bts', q * scale, k * scale
        )  # More stable with f16 than dividing afterwards
        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
-        a = th.einsum("bts,bcs->bct", weight, v)
+        a = th.einsum('bts,bcs->bct', weight, v)
        return a.reshape(bs, -1, length)

    @staticmethod
@@ -397,12 +424,14 @@ class QKVAttention(nn.Module):
        q, k, v = qkv.chunk(3, dim=1)
        scale = 1 / math.sqrt(math.sqrt(ch))
        weight = th.einsum(
-            "bct,bcs->bts",
+            'bct,bcs->bts',
            (q * scale).view(bs * self.n_heads, ch, length),
            (k * scale).view(bs * self.n_heads, ch, length),
        )  # More stable with f16 than dividing afterwards
        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
-        a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
+        a = th.einsum(
+            'bts,bcs->bct', weight, v.reshape(bs * self.n_heads, ch, length)
+        )
        return a.reshape(bs, -1, length)

    @staticmethod
@@ -461,19 +490,24 @@ class UNetModel(nn.Module):
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
-        use_spatial_transformer=False,    # custom transformer support
-        transformer_depth=1,              # custom transformer support
-        context_dim=None,                 # custom transformer support
-        n_embed=None,                     # custom support for prediction of discrete ids into codebook of first stage vq model
+        use_spatial_transformer=False,  # custom transformer support
+        transformer_depth=1,  # custom transformer support
+        context_dim=None,  # custom transformer support
+        n_embed=None,  # custom support for prediction of discrete ids into codebook of first stage vq model
        legacy=True,
    ):
        super().__init__()
        if use_spatial_transformer:
-            assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
+            assert (
+                context_dim is not None
+            ), 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'

        if context_dim is not None:
-            assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
+            assert (
+                use_spatial_transformer
+            ), 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
            from omegaconf.listconfig import ListConfig
+
            if type(context_dim) == ListConfig:
                context_dim = list(context_dim)

@@ -481,10 +515,14 @@ class UNetModel(nn.Module):
            num_heads_upsample = num_heads

        if num_heads == -1:
-            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
+            assert (
+                num_head_channels != -1
+            ), 'Either num_heads or num_head_channels has to be set'

        if num_head_channels == -1:
-            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
+            assert (
+                num_heads != -1
+            ), 'Either num_heads or num_head_channels has to be set'

        self.image_size = image_size
        self.in_channels = in_channels
@@ -545,8 +583,12 @@ class UNetModel(nn.Module):
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    if legacy:
-                        #num_heads = 1
-                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+                        # num_heads = 1
+                        dim_head = (
+                            ch // num_heads
+                            if use_spatial_transformer
+                            else num_head_channels
+                        )
                    layers.append(
                        AttentionBlock(
                            ch,
@@ -554,8 +596,14 @@ class UNetModel(nn.Module):
                            num_heads=num_heads,
                            num_head_channels=dim_head,
                            use_new_attention_order=use_new_attention_order,
-                        ) if not use_spatial_transformer else SpatialTransformer(
-                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
+                        )
+                        if not use_spatial_transformer
+                        else SpatialTransformer(
+                            ch,
+                            num_heads,
+                            dim_head,
+                            depth=transformer_depth,
+                            context_dim=context_dim,
                        )
                    )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
@@ -592,8 +640,12 @@ class UNetModel(nn.Module):
            num_heads = ch // num_head_channels
            dim_head = num_head_channels
        if legacy:
-            #num_heads = 1
-            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+            # num_heads = 1
+            dim_head = (
+                ch // num_heads
+                if use_spatial_transformer
+                else num_head_channels
+            )
        self.middle_block = TimestepEmbedSequential(
            ResBlock(
                ch,
@@ -609,9 +661,15 @@ class UNetModel(nn.Module):
                num_heads=num_heads,
                num_head_channels=dim_head,
                use_new_attention_order=use_new_attention_order,
-            ) if not use_spatial_transformer else SpatialTransformer(
-                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
-                        ),
+            )
+            if not use_spatial_transformer
+            else SpatialTransformer(
+                ch,
+                num_heads,
+                dim_head,
+                depth=transformer_depth,
+                context_dim=context_dim,
+            ),
            ResBlock(
                ch,
                time_embed_dim,
@@ -646,8 +704,12 @@ class UNetModel(nn.Module):
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    if legacy:
-                        #num_heads = 1
-                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+                        # num_heads = 1
+                        dim_head = (
+                            ch // num_heads
+                            if use_spatial_transformer
+                            else num_head_channels
+                        )
                    layers.append(
                        AttentionBlock(
                            ch,
@@ -655,8 +717,14 @@ class UNetModel(nn.Module):
                            num_heads=num_heads_upsample,
                            num_head_channels=dim_head,
                            use_new_attention_order=use_new_attention_order,
-                        ) if not use_spatial_transformer else SpatialTransformer(
-                            ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim
+                        )
+                        if not use_spatial_transformer
+                        else SpatialTransformer(
+                            ch,
+                            num_heads,
+                            dim_head,
+                            depth=transformer_depth,
+                            context_dim=context_dim,
                        )
                    )
                if level and i == num_res_blocks:
@@ -673,7 +741,9 @@ class UNetModel(nn.Module):
                            up=True,
                        )
                        if resblock_updown
-                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                        else Upsample(
+                            ch, conv_resample, dims=dims, out_channels=out_ch
+                        )
                    )
                    ds //= 2
                self.output_blocks.append(TimestepEmbedSequential(*layers))
@@ -682,14 +752,16 @@ class UNetModel(nn.Module):
        self.out = nn.Sequential(
            normalization(ch),
            nn.SiLU(),
-            zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
+            zero_module(
+                conv_nd(dims, model_channels, out_channels, 3, padding=1)
+            ),
        )
        if self.predict_codebook_ids:
            self.id_predictor = nn.Sequential(
-            normalization(ch),
-            conv_nd(dims, model_channels, n_embed, 1),
-            #nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
-        )
+                normalization(ch),
+                conv_nd(dims, model_channels, n_embed, 1),
+                # nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
+            )

    def convert_to_fp16(self):
        """
@@ -707,7 +779,7 @@ class UNetModel(nn.Module):
        self.middle_block.apply(convert_module_to_f32)
        self.output_blocks.apply(convert_module_to_f32)

-    def forward(self, x, timesteps=None, context=None, y=None,**kwargs):
+    def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
        """
        Apply the model to an input batch.
        :param x: an [N x C x ...] Tensor of inputs.
@@ -718,9 +790,11 @@ class UNetModel(nn.Module):
        """
        assert (y is not None) == (
            self.num_classes is not None
-        ), "must specify y if and only if the model is class-conditional"
+        ), 'must specify y if and only if the model is class-conditional'
        hs = []
-        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
+        t_emb = timestep_embedding(
+            timesteps, self.model_channels, repeat_only=False
+        )
        emb = self.time_embed(t_emb)

        if self.num_classes is not None:
@@ -768,9 +842,9 @@ class EncoderUNetModel(nn.Module):
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
-        pool="adaptive",
+        pool='adaptive',
        *args,
-        **kwargs
+        **kwargs,
    ):
        super().__init__()

@@ -888,7 +962,7 @@ class EncoderUNetModel(nn.Module):
        )
        self._feature_size += ch
        self.pool = pool
-        if pool == "adaptive":
+        if pool == 'adaptive':
            self.out = nn.Sequential(
                normalization(ch),
                nn.SiLU(),
@@ -896,7 +970,7 @@ class EncoderUNetModel(nn.Module):
                zero_module(conv_nd(dims, ch, out_channels, 1)),
                nn.Flatten(),
            )
-        elif pool == "attention":
+        elif pool == 'attention':
            assert num_head_channels != -1
            self.out = nn.Sequential(
                normalization(ch),
@@ -905,13 +979,13 @@ class EncoderUNetModel(nn.Module):
                    (image_size // ds), ch, num_head_channels, out_channels
                ),
            )
-        elif pool == "spatial":
+        elif pool == 'spatial':
            self.out = nn.Sequential(
                nn.Linear(self._feature_size, 2048),
                nn.ReLU(),
                nn.Linear(2048, self.out_channels),
            )
-        elif pool == "spatial_v2":
+        elif pool == 'spatial_v2':
            self.out = nn.Sequential(
                nn.Linear(self._feature_size, 2048),
                normalization(2048),
@@ -919,7 +993,7 @@ class EncoderUNetModel(nn.Module):
                nn.Linear(2048, self.out_channels),
            )
        else:
-            raise NotImplementedError(f"Unexpected {pool} pooling")
+            raise NotImplementedError(f'Unexpected {pool} pooling')

    def convert_to_fp16(self):
        """
@@ -942,20 +1016,21 @@ class EncoderUNetModel(nn.Module):
        :param timesteps: a 1-D batch of timesteps.
        :return: an [N x K] Tensor of outputs.
        """
-        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
+        emb = self.time_embed(
+            timestep_embedding(timesteps, self.model_channels)
+        )

        results = []
        h = x.type(self.dtype)
        for module in self.input_blocks:
            h = module(h, emb)
-            if self.pool.startswith("spatial"):
+            if self.pool.startswith('spatial'):
                results.append(h.type(x.dtype).mean(dim=(2, 3)))
        h = self.middle_block(h, emb)
-        if self.pool.startswith("spatial"):
+        if self.pool.startswith('spatial'):
            results.append(h.type(x.dtype).mean(dim=(2, 3)))
            h = th.cat(results, axis=-1)
            return self.out(h)
        else:
            h = h.type(x.dtype)
            return self.out(h)
-
--- a/ldm/modules/diffusionmodules/util.py
+++ b/ldm/modules/diffusionmodules/util.py
@@ -18,15 +18,24 @@ from einops import repeat
 from ldm.util import instantiate_from_config


-def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
-    if schedule == "linear":
+def make_beta_schedule(
+    schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3
+):
+    if schedule == 'linear':
        betas = (
-                torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
+            torch.linspace(
+                linear_start**0.5,
+                linear_end**0.5,
+                n_timestep,
+                dtype=torch.float64,
+            )
+            ** 2
        )

-    elif schedule == "cosine":
+    elif schedule == 'cosine':
        timesteps = (
-                torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
+            torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep
+            + cosine_s
        )
        alphas = timesteps / (1 + cosine_s) * np.pi / 2
        alphas = torch.cos(alphas).pow(2)
@@ -34,23 +43,41 @@ def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2,
        betas = 1 - alphas[1:] / alphas[:-1]
        betas = np.clip(betas, a_min=0, a_max=0.999)

-    elif schedule == "sqrt_linear":
-        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
-    elif schedule == "sqrt":
-        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
+    elif schedule == 'sqrt_linear':
+        betas = torch.linspace(
+            linear_start, linear_end, n_timestep, dtype=torch.float64
+        )
+    elif schedule == 'sqrt':
+        betas = (
+            torch.linspace(
+                linear_start, linear_end, n_timestep, dtype=torch.float64
+            )
+            ** 0.5
+        )
    else:
        raise ValueError(f"schedule '{schedule}' unknown.")
    return betas.numpy()


-def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
+def make_ddim_timesteps(
+    ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True
+):
    if ddim_discr_method == 'uniform':
        c = num_ddpm_timesteps // num_ddim_timesteps
        ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
    elif ddim_discr_method == 'quad':
-        ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int)
+        ddim_timesteps = (
+            (
+                np.linspace(
+                    0, np.sqrt(num_ddpm_timesteps * 0.8), num_ddim_timesteps
+                )
+            )
+            ** 2
+        ).astype(int)
    else:
-        raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')
+        raise NotImplementedError(
+            f'There is no ddim discretization method called "{ddim_discr_method}"'
+        )

    # assert ddim_timesteps.shape[0] == num_ddim_timesteps
    # add one to get the final alpha values right (the ones from first scale to data during sampling)
@@ -60,17 +87,27 @@ def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timestep
    return steps_out


-def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
+def make_ddim_sampling_parameters(
+    alphacums, ddim_timesteps, eta, verbose=True
+):
    # select alphas for computing the variance schedule
    alphas = alphacums[ddim_timesteps]
-    alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
+    alphas_prev = np.asarray(
+        [alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist()
+    )

    # according the the formula provided in https://arxiv.org/abs/2010.02502
-    sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
+    sigmas = eta * np.sqrt(
+        (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)
+    )
    if verbose:
-        print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
-        print(f'For the chosen value of eta, which is {eta}, '
-              f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
+        print(
+            f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}'
+        )
+        print(
+            f'For the chosen value of eta, which is {eta}, '
+            f'this results in the following sigma_t schedule for ddim sampler {sigmas}'
+        )
    return sigmas, alphas, alphas_prev


@@ -109,7 +146,9 @@ def checkpoint(func, inputs, params, flag):
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
-    if flag:
+    if (
+        False
+    ):   # disabled checkpointing to allow requires_grad = False for main model
        args = tuple(inputs) + tuple(params)
        return CheckpointFunction.apply(func, len(inputs), *args)
    else:
@@ -129,7 +168,9 @@ class CheckpointFunction(torch.autograd.Function):

    @staticmethod
    def backward(ctx, *output_grads):
-        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
+        ctx.input_tensors = [
+            x.detach().requires_grad_(True) for x in ctx.input_tensors
+        ]
        with torch.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
@@ -160,12 +201,16 @@ def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
    if not repeat_only:
        half = dim // 2
        freqs = torch.exp(
-            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+            -math.log(max_period)
+            * torch.arange(start=0, end=half, dtype=torch.float32)
+            / half
        ).to(device=timesteps.device)
        args = timesteps[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
-            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+            embedding = torch.cat(
+                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
+            )
    else:
        embedding = repeat(timesteps, 'b -> b d', d=dim)
    return embedding
@@ -215,6 +260,7 @@ class GroupNorm32(nn.GroupNorm):
    def forward(self, x):
        return super().forward(x.float()).type(x.dtype)

+
 def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
@@ -225,7 +271,7 @@ def conv_nd(dims, *args, **kwargs):
        return nn.Conv2d(*args, **kwargs)
    elif dims == 3:
        return nn.Conv3d(*args, **kwargs)
-    raise ValueError(f"unsupported dimensions: {dims}")
+    raise ValueError(f'unsupported dimensions: {dims}')


 def linear(*args, **kwargs):
@@ -245,15 +291,16 @@ def avg_pool_nd(dims, *args, **kwargs):
        return nn.AvgPool2d(*args, **kwargs)
    elif dims == 3:
        return nn.AvgPool3d(*args, **kwargs)
-    raise ValueError(f"unsupported dimensions: {dims}")
+    raise ValueError(f'unsupported dimensions: {dims}')


 class HybridConditioner(nn.Module):
-
    def __init__(self, c_concat_config, c_crossattn_config):
        super().__init__()
        self.concat_conditioner = instantiate_from_config(c_concat_config)
-        self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
+        self.crossattn_conditioner = instantiate_from_config(
+            c_crossattn_config
+        )

    def forward(self, c_concat, c_crossattn):
        c_concat = self.concat_conditioner(c_concat)
@@ -262,6 +309,8 @@ class HybridConditioner(nn.Module):


 def noise_like(shape, device, repeat=False):
-    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
+    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(
+        shape[0], *((1,) * (len(shape) - 1))
+    )
    noise = lambda: torch.randn(shape, device=device)
-    return repeat_noise() if repeat else noise()
+    return repeat_noise() if repeat else noise()
--- a/ldm/modules/distributions/distributions.py
+++ b/ldm/modules/distributions/distributions.py
@@ -30,33 +30,45 @@ class DiagonalGaussianDistribution(object):
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
-            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)
+            self.var = self.std = torch.zeros_like(self.mean).to(
+                device=self.parameters.device
+            )

    def sample(self):
-        x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
+        x = self.mean + self.std * torch.randn(self.mean.shape).to(
+            device=self.parameters.device
+        )
        return x

    def kl(self, other=None):
        if self.deterministic:
-            return torch.Tensor([0.])
+            return torch.Tensor([0.0])
        else:
            if other is None:
-                return 0.5 * torch.sum(torch.pow(self.mean, 2)
-                                       + self.var - 1.0 - self.logvar,
-                                       dim=[1, 2, 3])
+                return 0.5 * torch.sum(
+                    torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
+                    dim=[1, 2, 3],
+                )
            else:
                return 0.5 * torch.sum(
                    torch.pow(self.mean - other.mean, 2) / other.var
-                    + self.var / other.var - 1.0 - self.logvar + other.logvar,
-                    dim=[1, 2, 3])
+                    + self.var / other.var
+                    - 1.0
+                    - self.logvar
+                    + other.logvar,
+                    dim=[1, 2, 3],
+                )

-    def nll(self, sample, dims=[1,2,3]):
+    def nll(self, sample, dims=[1, 2, 3]):
        if self.deterministic:
-            return torch.Tensor([0.])
+            return torch.Tensor([0.0])
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(
-            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
-            dim=dims)
+            logtwopi
+            + self.logvar
+            + torch.pow(sample - self.mean, 2) / self.var,
+            dim=dims,
+        )

    def mode(self):
        return self.mean
@@ -74,7 +86,7 @@ def normal_kl(mean1, logvar1, mean2, logvar2):
        if isinstance(obj, torch.Tensor):
            tensor = obj
            break
-    assert tensor is not None, "at least one argument must be a Tensor"
+    assert tensor is not None, 'at least one argument must be a Tensor'

    # Force variances to be Tensors. Broadcasting helps convert scalars to
    # Tensors, but it does not work for torch.exp().
--- a/ldm/modules/ema.py
+++ b/ldm/modules/ema.py
@@ -10,24 +10,30 @@ class LitEma(nn.Module):

        self.m_name2s_name = {}
        self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
-        self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates
-                             else torch.tensor(-1,dtype=torch.int))
+        self.register_buffer(
+            'num_updates',
+            torch.tensor(0, dtype=torch.int)
+            if use_num_upates
+            else torch.tensor(-1, dtype=torch.int),
+        )

        for name, p in model.named_parameters():
            if p.requires_grad:
-                #remove as '.'-character is not allowed in buffers
-                s_name = name.replace('.','')
-                self.m_name2s_name.update({name:s_name})
-                self.register_buffer(s_name,p.clone().detach().data)
+                # remove as '.'-character is not allowed in buffers
+                s_name = name.replace('.', '')
+                self.m_name2s_name.update({name: s_name})
+                self.register_buffer(s_name, p.clone().detach().data)

        self.collected_params = []

-    def forward(self,model):
+    def forward(self, model):
        decay = self.decay

        if self.num_updates >= 0:
            self.num_updates += 1
-            decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates))
+            decay = min(
+                self.decay, (1 + self.num_updates) / (10 + self.num_updates)
+            )

        one_minus_decay = 1.0 - decay

@@ -38,8 +44,12 @@ class LitEma(nn.Module):
            for key in m_param:
                if m_param[key].requires_grad:
                    sname = self.m_name2s_name[key]
-                    shadow_params[sname] = shadow_params[sname].type_as(m_param[key])
-                    shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key]))
+                    shadow_params[sname] = shadow_params[sname].type_as(
+                        m_param[key]
+                    )
+                    shadow_params[sname].sub_(
+                        one_minus_decay * (shadow_params[sname] - m_param[key])
+                    )
                else:
                    assert not key in self.m_name2s_name

@@ -48,7 +58,9 @@ class LitEma(nn.Module):
        shadow_params = dict(self.named_buffers())
        for key in m_param:
            if m_param[key].requires_grad:
-                m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data)
+                m_param[key].data.copy_(
+                    shadow_params[self.m_name2s_name[key]].data
+                )
            else:
                assert not key in self.m_name2s_name

--- a/ldm/modules/embedding_manager.py
+++ b/ldm/modules/embedding_manager.py
@@ -0,0 +1,255 @@
+from cmath import log
+import torch
+from torch import nn
+
+import sys
+
+from ldm.data.personalized import per_img_token_list
+from transformers import CLIPTokenizer
+from functools import partial
+
+DEFAULT_PLACEHOLDER_TOKEN = ['*']
+
+PROGRESSIVE_SCALE = 2000
+
+
+def get_clip_token_for_string(tokenizer, string):
+    """Return the single token id that `string` encodes to.
+
+    The string is tokenized with truncation and padding to 77 positions and
+    the id at index [0, 1] of `input_ids` is returned (index 0 is presumably
+    the start-of-text token -- TODO confirm against the tokenizer).
+
+    Raises:
+        AssertionError: unless exactly two ids differ from 49407, i.e. the
+            string did not encode to exactly one token. Note that the
+            message only mentions the "more than one token" case.
+    """
+    batch_encoding = tokenizer(
+        string,
+        truncation=True,
+        max_length=77,
+        return_length=True,
+        return_overflowing_tokens=False,
+        padding='max_length',
+        return_tensors='pt',
+    )
+    tokens = batch_encoding['input_ids']
+    # 49407 is presumably CLIP's end-of-text/padding id (TODO confirm); the
+    # two remaining non-matching ids would then be start-of-text + the token.
+    assert (
+        torch.count_nonzero(tokens - 49407) == 2
+    ), f"String '{string}' maps to more than a single token. Please use another string"
+
+    return tokens[0, 1]
+
+
+def get_bert_token_for_string(tokenizer, string):
+    """Return the entry at index [0, 1] of `tokenizer(string)`.
+
+    NOTE(review): unlike the CLIP variant, the single-token check below is
+    commented out, so a string that encodes to several tokens is not
+    rejected here; only the id at index [0, 1] is used.
+    """
+    token = tokenizer(string)
+    # assert torch.count_nonzero(token) == 3, f"String '{string}' maps to more than a single token. Please use another string"
+
+    token = token[0, 1]
+
+    return token
+
+
+def get_embedding_for_clip_token(embedder, token):
+    """Embed one token: add a leading dimension, call `embedder`, return element [0, 0]."""
+    return embedder(token.unsqueeze(0))[0, 0]
+
+
+class EmbeddingManager(nn.Module):
+    def __init__(
+        self,
+        embedder,
+        placeholder_strings=None,
+        initializer_words=None,
+        per_image_tokens=False,
+        num_vectors_per_token=1,
+        progressive_words=False,
+        **kwargs,
+    ):
+        """Create one learnable embedding per placeholder string.
+
+        Args:
+            embedder: text encoder. If it has a `tokenizer` attribute it is
+                treated as the CLIP encoder; otherwise it is treated as the
+                BERT encoder and must expose `tknz_fn` and
+                `transformer.token_emb`.
+            placeholder_strings: strings whose token receives a learned
+                embedding. `None` falls back to DEFAULT_PLACEHOLDER_TOKEN.
+                The argument itself is never modified.
+            initializer_words: optional words; the word at index i seeds the
+                embedding of the placeholder at index i. Placeholders with
+                no initializer start from `torch.rand`.
+            per_image_tokens: when true, `per_img_token_list` is appended to
+                the placeholder strings.
+            num_vectors_per_token: number of embedding rows per placeholder.
+            progressive_words: stored on the instance; read by `forward`.
+            **kwargs: accepted and ignored.
+        """
+        super().__init__()
+
+        self.string_to_token_dict = {}
+
+        self.string_to_param_dict = nn.ParameterDict()
+
+        self.initial_embeddings = (
+            nn.ParameterDict()
+        )   # These should not be optimized
+
+        self.progressive_words = progressive_words
+        self.progressive_counter = 0
+
+        self.max_vectors_per_token = num_vectors_per_token
+
+        if hasattr(
+            embedder, 'tokenizer'
+        ):   # using Stable Diffusion's CLIP encoder
+            self.is_clip = True
+            get_token_for_string = partial(
+                get_clip_token_for_string, embedder.tokenizer
+            )
+            clip_embeddings = embedder.transformer.text_model.embeddings
+            get_embedding_for_tkn = partial(
+                get_embedding_for_clip_token,
+                clip_embeddings,
+            )
+            # Read the width from the embedding table instead of hard-coding
+            # it: 1280 is the BERT width, and a randomly initialised
+            # placeholder of that size would not fit CLIP's embeddings.
+            token_dim = clip_embeddings.token_embedding.embedding_dim
+        else:   # using LDM's BERT encoder
+            self.is_clip = False
+            get_token_for_string = partial(
+                get_bert_token_for_string, embedder.tknz_fn
+            )
+            get_embedding_for_tkn = embedder.transformer.token_emb
+            token_dim = 1280
+
+        # Work on a private copy: the declared default (None) must not crash,
+        # and extending the caller's list in place would leak into its config.
+        if placeholder_strings is None:
+            placeholder_strings = DEFAULT_PLACEHOLDER_TOKEN
+        placeholder_strings = list(placeholder_strings)
+
+        if per_image_tokens:
+            placeholder_strings.extend(per_img_token_list)
+
+        for idx, placeholder_string in enumerate(placeholder_strings):
+
+            token = get_token_for_string(placeholder_string)
+
+            if initializer_words and idx < len(initializer_words):
+                init_word_token = get_token_for_string(initializer_words[idx])
+
+                with torch.no_grad():
+                    init_word_embedding = get_embedding_for_tkn(
+                        init_word_token.cpu()
+                    )
+
+                # Trainable copy, one row per vector of this placeholder.
+                token_params = torch.nn.Parameter(
+                    init_word_embedding.unsqueeze(0).repeat(
+                        num_vectors_per_token, 1
+                    ),
+                    requires_grad=True,
+                )
+                # Frozen copy of the starting point, read by
+                # embedding_to_coarse_loss().
+                self.initial_embeddings[
+                    placeholder_string
+                ] = torch.nn.Parameter(
+                    init_word_embedding.unsqueeze(0).repeat(
+                        num_vectors_per_token, 1
+                    ),
+                    requires_grad=False,
+                )
+            else:
+                token_params = torch.nn.Parameter(
+                    torch.rand(
+                        size=(num_vectors_per_token, token_dim),
+                        requires_grad=True,
+                    )
+                )
+
+            self.string_to_token_dict[placeholder_string] = token
+            self.string_to_param_dict[placeholder_string] = token_params
+
+    def forward(
+        self,
+        tokenized_text,
+        embedded_text,
+    ):
+        """Substitute learned vectors for placeholder tokens.
+
+        `tokenized_text` must be 2-D (its shape is unpacked into `b, n`).
+        `embedded_text` is modified in place and returned; in the
+        multi-vector branch `tokenized_text` is modified in place as well.
+        """
+        b, n, device = *tokenized_text.shape, tokenized_text.device
+
+        for (
+            placeholder_string,
+            placeholder_token,
+        ) in self.string_to_token_dict.items():
+
+            placeholder_embedding = self.string_to_param_dict[
+                placeholder_string
+            ].to(device)
+
+            if (
+                self.max_vectors_per_token == 1
+            ):   # If there's only one vector per token, we can do a simple replacement
+                placeholder_idx = torch.where(
+                    tokenized_text == placeholder_token.to(device)
+                )
+                embedded_text[placeholder_idx] = placeholder_embedding
+            else:   # otherwise, need to insert and keep track of changing indices
+                if self.progressive_words:
+                    # The counter advances once per placeholder per call, so
+                    # one more vector is allowed every PROGRESSIVE_SCALE steps.
+                    self.progressive_counter += 1
+                    max_step_tokens = (
+                        1 + self.progressive_counter // PROGRESSIVE_SCALE
+                    )
+                else:
+                    max_step_tokens = self.max_vectors_per_token
+
+                num_vectors_for_token = min(
+                    placeholder_embedding.shape[0], max_step_tokens
+                )
+
+                placeholder_rows, placeholder_cols = torch.where(
+                    tokenized_text == placeholder_token.to(device)
+                )
+
+                if placeholder_rows.nelement() == 0:
+                    continue
+
+                # Visit matches from the highest column down -- presumably so
+                # an insertion does not shift matches still to be processed
+                # in the same row (TODO confirm).
+                sorted_cols, sort_idx = torch.sort(
+                    placeholder_cols, descending=True
+                )
+                sorted_rows = placeholder_rows[sort_idx]
+
+                for idx in range(len(sorted_rows)):
+                    row = sorted_rows[idx]
+                    col = sorted_cols[idx]
+
+                    # Replace the one placeholder position with
+                    # `num_vectors_for_token` positions, then cut the row
+                    # back to its original length n.
+                    new_token_row = torch.cat(
+                        [
+                            tokenized_text[row][:col],
+                            placeholder_token.repeat(num_vectors_for_token).to(
+                                device
+                            ),
+                            tokenized_text[row][col + 1 :],
+                        ],
+                        axis=0,
+                    )[:n]
+                    new_embed_row = torch.cat(
+                        [
+                            embedded_text[row][:col],
+                            placeholder_embedding[:num_vectors_for_token],
+                            embedded_text[row][col + 1 :],
+                        ],
+                        axis=0,
+                    )[:n]
+
+                    embedded_text[row] = new_embed_row
+                    tokenized_text[row] = new_token_row
+
+        return embedded_text
+
+    def save(self, ckpt_path):
+        """Write the placeholder-token and learned-embedding dicts to `ckpt_path` via `torch.save`."""
+        torch.save(
+            {
+                'string_to_token': self.string_to_token_dict,
+                'string_to_param': self.string_to_param_dict,
+            },
+            ckpt_path,
+        )
+
+    def load(self, ckpt_path, full=True):
+        """Replace both dicts with the contents of the checkpoint at `ckpt_path`.
+
+        When `full` is false, every loaded embedding is re-wrapped as a
+        half-precision parameter.
+        """
+        # NOTE(review): torch.load unpickles the file, which can execute
+        # arbitrary code -- only load embedding files from trusted sources.
+        ckpt = torch.load(ckpt_path, map_location='cpu')
+        self.string_to_token_dict = ckpt["string_to_token"]
+        self.string_to_param_dict = ckpt["string_to_param"]
+        if not full:
+            for key, value in self.string_to_param_dict.items():
+                self.string_to_param_dict[key] = torch.nn.Parameter(value.half())
+
+    def get_embedding_norms_squared(self):
+        """Return the sum of squares of each learned embedding row.
+
+        All placeholder parameters are concatenated along axis 0 first, so
+        the result has one entry per row of that concatenation.
+        """
+        all_params = torch.cat(
+            list(self.string_to_param_dict.values()), axis=0
+        )   # num_placeholders x embedding_dim
+        param_norm_squared = (all_params * all_params).sum(
+            axis=-1
+        )              # num_placeholders
+
+        return param_norm_squared
+
+    def embedding_parameters(self):
+        """Return the `parameters()` iterator of the learned-embedding ParameterDict."""
+        return self.string_to_param_dict.parameters()
+
+    def embedding_to_coarse_loss(self):
+        """Penalise drift of learned embeddings from their initial values.
+
+        For every placeholder that has an initial embedding, adds
+        `(optimized - coarse) @ (optimized - coarse).T` divided by the number
+        of such placeholders. Returns the float 0.0 when there are none.
+
+        NOTE(review): the product is a matrix with one row per vector of the
+        placeholder, so the result is only a single value when each
+        placeholder has one vector -- confirm this is what callers expect.
+        """
+
+        loss = 0.0
+        num_embeddings = len(self.initial_embeddings)
+
+        for key in self.initial_embeddings:
+            optimized = self.string_to_param_dict[key]
+            coarse = self.initial_embeddings[key].clone().to(optimized.device)
+
+            loss = (
+                loss
+                + (optimized - coarse)
+                @ (optimized - coarse).T
+                / num_embeddings
+            )
+
+        return loss
--- a/ldm/modules/encoders/modules.py
+++ b/ldm/modules/encoders/modules.py
@@ -5,8 +5,40 @@ import clip
 from einops import rearrange, repeat
 from transformers import CLIPTokenizer, CLIPTextModel
 import kornia
+from ldm.dream.devices import choose_torch_device

-from ldm.modules.x_transformer import Encoder, TransformerWrapper  # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
+from ldm.modules.x_transformer import (
+    Encoder,
+    TransformerWrapper,
+)  # TODO: can we directly rely on lucidrains code and simply add this as a requirement? --> test
+
+
+def _expand_mask(mask, dtype, tgt_len=None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+
+    The result is additive: positions where `mask` is 1 become 0 and
+    positions where it is 0 become the most negative finite value of `dtype`.
+    `tgt_len` defaults to the source length.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = (
+        mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+    )
+
+    # Flip the mask so that nonzero now marks the positions to suppress.
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(
+        inverted_mask.to(torch.bool), torch.finfo(dtype).min
+    )
+
+
+def _build_causal_attention_mask(bsz, seq_len, dtype):
+    """Build an additive causal mask of shape `[bsz, 1, seq_len, seq_len]`.
+
+    Entries strictly above the main diagonal hold the most negative finite
+    value of `dtype`; the diagonal and everything below it are 0. The tensor
+    is created without an explicit device.
+    """
+    # pytorch uses an additive attention mask; fill with the most negative
+    # finite value of `dtype` (not a literal -inf)
+    mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
+    mask.fill_(torch.tensor(torch.finfo(dtype).min))
+    mask.triu_(1)  # keep the strict upper triangle; zero the diagonal and below
+    mask = mask.unsqueeze(1)  # add the singleton dimension at index 1
+    return mask


 class AbstractEncoder(nn.Module):
@@ -17,7 +49,6 @@ class AbstractEncoder(nn.Module):
        raise NotImplementedError


-
 class ClassEmbedder(nn.Module):
    def __init__(self, embed_dim, n_classes=1000, key='class'):
        super().__init__()
@@ -35,11 +66,22 @@ class ClassEmbedder(nn.Module):

 class TransformerEmbedder(AbstractEncoder):
    """Some transformer encoder layers"""
-    def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77, device="cuda"):
+
+    def __init__(
+        self,
+        n_embed,
+        n_layer,
+        vocab_size,
+        max_seq_len=77,
+        device=choose_torch_device(),
+    ):
        super().__init__()
        self.device = device
-        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
-                                              attn_layers=Encoder(dim=n_embed, depth=n_layer))
+        self.transformer = TransformerWrapper(
+            num_tokens=vocab_size,
+            max_seq_len=max_seq_len,
+            attn_layers=Encoder(dim=n_embed, depth=n_layer),
+        )

    def forward(self, tokens):
        tokens = tokens.to(self.device)  # meh
@@ -51,27 +93,44 @@ class TransformerEmbedder(AbstractEncoder):


 class BERTTokenizer(AbstractEncoder):
-    """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""
-    def __init__(self, device="cuda", vq_interface=True, max_length=77):
+    """Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""
+
+    def __init__(
+        self, device=choose_torch_device(), vq_interface=True, max_length=77
+    ):
        super().__init__()
-        from transformers import BertTokenizerFast  # TODO: add to reuquirements
+        from transformers import (
+            BertTokenizerFast,
+        )  # TODO: add to requirements
+
        # Modified to allow to run on non-internet connected compute nodes.
        # Model needs to be loaded into cache from an internet-connected machine
        # by running:
        #   from transformers import BertTokenizerFast
        #   BertTokenizerFast.from_pretrained("bert-base-uncased")
        try:
-            self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased",local_files_only=True)
+            self.tokenizer = BertTokenizerFast.from_pretrained(
+                'bert-base-uncased', local_files_only=True
+            )
        except OSError:
-            raise SystemExit("* Couldn't load Bert tokenizer files. Try running scripts/preload_models.py from an internet-conected machine.")
+            raise SystemExit(
+                "* Couldn't load Bert tokenizer files. Try running scripts/preload_models.py from an internet-conected machine."
+            )
        self.device = device
        self.vq_interface = vq_interface
        self.max_length = max_length

    def forward(self, text):
-        batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
-                                        return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
-        tokens = batch_encoding["input_ids"].to(self.device)
+        batch_encoding = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding='max_length',
+            return_tensors='pt',
+        )
+        tokens = batch_encoding['input_ids'].to(self.device)
        return tokens

    @torch.no_grad()
@@ -87,54 +146,84 @@ class BERTTokenizer(AbstractEncoder):

 class BERTEmbedder(AbstractEncoder):
    """Uses the BERT tokenizr model and add some transformer encoder layers"""
-    def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77,
-                 device="cuda",use_tokenizer=True, embedding_dropout=0.0):
+
+    def __init__(
+        self,
+        n_embed,
+        n_layer,
+        vocab_size=30522,
+        max_seq_len=77,
+        device=choose_torch_device(),
+        use_tokenizer=True,
+        embedding_dropout=0.0,
+    ):
        super().__init__()
        self.use_tknz_fn = use_tokenizer
        if self.use_tknz_fn:
-            self.tknz_fn = BERTTokenizer(vq_interface=False, max_length=max_seq_len)
+            self.tknz_fn = BERTTokenizer(
+                vq_interface=False, max_length=max_seq_len
+            )
        self.device = device
-        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
-                                              attn_layers=Encoder(dim=n_embed, depth=n_layer),
-                                              emb_dropout=embedding_dropout)
+        self.transformer = TransformerWrapper(
+            num_tokens=vocab_size,
+            max_seq_len=max_seq_len,
+            attn_layers=Encoder(dim=n_embed, depth=n_layer),
+            emb_dropout=embedding_dropout,
+        )

-    def forward(self, text):
+    def forward(self, text, embedding_manager=None):
        if self.use_tknz_fn:
-            tokens = self.tknz_fn(text)#.to(self.device)
+            tokens = self.tknz_fn(text)  # .to(self.device)
        else:
            tokens = text
-        z = self.transformer(tokens, return_embeddings=True)
+        z = self.transformer(
+            tokens, return_embeddings=True, embedding_manager=embedding_manager
+        )
        return z

-    def encode(self, text):
+    def encode(self, text, **kwargs):
        # output of length 77
-        return self(text)
+        return self(text, **kwargs)


 class SpatialRescaler(nn.Module):
-    def __init__(self,
-                 n_stages=1,
-                 method='bilinear',
-                 multiplier=0.5,
-                 in_channels=3,
-                 out_channels=None,
-                 bias=False):
+    def __init__(
+        self,
+        n_stages=1,
+        method='bilinear',
+        multiplier=0.5,
+        in_channels=3,
+        out_channels=None,
+        bias=False,
+    ):
        super().__init__()
        self.n_stages = n_stages
        assert self.n_stages >= 0
-        assert method in ['nearest','linear','bilinear','trilinear','bicubic','area']
+        assert method in [
+            'nearest',
+            'linear',
+            'bilinear',
+            'trilinear',
+            'bicubic',
+            'area',
+        ]
        self.multiplier = multiplier
-        self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
+        self.interpolator = partial(
+            torch.nn.functional.interpolate, mode=method
+        )
        self.remap_output = out_channels is not None
        if self.remap_output:
-            print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.')
-            self.channel_mapper = nn.Conv2d(in_channels,out_channels,1,bias=bias)
+            print(
+                f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.'
+            )
+            self.channel_mapper = nn.Conv2d(
+                in_channels, out_channels, 1, bias=bias
+            )

-    def forward(self,x):
+    def forward(self, x):
        for stage in range(self.n_stages):
            x = self.interpolator(x, scale_factor=self.multiplier)

-
        if self.remap_output:
            x = self.channel_mapper(x)
        return x
@@ -142,41 +231,245 @@ class SpatialRescaler(nn.Module):
    def encode(self, x):
        return self(x)

+
 class FrozenCLIPEmbedder(AbstractEncoder):
    """Uses the CLIP transformer encoder for text (from Hugging Face)"""
-    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
+
+    def __init__(
+        self,
+        version='openai/clip-vit-large-patch14',
+        device=choose_torch_device(),
+        max_length=77,
+    ):
        super().__init__()
-        self.tokenizer = CLIPTokenizer.from_pretrained(version)
-        self.transformer = CLIPTextModel.from_pretrained(version)
+        self.tokenizer = CLIPTokenizer.from_pretrained(
+            version, local_files_only=True
+        )
+        self.transformer = CLIPTextModel.from_pretrained(
+            version, local_files_only=True
+        )
        self.device = device
        self.max_length = max_length
        self.freeze()

+        def embedding_forward(
+            self,
+            input_ids=None,
+            position_ids=None,
+            inputs_embeds=None,
+            embedding_manager=None,
+        ) -> torch.Tensor:
+
+            seq_length = (
+                input_ids.shape[-1]
+                if input_ids is not None
+                else inputs_embeds.shape[-2]
+            )
+
+            if position_ids is None:
+                position_ids = self.position_ids[:, :seq_length]
+
+            if inputs_embeds is None:
+                inputs_embeds = self.token_embedding(input_ids)
+
+            if embedding_manager is not None:
+                inputs_embeds = embedding_manager(input_ids, inputs_embeds)
+
+            position_embeddings = self.position_embedding(position_ids)
+            embeddings = inputs_embeds + position_embeddings
+
+            return embeddings
+
+        self.transformer.text_model.embeddings.forward = (
+            embedding_forward.__get__(self.transformer.text_model.embeddings)
+        )
+
+        def encoder_forward(
+            self,
+            inputs_embeds,
+            attention_mask=None,
+            causal_attention_mask=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=None,
+        ):
+            output_attentions = (
+                output_attentions
+                if output_attentions is not None
+                else self.config.output_attentions
+            )
+            output_hidden_states = (
+                output_hidden_states
+                if output_hidden_states is not None
+                else self.config.output_hidden_states
+            )
+            return_dict = (
+                return_dict
+                if return_dict is not None
+                else self.config.use_return_dict
+            )
+
+            encoder_states = () if output_hidden_states else None
+            all_attentions = () if output_attentions else None
+
+            hidden_states = inputs_embeds
+            for idx, encoder_layer in enumerate(self.layers):
+                if output_hidden_states:
+                    encoder_states = encoder_states + (hidden_states,)
+
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    causal_attention_mask,
+                    output_attentions=output_attentions,
+                )
+
+                hidden_states = layer_outputs[0]
+
+                if output_attentions:
+                    all_attentions = all_attentions + (layer_outputs[1],)
+
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+
+            return hidden_states
+
+        self.transformer.text_model.encoder.forward = encoder_forward.__get__(
+            self.transformer.text_model.encoder
+        )
+
+        def text_encoder_forward(
+            self,
+            input_ids=None,
+            attention_mask=None,
+            position_ids=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=None,
+            embedding_manager=None,
+        ):
+            output_attentions = (
+                output_attentions
+                if output_attentions is not None
+                else self.config.output_attentions
+            )
+            output_hidden_states = (
+                output_hidden_states
+                if output_hidden_states is not None
+                else self.config.output_hidden_states
+            )
+            return_dict = (
+                return_dict
+                if return_dict is not None
+                else self.config.use_return_dict
+            )
+
+            if input_ids is None:
+                raise ValueError('You have to specify either input_ids')
+
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+
+            hidden_states = self.embeddings(
+                input_ids=input_ids,
+                position_ids=position_ids,
+                embedding_manager=embedding_manager,
+            )
+
+            bsz, seq_len = input_shape
+            # CLIP's text model uses causal mask, prepare it here.
+            # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+            causal_attention_mask = _build_causal_attention_mask(
+                bsz, seq_len, hidden_states.dtype
+            ).to(hidden_states.device)
+
+            # expand attention_mask
+            if attention_mask is not None:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _expand_mask(
+                    attention_mask, hidden_states.dtype
+                )
+
+            last_hidden_state = self.encoder(
+                inputs_embeds=hidden_states,
+                attention_mask=attention_mask,
+                causal_attention_mask=causal_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+            last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+            return last_hidden_state
+
+        self.transformer.text_model.forward = text_encoder_forward.__get__(
+            self.transformer.text_model
+        )
+
+        def transformer_forward(
+            self,
+            input_ids=None,
+            attention_mask=None,
+            position_ids=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=None,
+            embedding_manager=None,
+        ):
+            return self.text_model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                embedding_manager=embedding_manager,
+            )
+
+        self.transformer.forward = transformer_forward.__get__(
+            self.transformer
+        )
+
    def freeze(self):
        self.transformer = self.transformer.eval()
        for param in self.parameters():
            param.requires_grad = False

-    def forward(self, text):
-        batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
-                                        return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
-        tokens = batch_encoding["input_ids"].to(self.device)
-        outputs = self.transformer(input_ids=tokens)
+    def forward(self, text, **kwargs):
+        batch_encoding = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding='max_length',
+            return_tensors='pt',
+        )
+        tokens = batch_encoding['input_ids'].to(self.device)
+        z = self.transformer(input_ids=tokens, **kwargs)

-        z = outputs.last_hidden_state
        return z

-    def encode(self, text):
-        return self(text)
+    def encode(self, text, **kwargs):
+        return self(text, **kwargs)


 class FrozenCLIPTextEmbedder(nn.Module):
    """
    Uses the CLIP transformer encoder for text.
    """
-    def __init__(self, version='ViT-L/14', device="cuda", max_length=77, n_repeat=1, normalize=True):
+
+    def __init__(
+        self,
+        version='ViT-L/14',
+        device=choose_torch_device(),
+        max_length=77,
+        n_repeat=1,
+        normalize=True,
+    ):
        super().__init__()
-        self.model, _ = clip.load(version, jit=False, device="cpu")
+        self.model, _ = clip.load(version, jit=False, device=device)
        self.device = device
        self.max_length = max_length
        self.n_repeat = n_repeat
@@ -196,7 +489,7 @@ class FrozenCLIPTextEmbedder(nn.Module):

    def encode(self, text):
        z = self(text)
-        if z.ndim==2:
+        if z.ndim == 2:
            z = z[:, None, :]
        z = repeat(z, 'b 1 d -> b k d', k=self.n_repeat)
        return z
@@ -204,29 +497,42 @@ class FrozenCLIPTextEmbedder(nn.Module):

 class FrozenClipImageEmbedder(nn.Module):
    """
-        Uses the CLIP image encoder.
-        """
+    Uses the CLIP image encoder.
+    """
+
    def __init__(
-            self,
-            model,
-            jit=False,
-            device='cuda' if torch.cuda.is_available() else 'cpu',
-            antialias=False,
-        ):
+        self,
+        model,
+        jit=False,
+        device=choose_torch_device(),
+        antialias=False,
+    ):
        super().__init__()
        self.model, _ = clip.load(name=model, device=device, jit=jit)

        self.antialias = antialias

-        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
-        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
+        self.register_buffer(
+            'mean',
+            torch.Tensor([0.48145466, 0.4578275, 0.40821073]),
+            persistent=False,
+        )
+        self.register_buffer(
+            'std',
+            torch.Tensor([0.26862954, 0.26130258, 0.27577711]),
+            persistent=False,
+        )

    def preprocess(self, x):
        # normalize to [0,1]
-        x = kornia.geometry.resize(x, (224, 224),
-                                   interpolation='bicubic',align_corners=True,
-                                   antialias=self.antialias)
-        x = (x + 1.) / 2.
+        x = kornia.geometry.resize(
+            x,
+            (224, 224),
+            interpolation='bicubic',
+            align_corners=True,
+            antialias=self.antialias,
+        )
+        x = (x + 1.0) / 2.0
        # renormalize according to clip
        x = kornia.enhance.normalize(x, self.mean, self.std)
        return x
@@ -236,7 +542,8 @@ class FrozenClipImageEmbedder(nn.Module):
        return self.model.encode_image(self.preprocess(x))


-if __name__ == "__main__":
+if __name__ == '__main__':
    from ldm.util import count_params
+
    model = FrozenCLIPEmbedder()
    count_params(model, verbose=True)
--- a/ldm/modules/image_degradation/init.py
+++ b/ldm/modules/image_degradation/init.py
@@ -1,2 +1,6 @@
-from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr
-from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light
+from ldm.modules.image_degradation.bsrgan import (
+    degradation_bsrgan_variant as degradation_fn_bsr,
+)
+from ldm.modules.image_degradation.bsrgan_light import (
+    degradation_bsrgan_variant as degradation_fn_bsr_light,
+)
--- a/ldm/modules/image_degradation/bsrgan.py
+++ b/ldm/modules/image_degradation/bsrgan.py
@@ -27,16 +27,16 @@ import ldm.modules.image_degradation.utils_image as util


 def modcrop_np(img, sf):
-    '''
+    """
    Args:
        img: numpy image, WxH or WxHxC
        sf: scale factor
    Return:
        cropped image
-    '''
+    """
    w, h = img.shape[:2]
    im = np.copy(img)
-    return im[:w - w % sf, :h - h % sf, ...]
+    return im[: w - w % sf, : h - h % sf, ...]


 """
@@ -54,7 +54,9 @@ def analytic_kernel(k):
    # Loop over the small kernel to fill the big one
    for r in range(k_size):
        for c in range(k_size):
-            big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
+            big_k[2 * r : 2 * r + k_size, 2 * c : 2 * c + k_size] += (
+                k[r, c] * k
+            )
    # Crop the edges of the big kernel to ignore very small values and increase run time of SR
    crop = k_size // 2
    cropped_big_k = big_k[crop:-crop, crop:-crop]
@@ -63,7 +65,7 @@ def analytic_kernel(k):


 def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
-    """ generate an anisotropic Gaussian kernel
+    """generate an anisotropic Gaussian kernel
    Args:
        ksize : e.g., 15, kernel size
        theta : [0,  pi], rotation angle range
@@ -74,7 +76,12 @@ def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
        k     : kernel
    """

-    v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.]))
+    v = np.dot(
+        np.array(
+            [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]
+        ),
+        np.array([1.0, 0.0]),
+    )
    V = np.array([[v[0], v[1]], [v[1], -v[0]]])
    D = np.array([[l1, 0], [0, l2]])
    Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
@@ -126,24 +133,32 @@ def shift_pixel(x, sf, upper_left=True):


 def blur(x, k):
-    '''
+    """
    x: image, NxcxHxW
    k: kernel, Nx1xhxw
-    '''
+    """
    n, c = x.shape[:2]
    p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
    x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate')
    k = k.repeat(1, c, 1, 1)
    k = k.view(-1, 1, k.shape[2], k.shape[3])
    x = x.view(1, -1, x.shape[2], x.shape[3])
-    x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
+    x = torch.nn.functional.conv2d(
+        x, k, bias=None, stride=1, padding=0, groups=n * c
+    )
    x = x.view(n, c, x.shape[2], x.shape[3])

    return x


-def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
-    """"
+def gen_kernel(
+    k_size=np.array([15, 15]),
+    scale_factor=np.array([4, 4]),
+    min_var=0.6,
+    max_var=10.0,
+    noise_level=0,
+):
+    """ "
    # modified version of https://github.com/assafshocher/BlindSR_dataset_generator
    # Kai Zhang
    # min_var = 0.175 * sf  # variance of the gaussian kernel will be sampled between min_var and max_var
@@ -157,13 +172,16 @@ def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var

    # Set COV matrix using Lambdas and Theta
    LAMBDA = np.diag([lambda_1, lambda_2])
-    Q = np.array([[np.cos(theta), -np.sin(theta)],
-                  [np.sin(theta), np.cos(theta)]])
+    Q = np.array(
+        [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]
+    )
    SIGMA = Q @ LAMBDA @ Q.T
    INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :]

    # Set expectation position (shifting kernel for aligned image)
-    MU = k_size // 2 - 0.5 * (scale_factor - 1)  # - 0.5 * (scale_factor - k_size % 2)
+    MU = k_size // 2 - 0.5 * (
+        scale_factor - 1
+    )  # - 0.5 * (scale_factor - k_size % 2)
    MU = MU[None, None, :, None]

    # Create meshgrid for Gaussian
@@ -188,7 +206,9 @@ def fspecial_gaussian(hsize, sigma):
    hsize = [hsize, hsize]
    siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
    std = sigma
-    [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
+    [x, y] = np.meshgrid(
+        np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)
+    )
    arg = -(x * x + y * y) / (2 * std * std)
    h = np.exp(arg)
    h[h < scipy.finfo(float).eps * h.max()] = 0
@@ -208,10 +228,10 @@ def fspecial_laplacian(alpha):


 def fspecial(filter_type, *args, **kwargs):
-    '''
+    """
    python code from:
    https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
-    '''
+    """
    if filter_type == 'gaussian':
        return fspecial_gaussian(*args, **kwargs)
    if filter_type == 'laplacian':
@@ -226,19 +246,19 @@ def fspecial(filter_type, *args, **kwargs):


 def bicubic_degradation(x, sf=3):
-    '''
+    """
    Args:
        x: HxWxC image, [0, 1]
        sf: down-scale factor
    Return:
        bicubicly downsampled LR image
-    '''
+    """
    x = util.imresize_np(x, scale=1 / sf)
    return x


 def srmd_degradation(x, k, sf=3):
-    ''' blur + bicubic downsampling
+    """blur + bicubic downsampling
    Args:
        x: HxWxC image, [0, 1]
        k: hxw, double
@@ -253,14 +273,16 @@ def srmd_degradation(x, k, sf=3):
          pages={3262--3271},
          year={2018}
        }
-    '''
-    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')  # 'nearest' | 'mirror'
+    """
+    x = ndimage.filters.convolve(
+        x, np.expand_dims(k, axis=2), mode='wrap'
+    )  # 'nearest' | 'mirror'
    x = bicubic_degradation(x, sf=sf)
    return x


 def dpsr_degradation(x, k, sf=3):
-    ''' bicubic downsampling + blur
+    """bicubic downsampling + blur
    Args:
        x: HxWxC image, [0, 1]
        k: hxw, double
@@ -275,21 +297,21 @@ def dpsr_degradation(x, k, sf=3):
          pages={1671--1681},
          year={2019}
        }
-    '''
+    """
    x = bicubic_degradation(x, sf=sf)
    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
    return x


 def classical_degradation(x, k, sf=3):
-    ''' blur + downsampling
+    """blur + downsampling
    Args:
        x: HxWxC image, [0, 1]/[0, 255]
        k: hxw, double
        sf: down-scale factor
    Return:
        downsampled LR image
-    '''
+    """
    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
    # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
    st = 0
@@ -328,10 +350,19 @@ def add_blur(img, sf=4):
    if random.random() < 0.5:
        l1 = wd2 * random.random()
        l2 = wd2 * random.random()
-        k = anisotropic_Gaussian(ksize=2 * random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
+        k = anisotropic_Gaussian(
+            ksize=2 * random.randint(2, 11) + 3,
+            theta=random.random() * np.pi,
+            l1=l1,
+            l2=l2,
+        )
    else:
-        k = fspecial('gaussian', 2 * random.randint(2, 11) + 3, wd * random.random())
-    img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror')
+        k = fspecial(
+            'gaussian', 2 * random.randint(2, 11) + 3, wd * random.random()
+        )
+    img = ndimage.filters.convolve(
+        img, np.expand_dims(k, axis=2), mode='mirror'
+    )

    return img

@@ -344,7 +375,11 @@ def add_resize(img, sf=4):
        sf1 = random.uniform(0.5 / sf, 1)
    else:
        sf1 = 1.0
-    img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3]))
+    img = cv2.resize(
+        img,
+        (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])),
+        interpolation=random.choice([1, 2, 3]),
+    )
    img = np.clip(img, 0.0, 1.0)

    return img
@@ -366,19 +401,26 @@ def add_resize(img, sf=4):
 #     img = np.clip(img, 0.0, 1.0)
 #     return img

+
 def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
    noise_level = random.randint(noise_level1, noise_level2)
    rnum = np.random.rand()
    if rnum > 0.6:  # add color Gaussian noise
-        img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
+        img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(
+            np.float32
+        )
    elif rnum < 0.4:  # add grayscale Gaussian noise
-        img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
+        img = img + np.random.normal(
+            0, noise_level / 255.0, (*img.shape[:2], 1)
+        ).astype(np.float32)
    else:  # add  noise
-        L = noise_level2 / 255.
+        L = noise_level2 / 255.0
        D = np.diag(np.random.rand(3))
        U = orth(np.random.rand(3, 3))
        conv = np.dot(np.dot(np.transpose(U), D), U)
-        img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
+        img = img + np.random.multivariate_normal(
+            [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]
+        ).astype(np.float32)
    img = np.clip(img, 0.0, 1.0)
    return img

@@ -388,28 +430,37 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25):
    img = np.clip(img, 0.0, 1.0)
    rnum = random.random()
    if rnum > 0.6:
-        img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
+        img += img * np.random.normal(
+            0, noise_level / 255.0, img.shape
+        ).astype(np.float32)
    elif rnum < 0.4:
-        img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
+        img += img * np.random.normal(
+            0, noise_level / 255.0, (*img.shape[:2], 1)
+        ).astype(np.float32)
    else:
-        L = noise_level2 / 255.
+        L = noise_level2 / 255.0
        D = np.diag(np.random.rand(3))
        U = orth(np.random.rand(3, 3))
        conv = np.dot(np.dot(np.transpose(U), D), U)
-        img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
+        img += img * np.random.multivariate_normal(
+            [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]
+        ).astype(np.float32)
    img = np.clip(img, 0.0, 1.0)
    return img


 def add_Poisson_noise(img):
-    img = np.clip((img * 255.0).round(), 0, 255) / 255.
+    img = np.clip((img * 255.0).round(), 0, 255) / 255.0
    vals = 10 ** (2 * random.random() + 2.0)  # [2, 4]
    if random.random() < 0.5:
        img = np.random.poisson(img * vals).astype(np.float32) / vals
    else:
        img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
-        img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.
-        noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray
+        img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.0
+        noise_gray = (
+            np.random.poisson(img_gray * vals).astype(np.float32) / vals
+            - img_gray
+        )
        img += noise_gray[:, :, np.newaxis]
    img = np.clip(img, 0.0, 1.0)
    return img
@@ -418,7 +469,9 @@ def add_Poisson_noise(img):
 def add_JPEG_noise(img):
    quality_factor = random.randint(30, 95)
    img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
-    result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
+    result, encimg = cv2.imencode(
+        '.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]
+    )
    img = cv2.imdecode(encimg, 1)
    img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB)
    return img
@@ -428,10 +481,14 @@ def random_crop(lq, hq, sf=4, lq_patchsize=64):
    h, w = lq.shape[:2]
    rnd_h = random.randint(0, h - lq_patchsize)
    rnd_w = random.randint(0, w - lq_patchsize)
-    lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :]
+    lq = lq[rnd_h : rnd_h + lq_patchsize, rnd_w : rnd_w + lq_patchsize, :]

    rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf)
-    hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :]
+    hq = hq[
+        rnd_h_H : rnd_h_H + lq_patchsize * sf,
+        rnd_w_H : rnd_w_H + lq_patchsize * sf,
+        :,
+    ]
    return lq, hq


@@ -452,7 +509,7 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
    sf_ori = sf

    h1, w1 = img.shape[:2]
-    img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...]  # mod crop
+    img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...]  # mod crop
    h, w = img.shape[:2]

    if h < lq_patchsize * sf or w < lq_patchsize * sf:
@@ -462,8 +519,11 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):

    if sf == 4 and random.random() < scale2_prob:  # downsample1
        if np.random.rand() < 0.5:
-            img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
-                             interpolation=random.choice([1, 2, 3]))
+            img = cv2.resize(
+                img,
+                (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
+                interpolation=random.choice([1, 2, 3]),
+            )
        else:
            img = util.imresize_np(img, 1 / 2, True)
        img = np.clip(img, 0.0, 1.0)
@@ -472,7 +532,10 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
    shuffle_order = random.sample(range(7), 7)
    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
    if idx1 > idx2:  # keep downsample3 last
-        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
+        shuffle_order[idx1], shuffle_order[idx2] = (
+            shuffle_order[idx2],
+            shuffle_order[idx1],
+        )

    for i in shuffle_order:

@@ -487,19 +550,30 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
            # downsample2
            if random.random() < 0.75:
                sf1 = random.uniform(1, 2 * sf)
-                img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
-                                 interpolation=random.choice([1, 2, 3]))
+                img = cv2.resize(
+                    img,
+                    (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
+                    interpolation=random.choice([1, 2, 3]),
+                )
            else:
                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                k_shifted = shift_pixel(k, sf)
-                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
-                img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
+                k_shifted = (
+                    k_shifted / k_shifted.sum()
+                )  # blur with shifted kernel
+                img = ndimage.filters.convolve(
+                    img, np.expand_dims(k_shifted, axis=2), mode='mirror'
+                )
                img = img[0::sf, 0::sf, ...]  # nearest downsampling
            img = np.clip(img, 0.0, 1.0)

        elif i == 3:
            # downsample3
-            img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
+            img = cv2.resize(
+                img,
+                (int(1 / sf * a), int(1 / sf * b)),
+                interpolation=random.choice([1, 2, 3]),
+            )
            img = np.clip(img, 0.0, 1.0)

        elif i == 4:
@@ -544,15 +618,18 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
    sf_ori = sf

    h1, w1 = image.shape[:2]
-    image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...]  # mod crop
+    image = image.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...]  # mod crop
    h, w = image.shape[:2]

    hq = image.copy()

    if sf == 4 and random.random() < scale2_prob:  # downsample1
        if np.random.rand() < 0.5:
-            image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
-                               interpolation=random.choice([1, 2, 3]))
+            image = cv2.resize(
+                image,
+                (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
+                interpolation=random.choice([1, 2, 3]),
+            )
        else:
            image = util.imresize_np(image, 1 / 2, True)
        image = np.clip(image, 0.0, 1.0)
@@ -561,7 +638,10 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
    shuffle_order = random.sample(range(7), 7)
    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
    if idx1 > idx2:  # keep downsample3 last
-        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
+        shuffle_order[idx1], shuffle_order[idx2] = (
+            shuffle_order[idx2],
+            shuffle_order[idx1],
+        )

    for i in shuffle_order:

@@ -576,19 +656,33 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
            # downsample2
            if random.random() < 0.75:
                sf1 = random.uniform(1, 2 * sf)
-                image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
-                                   interpolation=random.choice([1, 2, 3]))
+                image = cv2.resize(
+                    image,
+                    (
+                        int(1 / sf1 * image.shape[1]),
+                        int(1 / sf1 * image.shape[0]),
+                    ),
+                    interpolation=random.choice([1, 2, 3]),
+                )
            else:
                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                k_shifted = shift_pixel(k, sf)
-                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
-                image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
+                k_shifted = (
+                    k_shifted / k_shifted.sum()
+                )  # blur with shifted kernel
+                image = ndimage.filters.convolve(
+                    image, np.expand_dims(k_shifted, axis=2), mode='mirror'
+                )
                image = image[0::sf, 0::sf, ...]  # nearest downsampling
            image = np.clip(image, 0.0, 1.0)

        elif i == 3:
            # downsample3
-            image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
+            image = cv2.resize(
+                image,
+                (int(1 / sf * a), int(1 / sf * b)),
+                interpolation=random.choice([1, 2, 3]),
+            )
            image = np.clip(image, 0.0, 1.0)

        elif i == 4:
@@ -609,12 +703,19 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
    # add final JPEG compression noise
    image = add_JPEG_noise(image)
    image = util.single2uint(image)
-    example = {"image":image}
+    example = {'image': image}
    return example


 # TODO incase there is a pickle error one needs to replace a += x with a = a + x in add_speckle_noise etc...
-def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patchsize=64, isp_model=None):
+def degradation_bsrgan_plus(
+    img,
+    sf=4,
+    shuffle_prob=0.5,
+    use_sharp=True,
+    lq_patchsize=64,
+    isp_model=None,
+):
    """
    This is an extended degradation model by combining
    the degradation models of BSRGAN and Real-ESRGAN
@@ -630,7 +731,7 @@ def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patc
    """

    h1, w1 = img.shape[:2]
-    img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...]  # mod crop
+    img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...]  # mod crop
    h, w = img.shape[:2]

    if h < lq_patchsize * sf or w < lq_patchsize * sf:
@@ -645,8 +746,12 @@ def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patc
    else:
        shuffle_order = list(range(13))
        # local shuffle for noise, JPEG is always the last one
-        shuffle_order[2:6] = random.sample(shuffle_order[2:6], len(range(2, 6)))
-        shuffle_order[9:13] = random.sample(shuffle_order[9:13], len(range(9, 13)))
+        shuffle_order[2:6] = random.sample(
+            shuffle_order[2:6], len(range(2, 6))
+        )
+        shuffle_order[9:13] = random.sample(
+            shuffle_order[9:13], len(range(9, 13))
+        )

    poisson_prob, speckle_prob, isp_prob = 0.1, 0.1, 0.1

@@ -689,8 +794,11 @@ def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patc
            print('check the shuffle!')

    # resize to desired size
-    img = cv2.resize(img, (int(1 / sf * hq.shape[1]), int(1 / sf * hq.shape[0])),
-                     interpolation=random.choice([1, 2, 3]))
+    img = cv2.resize(
+        img,
+        (int(1 / sf * hq.shape[1]), int(1 / sf * hq.shape[0])),
+        interpolation=random.choice([1, 2, 3]),
+    )

    # add final JPEG compression noise
    img = add_JPEG_noise(img)
@@ -702,29 +810,37 @@ def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patc


 if __name__ == '__main__':
-	print("hey")
-	img = util.imread_uint('utils/test.png', 3)
-	print(img)
-	img = util.uint2single(img)
-	print(img)
-	img = img[:448, :448]
-	h = img.shape[0] // 4
-	print("resizing to", h)
-	sf = 4
-	deg_fn = partial(degradation_bsrgan_variant, sf=sf)
-	for i in range(20):
-		print(i)
-		img_lq = deg_fn(img)
-		print(img_lq)
-		img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img)["image"]
-		print(img_lq.shape)
-		print("bicubic", img_lq_bicubic.shape)
-		print(img_hq.shape)
-		lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
-		                        interpolation=0)
-		lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
-		                        interpolation=0)
-		img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
-		util.imsave(img_concat, str(i) + '.png')
-
-
+    print('hey')
+    img = util.imread_uint('utils/test.png', 3)
+    print(img)
+    img = util.uint2single(img)
+    print(img)
+    img = img[:448, :448]
+    h = img.shape[0] // 4
+    print('resizing to', h)
+    sf = 4
+    deg_fn = partial(degradation_bsrgan_variant, sf=sf)
+    for i in range(20):
+        print(i)
+        img_lq = deg_fn(img)
+        print(img_lq)
+        img_lq_bicubic = albumentations.SmallestMaxSize(
+            max_size=h, interpolation=cv2.INTER_CUBIC
+        )(image=img)['image']
+        print(img_lq.shape)
+        print('bicubic', img_lq_bicubic.shape)
+        print(img_hq.shape)
+        lq_nearest = cv2.resize(
+            util.single2uint(img_lq),
+            (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
+            interpolation=0,
+        )
+        lq_bicubic_nearest = cv2.resize(
+            util.single2uint(img_lq_bicubic),
+            (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
+            interpolation=0,
+        )
+        img_concat = np.concatenate(
+            [lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1
+        )
+        util.imsave(img_concat, str(i) + '.png')
--- a/ldm/modules/image_degradation/bsrgan_light.py
+++ b/ldm/modules/image_degradation/bsrgan_light.py
@@ -27,16 +27,16 @@ import ldm.modules.image_degradation.utils_image as util


 def modcrop_np(img, sf):
-    '''
+    """
    Args:
        img: numpy image, WxH or WxHxC
        sf: scale factor
    Return:
        cropped image
-    '''
+    """
    w, h = img.shape[:2]
    im = np.copy(img)
-    return im[:w - w % sf, :h - h % sf, ...]
+    return im[: w - w % sf, : h - h % sf, ...]


 """
@@ -54,7 +54,9 @@ def analytic_kernel(k):
    # Loop over the small kernel to fill the big one
    for r in range(k_size):
        for c in range(k_size):
-            big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k
+            big_k[2 * r : 2 * r + k_size, 2 * c : 2 * c + k_size] += (
+                k[r, c] * k
+            )
    # Crop the edges of the big kernel to ignore very small values and increase run time of SR
    crop = k_size // 2
    cropped_big_k = big_k[crop:-crop, crop:-crop]
@@ -63,7 +65,7 @@ def analytic_kernel(k):


 def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
-    """ generate an anisotropic Gaussian kernel
+    """generate an anisotropic Gaussian kernel
    Args:
        ksize : e.g., 15, kernel size
        theta : [0,  pi], rotation angle range
@@ -74,7 +76,12 @@ def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6):
        k     : kernel
    """

-    v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.]))
+    v = np.dot(
+        np.array(
+            [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]
+        ),
+        np.array([1.0, 0.0]),
+    )
    V = np.array([[v[0], v[1]], [v[1], -v[0]]])
    D = np.array([[l1, 0], [0, l2]])
    Sigma = np.dot(np.dot(V, D), np.linalg.inv(V))
@@ -126,24 +133,32 @@ def shift_pixel(x, sf, upper_left=True):


 def blur(x, k):
-    '''
+    """
    x: image, NxcxHxW
    k: kernel, Nx1xhxw
-    '''
+    """
    n, c = x.shape[:2]
    p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2
    x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate')
    k = k.repeat(1, c, 1, 1)
    k = k.view(-1, 1, k.shape[2], k.shape[3])
    x = x.view(1, -1, x.shape[2], x.shape[3])
-    x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c)
+    x = torch.nn.functional.conv2d(
+        x, k, bias=None, stride=1, padding=0, groups=n * c
+    )
    x = x.view(n, c, x.shape[2], x.shape[3])

    return x


-def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0):
-    """"
+def gen_kernel(
+    k_size=np.array([15, 15]),
+    scale_factor=np.array([4, 4]),
+    min_var=0.6,
+    max_var=10.0,
+    noise_level=0,
+):
+    """ "
    # modified version of https://github.com/assafshocher/BlindSR_dataset_generator
    # Kai Zhang
    # min_var = 0.175 * sf  # variance of the gaussian kernel will be sampled between min_var and max_var
@@ -157,13 +172,16 @@ def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var

    # Set COV matrix using Lambdas and Theta
    LAMBDA = np.diag([lambda_1, lambda_2])
-    Q = np.array([[np.cos(theta), -np.sin(theta)],
-                  [np.sin(theta), np.cos(theta)]])
+    Q = np.array(
+        [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]
+    )
    SIGMA = Q @ LAMBDA @ Q.T
    INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :]

    # Set expectation position (shifting kernel for aligned image)
-    MU = k_size // 2 - 0.5 * (scale_factor - 1)  # - 0.5 * (scale_factor - k_size % 2)
+    MU = k_size // 2 - 0.5 * (
+        scale_factor - 1
+    )  # - 0.5 * (scale_factor - k_size % 2)
    MU = MU[None, None, :, None]

    # Create meshgrid for Gaussian
@@ -188,7 +206,9 @@ def fspecial_gaussian(hsize, sigma):
    hsize = [hsize, hsize]
    siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0]
    std = sigma
-    [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1))
+    [x, y] = np.meshgrid(
+        np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)
+    )
    arg = -(x * x + y * y) / (2 * std * std)
    h = np.exp(arg)
    h[h < scipy.finfo(float).eps * h.max()] = 0
@@ -208,10 +228,10 @@ def fspecial_laplacian(alpha):


 def fspecial(filter_type, *args, **kwargs):
-    '''
+    """
    python code from:
    https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py
-    '''
+    """
    if filter_type == 'gaussian':
        return fspecial_gaussian(*args, **kwargs)
    if filter_type == 'laplacian':
@@ -226,19 +246,19 @@ def fspecial(filter_type, *args, **kwargs):


 def bicubic_degradation(x, sf=3):
-    '''
+    """
    Args:
        x: HxWxC image, [0, 1]
        sf: down-scale factor
    Return:
        bicubicly downsampled LR image
-    '''
+    """
    x = util.imresize_np(x, scale=1 / sf)
    return x


 def srmd_degradation(x, k, sf=3):
-    ''' blur + bicubic downsampling
+    """blur + bicubic downsampling
    Args:
        x: HxWxC image, [0, 1]
        k: hxw, double
@@ -253,14 +273,16 @@ def srmd_degradation(x, k, sf=3):
          pages={3262--3271},
          year={2018}
        }
-    '''
-    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')  # 'nearest' | 'mirror'
+    """
+    x = ndimage.filters.convolve(
+        x, np.expand_dims(k, axis=2), mode='wrap'
+    )  # 'nearest' | 'mirror'
    x = bicubic_degradation(x, sf=sf)
    return x


 def dpsr_degradation(x, k, sf=3):
-    ''' bicubic downsampling + blur
+    """bicubic downsampling + blur
    Args:
        x: HxWxC image, [0, 1]
        k: hxw, double
@@ -275,21 +297,21 @@ def dpsr_degradation(x, k, sf=3):
          pages={1671--1681},
          year={2019}
        }
-    '''
+    """
    x = bicubic_degradation(x, sf=sf)
    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
    return x


 def classical_degradation(x, k, sf=3):
-    ''' blur + downsampling
+    """blur + downsampling
    Args:
        x: HxWxC image, [0, 1]/[0, 255]
        k: hxw, double
        sf: down-scale factor
    Return:
        downsampled LR image
-    '''
+    """
    x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap')
    # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2))
    st = 0
@@ -326,16 +348,25 @@ def add_blur(img, sf=4):
    wd2 = 4.0 + sf
    wd = 2.0 + 0.2 * sf

-    wd2 = wd2/4
-    wd = wd/4
+    wd2 = wd2 / 4
+    wd = wd / 4

    if random.random() < 0.5:
        l1 = wd2 * random.random()
        l2 = wd2 * random.random()
-        k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2)
+        k = anisotropic_Gaussian(
+            ksize=random.randint(2, 11) + 3,
+            theta=random.random() * np.pi,
+            l1=l1,
+            l2=l2,
+        )
    else:
-        k = fspecial('gaussian', random.randint(2, 4) + 3, wd * random.random())
-    img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror')
+        k = fspecial(
+            'gaussian', random.randint(2, 4) + 3, wd * random.random()
+        )
+    img = ndimage.filters.convolve(
+        img, np.expand_dims(k, axis=2), mode='mirror'
+    )

    return img

@@ -348,7 +379,11 @@ def add_resize(img, sf=4):
        sf1 = random.uniform(0.5 / sf, 1)
    else:
        sf1 = 1.0
-    img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3]))
+    img = cv2.resize(
+        img,
+        (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])),
+        interpolation=random.choice([1, 2, 3]),
+    )
    img = np.clip(img, 0.0, 1.0)

    return img
@@ -370,19 +405,26 @@ def add_resize(img, sf=4):
 #     img = np.clip(img, 0.0, 1.0)
 #     return img

+
 def add_Gaussian_noise(img, noise_level1=2, noise_level2=25):
    noise_level = random.randint(noise_level1, noise_level2)
    rnum = np.random.rand()
    if rnum > 0.6:  # add color Gaussian noise
-        img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
+        img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(
+            np.float32
+        )
    elif rnum < 0.4:  # add grayscale Gaussian noise
-        img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
+        img = img + np.random.normal(
+            0, noise_level / 255.0, (*img.shape[:2], 1)
+        ).astype(np.float32)
    else:  # add  noise
-        L = noise_level2 / 255.
+        L = noise_level2 / 255.0
        D = np.diag(np.random.rand(3))
        U = orth(np.random.rand(3, 3))
        conv = np.dot(np.dot(np.transpose(U), D), U)
-        img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
+        img = img + np.random.multivariate_normal(
+            [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]
+        ).astype(np.float32)
    img = np.clip(img, 0.0, 1.0)
    return img

@@ -392,28 +434,37 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25):
    img = np.clip(img, 0.0, 1.0)
    rnum = random.random()
    if rnum > 0.6:
-        img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32)
+        img += img * np.random.normal(
+            0, noise_level / 255.0, img.shape
+        ).astype(np.float32)
    elif rnum < 0.4:
-        img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32)
+        img += img * np.random.normal(
+            0, noise_level / 255.0, (*img.shape[:2], 1)
+        ).astype(np.float32)
    else:
-        L = noise_level2 / 255.
+        L = noise_level2 / 255.0
        D = np.diag(np.random.rand(3))
        U = orth(np.random.rand(3, 3))
        conv = np.dot(np.dot(np.transpose(U), D), U)
-        img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32)
+        img += img * np.random.multivariate_normal(
+            [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]
+        ).astype(np.float32)
    img = np.clip(img, 0.0, 1.0)
    return img


 def add_Poisson_noise(img):
-    img = np.clip((img * 255.0).round(), 0, 255) / 255.
+    img = np.clip((img * 255.0).round(), 0, 255) / 255.0
    vals = 10 ** (2 * random.random() + 2.0)  # [2, 4]
    if random.random() < 0.5:
        img = np.random.poisson(img * vals).astype(np.float32) / vals
    else:
        img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114])
-        img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.
-        noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray
+        img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.0
+        noise_gray = (
+            np.random.poisson(img_gray * vals).astype(np.float32) / vals
+            - img_gray
+        )
        img += noise_gray[:, :, np.newaxis]
    img = np.clip(img, 0.0, 1.0)
    return img
@@ -422,7 +473,9 @@ def add_Poisson_noise(img):
 def add_JPEG_noise(img):
    quality_factor = random.randint(80, 95)
    img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR)
-    result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor])
+    result, encimg = cv2.imencode(
+        '.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]
+    )
    img = cv2.imdecode(encimg, 1)
    img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB)
    return img
@@ -432,10 +485,14 @@ def random_crop(lq, hq, sf=4, lq_patchsize=64):
    h, w = lq.shape[:2]
    rnd_h = random.randint(0, h - lq_patchsize)
    rnd_w = random.randint(0, w - lq_patchsize)
-    lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :]
+    lq = lq[rnd_h : rnd_h + lq_patchsize, rnd_w : rnd_w + lq_patchsize, :]

    rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf)
-    hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :]
+    hq = hq[
+        rnd_h_H : rnd_h_H + lq_patchsize * sf,
+        rnd_w_H : rnd_w_H + lq_patchsize * sf,
+        :,
+    ]
    return lq, hq


@@ -456,7 +513,7 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
    sf_ori = sf

    h1, w1 = img.shape[:2]
-    img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...]  # mod crop
+    img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...]  # mod crop
    h, w = img.shape[:2]

    if h < lq_patchsize * sf or w < lq_patchsize * sf:
@@ -466,8 +523,11 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):

    if sf == 4 and random.random() < scale2_prob:  # downsample1
        if np.random.rand() < 0.5:
-            img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
-                             interpolation=random.choice([1, 2, 3]))
+            img = cv2.resize(
+                img,
+                (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])),
+                interpolation=random.choice([1, 2, 3]),
+            )
        else:
            img = util.imresize_np(img, 1 / 2, True)
        img = np.clip(img, 0.0, 1.0)
@@ -476,7 +536,10 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
    shuffle_order = random.sample(range(7), 7)
    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
    if idx1 > idx2:  # keep downsample3 last
-        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
+        shuffle_order[idx1], shuffle_order[idx2] = (
+            shuffle_order[idx2],
+            shuffle_order[idx1],
+        )

    for i in shuffle_order:

@@ -491,19 +554,30 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None):
            # downsample2
            if random.random() < 0.75:
                sf1 = random.uniform(1, 2 * sf)
-                img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
-                                 interpolation=random.choice([1, 2, 3]))
+                img = cv2.resize(
+                    img,
+                    (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])),
+                    interpolation=random.choice([1, 2, 3]),
+                )
            else:
                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                k_shifted = shift_pixel(k, sf)
-                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
-                img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror')
+                k_shifted = (
+                    k_shifted / k_shifted.sum()
+                )  # blur with shifted kernel
+                img = ndimage.filters.convolve(
+                    img, np.expand_dims(k_shifted, axis=2), mode='mirror'
+                )
                img = img[0::sf, 0::sf, ...]  # nearest downsampling
            img = np.clip(img, 0.0, 1.0)

        elif i == 3:
            # downsample3
-            img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
+            img = cv2.resize(
+                img,
+                (int(1 / sf * a), int(1 / sf * b)),
+                interpolation=random.choice([1, 2, 3]),
+            )
            img = np.clip(img, 0.0, 1.0)

        elif i == 4:
@@ -548,15 +622,18 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
    sf_ori = sf

    h1, w1 = image.shape[:2]
-    image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...]  # mod crop
+    image = image.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...]  # mod crop
    h, w = image.shape[:2]

    hq = image.copy()

    if sf == 4 and random.random() < scale2_prob:  # downsample1
        if np.random.rand() < 0.5:
-            image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
-                               interpolation=random.choice([1, 2, 3]))
+            image = cv2.resize(
+                image,
+                (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])),
+                interpolation=random.choice([1, 2, 3]),
+            )
        else:
            image = util.imresize_np(image, 1 / 2, True)
        image = np.clip(image, 0.0, 1.0)
@@ -565,7 +642,10 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
    shuffle_order = random.sample(range(7), 7)
    idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3)
    if idx1 > idx2:  # keep downsample3 last
-        shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1]
+        shuffle_order[idx1], shuffle_order[idx2] = (
+            shuffle_order[idx2],
+            shuffle_order[idx1],
+        )

    for i in shuffle_order:

@@ -583,20 +663,34 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
            # downsample2
            if random.random() < 0.8:
                sf1 = random.uniform(1, 2 * sf)
-                image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])),
-                                   interpolation=random.choice([1, 2, 3]))
+                image = cv2.resize(
+                    image,
+                    (
+                        int(1 / sf1 * image.shape[1]),
+                        int(1 / sf1 * image.shape[0]),
+                    ),
+                    interpolation=random.choice([1, 2, 3]),
+                )
            else:
                k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf))
                k_shifted = shift_pixel(k, sf)
-                k_shifted = k_shifted / k_shifted.sum()  # blur with shifted kernel
-                image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror')
+                k_shifted = (
+                    k_shifted / k_shifted.sum()
+                )  # blur with shifted kernel
+                image = ndimage.filters.convolve(
+                    image, np.expand_dims(k_shifted, axis=2), mode='mirror'
+                )
                image = image[0::sf, 0::sf, ...]  # nearest downsampling

            image = np.clip(image, 0.0, 1.0)

        elif i == 3:
            # downsample3
-            image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3]))
+            image = cv2.resize(
+                image,
+                (int(1 / sf * a), int(1 / sf * b)),
+                interpolation=random.choice([1, 2, 3]),
+            )
            image = np.clip(image, 0.0, 1.0)

        elif i == 4:
@@ -617,34 +711,41 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
    # add final JPEG compression noise
    image = add_JPEG_noise(image)
    image = util.single2uint(image)
-    example = {"image": image}
+    example = {'image': image}
    return example


-
-
 if __name__ == '__main__':
-    print("hey")
+    print('hey')
    img = util.imread_uint('utils/test.png', 3)
    img = img[:448, :448]
    h = img.shape[0] // 4
-    print("resizing to", h)
+    print('resizing to', h)
    sf = 4
    deg_fn = partial(degradation_bsrgan_variant, sf=sf)
    for i in range(20):
        print(i)
        img_hq = img
-        img_lq = deg_fn(img)["image"]
+        img_lq = deg_fn(img)['image']
        img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq)
        print(img_lq)
-        img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"]
+        img_lq_bicubic = albumentations.SmallestMaxSize(
+            max_size=h, interpolation=cv2.INTER_CUBIC
+        )(image=img_hq)['image']
        print(img_lq.shape)
-        print("bicubic", img_lq_bicubic.shape)
+        print('bicubic', img_lq_bicubic.shape)
        print(img_hq.shape)
-        lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
-                                interpolation=0)
-        lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic),
-                                        (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
-                                        interpolation=0)
-        img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1)
+        lq_nearest = cv2.resize(
+            util.single2uint(img_lq),
+            (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
+            interpolation=0,
+        )
+        lq_bicubic_nearest = cv2.resize(
+            util.single2uint(img_lq_bicubic),
+            (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])),
+            interpolation=0,
+        )
+        img_concat = np.concatenate(
+            [lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1
+        )
        util.imsave(img_concat, str(i) + '.png')
--- a/ldm/modules/image_degradation/utils_image.py
+++ b/ldm/modules/image_degradation/utils_image.py
@@ -6,13 +6,14 @@ import torch
 import cv2
 from torchvision.utils import make_grid
 from datetime import datetime
-#import matplotlib.pyplot as plt   # TODO: check with Dominik, also bsrgan.py vs bsrgan_light.py
+
+# import matplotlib.pyplot as plt   # TODO: check with Dominik, also bsrgan.py vs bsrgan_light.py


-os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
+os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'


-'''
+"""
 # --------------------------------------------
 # Kai Zhang (github: https://github.com/cszn)
 # 03/Mar/2019
@@ -20,10 +21,22 @@ os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
 # https://github.com/twhui/SRGAN-pyTorch
 # https://github.com/xinntao/BasicSR
 # --------------------------------------------
-'''
+"""


-IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', '.tif']
+IMG_EXTENSIONS = [
+    '.jpg',
+    '.JPG',
+    '.jpeg',
+    '.JPEG',
+    '.png',
+    '.PNG',
+    '.ppm',
+    '.PPM',
+    '.bmp',
+    '.BMP',
+    '.tif',
+]


 def is_image_file(filename):
@@ -49,19 +62,19 @@ def surf(Z, cmap='rainbow', figsize=None):
    ax3 = plt.axes(projection='3d')

    w, h = Z.shape[:2]
-    xx = np.arange(0,w,1)
-    yy = np.arange(0,h,1)
+    xx = np.arange(0, w, 1)
+    yy = np.arange(0, h, 1)
    X, Y = np.meshgrid(xx, yy)
-    ax3.plot_surface(X,Y,Z,cmap=cmap)
-    #ax3.contour(X,Y,Z, zdim='z',offset=-2，cmap=cmap)
+    ax3.plot_surface(X, Y, Z, cmap=cmap)
+    # ax3.contour(X,Y,Z, zdim='z',offset=-2，cmap=cmap)
    plt.show()


-'''
+"""
 # --------------------------------------------
 # get image pathes
 # --------------------------------------------
-'''
+"""


 def get_image_paths(dataroot):
@@ -83,26 +96,26 @@ def _get_paths_from_images(path):
    return images


-'''
+"""
 # --------------------------------------------
 # split large images into small images 
 # --------------------------------------------
-'''
+"""


 def patches_from_image(img, p_size=512, p_overlap=64, p_max=800):
    w, h = img.shape[:2]
    patches = []
    if w > p_max and h > p_max:
-        w1 = list(np.arange(0, w-p_size, p_size-p_overlap, dtype=np.int))
-        h1 = list(np.arange(0, h-p_size, p_size-p_overlap, dtype=np.int))
-        w1.append(w-p_size)
-        h1.append(h-p_size)
-#        print(w1)
-#        print(h1)
+        w1 = list(np.arange(0, w - p_size, p_size - p_overlap, dtype=np.int))
+        h1 = list(np.arange(0, h - p_size, p_size - p_overlap, dtype=np.int))
+        w1.append(w - p_size)
+        h1.append(h - p_size)
+        #        print(w1)
+        #        print(h1)
        for i in w1:
            for j in h1:
-                patches.append(img[i:i+p_size, j:j+p_size,:])
+                patches.append(img[i : i + p_size, j : j + p_size, :])
    else:
        patches.append(img)

@@ -118,11 +131,21 @@ def imssave(imgs, img_path):
    for i, img in enumerate(imgs):
        if img.ndim == 3:
            img = img[:, :, [2, 1, 0]]
-        new_path = os.path.join(os.path.dirname(img_path), img_name+str('_s{:04d}'.format(i))+'.png')
+        new_path = os.path.join(
+            os.path.dirname(img_path),
+            img_name + str('_s{:04d}'.format(i)) + '.png',
+        )
        cv2.imwrite(new_path, img)


-def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800, p_overlap=96, p_max=1000):
+def split_imageset(
+    original_dataroot,
+    taget_dataroot,
+    n_channels=3,
+    p_size=800,
+    p_overlap=96,
+    p_max=1000,
+):
    """
    split the large images from original_dataroot into small overlapped images with size (p_size)x(p_size),
    and save them into taget_dataroot; only the images with larger size than (p_max)x(p_max)
@@ -139,15 +162,18 @@ def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800,
        # img_name, ext = os.path.splitext(os.path.basename(img_path))
        img = imread_uint(img_path, n_channels=n_channels)
        patches = patches_from_image(img, p_size, p_overlap, p_max)
-        imssave(patches, os.path.join(taget_dataroot,os.path.basename(img_path)))
-        #if original_dataroot == taget_dataroot:
-        #del img_path
+        imssave(
+            patches, os.path.join(taget_dataroot, os.path.basename(img_path))
+        )
+        # if original_dataroot == taget_dataroot:
+        # del img_path

-'''
+
+"""
 # --------------------------------------------
 # makedir
 # --------------------------------------------
-'''
+"""


 def mkdir(path):
@@ -171,12 +197,12 @@ def mkdir_and_rename(path):
    os.makedirs(path)


-'''
+"""
 # --------------------------------------------
 # read image from path
 # opencv is fast, but read BGR numpy image
 # --------------------------------------------
-'''
+"""


 # --------------------------------------------
@@ -206,6 +232,7 @@ def imsave(img, img_path):
        img = img[:, :, [2, 1, 0]]
    cv2.imwrite(img_path, img)

+
 def imwrite(img, img_path):
    img = np.squeeze(img)
    if img.ndim == 3:
@@ -213,7 +240,6 @@ def imwrite(img, img_path):
    cv2.imwrite(img_path, img)


-
 # --------------------------------------------
 # get single image of size HxWxn_channles (BGR)
 # --------------------------------------------
@@ -221,7 +247,7 @@ def read_img(path):
    # read image by cv2
    # return: Numpy float32, HWC, BGR, [0,1]
    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)  # cv2.IMREAD_GRAYSCALE
-    img = img.astype(np.float32) / 255.
+    img = img.astype(np.float32) / 255.0
    if img.ndim == 2:
        img = np.expand_dims(img, axis=2)
    # some images have 4 channels
@@ -230,7 +256,7 @@ def read_img(path):
    return img


-'''
+"""
 # --------------------------------------------
 # image format conversion
 # --------------------------------------------
@@ -238,7 +264,7 @@ def read_img(path):
 # numpy(single) <--->  tensor
 # numpy(unit)   <--->  tensor
 # --------------------------------------------
-'''
+"""


 # --------------------------------------------
@@ -248,22 +274,22 @@ def read_img(path):

 def uint2single(img):

-    return np.float32(img/255.)
+    return np.float32(img / 255.0)


 def single2uint(img):

-    return np.uint8((img.clip(0, 1)*255.).round())
+    return np.uint8((img.clip(0, 1) * 255.0).round())


 def uint162single(img):

-    return np.float32(img/65535.)
+    return np.float32(img / 65535.0)


 def single2uint16(img):

-    return np.uint16((img.clip(0, 1)*65535.).round())
+    return np.uint16((img.clip(0, 1) * 65535.0).round())


 # --------------------------------------------
@@ -275,14 +301,25 @@ def single2uint16(img):
 def uint2tensor4(img):
    if img.ndim == 2:
        img = np.expand_dims(img, axis=2)
-    return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.).unsqueeze(0)
+    return (
+        torch.from_numpy(np.ascontiguousarray(img))
+        .permute(2, 0, 1)
+        .float()
+        .div(255.0)
+        .unsqueeze(0)
+    )


 # convert uint to 3-dimensional torch tensor
 def uint2tensor3(img):
    if img.ndim == 2:
        img = np.expand_dims(img, axis=2)
-    return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.)
+    return (
+        torch.from_numpy(np.ascontiguousarray(img))
+        .permute(2, 0, 1)
+        .float()
+        .div(255.0)
+    )


 # convert 2/3/4-dimensional torch tensor to uint
@@ -290,7 +327,7 @@ def tensor2uint(img):
    img = img.data.squeeze().float().clamp_(0, 1).cpu().numpy()
    if img.ndim == 3:
        img = np.transpose(img, (1, 2, 0))
-    return np.uint8((img*255.0).round())
+    return np.uint8((img * 255.0).round())


 # --------------------------------------------
@@ -305,7 +342,12 @@ def single2tensor3(img):

 # convert single (HxWxC) to 4-dimensional torch tensor
 def single2tensor4(img):
-    return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().unsqueeze(0)
+    return (
+        torch.from_numpy(np.ascontiguousarray(img))
+        .permute(2, 0, 1)
+        .float()
+        .unsqueeze(0)
+    )


 # convert torch tensor to single
@@ -316,6 +358,7 @@ def tensor2single(img):

    return img

+
 # convert torch tensor to single
 def tensor2single3(img):
    img = img.data.squeeze().float().cpu().numpy()
@@ -327,30 +370,48 @@ def tensor2single3(img):


 def single2tensor5(img):
-    return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float().unsqueeze(0)
+    return (
+        torch.from_numpy(np.ascontiguousarray(img))
+        .permute(2, 0, 1, 3)
+        .float()
+        .unsqueeze(0)
+    )


 def single32tensor5(img):
-    return torch.from_numpy(np.ascontiguousarray(img)).float().unsqueeze(0).unsqueeze(0)
+    return (
+        torch.from_numpy(np.ascontiguousarray(img))
+        .float()
+        .unsqueeze(0)
+        .unsqueeze(0)
+    )


 def single42tensor4(img):
-    return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float()
+    return (
+        torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float()
+    )


 # from skimage.io import imread, imsave
 def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
-    '''
+    """
    Converts a torch Tensor into an image Numpy array of BGR channel order
    Input: 4D(B,(3/1),H,W), 3D(C,H,W), or 2D(H,W), any range, RGB channel order
    Output: 3D(H,W,C) or 2D(H,W), [0,255], np.uint8 (default)
-    '''
-    tensor = tensor.squeeze().float().cpu().clamp_(*min_max)  # squeeze first, then clamp
-    tensor = (tensor - min_max[0]) / (min_max[1] - min_max[0])  # to range [0,1]
+    """
+    tensor = (
+        tensor.squeeze().float().cpu().clamp_(*min_max)
+    )  # squeeze first, then clamp
+    tensor = (tensor - min_max[0]) / (
+        min_max[1] - min_max[0]
+    )  # to range [0,1]
    n_dim = tensor.dim()
    if n_dim == 4:
        n_img = len(tensor)
-        img_np = make_grid(tensor, nrow=int(math.sqrt(n_img)), normalize=False).numpy()
+        img_np = make_grid(
+            tensor, nrow=int(math.sqrt(n_img)), normalize=False
+        ).numpy()
        img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0))  # HWC, BGR
    elif n_dim == 3:
        img_np = tensor.numpy()
@@ -359,14 +420,17 @@ def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
        img_np = tensor.numpy()
    else:
        raise TypeError(
-            'Only support 4D, 3D and 2D tensor. But received with dimension: {:d}'.format(n_dim))
+            'Only support 4D, 3D and 2D tensor. But received with dimension: {:d}'.format(
+                n_dim
+            )
+        )
    if out_type == np.uint8:
        img_np = (img_np * 255.0).round()
        # Important. Unlike matlab, numpy.unit8() WILL NOT round by default.
    return img_np.astype(out_type)


-'''
+"""
 # --------------------------------------------
 # Augmentation, flipe and/or rotate
 # --------------------------------------------
@@ -374,12 +438,11 @@ def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)):
 # (1) augmet_img: numpy image of WxHxC or WxH
 # (2) augment_img_tensor4: tensor image 1xCxWxH
 # --------------------------------------------
-'''
+"""


 def augment_img(img, mode=0):
-    '''Kai Zhang (github: https://github.com/cszn)
-    '''
+    """Kai Zhang (github: https://github.com/cszn)"""
    if mode == 0:
        return img
    elif mode == 1:
@@ -399,8 +462,7 @@ def augment_img(img, mode=0):


 def augment_img_tensor4(img, mode=0):
-    '''Kai Zhang (github: https://github.com/cszn)
-    '''
+    """Kai Zhang (github: https://github.com/cszn)"""
    if mode == 0:
        return img
    elif mode == 1:
@@ -420,8 +482,7 @@ def augment_img_tensor4(img, mode=0):


 def augment_img_tensor(img, mode=0):
-    '''Kai Zhang (github: https://github.com/cszn)
-    '''
+    """Kai Zhang (github: https://github.com/cszn)"""
    img_size = img.size()
    img_np = img.data.cpu().numpy()
    if len(img_size) == 3:
@@ -484,11 +545,11 @@ def augment_imgs(img_list, hflip=True, rot=True):
    return [_augment(img) for img in img_list]


-'''
+"""
 # --------------------------------------------
 # modcrop and shave
 # --------------------------------------------
-'''
+"""


 def modcrop(img_in, scale):
@@ -497,11 +558,11 @@ def modcrop(img_in, scale):
    if img.ndim == 2:
        H, W = img.shape
        H_r, W_r = H % scale, W % scale
-        img = img[:H - H_r, :W - W_r]
+        img = img[: H - H_r, : W - W_r]
    elif img.ndim == 3:
        H, W, C = img.shape
        H_r, W_r = H % scale, W % scale
-        img = img[:H - H_r, :W - W_r, :]
+        img = img[: H - H_r, : W - W_r, :]
    else:
        raise ValueError('Wrong img ndim: [{:d}].'.format(img.ndim))
    return img
@@ -511,11 +572,11 @@ def shave(img_in, border=0):
    # img_in: Numpy, HWC or HW
    img = np.copy(img_in)
    h, w = img.shape[:2]
-    img = img[border:h-border, border:w-border]
+    img = img[border : h - border, border : w - border]
    return img


-'''
+"""
 # --------------------------------------------
 # image processing process on numpy image
 # channel_convert(in_c, tar_type, img_list):
@@ -523,74 +584,92 @@ def shave(img_in, border=0):
 # bgr2ycbcr(img, only_y=True):
 # ycbcr2rgb(img):
 # --------------------------------------------
-'''
+"""


 def rgb2ycbcr(img, only_y=True):
-    '''same as matlab rgb2ycbcr
+    """same as matlab rgb2ycbcr
    only_y: only return Y channel
    Input:
        uint8, [0, 255]
        float, [0, 1]
-    '''
+    """
    in_img_type = img.dtype
    img.astype(np.float32)
    if in_img_type != np.uint8:
-        img *= 255.
+        img *= 255.0
    # convert
    if only_y:
        rlt = np.dot(img, [65.481, 128.553, 24.966]) / 255.0 + 16.0
    else:
-        rlt = np.matmul(img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
-                              [24.966, 112.0, -18.214]]) / 255.0 + [16, 128, 128]
+        rlt = np.matmul(
+            img,
+            [
+                [65.481, -37.797, 112.0],
+                [128.553, -74.203, -93.786],
+                [24.966, 112.0, -18.214],
+            ],
+        ) / 255.0 + [16, 128, 128]
    if in_img_type == np.uint8:
        rlt = rlt.round()
    else:
-        rlt /= 255.
+        rlt /= 255.0
    return rlt.astype(in_img_type)


 def ycbcr2rgb(img):
-    '''same as matlab ycbcr2rgb
+    """same as matlab ycbcr2rgb
    Input:
        uint8, [0, 255]
        float, [0, 1]
-    '''
+    """
    in_img_type = img.dtype
    img.astype(np.float32)
    if in_img_type != np.uint8:
-        img *= 255.
+        img *= 255.0
    # convert
-    rlt = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], [0, -0.00153632, 0.00791071],
-                          [0.00625893, -0.00318811, 0]]) * 255.0 + [-222.921, 135.576, -276.836]
+    rlt = np.matmul(
+        img,
+        [
+            [0.00456621, 0.00456621, 0.00456621],
+            [0, -0.00153632, 0.00791071],
+            [0.00625893, -0.00318811, 0],
+        ],
+    ) * 255.0 + [-222.921, 135.576, -276.836]
    if in_img_type == np.uint8:
        rlt = rlt.round()
    else:
-        rlt /= 255.
+        rlt /= 255.0
    return rlt.astype(in_img_type)


 def bgr2ycbcr(img, only_y=True):
-    '''bgr version of rgb2ycbcr
+    """bgr version of rgb2ycbcr
    only_y: only return Y channel
    Input:
        uint8, [0, 255]
        float, [0, 1]
-    '''
+    """
    in_img_type = img.dtype
    img.astype(np.float32)
    if in_img_type != np.uint8:
-        img *= 255.
+        img *= 255.0
    # convert
    if only_y:
        rlt = np.dot(img, [24.966, 128.553, 65.481]) / 255.0 + 16.0
    else:
-        rlt = np.matmul(img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
-                              [65.481, -37.797, 112.0]]) / 255.0 + [16, 128, 128]
+        rlt = np.matmul(
+            img,
+            [
+                [24.966, 112.0, -18.214],
+                [128.553, -74.203, -93.786],
+                [65.481, -37.797, 112.0],
+            ],
+        ) / 255.0 + [16, 128, 128]
    if in_img_type == np.uint8:
        rlt = rlt.round()
    else:
-        rlt /= 255.
+        rlt /= 255.0
    return rlt.astype(in_img_type)


@@ -608,11 +687,11 @@ def channel_convert(in_c, tar_type, img_list):
        return img_list


-'''
+"""
 # --------------------------------------------
 # metric, PSNR and SSIM
 # --------------------------------------------
-'''
+"""


 # --------------------------------------------
@@ -620,17 +699,17 @@ def channel_convert(in_c, tar_type, img_list):
 # --------------------------------------------
 def calculate_psnr(img1, img2, border=0):
    # img1 and img2 have range [0, 255]
-    #img1 = img1.squeeze()
-    #img2 = img2.squeeze()
+    # img1 = img1.squeeze()
+    # img2 = img2.squeeze()
    if not img1.shape == img2.shape:
        raise ValueError('Input images must have the same dimensions.')
    h, w = img1.shape[:2]
-    img1 = img1[border:h-border, border:w-border]
-    img2 = img2[border:h-border, border:w-border]
+    img1 = img1[border : h - border, border : w - border]
+    img2 = img2[border : h - border, border : w - border]

    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)
-    mse = np.mean((img1 - img2)**2)
+    mse = np.mean((img1 - img2) ** 2)
    if mse == 0:
        return float('inf')
    return 20 * math.log10(255.0 / math.sqrt(mse))
@@ -640,17 +719,17 @@ def calculate_psnr(img1, img2, border=0):
 # SSIM
 # --------------------------------------------
 def calculate_ssim(img1, img2, border=0):
-    '''calculate SSIM
+    """calculate SSIM
    the same outputs as MATLAB's
    img1, img2: [0, 255]
-    '''
-    #img1 = img1.squeeze()
-    #img2 = img2.squeeze()
+    """
+    # img1 = img1.squeeze()
+    # img2 = img2.squeeze()
    if not img1.shape == img2.shape:
        raise ValueError('Input images must have the same dimensions.')
    h, w = img1.shape[:2]
-    img1 = img1[border:h-border, border:w-border]
-    img2 = img2[border:h-border, border:w-border]
+    img1 = img1[border : h - border, border : w - border]
+    img2 = img2[border : h - border, border : w - border]

    if img1.ndim == 2:
        return ssim(img1, img2)
@@ -658,7 +737,7 @@ def calculate_ssim(img1, img2, border=0):
        if img1.shape[2] == 3:
            ssims = []
            for i in range(3):
-                ssims.append(ssim(img1[:,:,i], img2[:,:,i]))
+                ssims.append(ssim(img1[:, :, i], img2[:, :, i]))
            return np.array(ssims).mean()
        elif img1.shape[2] == 1:
            return ssim(np.squeeze(img1), np.squeeze(img2))
@@ -667,8 +746,8 @@ def calculate_ssim(img1, img2, border=0):


 def ssim(img1, img2):
-    C1 = (0.01 * 255)**2
-    C2 = (0.03 * 255)**2
+    C1 = (0.01 * 255) ** 2
+    C2 = (0.03 * 255) ** 2

    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)
@@ -684,16 +763,17 @@ def ssim(img1, img2):
    sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
    sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2

-    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) *
-                                                            (sigma1_sq + sigma2_sq + C2))
+    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / (
+        (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
+    )
    return ssim_map.mean()


-'''
+"""
 # --------------------------------------------
 # matlab's bicubic imresize (numpy and torch) [0, 1]
 # --------------------------------------------
-'''
+"""


 # matlab 'imresize' function, now only support 'bicubic'
@@ -701,11 +781,14 @@ def cubic(x):
    absx = torch.abs(x)
    absx2 = absx**2
    absx3 = absx**3
-    return (1.5*absx3 - 2.5*absx2 + 1) * ((absx <= 1).type_as(absx)) + \
-        (-0.5*absx3 + 2.5*absx2 - 4*absx + 2) * (((absx > 1)*(absx <= 2)).type_as(absx))
+    return (1.5 * absx3 - 2.5 * absx2 + 1) * ((absx <= 1).type_as(absx)) + (
+        -0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2
+    ) * (((absx > 1) * (absx <= 2)).type_as(absx))


-def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing):
+def calculate_weights_indices(
+    in_length, out_length, scale, kernel, kernel_width, antialiasing
+):
    if (scale < 1) and (antialiasing):
        # Use a modified kernel to simultaneously interpolate and antialias- larger kernel width
        kernel_width = kernel_width / scale
@@ -729,8 +812,9 @@ def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width

    # The indices of the input pixels involved in computing the k-th output
    # pixel are in row k of the indices matrix.
-    indices = left.view(out_length, 1).expand(out_length, P) + torch.linspace(0, P - 1, P).view(
-        1, P).expand(out_length, P)
+    indices = left.view(out_length, 1).expand(out_length, P) + torch.linspace(
+        0, P - 1, P
+    ).view(1, P).expand(out_length, P)

    # The weights used to compute the k-th output pixel are in row k of the
    # weights matrix.
@@ -771,7 +855,11 @@ def imresize(img, scale, antialiasing=True):
    if need_squeeze:
        img.unsqueeze_(0)
    in_C, in_H, in_W = img.size()
-    out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
+    out_C, out_H, out_W = (
+        in_C,
+        math.ceil(in_H * scale),
+        math.ceil(in_W * scale),
+    )
    kernel_width = 4
    kernel = 'cubic'

@@ -782,9 +870,11 @@ def imresize(img, scale, antialiasing=True):

    # get weights and indices
    weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
-        in_H, out_H, scale, kernel, kernel_width, antialiasing)
+        in_H, out_H, scale, kernel, kernel_width, antialiasing
+    )
    weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
-        in_W, out_W, scale, kernel, kernel_width, antialiasing)
+        in_W, out_W, scale, kernel, kernel_width, antialiasing
+    )
    # process H dimension
    # symmetric copying
    img_aug = torch.FloatTensor(in_C, in_H + sym_len_Hs + sym_len_He, in_W)
@@ -805,7 +895,11 @@ def imresize(img, scale, antialiasing=True):
    for i in range(out_H):
        idx = int(indices_H[i][0])
        for j in range(out_C):
-            out_1[j, i, :] = img_aug[j, idx:idx + kernel_width, :].transpose(0, 1).mv(weights_H[i])
+            out_1[j, i, :] = (
+                img_aug[j, idx : idx + kernel_width, :]
+                .transpose(0, 1)
+                .mv(weights_H[i])
+            )

    # process W dimension
    # symmetric copying
@@ -827,7 +921,9 @@ def imresize(img, scale, antialiasing=True):
    for i in range(out_W):
        idx = int(indices_W[i][0])
        for j in range(out_C):
-            out_2[j, :, i] = out_1_aug[j, :, idx:idx + kernel_width].mv(weights_W[i])
+            out_2[j, :, i] = out_1_aug[j, :, idx : idx + kernel_width].mv(
+                weights_W[i]
+            )
    if need_squeeze:
        out_2.squeeze_()
    return out_2
@@ -846,7 +942,11 @@ def imresize_np(img, scale, antialiasing=True):
        img.unsqueeze_(2)

    in_H, in_W, in_C = img.size()
-    out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale)
+    out_C, out_H, out_W = (
+        in_C,
+        math.ceil(in_H * scale),
+        math.ceil(in_W * scale),
+    )
    kernel_width = 4
    kernel = 'cubic'

@@ -857,9 +957,11 @@ def imresize_np(img, scale, antialiasing=True):

    # get weights and indices
    weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices(
-        in_H, out_H, scale, kernel, kernel_width, antialiasing)
+        in_H, out_H, scale, kernel, kernel_width, antialiasing
+    )
    weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices(
-        in_W, out_W, scale, kernel, kernel_width, antialiasing)
+        in_W, out_W, scale, kernel, kernel_width, antialiasing
+    )
    # process H dimension
    # symmetric copying
    img_aug = torch.FloatTensor(in_H + sym_len_Hs + sym_len_He, in_W, in_C)
@@ -880,7 +982,11 @@ def imresize_np(img, scale, antialiasing=True):
    for i in range(out_H):
        idx = int(indices_H[i][0])
        for j in range(out_C):
-            out_1[i, :, j] = img_aug[idx:idx + kernel_width, :, j].transpose(0, 1).mv(weights_H[i])
+            out_1[i, :, j] = (
+                img_aug[idx : idx + kernel_width, :, j]
+                .transpose(0, 1)
+                .mv(weights_H[i])
+            )

    # process W dimension
    # symmetric copying
@@ -902,7 +1008,9 @@ def imresize_np(img, scale, antialiasing=True):
    for i in range(out_W):
        idx = int(indices_W[i][0])
        for j in range(out_C):
-            out_2[:, i, j] = out_1_aug[:, idx:idx + kernel_width, j].mv(weights_W[i])
+            out_2[:, i, j] = out_1_aug[:, idx : idx + kernel_width, j].mv(
+                weights_W[i]
+            )
    if need_squeeze:
        out_2.squeeze_()

@@ -913,4 +1021,4 @@ if __name__ == '__main__':
    print('---')
 #    img = imread_uint('test.bmp', 3)
 #    img = uint2single(img)
-#    img_bicubic = imresize_np(img, 1/4)
+#    img_bicubic = imresize_np(img, 1/4)
--- a/ldm/modules/losses/__init__.py
+++ b/ldm/modules/losses/__init__.py
@@ -1 +1 @@
-from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator
+from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator
--- a/ldm/modules/losses/contperceptual.py
+++ b/ldm/modules/losses/contperceptual.py
@@ -5,13 +5,24 @@ from taming.modules.losses.vqperceptual import *  # TODO: taming dependency yes/


 class LPIPSWithDiscriminator(nn.Module):
-    def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0,
-                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
-                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
-                 disc_loss="hinge"):
+    def __init__(
+        self,
+        disc_start,
+        logvar_init=0.0,
+        kl_weight=1.0,
+        pixelloss_weight=1.0,
+        disc_num_layers=3,
+        disc_in_channels=3,
+        disc_factor=1.0,
+        disc_weight=1.0,
+        perceptual_weight=1.0,
+        use_actnorm=False,
+        disc_conditional=False,
+        disc_loss='hinge',
+    ):

        super().__init__()
-        assert disc_loss in ["hinge", "vanilla"]
+        assert disc_loss in ['hinge', 'vanilla']
        self.kl_weight = kl_weight
        self.pixel_weight = pixelloss_weight
        self.perceptual_loss = LPIPS().eval()
@@ -19,42 +30,68 @@ class LPIPSWithDiscriminator(nn.Module):
        # output log variance
        self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init)

-        self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
-                                                 n_layers=disc_num_layers,
-                                                 use_actnorm=use_actnorm
-                                                 ).apply(weights_init)
+        self.discriminator = NLayerDiscriminator(
+            input_nc=disc_in_channels,
+            n_layers=disc_num_layers,
+            use_actnorm=use_actnorm,
+        ).apply(weights_init)
        self.discriminator_iter_start = disc_start
-        self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss
+        self.disc_loss = (
+            hinge_d_loss if disc_loss == 'hinge' else vanilla_d_loss
+        )
        self.disc_factor = disc_factor
        self.discriminator_weight = disc_weight
        self.disc_conditional = disc_conditional

    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
        if last_layer is not None:
-            nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
-            g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
+            nll_grads = torch.autograd.grad(
+                nll_loss, last_layer, retain_graph=True
+            )[0]
+            g_grads = torch.autograd.grad(
+                g_loss, last_layer, retain_graph=True
+            )[0]
        else:
-            nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
-            g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
+            nll_grads = torch.autograd.grad(
+                nll_loss, self.last_layer[0], retain_graph=True
+            )[0]
+            g_grads = torch.autograd.grad(
+                g_loss, self.last_layer[0], retain_graph=True
+            )[0]

        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
        d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
        d_weight = d_weight * self.discriminator_weight
        return d_weight

-    def forward(self, inputs, reconstructions, posteriors, optimizer_idx,
-                global_step, last_layer=None, cond=None, split="train",
-                weights=None):
-        rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
+    def forward(
+        self,
+        inputs,
+        reconstructions,
+        posteriors,
+        optimizer_idx,
+        global_step,
+        last_layer=None,
+        cond=None,
+        split='train',
+        weights=None,
+    ):
+        rec_loss = torch.abs(
+            inputs.contiguous() - reconstructions.contiguous()
+        )
        if self.perceptual_weight > 0:
-            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
+            p_loss = self.perceptual_loss(
+                inputs.contiguous(), reconstructions.contiguous()
+            )
            rec_loss = rec_loss + self.perceptual_weight * p_loss

        nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
        weighted_nll_loss = nll_loss
        if weights is not None:
-            weighted_nll_loss = weights*nll_loss
-        weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
+            weighted_nll_loss = weights * nll_loss
+        weighted_nll_loss = (
+            torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
+        )
        nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
        kl_loss = posteriors.kl()
        kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
@@ -67,45 +104,72 @@ class LPIPSWithDiscriminator(nn.Module):
                logits_fake = self.discriminator(reconstructions.contiguous())
            else:
                assert self.disc_conditional
-                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
+                logits_fake = self.discriminator(
+                    torch.cat((reconstructions.contiguous(), cond), dim=1)
+                )
            g_loss = -torch.mean(logits_fake)

            if self.disc_factor > 0.0:
                try:
-                    d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
+                    d_weight = self.calculate_adaptive_weight(
+                        nll_loss, g_loss, last_layer=last_layer
+                    )
                except RuntimeError:
                    assert not self.training
                    d_weight = torch.tensor(0.0)
            else:
                d_weight = torch.tensor(0.0)

-            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
-            loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss
+            disc_factor = adopt_weight(
+                self.disc_factor,
+                global_step,
+                threshold=self.discriminator_iter_start,
+            )
+            loss = (
+                weighted_nll_loss
+                + self.kl_weight * kl_loss
+                + d_weight * disc_factor * g_loss
+            )

-            log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/logvar".format(split): self.logvar.detach(),
-                   "{}/kl_loss".format(split): kl_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(),
-                   "{}/rec_loss".format(split): rec_loss.detach().mean(),
-                   "{}/d_weight".format(split): d_weight.detach(),
-                   "{}/disc_factor".format(split): torch.tensor(disc_factor),
-                   "{}/g_loss".format(split): g_loss.detach().mean(),
-                   }
+            log = {
+                '{}/total_loss'.format(split): loss.clone().detach().mean(),
+                '{}/logvar'.format(split): self.logvar.detach(),
+                '{}/kl_loss'.format(split): kl_loss.detach().mean(),
+                '{}/nll_loss'.format(split): nll_loss.detach().mean(),
+                '{}/rec_loss'.format(split): rec_loss.detach().mean(),
+                '{}/d_weight'.format(split): d_weight.detach(),
+                '{}/disc_factor'.format(split): torch.tensor(disc_factor),
+                '{}/g_loss'.format(split): g_loss.detach().mean(),
+            }
            return loss, log

        if optimizer_idx == 1:
            # second pass for discriminator update
            if cond is None:
                logits_real = self.discriminator(inputs.contiguous().detach())
-                logits_fake = self.discriminator(reconstructions.contiguous().detach())
+                logits_fake = self.discriminator(
+                    reconstructions.contiguous().detach()
+                )
            else:
-                logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
-                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
+                logits_real = self.discriminator(
+                    torch.cat((inputs.contiguous().detach(), cond), dim=1)
+                )
+                logits_fake = self.discriminator(
+                    torch.cat(
+                        (reconstructions.contiguous().detach(), cond), dim=1
+                    )
+                )

-            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+            disc_factor = adopt_weight(
+                self.disc_factor,
+                global_step,
+                threshold=self.discriminator_iter_start,
+            )
            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)

-            log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
-                   "{}/logits_real".format(split): logits_real.detach().mean(),
-                   "{}/logits_fake".format(split): logits_fake.detach().mean()
-                   }
+            log = {
+                '{}/disc_loss'.format(split): d_loss.clone().detach().mean(),
+                '{}/logits_real'.format(split): logits_real.detach().mean(),
+                '{}/logits_fake'.format(split): logits_fake.detach().mean(),
+            }
            return d_loss, log
-
--- a/ldm/modules/losses/vqperceptual.py
+++ b/ldm/modules/losses/vqperceptual.py
@@ -3,21 +3,25 @@ from torch import nn
 import torch.nn.functional as F
 from einops import repeat

-from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
+from taming.modules.discriminator.model import (
+    NLayerDiscriminator,
+    weights_init,
+)
 from taming.modules.losses.lpips import LPIPS
 from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss


 def hinge_d_loss_with_exemplar_weights(logits_real, logits_fake, weights):
    assert weights.shape[0] == logits_real.shape[0] == logits_fake.shape[0]
-    loss_real = torch.mean(F.relu(1. - logits_real), dim=[1,2,3])
-    loss_fake = torch.mean(F.relu(1. + logits_fake), dim=[1,2,3])
+    loss_real = torch.mean(F.relu(1.0 - logits_real), dim=[1, 2, 3])
+    loss_fake = torch.mean(F.relu(1.0 + logits_fake), dim=[1, 2, 3])
    loss_real = (weights * loss_real).sum() / weights.sum()
    loss_fake = (weights * loss_fake).sum() / weights.sum()
    d_loss = 0.5 * (loss_real + loss_fake)
    return d_loss

-def adopt_weight(weight, global_step, threshold=0, value=0.):
+
+def adopt_weight(weight, global_step, threshold=0, value=0.0):
    if global_step < threshold:
        weight = value
    return weight
@@ -26,57 +30,76 @@ def adopt_weight(weight, global_step, threshold=0, value=0.):
 def measure_perplexity(predicted_indices, n_embed):
    # src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py
    # eval cluster perplexity. when perplexity == num_embeddings then all clusters are used exactly equally
-    encodings = F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed)
+    encodings = (
+        F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed)
+    )
    avg_probs = encodings.mean(0)
    perplexity = (-(avg_probs * torch.log(avg_probs + 1e-10)).sum()).exp()
    cluster_use = torch.sum(avg_probs > 0)
    return perplexity, cluster_use

+
 def l1(x, y):
-    return torch.abs(x-y)
+    return torch.abs(x - y)


 def l2(x, y):
-    return torch.pow((x-y), 2)
+    return torch.pow((x - y), 2)


 class VQLPIPSWithDiscriminator(nn.Module):
-    def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
-                 disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0,
-                 perceptual_weight=1.0, use_actnorm=False, disc_conditional=False,
-                 disc_ndf=64, disc_loss="hinge", n_classes=None, perceptual_loss="lpips",
-                 pixel_loss="l1"):
+    def __init__(
+        self,
+        disc_start,
+        codebook_weight=1.0,
+        pixelloss_weight=1.0,
+        disc_num_layers=3,
+        disc_in_channels=3,
+        disc_factor=1.0,
+        disc_weight=1.0,
+        perceptual_weight=1.0,
+        use_actnorm=False,
+        disc_conditional=False,
+        disc_ndf=64,
+        disc_loss='hinge',
+        n_classes=None,
+        perceptual_loss='lpips',
+        pixel_loss='l1',
+    ):
        super().__init__()
-        assert disc_loss in ["hinge", "vanilla"]
-        assert perceptual_loss in ["lpips", "clips", "dists"]
-        assert pixel_loss in ["l1", "l2"]
+        assert disc_loss in ['hinge', 'vanilla']
+        assert perceptual_loss in ['lpips', 'clips', 'dists']
+        assert pixel_loss in ['l1', 'l2']
        self.codebook_weight = codebook_weight
        self.pixel_weight = pixelloss_weight
-        if perceptual_loss == "lpips":
-            print(f"{self.__class__.__name__}: Running with LPIPS.")
+        if perceptual_loss == 'lpips':
+            print(f'{self.__class__.__name__}: Running with LPIPS.')
            self.perceptual_loss = LPIPS().eval()
        else:
-            raise ValueError(f"Unknown perceptual loss: >> {perceptual_loss} <<")
+            raise ValueError(
+                f'Unknown perceptual loss: >> {perceptual_loss} <<'
+            )
        self.perceptual_weight = perceptual_weight

-        if pixel_loss == "l1":
+        if pixel_loss == 'l1':
            self.pixel_loss = l1
        else:
            self.pixel_loss = l2

-        self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels,
-                                                 n_layers=disc_num_layers,
-                                                 use_actnorm=use_actnorm,
-                                                 ndf=disc_ndf
-                                                 ).apply(weights_init)
+        self.discriminator = NLayerDiscriminator(
+            input_nc=disc_in_channels,
+            n_layers=disc_num_layers,
+            use_actnorm=use_actnorm,
+            ndf=disc_ndf,
+        ).apply(weights_init)
        self.discriminator_iter_start = disc_start
-        if disc_loss == "hinge":
+        if disc_loss == 'hinge':
            self.disc_loss = hinge_d_loss
-        elif disc_loss == "vanilla":
+        elif disc_loss == 'vanilla':
            self.disc_loss = vanilla_d_loss
        else:
            raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
-        print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.")
+        print(f'VQLPIPSWithDiscriminator running with {disc_loss} loss.')
        self.disc_factor = disc_factor
        self.discriminator_weight = disc_weight
        self.disc_conditional = disc_conditional
@@ -84,31 +107,53 @@ class VQLPIPSWithDiscriminator(nn.Module):

    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
        if last_layer is not None:
-            nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
-            g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
+            nll_grads = torch.autograd.grad(
+                nll_loss, last_layer, retain_graph=True
+            )[0]
+            g_grads = torch.autograd.grad(
+                g_loss, last_layer, retain_graph=True
+            )[0]
        else:
-            nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
-            g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
+            nll_grads = torch.autograd.grad(
+                nll_loss, self.last_layer[0], retain_graph=True
+            )[0]
+            g_grads = torch.autograd.grad(
+                g_loss, self.last_layer[0], retain_graph=True
+            )[0]

        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
        d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
        d_weight = d_weight * self.discriminator_weight
        return d_weight

-    def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx,
-                global_step, last_layer=None, cond=None, split="train", predicted_indices=None):
+    def forward(
+        self,
+        codebook_loss,
+        inputs,
+        reconstructions,
+        optimizer_idx,
+        global_step,
+        last_layer=None,
+        cond=None,
+        split='train',
+        predicted_indices=None,
+    ):
        if not exists(codebook_loss):
-            codebook_loss = torch.tensor([0.]).to(inputs.device)
-        #rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
-        rec_loss = self.pixel_loss(inputs.contiguous(), reconstructions.contiguous())
+            codebook_loss = torch.tensor([0.0]).to(inputs.device)
+        # rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous())
+        rec_loss = self.pixel_loss(
+            inputs.contiguous(), reconstructions.contiguous()
+        )
        if self.perceptual_weight > 0:
-            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
+            p_loss = self.perceptual_loss(
+                inputs.contiguous(), reconstructions.contiguous()
+            )
            rec_loss = rec_loss + self.perceptual_weight * p_loss
        else:
            p_loss = torch.tensor([0.0])

        nll_loss = rec_loss
-        #nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
+        # nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
        nll_loss = torch.mean(nll_loss)

        # now the GAN part
@@ -119,49 +164,77 @@ class VQLPIPSWithDiscriminator(nn.Module):
                logits_fake = self.discriminator(reconstructions.contiguous())
            else:
                assert self.disc_conditional
-                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1))
+                logits_fake = self.discriminator(
+                    torch.cat((reconstructions.contiguous(), cond), dim=1)
+                )
            g_loss = -torch.mean(logits_fake)

            try:
-                d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
+                d_weight = self.calculate_adaptive_weight(
+                    nll_loss, g_loss, last_layer=last_layer
+                )
            except RuntimeError:
                assert not self.training
                d_weight = torch.tensor(0.0)

-            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
-            loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean()
+            disc_factor = adopt_weight(
+                self.disc_factor,
+                global_step,
+                threshold=self.discriminator_iter_start,
+            )
+            loss = (
+                nll_loss
+                + d_weight * disc_factor * g_loss
+                + self.codebook_weight * codebook_loss.mean()
+            )

-            log = {"{}/total_loss".format(split): loss.clone().detach().mean(),
-                   "{}/quant_loss".format(split): codebook_loss.detach().mean(),
-                   "{}/nll_loss".format(split): nll_loss.detach().mean(),
-                   "{}/rec_loss".format(split): rec_loss.detach().mean(),
-                   "{}/p_loss".format(split): p_loss.detach().mean(),
-                   "{}/d_weight".format(split): d_weight.detach(),
-                   "{}/disc_factor".format(split): torch.tensor(disc_factor),
-                   "{}/g_loss".format(split): g_loss.detach().mean(),
-                   }
+            log = {
+                '{}/total_loss'.format(split): loss.clone().detach().mean(),
+                '{}/quant_loss'.format(split): codebook_loss.detach().mean(),
+                '{}/nll_loss'.format(split): nll_loss.detach().mean(),
+                '{}/rec_loss'.format(split): rec_loss.detach().mean(),
+                '{}/p_loss'.format(split): p_loss.detach().mean(),
+                '{}/d_weight'.format(split): d_weight.detach(),
+                '{}/disc_factor'.format(split): torch.tensor(disc_factor),
+                '{}/g_loss'.format(split): g_loss.detach().mean(),
+            }
            if predicted_indices is not None:
                assert self.n_classes is not None
                with torch.no_grad():
-                    perplexity, cluster_usage = measure_perplexity(predicted_indices, self.n_classes)
-                log[f"{split}/perplexity"] = perplexity
-                log[f"{split}/cluster_usage"] = cluster_usage
+                    perplexity, cluster_usage = measure_perplexity(
+                        predicted_indices, self.n_classes
+                    )
+                log[f'{split}/perplexity'] = perplexity
+                log[f'{split}/cluster_usage'] = cluster_usage
            return loss, log

        if optimizer_idx == 1:
            # second pass for discriminator update
            if cond is None:
                logits_real = self.discriminator(inputs.contiguous().detach())
-                logits_fake = self.discriminator(reconstructions.contiguous().detach())
+                logits_fake = self.discriminator(
+                    reconstructions.contiguous().detach()
+                )
            else:
-                logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1))
-                logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1))
+                logits_real = self.discriminator(
+                    torch.cat((inputs.contiguous().detach(), cond), dim=1)
+                )
+                logits_fake = self.discriminator(
+                    torch.cat(
+                        (reconstructions.contiguous().detach(), cond), dim=1
+                    )
+                )

-            disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start)
+            disc_factor = adopt_weight(
+                self.disc_factor,
+                global_step,
+                threshold=self.discriminator_iter_start,
+            )
            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)

-            log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(),
-                   "{}/logits_real".format(split): logits_real.detach().mean(),
-                   "{}/logits_fake".format(split): logits_fake.detach().mean()
-                   }
+            log = {
+                '{}/disc_loss'.format(split): d_loss.clone().detach().mean(),
+                '{}/logits_real'.format(split): logits_real.detach().mean(),
+                '{}/logits_fake'.format(split): logits_fake.detach().mean(),
+            }
            return d_loss, log
--- a/ldm/modules/x_transformer.py
+++ b/ldm/modules/x_transformer.py
@@ -11,15 +11,13 @@ from einops import rearrange, repeat, reduce

 DEFAULT_DIM_HEAD = 64

-Intermediates = namedtuple('Intermediates', [
-    'pre_softmax_attn',
-    'post_softmax_attn'
-])
+Intermediates = namedtuple(
+    'Intermediates', ['pre_softmax_attn', 'post_softmax_attn']
+)

-LayerIntermediates = namedtuple('Intermediates', [
-    'hiddens',
-    'attn_intermediates'
-])
+LayerIntermediates = namedtuple(
+    'Intermediates', ['hiddens', 'attn_intermediates']
+)


 class AbsolutePositionalEmbedding(nn.Module):
@@ -39,11 +37,16 @@ class AbsolutePositionalEmbedding(nn.Module):
 class FixedPositionalEmbedding(nn.Module):
    def __init__(self, dim):
        super().__init__()
-        inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim))
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, x, seq_dim=1, offset=0):
-        t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset
+        t = (
+            torch.arange(x.shape[seq_dim], device=x.device).type_as(
+                self.inv_freq
+            )
+            + offset
+        )
        sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq)
        emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1)
        return emb[None, :, :]
@@ -51,6 +54,7 @@ class FixedPositionalEmbedding(nn.Module):

 # helpers

+
 def exists(val):
    return val is not None

@@ -64,18 +68,21 @@ def default(val, d):
 def always(val):
    def inner(*args, **kwargs):
        return val
+
    return inner


 def not_equals(val):
    def inner(x):
        return x != val
+
    return inner


 def equals(val):
    def inner(x):
        return x == val
+
    return inner


@@ -85,6 +92,7 @@ def max_neg_value(tensor):

 # keyword argument helpers

+
 def pick_and_pop(keys, d):
    values = list(map(lambda key: d.pop(key), keys))
    return dict(zip(keys, values))
@@ -108,8 +116,15 @@ def group_by_key_prefix(prefix, d):


 def groupby_prefix_and_trim(prefix, d):
-    kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
-    kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items())))
+    kwargs_with_prefix, kwargs = group_dict_by_key(
+        partial(string_begins_with, prefix), d
+    )
+    kwargs_without_prefix = dict(
+        map(
+            lambda x: (x[0][len(prefix) :], x[1]),
+            tuple(kwargs_with_prefix.items()),
+        )
+    )
    return kwargs_without_prefix, kwargs


@@ -139,7 +154,7 @@ class Rezero(nn.Module):
 class ScaleNorm(nn.Module):
    def __init__(self, dim, eps=1e-5):
        super().__init__()
-        self.scale = dim ** -0.5
+        self.scale = dim**-0.5
        self.eps = eps
        self.g = nn.Parameter(torch.ones(1))

@@ -151,7 +166,7 @@ class ScaleNorm(nn.Module):
 class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-8):
        super().__init__()
-        self.scale = dim ** -0.5
+        self.scale = dim**-0.5
        self.eps = eps
        self.g = nn.Parameter(torch.ones(dim))

@@ -173,7 +188,7 @@ class GRUGating(nn.Module):
    def forward(self, x, residual):
        gated_output = self.gru(
            rearrange(x, 'b n d -> (b n) d'),
-            rearrange(residual, 'b n d -> (b n) d')
+            rearrange(residual, 'b n d -> (b n) d'),
        )

        return gated_output.reshape_as(x)
@@ -181,6 +196,7 @@ class GRUGating(nn.Module):

 # feedforward

+
 class GEGLU(nn.Module):
    def __init__(self, dim_in, dim_out):
        super().__init__()
@@ -192,19 +208,18 @@ class GEGLU(nn.Module):


 class FeedForward(nn.Module):
-    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
+    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = default(dim_out, dim)
-        project_in = nn.Sequential(
-            nn.Linear(dim, inner_dim),
-            nn.GELU()
-        ) if not glu else GEGLU(dim, inner_dim)
+        project_in = (
+            nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU())
+            if not glu
+            else GEGLU(dim, inner_dim)
+        )

        self.net = nn.Sequential(
-            project_in,
-            nn.Dropout(dropout),
-            nn.Linear(inner_dim, dim_out)
+            project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
        )

    def forward(self, x):
@@ -214,23 +229,25 @@ class FeedForward(nn.Module):
 # attention.
 class Attention(nn.Module):
    def __init__(
-            self,
-            dim,
-            dim_head=DEFAULT_DIM_HEAD,
-            heads=8,
-            causal=False,
-            mask=None,
-            talking_heads=False,
-            sparse_topk=None,
-            use_entmax15=False,
-            num_mem_kv=0,
-            dropout=0.,
-            on_attn=False
+        self,
+        dim,
+        dim_head=DEFAULT_DIM_HEAD,
+        heads=8,
+        causal=False,
+        mask=None,
+        talking_heads=False,
+        sparse_topk=None,
+        use_entmax15=False,
+        num_mem_kv=0,
+        dropout=0.0,
+        on_attn=False,
    ):
        super().__init__()
        if use_entmax15:
-            raise NotImplementedError("Check out entmax activation instead of softmax activation!")
-        self.scale = dim_head ** -0.5
+            raise NotImplementedError(
+                'Check out entmax activation instead of softmax activation!'
+            )
+        self.scale = dim_head**-0.5
        self.heads = heads
        self.causal = causal
        self.mask = mask
@@ -252,7 +269,7 @@ class Attention(nn.Module):
        self.sparse_topk = sparse_topk

        # entmax
-        #self.attn_fn = entmax15 if use_entmax15 else F.softmax
+        # self.attn_fn = entmax15 if use_entmax15 else F.softmax
        self.attn_fn = F.softmax

        # add memory key / values
@@ -263,20 +280,29 @@ class Attention(nn.Module):

        # attention on attention
        self.attn_on_attn = on_attn
-        self.to_out = nn.Sequential(nn.Linear(inner_dim, dim * 2), nn.GLU()) if on_attn else nn.Linear(inner_dim, dim)
+        self.to_out = (
+            nn.Sequential(nn.Linear(inner_dim, dim * 2), nn.GLU())
+            if on_attn
+            else nn.Linear(inner_dim, dim)
+        )

    def forward(
-            self,
-            x,
-            context=None,
-            mask=None,
-            context_mask=None,
-            rel_pos=None,
-            sinusoidal_emb=None,
-            prev_attn=None,
-            mem=None
+        self,
+        x,
+        context=None,
+        mask=None,
+        context_mask=None,
+        rel_pos=None,
+        sinusoidal_emb=None,
+        prev_attn=None,
+        mem=None,
    ):
-        b, n, _, h, talking_heads, device = *x.shape, self.heads, self.talking_heads, x.device
+        b, n, _, h, talking_heads, device = (
+            *x.shape,
+            self.heads,
+            self.talking_heads,
+            x.device,
+        )
        kv_input = default(context, x)

        q_input = x
@@ -297,23 +323,35 @@ class Attention(nn.Module):
        k = self.to_k(k_input)
        v = self.to_v(v_input)

-        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v))
+        q, k, v = map(
+            lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v)
+        )

        input_mask = None
        if any(map(exists, (mask, context_mask))):
-            q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool())
+            q_mask = default(
+                mask, lambda: torch.ones((b, n), device=device).bool()
+            )
            k_mask = q_mask if not exists(context) else context_mask
-            k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool())
+            k_mask = default(
+                k_mask,
+                lambda: torch.ones((b, k.shape[-2]), device=device).bool(),
+            )
            q_mask = rearrange(q_mask, 'b i -> b () i ()')
            k_mask = rearrange(k_mask, 'b j -> b () () j')
            input_mask = q_mask * k_mask

        if self.num_mem_kv > 0:
-            mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), (self.mem_k, self.mem_v))
+            mem_k, mem_v = map(
+                lambda t: repeat(t, 'h n d -> b h n d', b=b),
+                (self.mem_k, self.mem_v),
+            )
            k = torch.cat((mem_k, k), dim=-2)
            v = torch.cat((mem_v, v), dim=-2)
            if exists(input_mask):
-                input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True)
+                input_mask = F.pad(
+                    input_mask, (self.num_mem_kv, 0), value=True
+                )

        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
        mask_value = max_neg_value(dots)
@@ -324,7 +362,9 @@ class Attention(nn.Module):
        pre_softmax_attn = dots

        if talking_heads:
-            dots = einsum('b h i j, h k -> b k i j', dots, self.pre_softmax_proj).contiguous()
+            dots = einsum(
+                'b h i j, h k -> b k i j', dots, self.pre_softmax_proj
+            ).contiguous()

        if exists(rel_pos):
            dots = rel_pos(dots)
@@ -336,7 +376,9 @@ class Attention(nn.Module):
        if self.causal:
            i, j = dots.shape[-2:]
            r = torch.arange(i, device=device)
-            mask = rearrange(r, 'i -> () () i ()') < rearrange(r, 'j -> () () () j')
+            mask = rearrange(r, 'i -> () () i ()') < rearrange(
+                r, 'j -> () () () j'
+            )
            mask = F.pad(mask, (j - i, 0), value=False)
            dots.masked_fill_(mask, mask_value)
            del mask
@@ -354,14 +396,16 @@ class Attention(nn.Module):
        attn = self.dropout(attn)

        if talking_heads:
-            attn = einsum('b h i j, h k -> b k i j', attn, self.post_softmax_proj).contiguous()
+            attn = einsum(
+                'b h i j, h k -> b k i j', attn, self.post_softmax_proj
+            ).contiguous()

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')

        intermediates = Intermediates(
            pre_softmax_attn=pre_softmax_attn,
-            post_softmax_attn=post_softmax_attn
+            post_softmax_attn=post_softmax_attn,
        )

        return self.to_out(out), intermediates
@@ -369,28 +413,28 @@ class Attention(nn.Module):

 class AttentionLayers(nn.Module):
    def __init__(
-            self,
-            dim,
-            depth,
-            heads=8,
-            causal=False,
-            cross_attend=False,
-            only_cross=False,
-            use_scalenorm=False,
-            use_rmsnorm=False,
-            use_rezero=False,
-            rel_pos_num_buckets=32,
-            rel_pos_max_distance=128,
-            position_infused_attn=False,
-            custom_layers=None,
-            sandwich_coef=None,
-            par_ratio=None,
-            residual_attn=False,
-            cross_residual_attn=False,
-            macaron=False,
-            pre_norm=True,
-            gate_residual=False,
-            **kwargs
+        self,
+        dim,
+        depth,
+        heads=8,
+        causal=False,
+        cross_attend=False,
+        only_cross=False,
+        use_scalenorm=False,
+        use_rmsnorm=False,
+        use_rezero=False,
+        rel_pos_num_buckets=32,
+        rel_pos_max_distance=128,
+        position_infused_attn=False,
+        custom_layers=None,
+        sandwich_coef=None,
+        par_ratio=None,
+        residual_attn=False,
+        cross_residual_attn=False,
+        macaron=False,
+        pre_norm=True,
+        gate_residual=False,
+        **kwargs,
    ):
        super().__init__()
        ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs)
@@ -403,10 +447,14 @@ class AttentionLayers(nn.Module):
        self.layers = nn.ModuleList([])

        self.has_pos_emb = position_infused_attn
-        self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None
+        self.pia_pos_emb = (
+            FixedPositionalEmbedding(dim) if position_infused_attn else None
+        )
        self.rotary_pos_emb = always(None)

-        assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance'
+        assert (
+            rel_pos_num_buckets <= rel_pos_max_distance
+        ), 'number of relative position buckets must be less than the relative position max distance'
        self.rel_pos = None

        self.pre_norm = pre_norm
@@ -438,15 +486,27 @@ class AttentionLayers(nn.Module):
            assert 1 < par_ratio <= par_depth, 'par ratio out of range'
            default_block = tuple(filter(not_equals('f'), default_block))
            par_attn = par_depth // par_ratio
-            depth_cut = par_depth * 2 // 3  # 2 / 3 attention layer cutoff suggested by PAR paper
+            depth_cut = (
+                par_depth * 2 // 3
+            )  # 2 / 3 attention layer cutoff suggested by PAR paper
            par_width = (depth_cut + depth_cut // par_attn) // par_attn
-            assert len(default_block) <= par_width, 'default block is too large for par_ratio'
-            par_block = default_block + ('f',) * (par_width - len(default_block))
+            assert (
+                len(default_block) <= par_width
+            ), 'default block is too large for par_ratio'
+            par_block = default_block + ('f',) * (
+                par_width - len(default_block)
+            )
            par_head = par_block * par_attn
            layer_types = par_head + ('f',) * (par_depth - len(par_head))
        elif exists(sandwich_coef):
-            assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth'
-            layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef
+            assert (
+                sandwich_coef > 0 and sandwich_coef <= depth
+            ), 'sandwich coefficient should be less than the depth'
+            layer_types = (
+                ('a',) * sandwich_coef
+                + default_block * (depth - sandwich_coef)
+                + ('f',) * sandwich_coef
+            )
        else:
            layer_types = default_block * depth

@@ -455,7 +515,9 @@ class AttentionLayers(nn.Module):

        for layer_type in self.layer_types:
            if layer_type == 'a':
-                layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs)
+                layer = Attention(
+                    dim, heads=heads, causal=causal, **attn_kwargs
+                )
            elif layer_type == 'c':
                layer = Attention(dim, heads=heads, **attn_kwargs)
            elif layer_type == 'f':
@@ -472,20 +534,17 @@ class AttentionLayers(nn.Module):
            else:
                residual_fn = Residual()

-            self.layers.append(nn.ModuleList([
-                norm_fn(),
-                layer,
-                residual_fn
-            ]))
+            self.layers.append(nn.ModuleList([norm_fn(), layer, residual_fn]))

    def forward(
-            self,
-            x,
-            context=None,
-            mask=None,
-            context_mask=None,
-            mems=None,
-            return_hiddens=False
+        self,
+        x,
+        context=None,
+        mask=None,
+        context_mask=None,
+        mems=None,
+        return_hiddens=False,
+        **kwargs,
    ):
        hiddens = []
        intermediates = []
@@ -494,7 +553,9 @@ class AttentionLayers(nn.Module):

        mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers

-        for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)):
+        for ind, (layer_type, (norm, block, residual_fn)) in enumerate(
+            zip(self.layer_types, self.layers)
+        ):
            is_last = ind == (len(self.layers) - 1)

            if layer_type == 'a':
@@ -507,10 +568,22 @@ class AttentionLayers(nn.Module):
                x = norm(x)

            if layer_type == 'a':
-                out, inter = block(x, mask=mask, sinusoidal_emb=self.pia_pos_emb, rel_pos=self.rel_pos,
-                                   prev_attn=prev_attn, mem=layer_mem)
+                out, inter = block(
+                    x,
+                    mask=mask,
+                    sinusoidal_emb=self.pia_pos_emb,
+                    rel_pos=self.rel_pos,
+                    prev_attn=prev_attn,
+                    mem=layer_mem,
+                )
            elif layer_type == 'c':
-                out, inter = block(x, context=context, mask=mask, context_mask=context_mask, prev_attn=prev_cross_attn)
+                out, inter = block(
+                    x,
+                    context=context,
+                    mask=mask,
+                    context_mask=context_mask,
+                    prev_attn=prev_cross_attn,
+                )
            elif layer_type == 'f':
                out = block(x)

@@ -529,8 +602,7 @@ class AttentionLayers(nn.Module):

        if return_hiddens:
            intermediates = LayerIntermediates(
-                hiddens=hiddens,
-                attn_intermediates=intermediates
+                hiddens=hiddens, attn_intermediates=intermediates
            )

            return x, intermediates
@@ -544,23 +616,24 @@ class Encoder(AttentionLayers):
        super().__init__(causal=False, **kwargs)


-
 class TransformerWrapper(nn.Module):
    def __init__(
-            self,
-            *,
-            num_tokens,
-            max_seq_len,
-            attn_layers,
-            emb_dim=None,
-            max_mem_len=0.,
-            emb_dropout=0.,
-            num_memory_tokens=None,
-            tie_embedding=False,
-            use_pos_emb=True
+        self,
+        *,
+        num_tokens,
+        max_seq_len,
+        attn_layers,
+        emb_dim=None,
+        max_mem_len=0.0,
+        emb_dropout=0.0,
+        num_memory_tokens=None,
+        tie_embedding=False,
+        use_pos_emb=True,
    ):
        super().__init__()
-        assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder'
+        assert isinstance(
+            attn_layers, AttentionLayers
+        ), 'attention layers must be one of Encoder or Decoder'

        dim = attn_layers.dim
        emb_dim = default(emb_dim, dim)
@@ -570,23 +643,34 @@ class TransformerWrapper(nn.Module):
        self.num_tokens = num_tokens

        self.token_emb = nn.Embedding(num_tokens, emb_dim)
-        self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if (
-                    use_pos_emb and not attn_layers.has_pos_emb) else always(0)
+        self.pos_emb = (
+            AbsolutePositionalEmbedding(emb_dim, max_seq_len)
+            if (use_pos_emb and not attn_layers.has_pos_emb)
+            else always(0)
+        )
        self.emb_dropout = nn.Dropout(emb_dropout)

-        self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
+        self.project_emb = (
+            nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
+        )
        self.attn_layers = attn_layers
        self.norm = nn.LayerNorm(dim)

        self.init_()

-        self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t()
+        self.to_logits = (
+            nn.Linear(dim, num_tokens)
+            if not tie_embedding
+            else lambda t: t @ self.token_emb.weight.t()
+        )

        # memory tokens (like [cls]) from Memory Transformers paper
        num_memory_tokens = default(num_memory_tokens, 0)
        self.num_memory_tokens = num_memory_tokens
        if num_memory_tokens > 0:
-            self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim))
+            self.memory_tokens = nn.Parameter(
+                torch.randn(num_memory_tokens, dim)
+            )

            # let funnel encoder know number of memory tokens, if specified
            if hasattr(attn_layers, 'num_memory_tokens'):
@@ -596,18 +680,26 @@ class TransformerWrapper(nn.Module):
        nn.init.normal_(self.token_emb.weight, std=0.02)

    def forward(
-            self,
-            x,
-            return_embeddings=False,
-            mask=None,
-            return_mems=False,
-            return_attn=False,
-            mems=None,
-            **kwargs
+        self,
+        x,
+        return_embeddings=False,
+        mask=None,
+        return_mems=False,
+        return_attn=False,
+        mems=None,
+        embedding_manager=None,
+        **kwargs,
    ):
        b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens
-        x = self.token_emb(x)
-        x += self.pos_emb(x)
+
+        embedded_x = self.token_emb(x)
+
+        if embedding_manager:
+            x = embedding_manager(x, embedded_x)
+        else:
+            x = embedded_x
+
+        x = x + self.pos_emb(x)
        x = self.emb_dropout(x)

        x = self.project_emb(x)
@@ -620,7 +712,9 @@ class TransformerWrapper(nn.Module):
            if exists(mask):
                mask = F.pad(mask, (num_mem, 0), value=True)

-        x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs)
+        x, intermediates = self.attn_layers(
+            x, mask=mask, mems=mems, return_hiddens=True, **kwargs
+        )
        x = self.norm(x)

        mem, x = x[:, :num_mem], x[:, num_mem:]
@@ -629,13 +723,30 @@ class TransformerWrapper(nn.Module):

        if return_mems:
            hiddens = intermediates.hiddens
-            new_mems = list(map(lambda pair: torch.cat(pair, dim=-2), zip(mems, hiddens))) if exists(mems) else hiddens
-            new_mems = list(map(lambda t: t[..., -self.max_mem_len:, :].detach(), new_mems))
+            new_mems = (
+                list(
+                    map(
+                        lambda pair: torch.cat(pair, dim=-2),
+                        zip(mems, hiddens),
+                    )
+                )
+                if exists(mems)
+                else hiddens
+            )
+            new_mems = list(
+                map(
+                    lambda t: t[..., -self.max_mem_len :, :].detach(), new_mems
+                )
+            )
            return out, new_mems

        if return_attn:
-            attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates))
+            attn_maps = list(
+                map(
+                    lambda t: t.post_softmax_attn,
+                    intermediates.attn_intermediates,
+                )
+            )
            return out, attn_maps

        return out
-
--- a/ldm/simplet2i.py
+++ b/ldm/simplet2i.py
--- a/ldm/util.py
+++ b/ldm/util.py
@@ -12,22 +12,26 @@ from queue import Queue

 from inspect import isfunction
 from PIL import Image, ImageDraw, ImageFont
+
+
 def log_txt_as_img(wh, xc, size=10):
    # wh a tuple of (width, height)
    # xc a list of captions to plot
    b = len(xc)
    txts = list()
    for bi in range(b):
-        txt = Image.new("RGB", wh, color="white")
+        txt = Image.new('RGB', wh, color='white')
        draw = ImageDraw.Draw(txt)
-        font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
+        font = ImageFont.load_default()
        nc = int(40 * (wh[0] / 256))
-        lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
+        lines = '\n'.join(
+            xc[bi][start : start + nc] for start in range(0, len(xc[bi]), nc)
+        )

        try:
-            draw.text((0, 0), lines, fill="black", font=font)
+            draw.text((0, 0), lines, fill='black', font=font)
        except UnicodeEncodeError:
-            print("Cant encode string for logging. Skipping.")
+            print('Cant encode string for logging. Skipping.')

        txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
        txts.append(txt)
@@ -69,22 +73,26 @@ def mean_flat(tensor):
 def count_params(model, verbose=False):
    total_params = sum(p.numel() for p in model.parameters())
    if verbose:
-        print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
+        print(
+            f'{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.'
+        )
    return total_params


-def instantiate_from_config(config):
-    if not "target" in config:
+def instantiate_from_config(config, **kwargs):
+    if 'target' not in config:
         if config == '__is_first_stage__':
             return None
-        elif config == "__is_unconditional__":
+        elif config == '__is_unconditional__':
             return None
-        raise KeyError("Expected key `target` to instantiate.")
-    return get_obj_from_str(config["target"])(**config.get("params", dict()))
+        raise KeyError('Expected key `target` to instantiate.')
+    return get_obj_from_str(config['target'])(
+        **config.get('params', dict()), **kwargs  # extra kwargs are passed alongside the config params
+    )


 def get_obj_from_str(string, reload=False):
-    module, cls = string.rsplit(".", 1)
+    module, cls = string.rsplit('.', 1)
    if reload:
        module_imp = importlib.import_module(module)
        importlib.reload(module_imp)
@@ -100,31 +108,36 @@ def _do_parallel_data_prefetch(func, Q, data, idx, idx_to_fn=False):
    else:
        res = func(data)
    Q.put([idx, res])
-    Q.put("Done")
+    Q.put('Done')


 def parallel_data_prefetch(
-        func: callable, data, n_proc, target_data_type="ndarray", cpu_intensive=True, use_worker_id=False
+    func: callable,
+    data,
+    n_proc,
+    target_data_type='ndarray',
+    cpu_intensive=True,
+    use_worker_id=False,
 ):
    # if target_data_type not in ["ndarray", "list"]:
    #     raise ValueError(
    #         "Data, which is passed to parallel_data_prefetch has to be either of type list or ndarray."
    #     )
-    if isinstance(data, np.ndarray) and target_data_type == "list":
-        raise ValueError("list expected but function got ndarray.")
+    if isinstance(data, np.ndarray) and target_data_type == 'list':
+        raise ValueError('list expected but function got ndarray.')
    elif isinstance(data, abc.Iterable):
        if isinstance(data, dict):
            print(
                f'WARNING:"data" argument passed to parallel_data_prefetch is a dict: Using only its values and disregarding keys.'
            )
            data = list(data.values())
-        if target_data_type == "ndarray":
+        if target_data_type == 'ndarray':
            data = np.asarray(data)
        else:
            data = list(data)
    else:
        raise TypeError(
-            f"The data, that shall be processed parallel has to be either an np.ndarray or an Iterable, but is actually {type(data)}."
+            f'The data, that shall be processed parallel has to be either an np.ndarray or an Iterable, but is actually {type(data)}.'
        )

    if cpu_intensive:
@@ -134,7 +147,7 @@ def parallel_data_prefetch(
        Q = Queue(1000)
        proc = Thread
    # spawn processes
-    if target_data_type == "ndarray":
+    if target_data_type == 'ndarray':
        arguments = [
            [func, Q, part, i, use_worker_id]
            for i, part in enumerate(np.array_split(data, n_proc))
@@ -148,7 +161,7 @@ def parallel_data_prefetch(
        arguments = [
            [func, Q, part, i, use_worker_id]
            for i, part in enumerate(
-                [data[i: i + step] for i in range(0, len(data), step)]
+                [data[i : i + step] for i in range(0, len(data), step)]
            )
        ]
    processes = []
@@ -157,7 +170,7 @@ def parallel_data_prefetch(
        processes += [p]

    # start processes
-    print(f"Start prefetching...")
+    print(f'Start prefetching...')
    import time

    start = time.time()
@@ -170,13 +183,13 @@ def parallel_data_prefetch(
        while k < n_proc:
            # get result
            res = Q.get()
-            if res == "Done":
+            if res == 'Done':
                k += 1
            else:
                gather_res[res[0]] = res[1]

    except Exception as e:
-        print("Exception: ", e)
+        print('Exception: ', e)
        for p in processes:
            p.terminate()

@@ -184,7 +197,7 @@ def parallel_data_prefetch(
    finally:
        for p in processes:
            p.join()
-        print(f"Prefetching complete. [{time.time() - start} sec.]")
+        print(f'Prefetching complete. [{time.time() - start} sec.]')

    if target_data_type == 'ndarray':
        if not isinstance(gather_res[0], np.ndarray):
--- a/main.py
+++ b/main.py
--- a/notebook_helpers.py
+++ b/notebook_helpers.py
@@ -14,7 +14,7 @@ from ldm.models.diffusion.ddim import DDIMSampler
 from ldm.util import ismap
 import time
 from omegaconf import OmegaConf
-
+from ldm.dream.devices import choose_torch_device

 def download_models(mode):

@@ -117,7 +117,8 @@ def get_cond(mode, selected_path):
        c = rearrange(c, '1 c h w -> 1 h w c')
        c = 2. * c - 1.

-        c = c.to(torch.device("cuda"))
+        device = choose_torch_device()
+        c = c.to(device)
        example["LR_image"] = c
        example["image"] = c_up

@@ -267,4 +268,4 @@ def make_convolutional_sample(batch, model, mode="vanilla", custom_steps=None, e
    log["sample"] = x_sample
    log["time"] = t1 - t0

-    return log
+    return log
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,23 @@
+albumentations==0.4.3
+einops==0.3.0
+huggingface-hub==0.8.1
+imageio==2.9.0
+imageio-ffmpeg==0.4.2
+kornia==0.6.0
+numpy==1.23.1
+--pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+omegaconf==2.1.1
+opencv-python==4.6.0.66
+pillow==9.2.0
+pudb==2019.2
+torch==1.12.1
+torchvision==0.13.1  # 0.13.x is the torchvision series built against torch 1.12.x
+pytorch-lightning==1.4.2
+streamlit==1.12.0
+test-tube>=0.7.5
+torch-fidelity==0.3.0
+torchmetrics==0.6.0
+transformers==4.19.2
+-e git+https://github.com/openai/CLIP.git@main#egg=clip
+-e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
+-e git+https://github.com/Birch-san/k-diffusion.git@mps#egg=k-diffusion
--- a/scripts/dream.py
+++ b/scripts/dream.py
@@ -1,277 +1,633 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
+# Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
+
 import argparse
 import shlex
-import atexit
 import os
-
-# readline unavailable on windows systems
-try:
-    import readline
-    readline_available = True
-except:
-    readline_available = False
-
-debugging = True
+import re
+import sys
+import copy
+import warnings
+import time
+import ldm.dream.readline
+from ldm.dream.pngwriter import PngWriter, PromptFormatter
+from ldm.dream.server import DreamServer, ThreadingDreamServer
+from ldm.dream.image_util import make_grid
+from omegaconf import OmegaConf

 def main():
-    ''' Initialize command-line parsers and the diffusion model '''
+    """Initialize command-line parsers and the diffusion model"""
    arg_parser = create_argv_parser()
-    opt        = arg_parser.parse_args()
+    opt = arg_parser.parse_args()
+    
    if opt.laion400m:
-        # defaults suitable to the older latent diffusion weights
-        width   = 256
-        height  = 256
-        config  = "configs/latent-diffusion/txt2img-1p4B-eval.yaml"
-        weights = "models/ldm/text2img-large/model.ckpt"
-    else:
-        # some defaults suitable for stable diffusion weights
-        width   = 512
-        height  = 512
-        config  = "configs/stable-diffusion/v1-inference.yaml"
-        weights = "models/ldm/stable-diffusion-v1/model.ckpt"
+        print('--laion400m flag has been deprecated. Please use --model laion400m instead.')
+        sys.exit(-1)
+    if opt.weights != 'model':
+        print('--weights argument has been deprecated. Please configure ./configs/models.yaml, and call it using --model instead.')
+        sys.exit(-1)
+        
+    try:
+        models  = OmegaConf.load(opt.config)
+        width   = models[opt.model].width
+        height  = models[opt.model].height
+        config  = models[opt.model].config
+        weights = models[opt.model].weights
+    except (FileNotFoundError, IOError, KeyError) as e:
+        print(f'{e}. Aborting.')
+        sys.exit(-1)

-    # command line history will be stored in a file called "~/.dream_history"
-    if readline_available:
-        setup_readline()
-
-    print("* Initializing, be patient...\n")
+    print('* Initializing, be patient...\n')
+    sys.path.append('.')
    from pytorch_lightning import logging
    from ldm.simplet2i import T2I

+    # these two lines prevent a horrible warning message from appearing
+    # when the frozen CLIP tokenizer is imported
+    import transformers
+
+    transformers.logging.set_verbosity_error()
+
    # creating a simple text2image object with a handful of
    # defaults passed on the command line.
    # additional parameters will be added (or overriden) during
    # the user input loop
-    t2i = T2I(width=width,
-              height=height,
-              batch_size=opt.batch_size,
-              outdir=opt.outdir,
-              sampler=opt.sampler,
-              weights=weights,
-              config=config)
+    t2i = T2I(
+        width=width,
+        height=height,
+        sampler_name=opt.sampler_name,
+        weights=weights,
+        full_precision=opt.full_precision,
+        config=config,
+        grid  = opt.grid,
+        # this is solely for recreating the prompt
+        latent_diffusion_weights=opt.laion400m,
+        embedding_path=opt.embedding_path,
+        device_type=opt.device
+    )

    # make sure the output directory exists
    if not os.path.exists(opt.outdir):
        os.makedirs(opt.outdir)
-        
+
    # gets rid of annoying messages about random seed
-    logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
+    logging.getLogger('pytorch_lightning').setLevel(logging.ERROR)
+
+    # open the infile (a file path, or '-' for stdin) as a stream of command lines
+    infile = None
+    if opt.infile:
+        try:
+            if os.path.isfile(opt.infile):
+                infile = open(opt.infile, 'r', encoding='utf-8')
+            elif opt.infile == '-':  # stdin
+                infile = sys.stdin
+            else:
+                raise FileNotFoundError(f'{opt.infile} not found.')
+        except (FileNotFoundError, IOError) as e:
+            print(f'{e}. Aborting.')
+            sys.exit(-1)

    # preload the model
-    if not debugging:
-        t2i.load_model()
-    print("\n* Initialization done! Awaiting your command (-h for help, q to quit)...")
+    tic = time.time()
+    t2i.load_model()
+    print(
+        f'>> model loaded in', '%4.2fs' % (time.time() - tic)
+    )

-    log_path   = os.path.join(opt.outdir,"dream_log.txt")
-    with open(log_path,'a') as log:
-        cmd_parser = create_cmd_parser()
-        main_loop(t2i,cmd_parser,log)
-        log.close()
+    if not infile:
+        print(
+            "\n* Initialization done! Awaiting your command (-h for help, 'q' to quit)"
+        )
+
+    cmd_parser = create_cmd_parser()
+    if opt.web:
+        dream_server_loop(t2i, opt.host, opt.port)
+    else:
+        main_loop(t2i, opt.outdir, opt.prompt_as_dir, cmd_parser, infile)


-def main_loop(t2i,parser,log):
-    ''' prompt/read/execute loop '''
+def main_loop(t2i, outdir, prompt_as_dir, parser, infile):
+    """prompt/read/execute loop"""
    done = False
-    
+    last_seeds = []
+    path_filter = re.compile(r'[<>:"/\\|?*]')
+
+    # os.pathconf is not available on Windows
+    if hasattr(os, 'pathconf'):
+        path_max = os.pathconf(outdir, 'PC_PATH_MAX')
+        name_max = os.pathconf(outdir, 'PC_NAME_MAX')
+    else:
+        path_max = 260
+        name_max = 255
+
    while not done:
        try:
-            command = input("dream> ")
+            command = get_next_command(infile)
        except EOFError:
            done = True
            break

-        elements = shlex.split(command)
-        if elements[0]=='q':  # 
+        # skip empty lines
+        if not command.strip():
+            continue
+
+        if command.startswith(('#', '//')):
+            continue
+
+        # before splitting, escape single quotes so as not to mess
+        # up the parser
+        command = command.replace("'", "\\'")
+
+        try:
+            elements = shlex.split(command)
+        except ValueError as e:
+            print(str(e))
+            continue
+
+        if elements[0] == 'q':
            done = True
            break
-        if elements[0].startswith('!dream'): # in case a stored prompt still contains the !dream command
+
+        if elements[0].startswith(
+            '!dream'
+        ):   # in case a stored prompt still contains the !dream command
            elements.pop(0)
-            
+
        # rearrange the arguments to mimic how it works in the Dream bot.
        switches = ['']
        switches_started = False

        for el in elements:
-            if el[0]=='-' and not switches_started:
+            if el[0] == '-' and not switches_started:
                switches_started = True
            if switches_started:
                switches.append(el)
            else:
                switches[0] += el
                switches[0] += ' '
-        switches[0] = switches[0][:len(switches[0])-1]
+        switches[0] = switches[0][: len(switches[0]) - 1]

        try:
-            opt      = parser.parse_args(switches)
+            opt = parser.parse_args(switches)
        except SystemExit:
            parser.print_help()
            continue
-        if len(opt.prompt)==0:
-            print("Try again with a prompt!")
+        if len(opt.prompt) == 0:
+            print('Try again with a prompt!')
            continue
+        if opt.seed is not None and opt.seed < 0:   # retrieve previous value!
+            try:
+                opt.seed = last_seeds[opt.seed]
+                print(f'reusing previous seed {opt.seed}')
+            except IndexError:
+                print(f'No previous seed at position {opt.seed} found')
+                opt.seed = None

-        try:
-            if opt.init_img is None:
-                results = t2i.txt2img(**vars(opt))
+        do_grid           = opt.grid or t2i.grid
+
+        if opt.with_variations is not None:
+            # shotgun parsing, woo
+            parts = []
+            broken = False # python doesn't have labeled loops...
+            for part in opt.with_variations.split(','):
+                seed_and_weight = part.split(':')
+                if len(seed_and_weight) != 2:
+                    print(f'could not parse with_variation part "{part}"')
+                    broken = True
+                    break
+                try:
+                    seed = int(seed_and_weight[0])
+                    weight = float(seed_and_weight[1])
+                except ValueError:
+                    print(f'could not parse with_variation part "{part}"')
+                    broken = True
+                    break
+                parts.append([seed, weight])
+            if broken:
+                continue
+            if len(parts) > 0:
+                opt.with_variations = parts
            else:
-                results = t2i.img2img(**vars(opt))
-            print("Outputs:")
-            write_log_message(opt,switches,results,log)
-        except KeyboardInterrupt:
-            print('*interrupted*')
+                opt.with_variations = None
+
+        if opt.outdir:
+            if not os.path.exists(opt.outdir):
+                os.makedirs(opt.outdir)
+            current_outdir = opt.outdir
+        elif prompt_as_dir:
+            # sanitize the prompt to a valid folder name
+            subdir = path_filter.sub('_', opt.prompt)[:name_max].rstrip(' .')
+
+            # truncate path to maximum allowed length
+            # 27 is the length of '######.##########.##.png', plus two separators and a NUL
+            subdir = subdir[:(path_max - 27 - len(os.path.abspath(outdir)))]
+            current_outdir = os.path.join(outdir, subdir)
+
+            print ('Writing files to directory: "' + current_outdir + '"')
+
+            # make sure the output directory exists
+            if not os.path.exists(current_outdir):
+                os.makedirs(current_outdir)
+        else:
+            current_outdir = outdir
+
+        # Here is where the images are actually generated!
+        try:
+            file_writer = PngWriter(current_outdir)
+            prefix = file_writer.unique_prefix()
+            seeds = set()
+            results = [] # list of filename, prompt pairs
+            grid_images = dict() # seed -> Image, only used if `do_grid`
+            def image_writer(image, seed, upscaled=False):
+                if do_grid:
+                    grid_images[seed] = image
+                else:
+                    if upscaled and opt.save_original:
+                        filename = f'{prefix}.{seed}.postprocessed.png'
+                    else:
+                        filename = f'{prefix}.{seed}.png'
+                    if opt.variation_amount > 0:
+                        iter_opt = argparse.Namespace(**vars(opt)) # copy
+                        this_variation = [[seed, opt.variation_amount]]
+                        if opt.with_variations is None:
+                            iter_opt.with_variations = this_variation
+                        else:
+                            iter_opt.with_variations = opt.with_variations + this_variation
+                        iter_opt.variation_amount = 0
+                        normalized_prompt = PromptFormatter(t2i, iter_opt).normalize_prompt()
+                        metadata_prompt = f'{normalized_prompt} -S{iter_opt.seed}'
+                    elif opt.with_variations is not None:
+                        normalized_prompt = PromptFormatter(t2i, opt).normalize_prompt()
+                        metadata_prompt = f'{normalized_prompt} -S{opt.seed}' # use the original seed - the per-iteration value is the last variation-seed
+                    else:
+                        normalized_prompt = PromptFormatter(t2i, opt).normalize_prompt()
+                        metadata_prompt = f'{normalized_prompt} -S{seed}'
+                    path = file_writer.save_image_and_prompt_to_png(image, metadata_prompt, filename)
+                    if (not upscaled) or opt.save_original:
+                        # only append to results if we didn't overwrite an earlier output
+                        results.append([path, metadata_prompt])
+
+                seeds.add(seed)
+
+            t2i.prompt2image(image_callback=image_writer, **vars(opt))
+
+            if do_grid and len(grid_images) > 0:
+                grid_img = make_grid(list(grid_images.values()))
+                first_seed = next(iter(seeds))
+                filename = f'{prefix}.{first_seed}.png'
+                # TODO better metadata for grid images
+                normalized_prompt = PromptFormatter(t2i, opt).normalize_prompt()
+                metadata_prompt = f'{normalized_prompt} -S{first_seed} --grid -N{len(grid_images)}'
+                path = file_writer.save_image_and_prompt_to_png(
+                    grid_img, metadata_prompt, filename
+                )
+                results = [[path, metadata_prompt]]
+
+            last_seeds = list(seeds)
+
+        except AssertionError as e:
+            print(e)
            continue

-    print("goodbye!")
+        except OSError as e:
+            print(e)
+            continue
+
+        print('Outputs:')
+        log_path = os.path.join(current_outdir, 'dream_log.txt')
+        write_log_message(results, log_path)
+
+    print('goodbye!')


-def write_log_message(opt,switches,results,logfile):
-    ''' logs the name of the output image, its prompt and seed to both the terminal and the log file '''
-    if opt.grid:
-        _output_for_grid(switches,results,logfile)
+def get_next_command(infile=None) -> str: #command string
+    if infile is None:
+        command = input('dream> ')
    else:
-        _output_for_individual(switches,results,logfile)
+        command = infile.readline()
+        if not command:
+            raise EOFError
+        else:
+            command = command.strip()
+        print(f'#{command}')
+    return command

-def _output_for_individual(switches,results,logfile):
-    for r in results:
-        log_message = " ".join(['   ',str(r[0])+':',
-                                f'"{switches[0]}"',
-                                *switches[1:],f'-S {r[1]}'])
-        print(log_message)
-        logfile.write(log_message+"\n")
-        logfile.flush()
+def dream_server_loop(t2i, host, port):
+    print('\n* --web was specified, starting web server...')
+    # Change working directory to the stable-diffusion directory
+    os.chdir(
+        os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+    )

-def _output_for_grid(switches,results,logfile):
-    first_seed = results[0][1]
-    log_message = " ".join(['   ',str(results[0][0])+':',
-                            f'"{switches[0]}"',
-                            *switches[1:],f'-S {results[0][1]}'])
-    print(log_message)
-    logfile.write(log_message+"\n")
-    all_seeds   = [row[1] for row in results]
-    log_message = f'    seeds for individual rows: {all_seeds}'
-    print(log_message)
-    logfile.write(log_message+"\n")
+    # Start server
+    DreamServer.model = t2i
+    dream_server = ThreadingDreamServer((host, port))
+    print(">> Started Stable Diffusion dream server!")
+    if host == '0.0.0.0':
+        print(f"Point your browser at http://localhost:{port} or use the host's DNS name or IP address.")
+    else:
+        print(">> Default host address now 127.0.0.1 (localhost). Use --host 0.0.0.0 to bind any address.")
+        print(f">> Point your browser at http://{host}:{port}.")
+
+    try:
+        dream_server.serve_forever()
+    except KeyboardInterrupt:
+        pass
+
+    dream_server.server_close()
+
+
+def write_log_message(results, log_path):
+    """logs the name of the output image, prompt, and prompt args to the terminal and log file"""
+    log_lines = [f'{path}: {prompt}\n' for path, prompt in results]
+    print(*log_lines, sep='')
+
+    with open(log_path, 'a', encoding='utf-8') as file:
+        file.writelines(log_lines)
+
+
+SAMPLER_CHOICES=[
+    'ddim',
+    'k_dpm_2_a',
+    'k_dpm_2',
+    'k_euler_a',
+    'k_euler',
+    'k_heun',
+    'k_lms',
+    'plms',
+]

 def create_argv_parser():
-    parser = argparse.ArgumentParser(description="Parse script's command line args")
-    parser.add_argument("--laion400m",
-                        "--latent_diffusion",
-                        "-l",
-                        dest='laion400m',
-                        action='store_true',
-                        help="fallback to the latent diffusion (LAION4400M) weights and config")
-    parser.add_argument('-n','--iterations',
-                        type=int,
-                        default=1,
-                        help="number of images to generate")
-    parser.add_argument('-b','--batch_size',
-                        type=int,
-                        default=1,
-                        help="number of images to produce per iteration (currently not working properly - producing too many images)")
-    parser.add_argument('--sampler',
-                        choices=['plms','ddim'],
-                        default='plms',
-                        help="which sampler to use")
-    parser.add_argument('-o',
-                        '--outdir',
-                        type=str,
-                        default="outputs/img-samples",
-                        help="directory in which to place generated images and a log of prompts and seeds")
+    parser = argparse.ArgumentParser(
+        description="""Generate images using Stable Diffusion.
+        Use --web to launch the web interface. 
+        Use --from_file to load prompts from a file path or standard input ("-").
+        Otherwise you will be dropped into an interactive command prompt (type -h for help.)
+        Other command-line arguments are defaults that can usually be overridden
+        at the command prompt.
+"""
+    )
+    parser.add_argument(
+        '--laion400m',
+        '--latent_diffusion',
+        '-l',
+        dest='laion400m',
+        action='store_true',
+        help='Fallback to the latent diffusion (laion400m) weights and config',
+    )
+    parser.add_argument(
+        '--from_file',
+        dest='infile',
+        type=str,
+        help='If specified, load prompts from this file',
+    )
+    parser.add_argument(
+        '-n',
+        '--iterations',
+        type=int,
+        default=1,
+        help='Number of images to generate',
+    )
+    parser.add_argument(
+        '-F',
+        '--full_precision',
+        dest='full_precision',
+        action='store_true',
+        help='Use more memory-intensive full precision math for calculations',
+    )
+    parser.add_argument(
+        '-g',
+        '--grid',
+        action='store_true',
+        help='Generate a grid instead of individual images',
+    )
+    parser.add_argument(
+        '-A',
+        '-m',
+        '--sampler',
+        dest='sampler_name',
+        choices=SAMPLER_CHOICES,
+        metavar='SAMPLER_NAME',
+        default='k_lms',
+        help=f'Set the initial sampler. Default: k_lms. Supported samplers: {", ".join(SAMPLER_CHOICES)}',
+    )
+    parser.add_argument(
+        '--outdir',
+        '-o',
+        type=str,
+        default='outputs/img-samples',
+        help='Directory to save generated images and a log of prompts and seeds. Default: outputs/img-samples',
+    )
+    parser.add_argument(
+        '--embedding_path',
+        type=str,
+        help='Path to a pre-trained embedding manager checkpoint - can only be set on command line',
+    )
+    parser.add_argument(
+        '--prompt_as_dir',
+        '-p',
+        action='store_true',
+        help='Place images in subdirectories named after the prompt.',
+    )
+    # GFPGAN related args
+    parser.add_argument(
+        '--gfpgan_bg_upsampler',
+        type=str,
+        default='realesrgan',
+        help='Background upsampler. Default: realesrgan. Options: realesrgan, none. Only used if --gfpgan is specified',
+
+    )
+    parser.add_argument(
+        '--gfpgan_bg_tile',
+        type=int,
+        default=400,
+        help='Tile size for background sampler, 0 for no tile during testing. Default: 400.',
+    )
+    parser.add_argument(
+        '--gfpgan_model_path',
+        type=str,
+        default='experiments/pretrained_models/GFPGANv1.3.pth',
+        help='Indicates the path to the GFPGAN model, relative to --gfpgan_dir.',
+    )
+    parser.add_argument(
+        '--gfpgan_dir',
+        type=str,
+        default='../GFPGAN',
+        help='Indicates the directory containing the GFPGAN code.',
+    )
+    parser.add_argument(
+        '--web',
+        dest='web',
+        action='store_true',
+        help='Start in web server mode.',
+    )
+    parser.add_argument(
+        '--host',
+        type=str,
+        default='127.0.0.1',
+        help='Web server: Host or IP to listen on. Set to 0.0.0.0 to accept traffic from other devices on your network.'
+    )
+    parser.add_argument(
+        '--port',
+        type=int,
+        default='9090',
+        help='Web server: Port to listen on'
+    )
+    parser.add_argument(
+        '--weights',
+        default='model',
+        help='Indicates the Stable Diffusion model to use.',
+    )
+    parser.add_argument(
+        '--device',
+        '-d',
+        type=str,
+        default='cuda',
+        help="device to run stable diffusion on. defaults to cuda `torch.cuda.current_device()` if available"
+    )
+    parser.add_argument(
+        '--model',
+        default='stable-diffusion-1.4',
+        help='Indicates which diffusion model to load. (currently "stable-diffusion-1.4" (default) or "laion400m")',
+    )
+    parser.add_argument(
+        '--config',
+        default ='configs/models.yaml',
+        help    ='Path to configuration file for alternate models.',
+    )
    return parser
-                        
-    
+
+
 def create_cmd_parser():
-    parser = argparse.ArgumentParser(description='Example: dream> a fantastic alien landscape -W1024 -H960 -s100 -n12')
+    parser = argparse.ArgumentParser(
+        description='Example: dream> a fantastic alien landscape -W1024 -H960 -s100 -n12'
+    )
    parser.add_argument('prompt')
-    parser.add_argument('-s','--steps',type=int,help="number of steps")
-    parser.add_argument('-S','--seed',type=int,help="image seed")
-    parser.add_argument('-n','--iterations',type=int,default=1,help="number of samplings to perform")
-    parser.add_argument('-b','--batch_size',type=int,default=1,help="number of images to produce per sampling (currently broken)")
-    parser.add_argument('-W','--width',type=int,help="image width, multiple of 64")
-    parser.add_argument('-H','--height',type=int,help="image height, multiple of 64")
-    parser.add_argument('-C','--cfg_scale',default=7.5,type=float,help="prompt configuration scale")
-    parser.add_argument('-g','--grid',action='store_true',help="generate a grid")
-    parser.add_argument('-i','--individual',action='store_true',help="generate individual files (default)")
-    parser.add_argument('-I','--init_img',type=str,help="path to input image (supersedes width and height)")
-    parser.add_argument('-f','--strength',default=0.75,type=float,help="strength for noising/unnoising. 0.0 preserves image exactly, 1.0 replaces it completely")
+    parser.add_argument('-s', '--steps', type=int, help='Number of steps')
+    parser.add_argument(
+        '-S',
+        '--seed',
+        type=int,
+        help='Image seed; a +ve integer, or use -1 for the previous seed, -2 for the one before that, etc',
+    )
+    parser.add_argument(
+        '-n',
+        '--iterations',
+        type=int,
+        default=1,
+        help='Number of samplings to perform (slower, but will provide seeds for individual images)',
+    )
+    parser.add_argument(
+        '-W', '--width', type=int, help='Image width, multiple of 64'
+    )
+    parser.add_argument(
+        '-H', '--height', type=int, help='Image height, multiple of 64'
+    )
+    parser.add_argument(
+        '-C',
+        '--cfg_scale',
+        default=7.5,
+        type=float,
+        help='Classifier free guidance (CFG) scale - higher numbers cause generator to "try" harder.',
+    )
+    parser.add_argument(
+        '-g', '--grid', action='store_true', help='generate a grid'
+    )
+    parser.add_argument(
+        '--outdir',
+        '-o',
+        type=str,
+        default=None,
+        help='Directory to save generated images and a log of prompts and seeds',
+    )
+    parser.add_argument(
+        '-i',
+        '--individual',
+        action='store_true',
+        help='Generate individual files (default)',
+    )
+    parser.add_argument(
+        '-I',
+        '--init_img',
+        type=str,
+        help='Path to input image for img2img mode (supersedes width and height)',
+    )
+    parser.add_argument(
+        '-T',
+        '-fit',
+        '--fit',
+        action='store_true',
+        help='If specified, will resize the input image to fit within the dimensions of width x height (512x512 default)',
+    )
+    parser.add_argument(
+        '-f',
+        '--strength',
+        default=0.75,
+        type=float,
+        help='Strength for noising/unnoising. 0.0 preserves image exactly, 1.0 replaces it completely',
+    )
+    parser.add_argument(
+        '-G',
+        '--gfpgan_strength',
+        default=0,
+        type=float,
+        help='The strength at which to apply the GFPGAN model to the result, in order to improve faces.',
+    )
+    parser.add_argument(
+        '-U',
+        '--upscale',
+        nargs='+',
+        default=None,
+        type=float,
+        help='Scale factor (2, 4) for upscaling followed by upscaling strength (0-1.0). If strength not specified, defaults to 0.75'
+    )
+    parser.add_argument(
+        '-save_orig',
+        '--save_original',
+        action='store_true',
+        help='Save original. Use it when upscaling to save both versions.',
+    )
+    # variants is going to be superseded by a generalized "prompt-morph" function
+    #    parser.add_argument('-v','--variants',type=int,help="in img2img mode, the first generated image will get passed back to img2img to generate the requested number of variants")
+    parser.add_argument(
+        '-x',
+        '--skip_normalize',
+        action='store_true',
+        help='Skip subprompt weight normalization',
+    )
+    parser.add_argument(
+        '-A',
+        '-m',
+        '--sampler',
+        dest='sampler_name',
+        default=None,
+        type=str,
+        choices=SAMPLER_CHOICES,
+        metavar='SAMPLER_NAME',
+        help=f'Switch to a different sampler. Supported samplers: {", ".join(SAMPLER_CHOICES)}',
+    )
+    parser.add_argument(
+        '-t',
+        '--log_tokenization',
+        action='store_true',
+        help='shows how the prompt is split into tokens'
+    )
+    parser.add_argument(
+        '-v',
+        '--variation_amount',
+        default=0.0,
+        type=float,
+        help='If > 0, generates variations on the initial seed instead of random seeds per iteration. Must be between 0 and 1. Higher values will be more different.'
+    )
+    parser.add_argument(
+        '-V',
+        '--with_variations',
+        default=None,
+        type=str,
+        help='list of variations to apply, in the format `seed:weight,seed:weight,...`'
+    )
    return parser

-if readline_available:
-    def setup_readline():
-        readline.set_completer(Completer(['--steps','-s','--seed','-S','--iterations','-n','--batch_size','-b',
-                                          '--width','-W','--height','-H','--cfg_scale','-C','--grid','-g',
-                                          '--individual','-i','--init_img','-I','--strength','-f']).complete)
-        readline.set_completer_delims(" ")
-        readline.parse_and_bind('tab: complete')
-        load_history()

-    def load_history():
-        histfile = os.path.join(os.path.expanduser('~'),".dream_history")
-        try:
-            readline.read_history_file(histfile)
-            readline.set_history_length(1000)
-        except FileNotFoundError:
-            pass
-        atexit.register(readline.write_history_file,histfile)
-
-    class Completer():
-        def __init__(self,options):
-            self.options = sorted(options)
-            return
-
-        def complete(self,text,state):
-            if text.startswith('-I') or text.startswith('--init_img'):
-                return self._image_completions(text,state)
-
-            response = None
-            if state == 0:
-                # This is the first time for this text, so build a match list.
-                if text:
-                    self.matches = [s 
-                                    for s in self.options
-                                    if s and s.startswith(text)]
-                else:
-                    self.matches = self.options[:]
-
-            # Return the state'th item from the match list,
-            # if we have that many.
-            try:
-                response = self.matches[state]
-            except IndexError:
-                response = None
-            return response
-
-        def _image_completions(self,text,state):
-            # get the path so far
-            if text.startswith('-I'):
-                path = text.replace('-I','',1).lstrip()
-            elif text.startswith('--init_img='):
-                path = text.replace('--init_img=','',1).lstrip()
-
-            matches  = list()
-
-            path = os.path.expanduser(path)
-            if len(path)==0:
-                matches.append(text+'./')
-            else:
-                dir  = os.path.dirname(path)
-                dir_list = os.listdir(dir)
-                for n in dir_list:
-                    if n.startswith('.') and len(n)>1:
-                        continue
-                    full_path = os.path.join(dir,n)
-                    if full_path.startswith(path):
-                        if os.path.isdir(full_path):
-                            matches.append(os.path.join(os.path.dirname(text),n)+'/')
-                        elif n.endswith('.png'):
-                            matches.append(os.path.join(os.path.dirname(text),n))
-
-            try:
-                response = matches[state]
-            except IndexError:
-                response = None
-            return response
-        
-
-if __name__ == "__main__":
+if __name__ == '__main__':
    main()
--- a/scripts/images2prompt.py
+++ b/scripts/images2prompt.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+'''This script reads the "Dream" Stable Diffusion prompt embedded in files generated by dream.py'''
+
+import sys
+from PIL import Image,PngImagePlugin
+
+if len(sys.argv) < 2:
+    print("Usage: images2prompt.py <file1.png> <file2.png> <file3.png>...")
+    print("This script opens up the indicated dream.py-generated PNG file(s) and prints out the prompt used to generate them.")
+    exit(-1)
+
+filenames = sys.argv[1:]
+for f in filenames:
+    try:
+        im = Image.open(f)
+        try:
+            prompt = im.text['Dream']
+        except KeyError:
+            prompt = ''
+        print(f'{f}: {prompt}')
+    except FileNotFoundError:
+        sys.stderr.write(f'{f} not found\n')
+        continue
+    except PermissionError:
+        sys.stderr.write(f'{f} could not be opened due to inadequate permissions\n')
+        continue
+        
+
+
+
--- a/scripts/inpaint.py
+++ b/scripts/inpaint.py
@@ -6,7 +6,7 @@ import numpy as np
 import torch
 from main import instantiate_from_config
 from ldm.models.diffusion.ddim import DDIMSampler
-
+from ldm.dream.devices import choose_torch_device

 def make_batch(image, mask, device):
    image = np.array(Image.open(image).convert("RGB"))
@@ -61,8 +61,8 @@ if __name__ == "__main__":
    model.load_state_dict(torch.load("models/ldm/inpainting_big/last.ckpt")["state_dict"],
                          strict=False)

-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-    model = model.to(device)
+    device  = choose_torch_device()
+    model   = model.to(device)
    sampler = DDIMSampler(model)

    os.makedirs(opt.outdir, exist_ok=True)
--- a/scripts/merge_embeddings.py
+++ b/scripts/merge_embeddings.py
@@ -0,0 +1,115 @@
+from ldm.modules.encoders.modules import FrozenCLIPEmbedder, BERTEmbedder
+from ldm.modules.embedding_manager import EmbeddingManager
+
+import argparse, os
+from functools import partial
+
+import torch
+
+def get_placeholder_loop(placeholder_string, embedder, use_bert):
+    """Ask the user for a replacement placeholder until it maps to one token.
+
+    Returns a ``(replacement_string, token)`` pair. The token is looked up with
+    the BERT helper (via ``embedder.tknz_fn``) when ``use_bert`` is true and
+    with the CLIP helper (via ``embedder.tokenizer``) otherwise.
+    """
+    # First prompt explains the collision; later prompts explain the rejection.
+    question = f"Placeholder string {placeholder_string} was already used. Please enter a replacement string: "
+
+    while True:
+        candidate = input(question)
+
+        if use_bert:
+            token = get_bert_token_for_string(embedder.tknz_fn, candidate)
+        else:
+            token = get_clip_token_for_string(embedder.tokenizer, candidate)
+
+        if token is not None:
+            return candidate, token
+
+        question = f"Placeholder string '{candidate}' maps to more than a single token. Please enter another string: "
+            
+def get_clip_token_for_string(tokenizer, string):
+    """Return the token at position [0, 1] if `string` encodes to one token.
+
+    The string is tokenized with padding to 77 entries. When exactly two
+    entries differ from 49407 the second entry is returned; otherwise None.
+    """
+    encoded = tokenizer(
+        string,
+        truncation=True,
+        max_length=77,
+        return_length=True,
+        return_overflowing_tokens=False,
+        padding="max_length",
+        return_tensors="pt",
+    )
+    token_ids = encoded["input_ids"]
+
+    # NOTE(review): 49407 is presumably the CLIP end-of-text/padding id, leaving
+    # a start marker plus one real token as the two remaining entries - confirm.
+    differing = torch.count_nonzero(token_ids - 49407)
+    if differing != 2:
+        return None
+
+    return token_ids[0, 1]
+
+def get_bert_token_for_string(tokenizer, string):
+    """Return the token at position [0, 1] if `string` yields three non-zero ids.
+
+    Any other count of non-zero ids gives None.
+    """
+    token_ids = tokenizer(string)
+    # NOTE(review): three non-zero ids presumably means start marker, one word
+    # piece and end marker, with zero used as padding - confirm.
+    is_single_token = torch.count_nonzero(token_ids) == 3
+    return token_ids[0, 1] if is_single_token else None
+
+
+if __name__ == "__main__":
+    # Merge several saved embedding-manager checkpoints into a single file.
+    # When two checkpoints use the same placeholder string, the user is asked
+    # for a replacement string for the later one.
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--manager_ckpts", 
+        type=str, 
+        nargs="+", 
+        required=True,
+        help="Paths to a set of embedding managers to be merged."
+    )
+
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        required=True,
+        help="Output path for the merged manager",
+    )
+
+    parser.add_argument(
+        "-sd", "--use_bert",
+        action="store_true",
+        help="Flag to denote that we are not merging stable diffusion embeddings"
+    )
+
+    args = parser.parse_args()
+
+    if args.use_bert:
+        embedder = BERTEmbedder(n_embed=1280, n_layer=32).cuda()
+    else:
+        embedder = FrozenCLIPEmbedder().cuda()
+
+    # Pre-bound constructor, kept under its own name so the imported
+    # EmbeddingManager class is not shadowed.
+    make_manager = partial(EmbeddingManager, embedder, ["*"])
+
+    string_to_token_dict = {}    
+    string_to_param_dict = torch.nn.ParameterDict()
+
+    # Records which checkpoint each final placeholder came from (for the report).
+    placeholder_to_src = {}
+
+    for manager_ckpt in args.manager_ckpts:
+        print(f"Parsing {manager_ckpt}...")
+
+        manager = make_manager()
+        manager.load(manager_ckpt)
+
+        for placeholder_string in manager.string_to_token_dict:
+            if placeholder_string not in string_to_token_dict:
+                string_to_token_dict[placeholder_string] = manager.string_to_token_dict[placeholder_string]
+                string_to_param_dict[placeholder_string] = manager.string_to_param_dict[placeholder_string]
+
+                placeholder_to_src[placeholder_string] = manager_ckpt
+            else:
+                new_placeholder, new_token = get_placeholder_loop(placeholder_string, embedder, use_bert=args.use_bert)
+                # The replacement may itself already be taken; keep asking so an
+                # earlier embedding is never silently overwritten.
+                while new_placeholder in string_to_token_dict:
+                    new_placeholder, new_token = get_placeholder_loop(new_placeholder, embedder, use_bert=args.use_bert)
+                string_to_token_dict[new_placeholder] = new_token
+                string_to_param_dict[new_placeholder] = manager.string_to_param_dict[placeholder_string]
+
+                placeholder_to_src[new_placeholder] = manager_ckpt
+
+    print("Saving combined manager...")
+    merged_manager = make_manager()
+    merged_manager.string_to_param_dict = string_to_param_dict
+    merged_manager.string_to_token_dict = string_to_token_dict
+    merged_manager.save(args.output_path)
+
+    print("Managers merged. Final list of placeholders: ")
+    print(placeholder_to_src)
--- a/scripts/orig_scripts/download_first_stages.sh
+++ b/scripts/orig_scripts/download_first_stages.sh
--- a/scripts/orig_scripts/download_models.sh
+++ b/scripts/orig_scripts/download_models.sh
--- a/scripts/orig_scripts/img2img.py
+++ b/scripts/orig_scripts/img2img.py
@@ -18,6 +18,7 @@ from pytorch_lightning import seed_everything
 from ldm.util import instantiate_from_config
 from ldm.models.diffusion.ddim import DDIMSampler
 from ldm.models.diffusion.plms import PLMSSampler
+from ldm.dream.devices         import choose_torch_device


 def chunk(it, size):
@@ -40,7 +41,7 @@ def load_model_from_config(config, ckpt, verbose=False):
        print("unexpected keys:")
        print(u)

-    model.cuda()
+    model.to(choose_torch_device())
    model.eval()
    return model

@@ -199,7 +200,7 @@ def main():
    config = OmegaConf.load(f"{opt.config}")
    model = load_model_from_config(config, f"{opt.ckpt}")

-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    device = torch.device(choose_torch_device())
    model = model.to(device)

    if opt.plms:
@@ -241,8 +242,10 @@ def main():
    print(f"target t_enc is {t_enc} steps")

    precision_scope = autocast if opt.precision == "autocast" else nullcontext
+    if device.type in ['mps', 'cpu']:
+        precision_scope = nullcontext # have to use f32 on mps
    with torch.no_grad():
-        with precision_scope("cuda"):
+        with precision_scope(device.type):
            with model.ema_scope():
                tic = time.time()
                all_samples = list()
--- a/scripts/orig_scripts/knn2img.py
+++ b/scripts/orig_scripts/knn2img.py
--- a/scripts/orig_scripts/latent_imagenet_diffusion.ipynb
+++ b/scripts/orig_scripts/latent_imagenet_diffusion.ipynb
--- a/scripts/orig_scripts/sample_diffusion.py
+++ b/scripts/orig_scripts/sample_diffusion.py
--- a/scripts/orig_scripts/train_searcher.py
+++ b/scripts/orig_scripts/train_searcher.py
--- a/scripts/orig_scripts/txt2img.py
+++ b/scripts/orig_scripts/txt2img.py
@@ -12,10 +12,13 @@ from pytorch_lightning import seed_everything
 from torch import autocast
 from contextlib import contextmanager, nullcontext

-from ldm.util import instantiate_from_config
+import k_diffusion as K
+import torch.nn as nn
+
+from ldm.util                  import instantiate_from_config
 from ldm.models.diffusion.ddim import DDIMSampler
 from ldm.models.diffusion.plms import PLMSSampler
-
+from ldm.dream.devices         import choose_torch_device

 def chunk(it, size):
    it = iter(it)
@@ -37,7 +40,7 @@ def load_model_from_config(config, ckpt, verbose=False):
        print("unexpected keys:")
        print(u)

-    model.cuda()
+    model.to(choose_torch_device())
    model.eval()
    return model

@@ -80,6 +83,11 @@ def main():
        action='store_true',
        help="use plms sampling",
    )
+    parser.add_argument(
+        "--klms",
+        action='store_true',
+        help="use klms sampling",
+    )
    parser.add_argument(
        "--laion400m",
        action='store_true',
@@ -182,13 +190,28 @@ def main():
        opt.ckpt = "models/ldm/text2img-large/model.ckpt"
        opt.outdir = "outputs/txt2img-samples-laion400m"

-    seed_everything(opt.seed)

    config = OmegaConf.load(f"{opt.config}")
    model = load_model_from_config(config, f"{opt.ckpt}")

-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-    model = model.to(device)
+    seed_everything(opt.seed)
+
+    device = torch.device(choose_torch_device())
+    model  = model.to(device)
+
+    #for klms
+    model_wrap = K.external.CompVisDenoiser(model)
+    class CFGDenoiser(nn.Module):
+        """Wrap a denoiser so one batched call gives a guided prediction.
+
+        The result is ``uncond + (cond - uncond) * cond_scale``, computed from
+        the wrapped model's outputs for the two conditionings. (The name
+        presumably refers to classifier-free guidance.)
+        """
+        def __init__(self, model):
+            super().__init__()
+            self.inner_model = model
+
+        def forward(self, x, sigma, uncond, cond, cond_scale):
+            # Stack two copies of the inputs so both conditionings run in a
+            # single call to the wrapped model.
+            doubled_x = torch.cat([x, x])
+            doubled_sigma = torch.cat([sigma, sigma])
+            both_conds = torch.cat([uncond, cond])
+            denoised = self.inner_model(doubled_x, doubled_sigma, cond=both_conds)
+            out_uncond, out_cond = denoised.chunk(2)
+            return out_uncond + (out_cond - out_uncond) * cond_scale

    if opt.plms:
        sampler = PLMSSampler(model)
@@ -218,11 +241,17 @@ def main():

    start_code = None
    if opt.fixed_code:
-        start_code = torch.randn([opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f], device=device)
+        shape = [opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f]
+        if device.type == 'mps':
+            # draw the noise on the CPU, then move it to the mps device
+            start_code = torch.randn(shape, device='cpu').to(device)
+        else:
+            # the result must be kept: without this assignment start_code
+            # stayed None and --fixed_code had no effect on this path
+            start_code = torch.randn(shape, device=device)

    precision_scope = autocast if opt.precision=="autocast" else nullcontext
+    if device.type in ['mps', 'cpu']:
+        precision_scope = nullcontext # have to use f32 on mps
    with torch.no_grad():
-        with precision_scope("cuda"):
+        with precision_scope(device.type):
            with model.ema_scope():
                tic = time.time()
                all_samples = list()
@@ -235,16 +264,27 @@ def main():
                            prompts = list(prompts)
                        c = model.get_learned_conditioning(prompts)
                        shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
-                        samples_ddim, _ = sampler.sample(S=opt.ddim_steps,
-                                                         conditioning=c,
-                                                         batch_size=opt.n_samples,
-                                                         shape=shape,
-                                                         verbose=False,
-                                                         unconditional_guidance_scale=opt.scale,
-                                                         unconditional_conditioning=uc,
-                                                         eta=opt.ddim_eta,
-                                                         x_T=start_code)
-
+                        
+                        if not opt.klms:
+                            samples_ddim, _ = sampler.sample(S=opt.ddim_steps,
+                                                            conditioning=c,
+                                                            batch_size=opt.n_samples,
+                                                            shape=shape,
+                                                            verbose=False,
+                                                            unconditional_guidance_scale=opt.scale,
+                                                            unconditional_conditioning=uc,
+                                                            eta=opt.ddim_eta,
+                                                            x_T=start_code)
+                        else:
+                            sigmas = model_wrap.get_sigmas(opt.ddim_steps)
+                            # compare with None: truth-testing a multi-element tensor raises RuntimeError
+                            if start_code is not None:
+                                # scale the fixed noise by the first sigma, like the random draw below
+                                x = start_code * sigmas[0]
+                            else:
+                                x = torch.randn([opt.n_samples, *shape], device=device) * sigmas[0] # for GPU draw
+                            model_wrap_cfg = CFGDenoiser(model_wrap)
+                            extra_args = {'cond': c, 'uncond': uc, 'cond_scale': opt.scale}
+                            samples_ddim = K.sampling.sample_lms(model_wrap_cfg, x, sigmas, extra_args=extra_args)
+                        
                        x_samples_ddim = model.decode_first_stage(samples_ddim)
                        x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)

--- a/scripts/preload_models.py
+++ b/scripts/preload_models.py
@@ -1,17 +1,86 @@
-#!/usr/bin/env python
-
+#!/usr/bin/env python3
+# Copyright (c) 2022 Lincoln D. Stein (https://github.com/lstein)
 # Before running stable-diffusion on an internet-isolated machine,
 # run this script from one with internet connectivity. The
 # two machines must share a common .cache directory.
+from transformers import CLIPTokenizer, CLIPTextModel
+import clip
+from transformers import BertTokenizerFast
+import sys
+import transformers
+import os
+import warnings
+
+transformers.logging.set_verbosity_error()

 # this will preload the Bert tokenizer fles
-print("preloading bert tokenizer...")
-from transformers import BertTokenizerFast
-tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
-print("...success")
+print('preloading bert tokenizer...')
+
+tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
+print('...success')

 # this will download requirements for Kornia
-print("preloading Kornia requirements...")
-import kornia
-print("...success")
+print('preloading Kornia requirements (ignore the deprecation warnings)...')
+with warnings.catch_warnings():
+    warnings.filterwarnings('ignore', category=DeprecationWarning)
+    import kornia
+print('...success')

+version = 'openai/clip-vit-large-patch14'
+
+print('preloading CLIP model (Ignore the deprecation warnings)...')
+sys.stdout.flush()
+
+tokenizer = CLIPTokenizer.from_pretrained(version)
+transformer = CLIPTextModel.from_pretrained(version)
+print('\n\n...success')
+
+# If the realesrgan package can be imported, construct the RealESRGAN
+# upscalers and the facexlib face helper once here; per the original note,
+# this attempts to download the model files they need.
+try:
+    from realesrgan import RealESRGANer
+except ModuleNotFoundError:
+    gfpgan = False
+else:
+    gfpgan = True
+
+if gfpgan:
+    print('Loading models from RealESRGAN and facexlib')
+    try:
+        from basicsr.archs.rrdbnet_arch import RRDBNet
+        from facexlib.utils.face_restoration_helper import FaceRestoreHelper
+
+        # (upscaling factor, weights URL) for each RealESRGAN model, in the
+        # same order as before: 2x first, then 4x.
+        for upscale, weights_url in (
+            (2, 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth'),
+            (4, 'https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth'),
+        ):
+            RealESRGANer(
+                scale=upscale,
+                model_path=weights_url,
+                model=RRDBNet(
+                    num_in_ch=3,
+                    num_out_ch=3,
+                    num_feat=64,
+                    num_block=23,
+                    num_grow_ch=32,
+                    scale=upscale,
+                ),
+            )
+
+        FaceRestoreHelper(1, det_model='retinaface_resnet50')
+        print('...success')
+    except Exception:
+        # Best effort: report the failure and let the script finish.
+        import traceback
+
+        print('Error loading GFPGAN:')
+        print(traceback.format_exc())
--- a/static/colab_notebook.png
+++ b/static/colab_notebook.png
--- a/static/dream-py-demo.png
+++ b/static/dream-py-demo.png
--- a/static/dream_web/index.css
+++ b/static/dream_web/index.css
@@ -0,0 +1,97 @@
+* {
+    font-family: 'Arial';
+}
+#header {
+    text-decoration: dotted underline;
+}
+#search {
+    margin-top: 20vh;
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 1024px;
+    text-align: center;
+}
+fieldset {
+    border: none;
+}
+div {
+    padding: 10px 10px 10px 10px;
+}
+#fieldset-search {
+    display: flex;
+}
+#scaling-inprocess-message{
+    font-weight: bold;
+    font-style: italic;
+    display: none;
+}
+#prompt {
+    flex-grow: 1;
+
+    border-radius: 20px 0px 0px 20px;
+    padding: 5px 10px 5px 10px;
+    border: 1px solid black;
+    border-right: none;
+    outline: none;
+}
+#submit {
+    border-radius: 0px 20px 20px 0px;
+    padding: 5px 10px 5px 10px;
+    border: 1px solid black;
+}
+#reset-all {
+    background-color: pink;
+}
+#results {
+    text-align: center;
+    /* max-width: 1024px; */
+    margin: auto;
+    padding-top: 10px;
+}
+#results img {
+    cursor: pointer;
+    height: 30vh;
+    border-radius: 5px;
+    margin: 10px;
+}
+#fieldset-config {
+    line-height:2em;
+}
+input[type="number"] {
+    width: 60px;
+}
+#seed {
+    width: 150px;
+}
+hr {
+    /* width: 200px; */
+}
+label {
+    white-space: nowrap;
+}
+#progress-section {
+    display: none;
+}
+#progress-image {
+    width: 30vh;
+    height: 30vh;
+}
+#cancel-button {
+    cursor: pointer;
+    color: red;
+}
+#txt2img {
+    background-color: #DCDCDC;
+}
+#img2img {
+    background-color: #F5F5F5;
+}
+#gfpgan {
+    background-color: #DCDCDC;
+}
+#progress-section {
+    background-color: #F5F5F5;
+}
+#about {
+    background-color: #DCDCDC;
+}
--- a/static/dream_web/index.html
+++ b/static/dream_web/index.html
@@ -0,0 +1,111 @@
+<html lang="en">
+  <head>
+    <title>Stable Diffusion Dream Server</title>
+    <meta charset="utf-8">
+    <link rel="icon" href="data:,">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+    <link rel="stylesheet" href="static/dream_web/index.css">
+    <script src="config.js"></script>
+    <script src="static/dream_web/index.js"></script>
+  </head>
+  <body>
+    <div id="search">
+      <h2 id="header">Stable Diffusion Dream Server</h2>
+
+      <form id="generate-form" method="post" action="#">
+        <div id="txt2img">
+          <fieldset id="fieldset-search">
+            <input type="text" id="prompt" name="prompt">
+            <input type="submit" id="submit" value="Generate">
+          </fieldset>
+          <fieldset id="fieldset-config">
+            <label for="iterations">Images to generate:</label>
+            <input value="1" type="number" id="iterations" name="iterations" size="4">
+            <label for="steps">Steps:</label>
+            <input value="50" type="number" id="steps" name="steps">
+            <label for="cfgscale">Cfg Scale:</label>
+            <input value="7.5" type="number" id="cfgscale" name="cfgscale" step="any">
+            <label for="sampler">Sampler:</label>
+            <select id="sampler" name="sampler" value="k_lms">
+              <option value="ddim">DDIM</option>
+              <option value="plms">PLMS</option>
+              <option value="k_lms" selected>KLMS</option>
+              <option value="k_dpm_2">KDPM_2</option>
+              <option value="k_dpm_2_a">KDPM_2A</option>
+              <option value="k_euler">KEULER</option>
+              <option value="k_euler_a">KEULER_A</option>
+              <option value="k_heun">KHEUN</option>
+            </select>
+            <br>
+            <label title="Set to multiple of 64" for="width">Width:</label>
+            <select id="width" name="width" value="512">
+              <option value="64">64</option> <option value="128">128</option>
+              <option value="192">192</option> <option value="256">256</option>
+              <option value="320">320</option> <option value="384">384</option>
+              <option value="448">448</option> <option value="512" selected>512</option>
+              <option value="576">576</option> <option value="640">640</option>
+              <option value="704">704</option> <option value="768">768</option>
+              <option value="832">832</option> <option value="896">896</option>
+              <option value="960">960</option> <option value="1024">1024</option>
+            </select>
+            <label title="Set to multiple of 64" for="height">Height:</label>
+            <select id="height" name="height" value="512">
+              <option value="64">64</option> <option value="128">128</option>
+              <option value="192">192</option> <option value="256">256</option>
+              <option value="320">320</option> <option value="384">384</option>
+              <option value="448">448</option> <option value="512" selected>512</option>
+              <option value="576">576</option> <option value="640">640</option>
+              <option value="704">704</option> <option value="768">768</option>
+              <option value="832">832</option> <option value="896">896</option>
+              <option value="960">960</option> <option value="1024">1024</option>
+            </select>
+            <label title="Set to -1 for random seed" for="seed">Seed:</label>
+            <input value="-1" type="number" id="seed" name="seed">
+            <button type="button" id="reset-seed">&olarr;</button>
+            <input type="checkbox" name="progress_images" id="progress_images">
+            <label for="progress_images">Display in-progress images (slows down generation):</label>
+            <button type="button" id="reset-all">Reset to Defaults</button>
+          </fieldset>
+        </div>
+        <div id="img2img">
+          <label title="Upload an image to use img2img" for="initimg">Initial image:</label>
+          <input type="file" id="initimg" name="initimg" accept=".jpg, .jpeg, .png">
+          <br>
+          <label for="strength">Img2Img Strength:</label>
+          <input value="0.75" type="number" id="strength" name="strength" step="0.01" min="0" max="1">
+          <input type="checkbox" id="fit" name="fit" checked>
+          <label title="Rescale image to fit within requested width and height" for="fit">Fit to width/height:</label>
+        </div>
+        <div id="gfpgan">
+          <label title="Strength of the gfpgan (face fixing) algorithm." for="gfpgan_strength">GFPGAN Strength (0 to disable):</label>
+          <input value="0.8" min="0" max="1" type="number" id="gfpgan_strength" name="gfpgan_strength" step="0.05">
+          <label title="Upscaling to perform using ESRGAN." for="upscale_level">Upscaling Level</label>
+          <select id="upscale_level" name="upscale_level" value="">
+            <option value="" selected>None</option>
+            <option value="2">2x</option>
+            <option value="4">4x</option>
+          </select>
+          <label title="Strength of the esrgan (upscaling) algorithm." for="upscale_strength">Upscale Strength:</label>
+          <input value="0.75" min="0" max="1" type="number" id="upscale_strength" name="upscale_strength" step="0.05">
+        </div>
+      </form>
+      <div id="about">For news and support for this web service, visit our <a href="http://github.com/lstein/stable-diffusion">GitHub site</a></div>
+      <br>
+      <div id="progress-section">
+        <progress id="progress-bar" value="0" max="1"></progress>
+        <span id="cancel-button" title="Cancel">&#10006;</span>
+        <br>
+        <img id="progress-image" src='data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg"/>'></img>
+        <div id="scaling-inprocess-message">
+          <i><span>Postprocessing...</span><span id="processing_cnt">1/3</span></i>
+        </div>
+      </div>
+    </div>
+    <div id="results">
+      <div id="no-results-message">
+        <i><p>No results...</p></i>
+      </div>
+    </div>
+  </body>
+</html>
--- a/static/dream_web/index.js
+++ b/static/dream_web/index.js
@@ -0,0 +1,161 @@
+// Read `file` as a data: URL. The promise resolves with the reader's result
+// and rejects with the reader's error event.
+function toBase64(file) {
+    return new Promise((resolve, reject) => {
+        const reader = new FileReader();
+        reader.onload = () => resolve(reader.result);
+        reader.onerror = (error) => reject(error);
+        reader.readAsDataURL(file);
+    });
+}
+
+// Prepend a result image to #results. Clicking the image copies the config
+// it was generated with (and its seed) back into the form and saves the
+// form fields.
+function appendOutput(src, seed, config) {
+    let outputNode = document.createElement("img");
+    outputNode.src = src;
+
+    let altText = seed.toString() + " | " + config.prompt;
+    outputNode.alt = altText;
+    outputNode.title = altText;
+
+    // Reload image config
+    outputNode.addEventListener('click', () => {
+        let form = document.querySelector("#generate-form");
+        for (const k of new FormData(form).keys()) {
+            const field = form.querySelector(`*[name=${k}]`);
+            // File inputs throw when script assigns a non-empty value, and a
+            // key missing from config would write the text "undefined".
+            if (field.type === 'file' || !(k in config)) {
+                continue;
+            }
+            field.value = config[k];
+        }
+        document.querySelector("#seed").value = seed;
+
+        saveFields(document.querySelector("#generate-form"));
+    });
+
+    document.querySelector("#results").prepend(outputNode);
+}
+
+// Store every non-file form value in localStorage under its field name.
+function saveFields(form) {
+    new FormData(form).forEach((value, name) => {
+        if (typeof value === 'object') {
+            return; // Don't save 'file' type
+        }
+        localStorage.setItem(name, value);
+    });
+}
+
+// Restore form fields from localStorage; fields with no stored value keep
+// their current value.
+function loadFields(form) {
+    for (const name of new FormData(form).keys()) {
+        const stored = localStorage.getItem(name);
+        if (stored == null) {
+            continue;
+        }
+        form.querySelector(`*[name=${name}]`).value = stored;
+    }
+}
+
+// Clear localStorage and reset the form to its defaults, keeping only the
+// current prompt text.
+function clearFields(form) {
+    const keptPrompt = form.prompt.value;
+    localStorage.clear();
+    form.reset();
+    form.prompt.value = keptPrompt;
+}
+
+const BLANK_IMAGE_URL = 'data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg"/>';
+// Post the form as JSON and consume the streamed response, which is treated
+// as newline-separated JSON events ('result', 'step', 'upscaling-started',
+// 'upscaling-done', 'canceled'). The form is disabled while the request runs
+// and re-enabled when the stream ends or the request fails.
+async function generateSubmit(form) {
+    const prompt = document.querySelector("#prompt").value;
+
+    // Convert file data to base64
+    let formData = Object.fromEntries(new FormData(form));
+    formData.initimg = formData.initimg.name !== '' ? await toBase64(formData.initimg) : null;
+
+    let strength = formData.strength;
+    let totalSteps = formData.initimg ? Math.floor(strength * formData.steps) : formData.steps;
+
+    let progressSectionEle = document.querySelector('#progress-section');
+    progressSectionEle.style.display = 'initial';
+    let progressEle = document.querySelector('#progress-bar');
+    progressEle.setAttribute('max', totalSteps);
+    let progressImageEle = document.querySelector('#progress-image');
+    progressImageEle.src = BLANK_IMAGE_URL;
+
+    progressImageEle.style.display = {}.hasOwnProperty.call(formData, 'progress_images') ? 'initial': 'none';
+
+    // Hide the progress section, re-enable the form and restore the prompt text
+    const restoreForm = () => {
+        progressSectionEle.style.display = 'none';
+        form.querySelector('fieldset').removeAttribute('disabled');
+        document.querySelector("#prompt").value = prompt;
+        document.querySelector('progress').setAttribute('value', '0');
+    };
+
+    // Post as JSON, using Fetch streaming to get results
+    fetch(form.action, {
+        method: form.method,
+        body: JSON.stringify(formData),
+    }).then(async (response) => {
+        const reader = response.body.getReader();
+        // One decoder for the whole stream, so a multi-byte character split
+        // across two chunks still decodes correctly.
+        const decoder = new TextDecoder();
+        // Text received so far that does not yet form a complete event.
+        let pending = '';
+
+        let noOutputs = true;
+
+        const handleEvent = (data) => {
+            if (data.event === 'result') {
+                noOutputs = false;
+                document.querySelector("#no-results-message")?.remove();
+                appendOutput(data.url, data.seed, data.config);
+                progressEle.setAttribute('value', 0);
+                progressEle.setAttribute('max', totalSteps);
+            } else if (data.event === 'upscaling-started') {
+                document.getElementById("processing_cnt").textContent=data.processed_file_cnt;
+                document.getElementById("scaling-inprocess-message").style.display = "block";
+            } else if (data.event === 'upscaling-done') {
+                document.getElementById("scaling-inprocess-message").style.display = "none";
+            } else if (data.event === 'step') {
+                progressEle.setAttribute('value', data.step);
+                if (data.url) {
+                    progressImageEle.src = data.url;
+                }
+            } else if (data.event === 'canceled') {
+                // avoid alerting as if this were an error case
+                noOutputs = false;
+            }
+        };
+
+        // Handle every complete line in `pending`. The trailing piece is
+        // handled right away if it already parses; otherwise it is kept for
+        // the next chunk, since an event may be split between two reads.
+        const drain = (isFinal) => {
+            const lines = pending.split('\n');
+            pending = lines.pop();
+            for (const line of lines) {
+                if (line !== '') {
+                    handleEvent(JSON.parse(line));
+                }
+            }
+            if (pending !== '') {
+                let data;
+                try {
+                    data = JSON.parse(pending);
+                } catch (e) {
+                    if (isFinal) {
+                        throw e;
+                    }
+                    return;
+                }
+                pending = '';
+                handleEvent(data);
+            }
+        };
+
+        while (true) {
+            const {value, done} = await reader.read();
+            pending += done ? decoder.decode() : decoder.decode(value, {stream: true});
+            drain(done);
+            if (done) {
+                break;
+            }
+        }
+
+        // Re-enable form, remove no-results-message
+        restoreForm();
+
+        if (noOutputs) {
+            alert("Error occurred while generating.");
+        }
+    }).catch((error) => {
+        // Network failure or malformed event: don't leave the form disabled
+        console.error(error);
+        restoreForm();
+        alert("Error occurred while generating.");
+    });
+
+    // Disable form while generating
+    form.querySelector('fieldset').setAttribute('disabled','');
+    document.querySelector("#prompt").value = `Generating: "${prompt}"`;
+}
+
+// Wire up the page once it has loaded: form submission, saving of changed
+// fields, the reset buttons, the cancel button and the GFPGAN section.
+window.onload = () => {
+    const generateForm = document.querySelector("#generate-form");
+
+    generateForm.addEventListener('submit', (e) => {
+        e.preventDefault();
+        generateSubmit(e.target);
+    });
+    generateForm.addEventListener('change', (e) => {
+        saveFields(e.target.form);
+    });
+
+    document.querySelector("#reset-seed").addEventListener('click', (e) => {
+        document.querySelector("#seed").value = -1;
+        saveFields(e.target.form);
+    });
+    document.querySelector("#reset-all").addEventListener('click', (e) => {
+        clearFields(e.target.form);
+    });
+
+    // Restore values saved by an earlier visit
+    loadFields(generateForm);
+
+    document.querySelector('#cancel-button').addEventListener('click', () => {
+        fetch('/cancel').catch(e => {
+            console.error(e);
+        });
+    });
+
+    // `config` is a global expected from config.js, loaded before this script
+    if (!config.gfpgan_model_exists) {
+        document.querySelector("#gfpgan").style.display = 'none';
+    }
+};
--- a/static/dream_web_server.png
+++ b/static/dream_web_server.png
--- a/static/logo_temp.png
+++ b/static/logo_temp.png
--- a/static/variation_walkthru/000001.3357757885.png
+++ b/static/variation_walkthru/000001.3357757885.png
--- a/static/variation_walkthru/000002.1614299449.png
+++ b/static/variation_walkthru/000002.1614299449.png
--- a/static/variation_walkthru/000002.3647897225.png
+++ b/static/variation_walkthru/000002.3647897225.png
--- a/static/variation_walkthru/000003.1614299449.png
+++ b/static/variation_walkthru/000003.1614299449.png
--- a/static/variation_walkthru/000004.3747154981.png
+++ b/static/variation_walkthru/000004.3747154981.png
				`@@ -0,0 +1 @@`
				`"a photograph of an astronaut riding a horse" -s50 -S42`