Compare commits


200 Commits

Author SHA1 Message Date
Abhishek Varma
a376619f1e [SD] Improve vmfb caching algo and retry mechanism (#1248)
-- This commit gets rid of the all-or-nothing vmfb caching mechanism
   and improves the retry mechanism by providing lower-level granularity
   for compiling each model unit.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Ean Garvey <87458719+monorimet@users.noreply.github.com>
2023-03-31 09:38:14 -07:00
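
For illustration, a minimal sketch of the per-unit compile-and-retry idea described in the commit above, assuming a hypothetical `compile_fn` callback and unit names; this is not the actual SHARK caching code.

```python
import os
import time

def get_vmfb(unit_name, cache_dir, compile_fn, max_attempts=3, delay_s=2.0):
    """Return a cached .vmfb for one model unit, compiling with retries if it is missing."""
    path = os.path.join(cache_dir, f"{unit_name}.vmfb")
    if os.path.exists(path):
        # Reuse this unit's artifact even if other units still need compiling.
        return path
    for attempt in range(1, max_attempts + 1):
        try:
            compile_fn(unit_name, path)  # hypothetical single-unit compile step
            return path
        except RuntimeError as err:
            if attempt == max_attempts:
                raise
            print(f"compiling {unit_name} failed ({err}); retry {attempt}/{max_attempts}")
            time.sleep(delay_s)

# Each SD unit would be cached and retried independently instead of all-or-nothing:
# for unit in ("clip", "unet", "vae"):
#     get_vmfb(unit, "./vmfb_cache", compile_fn=my_compile)
```
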
powderluv
02d52bb626 Add Intel ARC A770 target triple (#1263)
This just enables the plumbing. It generates black images.
2023-03-29 14:49:05 -07:00
Abhishek Varma
3b63645f79 [SD] Fix custom model path for WebUI (#1260)
-- This commit fixes custom model path for WebUI.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>
2023-03-29 09:48:11 -07:00
Ean Garvey
d6f740b998 allow pytest to retry getting model artifacts + disable autotuning for pytorch benchmarks (#1257)
* Adds a few xfails to enable macOS builder

* Convert string batch sizes to ints where needed.

* allow pytest to retry getting model artifacts

* Reduce attempts and add assert msg.
2023-03-28 23:38:45 -05:00
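
A hedged sketch of the retry-on-download behaviour this commit describes; `download_fn`, the attempt count, and the wait time are illustrative assumptions rather than the real shark_downloader interface.

```python
import time

def fetch_model_artifacts(model_name, download_fn, attempts=2, wait_s=5.0):
    """Try to fetch a model's artifacts a limited number of times before failing the test."""
    last_err = None
    for _ in range(attempts):
        try:
            return download_fn(model_name)
        except Exception as err:  # any transport error triggers one more try
            last_err = err
            time.sleep(wait_s)
    # Mirrors "Reduce attempts and add assert msg": fail loudly with context.
    raise AssertionError(
        f"Could not fetch artifacts for {model_name} after {attempts} attempts: {last_err}"
    )
```
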
Daniel Garvey
594c6b8ea2 fix ckpt dir (#1258) 2023-03-28 14:31:01 -07:00
Ean Garvey
96b1560da5 Make batch size configurable via pytest and fix sharktank generation. (#1227)
* Fix sharktank generation and add batch_size pytest option for torch.

* Disable torch dynamo until py3.11 supported

* Compile torchmodel without dynamo if torch.compile fails

* Use release versions of TF/Keras for importer.

* Pin torchvision and remove debug prints.

* Remove duplicates from torch model list.

* Update generate_sharktank.py

* xfail a few models that fail sharktank generation/ numerics
2023-03-28 14:33:39 -05:00
Abhishek Varma
0ef6a0e234 [SD] Fix Stencil scribble crash by updating image resize (#1255)
-- This commit updates the Stencil resize feature to cap the size of
   images within [128, 768] as supported by the SD pipeline.
-- This solves the issue of scribble crashing on larger images.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>
2023-03-28 10:13:11 -07:00
Gaurav Shukla
641d535f44 [SD] Fix device path issue for cpu (#1256)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-03-28 10:09:49 -07:00
Daniel Garvey
5bb7846227 single entry point exe for all cli apps (#1158)
usage:
add --app="img2img" (or "inpaint" "outpaint" "txt2img")
2023-03-28 11:15:21 -05:00
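
A minimal sketch of how a single entry point can dispatch on `--app`; the dispatch table below is an assumption for illustration, not the contents of the actual main.py.

```python
import argparse

def main():
    parser = argparse.ArgumentParser(description="Single entry point for the SD CLI apps")
    parser.add_argument(
        "--app",
        choices=["txt2img", "img2img", "inpaint", "outpaint"],
        default="txt2img",
        help="Which pipeline to run",
    )
    # Unknown flags are passed through to the selected app.
    args, passthrough = parser.parse_known_args()

    dispatch = {
        "txt2img": lambda: print("would run txt2img with", passthrough),
        "img2img": lambda: print("would run img2img with", passthrough),
        "inpaint": lambda: print("would run inpaint with", passthrough),
        "outpaint": lambda: print("would run outpaint with", passthrough),
    }
    dispatch[args.app]()

if __name__ == "__main__":
    main()
```
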
yzhang93
8f84258fb8 Fix check for use_tuned conditions (#1252) 2023-03-27 11:21:25 -07:00
Ean Garvey
7619e76bbd Disable and xfail some models that fail validation/compilation. (#1251)
* Rollback T5 models for torch as the inputs give some issues that aren't trivial to resolve
* xfail efficientnet-b0 on torch+cuda -- see CUDA requesting shared memory size larger than allowed size openxla/iree#12771
2023-03-27 12:42:53 -05:00
Daniel Garvey
9267eadbfa disable openjourney gen for nightly (#1249) 2023-03-27 11:55:34 -05:00
Phaneesh Barwaria
431132b8ee Fix img2img mode switch (#1247)
* add updated scheduler value in global config

* clear scheduler global variable with others
2023-03-27 07:01:22 -07:00
cstueckrath
fb35e13e7a fix Python version detection bug (#1246)
* fix Python version detection bug

* Update setup_venv.ps1
2023-03-27 07:00:40 -07:00
yzhang93
17a67897d1 Add SD v2.1 768x768 tuned model (#1244)
Co-authored-by: powderluv <powderluv@users.noreply.github.com>
2023-03-24 10:39:15 -07:00
Gaurav Shukla
da449b73aa [SD] Disable lora training tab for now (#1241)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-03-24 09:16:24 -07:00
Kyle Herndon
0b0526699a Fix incorrect device argument initialization for LoRA training by extracting the device type and number and formatting it for pytorch (#1237)
Co-authored-by: Kyle Herndon <kyle@nod-labs.com>
2023-03-24 01:10:50 -07:00
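
A rough sketch of the "extract the device type and number" step mentioned in the commit title; the helper name and device-string formats are assumptions, and `vulkan` is shown only to illustrate the parsing (it is not a PyTorch device type).

```python
def to_torch_device(shark_device: str) -> str:
    """Turn a SHARK-style device string into the 'type:index' form PyTorch expects."""
    # Drop a human-readable prefix such as "NVIDIA A100 => cuda://0".
    if "=>" in shark_device:
        shark_device = shark_device.split("=>", 1)[1].strip()
    if "://" in shark_device:
        kind, index = shark_device.split("://", 1)
        return f"{kind}:{index}" if index else kind
    return shark_device

print(to_torch_device("cuda://1"))                   # cuda:1
print(to_torch_device("AMD 7900XTX => vulkan://0"))  # vulkan:0
print(to_torch_device("cpu"))                        # cpu
```
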
Boian Petkantchin
4fac46f7bb In models testing fix paths to be relative to the script dir not cwd (#1128)
authored-by: Boian Petkantchin <boian@nod-labs.com>
2023-03-22 15:26:52 -05:00
Daniel Garvey
49925950f1 fix false positives (#1193) 2023-03-22 15:25:39 -05:00
Thomas
807947c0c8 Remove deprecated cli option iree-hal-cuda-disable-loop-nounroll-wa (#1235) 2023-03-22 12:05:15 -05:00
Abhishek Varma
593428bda4 [SD] Fix for transformers/__init__.py issue in PyInstaller (#1233)
-- This commit fixes the transformers/__init__.py issue in PyInstaller.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>
2023-03-22 08:43:53 -07:00
Abhishek Varma
cede9b4fec [SD] Fix custom_vae as a required parameter in inpaint (#1232) 2023-03-22 04:30:17 -07:00
Prashant Kumar
c2360303f0 Add the int8 quantized model. 2023-03-22 16:28:13 +05:30
jinchen62
420366c1b8 Move schedulers to global obj (#1225) 2023-03-21 22:40:43 -07:00
Ean Garvey
d31bae488c Set iree-input-type to tm_tensor for SD (#1228) 2023-03-21 19:07:31 -07:00
Kyle Herndon
c23fcf3748 Fix incorrect device argument initialization for LoRA training (#1231)
Co-authored-by: Kyle Herndon <kyle@nod-labs.com>
Co-authored-by: powderluv <powderluv@users.noreply.github.com>
2023-03-21 19:07:18 -07:00
jinchen62
7dbbb1726a Fix SD obj not defined if fail to get models from pretrained (#1222) 2023-03-21 07:55:17 -07:00
Abhishek Varma
8b8cc7fd33 [SD] Update LoRA inference to handle various checkpoints (#1215) 2023-03-21 06:52:20 -07:00
Ean Garvey
e3c96a2b9d Move sentencepiece to importer requirements. (#1218) 2023-03-21 00:39:57 -05:00
Ean Garvey
5e3f50647d Set --vulkan_large_heap_block_size default to 2gb. (#1220) 2023-03-20 21:07:09 -07:00
gpetters94
7899e1803a Add fix for attention slicing fp16 (#1217) 2023-03-20 19:11:29 -07:00
mariecwhite
d105246b9c Fix t5 models 2023-03-21 10:39:59 +11:00
mariecwhite
90c958bca2 Add T5-base and T5-large Torch and TF Models (#1116) 2023-03-20 17:32:50 -05:00
mariecwhite
f99903e023 Add EfficientNet B0 and B7 Torch and TF models 2023-03-21 09:22:05 +11:00
mariecwhite
c6f44ef1b3 Add EfficientNet B0 and B7 Torch and TF models 2023-03-21 09:14:45 +11:00
mariecwhite
8dcd4d5aeb Make batch size configurable 2023-03-20 18:03:17 -04:00
Phoenix Meadowlark
d319f4684e Add peak memory reporting for IREE, TF and PyTorch (#1216) 2023-03-20 15:40:49 -05:00
Ean Garvey
54d7b6d83e Generate model artifacts in pytests if they don't exist in the cloud. (#1121)
* Add gen_shark_files fn to shark_downloader for OTF artifact generation

* add generate_sharktank as a tank/ python module.

* Fix some paths in tank generation.
2023-03-20 12:13:19 -05:00
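
A hedged sketch of the fallback described above: prefer artifacts from the cloud tank and generate them on the fly only when that fails. `download_fn` and `generate_fn` stand in for the real downloader and for `gen_shark_files`; their signatures are assumed.

```python
import os

def get_or_generate_artifacts(model_name, local_dir, download_fn, generate_fn):
    """Prefer cached or downloaded artifacts; fall back to on-the-fly generation."""
    target = os.path.join(local_dir, model_name)
    if os.path.isdir(target):
        return target  # already cached locally
    try:
        download_fn(model_name, target)   # e.g. pull from the shark_tank bucket
    except Exception:
        generate_fn(model_name, target)   # e.g. gen_shark_files(...) per the commit
    return target
```
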
m68k-fr
4a622532e5 [Web] Stop images (#1212) 2023-03-19 14:37:30 -07:00
cstueckrath
650b2ada58 add pytorch_lightning to requirements (#1211)
* add pytorch_lightning to requirements

this will additionally add lightning-utilities and torchmetrics

* Update shark_sd.spec

* Update shark_sd_cli.spec
2023-03-19 12:29:54 -07:00
m68k-fr
f87f8949f3 [Web] CSS fix for gradio V3.22.1 (#1210) 2023-03-19 06:13:59 -07:00
m68k-fr
7dc9bf8148 [Web] Move "stop Batch" button to "Advanced Options" toggle (#1209) 2023-03-18 20:54:42 -07:00
Kyle Herndon
ba48ff8d25 Implement LoRA training and UI for training and UI for inference in img2img, inpaint, outpaint (#1200)
txt2img inference UI is already committed.

Co-authored-by: Kyle Herndon <kyle@nod-labs.com>
Co-authored-by: powderluv <powderluv@users.noreply.github.com>
2023-03-17 12:54:56 -07:00
Gaurav Shukla
638840925c [SD] Add support for larger size upscaling (#1204)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-03-17 10:20:48 -07:00
m68k-fr
b661656c03 [Web] Fix custom model path for upscaler (#1199) 2023-03-16 15:57:23 -07:00
Gaurav Shukla
0225434389 [SD] Add sendTo Upscaler
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-03-16 20:49:19 +05:30
Gaurav Shukla
7ffe20b1c2 [SD] Release memory used by upscaler when not in use
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-03-16 20:49:19 +05:30
Gaurav Shukla
d8f0c4655d [SD] Add Upscaler web
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-03-16 20:49:19 +05:30
Gaurav Shukla
7e8d3ec0df [SD] Add upscaler pipeline
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-03-16 20:49:19 +05:30
jinchen62
9c08eec565 Clear memory cache when switching model and mode (#1194) 2023-03-15 22:18:26 -07:00
m68k-fr
2d2c523ac5 [Web] Upgrade Gradio to v3.21.0 (#1188)
Co-authored-by: powderluv <powderluv@users.noreply.github.com>
2023-03-15 10:14:49 -07:00
Abhishek Varma
f17b3128c0 [SD] Add LoRA inference to SD pipeline (#1189)
-- This commit adds LoRA inference to the SD pipeline.
-- It also modifies txt2img to incorporate the new feature.
   img2img, inpaint, outpaint, etc. using UNet can also be extended in a
   similar way.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>
2023-03-15 10:13:45 -07:00
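
For context, a hedged example of one common way LoRA weights are applied to a diffusers UNet at inference time; this shows the general diffusers pattern, not SHARK's compiled-flow integration, and the model id and weight path are placeholders.

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
# Load LoRA attention processors into the UNet (diffusers-format LoRA weights assumed).
pipe.unet.load_attn_procs("path/to/lora_weights")
pipe = pipe.to("cuda")
image = pipe("a photo of a corgi in a spacesuit").images[0]
image.save("lora_sample.png")
```
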
Abhishek Varma
7c7e630099 [SD] Add fix for using latest diffusers + add scribble variant to Stencil (#1191)
* [SD] Add Scribble variant in Stencil

-- This commit adds scribble variant in Stencil.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

* [SD] Use latest diffusers

-- This commit points back to the latest diffusers and updates the
   processing script to tackle the Pix2Pix import issue.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

---------

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>
2023-03-15 10:13:20 -07:00
m68k-fr
2dd1491ec1 [Web] Add clear queue button (#1192) 2023-03-15 10:12:59 -07:00
Daniel Garvey
236357fb61 add missing import for shark_sd.spec (#1190)
2023-03-15 09:23:29 -05:00
Phoenix Meadowlark
7bc38719de Add benchmark artifacts to .gitignore (#1186) 2023-03-14 15:19:06 -07:00
Daniel Garvey
bdbe992769 Add IREE_SAVE_TEMPS for import_debug command (#1184)
Based on hf_model_id. Works on Windows.
2023-03-14 11:40:23 -07:00
Abhishek Varma
e6b925e012 [SD] Add Openpose to Stencil + image size issue fix (#1181)
-- This commit adds openpose model variant to stencil.
-- Fixes image size issue.
-- Also includes fix for the .exe bug introduced by https://github.com/nod-ai/SHARK/pull/1175

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>
2023-03-14 10:30:52 -07:00
cstueckrath
771120b76c workaround Gradio issue (#1183)
https://discord.com/channels/973663919757492264/975522729564446740/1085109774758191164
2023-03-14 01:27:24 -07:00
Boian Petkantchin
a8ce7680db Add flag to augment the device allocator (#1182)
Example:
$ python my_app.py --device_allocator caching debug
This will wrap the device allocator with first caching allocator then
debug allocator.

$ python my_app.py --device_allocator caching
Only wrap with caching allocator.

Co-authored-by: Boian Petkantchin <boian@nod-labs.com>
2023-03-13 15:49:26 -07:00
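
A small sketch of how a flag taking an ordered list of allocator wrappers might be declared; the flag name comes from the commit, but the wiring into the IREE runtime is intentionally omitted here.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--device_allocator",
    nargs="*",
    default=[],
    help="Ordered allocator wrappers to apply, e.g. 'caching debug'",
)
args = parser.parse_args(["--device_allocator", "caching", "debug"])

# The wrappers would then be applied in order around the base device allocator.
print(args.device_allocator)  # ['caching', 'debug']
```
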
Phaneesh Barwaria
b6dcf2401b Stencil perf improvement (#1179)
* remove conditioning strength multiplier

* mod diffusers lib to v0.14.0
2023-03-13 14:37:38 -07:00
Daniel Garvey
62b5a9fd49 generate sharktank for apps dir (#966)
* merge conflict resolution

* add support to other scripts

---------

Co-authored-by: dan <dan@nod-labs.com>
2023-03-13 10:54:15 -07:00
m68k-fr
2f133e9d5c Fix png metadata (#1178) 2023-03-12 22:43:39 -07:00
powderluv
f898a1d332 Update README.md 2023-03-12 16:54:42 -07:00
m68k-fr
b94266d2b9 [Web] Randomize seed to -1 (#1176) 2023-03-12 12:42:31 -07:00
m68k-fr
1b08242aaa [Web] Improve dropdowns ux (#1175) 2023-03-12 12:41:51 -07:00
Abhishek Varma
691030fbab [SD] Improve Stencil feature to handle general image sizes
-- Currently the stencil feature works with 512x512 images only.
-- This commit relaxes this constraint and adds support for various
   image sizes.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-03-11 21:48:31 +05:30
m68k-fr
16ad7d57a3 [WebUi] txt2img_ui: Import png metadata (#1147) 2023-03-10 16:26:34 -08:00
Anush Elangovan
c561ebf43c Drop the torch-mlir pin
Seems to work now with top of master
2023-03-10 15:39:04 -08:00
Prashant Kumar
97fdff7f19 Add instructions how to run the LLaMA model. (#1168)
* Add instructions how to run the LLaMA model.

* Update README.md
2023-03-10 12:36:37 -08:00
Anush Elangovan
ce6d82eab2 Fix bloom lint 2023-03-10 11:53:08 -08:00
Abhishek Varma
b8f4b18951 [SD] Use dynamic stencil HF repo id
-- This commit removes the hardcoded HF ID for Stencil and instead
   utilizes a dynamic instantiation of the HF model.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-03-10 23:31:45 +05:30
Eliasj42
b23d3aa584 added more memory efficient method to run large bloom models with sharded blooms (#1165)
Co-authored-by: Elias Joseph <elias@nod-labs.com>
2023-03-10 09:32:56 -08:00
Vivek Khandelwal
495670d9b6 Fix SD fine tuning script device arg usage 2023-03-10 18:37:53 +05:30
Boian Petkantchin
815e23a0b8 Update iree-compile flags --iree-llvm-xxx -> --iree-llvmcpu-xxx (#1164) 2023-03-09 11:31:50 -08:00
Boian Petkantchin
783538fe11 Move linting opts from github workflow to config files
This helps development: you can be sure that running locally

black .
flake8 .

will do the same as in the GitHub job.
2023-03-09 10:46:30 -08:00
Boian Petkantchin
996c645f6a In SD don't include device path in vmfb filename
Include only the driver name instead.
2023-03-09 10:45:32 -08:00
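
A hedged illustration of the naming change: key the cached vmfb filename on the driver name only, not the full device path. The helper and filename pattern are assumptions.

```python
def vmfb_name(model_name: str, device_uri: str, precision: str) -> str:
    """Build a cache filename keyed by driver, not by the full device path."""
    driver = device_uri.split("://", 1)[0]  # "vulkan://0000-1111-..." -> "vulkan"
    return f"{model_name}_{precision}_{driver}.vmfb"

print(vmfb_name("unet", "vulkan://00000000-1111-2222-3333-444444444444", "fp16"))
# unet_fp16_vulkan.vmfb
```
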
m68k-fr
1f7d249a62 Use utf-8 format for imgs_details.csv 2023-03-09 16:15:58 +05:30
jinchen62
7f6c9a2dc2 Add an inpainting option for only masked area (#1154) 2023-03-07 09:46:05 -08:00
Eliasj42
93891984f3 made sharded bloom example more user friendly (#1153)
Co-authored-by: Elias Joseph <elias@nod-labs.com>
2023-03-06 10:23:48 -08:00
Vivek Khandelwal
cc0ef54e0e Fix Stable diffusion fine tuning script 2023-03-06 17:52:16 +05:30
Daniel Garvey
812152485d temporarily xfail tiny convnext macos (#1142) 2023-03-03 13:30:56 -06:00
Vivek Khandelwal
0816fb403a Add Stable diffusion fine tuning script
This commit adds the sd fine tuning script which runs through the
torchdynamo path.
2023-03-03 21:59:00 +05:30
Gaurav Shukla
4f171772be [SD] Fix SD web flags
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-03-03 21:55:40 +05:30
mariecwhite
a52331d4aa Install IREE pre-releases (#1139) 2023-03-02 23:17:56 -06:00
yzhang93
ad821a1fc8 Use old torch-mlir package to avoid crash on rdna2 (#1137) 2023-03-02 18:16:58 -08:00
Ean Garvey
116b128802 Use nightly shark_tank for test-models (#1133)
* Use nightly shark_tank for test-models

* Update all_models.csv
2023-03-02 12:33:36 -06:00
Gaurav Shukla
b118f183d1 [SD] Fix few things in sendTo feature (#1132) 2023-03-02 09:11:55 -08:00
Gaurav Shukla
911dff16f1 [SD] Add sendTo feature in stable diffusion (#1131)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-03-02 08:42:38 -08:00
Abhishek Varma
de59a66ae4 [SD] Update diffusers to point to the fix for Stencil + add opencv-python (#1130) 2023-03-02 08:19:29 -08:00
Daniel Garvey
23f1468cc6 disable most models on windows pytest (#1125) 2023-03-02 01:37:50 -06:00
jinchen62
080350d311 Make loading custom inpainting models general (#1126) 2023-03-01 22:14:04 -08:00
Phaneesh Barwaria
7f3f92b9d5 remove extra return arg (#1123)
* remove extra return arg

txt2img expects only 3 mlirs

* add venv reqs for stencils
2023-03-01 11:45:24 -08:00
Abhishek Varma
be3cdec290 [SD] Add Stencil feature to SD pipeline (#1111)
* [WIP] Add ControlNet to SD pipeline

-- This commit adds ControlNet to SD pipeline.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

* [SD] Add ControlNet to img2img + fix bug for img2img scheduler

-- This commit adds ControlNet execution to img2img.
-- It restructures the addition of ControlNet variants.
-- It also fixes scheduler selecting bug for img2img pipeline.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

* add shark models for stencilSD

* Add Stencil controlled SD in img2img pipeline (#1106)

* use shark stencil modules

* adjust diffusers change

* modify to use pipeline

* remove control from unet

* pump stencils through unet

* complete integration in img2img

* fix lint and comments

* [SD] Add ControlNet pipeline + integrate with WebUI + add compiled flow execution

-- This commit creates a dedicated SD pipeline for ControlNet.
-- Integrates it with img2img WebUI.
-- Integrates the compiled execution flow for ControlNet.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

* [SD] Stencil execution

* Remove integration setup

* [SD] Fix args.use_stencil overriding bug + vmfb caching issue

-- This commit fixes the args.use_stencil overriding issue which caused
   the img2img pipeline to pick the wrong set of modules.
-- It also fixes a vmfb caching issue to speed up the loading time
   and pick the right set of modules based on a mask.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

---------

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: PhaneeshB <b.phaneesh@gmail.com>
2023-03-01 10:44:40 -08:00
m68k-fr
f09574538c [WebUi] Remove unsupported full_width parameter, Reactivate gallery nav while multiple images are generated 2023-03-01 23:17:12 +05:30
Daniel Garvey
b1113ab551 disable benchmark on windows for pytest (#1100) 2023-02-28 18:10:29 -06:00
powderluv
ef756389e3 Revert "add cv2 and nod diffusers (#1112)" (#1114)
This reverts commit cb17d017df.
2023-02-28 14:31:40 -08:00
Phaneesh Barwaria
cb17d017df add cv2 and nod diffusers (#1112) 2023-03-01 01:33:43 +05:30
Gaurav Shukla
798f231792 [SD] Update metadata info and canvas size (#1109)
* [SD] Save missing metadata in case of img2img and outpaint

Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>

* [SD] Update the canvas size for inpaint/outpaint

Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>

* [SD] Update output gallery on each inference

Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>

---------

Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-28 11:25:30 -08:00
m68k-fr
7136890da3 [Fix] Unsupported width and height argument error 2023-02-28 23:32:58 +05:30
mariecwhite
d567192fd3 Fix call to Torch Inductor 2023-02-28 00:35:57 -08:00
jinchen62
dcc4025c78 Fix loading custom inpainting models (#1103) 2023-02-27 17:06:09 -08:00
yzhang93
c6c8ec36a1 Enable tuned models for inpainting (#1102) 2023-02-27 16:46:57 -08:00
Quinn Dawkins
1344c0659a Add doc on profiling with Shark (#1101)
* Add doc on profiling with Shark

* Rename doc
2023-02-27 11:31:27 -08:00
powderluv
973f6d20f4 Try pre-pix2pix 2023-02-25 00:09:05 -08:00
powderluv
8b5c9c51e7 Revert "Update diffusers (#1094)" (#1096)
This reverts commit 0064cc2a6e.
2023-02-24 19:27:56 -08:00
jinchen62
bae208bcc4 Fix outpainting params (#1089) 2023-02-24 14:41:32 -08:00
Daniel Garvey
b6c14ad468 Make sd tests output performance metrics into csv (#1085)
* make some paths windows friendly (#1066)

* add csv output to builder script

and reduce number of models tested
2023-02-24 16:27:52 -06:00
powderluv
0064cc2a6e Update diffusers (#1094) 2023-02-24 14:09:19 -08:00
Gaurav Shukla
0a0567e944 [SD] Avoid unnecessary temp file creations (#1092)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-24 10:53:34 -08:00
gpetters94
694b1d43a8 Add attention slicing support (#1087) 2023-02-24 02:43:02 -08:00
Ean Garvey
e7eb116bd2 use tf-nightly for importer (#1077) 2023-02-23 23:14:48 -06:00
yzhang93
596499a08c Disable tuned configs on all inpainting models (#1086) 2023-02-23 13:15:22 -08:00
naveen raj
2a2e460df2 Add DEISMultistep scheduler #1076 (#1084)
* Add DEISMultistep scheduler #1076

* line length lint fix
2023-02-23 10:15:05 -08:00
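
A minimal example of instantiating the scheduler this commit adds, using the public diffusers API; the base model id is only an example.

```python
from diffusers import DEISMultistepScheduler, StableDiffusionPipeline

model_id = "runwayml/stable-diffusion-v1-5"  # example base model
scheduler = DEISMultistepScheduler.from_pretrained(model_id, subfolder="scheduler")
pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler)
```
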
jinchen62
a9039b35ed Add outpainting web UI (#1083) 2023-02-23 01:02:25 -08:00
jinchen62
a01154a507 Add SD outpainting (#1072)
python apps/stable_diffusion/scripts/outpaint.py --prompt="Face of a yellow cat, high resolution, sitting on a park bench" --img_path=test_imgs/overture-creations-5sI6fQgYIuo.png --import_mlir --hf_model_id="stabilityai/stable-diffusion-2-inpainting" --pixels=128 --mask_blur=8 --left --right --top --bottom --steps=20
2023-02-22 23:16:05 -08:00
powderluv
1d9204282d Update README.md 2023-02-22 23:12:41 -08:00
Eliasj42
5ff40a0d2d added an example to run sharded bloom (#1079)
added the ability to compile sharded mlir files from huggingface models

Co-authored-by: Elias Joseph <elias@nod-labs.com>
2023-02-22 22:48:58 -08:00
jinchen62
fab6d2e4e0 Resize input image and mask for SD inpainting (#1082) 2023-02-22 22:46:59 -08:00
powderluv
abab59c25f Update nightly.yml 2023-02-22 18:44:43 -08:00
powderluv
c25840b585 Update nightly.yml 2023-02-22 18:34:37 -08:00
powderluv
1b3f9125bb Update nightly.yml 2023-02-22 18:23:44 -08:00
powderluv
b5d9f5ba49 Update nightly.yml 2023-02-22 18:20:31 -08:00
powderluv
1c22aa9c8f Resolve __init__.py issues (#1080)
Also drop torchvision. The test passed, but we can't be sure
it fixes the __init__.py issue yet.
2023-02-22 18:17:00 -08:00
Daniel Garvey
e1d7fb879c make some paths windows friendly (#1066) 2023-02-22 14:44:55 -06:00
powderluv
e912c42bf0 update the openxla links 2023-02-22 12:10:23 -08:00
powderluv
e6841acf36 Publish nightlies as pre-releases
So stable versions can be marked on the Releases page
2023-02-22 12:05:28 -08:00
Gaurav Shukla
bc4459b6f4 [SD] Add inpainting web UI (#1069)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-22 11:01:18 -08:00
cstueckrath
9b544491e0 Update setup_venv.ps1 (#1073)
* Update setup_venv.ps1

fix a bug that occurs when Python is installed but no py.exe is available

* Update setup_venv.ps1
2023-02-22 07:52:59 -08:00
m68k-fr
9c5415b598 [WebUi] css fix for Gradio v3.19.0 (#1059)
Co-authored-by: powderluv <powderluv@users.noreply.github.com>
2023-02-21 23:50:54 -08:00
powderluv
040dbc317f unpin diffuser to latest (#1071)
Currently 0.13.x
2023-02-21 23:47:19 -08:00
powderluv
65775046d8 update IREE pip links 2023-02-21 19:31:23 -08:00
Daniel Garvey
b18bc36127 force creation of workdir (#1070) 2023-02-21 18:10:36 -08:00
cstueckrath
f01c526efd Update setup_venv.ps1 (#1064) 2023-02-21 14:13:04 -05:00
Gaurav Shukla
16168ab6b3 [SD] Update need_vae_encode correctly
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-21 20:26:06 +05:30
Gaurav Shukla
4233218629 [SD] Reset args.img_path to None in txt2img to avoid vae_encode
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-21 18:46:15 +05:30
RaINi_
b63fb36dc0 Use path.join for the winograd config directory (#1065) 2023-02-20 22:04:25 -06:00
Daniel Garvey
4e92304b89 remove annoying accelerate warning (#1056)
disables usage of low_cpu_mem_usage=True in from_pretrained() calls.
Can be re-enabled by using the --low_cpu_mem_usage flag;
it defaults to False to avoid spam since we don't include accelerate in our
requirements.txt
2023-02-20 14:46:26 -06:00
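
A short sketch of the behaviour described above: an opt-in flag passed straight through to `from_pretrained`, defaulting to False so `accelerate` is not required. The flag wiring here is illustrative; only the keyword itself comes from the commit.

```python
import argparse
from diffusers import StableDiffusionPipeline

parser = argparse.ArgumentParser()
parser.add_argument("--low_cpu_mem_usage", action="store_true", default=False)
args = parser.parse_args([])

# low_cpu_mem_usage=True requires `accelerate`; keeping it False avoids the warning spam.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    low_cpu_mem_usage=args.low_cpu_mem_usage,
)
```
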
Ean Garvey
2ae047f1a8 Update importer/benchmark setup for python3.11 (#1043) 2023-02-20 11:29:00 -06:00
Ean Garvey
6d2a485264 Add --benchmark_dispatches option to pytest. (#800)
* Add --benchmark_dispatches option to pytest.

* Update README.md and fix filepath for dispatch benchmarks
2023-02-19 12:16:18 -06:00
Daniel Garvey
4f045db024 disable anythingv3 until issue is resolved (#1053) 2023-02-18 23:47:21 -05:00
yzhang93
5b33597b6d Enable v1.5 to use tuned configs (#1049) 2023-02-18 16:54:26 -05:00
m68k-fr
962470f610 [WebUi] Minor interface cleanup and Ui cosmetics 2023-02-17 22:00:47 +05:30
cstueckrath
ba8c116380 add KDPM2Discrete and a force flag for setup_venv (#1044)
* add KDPM2Discrete and a force flag for setup_venv

* add KDPM2Discrete and a force flag for setup_venv
also made sure that Python 3.11 is used for the venv as 3.10
doesn't work anymore

* add KDPM2Discrete and a force flag for setup_venv
also made sure that Python 3.11 is used for the venv as 3.10
doesn't work anymore
2023-02-17 07:19:56 -05:00
jinchen62
ad7330eae4 Add inpainting test (#1011) 2023-02-16 22:17:10 -06:00
yzhang93
cf126e4839 Use tuned configs on custom models with ckpt_loc (#1038) 2023-02-16 17:06:21 -08:00
powderluv
c96d25c3e2 Delete stable_diffusion_amd.md
All instructions are common now and on the main page.
2023-02-16 14:57:32 -08:00
powderluv
006aa0dae2 Update README.md 2023-02-16 14:54:00 -08:00
Daniel Garvey
5b204bee86 temporarily xfail microsoft resnet50 (#1037)
Co-authored-by: dan <dan@nod-labs.com>
2023-02-16 16:14:51 -06:00
Phaneesh Barwaria
d98b2afbe9 img2img denoise strength (#1040) 2023-02-16 13:40:20 -08:00
Daniel Garvey
681332ef32 fix tests after default flag changes (#1009)
* fix tests after default flag changes

also adds support for import-mlir

* Update setup_venv.ps1

---------
2023-02-16 12:57:50 -06:00
mariecwhite
c3a4fdcbfc Add bert-large-uncased TF model 2023-02-15 21:42:44 -08:00
mariecwhite
aac5de5b02 Add bert-large-uncased Torch model 2023-02-15 21:25:32 -08:00
powderluv
13a255afad Update nightly.yml 2023-02-15 17:11:38 -08:00
powderluv
3bffda52f9 Pin to latest diffusers (#1031) 2023-02-15 14:23:10 -08:00
Daniel Garvey
d4e62ce557 add an import-mlir fallback in case of failure (#1030)
may not cover all cases. will observe

Co-authored-by: dan <dan@nod-labs.com>
2023-02-15 16:15:23 -06:00
yzhang93
9738483b18 [SD] Map v2_1 to v2_1_base until fix (#1029) 2023-02-15 13:44:41 -08:00
Abhishek Varma
143492fe94 [SD] Add support for standalone Vae checkpoints (#1020)
-- This commit adds support for standalone Vae checkpoints.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>
2023-02-15 12:17:32 -08:00
Gaurav Shukla
ecc5c662c4 [SD] Save output images to different loc every day (#1027)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-15 12:16:36 -08:00
yzhang93
d973ba191d Add conditions to force use --import_mlir (#1028) 2023-02-15 10:37:09 -08:00
Gaurav Shukla
0198b183a2 [SD] Img2Img works for limited schedulers.
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-15 23:06:28 +05:30
Gaurav Shukla
0d44a3527b [SD][web] Add strength UI for img2img
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-15 22:47:41 +05:30
Gaurav Shukla
2147b6a397 [SD] Move some common code to utility
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-15 22:47:41 +05:30
Gaurav Shukla
6b5b4ba27b [SD] Add batch count in Image2Image
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-15 22:47:41 +05:30
Gaurav Shukla
67005bf57c [SD] Update iree-vulkan-target-triple after device switch
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-15 22:47:41 +05:30
PhaneeshB
0430c741c6 add strength param 2023-02-15 20:59:03 +05:30
powderluv
1ce02e365d Update README.md 2023-02-15 01:22:28 -08:00
m68k-fr
eae862adc2 Fix lint and path for gradio_tmp_imgs_folder 2023-02-15 14:27:29 +05:30
drumicube
dffa89524a Save gradio tmp images to shark_tmp folder and clean it at launch 2023-02-15 14:27:29 +05:30
yzhang93
2af1102441 [SD] Merge configs of different max lengths from the same variant to one config file (#1019) 2023-02-15 00:25:29 -08:00
powderluv
c4b472842a Update stable_diffusion_amd.md 2023-02-14 19:02:20 -08:00
powderluv
750a7d806f update docs to 3.11 2023-02-14 17:12:09 -08:00
powderluv
bc7333f1e5 Remove forcing LLPC setting (#1018)
also fix logo paths
2023-02-14 17:09:03 -08:00
powderluv
55ae50f991 Update inpaint.py 2023-02-14 14:12:05 -08:00
powderluv
a590c331ef Update img2img.py 2023-02-14 14:11:50 -08:00
powderluv
8c241b06cb Update txt2img.py 2023-02-14 14:11:36 -08:00
powderluv
9c072c8068 Update index.py 2023-02-14 14:11:20 -08:00
powderluv
ebd8b5122a Update stable_diffusion_amd.md 2023-02-14 14:09:34 -08:00
powderluv
055e484a40 Update README.md 2023-02-14 14:06:46 -08:00
powderluv
912c4a1d12 Update shark_sd.spec 2023-02-14 13:21:29 -08:00
Abhishek Varma
c203b65bf1 Fix __file__ AttributeError + Remove --enable_stack_trace (#1015) 2023-02-14 07:55:02 -08:00
powderluv
307f0334ee Drop im2col for VAE since it crashes the driver (#1010)
This is for untuned models.
2023-02-13 19:02:51 -05:00
yzhang93
5167df08b9 [SD] Fix cuda OTF annotation (#1008) 2023-02-13 12:32:50 -08:00
Gaurav Shukla
dd2e482214 [SD] Fix multiple call to device check (#1007)
- Also makes the dark theme default.
- Fix custom_vae parameter in img2img.

Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-13 11:57:52 -08:00
Eliasj42
87fd13d8eb added an example to run sharded bloom (#1003)
Co-authored-by: Elias Joseph <elias@nod-labs.com>
2023-02-13 10:37:47 -08:00
yzhang93
dd423bc6de [SD] Using --compile-to to dump mlir for OTF annotation (#1004)
* [SD] Using --compile-to to dump mlir for preprocessing

* Use python api for dumping process
2023-02-13 09:17:59 -08:00
powderluv
899cb9cc1f Temporarily disable signing of exe 2023-02-12 20:37:42 -08:00
drumicube
0464c7e558 Add support for command arguments to the WebUi (#1000)
Co-authored-by: powderluv <powderluv@users.noreply.github.com>
2023-02-11 19:20:21 -08:00
powderluv
f64e1fb926 Fix dark theme again for exe builds (#1001) 2023-02-11 19:08:17 -08:00
powderluv
ef7d31293d Update tests to 3.11 2023-02-11 15:38:27 -08:00
powderluv
6d54eb68dc update to support 3.11 2023-02-11 15:23:18 -08:00
powderluv
30eb10c990 Update to 3.11 2023-02-11 03:47:14 -08:00
Abhishek Varma
591bbcd058 [SD] Fix vmfb locating bug
-- This commit fixes a bug in vmfb caching due to vae_encoder and also
   involves a minor NFC change in the code.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-02-10 23:33:47 +05:30
Abhishek Varma
99aa77d036 [SD] Add a common way to name vmfbs including custom_vae
-- This commit adds a common way to name vmfbs and adds `custom_vae`
   support to it as well.
-- This was required to have a common place to change vmfb names
   without breaking any feature support, and also to handle the caching
   of vmfbs gracefully.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-02-10 23:33:47 +05:30
Abhishek Varma
9c13f1e635 Add custom vae support using --custom_vae flag
-- This commit adds custom vae support to SD wherein the user can
   point to a model's checkpoint file whose Vae needs to be plugged
   into the main model.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-02-10 23:33:47 +05:30
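
A hedged sketch of swapping a standalone VAE into an SD pipeline with diffusers. The `--custom_vae` flag name comes from the commit; the diffusers-format VAE repo used below is an example, whereas the commit itself also accepts checkpoint files.

```python
from diffusers import AutoencoderKL, StableDiffusionPipeline

base_id = "runwayml/stable-diffusion-v1-5"                               # example base model
custom_vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")  # example standalone VAE
pipe = StableDiffusionPipeline.from_pretrained(base_id, vae=custom_vae)
```
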
Gaurav Shukla
24af983cfb [SD] Fix input image type
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-10 23:27:52 +05:30
Gaurav Shukla
67842a7525 [SD] Fix parameters in img2img
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-10 22:03:33 +05:30
PhaneeshB
3159a6f3e1 add support for img2img 2023-02-10 21:29:02 +05:30
Gaurav Shukla
b2f3c96835 [SD][web] Add Img2Img UI
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-02-10 21:27:31 +05:30
jinchen62
6582475955 Add SD inpainting
python apps/stable_diffusion/scripts/inpaint.py --prompt="prompt" --img_path=path/to/img --mask_path=path/to/mask --import_mlir --max_length=77 --hf_model_id="stabilityai/stable-diffusion-2-inpainting"
2023-02-10 15:33:20 +05:30
106 changed files with 12733 additions and 1453 deletions

.flake8

@@ -0,0 +1,5 @@
[flake8]
count = 1
show-source = 1
select = E9,F63,F7,F82
exclude = lit.cfg.py


@@ -14,7 +14,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.10"]
python-version: ["3.11"]
steps:
- uses: actions/checkout@v2
@@ -44,18 +44,20 @@ jobs:
body: |
Automatic snapshot release of nod.ai SHARK.
draft: true
prerelease: false
prerelease: true
- name: Build Package
shell: powershell
run: |
./setup_venv.ps1
python process_skipfiles.py
pyinstaller .\apps\stable_diffusion\shark_sd.spec
mv ./dist/shark_sd.exe ./dist/shark_sd_${{ env.package_version_ }}.exe
signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_${{ env.package_version_ }}.exe
signtool sign /f c:\g\shark_02152023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_${{ env.package_version_ }}.exe
pyinstaller .\apps\stable_diffusion\shark_sd_cli.spec
python process_skipfiles.py
mv ./dist/shark_sd_cli.exe ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
signtool sign /f c:\g\shark_02152023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
# GHA windows VM OOMs so disable for now
@@ -65,9 +67,9 @@ jobs:
# $env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
# pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
- uses: actions/upload-artifact@v2
with:
path: dist/*
#- uses: actions/upload-artifact@v2
# with:
# path: dist/*
- name: Upload Release Assets
id: upload-release-assets
@@ -77,6 +79,7 @@ jobs:
with:
release_id: ${{ steps.create_release.outputs.id }}
assets_path: ./dist/*
#asset_content_type: application/vnd.microsoft.portable-executable
- name: Publish Release
id: publish_release
@@ -92,7 +95,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.10"]
python-version: ["3.11"]
backend: [IREE, SHARK]
steps:
@@ -131,7 +134,7 @@ jobs:
source iree.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://iree-org.github.io/iree/pip-release-links.html
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://openxla.github.io/iree/pip-release-links.html
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models


@@ -31,7 +31,7 @@ jobs:
matrix:
os: [7950x, icelake, a100, MacStudio, ubuntu-latest]
suite: [cpu,cuda,vulkan]
python-version: ["3.10"]
python-version: ["3.11"]
include:
- os: ubuntu-latest
suite: lint
@@ -99,11 +99,12 @@ jobs:
run: |
# black format check
black --version
black --line-length 79 --check .
black --check .
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude lit.cfg.py
flake8 . --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude lit.cfg.py
flake8 . --isolated --count --exit-zero --max-complexity=10 --max-line-length=127 \
--statistics --exclude lit.cfg.py
- name: Validate Models on CPU
if: matrix.suite == 'cpu'
@@ -111,7 +112,7 @@ jobs:
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank -k cpu
pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k cpu
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv
@@ -119,9 +120,9 @@ jobs:
if: matrix.suite == 'cuda'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
source shark.venv/bin/activate
pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank -k cuda
pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k cuda
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
# Disabled due to black image bug
@@ -136,7 +137,7 @@ jobs:
export DYLD_LIBRARY_PATH=/usr/local/lib/
echo $PATH
pip list | grep -E "torch|iree"
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" -k vulkan --update_tank
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" --tank_url="gs://shark_tank/nightly/" -k vulkan --update_tank
- name: Validate Vulkan Models (a100)
if: matrix.suite == 'vulkan' && matrix.os == 'a100'
@@ -144,19 +145,19 @@ jobs:
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
source shark.venv/bin/activate
pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank -k vulkan
pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k vulkan
python build_tools/stable_diffusion_testing.py --device=vulkan
- name: Validate Vulkan Models (Windows)
if: matrix.suite == 'vulkan' && matrix.os == '7950x'
run: |
./setup_venv.ps1
pytest --benchmark -k vulkan -s
type bench_results.csv
pytest -k vulkan -s
- name: Validate Stable Diffusion Models (Windows)
if: matrix.suite == 'vulkan' && matrix.os == '7950x'
run: |
./setup_venv.ps1
./shark.venv/Scripts/activate
python process_skipfiles.py
pyinstaller .\apps\stable_diffusion\shark_sd.spec
python build_tools/stable_diffusion_testing.py --device=vulkan

.gitignore

@@ -168,6 +168,8 @@ shark_tmp/
*.vmfb
.use-iree
tank/dict_configs.py
*.csv
reproducers/
# ORT related artefacts
cache_models/
@@ -182,3 +184,6 @@ models/
# models folder
apps/stable_diffusion/web/models/
# Stencil annotators.
stencil_annotator/


@@ -1,3 +0,0 @@
[style]
based_on_style = google
column_limit = 80


@@ -10,7 +10,7 @@ High Performance Machine Learning Distribution
<summary>Prerequisites - Drivers </summary>
#### Install your Windows hardware drivers
* [AMD RDNA Users] Download this specific driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mril-iree). Latest drivers may not work.
* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-23-2-1).
* [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work.
* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
@@ -25,18 +25,32 @@ Other users please ensure you have your latest vendor drivers and Vulkan SDK fro
### Quick Start for SHARK Stable Diffusion for Windows 10/11 Users
Install Driver from [Prerequisites](https://github.com/nod-ai/SHARK#install-your-hardware-drivers) above
Install the Driver from [Prerequisites](https://github.com/nod-ai/SHARK#install-your-hardware-drivers) above
Download the latest .exe https://github.com/nod-ai/SHARK/releases.
Download the [stable release](https://github.com/nod-ai/shark/releases/latest)
Double click the .exe and you should have the [UI]( http://localhost:8080/?__theme=dark) in the browser.
Double click the .exe and you should have the [UI](http://localhost:8080/) in the browser.
If you have custom models (ckpt, safetensors) put in a `models/` directory where the .exe is.
If you have custom models put them in a `models/` directory where the .exe is.
Enjoy.
Some known AMD Driver quirks and fixes with cursors are documented [here](https://github.com/nod-ai/SHARK/blob/main/apps/stable_diffusion/stable_diffusion_amd.md ).
<details>
<summary>More installation notes</summary>
* We recommend that you download EXE in a new folder, whenever you download a new EXE version. If you download it in the same folder as a previous install, you must delete the old `*.vmfb` files with `rm *.vmfb`. You can also use `--clear_all` flag once to clean all the old files.
* If you recently updated the driver or this binary (EXE file), we recommend you clear all the local artifacts with `--clear_all`
## Running
* Open a Command Prompt or Powershell terminal, change folder (`cd`) to the .exe folder. Then run the EXE from the command prompt. That way, if an error occurs, you'll be able to cut-and-paste it to ask for help. (if it always works for you without error, you may simply double-click the EXE)
* The first run may take few minutes when the models are downloaded and compiled. Your patience is appreciated. The download could be about 5GB.
* You will likely see a Windows Defender message asking you to give permission to open a web server port. Accept it.
* Open a browser to access the Stable Diffusion web server. By default, the port is 8080, so you can go to http://localhost:8080/.
## Stopping
* Select the command prompt that's running the EXE. Press CTRL-C and wait a moment or close the terminal.
</details>
<details>
<summary>Advanced Installation (Only for developers)</summary>
@@ -54,7 +68,7 @@ cd SHARK
### Windows 10/11 Users
* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)
* Install the latest Python 3.11.x version from [here](https://www.python.org/downloads/windows/)
* Install Git for Windows from [here](https://git-scm.com/download/win)
@@ -100,21 +114,20 @@ source shark.venv/bin/activate
#### Windows 10/11 Users
```powershell
(shark.venv) PS C:\g\shark> python .\apps\stable_diffusion\scripts\txt2img.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
(shark.venv) PS C:\g\shark> python .\apps\stable_diffusion\scripts\main.py --app="txt2img" --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
```
#### Linux / macOS Users
```shell
python3.10 apps/stable_diffusion/scripts/txt2img.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
python3.11 apps/stable_diffusion/scripts/main.py --app=txt2img --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
```
You can replace `vulkan` with `cpu` to run on your CPU or with `cuda` to run on CUDA devices. If you have multiple vulkan devices you can address them with `--device=vulkan://1` etc
</details>
The output on a 7900XTX would like:
The output on a AMD 7900XTX would look something like:
```shell
Stats for run 0:
```shell
Average step time: 47.19188690185547ms/it
Clip Inference time (ms) = 109.531
VAE Inference time (ms): 78.590
@@ -140,7 +153,7 @@ Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any
This step sets up a new VirtualEnv for Python
```shell
python --version #Check you have 3.10 on Linux, macOS or Windows Powershell
python --version #Check you have 3.11 on Linux, macOS or Windows Powershell
python -m venv shark_venv
source shark_venv/bin/activate # Use shark_venv/Scripts/activate on Windows
@@ -154,7 +167,7 @@ python -m pip install --upgrade pip
### Install SHARK
This step pip installs SHARK and related packages on Linux Python 3.7, 3.8, 3.9, 3.10 and macOS Python 3.10
This step pip installs SHARK and related packages on Linux Python 3.8, 3.10 and 3.11 and macOS / Windows Python 3.11
```shell
pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
@@ -189,10 +202,10 @@ python ./minilm_jit.py --device="cpu" #use cuda or vulkan or metal
<details>
<summary>Development, Testing and Benchmarks</summary>
If you want to use Python3.10 and with TF Import tools you can use the environment variables like:
If you want to use Python3.11 and with TF Import tools you can use the environment variables like:
Set `USE_IREE=1` to use upstream IREE
```
# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh
# PYTHON=python3.11 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh
```
### Run any of the hundreds of SHARK tank models via the test framework
@@ -202,14 +215,14 @@ python -m shark.examples.shark_inference.resnet50_script --device="cpu" # Use g
pytest tank/test_models.py -k "MiniLM"
```
### How to use your locally built IREE / Torch-MLIR with SHARK
If you are a *Torch-mlir developer or an IREE developer* and want to test local changes you can uninstall
the provided packages with `pip uninstall torch-mlir` and / or `pip uninstall iree-compiler iree-runtime` and build locally
with Python bindings and set your PYTHONPATH as mentioned [here](https://github.com/iree-org/iree/tree/main/docs/api_docs/python#install-iree-binaries)
for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
for Torch-MLIR.
### How to use your locally built Torch-MLIR with SHARK
How to use your locally built Torch-MLIR with SHARK:
```shell
1.) Run `./setup_venv.sh in SHARK` and activate `shark.venv` virtual env.
2.) Run `pip uninstall torch-mlir`.
@@ -227,9 +240,15 @@ Now the SHARK will use your locally build Torch-MLIR repo.
## Benchmarking Dispatches
To produce benchmarks of individual dispatches, you can add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your command line argument.
To produce benchmarks of individual dispatches, you can add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your pytest command line argument.
If you only want to compile specific dispatches, you can specify them with a space seperated string instead of `"All"`. E.G. `--dispatch_benchmarks="0 1 2 10"`
For example, to generate and run dispatch benchmarks for MiniLM on CUDA:
```
pytest -k "MiniLM and torch and static and cuda" --benchmark_dispatches=All -s --dispatch_benchmarks_dir=./my_dispatch_benchmarks
```
The given command will populate `<dispatch_benchmarks_dir>/<model_name>/` with an `ordered_dispatches.txt` that lists and orders the dispatches and their latencies, as well as folders for each dispatch that contain .mlir, .vmfb, and results of the benchmark for that dispatch.
if you want to instead incorporate this into a python script, you can pass the `dispatch_benchmarks` and `dispatch_benchmarks_dir` commands when initializing `SharkInference`, and the benchmarks will be generated when compiled. E.G:
```
@@ -253,7 +272,7 @@ Output will include:
- A .txt file containing benchmark output
See tank/README.md for instructions on how to run model tests and benchmarks from the SHARK tank.
See tank/README.md for further instructions on how to run model tests and benchmarks from the SHARK tank.
</details>


@@ -1 +1,6 @@
from apps.stable_diffusion.scripts.txt2img import txt2img_inf
from apps.stable_diffusion.scripts.img2img import img2img_inf
from apps.stable_diffusion.scripts.inpaint import inpaint_inf
from apps.stable_diffusion.scripts.outpaint import outpaint_inf
from apps.stable_diffusion.scripts.upscaler import upscaler_inf
from apps.stable_diffusion.scripts.train_lora_word import lora_train


@@ -0,0 +1,391 @@
import sys
import torch
import time
from PIL import Image
import transformers
from apps.stable_diffusion.src import (
args,
Image2ImagePipeline,
StencilPipeline,
get_schedulers,
set_init_device_flags,
utils,
clear_all,
save_output_img,
)
from apps.stable_diffusion.src.utils import get_generation_text_info
# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
init_use_tuned = args.use_tuned
init_import_mlir = args.import_mlir
# For stencil, the input image can be of any size but we need to ensure that
# it conforms with our model contraints :-
# Both width and height should be in the range of [128, 768] and multiple of 8.
# This utility function performs the transformation on the input image while
# also maintaining the aspect ratio before sending it to the stencil pipeline.
def resize_stencil(image: Image.Image):
width, height = image.size
aspect_ratio = width / height
min_size = min(width, height)
if min_size < 128:
n_size = 128
if width == min_size:
width = n_size
height = n_size / aspect_ratio
else:
height = n_size
width = n_size * aspect_ratio
width = int(width)
height = int(height)
n_width = width // 8
n_height = height // 8
n_width *= 8
n_height *= 8
min_size = min(width, height)
if min_size > 768:
n_size = 768
if width == min_size:
height = n_size
width = n_size * aspect_ratio
else:
width = n_size
height = n_size / aspect_ratio
width = int(width)
height = int(height)
n_width = width // 8
n_height = height // 8
n_width *= 8
n_height *= 8
new_image = image.resize((n_width, n_height))
return new_image, n_width, n_height
# Exposed to UI.
def img2img_inf(
prompt: str,
negative_prompt: str,
init_image,
height: int,
width: int,
steps: int,
strength: float,
guidance_scale: float,
seed: int,
batch_count: int,
batch_size: int,
scheduler: str,
custom_model: str,
hf_model_id: str,
precision: str,
device: str,
max_length: int,
use_stencil: str,
save_metadata_to_json: bool,
save_metadata_to_png: bool,
lora_weights: str,
lora_hf_id: str,
):
from apps.stable_diffusion.web.ui.utils import (
get_custom_model_pathfile,
get_custom_vae_or_lora_weights,
Config,
)
import apps.stable_diffusion.web.utils.global_obj as global_obj
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
SD_STATE_CANCEL,
)
args.prompts = [prompt]
args.negative_prompts = [negative_prompt]
args.guidance_scale = guidance_scale
args.seed = seed
args.steps = steps
args.strength = strength
args.scheduler = scheduler
args.img_path = "not none"
if init_image is None:
return None, "An Initial Image is required"
image = init_image.convert("RGB")
# set ckpt_loc and hf_model_id.
args.ckpt_loc = ""
args.hf_model_id = ""
if custom_model == "None":
if not hf_model_id:
return (
None,
"Please provide either custom model or huggingface model ID, both must not be empty",
)
args.hf_model_id = hf_model_id
elif ".ckpt" in custom_model or ".safetensors" in custom_model:
args.ckpt_loc = get_custom_model_pathfile(custom_model)
else:
args.hf_model_id = custom_model
args.use_lora = get_custom_vae_or_lora_weights(
lora_weights, lora_hf_id, "lora"
)
args.save_metadata_to_json = save_metadata_to_json
args.write_metadata_to_png = save_metadata_to_png
use_stencil = None if use_stencil == "None" else use_stencil
args.use_stencil = use_stencil
if use_stencil is not None:
args.scheduler = "DDIM"
args.hf_model_id = "runwayml/stable-diffusion-v1-5"
image, width, height = resize_stencil(image)
elif args.scheduler != "PNDM":
if "Shark" in args.scheduler:
print(
f"SharkEulerDiscrete scheduler not supported. Switching to PNDM scheduler"
)
args.scheduler = "PNDM"
else:
sys.exit(
"Img2Img works best with PNDM scheduler. Other schedulers are not supported yet."
)
cpu_scheduling = not args.scheduler.startswith("Shark")
args.precision = precision
dtype = torch.float32 if precision == "fp32" else torch.half
new_config_obj = Config(
"img2img",
args.hf_model_id,
args.ckpt_loc,
precision,
batch_size,
max_length,
height,
width,
device,
use_lora=args.use_lora,
use_stencil=use_stencil,
)
if (
not global_obj.get_sd_obj()
or global_obj.get_cfg_obj() != new_config_obj
):
global_obj.clear_cache()
global_obj.set_cfg_obj(new_config_obj)
args.batch_count = batch_count
args.batch_size = batch_size
args.max_length = max_length
args.height = height
args.width = width
args.device = device.split("=>", 1)[1].strip()
args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
args.use_tuned = init_use_tuned
args.import_mlir = init_import_mlir
set_init_device_flags()
model_id = (
args.hf_model_id
if args.hf_model_id
else "stabilityai/stable-diffusion-2-1-base"
)
global_obj.set_schedulers(get_schedulers(model_id))
scheduler_obj = global_obj.get_scheduler(args.scheduler)
if use_stencil is not None:
args.use_tuned = False
global_obj.set_sd_obj(
StencilPipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
use_stencil=use_stencil,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
)
)
else:
global_obj.set_sd_obj(
Image2ImagePipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
)
)
global_obj.set_sd_scheduler(args.scheduler)
start_time = time.time()
global_obj.get_sd_obj().log = ""
generated_imgs = []
seeds = []
img_seed = utils.sanitize_seed(seed)
extra_info = {"STRENGTH": strength}
text_output = ""
for current_batch in range(batch_count):
if current_batch > 0:
img_seed = utils.sanitize_seed(-1)
out_imgs = global_obj.get_sd_obj().generate_images(
prompt,
negative_prompt,
image,
batch_size,
height,
width,
steps,
strength,
guidance_scale,
img_seed,
args.max_length,
dtype,
args.use_base_vae,
cpu_scheduling,
use_stencil=use_stencil,
)
seeds.append(img_seed)
total_time = time.time() - start_time
text_output = get_generation_text_info(seeds, device)
text_output += "\n" + global_obj.get_sd_obj().log
text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
if global_obj.get_sd_status() == SD_STATE_CANCEL:
break
else:
save_output_img(out_imgs[0], img_seed, extra_info)
generated_imgs.extend(out_imgs)
yield generated_imgs, text_output
return generated_imgs, text_output
def main():
if args.clear_all:
clear_all()
if args.img_path is None:
print("Flag --img_path is required.")
exit()
image = Image.open(args.img_path).convert("RGB")
# When the models get uploaded, it should be default to False.
args.import_mlir = True
use_stencil = args.use_stencil
if use_stencil:
args.scheduler = "DDIM"
args.hf_model_id = "runwayml/stable-diffusion-v1-5"
image, args.width, args.height = resize_stencil(image)
elif args.scheduler != "PNDM":
if "Shark" in args.scheduler:
print(
f"SharkEulerDiscrete scheduler not supported. Switching to PNDM scheduler"
)
args.scheduler = "PNDM"
else:
sys.exit(
"Img2Img works best with PNDM scheduler. Other schedulers are not supported yet."
)
cpu_scheduling = not args.scheduler.startswith("Shark")
dtype = torch.float32 if args.precision == "fp32" else torch.half
set_init_device_flags()
schedulers = get_schedulers(args.hf_model_id)
scheduler_obj = schedulers[args.scheduler]
seed = utils.sanitize_seed(args.seed)
# Adjust for height and width based on model
if use_stencil:
img2img_obj = StencilPipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
use_stencil=use_stencil,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
)
else:
img2img_obj = Image2ImagePipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
)
start_time = time.time()
generated_imgs = img2img_obj.generate_images(
args.prompts,
args.negative_prompts,
image,
args.batch_size,
args.height,
args.width,
args.steps,
args.strength,
args.guidance_scale,
seed,
args.max_length,
dtype,
args.use_base_vae,
cpu_scheduling,
use_stencil=use_stencil,
)
total_time = time.time() - start_time
text_output = f"prompt={args.prompts}"
text_output += f"\nnegative prompt={args.negative_prompts}"
text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
text_output += f"\nscheduler={args.scheduler}, device={args.device}"
text_output += f"\nsteps={args.steps}, strength={args.strength}, guidance_scale={args.guidance_scale}, seed={seed}, size={args.height}x{args.width}"
text_output += (
f", batch size={args.batch_size}, max_length={args.max_length}"
)
text_output += img2img_obj.log
text_output += f"\nTotal image generation time: {total_time:.4f}sec"
extra_info = {"STRENGTH": args.strength}
save_output_img(generated_imgs[0], seed, extra_info)
print(text_output)
if __name__ == "__main__":
main()


@@ -0,0 +1,280 @@
import torch
import time
from PIL import Image
import transformers
from apps.stable_diffusion.src import (
args,
InpaintPipeline,
get_schedulers,
set_init_device_flags,
utils,
clear_all,
save_output_img,
)
from apps.stable_diffusion.src.utils import get_generation_text_info
# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
init_use_tuned = args.use_tuned
init_import_mlir = args.import_mlir
# Exposed to UI.
def inpaint_inf(
prompt: str,
negative_prompt: str,
image_dict,
height: int,
width: int,
inpaint_full_res: bool,
inpaint_full_res_padding: int,
steps: int,
guidance_scale: float,
seed: int,
batch_count: int,
batch_size: int,
scheduler: str,
custom_model: str,
hf_model_id: str,
precision: str,
device: str,
max_length: int,
save_metadata_to_json: bool,
save_metadata_to_png: bool,
lora_weights: str,
lora_hf_id: str,
):
from apps.stable_diffusion.web.ui.utils import (
get_custom_model_pathfile,
get_custom_vae_or_lora_weights,
Config,
)
import apps.stable_diffusion.web.utils.global_obj as global_obj
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
SD_STATE_CANCEL,
)
args.prompts = [prompt]
args.negative_prompts = [negative_prompt]
args.guidance_scale = guidance_scale
args.steps = steps
args.scheduler = scheduler
args.img_path = "not none"
args.mask_path = "not none"
# set ckpt_loc and hf_model_id.
args.ckpt_loc = ""
args.hf_model_id = ""
if custom_model == "None":
if not hf_model_id:
return (
None,
"Please provide either custom model or huggingface model ID, both must not be empty",
)
args.hf_model_id = hf_model_id
elif ".ckpt" in custom_model or ".safetensors" in custom_model:
args.ckpt_loc = get_custom_model_pathfile(custom_model)
else:
args.hf_model_id = custom_model
args.use_lora = get_custom_vae_or_lora_weights(
lora_weights, lora_hf_id, "lora"
)
args.save_metadata_to_json = save_metadata_to_json
args.write_metadata_to_png = save_metadata_to_png
dtype = torch.float32 if precision == "fp32" else torch.half
cpu_scheduling = not scheduler.startswith("Shark")
new_config_obj = Config(
"inpaint",
args.hf_model_id,
args.ckpt_loc,
precision,
batch_size,
max_length,
height,
width,
device,
use_lora=args.use_lora,
use_stencil=None,
)
if (
not global_obj.get_sd_obj()
or global_obj.get_cfg_obj() != new_config_obj
):
global_obj.clear_cache()
global_obj.set_cfg_obj(new_config_obj)
args.precision = precision
args.batch_count = batch_count
args.batch_size = batch_size
args.max_length = max_length
args.height = height
args.width = width
args.device = device.split("=>", 1)[1].strip()
args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
args.use_tuned = init_use_tuned
args.import_mlir = init_import_mlir
set_init_device_flags()
model_id = (
args.hf_model_id
if args.hf_model_id
else "stabilityai/stable-diffusion-2-inpainting"
)
global_obj.set_schedulers(get_schedulers(model_id))
scheduler_obj = global_obj.get_scheduler(scheduler)
global_obj.set_sd_obj(
InpaintPipeline.from_pretrained(
scheduler=scheduler_obj,
import_mlir=args.import_mlir,
model_id=args.hf_model_id,
ckpt_loc=args.ckpt_loc,
custom_vae=args.custom_vae,
precision=args.precision,
max_length=args.max_length,
batch_size=args.batch_size,
height=args.height,
width=args.width,
use_base_vae=args.use_base_vae,
use_tuned=args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
)
)
global_obj.set_sd_scheduler(scheduler)
start_time = time.time()
global_obj.get_sd_obj().log = ""
generated_imgs = []
seeds = []
img_seed = utils.sanitize_seed(seed)
image = image_dict["image"]
mask_image = image_dict["mask"]
text_output = ""
for i in range(batch_count):
if i > 0:
img_seed = utils.sanitize_seed(-1)
out_imgs = global_obj.get_sd_obj().generate_images(
prompt,
negative_prompt,
image,
mask_image,
batch_size,
height,
width,
inpaint_full_res,
inpaint_full_res_padding,
steps,
guidance_scale,
img_seed,
args.max_length,
dtype,
args.use_base_vae,
cpu_scheduling,
)
seeds.append(img_seed)
total_time = time.time() - start_time
text_output = get_generation_text_info(seeds, device)
text_output += "\n" + global_obj.get_sd_obj().log
text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
if global_obj.get_sd_status() == SD_STATE_CANCEL:
break
else:
save_output_img(out_imgs[0], img_seed)
generated_imgs.extend(out_imgs)
yield generated_imgs, text_output
return generated_imgs, text_output
def main():
if args.clear_all:
clear_all()
if args.img_path is None:
print("Flag --img_path is required.")
exit()
if args.mask_path is None:
print("Flag --mask_path is required.")
exit()
dtype = torch.float32 if args.precision == "fp32" else torch.half
cpu_scheduling = not args.scheduler.startswith("Shark")
set_init_device_flags()
model_id = (
args.hf_model_id
if "inpaint" in args.hf_model_id
else "stabilityai/stable-diffusion-2-inpainting"
)
schedulers = get_schedulers(model_id)
scheduler_obj = schedulers[args.scheduler]
seed = args.seed
image = Image.open(args.img_path)
mask_image = Image.open(args.mask_path)
inpaint_obj = InpaintPipeline.from_pretrained(
scheduler=scheduler_obj,
import_mlir=args.import_mlir,
model_id=args.hf_model_id,
ckpt_loc=args.ckpt_loc,
custom_vae=args.custom_vae,
precision=args.precision,
max_length=args.max_length,
batch_size=args.batch_size,
height=args.height,
width=args.width,
use_base_vae=args.use_base_vae,
use_tuned=args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
)
for current_batch in range(args.batch_count):
if current_batch > 0:
seed = -1
seed = utils.sanitize_seed(seed)
start_time = time.time()
generated_imgs = inpaint_obj.generate_images(
args.prompts,
args.negative_prompts,
image,
mask_image,
args.batch_size,
args.height,
args.width,
args.inpaint_full_res,
args.inpaint_full_res_padding,
args.steps,
args.guidance_scale,
seed,
args.max_length,
dtype,
args.use_base_vae,
cpu_scheduling,
)
total_time = time.time() - start_time
text_output = f"prompt={args.prompts}"
text_output += f"\nnegative prompt={args.negative_prompts}"
text_output += (
f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
)
text_output += f"\nscheduler={args.scheduler}, device={args.device}"
text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={seed}, size={args.height}x{args.width}"
text_output += (
f", batch size={args.batch_size}, max_length={args.max_length}"
)
text_output += inpaint_obj.log
text_output += f"\nTotal image generation time: {total_time:.4f}sec"
save_output_img(generated_imgs[0], seed)
print(text_output)
if __name__ == "__main__":
main()
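
`inpaint_inf` above only rebuilds the pipeline when the incoming configuration differs from the cached one. A minimal sketch of that compare-and-rebuild pattern follows; the field names mirror the `Config(...)` call in the function, but the standalone dataclass and cache here are illustrative and are not the repo's actual `Config` or `global_obj` implementations.

# Minimal sketch of the config-compare-and-rebuild caching used by inpaint_inf.
# Field names follow the Config(...) call above; the classes are hypothetical.
from dataclasses import dataclass
from typing import Optional

@dataclass(frozen=True)
class PipelineConfig:
    mode: str            # e.g. "inpaint", "txt2img", "outpaint", "upscaler"
    model_id: str
    ckpt_loc: str
    precision: str
    batch_size: int
    max_length: int
    height: int
    width: int
    device: str
    use_lora: str = ""
    use_stencil: Optional[str] = None

_cached_cfg = None
_cached_pipeline = None

def get_pipeline(cfg: PipelineConfig, build):
    """Rebuild the pipeline only when the configuration changed."""
    global _cached_cfg, _cached_pipeline
    if _cached_pipeline is None or _cached_cfg != cfg:
        _cached_pipeline = build(cfg)   # the expensive load/compile step
        _cached_cfg = cfg
    return _cached_pipeline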


@@ -0,0 +1,19 @@
from apps.stable_diffusion.src import args
from apps.stable_diffusion.scripts import (
img2img,
txt2img,
# inpaint,
# outpaint,
)
if __name__ == "__main__":
if args.app == "txt2img":
txt2img.main()
elif args.app == "img2img":
img2img.main()
# elif args.app == "inpaint":
# inpaint.main()
# elif args.app == "outpaint":
# outpaint.main()
else:
print(f"args.app value is {args.app} but this isn't supported")


@@ -0,0 +1,305 @@
import torch
import time
from PIL import Image
import transformers
from apps.stable_diffusion.src import (
args,
OutpaintPipeline,
get_schedulers,
set_init_device_flags,
utils,
clear_all,
save_output_img,
)
from apps.stable_diffusion.src.utils import get_generation_text_info
# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
init_use_tuned = args.use_tuned
init_import_mlir = args.import_mlir
# Exposed to UI.
def outpaint_inf(
prompt: str,
negative_prompt: str,
init_image,
pixels: int,
mask_blur: int,
directions: list,
noise_q: float,
color_variation: float,
height: int,
width: int,
steps: int,
guidance_scale: float,
seed: int,
batch_count: int,
batch_size: int,
scheduler: str,
custom_model: str,
hf_model_id: str,
precision: str,
device: str,
max_length: int,
save_metadata_to_json: bool,
save_metadata_to_png: bool,
lora_weights: str,
lora_hf_id: str,
):
from apps.stable_diffusion.web.ui.utils import (
get_custom_model_pathfile,
get_custom_vae_or_lora_weights,
Config,
)
import apps.stable_diffusion.web.utils.global_obj as global_obj
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
SD_STATE_CANCEL,
)
args.prompts = [prompt]
args.negative_prompts = [negative_prompt]
args.guidance_scale = guidance_scale
args.steps = steps
args.scheduler = scheduler
args.img_path = "not none"
# set ckpt_loc and hf_model_id.
args.ckpt_loc = ""
args.hf_model_id = ""
if custom_model == "None":
if not hf_model_id:
return (
None,
"Please provide either custom model or huggingface model ID, both must not be empty",
)
args.hf_model_id = hf_model_id
elif ".ckpt" in custom_model or ".safetensors" in custom_model:
args.ckpt_loc = get_custom_model_pathfile(custom_model)
else:
args.hf_model_id = custom_model
args.use_lora = get_custom_vae_or_lora_weights(
lora_weights, lora_hf_id, "lora"
)
args.save_metadata_to_json = save_metadata_to_json
args.write_metadata_to_png = save_metadata_to_png
dtype = torch.float32 if precision == "fp32" else torch.half
cpu_scheduling = not scheduler.startswith("Shark")
new_config_obj = Config(
"outpaint",
args.hf_model_id,
args.ckpt_loc,
precision,
batch_size,
max_length,
height,
width,
device,
use_lora=args.use_lora,
use_stencil=None,
)
if (
not global_obj.get_sd_obj()
or global_obj.get_cfg_obj() != new_config_obj
):
global_obj.clear_cache()
global_obj.set_cfg_obj(new_config_obj)
args.precision = precision
args.batch_count = batch_count
args.batch_size = batch_size
args.max_length = max_length
args.height = height
args.width = width
args.device = device.split("=>", 1)[1].strip()
args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
args.use_tuned = init_use_tuned
args.import_mlir = init_import_mlir
set_init_device_flags()
model_id = (
args.hf_model_id
if args.hf_model_id
else "stabilityai/stable-diffusion-2-inpainting"
)
global_obj.set_schedulers(get_schedulers(model_id))
scheduler_obj = global_obj.get_scheduler(scheduler)
global_obj.set_sd_obj(
OutpaintPipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
use_lora=args.use_lora,
)
)
global_obj.set_sd_scheduler(scheduler)
start_time = time.time()
global_obj.get_sd_obj().log = ""
generated_imgs = []
seeds = []
img_seed = utils.sanitize_seed(seed)
left = True if "left" in directions else False
right = True if "right" in directions else False
top = True if "up" in directions else False
bottom = True if "down" in directions else False
text_output = ""
for i in range(batch_count):
if i > 0:
img_seed = utils.sanitize_seed(-1)
out_imgs = global_obj.get_sd_obj().generate_images(
prompt,
negative_prompt,
init_image,
pixels,
mask_blur,
left,
right,
top,
bottom,
noise_q,
color_variation,
batch_size,
height,
width,
steps,
guidance_scale,
img_seed,
args.max_length,
dtype,
args.use_base_vae,
cpu_scheduling,
)
seeds.append(img_seed)
total_time = time.time() - start_time
text_output = get_generation_text_info(seeds, device)
text_output += "\n" + global_obj.get_sd_obj().log
text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
if global_obj.get_sd_status() == SD_STATE_CANCEL:
break
else:
save_output_img(out_imgs[0], img_seed)
generated_imgs.extend(out_imgs)
yield generated_imgs, text_output
return generated_imgs, text_output
def main():
if args.clear_all:
clear_all()
if args.img_path is None:
print("Flag --img_path is required.")
exit()
dtype = torch.float32 if args.precision == "fp32" else torch.half
cpu_scheduling = not args.scheduler.startswith("Shark")
set_init_device_flags()
model_id = (
args.hf_model_id
if "inpaint" in args.hf_model_id
else "stabilityai/stable-diffusion-2-inpainting"
)
schedulers = get_schedulers(model_id)
scheduler_obj = schedulers[args.scheduler]
seed = args.seed
image = Image.open(args.img_path)
outpaint_obj = OutpaintPipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
use_lora=args.use_lora,
)
for current_batch in range(args.batch_count):
if current_batch > 0:
seed = -1
seed = utils.sanitize_seed(seed)
start_time = time.time()
generated_imgs = outpaint_obj.generate_images(
args.prompts,
args.negative_prompts,
image,
args.pixels,
args.mask_blur,
args.left,
args.right,
args.top,
args.bottom,
args.noise_q,
args.color_variation,
args.batch_size,
args.height,
args.width,
args.steps,
args.guidance_scale,
seed,
args.max_length,
dtype,
args.use_base_vae,
cpu_scheduling,
)
total_time = time.time() - start_time
text_output = f"prompt={args.prompts}"
text_output += f"\nnegative prompt={args.negative_prompts}"
text_output += (
f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
)
text_output += f"\nscheduler={args.scheduler}, device={args.device}"
text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={seed}, size={args.height}x{args.width}"
text_output += (
f", batch size={args.batch_size}, max_length={args.max_length}"
)
text_output += outpaint_obj.log
text_output += f"\nTotal image generation time: {total_time:.4f}sec"
# save this information as metadata of output generated image.
directions = []
if args.left:
directions.append("left")
if args.right:
directions.append("right")
if args.top:
directions.append("up")
if args.bottom:
directions.append("down")
extra_info = {
"PIXELS": args.pixels,
"MASK_BLUR": args.mask_blur,
"DIRECTIONS": directions,
"NOISE_Q": args.noise_q,
"COLOR_VARIATION": args.color_variation,
}
save_output_img(generated_imgs[0], seed, extra_info)
print(text_output)
if __name__ == "__main__":
main()
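
Both `outpaint_inf` and `main` above translate between a list of direction strings and four booleans, once when reading the UI input and once when writing image metadata. The round trip is sketched below as a pair of illustrative helpers (not part of the repo); the "up"/"down" naming matches the UI strings and the metadata written by `main()`.

# Illustrative helpers for the directions <-> booleans round trip used above.
def directions_to_flags(directions):
    return (
        "left" in directions,
        "right" in directions,
        "up" in directions,
        "down" in directions,
    )

def flags_to_directions(left, right, top, bottom):
    names = []
    if left:
        names.append("left")
    if right:
        names.append("right")
    if top:
        names.append("up")
    if bottom:
        names.append("down")
    return names

# Example: flags_to_directions(*directions_to_flags(["left", "down"]))
# returns ["left", "down"].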


@@ -0,0 +1,674 @@
# Install the required libs
# pip install -U git+https://github.com/huggingface/diffusers.git
# pip install accelerate transformers ftfy
# HuggingFace Token
# YOUR_TOKEN = "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"
# Import required libraries
import itertools
import math
import os
from typing import List
import random
import torch_mlir
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.utils.data import Dataset
import PIL
import logging
from diffusers import (
AutoencoderKL,
DDPMScheduler,
PNDMScheduler,
StableDiffusionPipeline,
UNet2DConditionModel,
)
from PIL import Image
from tqdm.auto import tqdm
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from diffusers.loaders import AttnProcsLayers
from diffusers.models.cross_attention import LoRACrossAttnProcessor
import torch_mlir
from torch_mlir.dynamo import make_simple_dynamo_backend
import torch._dynamo as dynamo
from torch.fx.experimental.proxy_tensor import make_fx
from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
from shark.shark_inference import SharkInference
torch._dynamo.config.verbose = True
from diffusers import (
AutoencoderKL,
DDPMScheduler,
PNDMScheduler,
StableDiffusionPipeline,
UNet2DConditionModel,
)
from diffusers.optimization import get_scheduler
from diffusers.pipelines.stable_diffusion import (
StableDiffusionSafetyChecker,
)
from PIL import Image
from tqdm.auto import tqdm
from transformers import (
CLIPFeatureExtractor,
CLIPTextModel,
CLIPTokenizer,
)
from io import BytesIO
from dataclasses import dataclass
from apps.stable_diffusion.src import (
args,
get_schedulers,
set_init_device_flags,
clear_all,
)
# Setup the dataset
class LoraDataset(Dataset):
def __init__(
self,
data_root,
tokenizer,
size=512,
repeats=100,
interpolation="bicubic",
set="train",
prompt="myloraprompt",
center_crop=False,
):
self.data_root = data_root
self.tokenizer = tokenizer
self.size = size
self.center_crop = center_crop
self.prompt = prompt
self.image_paths = [
os.path.join(self.data_root, file_path)
for file_path in os.listdir(self.data_root)
]
self.num_images = len(self.image_paths)
self._length = self.num_images
if set == "train":
self._length = self.num_images * repeats
self.interpolation = {
"linear": PIL.Image.LINEAR,
"bilinear": PIL.Image.BILINEAR,
"bicubic": PIL.Image.BICUBIC,
"lanczos": PIL.Image.LANCZOS,
}[interpolation]
def __len__(self):
return self._length
def __getitem__(self, i):
example = {}
image = Image.open(self.image_paths[i % self.num_images])
if not image.mode == "RGB":
image = image.convert("RGB")
example["input_ids"] = self.tokenizer(
self.prompt,
padding="max_length",
truncation=True,
max_length=self.tokenizer.model_max_length,
return_tensors="pt",
).input_ids[0]
# default to score-sde preprocessing
img = np.array(image).astype(np.uint8)
if self.center_crop:
crop = min(img.shape[0], img.shape[1])
(
h,
w,
) = (
img.shape[0],
img.shape[1],
)
img = img[
(h - crop) // 2 : (h + crop) // 2,
(w - crop) // 2 : (w + crop) // 2,
]
image = Image.fromarray(img)
image = image.resize(
(self.size, self.size), resample=self.interpolation
)
image = np.array(image).astype(np.uint8)
image = (image / 127.5 - 1.0).astype(np.float32)
example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
return example
########## Setting up the model ##########
def lora_train(
prompt: str,
height: int,
width: int,
steps: int,
guidance_scale: float,
seed: int,
batch_count: int,
batch_size: int,
scheduler: str,
custom_model: str,
hf_model_id: str,
precision: str,
device: str,
max_length: int,
training_images_dir: str,
lora_save_dir: str,
):
from apps.stable_diffusion.web.ui.utils import (
get_custom_model_pathfile,
Config,
)
import apps.stable_diffusion.web.utils.global_obj as global_obj
print(
"Note LoRA training is not compatible with the latest torch-mlir branch"
)
print(
"To run LoRA training you'll need this to follow this guide for the torch-mlir branch: https://github.com/nod-ai/SHARK/tree/main/shark/examples/shark_training/stable_diffusion"
)
torch.manual_seed(seed)
args.prompts = [prompt]
args.steps = steps
# set ckpt_loc and hf_model_id.
types = (
".ckpt",
".safetensors",
) # the tuple of file types
args.ckpt_loc = ""
args.hf_model_id = ""
if custom_model == "None":
if not hf_model_id:
return (
None,
"Please provide either custom model or huggingface model ID, both must not be empty",
)
args.hf_model_id = hf_model_id
elif ".ckpt" in custom_model or ".safetensors" in custom_model:
args.ckpt_loc = custom_model
else:
args.hf_model_id = custom_model
args.training_images_dir = training_images_dir
args.lora_save_dir = lora_save_dir
args.precision = precision
args.batch_size = batch_size
args.max_length = max_length
args.height = height
args.width = width
device_str = device.split("=>", 1)[1].strip().split("://")
if len(device_str) > 1:
device_str = device_str[0] + ":" + device_str[1]
else:
device_str = device_str[0]
args.device = device_str
# Load the Stable Diffusion model
text_encoder = CLIPTextModel.from_pretrained(
args.hf_model_id, subfolder="text_encoder"
)
vae = AutoencoderKL.from_pretrained(args.hf_model_id, subfolder="vae")
unet = UNet2DConditionModel.from_pretrained(
args.hf_model_id, subfolder="unet"
)
def freeze_params(params):
for param in params:
param.requires_grad = False
# Freeze everything but LoRA
freeze_params(vae.parameters())
freeze_params(unet.parameters())
freeze_params(text_encoder.parameters())
# Move vae and unet to device
vae.to(args.device)
unet.to(args.device)
text_encoder.to(args.device)
lora_attn_procs = {}
for name in unet.attn_processors.keys():
cross_attention_dim = (
None
if name.endswith("attn1.processor")
else unet.config.cross_attention_dim
)
if name.startswith("mid_block"):
hidden_size = unet.config.block_out_channels[-1]
elif name.startswith("up_blocks"):
block_id = int(name[len("up_blocks.")])
hidden_size = list(reversed(unet.config.block_out_channels))[
block_id
]
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = unet.config.block_out_channels[block_id]
lora_attn_procs[name] = LoRACrossAttnProcessor(
hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
)
unet.set_attn_processor(lora_attn_procs)
lora_layers = AttnProcsLayers(unet.attn_processors)
class VaeModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.vae = vae
def forward(self, input):
x = self.vae.encode(input, return_dict=False)[0]
return x
class UnetModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.unet = unet
def forward(self, x, y, z):
return self.unet.forward(x, y, z, return_dict=False)[0]
shark_vae = VaeModel()
shark_unet = UnetModel()
####### Creating our training data ########
tokenizer = CLIPTokenizer.from_pretrained(
args.hf_model_id,
subfolder="tokenizer",
)
# Let's create the Dataset and Dataloader
train_dataset = LoraDataset(
data_root=args.training_images_dir,
tokenizer=tokenizer,
size=vae.sample_size,
prompt=args.prompts[0],
repeats=100,
center_crop=False,
set="train",
)
def create_dataloader(train_batch_size=1):
return torch.utils.data.DataLoader(
train_dataset, batch_size=train_batch_size, shuffle=True
)
# Create noise_scheduler for training
noise_scheduler = DDPMScheduler.from_config(
args.hf_model_id, subfolder="scheduler"
)
######## Training ###########
# Define hyperparameters for our training. If you are not happy with your results,
# you can tune the `learning_rate` and the `max_train_steps`
# Setting up all training args
hyperparameters = {
"learning_rate": 5e-04,
"scale_lr": True,
"max_train_steps": steps,
"train_batch_size": batch_size,
"gradient_accumulation_steps": 1,
"gradient_checkpointing": True,
"mixed_precision": "fp16",
"seed": 42,
"output_dir": "sd-concept-output",
}
# creating output directory
cwd = os.getcwd()
out_dir = os.path.join(cwd, hyperparameters["output_dir"])
while not os.path.exists(str(out_dir)):
try:
os.mkdir(out_dir)
except OSError as error:
print("Output directory not created")
###### Torch-MLIR Compilation ######
def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
removed_indexes = []
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, (list, tuple)):
node_arg = list(node_arg)
node_args_len = len(node_arg)
for i in range(node_args_len):
curr_index = node_args_len - (i + 1)
if node_arg[curr_index] is None:
removed_indexes.append(curr_index)
node_arg.pop(curr_index)
node.args = (tuple(node_arg),)
break
if len(removed_indexes) > 0:
fx_g.graph.lint()
fx_g.graph.eliminate_dead_code()
fx_g.recompile()
removed_indexes.sort()
return removed_indexes
def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
"""
Replace tuple with tuple element in functions that return one-element tuples.
Returns true if an unwrapping took place, and false otherwise.
"""
unwrapped_tuple = False
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
if len(node_arg) == 1:
node.args = (node_arg[0],)
unwrapped_tuple = True
break
if unwrapped_tuple:
fx_g.graph.lint()
fx_g.recompile()
return unwrapped_tuple
def _returns_nothing(fx_g: torch.fx.GraphModule) -> bool:
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
return len(node_arg) == 0
return False
def transform_fx(fx_g):
for node in fx_g.graph.nodes:
if node.op == "call_function":
if node.target in [
torch.ops.aten.empty,
]:
# aten.empty should be filled with zeros.
if node.target in [torch.ops.aten.empty]:
with fx_g.graph.inserting_after(node):
new_node = fx_g.graph.call_function(
torch.ops.aten.zero_,
args=(node,),
)
node.append(new_node)
node.replace_all_uses_with(new_node)
new_node.args = (node,)
fx_g.graph.lint()
@make_simple_dynamo_backend
def refbackend_torchdynamo_backend(
fx_graph: torch.fx.GraphModule, example_inputs: List[torch.Tensor]
):
# handle uses of empty tensors created without initialization (see transform_fx)
transform_fx(fx_graph)
fx_graph.recompile()
if _returns_nothing(fx_graph):
return fx_graph
removed_none_indexes = _remove_nones(fx_graph)
was_unwrapped = _unwrap_single_tuple_return(fx_graph)
mlir_module = torch_mlir.compile(
fx_graph, example_inputs, output_type="linalg-on-tensors"
)
bytecode_stream = BytesIO()
mlir_module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
shark_module = SharkInference(
mlir_module=bytecode, device=args.device, mlir_dialect="tm_tensor"
)
shark_module.compile()
def compiled_callable(*inputs):
inputs = [x.numpy() for x in inputs]
result = shark_module("forward", inputs)
if was_unwrapped:
result = [
result,
]
if not isinstance(result, list):
result = torch.from_numpy(result)
else:
result = tuple(torch.from_numpy(x) for x in result)
result = list(result)
for removed_index in removed_none_indexes:
result.insert(removed_index, None)
result = tuple(result)
return result
return compiled_callable
def predictions(torch_func, jit_func, batchA, batchB):
res = jit_func(batchA.numpy(), batchB.numpy())
if res is not None:
# prediction = torch.from_numpy(res)
prediction = res
else:
prediction = None
return prediction
logger = logging.getLogger(__name__)
train_batch_size = hyperparameters["train_batch_size"]
gradient_accumulation_steps = hyperparameters[
"gradient_accumulation_steps"
]
learning_rate = hyperparameters["learning_rate"]
if hyperparameters["scale_lr"]:
learning_rate = (
learning_rate
* gradient_accumulation_steps
* train_batch_size
# * accelerator.num_processes
)
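# Worked example: with the hyperparameters above (learning_rate = 5e-4,
# gradient_accumulation_steps = 1, train_batch_size = batch_size), a batch
# size of 1 leaves the scaled rate at 5e-4, while a batch size of 4 would
# scale it to 2e-3.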
# Initialize the optimizer
optimizer = torch.optim.AdamW(
lora_layers.parameters(), # only optimize the embeddings
lr=learning_rate,
)
# Training function
def train_func(batch_pixel_values, batch_input_ids):
# Convert images to latent space
latents = shark_vae(batch_pixel_values).sample().detach()
latents = latents * 0.18215
# Sample noise that we'll add to the latents
noise = torch.randn_like(latents)
bsz = latents.shape[0]
# Sample a random timestep for each image
timesteps = torch.randint(
0,
noise_scheduler.num_train_timesteps,
(bsz,),
device=latents.device,
).long()
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
# Get the text embedding for conditioning
encoder_hidden_states = text_encoder(batch_input_ids)[0]
# Predict the noise residual
noise_pred = shark_unet(
noisy_latents,
timesteps,
encoder_hidden_states,
)
# Get the target for loss depending on the prediction type
if noise_scheduler.config.prediction_type == "epsilon":
target = noise
elif noise_scheduler.config.prediction_type == "v_prediction":
target = noise_scheduler.get_velocity(latents, noise, timesteps)
else:
raise ValueError(
f"Unknown prediction type {noise_scheduler.config.prediction_type}"
)
loss = (
F.mse_loss(noise_pred, target, reduction="none")
.mean([1, 2, 3])
.mean()
)
loss.backward()
optimizer.step()
optimizer.zero_grad()
return loss
def training_function():
max_train_steps = hyperparameters["max_train_steps"]
output_dir = hyperparameters["output_dir"]
gradient_checkpointing = hyperparameters["gradient_checkpointing"]
train_dataloader = create_dataloader(train_batch_size)
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
num_update_steps_per_epoch = math.ceil(
len(train_dataloader) / gradient_accumulation_steps
)
num_train_epochs = math.ceil(
max_train_steps / num_update_steps_per_epoch
)
# Train!
total_batch_size = (
train_batch_size
* gradient_accumulation_steps
# train_batch_size * accelerator.num_processes * gradient_accumulation_steps
)
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(
f" Instantaneous batch size per device = {train_batch_size}"
)
logger.info(
f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
)
logger.info(
f" Gradient Accumulation steps = {gradient_accumulation_steps}"
)
logger.info(f" Total optimization steps = {max_train_steps}")
# Only show the progress bar once on each machine.
progress_bar = tqdm(
# range(max_train_steps), disable=not accelerator.is_local_main_process
range(max_train_steps)
)
progress_bar.set_description("Steps")
global_step = 0
params__ = [
i for i in text_encoder.get_input_embeddings().parameters()
]
for epoch in range(num_train_epochs):
unet.train()
for step, batch in enumerate(train_dataloader):
dynamo_callable = dynamo.optimize(
refbackend_torchdynamo_backend
)(train_func)
lam_func = lambda x, y: dynamo_callable(
torch.from_numpy(x), torch.from_numpy(y)
)
loss = predictions(
train_func,
lam_func,
batch["pixel_values"],
batch["input_ids"],
)
# Checks if the accelerator has performed an optimization step behind the scenes
progress_bar.update(1)
global_step += 1
logs = {"loss": loss.detach().item()}
progress_bar.set_postfix(**logs)
if global_step >= max_train_steps:
break
training_function()
# Save the lora weights
unet.save_attn_procs(args.lora_save_dir)
for param in itertools.chain(unet.parameters(), text_encoder.parameters()):
if param.grad is not None:
del param.grad # free some memory
torch.cuda.empty_cache()
if __name__ == "__main__":
if args.clear_all:
clear_all()
dtype = torch.float32 if args.precision == "fp32" else torch.half
cpu_scheduling = not args.scheduler.startswith("Shark")
set_init_device_flags()
schedulers = get_schedulers(args.hf_model_id)
scheduler_obj = schedulers[args.scheduler]
seed = args.seed
if len(args.prompts) != 1:
print("Need exactly one prompt for the LoRA word")
lora_train(
args.prompts[0],
args.height,
args.width,
args.training_steps,
args.guidance_scale,
args.seed,
args.batch_count,
args.batch_size,
args.scheduler,
"None",
args.hf_model_id,
args.precision,
args.device,
args.max_length,
args.training_images_dir,
args.lora_save_dir,
)
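
The training loop above compiles `train_func` through a custom TorchDynamo backend (`refbackend_torchdynamo_backend`) that lowers the captured FX graph with torch-mlir and executes it through SharkInference. The sketch below shows only the backend hook in isolation, using a toy backend that prints the captured graph and falls back to eager execution; it illustrates the `dynamo.optimize(backend)` mechanism, not the repo's backend.

# Minimal sketch of the custom TorchDynamo backend mechanism used above; this
# toy backend only inspects the captured FX graph and then runs it eagerly.
import torch
import torch._dynamo as dynamo

def inspect_backend(fx_graph, example_inputs):
    print(fx_graph.graph)      # show the ops TorchDynamo captured
    return fx_graph.forward    # fall back to eager execution

@dynamo.optimize(inspect_backend)
def toy_step(x, y):
    return torch.relu(x) + y

print(toy_step(torch.randn(2, 2), torch.randn(2, 2)))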


@@ -1,132 +1,22 @@
import os
if "AMD_ENABLE_LLPC" not in os.environ:
os.environ["AMD_ENABLE_LLPC"] = "1"
import sys
import json
import torch
import re
import transformers
import time
from pathlib import Path
from PIL import PngImagePlugin
from datetime import datetime as dt
from dataclasses import dataclass
from csv import DictWriter
from apps.stable_diffusion.src import (
args,
Text2ImagePipeline,
get_schedulers,
set_init_device_flags,
utils,
clear_all,
save_output_img,
)
from apps.stable_diffusion.src.utils import get_generation_text_info
@dataclass
class Config:
model_id: str
ckpt_loc: str
precision: str
batch_size: int
max_length: int
height: int
width: int
device: str
# This has to come before importing cache objects
if args.clear_all:
print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
from glob import glob
import shutil
vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
for vmfb in vmfbs:
if os.path.exists(vmfb):
os.remove(vmfb)
# Temporary workaround of deleting yaml files to incorporate diffusers' pipeline.
# TODO: Remove this once we have better weight update logic.
inference_yaml = ["v2-inference-v.yaml", "v1-inference.yaml"]
for yaml in inference_yaml:
if os.path.exists(yaml):
os.remove(yaml)
home = os.path.expanduser("~")
if os.name == "nt": # Windows
appdata = os.getenv("LOCALAPPDATA")
shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
elif os.name == "unix":
shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
shutil.rmtree(os.path.join(home, ".local/shark_tank"))
# save output images and the inputs corresponding to it.
def save_output_img(output_img, img_seed):
output_path = args.output_dir if args.output_dir else Path.cwd()
generated_imgs_path = Path(output_path, "generated_imgs")
generated_imgs_path.mkdir(parents=True, exist_ok=True)
csv_path = Path(generated_imgs_path, "imgs_details.csv")
prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[0][:15])
out_img_name = (
f"{prompt_slice}_{img_seed}_{dt.now().strftime('%y%m%d_%H%M%S')}"
)
img_model = args.hf_model_id
if args.ckpt_loc:
img_model = os.path.basename(args.ckpt_loc)
if args.output_img_format == "jpg":
out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
output_img.save(out_img_path, quality=95, subsampling=0)
else:
out_img_path = Path(generated_imgs_path, f"{out_img_name}.png")
pngInfo = PngImagePlugin.PngInfo()
if args.write_metadata_to_png:
pngInfo.add_text(
"parameters",
f"{args.prompts[0]}\nNegative prompt: {args.negative_prompts[0]}\nSteps:{args.steps}, Sampler: {args.scheduler}, CFG scale: {args.guidance_scale}, Seed: {img_seed}, Size: {args.width}x{args.height}, Model: {img_model}",
)
output_img.save(out_img_path, "PNG", pnginfo=pngInfo)
if args.output_img_format not in ["png", "jpg"]:
print(
f"[ERROR] Format {args.output_img_format} is not supported yet."
"Image saved as png instead. Supported formats: png / jpg"
)
new_entry = {
"VARIANT": img_model,
"SCHEDULER": args.scheduler,
"PROMPT": args.prompts[0],
"NEG_PROMPT": args.negative_prompts[0],
"SEED": img_seed,
"CFG_SCALE": args.guidance_scale,
"PRECISION": args.precision,
"STEPS": args.steps,
"HEIGHT": args.height,
"WIDTH": args.width,
"MAX_LENGTH": args.max_length,
"OUTPUT": out_img_path,
}
with open(csv_path, "a") as csv_obj:
dictwriter_obj = DictWriter(csv_obj, fieldnames=list(new_entry.keys()))
dictwriter_obj.writerow(new_entry)
csv_obj.close()
if args.save_metadata_to_json:
del new_entry["OUTPUT"]
json_path = Path(generated_imgs_path, f"{out_img_name}.json")
with open(json_path, "w") as f:
json.dump(new_entry, f, indent=4)
txt2img_obj = None
config_obj = None
schedulers = None
# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
init_use_tuned = args.use_tuned
init_import_mlir = args.import_mlir
# Exposed to UI.
@@ -148,10 +38,18 @@ def txt2img_inf(
max_length: int,
save_metadata_to_json: bool,
save_metadata_to_png: bool,
lora_weights: str,
lora_hf_id: str,
):
global txt2img_obj
global config_obj
global schedulers
from apps.stable_diffusion.web.ui.utils import (
get_custom_model_pathfile,
get_custom_vae_or_lora_weights,
Config,
)
import apps.stable_diffusion.web.utils.global_obj as global_obj
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
SD_STATE_CANCEL,
)
args.prompts = [prompt]
args.negative_prompts = [negative_prompt]
@@ -160,10 +58,6 @@ def txt2img_inf(
args.scheduler = scheduler
# set ckpt_loc and hf_model_id.
types = (
".ckpt",
".safetensors",
) # the tuple of file types
args.ckpt_loc = ""
args.hf_model_id = ""
if custom_model == "None":
@@ -174,16 +68,21 @@ def txt2img_inf(
)
args.hf_model_id = hf_model_id
elif ".ckpt" in custom_model or ".safetensors" in custom_model:
args.ckpt_loc = custom_model
args.ckpt_loc = get_custom_model_pathfile(custom_model)
else:
args.hf_model_id = custom_model
args.save_metadata_to_json = save_metadata_to_json
args.write_metadata_to_png = save_metadata_to_png
args.use_lora = get_custom_vae_or_lora_weights(
lora_weights, lora_hf_id, "lora"
)
dtype = torch.float32 if precision == "fp32" else torch.half
cpu_scheduling = not scheduler.startswith("Shark")
new_config_obj = Config(
"txt2img",
args.hf_model_id,
args.ckpt_loc,
precision,
@@ -192,53 +91,66 @@ def txt2img_inf(
height,
width,
device,
use_lora=args.use_lora,
use_stencil=None,
)
if config_obj != new_config_obj:
config_obj = new_config_obj
if (
not global_obj.get_sd_obj()
or global_obj.get_cfg_obj() != new_config_obj
):
global_obj.clear_cache()
global_obj.set_cfg_obj(new_config_obj)
args.precision = precision
args.batch_count = batch_count
args.batch_size = batch_size
args.max_length = max_length
args.height = height
args.width = width
args.device = device.split("=>", 1)[1].strip()
args.use_tuned = True
args.import_mlir = False
args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
args.use_tuned = init_use_tuned
args.import_mlir = init_import_mlir
args.img_path = None
set_init_device_flags()
model_id = (
args.hf_model_id
if args.hf_model_id
else "stabilityai/stable-diffusion-2-1-base"
)
schedulers = get_schedulers(model_id)
scheduler_obj = schedulers[scheduler]
txt2img_obj = Text2ImagePipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
global_obj.set_schedulers(get_schedulers(model_id))
scheduler_obj = global_obj.get_scheduler(scheduler)
global_obj.set_sd_obj(
Text2ImagePipeline.from_pretrained(
scheduler=scheduler_obj,
import_mlir=args.import_mlir,
model_id=args.hf_model_id,
ckpt_loc=args.ckpt_loc,
precision=args.precision,
max_length=args.max_length,
batch_size=args.batch_size,
height=args.height,
width=args.width,
use_base_vae=args.use_base_vae,
use_tuned=args.use_tuned,
custom_vae=args.custom_vae,
low_cpu_mem_usage=args.low_cpu_mem_usage,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
)
)
if not txt2img_obj:
sys.exit("text to image pipeline must not return a null value")
txt2img_obj.scheduler = schedulers[scheduler]
global_obj.set_sd_scheduler(scheduler)
start_time = time.time()
txt2img_obj.log = ""
global_obj.get_sd_obj().log = ""
generated_imgs = []
seeds = []
img_seed = utils.sanitize_seed(seed)
text_output = ""
for i in range(batch_count):
if i > 0:
img_seed = utils.sanitize_seed(-1)
out_imgs = txt2img_obj.generate_images(
out_imgs = global_obj.get_sd_obj().generate_images(
prompt,
negative_prompt,
batch_size,
@@ -252,48 +164,53 @@ def txt2img_inf(
args.use_base_vae,
cpu_scheduling,
)
save_output_img(out_imgs[0], img_seed)
generated_imgs.extend(out_imgs)
seeds.append(img_seed)
txt2img_obj.log += "\n"
total_time = time.time() - start_time
text_output = get_generation_text_info(seeds, device)
text_output += "\n" + global_obj.get_sd_obj().log
text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
total_time = time.time() - start_time
text_output = f"prompt={args.prompts}"
text_output += f"\nnegative prompt={args.negative_prompts}"
text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
text_output += f"\nscheduler={args.scheduler}, device={device}"
text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={seeds}"
text_output += f"\nsize={args.height}x{args.width}, batch-count={batch_count}, batch-size={args.batch_size}, max_length={args.max_length}"
text_output += txt2img_obj.log
text_output += f"\nTotal image generation time: {total_time:.4f}sec"
if global_obj.get_sd_status() == SD_STATE_CANCEL:
break
else:
save_output_img(out_imgs[0], img_seed)
generated_imgs.extend(out_imgs)
yield generated_imgs, text_output
return generated_imgs, text_output
if __name__ == "__main__":
def main():
if args.clear_all:
clear_all()
dtype = torch.float32 if args.precision == "fp32" else torch.half
cpu_scheduling = not args.scheduler.startswith("Shark")
set_init_device_flags()
schedulers = get_schedulers(args.hf_model_id)
scheduler_obj = schedulers[args.scheduler]
seed = args.seed
txt2img_obj = Text2ImagePipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
scheduler=scheduler_obj,
import_mlir=args.import_mlir,
model_id=args.hf_model_id,
ckpt_loc=args.ckpt_loc,
precision=args.precision,
max_length=args.max_length,
batch_size=args.batch_size,
height=args.height,
width=args.width,
use_base_vae=args.use_base_vae,
use_tuned=args.use_tuned,
custom_vae=args.custom_vae,
low_cpu_mem_usage=args.low_cpu_mem_usage,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
use_quantize=args.use_quantize,
)
for run in range(args.runs):
if run > 0:
for current_batch in range(args.batch_count):
if current_batch > 0:
seed = -1
seed = utils.sanitize_seed(seed)
@@ -323,9 +240,13 @@ if __name__ == "__main__":
text_output += (
f", batch size={args.batch_size}, max_length={args.max_length}"
)
# TODO: if using --runs=x txt2img_obj.log will output on each display every iteration infos from the start
# TODO: if using --batch_count=x txt2img_obj.log will output on each display every iteration infos from the start
text_output += txt2img_obj.log
text_output += f"\nTotal image generation time: {total_time:.4f}sec"
save_output_img(generated_imgs[0], seed)
print(text_output)
if __name__ == "__main__":
main()
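
The txt2img diff above replaces the module-level `txt2img_obj`/`config_obj`/`schedulers` globals with accessors on a shared `global_obj` module (`set_sd_obj`, `get_sd_obj`, `set_cfg_obj`, `get_cfg_obj`, `set_schedulers`, `get_scheduler`, `set_sd_scheduler`, `get_sd_status`, `clear_cache`). A minimal sketch of what such a module-level cache can look like is below; only the accessor names are taken from the diff, the bodies are hypothetical.

# Hypothetical minimal version of a global_obj-style module-level cache.
_sd_obj = None
_cfg_obj = None
_schedulers = None
_sd_status = None

def set_sd_obj(obj):
    global _sd_obj
    _sd_obj = obj

def get_sd_obj():
    return _sd_obj

def set_cfg_obj(cfg):
    global _cfg_obj
    _cfg_obj = cfg

def get_cfg_obj():
    return _cfg_obj

def set_schedulers(schedulers):
    global _schedulers
    _schedulers = schedulers

def get_scheduler(name):
    return _schedulers[name]

def set_sd_scheduler(name):
    # assumes the cached pipeline exposes a .scheduler attribute, as the old
    # txt2img_obj did
    _sd_obj.scheduler = _schedulers[name]

def get_sd_status():
    return _sd_status

def clear_cache():
    global _sd_obj, _cfg_obj, _schedulers
    _sd_obj = _cfg_obj = _schedulers = None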


@@ -0,0 +1,273 @@
import torch
import time
from PIL import Image
import transformers
from apps.stable_diffusion.src import (
args,
UpscalerPipeline,
get_schedulers,
set_init_device_flags,
utils,
clear_all,
save_output_img,
)
# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
init_use_tuned = args.use_tuned
init_import_mlir = args.import_mlir
# Exposed to UI.
def upscaler_inf(
prompt: str,
negative_prompt: str,
init_image,
height: int,
width: int,
steps: int,
noise_level: int,
guidance_scale: float,
seed: int,
batch_count: int,
batch_size: int,
scheduler: str,
custom_model: str,
hf_model_id: str,
precision: str,
device: str,
max_length: int,
save_metadata_to_json: bool,
save_metadata_to_png: bool,
lora_weights: str,
lora_hf_id: str,
):
from apps.stable_diffusion.web.ui.utils import (
get_custom_model_pathfile,
get_custom_vae_or_lora_weights,
Config,
)
import apps.stable_diffusion.web.utils.global_obj as global_obj
args.prompts = [prompt]
args.negative_prompts = [negative_prompt]
args.guidance_scale = guidance_scale
args.seed = seed
args.steps = steps
args.scheduler = scheduler
if init_image is None:
return None, "An Initial Image is required"
image = init_image.convert("RGB").resize((height, width))
# set ckpt_loc and hf_model_id.
args.ckpt_loc = ""
args.hf_model_id = ""
if custom_model == "None":
if not hf_model_id:
return (
None,
"Please provide either custom model or huggingface model ID, both must not be empty",
)
args.hf_model_id = hf_model_id
elif ".ckpt" in custom_model or ".safetensors" in custom_model:
args.ckpt_loc = get_custom_model_pathfile(custom_model)
else:
args.hf_model_id = custom_model
args.save_metadata_to_json = save_metadata_to_json
args.write_metadata_to_png = save_metadata_to_png
args.use_lora = get_custom_vae_or_lora_weights(
lora_weights, lora_hf_id, "lora"
)
dtype = torch.float32 if precision == "fp32" else torch.half
cpu_scheduling = not scheduler.startswith("Shark")
args.height = 128
args.width = 128
new_config_obj = Config(
"upscaler",
args.hf_model_id,
args.ckpt_loc,
precision,
batch_size,
max_length,
args.height,
args.width,
device,
use_lora=args.use_lora,
use_stencil=None,
)
if (
not global_obj.get_sd_obj()
or global_obj.get_cfg_obj() != new_config_obj
):
global_obj.clear_cache()
global_obj.set_cfg_obj(new_config_obj)
args.batch_size = batch_size
args.max_length = max_length
args.device = device.split("=>", 1)[1].strip()
args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
args.use_tuned = init_use_tuned
args.import_mlir = init_import_mlir
set_init_device_flags()
model_id = (
args.hf_model_id
if args.hf_model_id
else "stabilityai/stable-diffusion-2-1-base"
)
global_obj.set_schedulers(get_schedulers(model_id))
scheduler_obj = global_obj.get_scheduler(scheduler)
global_obj.set_sd_obj(
UpscalerPipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
use_lora=args.use_lora,
)
)
global_obj.set_sd_scheduler(scheduler)
global_obj.get_sd_obj().low_res_scheduler = global_obj.get_scheduler(
"DDPM"
)
start_time = time.time()
global_obj.get_sd_obj().log = ""
generated_imgs = []
seeds = []
img_seed = utils.sanitize_seed(seed)
extra_info = {"NOISE LEVEL": noise_level}
for current_batch in range(batch_count):
if current_batch > 0:
img_seed = utils.sanitize_seed(-1)
low_res_img = image
high_res_img = Image.new("RGB", (height * 4, width * 4))
for i in range(0, width, 128):
for j in range(0, height, 128):
box = (j, i, j + 128, i + 128)
upscaled_image = global_obj.get_sd_obj().generate_images(
prompt,
negative_prompt,
low_res_img.crop(box),
batch_size,
args.height,
args.width,
steps,
noise_level,
guidance_scale,
img_seed,
args.max_length,
dtype,
args.use_base_vae,
cpu_scheduling,
)
high_res_img.paste(upscaled_image[0], (j * 4, i * 4))
save_output_img(high_res_img, img_seed, extra_info)
generated_imgs.append(high_res_img)
seeds.append(img_seed)
global_obj.get_sd_obj().log += "\n"
yield generated_imgs, global_obj.get_sd_obj().log
total_time = time.time() - start_time
text_output = f"prompt={args.prompts}"
text_output += f"\nnegative prompt={args.negative_prompts}"
text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
text_output += f"\nscheduler={args.scheduler}, device={device}"
text_output += f"\nsteps={steps}, noise_level={noise_level}, guidance_scale={guidance_scale}, seed={seeds}"
text_output += f"\nsize={height}x{width}, batch_count={batch_count}, batch_size={batch_size}, max_length={args.max_length}"
text_output += global_obj.get_sd_obj().log
text_output += f"\nTotal image generation time: {total_time:.4f}sec"
yield generated_imgs, text_output
if __name__ == "__main__":
if args.clear_all:
clear_all()
if args.img_path is None:
print("Flag --img_path is required.")
exit()
# When the models get uploaded, this should default to False.
args.import_mlir = True
cpu_scheduling = not args.scheduler.startswith("Shark")
dtype = torch.float32 if args.precision == "fp32" else torch.half
set_init_device_flags()
schedulers = get_schedulers(args.hf_model_id)
scheduler_obj = schedulers[args.scheduler]
image = (
Image.open(args.img_path)
.convert("RGB")
.resize((args.height, args.width))
)
seed = utils.sanitize_seed(args.seed)
# Adjust for height and width based on model
upscaler_obj = UpscalerPipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
use_lora=args.use_lora,
ddpm_scheduler=schedulers["DDPM"],
)
start_time = time.time()
generated_imgs = upscaler_obj.generate_images(
args.prompts,
args.negative_prompts,
image,
args.batch_size,
args.height,
args.width,
args.steps,
args.noise_level,
args.guidance_scale,
seed,
args.max_length,
dtype,
args.use_base_vae,
cpu_scheduling,
)
total_time = time.time() - start_time
text_output = f"prompt={args.prompts}"
text_output += f"\nnegative prompt={args.negative_prompts}"
text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
text_output += f"\nscheduler={args.scheduler}, device={args.device}"
text_output += f"\nsteps={args.steps}, noise_level={args.noise_level}, guidance_scale={args.guidance_scale}, seed={seed}, size={args.height}x{args.width}"
text_output += (
f", batch size={args.batch_size}, max_length={args.max_length}"
)
text_output += upscaler_obj.log
text_output += f"\nTotal image generation time: {total_time:.4f}sec"
extra_info = {"NOISE LEVEL": args.noise_level}
save_output_img(generated_imgs[0], seed, extra_info)
print(text_output)
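
`upscaler_inf` above works on the low-resolution image in 128x128 tiles: each tile is upscaled 4x by the pipeline and pasted into a new canvas at four times its original offset. The arithmetic of that loop is sketched in isolation below; `upscale_tile` is a placeholder for the `generate_images` call, and the sketch normalizes the loop to PIL's (width, height) ordering, which matches the loop above for the square inputs the script uses.

# Sketch of the 128x128 tiling used by upscaler_inf; upscale_tile stands in
# for the pipeline's generate_images call and is purely hypothetical.
from PIL import Image

TILE = 128
SCALE = 4

def upscale_tiled(low_res_img, upscale_tile):
    width, height = low_res_img.size
    high_res = Image.new("RGB", (width * SCALE, height * SCALE))
    for top in range(0, height, TILE):
        for left in range(0, width, TILE):
            box = (left, top, left + TILE, top + TILE)
            tile = upscale_tile(low_res_img.crop(box))   # 128x128 -> 512x512
            high_res.paste(tile, (left * SCALE, top * SCALE))
    return high_res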


@@ -1,6 +1,7 @@
# -*- mode: python ; coding: utf-8 -*-
from PyInstaller.utils.hooks import collect_data_files
from PyInstaller.utils.hooks import copy_metadata
from PyInstaller.utils.hooks import collect_submodules
import sys ; sys.setrecursionlimit(sys.getrecursionlimit() * 5)
@@ -15,12 +16,14 @@ datas += copy_metadata('filelock')
datas += copy_metadata('numpy')
datas += copy_metadata('tokenizers')
datas += copy_metadata('importlib_metadata')
datas += copy_metadata('torchvision')
datas += copy_metadata('torch-mlir')
datas += copy_metadata('diffusers')
datas += copy_metadata('transformers')
datas += copy_metadata('omegaconf')
datas += copy_metadata('safetensors')
datas += collect_data_files('diffusers')
datas += collect_data_files('transformers')
datas += collect_data_files('pytorch_lightning')
datas += collect_data_files('opencv-python')
datas += collect_data_files('skimage')
datas += collect_data_files('gradio')
datas += collect_data_files('iree')
datas += collect_data_files('google-cloud-storage')
@@ -30,21 +33,23 @@ datas += [
( 'src/utils/resources/model_db.json', 'resources' ),
( 'src/utils/resources/opt_flags.json', 'resources' ),
( 'src/utils/resources/base_model.json', 'resources' ),
( 'web/css/*', 'css' ),
( 'web/logos/*', 'logos' )
( 'web/ui/css/*', 'ui/css' ),
( 'web/ui/logos/*', 'logos' )
]
binaries = []
block_cipher = None
hiddenimports = ['shark', 'shark.shark_inference', 'apps']
hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
a = Analysis(
['web/index.py'],
pathex=['.'],
binaries=binaries,
datas=datas,
hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core', 'gradio', 'apps'],
hiddenimports=hiddenimports,
hookspath=[],
hooksconfig={},
runtime_hooks=[],


@@ -1,5 +1,6 @@
# -*- mode: python ; coding: utf-8 -*-
from PyInstaller.utils.hooks import collect_data_files
from PyInstaller.utils.hooks import collect_submodules
from PyInstaller.utils.hooks import copy_metadata
import sys ; sys.setrecursionlimit(sys.getrecursionlimit() * 5)
@@ -15,12 +16,14 @@ datas += copy_metadata('filelock')
datas += copy_metadata('numpy')
datas += copy_metadata('tokenizers')
datas += copy_metadata('importlib_metadata')
datas += copy_metadata('torchvision')
datas += copy_metadata('torch-mlir')
datas += copy_metadata('diffusers')
datas += copy_metadata('transformers')
datas += copy_metadata('omegaconf')
datas += copy_metadata('safetensors')
datas += collect_data_files('diffusers')
datas += collect_data_files('transformers')
datas += collect_data_files('opencv-python')
datas += collect_data_files('pytorch_lightning')
datas += collect_data_files('skimage')
datas += collect_data_files('gradio')
datas += collect_data_files('iree')
datas += collect_data_files('google-cloud-storage')
@@ -36,13 +39,15 @@ binaries = []
block_cipher = None
hiddenimports = ['shark', 'shark.shark_inference', 'apps']
hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
a = Analysis(
['scripts/txt2img.py'],
['scripts/main.py'],
pathex=['.'],
binaries=binaries,
datas=datas,
hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core', 'gradio', 'apps'],
hiddenimports=hiddenimports,
hookspath=[],
hooksconfig={},
runtime_hooks=[],


@@ -3,6 +3,15 @@ from apps.stable_diffusion.src.utils import (
set_init_device_flags,
prompt_examples,
get_available_devices,
clear_all,
save_output_img,
)
from apps.stable_diffusion.src.pipelines import (
Text2ImagePipeline,
Image2ImagePipeline,
InpaintPipeline,
OutpaintPipeline,
StencilPipeline,
UpscalerPipeline,
)
from apps.stable_diffusion.src.pipelines import Text2ImagePipeline
from apps.stable_diffusion.src.schedulers import get_schedulers


@@ -2,6 +2,7 @@ from apps.stable_diffusion.src.models.model_wrappers import (
SharkifyStableDiffusionModel,
)
from apps.stable_diffusion.src.models.opt_params import (
get_vae_encode,
get_vae,
get_unet,
get_clip,


@@ -1,19 +1,24 @@
from diffusers import AutoencoderKL, UNet2DConditionModel
from diffusers import AutoencoderKL, UNet2DConditionModel, ControlNetModel
from transformers import CLIPTextModel
from collections import defaultdict
import torch
import safetensors.torch
import traceback
import re
import sys
import os
from apps.stable_diffusion.src.utils import (
compile_through_fx,
get_opt_flags,
base_models,
args,
fetch_or_delete_vmfbs,
fetch_vmfbs,
preprocessCKPT,
get_path_to_diffusers_checkpoint,
fetch_and_update_base_model_id,
get_path_stem,
get_extended_name,
get_stencil_model_id,
update_lora_weight,
)
@@ -28,44 +33,34 @@ def replace_shape_str(shape, max_len, width, height, batch_size):
elif shape[i] == "width":
new_shape.append(width)
elif isinstance(shape[i], str):
if "batch_size" in shape[i]:
if "*" in shape[i]:
mul_val = int(shape[i].split("*")[0])
new_shape.append(batch_size * mul_val)
if "batch_size" in shape[i]:
new_shape.append(batch_size * mul_val)
elif "height" in shape[i]:
new_shape.append(height * mul_val)
elif "width" in shape[i]:
new_shape.append(width * mul_val)
elif "/" in shape[i]:
import math
div_val = int(shape[i].split("/")[1])
if "batch_size" in shape[i]:
new_shape.append(math.ceil(batch_size / div_val))
elif "height" in shape[i]:
new_shape.append(math.ceil(height / div_val))
elif "width" in shape[i]:
new_shape.append(math.ceil(width / div_val))
else:
new_shape.append(shape[i])
return new_shape
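# Illustrative substitution, using only the branches shown above:
# replace_shape_str(["2*batch_size", 4, "height/8", "width/8"],
#                   max_len=77, width=512, height=512, batch_size=1)
# would yield [2, 4, 64, 64]: "2*batch_size" multiplies, the bare 4 passes
# through unchanged, and the "/8" entries are ceil-divided.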
# Get the input info for various models i.e. "unet", "clip", "vae".
def get_input_info(model_info, max_len, width, height, batch_size):
dtype_config = {"f32": torch.float32, "i64": torch.int64}
input_map = defaultdict(list)
for k in model_info:
for inp in model_info[k]:
shape = model_info[k][inp]["shape"]
dtype = dtype_config[model_info[k][inp]["dtype"]]
tensor = None
if isinstance(shape, list):
clean_shape = replace_shape_str(
shape, max_len, width, height, batch_size
)
if dtype == torch.int64:
tensor = torch.randint(1, 3, tuple(clean_shape))
else:
tensor = torch.randn(*clean_shape).to(dtype)
elif isinstance(shape, int):
tensor = torch.tensor(shape).to(dtype)
else:
sys.exit("shape isn't specified correctly.")
input_map[k].append(tensor)
return input_map
class SharkifyStableDiffusionModel:
def __init__(
self,
model_id: str,
custom_weights: str,
custom_vae: str,
precision: str,
max_len: int = 64,
width: int = 512,
@@ -73,6 +68,15 @@ class SharkifyStableDiffusionModel:
batch_size: int = 1,
use_base_vae: bool = False,
use_tuned: bool = False,
low_cpu_mem_usage: bool = False,
debug: bool = False,
sharktank_dir: str = "",
generate_vmfb: bool = True,
is_inpaint: bool = False,
is_upscaler: bool = False,
use_stencil: str = None,
use_lora: str = "",
use_quantize: str = None,
):
self.check_params(max_len, width, height)
self.max_len = max_len
@@ -80,16 +84,22 @@ class SharkifyStableDiffusionModel:
self.width = width // 8
self.batch_size = batch_size
self.custom_weights = custom_weights
self.use_quantize = use_quantize
if custom_weights != "":
assert custom_weights.lower().endswith(
(".ckpt", ".safetensors")
), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
custom_weights = get_path_to_diffusers_checkpoint(custom_weights)
self.model_id = model_id if custom_weights == "" else custom_weights
# TODO: remove the following line when stable-diffusion-2-1 works
if self.model_id == "stabilityai/stable-diffusion-2-1":
self.model_id = "stabilityai/stable-diffusion-2-1-base"
self.custom_vae = custom_vae
self.precision = precision
self.base_vae = use_base_vae
self.model_name = (
str(batch_size)
"_"
+ str(batch_size)
+ "_"
+ str(max_len)
+ "_"
@@ -99,37 +109,126 @@ class SharkifyStableDiffusionModel:
+ "_"
+ precision
)
print(f'use_tuned? sharkify: {use_tuned}')
self.use_tuned = use_tuned
if use_tuned:
self.model_name = self.model_name + "_tuned"
# We need a better naming convention for the .vmfbs: even when a custom
# model variant is used, the .vmfb names stay the same, so the previously
# compiled .vmfb is always picked up instead of compiling the custom model.
# So, currently, we add `self.model_id` to the `self.model_name` of the
# .vmfb file.
# TODO: Have a better way of naming the vmfbs using self.model_name.
model_name = re.sub(r"\W+", "_", self.model_id)
if model_name[0] == "_":
model_name = model_name[1:]
self.model_name = self.model_name + "_" + model_name
self.model_name = self.model_name + "_" + get_path_stem(self.model_id)
self.low_cpu_mem_usage = low_cpu_mem_usage
self.is_inpaint = is_inpaint
self.is_upscaler = is_upscaler
self.use_stencil = get_stencil_model_id(use_stencil)
if use_lora != "":
self.model_name = self.model_name + "_" + get_path_stem(use_lora)
self.use_lora = use_lora
print(self.model_name)
self.debug = debug
self.sharktank_dir = sharktank_dir
self.generate_vmfb = generate_vmfb
def get_extended_name_for_all_model(self, mask_to_fetch):
model_name = {}
sub_model_list = ["clip", "unet", "stencil_unet", "vae", "vae_encode", "stencil_adaptor"]
index = 0
for model in sub_model_list:
if mask_to_fetch[index] == False:
index += 1
continue
sub_model = model
model_config = self.model_name
if "vae" == model:
if self.custom_vae != "":
model_config = model_config + get_path_stem(self.custom_vae)
if self.base_vae:
sub_model = "base_vae"
model_name[model] = get_extended_name(sub_model + model_config)
index += 1
return model_name
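# Illustrative example: with mask_to_fetch = [True, True, False, True, False, False]
# only the "clip", "unet" and "vae" entries are produced, e.g.
# {"clip": get_extended_name("clip" + self.model_name), ...}; a custom VAE or
# base VAE additionally alters the "vae" entry's name as handled above.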
def check_params(self, max_len, width, height):
if not (max_len >= 32 and max_len <= 77):
sys.exit("please specify max_len in the range [32, 77].")
if not (width % 8 == 0 and width >= 384):
sys.exit("width should be at least 384 and a multiple of 8")
if not (height % 8 == 0 and height >= 384):
sys.exit("height should be at least 384 and a multiple of 8")
if not (width % 8 == 0 and width >= 128):
sys.exit("width should be at least 128 and a multiple of 8")
if not (height % 8 == 0 and height >= 128):
sys.exit("height should be at least 128 and a multiple of 8")
def get_vae(self):
class VaeModel(torch.nn.Module):
def __init__(self, model_id=self.model_id, base_vae=self.base_vae):
# Get the input info for a model, e.g. "unet", "clip", "vae", etc.
def get_input_info_for(self, model_info):
dtype_config = {"f32": torch.float32, "i64": torch.int64}
input_map = []
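# Each entry's "shape" is either a list (which may contain placeholder strings,
# presumably resolved by replace_shape_str to max_len / width / height /
# batch_size) or a plain int that is turned into a scalar tensor below.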
for inp in model_info:
shape = model_info[inp]["shape"]
dtype = dtype_config[model_info[inp]["dtype"]]
tensor = None
if isinstance(shape, list):
clean_shape = replace_shape_str(
shape, self.max_len, self.width, self.height, self.batch_size
)
if dtype == torch.int64:
tensor = torch.randint(1, 3, tuple(clean_shape))
else:
tensor = torch.randn(*clean_shape).to(dtype)
elif isinstance(shape, int):
tensor = torch.tensor(shape).to(dtype)
else:
sys.exit("shape isn't specified correctly.")
input_map.append(tensor)
return input_map
def get_vae_encode(self):
class VaeEncodeModel(torch.nn.Module):
def __init__(self, model_id=self.model_id, low_cpu_mem_usage=False):
super().__init__()
self.vae = AutoencoderKL.from_pretrained(
model_id,
subfolder="vae",
low_cpu_mem_usage=low_cpu_mem_usage,
)
def forward(self, input):
latents = self.vae.encode(input).latent_dist.sample()
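# 0.18215 is the latent scaling factor of Stable Diffusion's VAE: latents are
# multiplied by it after encoding and divided by it before decoding.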
return 0.18215 * latents
vae_encode = VaeEncodeModel()
inputs = tuple(self.inputs["vae_encode"])
is_f16 = True if self.precision == "fp16" else False
shark_vae_encode = compile_through_fx(
vae_encode,
inputs,
is_f16=is_f16,
use_tuned=self.use_tuned,
model_name=self.model_name["vae_encode"],
extra_args=get_opt_flags("vae", precision=self.precision),
base_model_id=self.base_model_id,
)
return shark_vae_encode
def get_vae(self):
class VaeModel(torch.nn.Module):
def __init__(self, model_id=self.model_id, base_vae=self.base_vae, custom_vae=self.custom_vae, low_cpu_mem_usage=False):
super().__init__()
self.vae = None
if custom_vae == "":
self.vae = AutoencoderKL.from_pretrained(
model_id,
subfolder="vae",
low_cpu_mem_usage=low_cpu_mem_usage,
)
elif not isinstance(custom_vae, dict):
self.vae = AutoencoderKL.from_pretrained(
custom_vae,
subfolder="vae",
low_cpu_mem_usage=low_cpu_mem_usage,
)
else:
self.vae = AutoencoderKL.from_pretrained(
model_id,
subfolder="vae",
low_cpu_mem_usage=low_cpu_mem_usage,
)
self.vae.load_state_dict(custom_vae)
self.base_vae = base_vae
def forward(self, input):
@@ -142,33 +241,157 @@ class SharkifyStableDiffusionModel:
x = x * 255.0
return x.round()
vae = VaeModel()
vae = VaeModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
inputs = tuple(self.inputs["vae"])
is_f16 = True if self.precision == "fp16" else False
vae_name = "base_vae" if self.base_vae else "vae"
save_dir = os.path.join(self.sharktank_dir, self.model_name["vae"])
if self.debug:
os.makedirs(save_dir, exist_ok=True)
shark_vae = compile_through_fx(
vae,
inputs,
is_f16=is_f16,
use_tuned=self.use_tuned,
model_name=vae_name + self.model_name,
model_name=self.model_name["vae"],
debug=self.debug,
generate_vmfb=self.generate_vmfb,
save_dir=save_dir,
extra_args=get_opt_flags("vae", precision=self.precision),
base_model_id=self.base_model_id,
)
return shark_vae
def get_unet(self):
class UnetModel(torch.nn.Module):
def __init__(self, model_id=self.model_id):
def get_controlled_unet(self):
class ControlledUnetModel(torch.nn.Module):
def __init__(
self, model_id=self.model_id, low_cpu_mem_usage=False, use_lora=self.use_lora
):
super().__init__()
self.unet = UNet2DConditionModel.from_pretrained(
model_id,
subfolder="unet",
low_cpu_mem_usage=low_cpu_mem_usage,
)
if use_lora != "":
update_lora_weight(self.unet, use_lora, "unet")
self.in_channels = self.unet.in_channels
self.train(False)
def forward( self, latent, timestep, text_embedding, guidance_scale, control1,
control2, control3, control4, control5, control6, control7,
control8, control9, control10, control11, control12, control13,
):
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
db_res_samples = tuple([ control1, control2, control3, control4, control5, control6, control7, control8, control9, control10, control11, control12,])
mb_res_samples = control13
latents = torch.cat([latent] * 2)
unet_out = self.unet.forward(
latents,
timestep,
encoder_hidden_states=text_embedding,
down_block_additional_residuals=db_res_samples,
mid_block_additional_residual=mb_res_samples,
return_dict=False,
)[0]
noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (
noise_pred_text - noise_pred_uncond
)
return noise_pred
unet = ControlledUnetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["unet"])
input_mask = [True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True,]
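# With f16_input_mask, every input except guidance_scale (4th entry, kept as
# fp32) is presumably cast to fp16 before compilation.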
shark_controlled_unet = compile_through_fx(
unet,
inputs,
model_name=self.model_name["stencil_unet"],
is_f16=is_f16,
f16_input_mask=input_mask,
use_tuned=self.use_tuned,
extra_args=get_opt_flags("unet", precision=self.precision),
base_model_id=self.base_model_id,
)
return shark_controlled_unet
def get_control_net(self):
class StencilControlNetModel(torch.nn.Module):
def __init__(
self, model_id=self.use_stencil, low_cpu_mem_usage=False
):
super().__init__()
self.cnet = ControlNetModel.from_pretrained(
model_id,
low_cpu_mem_usage=low_cpu_mem_usage,
)
self.in_channels = self.cnet.in_channels
self.train(False)
def forward(
self, latent, timestep, text_embedding, guidance_scale
self,
latent,
timestep,
text_embedding,
stencil_image_input,
):
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
# TODO: guidance is NOT needed here; update `get_input_info` accordingly later
latents = torch.cat(
[latent] * 2
) # needs to be same as controlledUNET latents
stencil_image = torch.cat(
[stencil_image_input] * 2
) # needs to be same as controlledUNET latents
down_block_res_samples, mid_block_res_sample = self.cnet.forward(
latents,
timestep,
encoder_hidden_states=text_embedding,
controlnet_cond=stencil_image,
return_dict=False,
)
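# The ControlNet produces 12 down-block residuals plus one mid-block residual;
# they are flattened into a single tuple so they can be passed positionally to
# the controlled UNet above (control1 .. control13).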
return tuple(list(down_block_res_samples) + [mid_block_res_sample])
scnet = StencilControlNetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["stencil_adaptor"])
input_mask = [True, True, True, True]
shark_cnet = compile_through_fx(
scnet,
inputs,
model_name=self.model_name["stencil_adaptor"],
is_f16=is_f16,
f16_input_mask=input_mask,
use_tuned=self.use_tuned,
extra_args=get_opt_flags("unet", precision=self.precision),
base_model_id=self.base_model_id,
)
return shark_cnet
def get_unet(self):
class UnetModel(torch.nn.Module):
def __init__(self, model_id=self.model_id, low_cpu_mem_usage=False, use_lora=self.use_lora):
super().__init__()
self.unet = UNet2DConditionModel.from_pretrained(
model_id,
subfolder="unet",
low_cpu_mem_usage=low_cpu_mem_usage,
)
if use_lora != "":
update_lora_weight(self.unet, use_lora, "unet")
self.in_channels = self.unet.in_channels
self.train(False)
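# Attention slicing computes attention in chunks to lower peak memory;
# args.attention_slicing is either a chunk size (digits) or a mode string
# (e.g. "auto"/"max", assumed values) forwarded to set_attention_slice.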
if(args.attention_slicing is not None and args.attention_slicing != "none"):
if(args.attention_slicing.isdigit()):
self.unet.set_attention_slice(int(args.attention_slicing))
else:
self.unet.set_attention_slice(args.attention_slicing)
# TODO: Instead of flattening the `control` inputs, try to use a list.
def forward(
self, latent, timestep, text_embedding, guidance_scale,
):
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
latents = torch.cat([latent] * 2)
@@ -181,115 +404,258 @@ class SharkifyStableDiffusionModel:
)
return noise_pred
unet = UnetModel()
unet = UnetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["unet"])
input_mask = [True, True, True, False]
save_dir = os.path.join(self.sharktank_dir, self.model_name["unet"])
if self.debug:
os.makedirs(
save_dir,
exist_ok=True,
)
shark_unet = compile_through_fx(
unet,
inputs,
model_name=self.model_name["unet"],
is_f16=is_f16,
f16_input_mask=input_mask,
use_tuned=self.use_tuned,
debug=self.debug,
generate_vmfb=self.generate_vmfb,
save_dir=save_dir,
extra_args=get_opt_flags("unet", precision=self.precision),
base_model_id=self.base_model_id,
)
return shark_unet
def get_unet_upscaler(self):
class UnetModel(torch.nn.Module):
def __init__(self, model_id=self.model_id, low_cpu_mem_usage=False):
super().__init__()
self.unet = UNet2DConditionModel.from_pretrained(
model_id,
subfolder="unet",
low_cpu_mem_usage=low_cpu_mem_usage,
)
self.in_channels = self.unet.in_channels
self.train(False)
def forward(self, latent, timestep, text_embedding, noise_level):
unet_out = self.unet.forward(
latent,
timestep,
text_embedding,
noise_level,
return_dict=False,
)[0]
return unet_out
unet = UnetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["unet"])
input_mask = [True, True, True, False]
shark_unet = compile_through_fx(
unet,
inputs,
model_name="unet" + self.model_name,
model_name=self.model_name["unet"],
is_f16=is_f16,
f16_input_mask=input_mask,
use_tuned=self.use_tuned,
extra_args=get_opt_flags("unet", precision=self.precision),
base_model_id=self.base_model_id,
)
return shark_unet
def get_clip(self):
class CLIPText(torch.nn.Module):
def __init__(self, model_id=self.model_id):
def __init__(self, model_id=self.model_id, low_cpu_mem_usage=False, use_lora=self.use_lora):
super().__init__()
self.text_encoder = CLIPTextModel.from_pretrained(
model_id,
subfolder="text_encoder",
low_cpu_mem_usage=low_cpu_mem_usage,
)
if use_lora != "":
update_lora_weight(self.text_encoder, use_lora, "text_encoder")
def forward(self, input):
return self.text_encoder(input)[0]
clip_model = CLIPText()
clip_model = CLIPText(low_cpu_mem_usage=self.low_cpu_mem_usage)
save_dir = os.path.join(self.sharktank_dir, self.model_name["clip"])
if self.debug:
os.makedirs(
save_dir,
exist_ok=True,
)
shark_clip = compile_through_fx(
clip_model,
tuple(self.inputs["clip"]),
model_name="clip" + self.model_name,
model_name=self.model_name["clip"],
debug=self.debug,
generate_vmfb=self.generate_vmfb,
save_dir=save_dir,
extra_args=get_opt_flags("clip", precision="fp32"),
base_model_id=self.base_model_id,
)
return shark_clip
# Compiles Clip, Unet and Vae with `base_model_id` defining their input
# configuration.
def compile_all(self, base_model_id):
self.inputs = get_input_info(
base_models[base_model_id],
self.max_len,
self.width,
self.height,
self.batch_size,
)
compiled_unet = self.get_unet()
compiled_vae = self.get_vae()
compiled_clip = self.get_clip()
def process_custom_vae(self):
custom_vae = self.custom_vae.lower()
if not custom_vae.endswith((".ckpt", ".safetensors")):
return self.custom_vae
try:
preprocessCKPT(self.custom_vae)
return get_path_to_diffusers_checkpoint(self.custom_vae)
except:
print("Processing standalone Vae checkpoint")
vae_checkpoint = None
vae_ignore_keys = {"model_ema.decay", "model_ema.num_updates"}
if custom_vae.endswith(".ckpt"):
vae_checkpoint = torch.load(self.custom_vae, map_location="cpu")
else:
vae_checkpoint = safetensors.torch.load_file(self.custom_vae, device="cpu")
if "state_dict" in vae_checkpoint:
vae_checkpoint = vae_checkpoint["state_dict"]
vae_dict = {k: v for k, v in vae_checkpoint.items() if k[0:4] != "loss" and k not in vae_ignore_keys}
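# The caller thus receives either a diffusers path/repo-id (str) or a raw VAE
# state dict; the VaeModel wrapper above handles both cases.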
return vae_dict
def compile_unet_variants(self, need_stencil):
compiled_unet = None
if self.is_upscaler:
compiled_unet = self.get_unet_upscaler()
elif need_stencil:
compiled_unet = self.get_controlled_unet()
else:
# TODO: Plug the experimental "int8" support in at the right place.
if self.use_quantize == "int8":
from apps.stable_diffusion.src.models.opt_params import get_unet
compiled_unet = get_unet()
else:
compiled_unet = self.get_unet()
return compiled_unet
def compile_models(self, vmfbs, need_stencil, need_vae_encode, model_to_run):
def check_compilation(model, model_name):
if not model:
raise Exception(f"Could not compile {model_name}. Please create an issue with the detailed log at https://github.com/nod-ai/SHARK/issues")
compiled_clip = None
compiled_unet = None
compiled_vae = None
compiled_vae_encode = None
compiled_stencil_adaptor = None
self.inputs = dict()
# 1. Process UNET.
if vmfbs[1]:
compiled_unet = vmfbs[1]
else:
unet_inputs = base_models["stencil_unet"] if need_stencil else base_models["unet"]
if self.base_model_id != "":
self.inputs["unet"] = self.get_input_info_for(unet_inputs[self.base_model_id])
compiled_unet = self.compile_unet_variants(need_stencil)
else:
for model_id in unet_inputs:
self.base_model_id = model_id
self.inputs["unet"] = self.get_input_info_for(unet_inputs[model_id])
try:
compiled_unet = self.compile_unet_variants(need_stencil)
except Exception as e:
print(e)
print("Retrying with a different base model configuration")
continue
# -- Once a successful compilation has taken place, we store
# the inferred base model configuration.
fetch_and_update_base_model_id(model_to_run, model_id)
# main.py bases the choice of tokenizer and scheduler on `args.hf_model_id`.
# Since we no longer maintain a 1:1 mapping between variants and the base
# model and instead rely on the retry mechanism to find the input configuration,
# we also update `args.hf_model_id` with the inferred base model id.
if args.ckpt_loc != "":
args.hf_model_id = model_id
break
check_compilation(compiled_unet, "Unet")
# 2. Process VAE.
vae_input = base_models["vae"]
is_base_vae = self.base_vae
if self.is_upscaler:
self.base_vae = True
if vmfbs[2]:
compiled_vae = vmfbs[2]
else:
if self.is_upscaler:
vae_input = vae_input["vae_upscaler"]
else:
vae_input = vae_input["vae"]
self.inputs["vae"] = self.get_input_info_for(vae_input)
compiled_vae = self.get_vae()
self.base_vae = is_base_vae
check_compilation(compiled_vae, "Vae")
# 3. Process CLIP.
self.inputs["clip"] = self.get_input_info_for(base_models["clip"])
compiled_clip = vmfbs[0] if vmfbs[0] else self.get_clip()
check_compilation(compiled_clip, "Clip")
# 4. Process VAE_ENCODE.
if need_vae_encode:
self.inputs["vae_encode"] = self.get_input_info_for(base_models["vae_encode"])
compiled_vae_encode = vmfbs[3] if vmfbs[3] else self.get_vae_encode()
check_compilation(compiled_vae_encode, "Vae Encode")
# 5. Process STENCIL.
if need_stencil:
self.inputs["stencil_adaptor"] = self.get_input_info_for(base_models["stencil_adaptor"])
compiled_stencil_adaptor = vmfbs[3] if vmfbs[3] else self.get_control_net()
check_compilation(compiled_stencil_adaptor, "Stencil")
if need_stencil:
return compiled_clip, compiled_unet, compiled_vae, compiled_stencil_adaptor
if need_vae_encode:
return compiled_clip, compiled_unet, compiled_vae, compiled_vae_encode
return compiled_clip, compiled_unet, compiled_vae
def __call__(self):
# Step 1:
# -- Fetch all vmfbs for the model, if present, else delete the lot.
vmfbs = fetch_or_delete_vmfbs(
self.model_name, self.base_vae, self.precision
)
if vmfbs[0]:
# -- If all vmfbs are indeed present, we also try and fetch the base
# model configuration for running SD with custom checkpoints.
if self.custom_weights != "":
args.hf_model_id = fetch_and_update_base_model_id(self.custom_weights)
if args.hf_model_id == "":
sys.exit("Base model configuration for the custom model is missing. Use `--clear_all` and re-run.")
print("Loaded vmfbs from cache and successfully fetched base model configuration.")
return vmfbs
# Step 2:
# -- If vmfbs weren't found, we try to see if the base model configuration
# for the required SD run is known to us and bypass the retry mechanism.
need_vae_encode, need_stencil = False, False
if not self.is_upscaler and args.img_path is not None:
if self.use_stencil is not None:
need_stencil = True
else:
need_vae_encode = True
# `mask_to_fetch` prepares a mask to pick a combination out of:
# ["clip", "unet", "stencil_unet", "vae", "vae_encode", "stencil_adaptor"]
mask_to_fetch = [True, True, False, True, False, False]
if need_vae_encode:
mask_to_fetch = [True, True, False, True, True, False]
elif need_stencil:
mask_to_fetch = [True, False, True, True, False, True]
self.models_to_compile = mask_to_fetch
self.model_name = self.get_extended_name_for_all_model(mask_to_fetch)
vmfbs = fetch_vmfbs(self.model_name, self.precision)
# We try to see if the base model configuration for the required SD run is
# known to us and bypass the retry mechanism.
model_to_run = ""
if self.custom_weights != "":
model_to_run = self.custom_weights
assert self.custom_weights.lower().endswith(
(".ckpt", ".safetensors")
), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
preprocessCKPT(self.custom_weights)
preprocessCKPT(self.custom_weights, self.is_inpaint)
else:
model_to_run = args.hf_model_id
base_model_fetched = fetch_and_update_base_model_id(model_to_run)
if base_model_fetched != "":
print("Compiling all the models with the fetched base model configuration.")
if args.ckpt_loc != "":
args.hf_model_id = base_model_fetched
return self.compile_all(base_model_fetched)
# Step 3:
# -- This is the retry mechanism where the base model's configuration is not
# known to us, so we figure it out by trial and error.
print("Inferring base model configuration.")
for model_id in base_models:
try:
compiled_clip, compiled_unet, compiled_vae = self.compile_all(model_id)
except Exception as e:
if args.enable_stack_trace:
traceback.print_exc()
print("Retrying with a different base model configuration")
continue
# -- Once a successful compilation has taken place, we store
# the inferred base model configuration.
fetch_and_update_base_model_id(model_to_run, model_id)
# main.py bases the choice of tokenizer and scheduler on `args.hf_model_id`.
# Since we no longer maintain a 1:1 mapping between variants and the base
# model and instead rely on the retry mechanism to find the input configuration,
# we also update `args.hf_model_id` with the inferred base model id.
if args.ckpt_loc != "":
args.hf_model_id = model_id
return compiled_clip, compiled_unet, compiled_vae
sys.exit(
"Cannot compile the model. Please re-run the command with `--enable_stack_trace` flag and create an issue with detailed log at https://github.com/nod-ai/SHARK/issues"
)
# For a custom Vae the user can provide either a repo-id or a checkpoint file;
# a checkpoint file needs to be processed via Diffusers' script.
self.custom_vae = self.process_custom_vae()
self.base_model_id = fetch_and_update_base_model_id(model_to_run)
if self.base_model_id != "" and args.ckpt_loc != "":
args.hf_model_id = self.base_model_id
try:
return self.compile_models(vmfbs, need_stencil, need_vae_encode, model_to_run)
except Exception as e:
sys.exit(e)


@@ -9,15 +9,26 @@ from apps.stable_diffusion.src.utils import (
hf_model_variant_map = {
"Linaqruf/anything-v3.0": ["anythingv3", "v2_1base"],
"dreamlike-art/dreamlike-diffusion-1.0": ["dreamlike", "v2_1base"],
"prompthero/openjourney": ["openjourney", "v2_1base"],
"wavymulder/Analog-Diffusion": ["analogdiffusion", "v2_1base"],
"Linaqruf/anything-v3.0": ["anythingv3", "v1_4"],
"dreamlike-art/dreamlike-diffusion-1.0": ["dreamlike", "v1_4"],
"prompthero/openjourney": ["openjourney", "v1_4"],
"wavymulder/Analog-Diffusion": ["analogdiffusion", "v1_4"],
"stabilityai/stable-diffusion-2-1": ["stablediffusion", "v2_1base"],
"stabilityai/stable-diffusion-2-1-base": ["stablediffusion", "v2_1base"],
"CompVis/stable-diffusion-v1-4": ["stablediffusion", "v1_4"],
"runwayml/stable-diffusion-inpainting": ["stablediffusion", "inpaint_v1"],
"stabilityai/stable-diffusion-2-inpainting": ["stablediffusion", "inpaint_v2"],
}
# TODO: Add the quantized model as part of model_db.json.
# This is currently in an experimental phase.
def get_quantize_model():
bucket_key = "gs://shark_tank/prashant_nod"
model_key = "unet_int8"
iree_flags = get_opt_flags("unet", precision="fp16")
if args.height != 512 or args.width != 512 or args.max_length != 77:
sys.exit("The int8 quantized model currently requires height and width to be 512 and max_length to be 77")
return bucket_key, model_key, iree_flags
def get_variant_version(hf_model_id):
return hf_model_variant_map[hf_model_id]
@@ -39,6 +50,12 @@ def get_unet():
variant, version = get_variant_version(args.hf_model_id)
# Tuned model is present only for `fp16` precision.
is_tuned = "tuned" if args.use_tuned else "untuned"
# TODO: Get the quantize model from model_db.json
if args.use_quantize == "int8":
bk, mk, flags = get_quantize_model()
return get_shark_model(bk, mk, flags)
if "vulkan" not in args.device and args.use_tuned:
bucket_key = f"{variant}/{is_tuned}/{args.device}"
model_key = f"{variant}/{version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}/{args.device}"
@@ -52,6 +69,23 @@ def get_unet():
return get_shark_model(bucket, model_name, iree_flags)
def get_vae_encode():
variant, version = get_variant_version(args.hf_model_id)
# Tuned model is present only for `fp16` precision.
is_tuned = "tuned" if args.use_tuned else "untuned"
if "vulkan" not in args.device and args.use_tuned:
bucket_key = f"{variant}/{is_tuned}/{args.device}"
model_key = f"{variant}/{version}/vae_encode/{args.precision}/length_77/{is_tuned}/{args.device}"
else:
bucket_key = f"{variant}/{is_tuned}"
model_key = f"{variant}/{version}/vae_encode/{args.precision}/length_77/{is_tuned}"
bucket, model_name, iree_flags = get_params(
bucket_key, model_key, "vae", is_tuned, args.precision
)
return get_shark_model(bucket, model_name, iree_flags)
def get_vae():
variant, version = get_variant_version(args.hf_model_id)
# Tuned model is present only for `fp16` precision.


@@ -1,3 +1,18 @@
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_txt2img import (
Text2ImagePipeline,
)
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_img2img import (
Image2ImagePipeline,
)
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_inpaint import (
InpaintPipeline,
)
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_outpaint import (
OutpaintPipeline,
)
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_stencil import (
StencilPipeline,
)
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_upscaler import (
UpscalerPipeline,
)


@@ -0,0 +1,172 @@
import torch
import time
import numpy as np
from tqdm.auto import tqdm
from random import randint
from PIL import Image
from transformers import CLIPTokenizer
from typing import Union
from shark.shark_inference import SharkInference
from diffusers import (
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
DEISMultistepScheduler,
)
from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
StableDiffusionPipeline,
)
class Image2ImagePipeline(StableDiffusionPipeline):
def __init__(
self,
vae_encode: SharkInference,
vae: SharkInference,
text_encoder: SharkInference,
tokenizer: CLIPTokenizer,
unet: SharkInference,
scheduler: Union[
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
SharkEulerDiscreteScheduler,
DEISMultistepScheduler,
],
):
super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
self.vae_encode = vae_encode
def prepare_image_latents(
self,
image,
batch_size,
height,
width,
generator,
num_inference_steps,
strength,
dtype,
):
# Pre process image -> get image encoded -> process latents
# TODO: process with variable HxW combos
# Pre process image
image = image.resize((width, height))
image_arr = np.stack([np.array(i) for i in (image,)], axis=0)
image_arr = image_arr / 255.0
image_arr = torch.from_numpy(image_arr).permute(0, 3, 1, 2).to(dtype)
image_arr = 2 * (image_arr - 0.5)
# set scheduler steps
self.scheduler.set_timesteps(num_inference_steps)
init_timestep = min(
int(num_inference_steps * strength), num_inference_steps
)
t_start = max(num_inference_steps - init_timestep, 0)
# timesteps reduced as per strength
timesteps = self.scheduler.timesteps[t_start:]
# new number of steps to be used as per strength will be
# num_inference_steps = num_inference_steps - t_start
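# e.g. with num_inference_steps=50 and strength=0.8: init_timestep=40,
# t_start=10, so 40 denoising steps run on the noised image latents.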
# image encode
latents = self.encode_image((image_arr,))
latents = torch.from_numpy(latents).to(dtype)
# add noise to data
noise = torch.randn(latents.shape, generator=generator, dtype=dtype)
latents = self.scheduler.add_noise(
latents, noise, timesteps[0].repeat(1)
)
return latents, timesteps
def encode_image(self, input_image):
vae_encode_start = time.time()
latents = self.vae_encode("forward", input_image)
vae_inf_time = (time.time() - vae_encode_start) * 1000
self.log += f"\nVAE Encode Inference time (ms): {vae_inf_time:.3f}"
return latents
def generate_images(
self,
prompts,
neg_prompts,
image,
batch_size,
height,
width,
num_inference_steps,
strength,
guidance_scale,
seed,
max_length,
dtype,
use_base_vae,
cpu_scheduling,
use_stencil,
):
# prompts and negative prompts must be a list.
if isinstance(prompts, str):
prompts = [prompts]
if isinstance(neg_prompts, str):
neg_prompts = [neg_prompts]
prompts = prompts * batch_size
neg_prompts = neg_prompts * batch_size
# seed generator to create the initial latent noise. Also handle out of range seeds.
uint32_info = np.iinfo(np.uint32)
uint32_min, uint32_max = uint32_info.min, uint32_info.max
if seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
generator = torch.manual_seed(seed)
# Get text embeddings from prompts
text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
# guidance scale as a float32 tensor.
guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
# Prepare input image latent
image_latents, final_timesteps = self.prepare_image_latents(
image=image,
batch_size=batch_size,
height=height,
width=width,
generator=generator,
num_inference_steps=num_inference_steps,
strength=strength,
dtype=dtype,
)
# Get Image latents
latents = self.produce_img_latents(
latents=image_latents,
text_embeddings=text_embeddings,
guidance_scale=guidance_scale,
total_timesteps=final_timesteps,
dtype=dtype,
cpu_scheduling=cpu_scheduling,
)
# Img latents -> PIL images
all_imgs = []
for i in tqdm(range(0, latents.shape[0], batch_size)):
imgs = self.decode_latents(
latents=latents[i : i + batch_size],
use_base_vae=use_base_vae,
cpu_scheduling=cpu_scheduling,
)
all_imgs.extend(imgs)
return all_imgs


@@ -0,0 +1,445 @@
import torch
from tqdm.auto import tqdm
import numpy as np
from random import randint
from PIL import Image, ImageOps
from transformers import CLIPTokenizer
from typing import Union
from shark.shark_inference import SharkInference
from diffusers import (
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
DEISMultistepScheduler,
)
from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
StableDiffusionPipeline,
)
class InpaintPipeline(StableDiffusionPipeline):
def __init__(
self,
vae_encode: SharkInference,
vae: SharkInference,
text_encoder: SharkInference,
tokenizer: CLIPTokenizer,
unet: SharkInference,
scheduler: Union[
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
SharkEulerDiscreteScheduler,
DEISMultistepScheduler,
],
):
super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
self.vae_encode = vae_encode
def prepare_latents(
self,
batch_size,
height,
width,
generator,
num_inference_steps,
dtype,
):
latents = torch.randn(
(
batch_size,
4,
height // 8,
width // 8,
),
generator=generator,
dtype=torch.float32,
).to(dtype)
self.scheduler.set_timesteps(num_inference_steps)
latents = latents * self.scheduler.init_noise_sigma
return latents
def get_crop_region(self, mask, pad=0):
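# Returns the (left, top, right, bottom) bounding box of the non-zero mask
# pixels, expanded by `pad` and clamped to the image bounds.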
h, w = mask.shape
crop_left = 0
for i in range(w):
if not (mask[:, i] == 0).all():
break
crop_left += 1
crop_right = 0
for i in reversed(range(w)):
if not (mask[:, i] == 0).all():
break
crop_right += 1
crop_top = 0
for i in range(h):
if not (mask[i] == 0).all():
break
crop_top += 1
crop_bottom = 0
for i in reversed(range(h)):
if not (mask[i] == 0).all():
break
crop_bottom += 1
return (
int(max(crop_left - pad, 0)),
int(max(crop_top - pad, 0)),
int(min(w - crop_right + pad, w)),
int(min(h - crop_bottom + pad, h)),
)
def expand_crop_region(
self,
crop_region,
processing_width,
processing_height,
image_width,
image_height,
):
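# Grows the crop region so its aspect ratio matches the processing
# resolution, while keeping it inside the original image bounds.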
x1, y1, x2, y2 = crop_region
ratio_crop_region = (x2 - x1) / (y2 - y1)
ratio_processing = processing_width / processing_height
if ratio_crop_region > ratio_processing:
desired_height = (x2 - x1) / ratio_processing
desired_height_diff = int(desired_height - (y2 - y1))
y1 -= desired_height_diff // 2
y2 += desired_height_diff - desired_height_diff // 2
if y2 >= image_height:
diff = y2 - image_height
y2 -= diff
y1 -= diff
if y1 < 0:
y2 -= y1
y1 -= y1
if y2 >= image_height:
y2 = image_height
else:
desired_width = (y2 - y1) * ratio_processing
desired_width_diff = int(desired_width - (x2 - x1))
x1 -= desired_width_diff // 2
x2 += desired_width_diff - desired_width_diff // 2
if x2 >= image_width:
diff = x2 - image_width
x2 -= diff
x1 -= diff
if x1 < 0:
x2 -= x1
x1 -= x1
if x2 >= image_width:
x2 = image_width
return x1, y1, x2, y2
def resize_image(self, resize_mode, im, width, height):
"""
resize_mode:
0: Resize the image to fill the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, cropping the excess.
1: Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, filling the empty space with data from the image.
"""
if resize_mode == 0:
ratio = width / height
src_ratio = im.width / im.height
src_w = (
width if ratio > src_ratio else im.width * height // im.height
)
src_h = (
height if ratio <= src_ratio else im.height * width // im.width
)
resized = im.resize((src_w, src_h), resample=Image.LANCZOS)
res = Image.new("RGB", (width, height))
res.paste(
resized,
box=(width // 2 - src_w // 2, height // 2 - src_h // 2),
)
else:
ratio = width / height
src_ratio = im.width / im.height
src_w = (
width if ratio < src_ratio else im.width * height // im.height
)
src_h = (
height if ratio >= src_ratio else im.height * width // im.width
)
resized = im.resize((src_w, src_h), resample=Image.LANCZOS)
res = Image.new("RGB", (width, height))
res.paste(
resized,
box=(width // 2 - src_w // 2, height // 2 - src_h // 2),
)
if ratio < src_ratio:
fill_height = height // 2 - src_h // 2
res.paste(
resized.resize((width, fill_height), box=(0, 0, width, 0)),
box=(0, 0),
)
res.paste(
resized.resize(
(width, fill_height),
box=(0, resized.height, width, resized.height),
),
box=(0, fill_height + src_h),
)
elif ratio > src_ratio:
fill_width = width // 2 - src_w // 2
res.paste(
resized.resize(
(fill_width, height), box=(0, 0, 0, height)
),
box=(0, 0),
)
res.paste(
resized.resize(
(fill_width, height),
box=(resized.width, 0, resized.width, height),
),
box=(fill_width + src_w, 0),
)
return res
def prepare_mask_and_masked_image(
self,
image,
mask,
height,
width,
inpaint_full_res,
inpaint_full_res_padding,
):
# preprocess image
image = image.resize((width, height))
mask = mask.resize((width, height))
paste_to = ()
overlay_image = None
if inpaint_full_res:
# prepare overlay image
overlay_image = Image.new("RGB", (image.width, image.height))
overlay_image.paste(
image.convert("RGB"),
mask=ImageOps.invert(mask.convert("L")),
)
# prepare mask
mask = mask.convert("L")
crop_region = self.get_crop_region(
np.array(mask), inpaint_full_res_padding
)
crop_region = self.expand_crop_region(
crop_region, width, height, mask.width, mask.height
)
x1, y1, x2, y2 = crop_region
mask = mask.crop(crop_region)
mask = self.resize_image(1, mask, width, height)
paste_to = (x1, y1, x2 - x1, y2 - y1)
# prepare image
image = image.crop(crop_region)
image = self.resize_image(1, image, width, height)
if isinstance(image, (Image.Image, np.ndarray)):
image = [image]
if isinstance(image, list) and isinstance(image[0], Image.Image):
image = [np.array(i.convert("RGB"))[None, :] for i in image]
image = np.concatenate(image, axis=0)
elif isinstance(image, list) and isinstance(image[0], np.ndarray):
image = np.concatenate([i[None, :] for i in image], axis=0)
image = image.transpose(0, 3, 1, 2)
image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
# preprocess mask
if isinstance(mask, (Image.Image, np.ndarray)):
mask = [mask]
if isinstance(mask, list) and isinstance(mask[0], Image.Image):
mask = np.concatenate(
[np.array(m.convert("L"))[None, None, :] for m in mask], axis=0
)
mask = mask.astype(np.float32) / 255.0
elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
mask[mask < 0.5] = 0
mask[mask >= 0.5] = 1
mask = torch.from_numpy(mask)
masked_image = image * (mask < 0.5)
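# mask is binarized to {0, 1} and masked_image keeps only the pixels to be
# preserved (mask < 0.5); paste_to and overlay_image are only populated on
# the inpaint_full_res path.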
return mask, masked_image, paste_to, overlay_image
def prepare_mask_latents(
self,
mask,
masked_image,
batch_size,
height,
width,
dtype,
):
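# Downsamples the mask to latent resolution (height // 8, width // 8), encodes
# the masked image through the VAE encoder, and tiles both up to batch_size.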
mask = torch.nn.functional.interpolate(
mask, size=(height // 8, width // 8)
)
mask = mask.to(dtype)
masked_image = masked_image.to(dtype)
masked_image_latents = self.vae_encode("forward", (masked_image,))
masked_image_latents = torch.from_numpy(masked_image_latents)
# duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
if mask.shape[0] < batch_size:
if not batch_size % mask.shape[0] == 0:
raise ValueError(
"The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
" of masks that you pass is divisible by the total requested batch size."
)
mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
if masked_image_latents.shape[0] < batch_size:
if not batch_size % masked_image_latents.shape[0] == 0:
raise ValueError(
"The passed images and the required batch size don't match. Images are supposed to be duplicated"
f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
" Make sure the number of images that you pass is divisible by the total requested batch size."
)
masked_image_latents = masked_image_latents.repeat(
batch_size // masked_image_latents.shape[0], 1, 1, 1
)
return mask, masked_image_latents
def apply_overlay(self, image, paste_loc, overlay):
x, y, w, h = paste_loc
image = self.resize_image(0, image, w, h)
overlay.paste(image, (x, y))
return overlay
def generate_images(
self,
prompts,
neg_prompts,
image,
mask_image,
batch_size,
height,
width,
inpaint_full_res,
inpaint_full_res_padding,
num_inference_steps,
guidance_scale,
seed,
max_length,
dtype,
use_base_vae,
cpu_scheduling,
):
# prompts and negative prompts must be a list.
if isinstance(prompts, str):
prompts = [prompts]
if isinstance(neg_prompts, str):
neg_prompts = [neg_prompts]
prompts = prompts * batch_size
neg_prompts = neg_prompts * batch_size
# seed generator to create the initial latent noise. Also handle out of range seeds.
uint32_info = np.iinfo(np.uint32)
uint32_min, uint32_max = uint32_info.min, uint32_info.max
if seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
generator = torch.manual_seed(seed)
# Get initial latents
init_latents = self.prepare_latents(
batch_size=batch_size,
height=height,
width=width,
generator=generator,
num_inference_steps=num_inference_steps,
dtype=dtype,
)
# Get text embeddings from prompts
text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
# guidance scale as a float32 tensor.
guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
# Preprocess mask and image
(
mask,
masked_image,
paste_to,
overlay_image,
) = self.prepare_mask_and_masked_image(
image,
mask_image,
height,
width,
inpaint_full_res,
inpaint_full_res_padding,
)
# Prepare mask latent variables
mask, masked_image_latents = self.prepare_mask_latents(
mask=mask,
masked_image=masked_image,
batch_size=batch_size,
height=height,
width=width,
dtype=dtype,
)
# Get Image latents
latents = self.produce_img_latents(
latents=init_latents,
text_embeddings=text_embeddings,
guidance_scale=guidance_scale,
total_timesteps=self.scheduler.timesteps,
dtype=dtype,
cpu_scheduling=cpu_scheduling,
mask=mask,
masked_image_latents=masked_image_latents,
)
# Img latents -> PIL images
all_imgs = []
for i in tqdm(range(0, latents.shape[0], batch_size)):
imgs = self.decode_latents(
latents=latents[i : i + batch_size],
use_base_vae=use_base_vae,
cpu_scheduling=cpu_scheduling,
)
all_imgs.extend(imgs)
if inpaint_full_res:
output_image = self.apply_overlay(
all_imgs[0], paste_to, overlay_image
)
return [output_image]
return all_imgs


@@ -0,0 +1,541 @@
import torch
from tqdm.auto import tqdm
import numpy as np
from random import randint
from PIL import Image, ImageDraw, ImageFilter
from transformers import CLIPTokenizer
from typing import Union
from shark.shark_inference import SharkInference
from diffusers import (
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
DEISMultistepScheduler,
)
from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
StableDiffusionPipeline,
)
import math
class OutpaintPipeline(StableDiffusionPipeline):
def __init__(
self,
vae_encode: SharkInference,
vae: SharkInference,
text_encoder: SharkInference,
tokenizer: CLIPTokenizer,
unet: SharkInference,
scheduler: Union[
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
SharkEulerDiscreteScheduler,
DEISMultistepScheduler,
],
):
super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
self.vae_encode = vae_encode
def prepare_latents(
self,
batch_size,
height,
width,
generator,
num_inference_steps,
dtype,
):
latents = torch.randn(
(
batch_size,
4,
height // 8,
width // 8,
),
generator=generator,
dtype=torch.float32,
).to(dtype)
self.scheduler.set_timesteps(num_inference_steps)
latents = latents * self.scheduler.init_noise_sigma
return latents
def prepare_mask_and_masked_image(
self, image, mask, mask_blur, width, height
):
if mask_blur > 0:
mask = mask.filter(ImageFilter.GaussianBlur(mask_blur))
image = image.resize((width, height))
mask = mask.resize((width, height))
# preprocess image
if isinstance(image, (Image.Image, np.ndarray)):
image = [image]
if isinstance(image, list) and isinstance(image[0], Image.Image):
image = [np.array(i.convert("RGB"))[None, :] for i in image]
image = np.concatenate(image, axis=0)
elif isinstance(image, list) and isinstance(image[0], np.ndarray):
image = np.concatenate([i[None, :] for i in image], axis=0)
image = image.transpose(0, 3, 1, 2)
image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
# preprocess mask
if isinstance(mask, (Image.Image, np.ndarray)):
mask = [mask]
if isinstance(mask, list) and isinstance(mask[0], Image.Image):
mask = np.concatenate(
[np.array(m.convert("L"))[None, None, :] for m in mask], axis=0
)
mask = mask.astype(np.float32) / 255.0
elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
mask[mask < 0.5] = 0
mask[mask >= 0.5] = 1
mask = torch.from_numpy(mask)
masked_image = image * (mask < 0.5)
return mask, masked_image
def prepare_mask_latents(
self,
mask,
masked_image,
batch_size,
height,
width,
dtype,
):
mask = torch.nn.functional.interpolate(
mask, size=(height // 8, width // 8)
)
mask = mask.to(dtype)
masked_image = masked_image.to(dtype)
masked_image_latents = self.vae_encode("forward", (masked_image,))
masked_image_latents = torch.from_numpy(masked_image_latents)
# duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
if mask.shape[0] < batch_size:
if not batch_size % mask.shape[0] == 0:
raise ValueError(
"The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
" of masks that you pass is divisible by the total requested batch size."
)
mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
if masked_image_latents.shape[0] < batch_size:
if not batch_size % masked_image_latents.shape[0] == 0:
raise ValueError(
"The passed images and the required batch size don't match. Images are supposed to be duplicated"
f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
" Make sure the number of images that you pass is divisible by the total requested batch size."
)
masked_image_latents = masked_image_latents.repeat(
batch_size // masked_image_latents.shape[0], 1, 1, 1
)
return mask, masked_image_latents
def get_matched_noise(
self, _np_src_image, np_mask_rgb, noise_q=1, color_variation=0.05
):
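# Shapes random noise so that its frequency spectrum matches the unmasked part
# of the source image and histogram-matches its colors, giving a natural
# starting fill for the masked (outpainted) region.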
# helper fft routines that keep ortho normalization and auto-shift before and after fft
def _fft2(data):
if data.ndim > 2: # has channels
out_fft = np.zeros(
(data.shape[0], data.shape[1], data.shape[2]),
dtype=np.complex128,
)
for c in range(data.shape[2]):
c_data = data[:, :, c]
out_fft[:, :, c] = np.fft.fft2(
np.fft.fftshift(c_data), norm="ortho"
)
out_fft[:, :, c] = np.fft.ifftshift(out_fft[:, :, c])
else: # one channel
out_fft = np.zeros(
(data.shape[0], data.shape[1]), dtype=np.complex128
)
out_fft[:, :] = np.fft.fft2(
np.fft.fftshift(data), norm="ortho"
)
out_fft[:, :] = np.fft.ifftshift(out_fft[:, :])
return out_fft
def _ifft2(data):
if data.ndim > 2: # has channels
out_ifft = np.zeros(
(data.shape[0], data.shape[1], data.shape[2]),
dtype=np.complex128,
)
for c in range(data.shape[2]):
c_data = data[:, :, c]
out_ifft[:, :, c] = np.fft.ifft2(
np.fft.fftshift(c_data), norm="ortho"
)
out_ifft[:, :, c] = np.fft.ifftshift(out_ifft[:, :, c])
else: # one channel
out_ifft = np.zeros(
(data.shape[0], data.shape[1]), dtype=np.complex128
)
out_ifft[:, :] = np.fft.ifft2(
np.fft.fftshift(data), norm="ortho"
)
out_ifft[:, :] = np.fft.ifftshift(out_ifft[:, :])
return out_ifft
def _get_gaussian_window(width, height, std=3.14, mode=0):
window_scale_x = float(width / min(width, height))
window_scale_y = float(height / min(width, height))
window = np.zeros((width, height))
x = (np.arange(width) / width * 2.0 - 1.0) * window_scale_x
for y in range(height):
fy = (y / height * 2.0 - 1.0) * window_scale_y
if mode == 0:
window[:, y] = np.exp(-(x**2 + fy**2) * std)
else:
window[:, y] = (
1 / ((x**2 + 1.0) * (fy**2 + 1.0))
) ** (std / 3.14)
return window
def _get_masked_window_rgb(np_mask_grey, hardness=1.0):
np_mask_rgb = np.zeros(
(np_mask_grey.shape[0], np_mask_grey.shape[1], 3)
)
if hardness != 1.0:
hardened = np_mask_grey[:] ** hardness
else:
hardened = np_mask_grey[:]
for c in range(3):
np_mask_rgb[:, :, c] = hardened[:]
return np_mask_rgb
def _match_cumulative_cdf(source, template):
src_values, src_unique_indices, src_counts = np.unique(
source.ravel(), return_inverse=True, return_counts=True
)
tmpl_values, tmpl_counts = np.unique(
template.ravel(), return_counts=True
)
# calculate normalized quantiles for each array
src_quantiles = np.cumsum(src_counts) / source.size
tmpl_quantiles = np.cumsum(tmpl_counts) / template.size
interp_a_values = np.interp(
src_quantiles, tmpl_quantiles, tmpl_values
)
return interp_a_values[src_unique_indices].reshape(source.shape)
def _match_histograms(image, reference):
if image.ndim != reference.ndim:
raise ValueError(
"Image and reference must have the same number of channels."
)
if image.shape[-1] != reference.shape[-1]:
raise ValueError(
"Number of channels in the input image and reference image must match!"
)
matched = np.empty(image.shape, dtype=image.dtype)
for channel in range(image.shape[-1]):
matched_channel = _match_cumulative_cdf(
image[..., channel], reference[..., channel]
)
matched[..., channel] = matched_channel
matched = matched.astype(np.float64, copy=False)
return matched
width = _np_src_image.shape[0]
height = _np_src_image.shape[1]
num_channels = _np_src_image.shape[2]
np_src_image = _np_src_image[:] * (1.0 - np_mask_rgb)
np_mask_grey = np.sum(np_mask_rgb, axis=2) / 3.0
img_mask = np_mask_grey > 1e-6
ref_mask = np_mask_grey < 1e-3
# rather than leaving the masked area black, we get better results from the fft by filling it with the average unmasked color
windowed_image = _np_src_image * (
1.0 - _get_masked_window_rgb(np_mask_grey)
)
windowed_image /= np.max(windowed_image)
windowed_image += np.average(_np_src_image) * np_mask_rgb
src_fft = _fft2(
windowed_image
) # get feature statistics from masked src img
src_dist = np.absolute(src_fft)
src_phase = src_fft / src_dist
# create a generator with a static seed so that outpainting is deterministic and only follows the global seed
rng = np.random.default_rng(0)
noise_window = _get_gaussian_window(
width, height, mode=1
) # start with simple gaussian noise
noise_rgb = rng.random((width, height, num_channels))
noise_grey = np.sum(noise_rgb, axis=2) / 3.0
# the colorfulness of the starting noise is blended to greyscale with a parameter
noise_rgb *= color_variation
for c in range(num_channels):
noise_rgb[:, :, c] += (1.0 - color_variation) * noise_grey
noise_fft = _fft2(noise_rgb)
for c in range(num_channels):
noise_fft[:, :, c] *= noise_window
noise_rgb = np.real(_ifft2(noise_fft))
shaped_noise_fft = _fft2(noise_rgb)
shaped_noise_fft[:, :, :] = (
np.absolute(shaped_noise_fft[:, :, :]) ** 2
* (src_dist**noise_q)
* src_phase
) # perform the actual shaping
# color_variation
brightness_variation = 0.0
contrast_adjusted_np_src = (
_np_src_image[:] * (brightness_variation + 1.0)
- brightness_variation * 2.0
)
shaped_noise = np.real(_ifft2(shaped_noise_fft))
shaped_noise -= np.min(shaped_noise)
shaped_noise /= np.max(shaped_noise)
shaped_noise[img_mask, :] = _match_histograms(
shaped_noise[img_mask, :] ** 1.0,
contrast_adjusted_np_src[ref_mask, :],
)
shaped_noise = (
_np_src_image[:] * (1.0 - np_mask_rgb) + shaped_noise * np_mask_rgb
)
matched_noise = shaped_noise[:]
return np.clip(matched_noise, 0.0, 1.0)
def generate_images(
self,
prompts,
neg_prompts,
image,
pixels,
mask_blur,
is_left,
is_right,
is_top,
is_bottom,
noise_q,
color_variation,
batch_size,
height,
width,
num_inference_steps,
guidance_scale,
seed,
max_length,
dtype,
use_base_vae,
cpu_scheduling,
):
# prompts and negative prompts must be a list.
if isinstance(prompts, str):
prompts = [prompts]
if isinstance(neg_prompts, str):
neg_prompts = [neg_prompts]
prompts = prompts * batch_size
neg_prompts = neg_prompts * batch_size
# seed generator to create the initial latent noise. Also handle out of range seeds.
uint32_info = np.iinfo(np.uint32)
uint32_min, uint32_max = uint32_info.min, uint32_info.max
if seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
generator = torch.manual_seed(seed)
# Get initial latents
init_latents = self.prepare_latents(
batch_size=batch_size,
height=height,
width=width,
generator=generator,
num_inference_steps=num_inference_steps,
dtype=dtype,
)
# Get text embeddings from prompts
text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
# guidance scale as a float32 tensor.
guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
process_width = width
process_height = height
left = pixels if is_left else 0
right = pixels if is_right else 0
up = pixels if is_top else 0
down = pixels if is_bottom else 0
target_w = math.ceil((image.width + left + right) / 64) * 64
target_h = math.ceil((image.height + up + down) / 64) * 64
if left > 0:
left = left * (target_w - image.width) // (left + right)
if right > 0:
right = target_w - image.width - left
if up > 0:
up = up * (target_h - image.height) // (up + down)
if down > 0:
down = target_h - image.height - up
def expand(
init_img,
expand_pixels,
is_left=False,
is_right=False,
is_top=False,
is_bottom=False,
):
is_horiz = is_left or is_right
is_vert = is_top or is_bottom
pixels_horiz = expand_pixels if is_horiz else 0
pixels_vert = expand_pixels if is_vert else 0
res_w = init_img.width + pixels_horiz
res_h = init_img.height + pixels_vert
process_res_w = math.ceil(res_w / 64) * 64
process_res_h = math.ceil(res_h / 64) * 64
img = Image.new("RGB", (process_res_w, process_res_h))
img.paste(
init_img,
(pixels_horiz if is_left else 0, pixels_vert if is_top else 0),
)
msk = Image.new("RGB", (process_res_w, process_res_h), "white")
draw = ImageDraw.Draw(msk)
draw.rectangle(
(
expand_pixels + mask_blur if is_left else 0,
expand_pixels + mask_blur if is_top else 0,
msk.width - expand_pixels - mask_blur
if is_right
else res_w,
msk.height - expand_pixels - mask_blur
if is_bottom
else res_h,
),
fill="black",
)
np_image = (np.asarray(img) / 255.0).astype(np.float64)
np_mask = (np.asarray(msk) / 255.0).astype(np.float64)
noised = self.get_matched_noise(
np_image, np_mask, noise_q, color_variation
)
output_image = Image.fromarray(
np.clip(noised * 255.0, 0.0, 255.0).astype(np.uint8),
mode="RGB",
)
target_width = (
min(width, init_img.width + pixels_horiz)
if is_horiz
else img.width
)
target_height = (
min(height, init_img.height + pixels_vert)
if is_vert
else img.height
)
crop_region = (
0 if is_left else output_image.width - target_width,
0 if is_top else output_image.height - target_height,
target_width if is_left else output_image.width,
target_height if is_top else output_image.height,
)
mask_to_process = msk.crop(crop_region)
image_to_process = output_image.crop(crop_region)
# Preprocess mask and image
mask, masked_image = self.prepare_mask_and_masked_image(
image_to_process, mask_to_process, mask_blur, width, height
)
# Prepare mask latent variables
mask, masked_image_latents = self.prepare_mask_latents(
mask=mask,
masked_image=masked_image,
batch_size=batch_size,
height=height,
width=width,
dtype=dtype,
)
# Get Image latents
latents = self.produce_img_latents(
latents=init_latents,
text_embeddings=text_embeddings,
guidance_scale=guidance_scale,
total_timesteps=self.scheduler.timesteps,
dtype=dtype,
cpu_scheduling=cpu_scheduling,
mask=mask,
masked_image_latents=masked_image_latents,
)
# Img latents -> PIL images
all_imgs = []
for i in tqdm(range(0, latents.shape[0], batch_size)):
imgs = self.decode_latents(
latents=latents[i : i + batch_size],
use_base_vae=use_base_vae,
cpu_scheduling=cpu_scheduling,
)
all_imgs.extend(imgs)
res_img = all_imgs[0].resize(
(image_to_process.width, image_to_process.height)
)
output_image.paste(
res_img,
(
0 if is_left else output_image.width - res_img.width,
0 if is_top else output_image.height - res_img.height,
),
)
output_image = output_image.crop((0, 0, res_w, res_h))
return output_image
img = image.resize((width, height))
if left > 0:
img = expand(img, left, is_left=True)
if right > 0:
img = expand(img, right, is_right=True)
if up > 0:
img = expand(img, up, is_top=True)
if down > 0:
img = expand(img, down, is_bottom=True)
return [img]


@@ -0,0 +1,150 @@
import torch
import time
import numpy as np
from tqdm.auto import tqdm
from random import randint
from PIL import Image
from transformers import CLIPTokenizer
from typing import Union
from shark.shark_inference import SharkInference
from diffusers import (
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
)
from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
StableDiffusionPipeline,
)
from apps.stable_diffusion.src.utils import controlnet_hint_conversion
class StencilPipeline(StableDiffusionPipeline):
def __init__(
self,
controlnet: SharkInference,
vae: SharkInference,
text_encoder: SharkInference,
tokenizer: CLIPTokenizer,
unet: SharkInference,
scheduler: Union[
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
SharkEulerDiscreteScheduler,
],
):
super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
self.controlnet = controlnet
def prepare_latents(
self,
batch_size,
height,
width,
generator,
num_inference_steps,
dtype,
):
latents = torch.randn(
(
batch_size,
4,
height // 8,
width // 8,
),
generator=generator,
dtype=torch.float32,
).to(dtype)
self.scheduler.set_timesteps(num_inference_steps)
self.scheduler.is_scale_input_called = True
latents = latents * self.scheduler.init_noise_sigma
return latents
def generate_images(
self,
prompts,
neg_prompts,
image,
batch_size,
height,
width,
num_inference_steps,
strength,
guidance_scale,
seed,
max_length,
dtype,
use_base_vae,
cpu_scheduling,
use_stencil,
):
# Control Embedding check & conversion
# TODO: 1. Change `num_images_per_prompt`.
controlnet_hint = controlnet_hint_conversion(
image, use_stencil, height, width, dtype, num_images_per_prompt=1
)
# prompts and negative prompts must be a list.
if isinstance(prompts, str):
prompts = [prompts]
if isinstance(neg_prompts, str):
neg_prompts = [neg_prompts]
prompts = prompts * batch_size
neg_prompts = neg_prompts * batch_size
# seed generator to create the initial latent noise. Also handle out of range seeds.
uint32_info = np.iinfo(np.uint32)
uint32_min, uint32_max = uint32_info.min, uint32_info.max
if seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
generator = torch.manual_seed(seed)
# Get text embeddings from prompts
text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
# guidance scale as a float32 tensor.
guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
# Prepare initial latent.
init_latents = self.prepare_latents(
batch_size=batch_size,
height=height,
width=width,
generator=generator,
num_inference_steps=num_inference_steps,
dtype=dtype,
)
final_timesteps = self.scheduler.timesteps
# Get Image latents
latents = self.produce_stencil_latents(
latents=init_latents,
text_embeddings=text_embeddings,
guidance_scale=guidance_scale,
total_timesteps=final_timesteps,
dtype=dtype,
cpu_scheduling=cpu_scheduling,
controlnet_hint=controlnet_hint,
controlnet=self.controlnet,
)
# Img latents -> PIL images
all_imgs = []
for i in tqdm(range(0, latents.shape[0], batch_size)):
imgs = self.decode_latents(
latents=latents[i : i + batch_size],
use_base_vae=use_base_vae,
cpu_scheduling=cpu_scheduling,
)
all_imgs.extend(imgs)
return all_imgs


@@ -9,9 +9,11 @@ from diffusers import (
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
KDPM2DiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
DEISMultistepScheduler,
)
from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
@@ -30,10 +32,12 @@ class Text2ImagePipeline(StableDiffusionPipeline):
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
KDPM2DiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
SharkEulerDiscreteScheduler,
DEISMultistepScheduler,
],
):
super().__init__(vae, text_encoder, tokenizer, unet, scheduler)


@@ -0,0 +1,310 @@
import inspect
import torch
import time
from tqdm.auto import tqdm
import numpy as np
from random import randint
from transformers import CLIPTokenizer
from typing import Union
from shark.shark_inference import SharkInference
from diffusers import (
DDIMScheduler,
DDPMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
KDPM2DiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
DEISMultistepScheduler,
)
from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
StableDiffusionPipeline,
)
from apps.stable_diffusion.src.utils import (
start_profiling,
end_profiling,
)
from PIL import Image
def preprocess(image):
if isinstance(image, torch.Tensor):
return image
elif isinstance(image, Image.Image):
image = [image]
if isinstance(image[0], Image.Image):
w, h = image[0].size
w, h = map(
lambda x: x - x % 64, (w, h)
) # resize to integer multiple of 64
image = [np.array(i.resize((w, h)))[None, :] for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
image = 2.0 * image - 1.0
image = torch.from_numpy(image)
elif isinstance(image[0], torch.Tensor):
image = torch.cat(image, dim=0)
return image
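# Illustrative sketch, not part of the original file: how preprocess() is
# typically exercised (assumes Pillow is available, which this module already
# imports). A 500x700 input is resized down to the nearest multiple of 64 per
# side and mapped into [-1, 1].
def _preprocess_example():
    example = Image.new("RGB", (500, 700))
    tensor = preprocess(example)
    # tensor has shape (1, 3, 640, 448) and values in [-1.0, 1.0]
    return tensor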
class UpscalerPipeline(StableDiffusionPipeline):
def __init__(
self,
vae: SharkInference,
text_encoder: SharkInference,
tokenizer: CLIPTokenizer,
unet: SharkInference,
scheduler: Union[
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
SharkEulerDiscreteScheduler,
DEISMultistepScheduler,
],
low_res_scheduler: Union[
DDIMScheduler,
DDPMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
SharkEulerDiscreteScheduler,
DEISMultistepScheduler,
],
):
super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
self.low_res_scheduler = low_res_scheduler
def prepare_extra_step_kwargs(self, generator, eta):
accepts_eta = "eta" in set(
inspect.signature(self.scheduler.step).parameters.keys()
)
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
accepts_generator = "generator" in set(
inspect.signature(self.scheduler.step).parameters.keys()
)
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def decode_latents(self, latents, use_base_vae, cpu_scheduling):
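# 0.08333 is the VAE latent scaling factor used by the x4 upscaler
# (SD 1.x/2.x use 0.18215); dividing by it un-scales the latents before decoding.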
latents = 1 / 0.08333 * (latents.float())
latents_numpy = latents
if cpu_scheduling:
latents_numpy = latents.detach().numpy()
profile_device = start_profiling(file_path="vae.rdc")
vae_start = time.time()
images = self.vae("forward", (latents_numpy,))
vae_inf_time = (time.time() - vae_start) * 1000
end_profiling(profile_device)
self.log += f"\nVAE Inference time (ms): {vae_inf_time:.3f}"
images = torch.from_numpy(images)
images = (images.detach().cpu() * 255.0).numpy()
images = images.round()
images = torch.from_numpy(images).to(torch.uint8).permute(0, 2, 3, 1)
pil_images = [Image.fromarray(image) for image in images.numpy()]
return pil_images
def prepare_latents(
self,
batch_size,
height,
width,
generator,
num_inference_steps,
dtype,
):
latents = torch.randn(
(
batch_size,
4,
height,
width,
),
generator=generator,
dtype=torch.float32,
).to(dtype)
self.scheduler.set_timesteps(num_inference_steps)
self.scheduler.is_scale_input_called = True
latents = latents * self.scheduler.init_noise_sigma
return latents
def produce_img_latents(
self,
latents,
image,
text_embeddings,
guidance_scale,
noise_level,
total_timesteps,
dtype,
cpu_scheduling,
extra_step_kwargs,
return_all_latents=False,
):
step_time_sum = 0
latent_history = [latents]
text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
text_embeddings_numpy = text_embeddings.detach().numpy()
for i, t in tqdm(enumerate(total_timesteps)):
step_start_time = time.time()
latent_model_input = torch.cat([latents] * 2)
latent_model_input = self.scheduler.scale_model_input(
latent_model_input, t
)
latent_model_input = torch.cat([latent_model_input, image], dim=1)
timestep = torch.tensor([t]).to(dtype).detach().numpy()
if cpu_scheduling:
latent_model_input = latent_model_input.detach().numpy()
# Profiling Unet.
profile_device = start_profiling(file_path="unet.rdc")
noise_pred = self.unet(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
noise_level,
),
)
end_profiling(profile_device)
noise_pred = torch.from_numpy(noise_pred)
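# Classifier-free guidance: the batched prediction holds the unconditional and
# text-conditioned halves; extrapolate from the unconditional prediction toward
# the conditioned one by guidance_scale.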
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (
noise_pred_text - noise_pred_uncond
)
if cpu_scheduling:
latents = self.scheduler.step(
noise_pred, t, latents, **extra_step_kwargs
).prev_sample
else:
latents = self.scheduler.step(
noise_pred, t, latents, **extra_step_kwargs
)
latent_history.append(latents)
step_time = (time.time() - step_start_time) * 1000
# self.log += (
# f"\nstep = {i} | timestep = {t} | time = {step_time:.2f}ms"
# )
step_time_sum += step_time
avg_step_time = step_time_sum / len(total_timesteps)
self.log += f"\nAverage step time: {avg_step_time}ms/it"
if not return_all_latents:
return latents
all_latents = torch.cat(latent_history, dim=0)
return all_latents
def generate_images(
self,
prompts,
neg_prompts,
image,
batch_size,
height,
width,
num_inference_steps,
noise_level,
guidance_scale,
seed,
max_length,
dtype,
use_base_vae,
cpu_scheduling,
):
# prompts and negative prompts must be lists.
if isinstance(prompts, str):
prompts = [prompts]
if isinstance(neg_prompts, str):
neg_prompts = [neg_prompts]
prompts = prompts * batch_size
neg_prompts = neg_prompts * batch_size
# seed generator to create the initial latent noise. Also handle out-of-range seeds.
# TODO: Wouldn't it be preferable to just report an error instead of modifying the seed on the fly?
uint32_info = np.iinfo(np.uint32)
uint32_min, uint32_max = uint32_info.min, uint32_info.max
if seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
generator = torch.manual_seed(seed)
# Get text embeddings from prompts
text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
# 4. Preprocess image
image = preprocess(image).to(dtype)
# 5. Add noise to image
noise_level = torch.tensor([noise_level], dtype=torch.long)
noise = torch.randn(
image.shape,
generator=generator,
).to(dtype)
image = self.low_res_scheduler.add_noise(image, noise, noise_level)
image = torch.cat([image] * 2)
noise_level = torch.cat([noise_level] * image.shape[0])
height, width = image.shape[2:]
# Get initial latents
init_latents = self.prepare_latents(
batch_size=batch_size,
height=height,
width=width,
generator=generator,
num_inference_steps=num_inference_steps,
dtype=dtype,
)
eta = 0.0
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# guidance scale as a float32 tensor.
# guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
# Get Image latents
latents = self.produce_img_latents(
latents=init_latents,
image=image,
text_embeddings=text_embeddings,
guidance_scale=guidance_scale,
noise_level=noise_level,
total_timesteps=self.scheduler.timesteps,
dtype=dtype,
cpu_scheduling=cpu_scheduling,
extra_step_kwargs=extra_step_kwargs,
)
# Img latents -> PIL images
all_imgs = []
for i in tqdm(range(0, latents.shape[0], batch_size)):
imgs = self.decode_latents(
latents=latents[i : i + batch_size],
use_base_vae=use_base_vae,
cpu_scheduling=cpu_scheduling,
)
all_imgs.extend(imgs)
return all_imgs

View File

@@ -1,4 +1,5 @@
import torch
import numpy as np
from transformers import CLIPTokenizer
from PIL import Image
from tqdm.auto import tqdm
@@ -6,16 +7,20 @@ import time
from typing import Union
from diffusers import (
DDIMScheduler,
DDPMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
KDPM2DiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
DEISMultistepScheduler,
)
from shark.shark_inference import SharkInference
from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.models import (
SharkifyStableDiffusionModel,
get_vae_encode,
get_vae,
get_clip,
get_unet,
@@ -26,6 +31,9 @@ from apps.stable_diffusion.src.utils import (
end_profiling,
)
SD_STATE_IDLE = "idle"
SD_STATE_CANCEL = "cancel"
class StableDiffusionPipeline:
def __init__(
@@ -38,10 +46,12 @@ class StableDiffusionPipeline:
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
KDPM2DiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
SharkEulerDiscreteScheduler,
DEISMultistepScheduler,
],
):
self.vae = vae
@@ -51,6 +61,7 @@ class StableDiffusionPipeline:
self.scheduler = scheduler
# TODO: Implement using logging python utility.
self.log = ""
self.status = SD_STATE_IDLE
def encode_prompts(self, prompts, neg_prompts, max_length):
# Tokenize text and get embeddings
@@ -104,7 +115,7 @@ class StableDiffusionPipeline:
pil_images = [Image.fromarray(image) for image in images.numpy()]
return pil_images
def produce_img_latents(
def produce_stencil_latents(
self,
latents,
text_embeddings,
@@ -112,8 +123,114 @@ class StableDiffusionPipeline:
total_timesteps,
dtype,
cpu_scheduling,
controlnet_hint=None,
controlnet=None,
controlnet_conditioning_scale: float = 1.0,
mask=None,
masked_image_latents=None,
return_all_latents=False,
):
step_time_sum = 0
latent_history = [latents]
text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
text_embeddings_numpy = text_embeddings.detach().numpy()
for i, t in tqdm(enumerate(total_timesteps)):
step_start_time = time.time()
timestep = torch.tensor([t]).to(dtype)
latent_model_input = self.scheduler.scale_model_input(latents, t)
if mask is not None and masked_image_latents is not None:
latent_model_input = torch.cat(
[
torch.from_numpy(np.asarray(latent_model_input)),
mask,
masked_image_latents,
],
dim=1,
).to(dtype)
if cpu_scheduling:
latent_model_input = latent_model_input.detach().numpy()
if not torch.is_tensor(latent_model_input):
latent_model_input_1 = torch.from_numpy(
np.asarray(latent_model_input)
).to(dtype)
else:
latent_model_input_1 = latent_model_input
control = controlnet(
"forward",
(
latent_model_input_1,
timestep,
text_embeddings,
controlnet_hint,
),
send_to_host=False,
)
timestep = timestep.detach().numpy()
# Profiling Unet.
profile_device = start_profiling(file_path="unet.rdc")
# TODO: Pass `control` as it is to Unet. Same as TODO mentioned in model_wrappers.py.
noise_pred = self.unet(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
control[0],
control[1],
control[2],
control[3],
control[4],
control[5],
control[6],
control[7],
control[8],
control[9],
control[10],
control[11],
control[12],
),
send_to_host=False,
)
end_profiling(profile_device)
if cpu_scheduling:
noise_pred = torch.from_numpy(noise_pred.to_host())
latents = self.scheduler.step(
noise_pred, t, latents
).prev_sample
else:
latents = self.scheduler.step(noise_pred, t, latents)
latent_history.append(latents)
step_time = (time.time() - step_start_time) * 1000
# self.log += (
# f"\nstep = {i} | timestep = {t} | time = {step_time:.2f}ms"
# )
step_time_sum += step_time
avg_step_time = step_time_sum / len(total_timesteps)
self.log += f"\nAverage step time: {avg_step_time}ms/it"
if not return_all_latents:
return latents
all_latents = torch.cat(latent_history, dim=0)
return all_latents
def produce_img_latents(
self,
latents,
text_embeddings,
guidance_scale,
total_timesteps,
dtype,
cpu_scheduling,
mask=None,
masked_image_latents=None,
return_all_latents=False,
):
self.status = SD_STATE_IDLE
step_time_sum = 0
latent_history = [latents]
text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
@@ -122,6 +239,15 @@ class StableDiffusionPipeline:
step_start_time = time.time()
timestep = torch.tensor([t]).to(dtype).detach().numpy()
latent_model_input = self.scheduler.scale_model_input(latents, t)
if mask is not None and masked_image_latents is not None:
latent_model_input = torch.cat(
[
torch.from_numpy(np.asarray(latent_model_input)),
mask,
masked_image_latents,
],
dim=1,
).to(dtype)
if cpu_scheduling:
latent_model_input = latent_model_input.detach().numpy()
@@ -154,6 +280,9 @@ class StableDiffusionPipeline:
# )
step_time_sum += step_time
if self.status == SD_STATE_CANCEL:
break
avg_step_time = step_time_sum / len(total_timesteps)
self.log += f"\nAverage step time: {avg_step_time}ms/it"
@@ -169,14 +298,17 @@ class StableDiffusionPipeline:
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
KDPM2DiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
SharkEulerDiscreteScheduler,
DEISMultistepScheduler,
],
import_mlir: bool,
model_id: str,
ckpt_loc: str,
custom_vae: str,
precision: str,
max_length: int,
batch_size: int,
@@ -184,13 +316,27 @@ class StableDiffusionPipeline:
width: int,
use_base_vae: bool,
use_tuned: bool,
low_cpu_mem_usage: bool = False,
debug: bool = False,
use_stencil: str = None,
use_lora: str = "",
ddpm_scheduler: DDPMScheduler = None,
use_quantize=None,
):
if import_mlir:
# TODO: Delete this when on-the-fly tuning of models works.
use_tuned = False
is_inpaint = cls.__name__ in [
"InpaintPipeline",
"OutpaintPipeline",
]
is_upscaler = cls.__name__ in ["UpscalerPipeline"]
if import_mlir or use_lora:
if not import_mlir:
print(
"Warning: LoRA provided but import_mlir not specified. Importing MLIR anyways."
)
mlir_import = SharkifyStableDiffusionModel(
model_id,
ckpt_loc,
custom_vae,
precision,
max_len=max_length,
batch_size=batch_size,
@@ -198,9 +344,89 @@ class StableDiffusionPipeline:
width=width,
use_base_vae=use_base_vae,
use_tuned=use_tuned,
low_cpu_mem_usage=low_cpu_mem_usage,
debug=debug,
is_inpaint=is_inpaint,
is_upscaler=is_upscaler,
use_stencil=use_stencil,
use_lora=use_lora,
use_quantize=use_quantize,
)
if cls.__name__ in [
"Image2ImagePipeline",
"InpaintPipeline",
"OutpaintPipeline",
]:
clip, unet, vae, vae_encode = mlir_import()
return cls(
vae_encode, vae, clip, get_tokenizer(), unet, scheduler
)
if cls.__name__ in ["StencilPipeline"]:
clip, unet, vae, controlnet = mlir_import()
return cls(
controlnet, vae, clip, get_tokenizer(), unet, scheduler
)
if cls.__name__ in ["UpscalerPipeline"]:
clip, unet, vae = mlir_import()
return cls(
vae, clip, get_tokenizer(), unet, scheduler, ddpm_scheduler
)
clip, unet, vae = mlir_import()
return cls(vae, clip, get_tokenizer(), unet, scheduler)
try:
if cls.__name__ in [
"Image2ImagePipeline",
"InpaintPipeline",
"OutpaintPipeline",
]:
return cls(
get_vae_encode(),
get_vae(),
get_clip(),
get_tokenizer(),
get_unet(),
scheduler,
)
if cls.__name__ == "StencilPipeline":
import sys
sys.exit(
"StencilPipeline not supported with SharkTank currently."
)
return cls(
get_vae(), get_clip(), get_tokenizer(), get_unet(), scheduler
)
except:
print("download pipeline failed, falling back to import_mlir")
mlir_import = SharkifyStableDiffusionModel(
model_id,
ckpt_loc,
custom_vae,
precision,
max_len=max_length,
batch_size=batch_size,
height=height,
width=width,
use_base_vae=use_base_vae,
use_tuned=use_tuned,
low_cpu_mem_usage=low_cpu_mem_usage,
is_inpaint=is_inpaint,
is_upscaler=is_upscaler,
)
if cls.__name__ in [
"Image2ImagePipeline",
"InpaintPipeline",
"OutpaintPipeline",
]:
clip, unet, vae, vae_encode = mlir_import()
return cls(
vae_encode, vae, clip, get_tokenizer(), unet, scheduler
)
if cls.__name__ == "StencilPipeline":
clip, unet, vae, controlnet = mlir_import()
return cls(
controlnet, vae, clip, get_tokenizer(), unet, scheduler
)
clip, unet, vae = mlir_import()
return cls(vae, clip, get_tokenizer(), unet, scheduler)
return cls(
get_vae(), get_clip(), get_tokenizer(), get_unet(), scheduler
)

View File

@@ -1,10 +1,13 @@
from diffusers import (
LMSDiscreteScheduler,
PNDMScheduler,
DDPMScheduler,
DDIMScheduler,
DPMSolverMultistepScheduler,
KDPM2DiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DEISMultistepScheduler,
)
from apps.stable_diffusion.src.schedulers.shark_eulerdiscrete import (
SharkEulerDiscreteScheduler,
@@ -17,6 +20,14 @@ def get_schedulers(model_id):
model_id,
subfolder="scheduler",
)
schedulers["DDPM"] = DDPMScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers["KDPM2Discrete"] = KDPM2DiscreteScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
model_id,
subfolder="scheduler",
@@ -41,6 +52,10 @@ def get_schedulers(model_id):
model_id,
subfolder="scheduler",
)
schedulers["DEISMultistep"] = DEISMultistepScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers[
"SharkEulerDiscrete"
] = SharkEulerDiscreteScheduler.from_pretrained(

View File

@@ -87,11 +87,11 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
if sys.platform == "darwin":
iree_flags.append("-iree-stream-fuse-binding=false")
if args.import_mlir:
def _import(self):
scaling_model = ScalingModel()
self.scaling_model = compile_through_fx(
scaling_model,
(example_latent, example_sigma),
model=scaling_model,
inputs=(example_latent, example_sigma),
model_name=f"euler_scale_model_input_{BATCH_SIZE}_{args.height}_{args.width}"
+ args.precision,
extra_args=iree_flags,
@@ -105,15 +105,28 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
+ args.precision,
extra_args=iree_flags,
)
if args.import_mlir:
_import(self)
else:
self.scaling_model = get_shark_model(
SCHEDULER_BUCKET,
"euler_scale_model_input_" + args.precision,
iree_flags,
)
self.step_model = get_shark_model(
SCHEDULER_BUCKET, "euler_step_" + args.precision, iree_flags
)
try:
self.scaling_model = get_shark_model(
SCHEDULER_BUCKET,
"euler_scale_model_input_" + args.precision,
iree_flags,
)
self.step_model = get_shark_model(
SCHEDULER_BUCKET,
"euler_step_" + args.precision,
iree_flags,
)
except:
print(
"failed to download model, falling back and using import_mlir"
)
args.import_mlir = True
_import(self)
def scale_model_input(self, sample, timestep):
step_index = (self.timesteps == timestep).nonzero().item()

View File

@@ -11,6 +11,10 @@ from apps.stable_diffusion.src.utils.resources import (
)
from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
from apps.stable_diffusion.src.utils.stable_args import args
from apps.stable_diffusion.src.utils.stencils.stencil_utils import (
controlnet_hint_conversion,
get_stencil_model_id,
)
from apps.stable_diffusion.src.utils.utils import (
get_shark_model,
compile_through_fx,
@@ -20,8 +24,14 @@ from apps.stable_diffusion.src.utils.utils import (
get_available_devices,
get_opt_flags,
preprocessCKPT,
fetch_or_delete_vmfbs,
fetch_vmfbs,
fetch_and_update_base_model_id,
get_path_to_diffusers_checkpoint,
sanitize_seed,
get_path_stem,
get_extended_name,
clear_all,
save_output_img,
get_generation_text_info,
update_lora_weight,
)

View File

@@ -1,6 +1,41 @@
{
"stabilityai/stable-diffusion-2-1": {
"unet": {
"clip": {
"token" : {
"shape" : [
"2*batch_size",
"max_len"
],
"dtype":"i64"
}
},
"vae_encode": {
"image" : {
"shape" : [
"1*batch_size",3,"8*height","8*width"
],
"dtype":"f32"
}
},
"vae": {
"vae": {
"latents" : {
"shape" : [
"1*batch_size",4,"height","width"
],
"dtype":"f32"
}
},
"vae_upscaler": {
"latents" : {
"shape" : [
"1*batch_size",4,"8*height","8*width"
],
"dtype":"f32"
}
}
},
"unet": {
"stabilityai/stable-diffusion-2-1": {
"latents": {
"shape": [
"1*batch_size",
@@ -29,26 +64,7 @@
"dtype": "f32"
}
},
"vae": {
"latents" : {
"shape" : [
"1*batch_size",4,"height","width"
],
"dtype":"f32"
}
},
"clip": {
"token" : {
"shape" : [
"2*batch_size",
"max_len"
],
"dtype":"i64"
}
}
},
"CompVis/stable-diffusion-v1-4": {
"unet": {
"CompVis/stable-diffusion-v1-4": {
"latents": {
"shape": [
"1*batch_size",
@@ -77,22 +93,204 @@
"dtype": "f32"
}
},
"vae": {
"latents" : {
"shape" : [
"1*batch_size",4,"height","width"
"stabilityai/stable-diffusion-2-inpainting": {
"latents": {
"shape": [
"1*batch_size",
9,
"height",
"width"
],
"dtype":"f32"
"dtype": "f32"
},
"timesteps": {
"shape": [
1
],
"dtype": "f32"
},
"embedding": {
"shape": [
"2*batch_size",
"max_len",
1024
],
"dtype": "f32"
},
"guidance_scale": {
"shape": 2,
"dtype": "f32"
}
},
"clip": {
"token" : {
"shape" : [
"2*batch_size",
"max_len"
"runwayml/stable-diffusion-inpainting": {
"latents": {
"shape": [
"1*batch_size",
9,
"height",
"width"
],
"dtype":"i64"
"dtype": "f32"
},
"timesteps": {
"shape": [
1
],
"dtype": "f32"
},
"embedding": {
"shape": [
"2*batch_size",
"max_len",
768
],
"dtype": "f32"
},
"guidance_scale": {
"shape": 2,
"dtype": "f32"
}
},
"stabilityai/stable-diffusion-x4-upscaler": {
"latents": {
"shape": [
"2*batch_size",
7,
"8*height",
"8*width"
],
"dtype": "f32"
},
"timesteps": {
"shape": [
1
],
"dtype": "f32"
},
"embedding": {
"shape": [
"2*batch_size",
"max_len",
1024
],
"dtype": "f32"
},
"noise_level": {
"shape": [2],
"dtype": "i64"
}
}
},
"stencil_adaptor": {
"latents": {
"shape": [
"1*batch_size",
4,
"height",
"width"
],
"dtype": "f32"
},
"timesteps": {
"shape": [
1
],
"dtype": "f32"
},
"embedding": {
"shape": [
"2*batch_size",
"max_len",
768
],
"dtype": "f32"
},
"controlnet_hint": {
"shape": [1, 3, "8*height", "8*width"],
"dtype": "f32"
}
},
"stencil_unet": {
"CompVis/stable-diffusion-v1-4": {
"latents": {
"shape": [
"1*batch_size",
4,
"height",
"width"
],
"dtype": "f32"
},
"timesteps": {
"shape": [
1
],
"dtype": "f32"
},
"embedding": {
"shape": [
"2*batch_size",
"max_len",
768
],
"dtype": "f32"
},
"guidance_scale": {
"shape": 2,
"dtype": "f32"
},
"control1": {
"shape": [2, 320, "height", "width"],
"dtype": "f32"
},
"control2": {
"shape": [2, 320, "height", "width"],
"dtype": "f32"
},
"control3": {
"shape": [2, 320, "height", "width"],
"dtype": "f32"
},
"control4": {
"shape": [2, 320, "height/2", "width/2"],
"dtype": "f32"
},
"control5": {
"shape": [2, 640, "height/2", "width/2"],
"dtype": "f32"
},
"control6": {
"shape": [2, 640, "height/2", "width/2"],
"dtype": "f32"
},
"control7": {
"shape": [2, 640, "height/4", "width/4"],
"dtype": "f32"
},
"control8": {
"shape": [2, 1280, "height/4", "width/4"],
"dtype": "f32"
},
"control9": {
"shape": [2, 1280, "height/4", "width/4"],
"dtype": "f32"
},
"control10": {
"shape": [2, 1280, "height/8", "width/8"],
"dtype": "f32"
},
"control11": {
"shape": [2, 1280, "height/8", "width/8"],
"dtype": "f32"
},
"control12": {
"shape": [2, 1280, "height/8", "width/8"],
"dtype": "f32"
},
"control13": {
"shape": [2, 1280, "height/8", "width/8"],
"dtype": "f32"
}
}
}
}
}
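
The shape entries in the config above are symbolic in batch_size, height, width and max_len (here height and width are latent-space sizes, i.e. the image size divided by 8). A hypothetical helper, not taken from the repo, showing how such entries could be resolved to concrete dimensions:

def resolve_shape(spec, batch_size=1, height=64, width=64, max_len=77):
    # Resolve entries such as "2*batch_size" or "height/2" to concrete ints;
    # integer entries (channel counts, literal batch dims) pass through as-is.
    env = {"batch_size": batch_size, "height": height, "width": width, "max_len": max_len}
    dims = spec if isinstance(spec, list) else [spec]
    return [d if isinstance(d, int) else int(eval(d, {"__builtins__": {}}, env)) for d in dims]

# resolve_shape(["1*batch_size", 4, "height", "width"])  -> [1, 4, 64, 64]
# resolve_shape(["2*batch_size", "max_len", 768])        -> [2, 77, 768]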

View File

@@ -3,6 +3,8 @@
"stablediffusion/v1_4":"CompVis/stable-diffusion-v1-4",
"stablediffusion/v2_1base":"stabilityai/stable-diffusion-2-1-base",
"stablediffusion/v2_1":"stabilityai/stable-diffusion-2-1",
"stablediffusion/inpaint_v1":"runwayml/stable-diffusion-inpainting",
"stablediffusion/inpaint_v2":"stabilityai/stable-diffusion-2-inpainting",
"anythingv3/v1_4":"Linaqruf/anything-v3.0",
"analogdiffusion/v1_4":"wavymulder/Analog-Diffusion",
"openjourney/v1_4":"prompthero/openjourney",

View File

@@ -18,12 +18,15 @@
"stablediffusion/v1_4/unet/fp16/length_77/tuned":"unet_8dec_fp16_tuned",
"stablediffusion/v1_4/unet/fp16/length_77/tuned/cuda":"unet_8dec_fp16_cuda_tuned",
"stablediffusion/v1_4/unet/fp32/length_77/untuned":"unet_1dec_fp32",
"stablediffusion/v1_4/unet/fp32/length_64/untuned":"unet_1_64_512_512_fp32_CompVis_stable_diffusion_v1_4",
"stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_19dec_fp16",
"stablediffusion/v1_4/vae/fp16/length_77/tuned":"vae_19dec_fp16_tuned",
"stablediffusion/v1_4/vae/fp16/length_77/tuned/cuda":"vae_19dec_fp16_cuda_tuned",
"stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
"stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1dec_fp32",
"stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1_64_512_512_fp32_CompVis_stable_diffusion_v1_4",
"stablediffusion/v1_4/vae/fp32/length_64/untuned":"vae_1_64_512_512_fp32_CompVis_stable_diffusion_v1_4",
"stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
"stablediffusion/v1_4/clip/fp32/length_64/untuned":"clip_1_64_512_512_fp32_CompVis_stable_diffusion_v1_4",
"stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
"stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
"stablediffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"unet2base_8dec_fp16_cuda_tuned",
@@ -42,41 +45,41 @@
"stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
"stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"vae2_8dec_fp16",
"stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
"anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
"anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
"anythingv3/v2_1base/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
"anythingv3/v2_1base/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
"anythingv3/v2_1base/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
"anythingv3/v2_1base/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
"anythingv3/v2_1base/vae/fp16/length_77/tuned/cuda":"av3_vae_19dec_fp16_cuda_tuned",
"anythingv3/v2_1base/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
"anythingv3/v2_1base/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
"anythingv3/v2_1base/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
"anythingv3/v2_1base/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
"analogdiffusion/v2_1base/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
"analogdiffusion/v2_1base/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
"analogdiffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"ad_unet_19dec_fp16_cuda_tuned",
"analogdiffusion/v2_1base/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
"analogdiffusion/v2_1base/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
"analogdiffusion/v2_1base/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
"analogdiffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"ad_vae_19dec_fp16_cuda_tuned",
"analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
"analogdiffusion/v2_1base/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
"analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
"analogdiffusion/v2_1base/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
"openjourney/v2_1base/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
"openjourney/v2_1base/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
"openjourney/v2_1base/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
"openjourney/v2_1base/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
"openjourney/v2_1base/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
"openjourney/v2_1base/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
"openjourney/v2_1base/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
"dreamlike/v2_1base/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
"dreamlike/v2_1base/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
"dreamlike/v2_1base/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
"dreamlike/v2_1base/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
"dreamlike/v2_1base/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
"dreamlike/v2_1base/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
"dreamlike/v2_1base/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
"anythingv3/v1_4/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
"anythingv3/v1_4/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
"anythingv3/v1_4/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
"anythingv3/v1_4/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
"anythingv3/v1_4/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
"anythingv3/v1_4/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
"anythingv3/v1_4/vae/fp16/length_77/tuned/cuda":"av3_vae_19dec_fp16_cuda_tuned",
"anythingv3/v1_4/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
"anythingv3/v1_4/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
"anythingv3/v1_4/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
"anythingv3/v1_4/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
"analogdiffusion/v1_4/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
"analogdiffusion/v1_4/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
"analogdiffusion/v1_4/unet/fp16/length_77/tuned/cuda":"ad_unet_19dec_fp16_cuda_tuned",
"analogdiffusion/v1_4/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
"analogdiffusion/v1_4/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
"analogdiffusion/v1_4/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
"analogdiffusion/v1_4/vae/fp16/length_77/tuned/cuda":"ad_vae_19dec_fp16_cuda_tuned",
"analogdiffusion/v1_4/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
"analogdiffusion/v1_4/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
"analogdiffusion/v1_4/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
"analogdiffusion/v1_4/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
"openjourney/v1_4/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
"openjourney/v1_4/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
"openjourney/v1_4/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
"openjourney/v1_4/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
"openjourney/v1_4/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
"openjourney/v1_4/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
"openjourney/v1_4/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
"dreamlike/v1_4/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
"dreamlike/v1_4/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
"dreamlike/v1_4/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
"dreamlike/v1_4/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
"dreamlike/v1_4/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
"dreamlike/v1_4/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
"dreamlike/v1_4/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
}
]

View File

@@ -45,12 +45,12 @@
"untuned": {
"fp16": {
"default_compilation_flags": [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32}))"
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-preprocessing-pad-linalg-ops{pad-size=32}))"
]
},
"fp32": {
"default_compilation_flags": [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=16}))"
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-preprocessing-pad-linalg-ops{pad-size=16}))"
]
}
}

View File

@@ -20,6 +20,22 @@ def get_device():
return device
def get_device_args():
device = get_device()
device_spec_args = []
if device == "cuda":
from shark.iree_utils.gpu_utils import get_iree_gpu_args
gpu_flags = get_iree_gpu_args()
for flag in gpu_flags:
device_spec_args.append(flag)
elif device == "vulkan":
device_spec_args.append(
f"--iree-vulkan-target-triple={args.iree_vulkan_target_triple} "
)
return device, device_spec_args
# Download the model (Unet or VAE fp16) from shark_tank
def load_model_from_tank():
from apps.stable_diffusion.src.models import (
@@ -54,28 +70,64 @@ def load_winograd_configs():
config_bucket = "gs://shark_tank/sd_tuned/configs/"
config_name = f"{args.annotation_model}_winograd_{device}.json"
full_gs_url = config_bucket + config_name
winograd_config_dir = f"{WORKDIR}configs/" + config_name
winograd_config_dir = os.path.join(WORKDIR, "configs", config_name)
print("Loading Winograd config file from ", winograd_config_dir)
download_public_file(full_gs_url, winograd_config_dir, True)
return winograd_config_dir
def load_lower_configs():
def load_lower_configs(base_model_id=None):
from apps.stable_diffusion.src.models import get_variant_version
from apps.stable_diffusion.src.utils.utils import (
fetch_and_update_base_model_id,
)
variant, version = get_variant_version(args.hf_model_id)
if not base_model_id:
if args.ckpt_loc != "":
base_model_id = fetch_and_update_base_model_id(args.ckpt_loc)
else:
base_model_id = fetch_and_update_base_model_id(args.hf_model_id)
if base_model_id == "":
base_model_id = args.hf_model_id
variant, version = get_variant_version(base_model_id)
if version == "inpaint_v1":
version = "v1_4"
elif version == "inpaint_v2":
version = "v2_1base"
config_bucket = "gs://shark_tank/sd_tuned_configs/"
device, device_spec_args = get_device_args()
spec = ""
if device_spec_args:
spec = device_spec_args[-1].split("=")[-1].strip()
if device == "vulkan":
spec = spec.split("-")[0]
config_bucket = "gs://shark_tank/sd_tuned/configs/"
config_version = version
if variant in ["anythingv3", "analogdiffusion"]:
args.max_length = 77
config_version = "v1_4"
if args.annotation_model == "vae":
args.max_length = 77
device = get_device()
config_name = f"{args.annotation_model}_{config_version}_{args.precision}_len{args.max_length}_{device}.json"
if not spec or spec in ["rdna3", "sm_80"]:
config_name = (
f"{args.annotation_model}_{args.precision}_{device}.json"
)
else:
config_name = f"{args.annotation_model}_{args.precision}_{device}_{spec}.json"
else:
if not spec or spec in ["rdna3", "sm_80"]:
if (
version in ["v2_1", "v2_1base"]
and args.height == 768
and args.width == 768
):
config_name = f"{args.annotation_model}_v2_1_768_{args.precision}_{device}.json"
else:
config_name = f"{args.annotation_model}_{version}_{args.precision}_{device}.json"
else:
config_name = f"{args.annotation_model}_{version}_{args.precision}_{device}_{spec}.json"
full_gs_url = config_bucket + config_name
lowering_config_dir = f"{WORKDIR}configs/" + config_name
lowering_config_dir = os.path.join(WORKDIR, "configs", config_name)
print("Loading lowering config file from ", lowering_config_dir)
download_public_file(full_gs_url, lowering_config_dir, True)
return lowering_config_dir
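# For illustration, the name resolution above gives, e.g.:
#   unet_v2_1base_fp16_vulkan_rdna2.json   (unet, v2_1base, fp16, vulkan, spec="rdna2")
#   unet_v2_1base_fp16_vulkan.json         (same, but spec empty or "rdna3"/"sm_80", 512x512)
#   unet_v2_1_768_fp16_vulkan.json         (v2_1/v2_1base at 768x768, default spec)
#   vae_fp16_vulkan.json                   (annotation_model == "vae", default spec)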
@@ -83,13 +135,6 @@ def load_lower_configs():
# Annotate the model with Winograd attribute on selected conv ops
def annotate_with_winograd(input_mlir, winograd_config_dir, model_name):
if model_name.split("_")[-1] != "tuned":
out_file_path = (
f"{args.annotation_output}/{model_name}_tuned_torch.mlir"
)
else:
out_file_path = f"{args.annotation_output}/{model_name}_torch.mlir"
with create_context() as ctx:
winograd_model = model_annotation(
ctx,
@@ -103,59 +148,41 @@ def annotate_with_winograd(input_mlir, winograd_config_dir, model_name):
winograd_model.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
with open(out_file_path, "w") as f:
f.write(str(winograd_model))
f.close()
return bytecode, out_file_path
if args.save_annotation:
if model_name.split("_")[-1] != "tuned":
out_file_path = os.path.join(
args.annotation_output, model_name + "_tuned_torch.mlir"
)
else:
out_file_path = os.path.join(
args.annotation_output, model_name + "_torch.mlir"
)
with open(out_file_path, "w") as f:
f.write(str(winograd_model))
f.close()
return bytecode
def dump_after_mlir(input_mlir, model_name, use_winograd):
def dump_after_mlir(input_mlir, use_winograd):
import iree.compiler as ireec
device, device_spec_args = get_device_args()
if use_winograd:
dump_after = "iree-linalg-ext-convert-conv2d-to-winograd"
preprocess_flag = (
"--iree-preprocessing-pass-pipeline='builtin.module"
"(func.func(iree-flow-detach-elementwise-from-named-ops,"
"iree-flow-convert-1x1-filter-conv2d-to-matmul,"
"iree-preprocessing-convert-conv2d-to-img2col,"
"iree-preprocessing-pad-linalg-ops{pad-size=32},"
"iree-linalg-ext-convert-conv2d-to-winograd))' "
)
preprocess_flag = "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32},iree-linalg-ext-convert-conv2d-to-winograd))"
else:
dump_after = "iree-preprocessing-pad-linalg-ops"
preprocess_flag = (
"--iree-preprocessing-pass-pipeline='builtin.module"
"(func.func(iree-flow-detach-elementwise-from-named-ops,"
"iree-flow-convert-1x1-filter-conv2d-to-matmul,"
"iree-preprocessing-convert-conv2d-to-img2col,"
"iree-preprocessing-pad-linalg-ops{pad-size=32}))' "
)
preprocess_flag = "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32}))"
device_spec_args = ""
device = get_device()
if device == "cuda":
from shark.iree_utils.gpu_utils import get_iree_gpu_args
gpu_flags = get_iree_gpu_args()
for flag in gpu_flags:
device_spec_args += flag + " "
elif device == "vulkan":
device_spec_args = (
f"--iree-vulkan-target-triple={args.iree_vulkan_target_triple} "
)
print("Applying tuned configs on", model_name)
run_cmd(
f"iree-compile {input_mlir} "
"--iree-input-type=tm_tensor "
f"--iree-hal-target-backends={iree_target_map(device)} "
f"{device_spec_args}"
f"{preprocess_flag}"
"--iree-stream-resource-index-bits=64 "
"--iree-vm-target-index-bits=64 "
f"--mlir-print-ir-after={dump_after} "
"--compile-to=flow "
f"2>{args.annotation_output}/dump_after_winograd.mlir "
dump_module = ireec.compile_str(
input_mlir,
target_backends=[iree_target_map(device)],
extra_args=device_spec_args
+ [
preprocess_flag,
"--compile-to=preprocessing",
],
)
return dump_module
# For Unet annotate the model with tuned lowering configs
@@ -163,72 +190,63 @@ def annotate_with_lower_configs(
input_mlir, lowering_config_dir, model_name, use_winograd
):
# Dump IR after padding/img2col/winograd passes
dump_after_mlir(input_mlir, model_name, use_winograd)
dump_module = dump_after_mlir(input_mlir, use_winograd)
print("Applying tuned configs on", model_name)
# Annotate the model with lowering configs in the config file
with create_context() as ctx:
tuned_model = model_annotation(
ctx,
input_contents=f"{args.annotation_output}/dump_after_winograd.mlir",
input_contents=dump_module,
config_path=lowering_config_dir,
search_op="all",
)
# Remove the intermediate mlir and save the final annotated model
os.remove(f"{args.annotation_output}/dump_after_winograd.mlir")
if model_name.split("_")[-1] != "tuned":
out_file_path = (
f"{args.annotation_output}/{model_name}_tuned_torch.mlir"
)
else:
out_file_path = f"{args.annotation_output}/{model_name}_torch.mlir"
bytecode_stream = io.BytesIO()
tuned_model.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
with open(out_file_path, "w") as f:
f.write(str(tuned_model))
f.close()
return bytecode, out_file_path
if args.save_annotation:
if model_name.split("_")[-1] != "tuned":
out_file_path = (
f"{args.annotation_output}/{model_name}_tuned_torch.mlir"
)
else:
out_file_path = f"{args.annotation_output}/{model_name}_torch.mlir"
with open(out_file_path, "w") as f:
f.write(str(tuned_model))
f.close()
return bytecode
def sd_model_annotation(mlir_model, model_name, model_from_tank=False):
def sd_model_annotation(mlir_model, model_name, base_model_id=None):
device = get_device()
if args.annotation_model == "unet" and device == "vulkan":
use_winograd = True
winograd_config_dir = load_winograd_configs()
winograd_model, model_path = annotate_with_winograd(
winograd_model = annotate_with_winograd(
mlir_model, winograd_config_dir, model_name
)
lowering_config_dir = load_lower_configs()
tuned_model, output_path = annotate_with_lower_configs(
model_path, lowering_config_dir, model_name, use_winograd
lowering_config_dir = load_lower_configs(base_model_id)
tuned_model = annotate_with_lower_configs(
winograd_model, lowering_config_dir, model_name, use_winograd
)
elif args.annotation_model == "vae" and device == "vulkan":
use_winograd = True
winograd_config_dir = load_winograd_configs()
tuned_model, output_path = annotate_with_winograd(
tuned_model = annotate_with_winograd(
mlir_model, winograd_config_dir, model_name
)
else:
use_winograd = False
if model_from_tank:
mlir_model = f"{WORKDIR}{model_name}_torch/{model_name}_torch.mlir"
else:
# Just use this function to convert bytecode to string
orig_model, model_path = annotate_with_winograd(
mlir_model, "", model_name
)
mlir_model = model_path
lowering_config_dir = load_lower_configs()
tuned_model, output_path = annotate_with_lower_configs(
lowering_config_dir = load_lower_configs(base_model_id)
tuned_model = annotate_with_lower_configs(
mlir_model, lowering_config_dir, model_name, use_winograd
)
print(f"Saved the annotated mlir in {output_path}.")
return tuned_model
if __name__ == "__main__":
mlir_model, model_name = load_model_from_tank()
sd_model_annotation(mlir_model, model_name, model_from_tank=True)
sd_model_annotation(mlir_model, model_name)

View File

@@ -1,4 +1,5 @@
import argparse
import os
from pathlib import Path
@@ -6,6 +7,13 @@ def path_expand(s):
return Path(s).expanduser().resolve()
def is_valid_file(arg):
if not os.path.exists(arg):
return None
else:
return arg
p = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
@@ -14,21 +22,33 @@ p = argparse.ArgumentParser(
### Stable Diffusion Params
##############################################################################
p.add_argument(
"-a",
"--app",
default="txt2img",
help="which app to use, one of: txt2img, img2img, outpaint, inpaint",
)
p.add_argument(
"-p",
"--prompts",
action="append",
default=[],
nargs="+",
default=["cyberpunk forest by Salvador Dali"],
help="text of which images to be generated.",
)
p.add_argument(
"--negative_prompts",
nargs="+",
default=[""],
default=["trees, green"],
help="text you don't want to see in the generated image.",
)
p.add_argument(
"--img_path",
type=str,
help="Path to the image input for img2img/inpainting",
)
p.add_argument(
"--steps",
type=int,
@@ -39,8 +59,8 @@ p.add_argument(
p.add_argument(
"--seed",
type=int,
default=42,
help="the seed to use.",
default=-1,
help="the seed to use. -1 for a random one.",
)
p.add_argument(
@@ -48,13 +68,14 @@ p.add_argument(
type=int,
default=1,
choices=range(1, 4),
help="the number of inferences to be made in a single `run`.",
help="the number of inferences to be made in a single `batch_count`.",
)
p.add_argument(
"--height",
type=int,
default=512,
choices=range(128, 769, 8),
help="the height of the output image.",
)
@@ -62,6 +83,7 @@ p.add_argument(
"--width",
type=int,
default=512,
choices=range(128, 769, 8),
help="the width of the output image.",
)
@@ -72,6 +94,13 @@ p.add_argument(
help="the value to be used for guidance scaling.",
)
p.add_argument(
"--noise_level",
type=int,
default=20,
help="the value to be used for noise level of upscaler.",
)
p.add_argument(
"--max_length",
type=int,
@@ -79,6 +108,121 @@ p.add_argument(
help="max length of the tokenizer output, options are 64 and 77.",
)
p.add_argument(
"--strength",
type=float,
default=0.8,
help="the strength of change applied on the given input image for img2img",
)
##############################################################################
### Stable Diffusion Training Params
##############################################################################
p.add_argument(
"--lora_save_dir",
type=str,
default="models/lora/",
help="Directory to save the lora fine tuned model",
)
p.add_argument(
"--training_images_dir",
type=str,
default="models/lora/training_images/",
help="Directory containing images that are an example of the prompt",
)
p.add_argument(
"--training_steps",
type=int,
default=2000,
help="The no. of steps to train",
)
##############################################################################
### Inpainting and Outpainting Params
##############################################################################
p.add_argument(
"--mask_path",
type=str,
help="Path to the mask image input for inpainting",
)
p.add_argument(
"--inpaint_full_res",
default=False,
action=argparse.BooleanOptionalAction,
help="If inpaint only masked area or whole picture",
)
p.add_argument(
"--inpaint_full_res_padding",
type=int,
default=32,
choices=range(0, 257, 4),
help="Number of pixels for only masked padding",
)
p.add_argument(
"--pixels",
type=int,
default=128,
choices=range(8, 257, 8),
help="Number of expended pixels for one direction for outpainting",
)
p.add_argument(
"--mask_blur",
type=int,
default=8,
choices=range(0, 65),
help="Number of blur pixels for outpainting",
)
p.add_argument(
"--left",
default=False,
action=argparse.BooleanOptionalAction,
help="If expend left for outpainting",
)
p.add_argument(
"--right",
default=False,
action=argparse.BooleanOptionalAction,
help="If expend right for outpainting",
)
p.add_argument(
"--top",
default=False,
action=argparse.BooleanOptionalAction,
help="If expend top for outpainting",
)
p.add_argument(
"--bottom",
default=False,
action=argparse.BooleanOptionalAction,
help="If expend bottom for outpainting",
)
p.add_argument(
"--noise_q",
type=float,
default=1.0,
help="Fall-off exponent for outpainting (lower=higher detail) (min=0.0, max=4.0)",
)
p.add_argument(
"--color_variation",
type=float,
default=0.05,
help="Color variation for outpainting (min=0.0, max=1.0)",
)
##############################################################################
### Model Config and Usage Params
##############################################################################
@@ -148,10 +292,10 @@ p.add_argument(
)
p.add_argument(
"--runs",
"--batch_count",
type=int,
default=1,
help="number of images to be generated with random seeds in single execution",
help="number of batch to be generated with random seeds in single execution",
)
p.add_argument(
@@ -161,6 +305,13 @@ p.add_argument(
help="Path to SD's .ckpt file.",
)
p.add_argument(
"--custom_vae",
type=str,
default="",
help="HuggingFace repo-id or path to SD model's checkpoint whose Vae needs to be plugged in.",
)
p.add_argument(
"--hf_model_id",
type=str,
@@ -169,10 +320,38 @@ p.add_argument(
)
p.add_argument(
"--enable_stack_trace",
"--low_cpu_mem_usage",
default=False,
action=argparse.BooleanOptionalAction,
help="Enable showing the stack trace when retrying the base model configuration",
help="Use the accelerate package to reduce cpu memory consumption",
)
p.add_argument(
"--attention_slicing",
type=str,
default="none",
help="Amount of attention slicing to use (one of 'max', 'auto', 'none', or an integer)",
)
p.add_argument(
"--use_stencil",
choices=["canny", "openpose", "scribble"],
help="Enable the stencil feature.",
)
p.add_argument(
"--use_lora",
type=str,
default="",
help="Use standalone LoRA weight using a HF ID or a checkpoint file (~3 MB)",
)
p.add_argument(
"--use_quantize",
type=str,
default="none",
help="""Runs the quantized version of stable diffusion model. This is currently in experimental phase.
Currently, only runs the stable-diffusion-2-1-base model in int8 quantization.""",
)
##############################################################################
@@ -180,7 +359,7 @@ p.add_argument(
##############################################################################
p.add_argument(
"--iree-vulkan-target-triple",
"--iree_vulkan_target_triple",
type=str,
default="",
help="Specify target triple for vulkan",
@@ -195,7 +374,7 @@ p.add_argument(
p.add_argument(
"--vulkan_large_heap_block_size",
default="4147483648",
default="2073741824",
help="flag for setting VMA preferredLargeHeapBlockSize for vulkan device, default is 4G",
)
@@ -279,11 +458,17 @@ p.add_argument(
p.add_argument(
"--write_metadata_to_png",
default=False,
default=True,
action=argparse.BooleanOptionalAction,
help="flag for whether or not to save generation information in PNG chunk text to generated images.",
)
p.add_argument(
"--import_debug",
default=False,
action=argparse.BooleanOptionalAction,
help="if import_mlir is True, saves mlir via the debug option in shark importer. Does nothing if import_mlir is false (the default)",
)
##############################################################################
### Web UI flags
##############################################################################
@@ -292,7 +477,7 @@ p.add_argument(
"--progress_bar",
default=True,
action=argparse.BooleanOptionalAction,
help="flag for removing the pregress bar animation during image generation",
help="flag for removing the progress bar animation during image generation",
)
p.add_argument(
@@ -336,10 +521,14 @@ p.add_argument(
)
p.add_argument(
"--use_winograd",
"--save_annotation",
default=False,
action=argparse.BooleanOptionalAction,
help="Apply Winograd on selected conv ops.",
help="Save annotated mlir file",
)
args, unknown = p.parse_known_args()
if args.import_debug:
os.environ["IREE_SAVE_TEMPS"] = os.path.join(
os.getcwd(), args.hf_model_id.replace("/", "_")
)

View File

@@ -0,0 +1,2 @@
from apps.stable_diffusion.src.utils.stencils.canny import CannyDetector
from apps.stable_diffusion.src.utils.stencils.openpose import OpenposeDetector

View File

@@ -0,0 +1,6 @@
import cv2
class CannyDetector:
def __call__(self, img, low_threshold, high_threshold):
return cv2.Canny(img, low_threshold, high_threshold)
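# Illustrative sketch, not part of the original file: exercising the detector
# on a blank 8-bit image with the usual Canny hysteresis thresholds.
def _canny_example():
    import numpy as np
    img = np.zeros((512, 512), dtype=np.uint8)
    # returns a single-channel uint8 edge map of shape (512, 512)
    return CannyDetector()(img, low_threshold=100, high_threshold=200)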

View File

@@ -0,0 +1,62 @@
import requests
from pathlib import Path
import torch
import numpy as np
# from annotator.util import annotator_ckpts_path
from apps.stable_diffusion.src.utils.stencils.openpose.body import Body
from apps.stable_diffusion.src.utils.stencils.openpose.hand import Hand
from apps.stable_diffusion.src.utils.stencils.openpose.openpose_util import (
draw_bodypose,
draw_handpose,
handDetect,
)
body_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/body_pose_model.pth"
hand_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/hand_pose_model.pth"
class OpenposeDetector:
def __init__(self):
cwd = Path.cwd()
ckpt_path = Path(cwd, "stencil_annotator")
ckpt_path.mkdir(parents=True, exist_ok=True)
body_modelpath = ckpt_path / "body_pose_model.pth"
hand_modelpath = ckpt_path / "hand_pose_model.pth"
if not body_modelpath.is_file():
r = requests.get(body_model_path, allow_redirects=True)
open(body_modelpath, "wb").write(r.content)
if not hand_modelpath.is_file():
r = requests.get(hand_model_path, allow_redirects=True)
open(hand_modelpath, "wb").write(r.content)
self.body_estimation = Body(body_modelpath)
self.hand_estimation = Hand(hand_modelpath)
def __call__(self, oriImg, hand=False):
oriImg = oriImg[:, :, ::-1].copy()
with torch.no_grad():
candidate, subset = self.body_estimation(oriImg)
canvas = np.zeros_like(oriImg)
canvas = draw_bodypose(canvas, candidate, subset)
if hand:
hands_list = handDetect(candidate, subset, oriImg)
all_hand_peaks = []
for x, y, w, is_left in hands_list:
peaks = self.hand_estimation(
oriImg[y : y + w, x : x + w, :]
)
peaks[:, 0] = np.where(
peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x
)
peaks[:, 1] = np.where(
peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y
)
all_hand_peaks.append(peaks)
canvas = draw_handpose(canvas, all_hand_peaks)
return canvas, dict(
candidate=candidate.tolist(), subset=subset.tolist()
)
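# Illustrative sketch, not part of the original file: `image` is assumed to be
# an HWC uint8 array of the input photo. Instantiating the detector downloads
# the two pose checkpoints on first use; it returns the rendered pose canvas
# plus the raw keypoint data.
def _openpose_example(image):
    detector = OpenposeDetector()
    canvas, pose = detector(image, hand=True)
    return canvas, pose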

View File

@@ -0,0 +1,499 @@
import cv2
import numpy as np
import math
from scipy.ndimage.filters import gaussian_filter
import torch
import torch.nn as nn
from collections import OrderedDict
from apps.stable_diffusion.src.utils.stencils.openpose.openpose_util import (
make_layers,
transfer,
padRightDownCorner,
)
class BodyPoseModel(nn.Module):
def __init__(self):
super(BodyPoseModel, self).__init__()
# these layers have no relu layer
no_relu_layers = [
"conv5_5_CPM_L1",
"conv5_5_CPM_L2",
"Mconv7_stage2_L1",
"Mconv7_stage2_L2",
"Mconv7_stage3_L1",
"Mconv7_stage3_L2",
"Mconv7_stage4_L1",
"Mconv7_stage4_L2",
"Mconv7_stage5_L1",
"Mconv7_stage5_L2",
"Mconv7_stage6_L1",
"Mconv7_stage6_L1",
]
blocks = {}
block0 = OrderedDict(
[
("conv1_1", [3, 64, 3, 1, 1]),
("conv1_2", [64, 64, 3, 1, 1]),
("pool1_stage1", [2, 2, 0]),
("conv2_1", [64, 128, 3, 1, 1]),
("conv2_2", [128, 128, 3, 1, 1]),
("pool2_stage1", [2, 2, 0]),
("conv3_1", [128, 256, 3, 1, 1]),
("conv3_2", [256, 256, 3, 1, 1]),
("conv3_3", [256, 256, 3, 1, 1]),
("conv3_4", [256, 256, 3, 1, 1]),
("pool3_stage1", [2, 2, 0]),
("conv4_1", [256, 512, 3, 1, 1]),
("conv4_2", [512, 512, 3, 1, 1]),
("conv4_3_CPM", [512, 256, 3, 1, 1]),
("conv4_4_CPM", [256, 128, 3, 1, 1]),
]
)
# Stage 1
block1_1 = OrderedDict(
[
("conv5_1_CPM_L1", [128, 128, 3, 1, 1]),
("conv5_2_CPM_L1", [128, 128, 3, 1, 1]),
("conv5_3_CPM_L1", [128, 128, 3, 1, 1]),
("conv5_4_CPM_L1", [128, 512, 1, 1, 0]),
("conv5_5_CPM_L1", [512, 38, 1, 1, 0]),
]
)
block1_2 = OrderedDict(
[
("conv5_1_CPM_L2", [128, 128, 3, 1, 1]),
("conv5_2_CPM_L2", [128, 128, 3, 1, 1]),
("conv5_3_CPM_L2", [128, 128, 3, 1, 1]),
("conv5_4_CPM_L2", [128, 512, 1, 1, 0]),
("conv5_5_CPM_L2", [512, 19, 1, 1, 0]),
]
)
blocks["block1_1"] = block1_1
blocks["block1_2"] = block1_2
self.model0 = make_layers(block0, no_relu_layers)
# Stages 2 - 6
for i in range(2, 7):
blocks["block%d_1" % i] = OrderedDict(
[
("Mconv1_stage%d_L1" % i, [185, 128, 7, 1, 3]),
("Mconv2_stage%d_L1" % i, [128, 128, 7, 1, 3]),
("Mconv3_stage%d_L1" % i, [128, 128, 7, 1, 3]),
("Mconv4_stage%d_L1" % i, [128, 128, 7, 1, 3]),
("Mconv5_stage%d_L1" % i, [128, 128, 7, 1, 3]),
("Mconv6_stage%d_L1" % i, [128, 128, 1, 1, 0]),
("Mconv7_stage%d_L1" % i, [128, 38, 1, 1, 0]),
]
)
blocks["block%d_2" % i] = OrderedDict(
[
("Mconv1_stage%d_L2" % i, [185, 128, 7, 1, 3]),
("Mconv2_stage%d_L2" % i, [128, 128, 7, 1, 3]),
("Mconv3_stage%d_L2" % i, [128, 128, 7, 1, 3]),
("Mconv4_stage%d_L2" % i, [128, 128, 7, 1, 3]),
("Mconv5_stage%d_L2" % i, [128, 128, 7, 1, 3]),
("Mconv6_stage%d_L2" % i, [128, 128, 1, 1, 0]),
("Mconv7_stage%d_L2" % i, [128, 19, 1, 1, 0]),
]
)
for k in blocks.keys():
blocks[k] = make_layers(blocks[k], no_relu_layers)
self.model1_1 = blocks["block1_1"]
self.model2_1 = blocks["block2_1"]
self.model3_1 = blocks["block3_1"]
self.model4_1 = blocks["block4_1"]
self.model5_1 = blocks["block5_1"]
self.model6_1 = blocks["block6_1"]
self.model1_2 = blocks["block1_2"]
self.model2_2 = blocks["block2_2"]
self.model3_2 = blocks["block3_2"]
self.model4_2 = blocks["block4_2"]
self.model5_2 = blocks["block5_2"]
self.model6_2 = blocks["block6_2"]
def forward(self, x):
out1 = self.model0(x)
out1_1 = self.model1_1(out1)
out1_2 = self.model1_2(out1)
out2 = torch.cat([out1_1, out1_2, out1], 1)
out2_1 = self.model2_1(out2)
out2_2 = self.model2_2(out2)
out3 = torch.cat([out2_1, out2_2, out1], 1)
out3_1 = self.model3_1(out3)
out3_2 = self.model3_2(out3)
out4 = torch.cat([out3_1, out3_2, out1], 1)
out4_1 = self.model4_1(out4)
out4_2 = self.model4_2(out4)
out5 = torch.cat([out4_1, out4_2, out1], 1)
out5_1 = self.model5_1(out5)
out5_2 = self.model5_2(out5)
out6 = torch.cat([out5_1, out5_2, out1], 1)
out6_1 = self.model6_1(out6)
out6_2 = self.model6_2(out6)
return out6_1, out6_2
class Body(object):
def __init__(self, model_path):
self.model = BodyPoseModel()
if torch.cuda.is_available():
self.model = self.model.cuda()
model_dict = transfer(self.model, torch.load(model_path))
self.model.load_state_dict(model_dict)
self.model.eval()
def __call__(self, oriImg):
scale_search = [0.5]
boxsize = 368
stride = 8
padValue = 128
thre1 = 0.1
thre2 = 0.05
multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
for m in range(len(multiplier)):
scale = multiplier[m]
imageToTest = cv2.resize(
oriImg,
(0, 0),
fx=scale,
fy=scale,
interpolation=cv2.INTER_CUBIC,
)
imageToTest_padded, pad = padRightDownCorner(
imageToTest, stride, padValue
)
im = (
np.transpose(
np.float32(imageToTest_padded[:, :, :, np.newaxis]),
(3, 2, 0, 1),
)
/ 256
- 0.5
)
im = np.ascontiguousarray(im)
data = torch.from_numpy(im).float()
if torch.cuda.is_available():
data = data.cuda()
with torch.no_grad():
Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
# extract outputs, resize, and remove padding
heatmap = np.transpose(
np.squeeze(Mconv7_stage6_L2), (1, 2, 0)
) # output 1 is heatmaps
heatmap = cv2.resize(
heatmap,
(0, 0),
fx=stride,
fy=stride,
interpolation=cv2.INTER_CUBIC,
)
heatmap = heatmap[
: imageToTest_padded.shape[0] - pad[2],
: imageToTest_padded.shape[1] - pad[3],
:,
]
heatmap = cv2.resize(
heatmap,
(oriImg.shape[1], oriImg.shape[0]),
interpolation=cv2.INTER_CUBIC,
)
# paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs
paf = np.transpose(
np.squeeze(Mconv7_stage6_L1), (1, 2, 0)
) # output 0 is PAFs
paf = cv2.resize(
paf,
(0, 0),
fx=stride,
fy=stride,
interpolation=cv2.INTER_CUBIC,
)
paf = paf[
: imageToTest_padded.shape[0] - pad[2],
: imageToTest_padded.shape[1] - pad[3],
:,
]
paf = cv2.resize(
paf,
(oriImg.shape[1], oriImg.shape[0]),
interpolation=cv2.INTER_CUBIC,
)
heatmap_avg += heatmap_avg + heatmap / len(multiplier)
paf_avg += +paf / len(multiplier)
all_peaks = []
peak_counter = 0
for part in range(18):
map_ori = heatmap_avg[:, :, part]
one_heatmap = gaussian_filter(map_ori, sigma=3)
map_left = np.zeros(one_heatmap.shape)
map_left[1:, :] = one_heatmap[:-1, :]
map_right = np.zeros(one_heatmap.shape)
map_right[:-1, :] = one_heatmap[1:, :]
map_up = np.zeros(one_heatmap.shape)
map_up[:, 1:] = one_heatmap[:, :-1]
map_down = np.zeros(one_heatmap.shape)
map_down[:, :-1] = one_heatmap[:, 1:]
peaks_binary = np.logical_and.reduce(
(
one_heatmap >= map_left,
one_heatmap >= map_right,
one_heatmap >= map_up,
one_heatmap >= map_down,
one_heatmap > thre1,
)
)
peaks = list(
zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])
) # note reverse
peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
peak_id = range(peak_counter, peak_counter + len(peaks))
peaks_with_score_and_id = [
peaks_with_score[i] + (peak_id[i],)
for i in range(len(peak_id))
]
all_peaks.append(peaks_with_score_and_id)
peak_counter += len(peaks)
# find connection in the specified sequence, center 29 is in the position 15
limbSeq = [
[2, 3],
[2, 6],
[3, 4],
[4, 5],
[6, 7],
[7, 8],
[2, 9],
[9, 10],
[10, 11],
[2, 12],
[12, 13],
[13, 14],
[2, 1],
[1, 15],
[15, 17],
[1, 16],
[16, 18],
[3, 17],
[6, 18],
]
# the middle joints heatmap correspondence
mapIdx = [
[31, 32],
[39, 40],
[33, 34],
[35, 36],
[41, 42],
[43, 44],
[19, 20],
[21, 22],
[23, 24],
[25, 26],
[27, 28],
[29, 30],
[47, 48],
[49, 50],
[53, 54],
[51, 52],
[55, 56],
[37, 38],
[45, 46],
]
connection_all = []
special_k = []
mid_num = 10
for k in range(len(mapIdx)):
score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
candA = all_peaks[limbSeq[k][0] - 1]
candB = all_peaks[limbSeq[k][1] - 1]
nA = len(candA)
nB = len(candB)
indexA, indexB = limbSeq[k]
if nA != 0 and nB != 0:
connection_candidate = []
for i in range(nA):
for j in range(nB):
vec = np.subtract(candB[j][:2], candA[i][:2])
norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
norm = max(0.001, norm)
vec = np.divide(vec, norm)
startend = list(
zip(
np.linspace(
candA[i][0], candB[j][0], num=mid_num
),
np.linspace(
candA[i][1], candB[j][1], num=mid_num
),
)
)
vec_x = np.array(
[
score_mid[
int(round(startend[I][1])),
int(round(startend[I][0])),
0,
]
for I in range(len(startend))
]
)
vec_y = np.array(
[
score_mid[
int(round(startend[I][1])),
int(round(startend[I][0])),
1,
]
for I in range(len(startend))
]
)
score_midpts = np.multiply(
vec_x, vec[0]
) + np.multiply(vec_y, vec[1])
score_with_dist_prior = sum(score_midpts) / len(
score_midpts
) + min(0.5 * oriImg.shape[0] / norm - 1, 0)
criterion1 = len(
np.nonzero(score_midpts > thre2)[0]
) > 0.8 * len(score_midpts)
criterion2 = score_with_dist_prior > 0
if criterion1 and criterion2:
connection_candidate.append(
[
i,
j,
score_with_dist_prior,
score_with_dist_prior
+ candA[i][2]
+ candB[j][2],
]
)
connection_candidate = sorted(
connection_candidate, key=lambda x: x[2], reverse=True
)
connection = np.zeros((0, 5))
for c in range(len(connection_candidate)):
i, j, s = connection_candidate[c][0:3]
if i not in connection[:, 3] and j not in connection[:, 4]:
connection = np.vstack(
[connection, [candA[i][3], candB[j][3], s, i, j]]
)
if len(connection) >= min(nA, nB):
break
connection_all.append(connection)
else:
special_k.append(k)
connection_all.append([])
# last number in each row is the total parts number of that person
# the second last number in each row is the score of the overall configuration
subset = -1 * np.ones((0, 20))
candidate = np.array(
[item for sublist in all_peaks for item in sublist]
)
for k in range(len(mapIdx)):
if k not in special_k:
partAs = connection_all[k][:, 0]
partBs = connection_all[k][:, 1]
indexA, indexB = np.array(limbSeq[k]) - 1
for i in range(len(connection_all[k])): # = 1:size(temp,1)
found = 0
subset_idx = [-1, -1]
for j in range(len(subset)): # 1:size(subset,1):
if (
subset[j][indexA] == partAs[i]
or subset[j][indexB] == partBs[i]
):
subset_idx[found] = j
found += 1
if found == 1:
j = subset_idx[0]
if subset[j][indexB] != partBs[i]:
subset[j][indexB] = partBs[i]
subset[j][-1] += 1
subset[j][-2] += (
candidate[partBs[i].astype(int), 2]
+ connection_all[k][i][2]
)
elif found == 2: # if found 2 and disjoint, merge them
j1, j2 = subset_idx
membership = (
(subset[j1] >= 0).astype(int)
+ (subset[j2] >= 0).astype(int)
)[:-2]
if len(np.nonzero(membership == 2)[0]) == 0: # merge
subset[j1][:-2] += subset[j2][:-2] + 1
subset[j1][-2:] += subset[j2][-2:]
subset[j1][-2] += connection_all[k][i][2]
subset = np.delete(subset, j2, 0)
else: # as like found == 1
subset[j1][indexB] = partBs[i]
subset[j1][-1] += 1
subset[j1][-2] += (
candidate[partBs[i].astype(int), 2]
+ connection_all[k][i][2]
)
# if no partA is found in the subset, create a new subset
elif not found and k < 17:
row = -1 * np.ones(20)
row[indexA] = partAs[i]
row[indexB] = partBs[i]
row[-1] = 2
row[-2] = (
sum(
candidate[
connection_all[k][i, :2].astype(int), 2
]
)
+ connection_all[k][i][2]
)
subset = np.vstack([subset, row])
# delete rows of subset that have too few parts
deleteIdx = []
for i in range(len(subset)):
if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
deleteIdx.append(i)
subset = np.delete(subset, deleteIdx, axis=0)
# candidate: x, y, score, id
return candidate, subset
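For orientation, a minimal usage sketch of the body estimator (the checkpoint path and input image are placeholders, and the import assumes `draw_bodypose` lives in the shared `openpose_util` file shown later in this diff):

```python
# Sketch only: the checkpoint path and image are hypothetical.
import cv2
from apps.stable_diffusion.src.utils.stencils.openpose.openpose_util import (
    draw_bodypose,
)

body_estimation = Body("body_pose_model.pth")  # hypothetical checkpoint path
ori_img = cv2.imread("person.jpg")             # BGR image, as OpenCV loads it
candidate, subset = body_estimation(ori_img)   # keypoint candidates + per-person assignments
canvas = draw_bodypose(ori_img.copy(), candidate, subset)
cv2.imwrite("pose_preview.png", canvas)
```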


@@ -0,0 +1,205 @@
import cv2
import numpy as np
from scipy.ndimage.filters import gaussian_filter
import torch
import torch.nn as nn
from skimage.measure import label
from collections import OrderedDict
from apps.stable_diffusion.src.utils.stencils.openpose.openpose_util import (
make_layers,
transfer,
padRightDownCorner,
npmax,
)
class HandPoseModel(nn.Module):
def __init__(self):
super(HandPoseModel, self).__init__()
# these layers have no relu layer
no_relu_layers = [
"conv6_2_CPM",
"Mconv7_stage2",
"Mconv7_stage3",
"Mconv7_stage4",
"Mconv7_stage5",
"Mconv7_stage6",
]
# stage 1
block1_0 = OrderedDict(
[
("conv1_1", [3, 64, 3, 1, 1]),
("conv1_2", [64, 64, 3, 1, 1]),
("pool1_stage1", [2, 2, 0]),
("conv2_1", [64, 128, 3, 1, 1]),
("conv2_2", [128, 128, 3, 1, 1]),
("pool2_stage1", [2, 2, 0]),
("conv3_1", [128, 256, 3, 1, 1]),
("conv3_2", [256, 256, 3, 1, 1]),
("conv3_3", [256, 256, 3, 1, 1]),
("conv3_4", [256, 256, 3, 1, 1]),
("pool3_stage1", [2, 2, 0]),
("conv4_1", [256, 512, 3, 1, 1]),
("conv4_2", [512, 512, 3, 1, 1]),
("conv4_3", [512, 512, 3, 1, 1]),
("conv4_4", [512, 512, 3, 1, 1]),
("conv5_1", [512, 512, 3, 1, 1]),
("conv5_2", [512, 512, 3, 1, 1]),
("conv5_3_CPM", [512, 128, 3, 1, 1]),
]
)
block1_1 = OrderedDict(
[
("conv6_1_CPM", [128, 512, 1, 1, 0]),
("conv6_2_CPM", [512, 22, 1, 1, 0]),
]
)
blocks = {}
blocks["block1_0"] = block1_0
blocks["block1_1"] = block1_1
# stage 2-6
for i in range(2, 7):
blocks["block%d" % i] = OrderedDict(
[
("Mconv1_stage%d" % i, [150, 128, 7, 1, 3]),
("Mconv2_stage%d" % i, [128, 128, 7, 1, 3]),
("Mconv3_stage%d" % i, [128, 128, 7, 1, 3]),
("Mconv4_stage%d" % i, [128, 128, 7, 1, 3]),
("Mconv5_stage%d" % i, [128, 128, 7, 1, 3]),
("Mconv6_stage%d" % i, [128, 128, 1, 1, 0]),
("Mconv7_stage%d" % i, [128, 22, 1, 1, 0]),
]
)
for k in blocks.keys():
blocks[k] = make_layers(blocks[k], no_relu_layers)
self.model1_0 = blocks["block1_0"]
self.model1_1 = blocks["block1_1"]
self.model2 = blocks["block2"]
self.model3 = blocks["block3"]
self.model4 = blocks["block4"]
self.model5 = blocks["block5"]
self.model6 = blocks["block6"]
def forward(self, x):
out1_0 = self.model1_0(x)
out1_1 = self.model1_1(out1_0)
concat_stage2 = torch.cat([out1_1, out1_0], 1)
out_stage2 = self.model2(concat_stage2)
concat_stage3 = torch.cat([out_stage2, out1_0], 1)
out_stage3 = self.model3(concat_stage3)
concat_stage4 = torch.cat([out_stage3, out1_0], 1)
out_stage4 = self.model4(concat_stage4)
concat_stage5 = torch.cat([out_stage4, out1_0], 1)
out_stage5 = self.model5(concat_stage5)
concat_stage6 = torch.cat([out_stage5, out1_0], 1)
out_stage6 = self.model6(concat_stage6)
return out_stage6
class Hand(object):
def __init__(self, model_path):
self.model = HandPoseModel()
if torch.cuda.is_available():
self.model = self.model.cuda()
model_dict = transfer(self.model, torch.load(model_path))
self.model.load_state_dict(model_dict)
self.model.eval()
def __call__(self, oriImg):
scale_search = [0.5, 1.0, 1.5, 2.0]
# scale_search = [0.5]
boxsize = 368
stride = 8
padValue = 128
thre = 0.05
multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 22))
# paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
for m in range(len(multiplier)):
scale = multiplier[m]
imageToTest = cv2.resize(
oriImg,
(0, 0),
fx=scale,
fy=scale,
interpolation=cv2.INTER_CUBIC,
)
imageToTest_padded, pad = padRightDownCorner(
imageToTest, stride, padValue
)
im = (
np.transpose(
np.float32(imageToTest_padded[:, :, :, np.newaxis]),
(3, 2, 0, 1),
)
/ 256
- 0.5
)
im = np.ascontiguousarray(im)
data = torch.from_numpy(im).float()
if torch.cuda.is_available():
data = data.cuda()
# data = data.permute([2, 0, 1]).unsqueeze(0).float()
with torch.no_grad():
output = self.model(data).cpu().numpy()
# output = self.model(data).numpy()
# extract outputs, resize, and remove padding
heatmap = np.transpose(
np.squeeze(output), (1, 2, 0)
) # output 1 is heatmaps
heatmap = cv2.resize(
heatmap,
(0, 0),
fx=stride,
fy=stride,
interpolation=cv2.INTER_CUBIC,
)
heatmap = heatmap[
: imageToTest_padded.shape[0] - pad[2],
: imageToTest_padded.shape[1] - pad[3],
:,
]
heatmap = cv2.resize(
heatmap,
(oriImg.shape[1], oriImg.shape[0]),
interpolation=cv2.INTER_CUBIC,
)
heatmap_avg += heatmap / len(multiplier)
all_peaks = []
for part in range(21):
map_ori = heatmap_avg[:, :, part]
one_heatmap = gaussian_filter(map_ori, sigma=3)
binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
# all values are below the threshold
if np.sum(binary) == 0:
all_peaks.append([0, 0])
continue
label_img, label_numbers = label(
binary, return_num=True, connectivity=binary.ndim
)
max_index = (
np.argmax(
[
np.sum(map_ori[label_img == i])
for i in range(1, label_numbers + 1)
]
)
+ 1
)
label_img[label_img != max_index] = 0
map_ori[label_img == 0] = 0
y, x = npmax(map_ori)
all_peaks.append([x, y])
return np.array(all_peaks)
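A hedged sketch of how the hand estimator is usually chained after the body estimator (checkpoint paths are placeholders, `Body` is assumed importable from the body-pose file above, and `handDetect` comes from the utility file below):

```python
# Sketch only: checkpoint paths are hypothetical.
import cv2
import numpy as np
from apps.stable_diffusion.src.utils.stencils.openpose.openpose_util import handDetect

body_estimation = Body("body_pose_model.pth")
hand_estimation = Hand("hand_pose_model.pth")
ori_img = cv2.imread("person.jpg")

candidate, subset = body_estimation(ori_img)
all_hand_peaks = []
for x, y, w, is_left in handDetect(candidate, subset, ori_img):
    # run the hand model on the square crop proposed by handDetect
    peaks = hand_estimation(ori_img[y : y + w, x : x + w, :])
    # shift peaks back into full-image coordinates; (0, 0) marks an undetected point
    peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x)
    peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y)
    all_hand_peaks.append(peaks)
```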


@@ -0,0 +1,272 @@
import math
import numpy as np
import matplotlib
import cv2
from collections import OrderedDict
import torch.nn as nn
def make_layers(block, no_relu_layers):
layers = []
for layer_name, v in block.items():
if "pool" in layer_name:
layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], padding=v[2])
layers.append((layer_name, layer))
else:
conv2d = nn.Conv2d(
in_channels=v[0],
out_channels=v[1],
kernel_size=v[2],
stride=v[3],
padding=v[4],
)
layers.append((layer_name, conv2d))
if layer_name not in no_relu_layers:
layers.append(("relu_" + layer_name, nn.ReLU(inplace=True)))
return nn.Sequential(OrderedDict(layers))
def padRightDownCorner(img, stride, padValue):
h = img.shape[0]
w = img.shape[1]
pad = 4 * [None]
pad[0] = 0 # up
pad[1] = 0 # left
pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
img_padded = img
pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1))
img_padded = np.concatenate((pad_up, img_padded), axis=0)
pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1))
img_padded = np.concatenate((pad_left, img_padded), axis=1)
pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1))
img_padded = np.concatenate((img_padded, pad_down), axis=0)
pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1))
img_padded = np.concatenate((img_padded, pad_right), axis=1)
return img_padded, pad
# transfer caffe model weights to pytorch, matching the layer names
def transfer(model, model_weights):
transfered_model_weights = {}
for weights_name in model.state_dict().keys():
transfered_model_weights[weights_name] = model_weights[
".".join(weights_name.split(".")[1:])
]
return transfered_model_weights
# draw the body keypoints and limbs
def draw_bodypose(canvas, candidate, subset):
stickwidth = 4
limbSeq = [
[2, 3],
[2, 6],
[3, 4],
[4, 5],
[6, 7],
[7, 8],
[2, 9],
[9, 10],
[10, 11],
[2, 12],
[12, 13],
[13, 14],
[2, 1],
[1, 15],
[15, 17],
[1, 16],
[16, 18],
[3, 17],
[6, 18],
]
colors = [
[255, 0, 0],
[255, 85, 0],
[255, 170, 0],
[255, 255, 0],
[170, 255, 0],
[85, 255, 0],
[0, 255, 0],
[0, 255, 85],
[0, 255, 170],
[0, 255, 255],
[0, 170, 255],
[0, 85, 255],
[0, 0, 255],
[85, 0, 255],
[170, 0, 255],
[255, 0, 255],
[255, 0, 170],
[255, 0, 85],
]
for i in range(18):
for n in range(len(subset)):
index = int(subset[n][i])
if index == -1:
continue
x, y = candidate[index][0:2]
cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1)
for i in range(17):
for n in range(len(subset)):
index = subset[n][np.array(limbSeq[i]) - 1]
if -1 in index:
continue
cur_canvas = canvas.copy()
Y = candidate[index.astype(int), 0]
X = candidate[index.astype(int), 1]
mX = np.mean(X)
mY = np.mean(Y)
length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
polygon = cv2.ellipse2Poly(
(int(mY), int(mX)),
(int(length / 2), stickwidth),
int(angle),
0,
360,
1,
)
cv2.fillConvexPoly(cur_canvas, polygon, colors[i])
canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)
return canvas
# image drawn by opencv is not good.
def draw_handpose(canvas, all_hand_peaks, show_number=False):
edges = [
[0, 1],
[1, 2],
[2, 3],
[3, 4],
[0, 5],
[5, 6],
[6, 7],
[7, 8],
[0, 9],
[9, 10],
[10, 11],
[11, 12],
[0, 13],
[13, 14],
[14, 15],
[15, 16],
[0, 17],
[17, 18],
[18, 19],
[19, 20],
]
for peaks in all_hand_peaks:
for ie, e in enumerate(edges):
if np.sum(np.all(peaks[e], axis=1) == 0) == 0:
x1, y1 = peaks[e[0]]
x2, y2 = peaks[e[1]]
cv2.line(
canvas,
(x1, y1),
(x2, y2),
matplotlib.colors.hsv_to_rgb(
[ie / float(len(edges)), 1.0, 1.0]
)
* 255,
thickness=2,
)
for i, keypoint in enumerate(peaks):
x, y = keypoint
cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
if show_number:
cv2.putText(
canvas,
str(i),
(x, y),
cv2.FONT_HERSHEY_SIMPLEX,
0.3,
(0, 0, 0),
lineType=cv2.LINE_AA,
)
return canvas
# detect hand according to body pose keypoints
# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
def handDetect(candidate, subset, oriImg):
# right hand: wrist 4, elbow 3, shoulder 2
# left hand: wrist 7, elbow 6, shoulder 5
ratioWristElbow = 0.33
detect_result = []
image_height, image_width = oriImg.shape[0:2]
for person in subset.astype(int):
# a hand is usable only if all three of its keypoints were detected
has_left = np.sum(person[[5, 6, 7]] == -1) == 0
has_right = np.sum(person[[2, 3, 4]] == -1) == 0
if not (has_left or has_right):
continue
hands = []
# left hand
if has_left:
left_shoulder_index, left_elbow_index, left_wrist_index = person[
[5, 6, 7]
]
x1, y1 = candidate[left_shoulder_index][:2]
x2, y2 = candidate[left_elbow_index][:2]
x3, y3 = candidate[left_wrist_index][:2]
hands.append([x1, y1, x2, y2, x3, y3, True])
# right hand
if has_right:
(
right_shoulder_index,
right_elbow_index,
right_wrist_index,
) = person[[2, 3, 4]]
x1, y1 = candidate[right_shoulder_index][:2]
x2, y2 = candidate[right_elbow_index][:2]
x3, y3 = candidate[right_wrist_index][:2]
hands.append([x1, y1, x2, y2, x3, y3, False])
for x1, y1, x2, y2, x3, y3, is_left in hands:
x = x3 + ratioWristElbow * (x3 - x2)
y = y3 + ratioWristElbow * (y3 - y2)
distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
# x-y refers to the center --> offset to topLeft point
x -= width / 2
y -= width / 2 # width = height
# overflow the image
if x < 0:
x = 0
if y < 0:
y = 0
width1 = width
width2 = width
if x + width > image_width:
width1 = image_width - x
if y + width > image_height:
width2 = image_height - y
width = min(width1, width2)
# the minimum accepted hand box width is 20 pixels
if width >= 20:
detect_result.append([int(x), int(y), int(width), is_left])
"""
return value: [[x, y, w, True if left hand else False]].
width=height since the network require squared input.
x, y is the coordinate of top left
"""
return detect_result
# get the (row, column) index of the maximum of a 2d array
def npmax(array):
arrayindex = array.argmax(1)
arrayvalue = array.max(1)
i = arrayvalue.argmax()
j = arrayindex[i]
return i, j
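A quick illustration of `padRightDownCorner` above (a sketch using a synthetic array):

```python
# Sketch: pad a synthetic 100x130 BGR image so both sides become multiples of stride 8.
import numpy as np

img = np.zeros((100, 130, 3), dtype=np.uint8)
img_padded, pad = padRightDownCorner(img, 8, 128)
print(img_padded.shape)  # (104, 136, 3): 4 rows and 6 columns of padValue appended
print(pad)               # [0, 0, 4, 6]: up, left, down, right padding
```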


@@ -0,0 +1,186 @@
import numpy as np
from PIL import Image
import torch
from apps.stable_diffusion.src.utils.stencils import (
CannyDetector,
OpenposeDetector,
)
stencil = {}
def HWC3(x):
assert x.dtype == np.uint8
if x.ndim == 2:
x = x[:, :, None]
assert x.ndim == 3
H, W, C = x.shape
assert C == 1 or C == 3 or C == 4
if C == 3:
return x
if C == 1:
return np.concatenate([x, x, x], axis=2)
if C == 4:
color = x[:, :, 0:3].astype(np.float32)
alpha = x[:, :, 3:4].astype(np.float32) / 255.0
y = color * alpha + 255.0 * (1.0 - alpha)
y = y.clip(0, 255).astype(np.uint8)
return y
def controlnet_hint_shaping(
controlnet_hint, height, width, dtype, num_images_per_prompt=1
):
channels = 3
if isinstance(controlnet_hint, torch.Tensor):
# torch.Tensor: acceptable shapes are any of chw, bchw (b == 1) or bchw (b == num_images_per_prompt)
shape_chw = (channels, height, width)
shape_bchw = (1, channels, height, width)
shape_nchw = (num_images_per_prompt, channels, height, width)
if controlnet_hint.shape in [shape_chw, shape_bchw, shape_nchw]:
controlnet_hint = controlnet_hint.to(
dtype=dtype, device=torch.device("cpu")
)
if controlnet_hint.shape != shape_nchw:
controlnet_hint = controlnet_hint.repeat(
num_images_per_prompt, 1, 1, 1
)
return controlnet_hint
else:
raise ValueError(
f"Acceptble shape of `stencil` are any of ({channels}, {height}, {width}),"
+ f" (1, {channels}, {height}, {width}) or ({num_images_per_prompt}, "
+ f"{channels}, {height}, {width}) but is {controlnet_hint.shape}"
)
elif isinstance(controlnet_hint, np.ndarray):
# np.ndarray: acceptable shapes are any of hw, hwc, bhwc (b == 1) or bhwc (b == num_images_per_prompt)
# hwc is an OpenCV-compatible image format; the color channels must be in BGR order.
if controlnet_hint.shape == (height, width):
controlnet_hint = np.repeat(
controlnet_hint[:, :, np.newaxis], channels, axis=2
) # hw -> hwc(c==3)
shape_hwc = (height, width, channels)
shape_bhwc = (1, height, width, channels)
shape_nhwc = (num_images_per_prompt, height, width, channels)
if controlnet_hint.shape in [shape_hwc, shape_bhwc, shape_nhwc]:
controlnet_hint = torch.from_numpy(controlnet_hint.copy())
controlnet_hint = controlnet_hint.to(
dtype=dtype, device=torch.device("cpu")
)
controlnet_hint /= 255.0
if controlnet_hint.shape != shape_nhwc:
controlnet_hint = controlnet_hint.repeat(
num_images_per_prompt, 1, 1, 1
)
controlnet_hint = controlnet_hint.permute(
0, 3, 1, 2
) # b h w c -> b c h w
return controlnet_hint
else:
raise ValueError(
f"Acceptble shape of `stencil` are any of ({width}, {channels}), "
+ f"({height}, {width}, {channels}), "
+ f"(1, {height}, {width}, {channels}) or "
+ f"({num_images_per_prompt}, {channels}, {height}, {width}) but is {controlnet_hint.shape}"
)
elif isinstance(controlnet_hint, Image.Image):
if controlnet_hint.size == (width, height):
controlnet_hint = controlnet_hint.convert(
"RGB"
) # make sure 3 channel RGB format
controlnet_hint = np.array(controlnet_hint) # to numpy
controlnet_hint = controlnet_hint[:, :, ::-1] # RGB -> BGR
return controlnet_hint_shaping(
controlnet_hint, height, width, dtype, num_images_per_prompt
)
else:
raise ValueError(
f"Acceptable image size of `stencil` is ({width}, {height}) but is {controlnet_hint.size}"
)
else:
raise ValueError(
f"Acceptable type of `stencil` are any of torch.Tensor, np.ndarray, PIL.Image.Image but is {type(controlnet_hint)}"
)
def controlnet_hint_conversion(
image, use_stencil, height, width, dtype, num_images_per_prompt=1
):
controlnet_hint = None
match use_stencil:
case "canny":
print("Detecting edge with canny")
controlnet_hint = hint_canny(image)
case "openpose":
print("Detecting human pose")
controlnet_hint = hint_openpose(image)
case "scribble":
print("Working with scribble")
controlnet_hint = hint_scribble(image)
case _:
return None
controlnet_hint = controlnet_hint_shaping(
controlnet_hint, height, width, dtype, num_images_per_prompt
)
return controlnet_hint
stencil_to_model_id_map = {
"canny": "lllyasviel/sd-controlnet-canny",
"depth": "lllyasviel/sd-controlnet-depth",
"hed": "lllyasviel/sd-controlnet-hed",
"mlsd": "lllyasviel/sd-controlnet-mlsd",
"normal": "lllyasviel/sd-controlnet-normal",
"openpose": "lllyasviel/sd-controlnet-openpose",
"scribble": "lllyasviel/sd-controlnet-scribble",
"seg": "lllyasviel/sd-controlnet-seg",
}
def get_stencil_model_id(use_stencil):
if use_stencil in stencil_to_model_id_map:
return stencil_to_model_id_map[use_stencil]
return None
# Stencil 1. Canny
def hint_canny(
image: Image.Image,
low_threshold=100,
high_threshold=200,
):
with torch.no_grad():
input_image = np.array(image)
if not "canny" in stencil:
stencil["canny"] = CannyDetector()
detected_map = stencil["canny"](
input_image, low_threshold, high_threshold
)
detected_map = HWC3(detected_map)
return detected_map
# Stencil 2. OpenPose.
def hint_openpose(
image: Image.Image,
):
with torch.no_grad():
input_image = np.array(image)
if not "openpose" in stencil:
stencil["openpose"] = OpenposeDetector()
detected_map, _ = stencil["openpose"](input_image)
detected_map = HWC3(detected_map)
return detected_map
# Stencil 3. Scribble.
def hint_scribble(image: Image.Image):
with torch.no_grad():
input_image = np.array(image)
detected_map = np.zeros_like(input_image, dtype=np.uint8)
detected_map[np.min(input_image, axis=2) < 127] = 255
return detected_map
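A minimal usage sketch of the conversion path above (the input path and the 512x512 size are placeholders):

```python
# Sketch only: the input image path is hypothetical.
import torch
from PIL import Image

image = Image.open("input.png").convert("RGB").resize((512, 512))
hint = controlnet_hint_conversion(
    image, use_stencil="canny", height=512, width=512, dtype=torch.float32
)
# For a supported stencil this yields a (1, 3, 512, 512) tensor ready to be used as
# ControlNet conditioning; unsupported stencil names return None.
print(None if hint is None else hint.shape)
```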


@@ -1,9 +1,16 @@
import os
import gc
import json
import re
from PIL import PngImagePlugin
from datetime import datetime as dt
from csv import DictWriter
from pathlib import Path
import numpy as np
from random import randint
import tempfile
import torch
from safetensors.torch import load_file
from shark.shark_inference import SharkInference
from shark.shark_importer import import_with_fx
from shark.iree_utils.vulkan_utils import (
@@ -14,26 +21,26 @@ from shark.iree_utils.gpu_utils import get_cuda_sm_cc
from apps.stable_diffusion.src.utils.stable_args import args
from apps.stable_diffusion.src.utils.resources import opt_flags
from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
import sys, functools, operator
import sys
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
load_pipeline_from_original_stable_diffusion_ckpt,
download_from_original_stable_diffusion_ckpt,
)
def get_vmfb_path_name(model_name):
device = (
args.device
if "://" not in args.device
else "-".join(args.device.split("://"))
)
def get_extended_name(model_name):
device = args.device.split("://", 1)[0]
extended_name = "{}_{}".format(model_name, device)
vmfb_path = os.path.join(os.getcwd(), extended_name + ".vmfb")
return [vmfb_path, extended_name]
return extended_name
def get_vmfb_path_name(model_name):
vmfb_path = os.path.join(os.getcwd(), model_name + ".vmfb")
return vmfb_path
def _compile_module(shark_module, model_name, extra_args=[]):
if args.load_vmfb or args.save_vmfb:
[vmfb_path, extended_name] = get_vmfb_path_name(model_name)
vmfb_path = get_vmfb_path_name(model_name)
if args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb:
print(f"loading existing vmfb from: {vmfb_path}")
shark_module.load_module(vmfb_path, extra_args=extra_args)
@@ -47,7 +54,7 @@ def _compile_module(shark_module, model_name, extra_args=[]):
)
)
path = shark_module.save_module(
os.getcwd(), extended_name, extra_args
os.getcwd(), model_name, extra_args
)
shark_module.load_module(path, extra_args=extra_args)
else:
@@ -73,7 +80,7 @@ def get_shark_model(tank_url, model_name, extra_args=[]):
frontend="torch",
)
shark_module = SharkInference(
mlir_model, device=args.device, mlir_dialect="linalg"
mlir_model, device=args.device, mlir_dialect="tm_tensor"
)
return _compile_module(shark_module, model_name, extra_args)
@@ -86,38 +93,59 @@ def compile_through_fx(
is_f16=False,
f16_input_mask=None,
use_tuned=False,
save_dir=tempfile.gettempdir(),
debug=False,
generate_vmfb=True,
extra_args=[],
base_model_id=None,
):
from shark.parser import shark_args
if "cuda" in args.device:
shark_args.enable_tf32 = True
mlir_module, func_name = import_with_fx(
model, inputs, is_f16, f16_input_mask
(
mlir_module,
func_name,
) = import_with_fx(
model=model,
inputs=inputs,
is_f16=is_f16,
f16_input_mask=f16_input_mask,
debug=debug,
model_name=model_name,
save_dir=save_dir,
)
if use_tuned:
if "vae" in model_name.split("_")[0]:
args.annotation_model = "vae"
mlir_module = sd_model_annotation(mlir_module, model_name)
mlir_module = sd_model_annotation(
mlir_module, model_name, base_model_id
)
shark_module = SharkInference(
mlir_module,
device=args.device,
mlir_dialect="linalg",
mlir_dialect="tm_tensor",
)
if generate_vmfb:
shark_module = SharkInference(
mlir_module,
device=args.device,
mlir_dialect="tm_tensor",
)
del mlir_module
gc.collect()
return _compile_module(shark_module, model_name, extra_args)
del mlir_module
gc.collect()
return _compile_module(shark_module, model_name, extra_args)
def set_iree_runtime_flags():
vulkan_runtime_flags = [
f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
f"--device_allocator=caching",
f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
]
if args.enable_rgp:
@@ -232,24 +260,43 @@ def set_init_device_flags():
args.max_length = 64
# Use tuned models in the case of fp16, vulkan rdna3 or cuda sm devices.
if args.ckpt_loc != "":
base_model_id = fetch_and_update_base_model_id(args.ckpt_loc)
else:
base_model_id = fetch_and_update_base_model_id(args.hf_model_id)
if base_model_id == "":
base_model_id = args.hf_model_id
if (
args.hf_model_id == "prompthero/openjourney"
or args.ckpt_loc != ""
or args.precision != "fp16"
or args.height != 512
or args.width != 512
args.precision != "fp16"
or args.height not in [512, 768]
or (args.height == 512 and args.width != 512)
or (args.height == 768 and args.width != 768)
or args.batch_size != 1
or ("vulkan" not in args.device and "cuda" not in args.device)
):
args.use_tuned = False
elif (
"vulkan" in args.device
and "rdna3" not in args.iree_vulkan_target_triple
elif base_model_id not in [
"Linaqruf/anything-v3.0",
"dreamlike-art/dreamlike-diffusion-1.0",
"prompthero/openjourney",
"wavymulder/Analog-Diffusion",
"stabilityai/stable-diffusion-2-1",
"stabilityai/stable-diffusion-2-1-base",
"CompVis/stable-diffusion-v1-4",
"runwayml/stable-diffusion-v1-5",
"runwayml/stable-diffusion-inpainting",
"stabilityai/stable-diffusion-2-inpainting",
]:
args.use_tuned = False
elif "vulkan" in args.device and not any(
x in args.iree_vulkan_target_triple for x in ["rdna2", "rdna3"]
):
args.use_tuned = False
elif "cuda" in args.device and get_cuda_sm_cc() not in ["sm_80"]:
elif "cuda" in args.device and get_cuda_sm_cc() not in ["sm_80", "sm_89"]:
args.use_tuned = False
elif args.use_base_vae and args.hf_model_id not in [
@@ -258,8 +305,22 @@ def set_init_device_flags():
]:
args.use_tuned = False
elif (
args.height == 768
and args.width == 768
and (
base_model_id
not in [
"stabilityai/stable-diffusion-2-1",
"stabilityai/stable-diffusion-2-1-base",
]
or "rdna3" not in args.iree_vulkan_target_triple
)
):
args.use_tuned = False
if args.use_tuned:
print(f"Using tuned models for {args.hf_model_id}/fp16/{args.device}.")
print(f"Using tuned models for {base_model_id}/fp16/{args.device}.")
else:
print("Tuned models are currently not supported for this setting.")
@@ -281,6 +342,27 @@ def set_init_device_flags():
elif args.height != 512 or args.width != 512 or args.batch_size != 1:
args.import_mlir = True
elif args.use_tuned and args.hf_model_id in [
"dreamlike-art/dreamlike-diffusion-1.0",
"prompthero/openjourney",
"stabilityai/stable-diffusion-2-1",
]:
args.import_mlir = True
elif (
args.use_tuned
and "vulkan" in args.device
and "rdna2" in args.iree_vulkan_target_triple
):
args.import_mlir = True
elif (
args.use_tuned
and "cuda" in args.device
and get_cuda_sm_cc() == "sm_89"
):
args.import_mlir = True
# Utility to get list of devices available.
def get_available_devices():
@@ -306,7 +388,7 @@ def get_available_devices():
available_devices.extend(vulkan_devices)
cuda_devices = get_devices_by_name("cuda")
available_devices.extend(cuda_devices)
available_devices.append("cpu")
available_devices.append("device => cpu")
return available_devices
@@ -355,6 +437,11 @@ def get_opt_flags(model, precision="fp16"):
return iree_flags
def get_path_stem(path):
path = Path(path)
return path.stem
def get_path_to_diffusers_checkpoint(custom_weights):
path = Path(custom_weights)
diffusers_path = path.parent.absolute()
@@ -365,7 +452,7 @@ def get_path_to_diffusers_checkpoint(custom_weights):
return path_to_diffusers
def preprocessCKPT(custom_weights):
def preprocessCKPT(custom_weights, is_inpaint=False):
path_to_diffusers = get_path_to_diffusers_checkpoint(custom_weights)
if next(Path(path_to_diffusers).iterdir(), None):
print("Checkpoint already loaded at : ", path_to_diffusers)
@@ -386,17 +473,129 @@ def preprocessCKPT(custom_weights):
print(
"Loading diffusers' pipeline from original stable diffusion checkpoint"
)
pipe = load_pipeline_from_original_stable_diffusion_ckpt(
num_in_channels = 9 if is_inpaint else 4
pipe = download_from_original_stable_diffusion_ckpt(
checkpoint_path=custom_weights,
extract_ema=extract_ema,
from_safetensors=from_safetensors,
num_in_channels=num_in_channels,
)
pipe.save_pretrained(path_to_diffusers)
print("Loading complete")
def processLoRA(model, use_lora, splitting_prefix):
state_dict = ""
if ".safetensors" in use_lora:
state_dict = load_file(use_lora)
else:
state_dict = torch.load(use_lora)
alpha = 0.75
visited = []
# directly update weight in model
process_unet = "te" not in splitting_prefix
for key in state_dict:
if ".alpha" in key or key in visited:
continue
curr_layer = model
if ("text" not in key and process_unet) or (
"text" in key and not process_unet
):
layer_infos = (
key.split(".")[0].split(splitting_prefix)[-1].split("_")
)
else:
continue
# find the target layer
temp_name = layer_infos.pop(0)
while len(layer_infos) > -1:
try:
curr_layer = curr_layer.__getattr__(temp_name)
if len(layer_infos) > 0:
temp_name = layer_infos.pop(0)
elif len(layer_infos) == 0:
break
except Exception:
if len(temp_name) > 0:
temp_name += "_" + layer_infos.pop(0)
else:
temp_name = layer_infos.pop(0)
pair_keys = []
if "lora_down" in key:
pair_keys.append(key.replace("lora_down", "lora_up"))
pair_keys.append(key)
else:
pair_keys.append(key)
pair_keys.append(key.replace("lora_up", "lora_down"))
# update weight
if len(state_dict[pair_keys[0]].shape) == 4:
weight_up = (
state_dict[pair_keys[0]]
.squeeze(3)
.squeeze(2)
.to(torch.float32)
)
weight_down = (
state_dict[pair_keys[1]]
.squeeze(3)
.squeeze(2)
.to(torch.float32)
)
curr_layer.weight.data += alpha * torch.mm(
weight_up, weight_down
).unsqueeze(2).unsqueeze(3)
else:
weight_up = state_dict[pair_keys[0]].to(torch.float32)
weight_down = state_dict[pair_keys[1]].to(torch.float32)
curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down)
# update visited list
for item in pair_keys:
visited.append(item)
return model
def update_lora_weight_for_unet(unet, use_lora):
extensions = [".bin", ".safetensors", ".pt"]
if not any([extension in use_lora for extension in extensions]):
# We assume it is a HF model ID pointing to standalone LoRA weights.
unet.load_attn_procs(use_lora)
return unet
main_file_name = get_path_stem(use_lora)
if ".bin" in use_lora:
main_file_name += ".bin"
elif ".safetensors" in use_lora:
main_file_name += ".safetensors"
elif ".pt" in use_lora:
main_file_name += ".pt"
else:
sys.exit("Only .bin and .safetensors format for LoRA is supported")
try:
dir_name = os.path.dirname(use_lora)
unet.load_attn_procs(dir_name, weight_name=main_file_name)
return unet
except Exception:
return processLoRA(unet, use_lora, "lora_unet_")
def update_lora_weight(model, use_lora, model_name):
if "unet" in model_name:
return update_lora_weight_for_unet(model, use_lora)
try:
return processLoRA(model, use_lora, "lora_te_")
except Exception:
return None
def load_vmfb(vmfb_path, model, precision):
model = "vae" if "base_vae" in model else model
model = "vae" if "base_vae" in model or "vae_encode" in model else model
model = "unet" if "stencil" in model else model
precision = "fp32" if "clip" in model else precision
extra_args = get_opt_flags(model, precision)
shark_module = SharkInference(mlir_module=None, device=args.device)
@@ -404,24 +603,23 @@ def load_vmfb(vmfb_path, model, precision):
return shark_module
# This utility returns vmfbs of Clip, Unet and Vae, in case all three of them
# are present; deletes them otherwise.
def fetch_or_delete_vmfbs(basic_model_name, use_base_vae, precision="fp32"):
model_name = ["clip", "unet", "base_vae" if use_base_vae else "vae"]
# This utility returns vmfbs of sub-models of the SD pipeline, if present.
def fetch_vmfbs(extended_model_name, precision="fp32"):
vmfb_path = [
get_vmfb_path_name(model + basic_model_name)[0] for model in model_name
get_vmfb_path_name(extended_model_name[model])
for model in extended_model_name
]
number_of_vmfbs = len(vmfb_path)
vmfb_present = [os.path.isfile(vmfb) for vmfb in vmfb_path]
all_vmfb_present = functools.reduce(operator.__and__, vmfb_present)
compiled_models = [None] * 3
# We need to delete vmfbs only if some of the models were compiled.
if not all_vmfb_present:
for i in range(len(vmfb_path)):
if vmfb_present[i]:
os.remove(vmfb_path[i])
print("Deleted: ", vmfb_path[i])
else:
for i in range(len(vmfb_path)):
all_vmfb_present = True
compiled_models = [None] * number_of_vmfbs
for i in range(number_of_vmfbs):
all_vmfb_present = all_vmfb_present and vmfb_present[i]
model_name = [model for model in extended_model_name.keys()]
for i in range(number_of_vmfbs):
if vmfb_present[i]:
compiled_models[i] = load_vmfb(
vmfb_path[i], model_name[i], precision
)
@@ -459,3 +657,108 @@ def sanitize_seed(seed):
if seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
return seed
# clear all the cached objects to recompile cleanly.
def clear_all():
print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
from glob import glob
import shutil
vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
for vmfb in vmfbs:
if os.path.exists(vmfb):
os.remove(vmfb)
# Temporary workaround: delete yaml files to incorporate diffusers' pipeline.
# TODO: Remove this once we have better weight update logic.
inference_yaml = ["v2-inference-v.yaml", "v1-inference.yaml"]
for yaml in inference_yaml:
if os.path.exists(yaml):
os.remove(yaml)
home = os.path.expanduser("~")
if os.name == "nt": # Windows
appdata = os.getenv("LOCALAPPDATA")
shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
elif os.name == "unix":
shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
shutil.rmtree(os.path.join(home, ".local/shark_tank"))
# save output images and the inputs corresponding to them.
def save_output_img(output_img, img_seed, extra_info={}):
output_path = args.output_dir if args.output_dir else Path.cwd()
generated_imgs_path = Path(
output_path, "generated_imgs", dt.now().strftime("%Y%m%d")
)
generated_imgs_path.mkdir(parents=True, exist_ok=True)
csv_path = Path(generated_imgs_path, "imgs_details.csv")
prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[0][:15])
out_img_name = (
f"{prompt_slice}_{img_seed}_{dt.now().strftime('%y%m%d_%H%M%S')}"
)
img_model = args.hf_model_id
if args.ckpt_loc:
img_model = Path(os.path.basename(args.ckpt_loc)).stem
if args.output_img_format == "jpg":
out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
output_img.save(out_img_path, quality=95, subsampling=0)
else:
out_img_path = Path(generated_imgs_path, f"{out_img_name}.png")
pngInfo = PngImagePlugin.PngInfo()
if args.write_metadata_to_png:
pngInfo.add_text(
"parameters",
f"{args.prompts[0]}\nNegative prompt: {args.negative_prompts[0]}\nSteps:{args.steps}, Sampler: {args.scheduler}, CFG scale: {args.guidance_scale}, Seed: {img_seed}, Size: {args.width}x{args.height}, Model: {img_model}",
)
output_img.save(out_img_path, "PNG", pnginfo=pngInfo)
if args.output_img_format not in ["png", "jpg"]:
print(
f"[ERROR] Format {args.output_img_format} is not supported yet."
"Image saved as png instead. Supported formats: png / jpg"
)
new_entry = {
"VARIANT": img_model,
"SCHEDULER": args.scheduler,
"PROMPT": args.prompts[0],
"NEG_PROMPT": args.negative_prompts[0],
"SEED": img_seed,
"CFG_SCALE": args.guidance_scale,
"PRECISION": args.precision,
"STEPS": args.steps,
"HEIGHT": args.height,
"WIDTH": args.width,
"MAX_LENGTH": args.max_length,
"OUTPUT": out_img_path,
}
new_entry.update(extra_info)
with open(csv_path, "a", encoding="utf-8") as csv_obj:
dictwriter_obj = DictWriter(csv_obj, fieldnames=list(new_entry.keys()))
dictwriter_obj.writerow(new_entry)
if args.save_metadata_to_json:
del new_entry["OUTPUT"]
json_path = Path(generated_imgs_path, f"{out_img_name}.json")
with open(json_path, "w") as f:
json.dump(new_entry, f, indent=4)
def get_generation_text_info(seeds, device):
text_output = f"prompt={args.prompts}"
text_output += f"\nnegative prompt={args.negative_prompts}"
text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
text_output += f"\nscheduler={args.scheduler}, device={device}"
text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={seeds}"
text_output += f"\nsize={args.height}x{args.width}, batch_count={args.batch_count}, batch_size={args.batch_size}, max_length={args.max_length}"
return text_output
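For context, a small sketch of the vmfb naming convention the caching path relies on (assuming `args.device` is set to `"vulkan://0"`; the model name is illustrative):

```python
# Sketch: the device suffix drops everything after "://", so a UNet compiled for
# "vulkan://0" is cached in the current working directory as "unet_fp16_vulkan.vmfb".
extended_name = get_extended_name("unet_fp16")  # -> "unet_fp16_vulkan"
vmfb_path = get_vmfb_path_name(extended_name)   # -> "<cwd>/unet_fp16_vulkan.vmfb"
```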


@@ -1,70 +0,0 @@
# Stable Diffusion optimized for AMD RDNA2/RDNA3 GPUs
Before you start, please be aware that this is beta software that relies on a special AMD driver. Like all StableDiffusion GUIs published so far, you need some technical expertise to set it up. We apologize in advance if you bump into issues. If that happens, please don't hesitate to ask our Discord community for help! Please be assured that we (Nod and AMD) are working hard to improve the user experience in coming months.
If it works well for you, please "star" the following GitHub projects... this is one of the best ways to help and spread the word!
* https://github.com/nod-ai/SHARK
* https://github.com/iree-org/iree
## Install these specific AMD drivers (the latest AMD drivers may not have all the fixes).
### AMD KB Drivers for RDNA2 and RDNA3:
*AMD Software: Adrenalin Edition 22.11.1 for MLIR/IREE Driver Version 22.20.29.09 for Windows® 10 and Windows® 11 (Windows Driver Store Version 31.0.12029.9003)*
First, for RDNA2 users, download this special driver in a folder of your choice. We recommend you keep the installation files around, since you may need to re-install it later, if Windows Update decides to overwrite it:
https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mlir-iree
For RDNA3, the latest driver 23.1.2 supports MLIR/IREE as well: https://www.amd.com/en/support/kb/release-notes/rn-rad-win-23-1-2-kb
KNOWN ISSUES with this special AMD driver:
* `Windows Update` may (depending how it's configured) automatically install a new official AMD driver that overwrites this IREE-specific driver. If Stable Diffusion used to work, then a few days later, it slows down a lot or produces incorrect results (e.g. black images), this may be the cause. To fix this problem, please check the installed driver version, and re-install the special driver if needed. (TODO: document how to prevent this `Windows Update` behavior!)
* Some people using this special driver experience mouse pointer accuracy issues, especially if using a larger-than-default mouse pointer. The clicked point isn't centered properly. One possible work-around is to reset the pointer size to "1" in "Change pointer size and color".
## Installation
Download the latest Windows SHARK SD binary [492 here](https://github.com/nod-ai/SHARK/releases/download/20230203.492/shark_sd_20230203_492.exe) into a folder of your choice. If you want nightly builds, you can look for them on the GitHub releases page.
Notes:
* We recommend that you download this EXE into a new folder whenever you download a new EXE version. If you download it into the same folder as a previous install, you must delete the old `*.vmfb` files. Those contain Vulkan dispatches compiled from MLIR, which can become outdated if you run a new EXE from the same folder. You can use the `--clear_all` flag once to clean up all the old files.
* If you recently updated the driver or this binary (EXE file), we recommend you:
* clear all the local artifacts with `--clear_all` OR
* clear the Vulkan shader cache: For Windows users this can be done by clearing the contents of `C:\Users\%username%\AppData\Local\AMD\VkCache\`. On Linux the same cache is typically located at `~/.cache/AMD/VkCache/`.
* clear the `huggingface` cache. In Windows, this is `C:\Users\%username%\.cache\huggingface`.
## Running
* Open a Command Prompt or Powershell terminal, change folder (`cd`) to the .exe folder. Then run the EXE from the command prompt. That way, if an error occurs, you'll be able to cut-and-paste it to ask for help. (if it always works for you without error, you may simply double-click the EXE to start the web browser)
* The first run may take about 10-15 minutes when the models are downloaded and compiled. Your patience is appreciated. The download could be about 5GB.
* If successful, you will likely see a Windows Defender message asking you to give permission to open a web server port. Accept it.
* Open a browser to access the Stable Diffusion web server. By default, the port is 8080, so you can go to http://localhost:8080/?__theme=dark.
## Stopping
* Select the command prompt that's running the EXE. Press CTRL-C and wait a moment. The application should stop.
* Please make sure to do the above step before you attempt to update the EXE to a new version.
# Results
<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
Here are some samples generated:
![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
The output on a 7900XTX would look like:
```shell
Stats for run 0:
Average step time: 47.19188690185547ms/it
Clip Inference time (ms) = 109.531
VAE Inference time (ms): 78.590
Total image generation time: 2.5788655281066895sec
```
Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware.


@@ -1,209 +0,0 @@
/* Overwrite the Gradio default theme with their .dark theme declarations */
:root {
--color-focus-primary: var(--color-grey-700);
--color-focus-secondary: var(--color-grey-600);
--color-focus-ring: rgb(55 65 81);
--color-background-primary: var(--color-grey-950);
--color-background-secondary: var(--color-grey-900);
--color-background-tertiary: var(--color-grey-800);
--color-text-body: var(--color-grey-100);
--color-text-label: var(--color-grey-200);
--color-text-placeholder: var(--color-grey);
--color-text-subdued: var(--color-grey-400);
--color-text-link-base: var(--color-blue-500);
--color-text-link-hover: var(--color-blue-400);
--color-text-link-visited: var(--color-blue-600);
--color-text-link-active: var(--color-blue-500);
--color-text-code-background: var(--color-grey-800);
--color-text-code-border: color.border-primary;
--color-border-primary: var(--color-grey-700);
--color-border-secondary: var(--color-grey-600);
--color-border-highlight: var(--color-accent-base);
--color-accent-base: var(--color-orange-500);
--color-accent-light: var(--color-orange-300);
--color-accent-dark: var(--color-orange-700);
--color-functional-error-base: var(--color-red-400);
--color-functional-error-subdued: var(--color-red-300);
--color-functional-error-background: var(--color-background-primary);
--color-functional-info-base: var(--color-yellow);
--color-functional-info-subdued: var(--color-yellow-300);
--color-functional-success-base: var(--color-green);
--color-functional-success-subdued: var(--color-green-300);
--shadow-spread: 2px;
--api-background: linear-gradient(to bottom, rgba(255, 216, 180, .05), transparent);
--api-pill-background: var(--color-orange-400);
--api-pill-border: var(--color-orange-600);
--api-pill-text: var(--color-orange-900);
--block-border-color: var(--color-border-primary);
--block-background: var(--color-background-tertiary);
--uploadable-border-color-hover: var(--color-border-primary);
--uploadable-border-color-loaded: var(--color-functional-success);
--uploadable-text-color: var(--color-text-subdued);
--block_label-border-color: var(--color-border-primary);
--block_label-icon-color: var(--color-text-label);
--block_label-shadow: var(--shadow-drop);
--block_label-background: var(--color-background-secondary);
--icon_button-icon-color-base: var(--color-text-label);
--icon_button-icon-color-hover: var(--color-text-label);
--icon_button-background-base: var(--color-background-primary);
--icon_button-background-hover: var(--color-background-primary);
--icon_button-border-color-base: var(--color-background-primary);
--icon_button-border-color-hover: var(--color-border-secondary);
--input-text-color: var(--color-text-body);
--input-border-color-base: var(--color-border-primary);
--input-border-color-hover: var(--color-border-primary);
--input-border-color-focus: var(--color-border-primary);
--input-background-base: var(--color-background-tertiary);
--input-background-hover: var(--color-background-tertiary);
--input-background-focus: var(--color-background-tertiary);
--input-shadow: var(--shadow-inset);
--checkbox-border-color-base: var(--color-border-primary);
--checkbox-border-color-hover: var(--color-focus-primary);
--checkbox-border-color-focus: var(--color-blue-500);
--checkbox-background-base: var(--color-background-primary);
--checkbox-background-hover: var(--color-background-primary);
--checkbox-background-focus: var(--color-background-primary);
--checkbox-background-selected: var(--color-blue-600);
--checkbox-label-border-color-base: var(--color-border-primary);
--checkbox-label-border-color-hover: var(--color-border-primary);
--checkbox-label-border-color-focus: var(--color-border-secondary);
--checkbox-label-background-base: linear-gradient(to top, var(--color-grey-900), var(--color-grey-800));
--checkbox-label-background-hover: linear-gradient(to top, var(--color-grey-900), var(--color-grey-800));
--checkbox-label-background-focus: linear-gradient(to top, var(--color-grey-900), var(--color-grey-800));
--form-seperator-color: var(--color-border-primary);
--button-primary-border-color-base: var(--color-orange-600);
--button-primary-border-color-hover: var(--color-orange-600);
--button-primary-border-color-focus: var(--color-orange-600);
--button-primary-text-color-base: white;
--button-primary-text-color-hover: white;
--button-primary-text-color-focus: white;
--button-primary-background-base: linear-gradient(to bottom right, var(--color-orange-700), var(--color-orange-700));
--button-primary-background-hover: linear-gradient(to bottom right, var(--color-orange-700), var(--color-orange-500));
--button-primary-background-focus: linear-gradient(to bottom right, var(--color-orange-700), var(--color-orange-500));
--button-secondary-border-color-base: var(--color-grey-600);
--button-secondary-border-color-hover: var(--color-grey-600);
--button-secondary-border-color-focus: var(--color-grey-600);
--button-secondary-text-color-base: white;
--button-secondary-text-color-hover: white;
--button-secondary-text-color-focus: white;
--button-secondary-background-base: linear-gradient(to bottom right, var(--color-grey-600), var(--color-grey-700));
--button-secondary-background-hover: linear-gradient(to bottom right, var(--color-grey-600), var(--color-grey-600));
--button-secondary-background-focus: linear-gradient(to bottom right, var(--color-grey-600), var(--color-grey-600));
--button-cancel-border-color-base: var(--color-red-600);
--button-cancel-border-color-hover: var(--color-red-600);
--button-cancel-border-color-focus: var(--color-red-600);
--button-cancel-text-color-base: white;
--button-cancel-text-color-hover: white;
--button-cancel-text-color-focus: white;
--button-cancel-background-base: linear-gradient(to bottom right, var(--color-red-700), var(--color-red-700));
--button-cancel-background-focus: linear-gradient(to bottom right, var(--color-red-700), var(--color-red-500));
--button-cancel-background-hover: linear-gradient(to bottom right, var(--color-red-700), var(--color-red-500));
--button-plain-border-color-base: var(--color-grey-600);
--button-plain-border-color-hover: var(--color-grey-500);
--button-plain-border-color-focus: var(--color-grey-500);
--button-plain-text-color-base: var(--color-text-body);
--button-plain-text-color-hover: var(--color-text-body);
--button-plain-text-color-focus: var(--color-text-body);
--button-plain-background-base: var(--color-grey-700);
--button-plain-background-hover: var(--color-grey-700);
--button-plain-background-focus: var(--color-grey-700);
--gallery-label-background-base: var(--color-grey-50);
--gallery-label-background-hover: var(--color-grey-50);
--gallery-label-border-color-base: var(--color-border-primary);
--gallery-label-border-color-hover: var(--color-border-primary);
--gallery-thumb-background-base: var(--color-grey-900);
--gallery-thumb-background-hover: var(--color-grey-900);
--gallery-thumb-border-color-base: var(--color-border-primary);
--gallery-thumb-border-color-hover: var(--color-accent-base);
--gallery-thumb-border-color-focus: var(--color-blue-500);
--gallery-thumb-border-color-selected: var(--color-accent-base);
--chatbot-border-border-color-base: transparent;
--chatbot-border-border-color-latest: transparent;
--chatbot-user-background-base: ;
--chatbot-user-background-latest: ;
--chatbot-user-text-color-base: white;
--chatbot-user-text-color-latest: white;
--chatbot-bot-background-base: ;
--chatbot-bot-background-latest: ;
--chatbot-bot-text-color-base: white;
--chatbot-bot-text-color-latest: white;
--label-gradient-from: var(--color-orange-400);
--label-gradient-to: var(--color-orange-600);
--table-odd-background: var(--color-grey-900);
--table-even-background: var(--color-grey-950);
--table-background-edit: transparent;
--dataset-gallery-background-base: var(--color-background-primary);
--dataset-gallery-background-hover: var(--color-grey-800);
--dataset-dataframe-border-base: var(--color-border-primary);
--dataset-dataframe-border-hover: var(--color-border-secondary);
--dataset-table-background-base: transparent;
--dataset-table-background-hover: var(--color-grey-700);
--dataset-table-border-base: var(--color-grey-800);
--dataset-table-border-hover: var(--color-grey-800);
}
/* SHARK theme customization */
.gradio-container {
background-color: var(--color-background-primary);
}
.container {
background-color: black !important;
padding-top: 20px !important;
}
#ui_title {
padding: 10px !important;
}
#top_logo {
background-color: transparent;
border-radius: 0 !important;
border: 0;
}
#demo_title {
background-color: var(--color-background-primary);
border-radius: 0 !important;
border: 0;
padding-top: 15px;
padding-bottom: 0px;
width: 350px !important;
}
#demo_title_outer {
border-radius: 0;
}
#prompt_box_outer div:first-child {
border-radius: 0 !important
}
#prompt_box textarea {
background-color: var(--color-background-primary) !important;
}
#prompt_examples {
margin: 0 !important;
}
#prompt_examples svg {
display: none !important;
}
#ui_body {
background-color: var(--color-background-secondary) !important;
padding: 10px !important;
border-radius: 0.5em !important;
}
#img_result+div {
display: none !important;
}
footer {
display: none !important;
}


@@ -1,14 +1,28 @@
import os
import sys
from pathlib import Path
import glob
if "AMD_ENABLE_LLPC" not in os.environ:
os.environ["AMD_ENABLE_LLPC"] = "1"
import transformers
if sys.platform == "darwin":
os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
import gradio as gr
import apps.stable_diffusion.web.utils.global_obj as global_obj
from apps.stable_diffusion.src import args, clear_all
from apps.stable_diffusion.web.utils.gradio_configs import (
clear_gradio_tmp_imgs_folder,
)
from apps.stable_diffusion.web.ui.utils import get_custom_model_path
# Clear all gradio tmp images from the last session
clear_gradio_tmp_imgs_folder()
# Create the custom model folders if they don't already exist
model_dirs = ["models", "vae", "lora"]
for root in model_dirs:
get_custom_model_path(root).mkdir(parents=True, exist_ok=True)
if args.clear_all:
clear_all()
def resource_path(relative_path):
"""Get absolute path to resource, works for dev and for PyInstaller"""
@@ -18,245 +32,176 @@ def resource_path(relative_path):
return os.path.join(base_path, relative_path)
import gradio as gr
from PIL import Image
from apps.stable_diffusion.src import (
prompt_examples,
args,
get_available_devices,
dark_theme = resource_path("ui/css/sd_dark_theme.css")
from apps.stable_diffusion.web.ui import (
txt2img_web,
txt2img_gallery,
txt2img_sendto_img2img,
txt2img_sendto_inpaint,
txt2img_sendto_outpaint,
txt2img_sendto_upscaler,
img2img_web,
img2img_gallery,
img2img_init_image,
img2img_sendto_inpaint,
img2img_sendto_outpaint,
img2img_sendto_upscaler,
inpaint_web,
inpaint_gallery,
inpaint_init_image,
inpaint_sendto_img2img,
inpaint_sendto_outpaint,
inpaint_sendto_upscaler,
outpaint_web,
outpaint_gallery,
outpaint_init_image,
outpaint_sendto_img2img,
outpaint_sendto_inpaint,
outpaint_sendto_upscaler,
upscaler_web,
upscaler_gallery,
upscaler_init_image,
upscaler_sendto_img2img,
upscaler_sendto_inpaint,
upscaler_sendto_outpaint,
lora_train_web,
)
from apps.stable_diffusion.scripts import txt2img_inf
nodlogo_loc = resource_path("logos/nod-logo.png")
sdlogo_loc = resource_path("logos/sd-demo-logo.png")
# init global sd pipeline and config
global_obj._init()
demo_css = resource_path("css/sd_dark_theme.css")
def register_button_click(button, selectedid, inputs, outputs):
button.click(
lambda x: (
x[0]["name"] if len(x) != 0 else None,
gr.Tabs.update(selected=selectedid),
),
inputs,
outputs,
)
with gr.Blocks(title="Stable Diffusion", css=demo_css) as shark_web:
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
logo2 = Image.open(sdlogo_loc)
with gr.Row():
with gr.Column(scale=1, elem_id="demo_title_outer"):
gr.Image(
value=nod_logo,
show_label=False,
interactive=False,
elem_id="top_logo",
).style(width=150, height=100)
with gr.Column(scale=5, elem_id="demo_title_outer"):
gr.Image(
value=logo2,
show_label=False,
interactive=False,
elem_id="demo_title",
).style(width=150, height=100)
with gr.Blocks(
css=dark_theme, analytics_enabled=False, title="Stable Diffusion"
) as sd_web:
with gr.Tabs() as tabs:
with gr.TabItem(label="Text-to-Image", id=0):
txt2img_web.render()
with gr.TabItem(label="Image-to-Image", id=1):
img2img_web.render()
with gr.TabItem(label="Inpainting", id=2):
inpaint_web.render()
with gr.TabItem(label="Outpainting", id=3):
outpaint_web.render()
with gr.TabItem(label="Upscaler", id=4):
upscaler_web.render()
with gr.Row(elem_id="ui_body"):
with gr.Row():
with gr.Column(scale=1, min_width=600):
with gr.Row():
ckpt_path = (
Path(args.ckpt_dir)
if args.ckpt_dir
else Path(Path.cwd(), "models")
)
ckpt_path.mkdir(parents=True, exist_ok=True)
types = (
"*.ckpt",
"*.safetensors",
) # the tuple of file types
ckpt_files = ["None"]
for extn in types:
files = glob.glob(os.path.join(ckpt_path, extn))
ckpt_files.extend(files)
custom_model = gr.Dropdown(
label=f"Models (Custom Model path: {ckpt_path})",
value="None",
choices=ckpt_files
+ [
"Linaqruf/anything-v3.0",
"prompthero/openjourney",
"wavymulder/Analog-Diffusion",
"stabilityai/stable-diffusion-2-1",
"stabilityai/stable-diffusion-2-1-base",
"CompVis/stable-diffusion-v1-4",
],
)
hf_model_id = gr.Textbox(
placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3",
value="",
label="HuggingFace Model ID",
)
with gr.Tabs(visible=False) as experimental_tabs:
with gr.TabItem(label="LoRA Training", id=5):
lora_train_web.render()
with gr.Group(elem_id="prompt_box_outer"):
prompt = gr.Textbox(
label="Prompt",
value="cyberpunk forest by Salvador Dali",
lines=1,
elem_id="prompt_box",
)
negative_prompt = gr.Textbox(
label="Negative Prompt",
value="trees, green",
lines=1,
elem_id="prompt_box",
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
scheduler = gr.Dropdown(
label="Scheduler",
value="SharkEulerDiscrete",
choices=[
"DDIM",
"PNDM",
"LMSDiscrete",
"DPMSolverMultistep",
"EulerDiscrete",
"EulerAncestralDiscrete",
"SharkEulerDiscrete",
],
)
with gr.Group():
save_metadata_to_png = gr.Checkbox(
label="Save prompt information to PNG",
value=True,
interactive=True,
)
save_metadata_to_json = gr.Checkbox(
label="Save prompt information to JSON file",
value=False,
interactive=True,
)
with gr.Row():
height = gr.Slider(
384, 786, value=512, step=8, label="Height"
)
width = gr.Slider(
384, 786, value=512, step=8, label="Width"
)
precision = gr.Radio(
label="Precision",
value="fp16",
choices=[
"fp16",
"fp32",
],
visible=False,
)
max_length = gr.Radio(
label="Max Length",
value=64,
choices=[
64,
77,
],
visible=False,
)
with gr.Row():
steps = gr.Slider(
1, 100, value=50, step=1, label="Steps"
)
guidance_scale = gr.Slider(
0,
50,
value=7.5,
step=0.1,
label="CFG Scale",
)
with gr.Row():
batch_count = gr.Slider(
1,
10,
value=1,
step=1,
label="Batch Count",
interactive=True,
)
batch_size = gr.Slider(
1,
4,
value=1,
step=1,
label="Batch Size",
interactive=True,
)
with gr.Row():
seed = gr.Number(value=-1, precision=0, label="Seed")
available_devices = get_available_devices()
device = gr.Dropdown(
label="Device",
value=available_devices[0],
choices=available_devices,
)
with gr.Row():
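# The _js callback runs client-side in the browser and fills the Seed box
# with a random 32-bit value, without a server round-trip.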
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
inputs=[],
outputs=[seed],
_js="() => Math.floor(Math.random() * 4294967295)",
)
stable_diffusion = gr.Button("Generate Image")
with gr.Accordion(label="Prompt Examples!", open=False):
ex = gr.Examples(
examples=prompt_examples,
inputs=prompt,
cache_examples=False,
elem_id="prompt_examples",
)
register_button_click(
txt2img_sendto_img2img,
1,
[txt2img_gallery],
[img2img_init_image, tabs],
)
register_button_click(
txt2img_sendto_inpaint,
2,
[txt2img_gallery],
[inpaint_init_image, tabs],
)
register_button_click(
txt2img_sendto_outpaint,
3,
[txt2img_gallery],
[outpaint_init_image, tabs],
)
register_button_click(
txt2img_sendto_upscaler,
4,
[txt2img_gallery],
[upscaler_init_image, tabs],
)
register_button_click(
img2img_sendto_inpaint,
2,
[img2img_gallery],
[inpaint_init_image, tabs],
)
register_button_click(
img2img_sendto_outpaint,
3,
[img2img_gallery],
[outpaint_init_image, tabs],
)
register_button_click(
img2img_sendto_upscaler,
4,
[img2img_gallery],
[upscaler_init_image, tabs],
)
register_button_click(
inpaint_sendto_img2img,
1,
[inpaint_gallery],
[img2img_init_image, tabs],
)
register_button_click(
inpaint_sendto_outpaint,
3,
[inpaint_gallery],
[outpaint_init_image, tabs],
)
register_button_click(
inpaint_sendto_upscaler,
4,
[inpaint_gallery],
[upscaler_init_image, tabs],
)
register_button_click(
outpaint_sendto_img2img,
1,
[outpaint_gallery],
[img2img_init_image, tabs],
)
register_button_click(
outpaint_sendto_inpaint,
2,
[outpaint_gallery],
[inpaint_init_image, tabs],
)
register_button_click(
outpaint_sendto_upscaler,
4,
[outpaint_gallery],
[upscaler_init_image, tabs],
)
register_button_click(
upscaler_sendto_img2img,
1,
[upscaler_gallery],
[img2img_init_image, tabs],
)
register_button_click(
upscaler_sendto_inpaint,
2,
[upscaler_gallery],
[inpaint_init_image, tabs],
)
register_button_click(
upscaler_sendto_outpaint,
3,
[upscaler_gallery],
[outpaint_init_image, tabs],
)
with gr.Column(scale=1, min_width=600):
with gr.Group():
gallery = gr.Gallery(
label="Generated images",
show_label=False,
elem_id="gallery",
).style(grid=[2], height="auto")
std_output = gr.Textbox(
value="Nothing to show.",
lines=4,
show_label=False,
)
output_dir = args.output_dir if args.output_dir else Path.cwd()
output_dir = Path(output_dir, "generated_imgs")
output_loc = gr.Textbox(
label="Saving Images at",
value=output_dir,
interactive=False,
)
kwargs = dict(
fn=txt2img_inf,
inputs=[
prompt,
negative_prompt,
height,
width,
steps,
guidance_scale,
seed,
batch_count,
batch_size,
scheduler,
custom_model,
hf_model_id,
precision,
device,
max_length,
save_metadata_to_json,
save_metadata_to_png,
],
outputs=[gallery, std_output],
show_progress=args.progress_bar,
)
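# Pressing Enter in the prompt box and clicking "Generate Image" both trigger
# the same txt2img_inf call with the inputs/outputs defined above.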
prompt.submit(**kwargs)
stable_diffusion.click(**kwargs)
shark_web.queue()
shark_web.launch(
sd_web.queue()
sd_web.launch(
share=args.share,
inbrowser=True,
server_name="0.0.0.0",
)

Binary file not shown (before: 33 KiB).

Binary file not shown (before: 5.0 KiB).

View File

@@ -0,0 +1,41 @@
from apps.stable_diffusion.web.ui.txt2img_ui import (
txt2img_web,
txt2img_gallery,
txt2img_sendto_img2img,
txt2img_sendto_inpaint,
txt2img_sendto_outpaint,
txt2img_sendto_upscaler,
)
from apps.stable_diffusion.web.ui.img2img_ui import (
img2img_web,
img2img_gallery,
img2img_init_image,
img2img_sendto_inpaint,
img2img_sendto_outpaint,
img2img_sendto_upscaler,
)
from apps.stable_diffusion.web.ui.inpaint_ui import (
inpaint_web,
inpaint_gallery,
inpaint_init_image,
inpaint_sendto_img2img,
inpaint_sendto_outpaint,
inpaint_sendto_upscaler,
)
from apps.stable_diffusion.web.ui.outpaint_ui import (
outpaint_web,
outpaint_gallery,
outpaint_init_image,
outpaint_sendto_img2img,
outpaint_sendto_inpaint,
outpaint_sendto_upscaler,
)
from apps.stable_diffusion.web.ui.upscaler_ui import (
upscaler_web,
upscaler_gallery,
upscaler_init_image,
upscaler_sendto_img2img,
upscaler_sendto_inpaint,
upscaler_sendto_outpaint,
)
from apps.stable_diffusion.web.ui.lora_train_ui import lora_train_web

View File

@@ -0,0 +1,199 @@
/*
Apply the Gradio dark theme on top of the default Gradio theme.
Procedure to upgrade the dark theme:
- Using your browser, visit http://localhost:8080/?__theme=dark
- Open the browser inspector and search for the .dark css class
- Copy the .dark class declarations and apply them here into :root
*/
:root {
--body-background-fill: var(--background-fill-primary);
--body-text-color: var(--neutral-100);
--color-accent-soft: var(--neutral-700);
--background-fill-primary: var(--neutral-950);
--background-fill-secondary: var(--neutral-900);
--border-color-accent: var(--neutral-600);
--border-color-primary: var(--neutral-700);
--link-text-color-active: var(--secondary-500);
--link-text-color: var(--secondary-500);
--link-text-color-hover: var(--secondary-400);
--link-text-color-visited: var(--secondary-600);
--body-text-color-subdued: var(--neutral-400);
--shadow-spread: 1px;
--block-background-fill: var(--neutral-800);
--block-border-color: var(--border-color-primary);
--block_border_width: None;
--block-info-text-color: var(--body-text-color-subdued);
--block-label-background-fill: var(--background-fill-secondary);
--block-label-border-color: var(--border-color-primary);
--block_label_border_width: None;
--block-label-text-color: var(--neutral-200);
--block_shadow: None;
--block_title_background_fill: None;
--block_title_border_color: None;
--block_title_border_width: None;
--block-title-text-color: var(--neutral-200);
--panel-background-fill: var(--background-fill-secondary);
--panel-border-color: var(--border-color-primary);
--panel_border_width: None;
--checkbox-background-color: var(--neutral-800);
--checkbox-background-color-focus: var(--checkbox-background-color);
--checkbox-background-color-hover: var(--checkbox-background-color);
--checkbox-background-color-selected: var(--secondary-600);
--checkbox-border-color: var(--neutral-700);
--checkbox-border-color-focus: var(--secondary-500);
--checkbox-border-color-hover: var(--neutral-600);
--checkbox-border-color-selected: var(--secondary-600);
--checkbox-border-width: var(--input-border-width);
--checkbox-label-background-fill: linear-gradient(to top, var(--neutral-900), var(--neutral-800));
--checkbox-label-background-fill-hover: linear-gradient(to top, var(--neutral-900), var(--neutral-800));
--checkbox-label-background-fill-selected: var(--checkbox-label-background-fill);
--checkbox-label-border-color: var(--border-color-primary);
--checkbox-label-border-color-hover: var(--checkbox-label-border-color);
--checkbox-label-border-width: var(--input-border-width);
--checkbox-label-text-color: var(--body-text-color);
--checkbox-label-text-color-selected: var(--checkbox-label-text-color);
--error-background-fill: var(--background-fill-primary);
--error-border-color: var(--border-color-primary);
--error_border_width: None;
--error-text-color: #ef4444;
--input-background-fill: var(--neutral-800);
--input-background-fill-focus: var(--secondary-600);
--input-background-fill-hover: var(--input-background-fill);
--input-border-color: var(--border-color-primary);
--input-border-color-focus: var(--neutral-700);
--input-border-color-hover: var(--input-border-color);
--input_border_width: None;
--input-placeholder-color: var(--neutral-500);
--input_shadow: None;
--input-shadow-focus: 0 0 0 var(--shadow-spread) var(--neutral-700), var(--shadow-inset);
--loader_color: None;
--slider_color: None;
--stat-background-fill: linear-gradient(to right, var(--primary-400), var(--primary-600));
--table-border-color: var(--neutral-700);
--table-even-background-fill: var(--neutral-950);
--table-odd-background-fill: var(--neutral-900);
--table-row-focus: var(--color-accent-soft);
--button-border-width: var(--input-border-width);
--button-cancel-background-fill: linear-gradient(to bottom right, #dc2626, #b91c1c);
--button-cancel-background-fill-hover: linear-gradient(to bottom right, #dc2626, #dc2626);
--button-cancel-border-color: #dc2626;
--button-cancel-border-color-hover: var(--button-cancel-border-color);
--button-cancel-text-color: white;
--button-cancel-text-color-hover: var(--button-cancel-text-color);
--button-primary-background-fill: linear-gradient(to bottom right, var(--primary-500), var(--primary-600));
--button-primary-background-fill-hover: linear-gradient(to bottom right, var(--primary-500), var(--primary-500));
--button-primary-border-color: var(--primary-500);
--button-primary-border-color-hover: var(--button-primary-border-color);
--button-primary-text-color: white;
--button-primary-text-color-hover: var(--button-primary-text-color);
--button-secondary-background-fill: linear-gradient(to bottom right, var(--neutral-600), var(--neutral-700));
--button-secondary-background-fill-hover: linear-gradient(to bottom right, var(--neutral-600), var(--neutral-600));
--button-secondary-border-color: var(--neutral-600);
--button-secondary-border-color-hover: var(--button-secondary-border-color);
--button-secondary-text-color: white;
--button-secondary-text-color-hover: var(--button-secondary-text-color);
--block-border-width: 1px;
--block-label-border-width: 1px;
--form-gap-width: 1px;
--error-border-width: 1px;
--input-border-width: 1px;
}
/* SHARK theme */
/* display in full width for desktop devices */
@media (min-width: 1536px)
{
.gradio-container {
max-width: var(--size-full) !important;
}
}
.gradio-container .contain {
padding: 0 var(--size-4) !important;
}
.container {
background-color: black !important;
padding-top: var(--size-5) !important;
}
#ui_title {
padding: var(--size-2) 0 0 var(--size-1);
}
#top_logo {
background-color: transparent;
border-radius: 0 !important;
border: 0;
}
#demo_title_outer {
border-radius: 0;
}
#prompt_box_outer div:first-child {
border-radius: 0 !important
}
#prompt_box textarea, #negative_prompt_box textarea {
background-color: var(--background-fill-primary) !important;
}
#prompt_examples {
margin: 0 !important;
}
#prompt_examples svg {
display: none !important;
}
#ui_body {
padding: var(--size-2) !important;
border-radius: 0.5em !important;
}
#img_result+div {
display: none !important;
}
footer {
display: none !important;
}
#gallery + div {
border-radius: 0 !important;
}
/* Prevent the progress bar from blocking gallery navigation while images are being generated (Gradio v3.19.0) */
#gallery .wrap.default {
pointer-events: none;
}
/* Import PNG info box */
#txt2img_prompt_image .fixed-height {
height: var(--size-32);
}
/* Hide "remove buttons" from ui dropdowns */
#custom_model .token-remove.remove-all,
#lora_weights .token-remove.remove-all,
#scheduler .token-remove.remove-all,
#device .token-remove.remove-all,
#stencil_model .token-remove.remove-all {
display: none;
}
/* Hide selected items from ui dropdowns */
#custom_model .options .item .inner-item,
#scheduler .options .item .inner-item,
#device .options .item .inner-item,
#stencil_model .options .item .inner-item {
display:none;
}
/* Hide the download icon from the nod logo */
#top_logo .download {
display: none;
}

View File

@@ -0,0 +1,261 @@
from pathlib import Path
import os
import gradio as gr
from PIL import Image
from apps.stable_diffusion.scripts import img2img_inf
from apps.stable_diffusion.src import args
from apps.stable_diffusion.web.ui.utils import (
available_devices,
nodlogo_loc,
get_custom_model_path,
get_custom_model_files,
scheduler_list,
predefined_models,
cancel_sd,
)
with gr.Blocks(title="Image-to-Image") as img2img_web:
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
with gr.Row():
with gr.Column(scale=1, elem_id="demo_title_outer"):
gr.Image(
value=nod_logo,
show_label=False,
interactive=False,
elem_id="top_logo",
).style(width=150, height=50)
with gr.Row(elem_id="ui_body"):
with gr.Row():
with gr.Column(scale=1, min_width=600):
with gr.Row():
custom_model = gr.Dropdown(
label=f"Models (Custom Model path: {get_custom_model_path()})",
elem_id="custom_model",
value=os.path.basename(args.ckpt_loc)
if args.ckpt_loc
else "None",
choices=["None"]
+ get_custom_model_files()
+ predefined_models,
)
hf_model_id = gr.Textbox(
elem_id="hf_model_id",
placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3",
value="",
label="HuggingFace Model ID",
lines=3,
)
with gr.Group(elem_id="prompt_box_outer"):
prompt = gr.Textbox(
label="Prompt",
value=args.prompts[0],
lines=1,
elem_id="prompt_box",
)
negative_prompt = gr.Textbox(
label="Negative Prompt",
value=args.negative_prompts[0],
lines=1,
elem_id="negative_prompt_box",
)
img2img_init_image = gr.Image(
label="Input Image", type="pil"
).style(height=300)
with gr.Accordion(label="Stencil Options", open=False):
with gr.Row():
use_stencil = gr.Dropdown(
elem_id="stencil_model",
label="Stencil model",
value="None",
choices=["None", "canny", "openpose", "scribble"],
)
with gr.Accordion(label="LoRA Options", open=False):
with gr.Row():
lora_weights = gr.Dropdown(
label=f"Standlone LoRA weights (Path: {get_custom_model_path('lora')})",
elem_id="lora_weights",
value="None",
choices=["None"] + get_custom_model_files("lora"),
)
lora_hf_id = gr.Textbox(
elem_id="lora_hf_id",
placeholder="Select 'None' in the Standlone LoRA weights dropdown on the left if you want to use a standalone HuggingFace model ID for LoRA here e.g: sayakpaul/sd-model-finetuned-lora-t4",
value="",
label="HuggingFace Model ID",
lines=3,
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
scheduler = gr.Dropdown(
elem_id="scheduler",
label="Scheduler",
value="PNDM",
choices=scheduler_list,
)
with gr.Group():
save_metadata_to_png = gr.Checkbox(
label="Save prompt information to PNG",
value=args.write_metadata_to_png,
interactive=True,
)
save_metadata_to_json = gr.Checkbox(
label="Save prompt information to JSON file",
value=args.save_metadata_to_json,
interactive=True,
)
with gr.Row():
height = gr.Slider(
384, 768, value=args.height, step=8, label="Height"
)
width = gr.Slider(
384, 768, value=args.width, step=8, label="Width"
)
precision = gr.Radio(
label="Precision",
value=args.precision,
choices=[
"fp16",
"fp32",
],
visible=True,
)
max_length = gr.Radio(
label="Max Length",
value=args.max_length,
choices=[
64,
77,
],
visible=False,
)
with gr.Row():
steps = gr.Slider(
1, 100, value=args.steps, step=1, label="Steps"
)
strength = gr.Slider(
0,
1,
value=args.strength,
step=0.01,
label="Denoising Strength",
)
with gr.Row():
with gr.Column(scale=3):
guidance_scale = gr.Slider(
0,
50,
value=args.guidance_scale,
step=0.1,
label="CFG Scale",
)
with gr.Column(scale=3):
batch_count = gr.Slider(
1,
100,
value=args.batch_count,
step=1,
label="Batch Count",
interactive=True,
)
batch_size = gr.Slider(
1,
4,
value=args.batch_size,
step=1,
label="Batch Size",
interactive=False,
visible=False,
)
stop_batch = gr.Button("Stop Batch")
with gr.Row():
seed = gr.Number(
value=args.seed, precision=0, label="Seed"
)
device = gr.Dropdown(
elem_id="device",
label="Device",
value=available_devices[0],
choices=available_devices,
)
with gr.Row():
with gr.Column(scale=2):
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
inputs=[],
outputs=[seed],
_js="() => -1",
)
with gr.Column(scale=6):
stable_diffusion = gr.Button("Generate Image(s)")
with gr.Column(scale=1, min_width=600):
with gr.Group():
img2img_gallery = gr.Gallery(
label="Generated images",
show_label=False,
elem_id="gallery",
).style(grid=[2])
std_output = gr.Textbox(
value="Nothing to show.",
lines=1,
show_label=False,
)
output_dir = args.output_dir if args.output_dir else Path.cwd()
output_dir = Path(output_dir, "generated_imgs")
output_loc = gr.Textbox(
label="Saving Images at",
value=output_dir,
interactive=False,
)
with gr.Row():
img2img_sendto_inpaint = gr.Button(value="SendTo Inpaint")
img2img_sendto_outpaint = gr.Button(
value="SendTo Outpaint"
)
img2img_sendto_upscaler = gr.Button(
value="SendTo Upscaler"
)
kwargs = dict(
fn=img2img_inf,
inputs=[
prompt,
negative_prompt,
img2img_init_image,
height,
width,
steps,
strength,
guidance_scale,
seed,
batch_count,
batch_size,
scheduler,
custom_model,
hf_model_id,
precision,
device,
max_length,
use_stencil,
save_metadata_to_json,
save_metadata_to_png,
lora_weights,
lora_hf_id,
],
outputs=[img2img_gallery, std_output],
show_progress=args.progress_bar,
)
prompt_submit = prompt.submit(**kwargs)
neg_prompt_submit = negative_prompt.submit(**kwargs)
generate_click = stable_diffusion.click(**kwargs)
stop_batch.click(
fn=cancel_sd,
cancels=[prompt_submit, neg_prompt_submit, generate_click],
)

View File

@@ -0,0 +1,263 @@
from pathlib import Path
import os
import gradio as gr
from PIL import Image
from apps.stable_diffusion.scripts import inpaint_inf
from apps.stable_diffusion.src import args
from apps.stable_diffusion.web.ui.utils import (
available_devices,
nodlogo_loc,
get_custom_model_path,
get_custom_model_files,
scheduler_list,
predefined_paint_models,
cancel_sd,
)
with gr.Blocks(title="Inpainting") as inpaint_web:
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
with gr.Row():
with gr.Column(scale=1, elem_id="demo_title_outer"):
gr.Image(
value=nod_logo,
show_label=False,
interactive=False,
elem_id="top_logo",
).style(width=150, height=50)
with gr.Row(elem_id="ui_body"):
with gr.Row():
with gr.Column(scale=1, min_width=600):
with gr.Row():
custom_model = gr.Dropdown(
label=f"Models (Custom Model path: {get_custom_model_path()})",
elem_id="custom_model",
value=os.path.basename(args.ckpt_loc)
if args.ckpt_loc
else "None",
choices=["None"]
+ get_custom_model_files()
+ predefined_paint_models,
)
hf_model_id = gr.Textbox(
elem_id="hf_model_id",
placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: ghunkins/stable-diffusion-liberty-inpainting",
value="",
label="HuggingFace Model ID",
lines=3,
)
with gr.Group(elem_id="prompt_box_outer"):
prompt = gr.Textbox(
label="Prompt",
value=args.prompts[0],
lines=1,
elem_id="prompt_box",
)
negative_prompt = gr.Textbox(
label="Negative Prompt",
value=args.negative_prompts[0],
lines=1,
elem_id="negative_prompt_box",
)
inpaint_init_image = gr.Image(
label="Masked Image",
source="upload",
tool="sketch",
type="pil",
).style(height=350)
with gr.Accordion(label="LoRA Options", open=False):
with gr.Row():
lora_weights = gr.Dropdown(
label=f"Standlone LoRA weights (Path: {get_custom_model_path('lora')})",
elem_id="lora_weights",
value="None",
choices=["None"] + get_custom_model_files("lora"),
)
lora_hf_id = gr.Textbox(
elem_id="lora_hf_id",
placeholder="Select 'None' in the Standlone LoRA weights dropdown on the left if you want to use a standalone HuggingFace model ID for LoRA here e.g: sayakpaul/sd-model-finetuned-lora-t4",
value="",
label="HuggingFace Model ID",
lines=3,
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
scheduler = gr.Dropdown(
elem_id="scheduler",
label="Scheduler",
value="PNDM",
choices=scheduler_list,
)
with gr.Group():
save_metadata_to_png = gr.Checkbox(
label="Save prompt information to PNG",
value=args.write_metadata_to_png,
interactive=True,
)
save_metadata_to_json = gr.Checkbox(
label="Save prompt information to JSON file",
value=args.save_metadata_to_json,
interactive=True,
)
with gr.Row():
height = gr.Slider(
384, 768, value=args.height, step=8, label="Height"
)
width = gr.Slider(
384, 768, value=args.width, step=8, label="Width"
)
precision = gr.Radio(
label="Precision",
value=args.precision,
choices=[
"fp16",
"fp32",
],
visible=False,
)
max_length = gr.Radio(
label="Max Length",
value=args.max_length,
choices=[
64,
77,
],
visible=False,
)
with gr.Row():
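# type="index" makes the radio pass the selected option's index to the
# inference call: 0 for "Whole picture", 1 for "Only masked".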
inpaint_full_res = gr.Radio(
choices=["Whole picture", "Only masked"],
type="index",
value="Whole picture",
label="Inpaint area",
)
inpaint_full_res_padding = gr.Slider(
minimum=0,
maximum=256,
step=4,
value=32,
label="Only masked padding, pixels",
)
with gr.Row():
steps = gr.Slider(
1, 100, value=args.steps, step=1, label="Steps"
)
with gr.Row():
with gr.Column(scale=3):
guidance_scale = gr.Slider(
0,
50,
value=args.guidance_scale,
step=0.1,
label="CFG Scale",
)
with gr.Column(scale=3):
batch_count = gr.Slider(
1,
100,
value=args.batch_count,
step=1,
label="Batch Count",
interactive=True,
)
batch_size = gr.Slider(
1,
4,
value=args.batch_size,
step=1,
label="Batch Size",
interactive=False,
visible=False,
)
stop_batch = gr.Button("Stop Batch")
with gr.Row():
seed = gr.Number(
value=args.seed, precision=0, label="Seed"
)
device = gr.Dropdown(
elem_id="device",
label="Device",
value=available_devices[0],
choices=available_devices,
)
with gr.Row():
with gr.Column(scale=2):
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
inputs=[],
outputs=[seed],
_js="() => -1",
)
with gr.Column(scale=6):
stable_diffusion = gr.Button("Generate Image(s)")
with gr.Column(scale=1, min_width=600):
with gr.Group():
inpaint_gallery = gr.Gallery(
label="Generated images",
show_label=False,
elem_id="gallery",
).style(grid=[2])
std_output = gr.Textbox(
value="Nothing to show.",
lines=1,
show_label=False,
)
output_dir = args.output_dir if args.output_dir else Path.cwd()
output_dir = Path(output_dir, "generated_imgs")
output_loc = gr.Textbox(
label="Saving Images at",
value=output_dir,
interactive=False,
)
with gr.Row():
inpaint_sendto_img2img = gr.Button(value="SendTo Img2Img")
inpaint_sendto_outpaint = gr.Button(
value="SendTo Outpaint"
)
inpaint_sendto_upscaler = gr.Button(
value="SendTo Upscaler"
)
kwargs = dict(
fn=inpaint_inf,
inputs=[
prompt,
negative_prompt,
inpaint_init_image,
height,
width,
inpaint_full_res,
inpaint_full_res_padding,
steps,
guidance_scale,
seed,
batch_count,
batch_size,
scheduler,
custom_model,
hf_model_id,
precision,
device,
max_length,
save_metadata_to_json,
save_metadata_to_png,
lora_weights,
lora_hf_id,
],
outputs=[inpaint_gallery, std_output],
show_progress=args.progress_bar,
)
prompt_submit = prompt.submit(**kwargs)
neg_prompt_submit = negative_prompt.submit(**kwargs)
generate_click = stable_diffusion.click(**kwargs)
stop_batch.click(
fn=cancel_sd,
cancels=[prompt_submit, neg_prompt_submit, generate_click],
)

View File

Binary image changed (before: 10 KiB, after: 10 KiB).

View File

@@ -0,0 +1,205 @@
from pathlib import Path
import os
import gradio as gr
from PIL import Image
from apps.stable_diffusion.scripts import lora_train
from apps.stable_diffusion.src import prompt_examples, args
from apps.stable_diffusion.web.ui.utils import (
available_devices,
nodlogo_loc,
get_custom_model_path,
get_custom_model_files,
scheduler_list_txt2img,
predefined_models,
)
with gr.Blocks(title="Lora Training") as lora_train_web:
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
with gr.Row():
with gr.Column(scale=1, elem_id="demo_title_outer"):
gr.Image(
value=nod_logo,
show_label=False,
interactive=False,
elem_id="top_logo",
).style(width=150, height=50)
with gr.Row(elem_id="ui_body"):
with gr.Row():
with gr.Column(scale=1, min_width=600):
with gr.Row():
with gr.Column(scale=10):
with gr.Row():
custom_model = gr.Dropdown(
label=f"Models (Custom Model path: {get_custom_model_path()})",
elem_id="custom_model",
value=os.path.basename(args.ckpt_loc)
if args.ckpt_loc
else "None",
choices=["None"]
+ get_custom_model_files()
+ predefined_models,
)
hf_model_id = gr.Textbox(
elem_id="hf_model_id",
placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3",
value="",
label="HuggingFace Model ID",
lines=3,
)
with gr.Group(elem_id="image_dir_box_outer"):
training_images_dir = gr.Textbox(
label="ImageDirectory",
value=args.training_images_dir,
lines=1,
elem_id="prompt_box",
)
with gr.Group(elem_id="prompt_box_outer"):
prompt = gr.Textbox(
label="Prompt",
value=args.prompts[0],
lines=1,
elem_id="prompt_box",
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
scheduler = gr.Dropdown(
elem_id="scheduler",
label="Scheduler",
value=args.scheduler,
choices=scheduler_list_txt2img,
)
with gr.Row():
height = gr.Slider(
384, 768, value=args.height, step=8, label="Height"
)
width = gr.Slider(
384, 768, value=args.width, step=8, label="Width"
)
precision = gr.Radio(
label="Precision",
value=args.precision,
choices=[
"fp16",
"fp32",
],
visible=False,
)
max_length = gr.Radio(
label="Max Length",
value=args.max_length,
choices=[
64,
77,
],
visible=False,
)
with gr.Row():
steps = gr.Slider(
1,
2000,
value=args.training_steps,
step=1,
label="Training Steps",
)
guidance_scale = gr.Slider(
0,
50,
value=args.guidance_scale,
step=0.1,
label="CFG Scale",
)
with gr.Row():
with gr.Column(scale=3):
batch_count = gr.Slider(
1,
100,
value=args.batch_count,
step=1,
label="Batch Count",
interactive=True,
)
with gr.Column(scale=3):
batch_size = gr.Slider(
1,
4,
value=args.batch_size,
step=1,
label="Batch Size",
interactive=True,
)
stop_batch = gr.Button("Stop Batch")
with gr.Row():
seed = gr.Number(
value=args.seed, precision=0, label="Seed"
)
device = gr.Dropdown(
elem_id="device",
label="Device",
value=available_devices[0],
choices=available_devices,
)
with gr.Row():
with gr.Column(scale=2):
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
inputs=[],
outputs=[seed],
_js="() => -1",
)
with gr.Column(scale=6):
train_lora = gr.Button("Train LoRA")
with gr.Accordion(label="Prompt Examples!", open=False):
ex = gr.Examples(
examples=prompt_examples,
inputs=prompt,
cache_examples=False,
elem_id="prompt_examples",
)
with gr.Column(scale=1, min_width=600):
with gr.Group():
std_output = gr.Textbox(
value="Nothing to show.",
lines=1,
show_label=False,
)
lora_save_dir = (
args.lora_save_dir if args.lora_save_dir else Path.cwd()
)
lora_save_dir = Path(lora_save_dir, "lora")
output_loc = gr.Textbox(
label="Saving Lora at",
value=lora_save_dir,
)
kwargs = dict(
fn=lora_train,
inputs=[
prompt,
height,
width,
steps,
guidance_scale,
seed,
batch_count,
batch_size,
scheduler,
custom_model,
hf_model_id,
precision,
device,
max_length,
training_images_dir,
output_loc,
],
outputs=[std_output],
show_progress=args.progress_bar,
)
prompt_submit = prompt.submit(**kwargs)
train_click = train_lora.click(**kwargs)
stop_batch.click(fn=None, cancels=[prompt_submit, train_click])

View File

@@ -0,0 +1,283 @@
from pathlib import Path
import os
import gradio as gr
from PIL import Image
from apps.stable_diffusion.scripts import outpaint_inf
from apps.stable_diffusion.src import args
from apps.stable_diffusion.web.ui.utils import (
available_devices,
nodlogo_loc,
get_custom_model_path,
get_custom_model_files,
scheduler_list,
predefined_paint_models,
cancel_sd,
)
with gr.Blocks(title="Outpainting") as outpaint_web:
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
with gr.Row():
with gr.Column(scale=1, elem_id="demo_title_outer"):
gr.Image(
value=nod_logo,
show_label=False,
interactive=False,
elem_id="top_logo",
).style(width=150, height=50)
with gr.Row(elem_id="ui_body"):
with gr.Row():
with gr.Column(scale=1, min_width=600):
with gr.Row():
custom_model = gr.Dropdown(
label=f"Models (Custom Model path: {get_custom_model_path()})",
elem_id="custom_model",
value=os.path.basename(args.ckpt_loc)
if args.ckpt_loc
else "None",
choices=["None"]
+ get_custom_model_files()
+ predefined_paint_models,
)
hf_model_id = gr.Textbox(
elem_id="hf_model_id",
placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: ghunkins/stable-diffusion-liberty-inpainting",
value="",
label="HuggingFace Model ID",
lines=3,
)
with gr.Group(elem_id="prompt_box_outer"):
prompt = gr.Textbox(
label="Prompt",
value=args.prompts[0],
lines=1,
elem_id="prompt_box",
)
negative_prompt = gr.Textbox(
label="Negative Prompt",
value=args.negative_prompts[0],
lines=1,
elem_id="negative_prompt_box",
)
outpaint_init_image = gr.Image(
label="Input Image", type="pil"
).style(height=300)
with gr.Accordion(label="LoRA Options", open=False):
with gr.Row():
lora_weights = gr.Dropdown(
label=f"Standlone LoRA weights (Path: {get_custom_model_path('lora')})",
elem_id="lora_weights",
value="None",
choices=["None"] + get_custom_model_files("lora"),
)
lora_hf_id = gr.Textbox(
elem_id="lora_hf_id",
placeholder="Select 'None' in the Standlone LoRA weights dropdown on the left if you want to use a standalone HuggingFace model ID for LoRA here e.g: sayakpaul/sd-model-finetuned-lora-t4",
value="",
label="HuggingFace Model ID",
lines=3,
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
scheduler = gr.Dropdown(
elem_id="scheduler",
label="Scheduler",
value="PNDM",
choices=scheduler_list,
)
with gr.Group():
save_metadata_to_png = gr.Checkbox(
label="Save prompt information to PNG",
value=args.write_metadata_to_png,
interactive=True,
)
save_metadata_to_json = gr.Checkbox(
label="Save prompt information to JSON file",
value=args.save_metadata_to_json,
interactive=True,
)
with gr.Row():
pixels = gr.Slider(
8,
256,
value=args.pixels,
step=8,
label="Pixels to expand",
)
mask_blur = gr.Slider(
0,
64,
value=args.mask_blur,
step=1,
label="Mask blur",
)
with gr.Row():
directions = gr.CheckboxGroup(
label="Outpainting direction",
choices=["left", "right", "up", "down"],
value=["left", "right", "up", "down"],
)
with gr.Row():
noise_q = gr.Slider(
0.0,
4.0,
value=1.0,
step=0.01,
label="Fall-off exponent (lower=higher detail)",
)
color_variation = gr.Slider(
0.0,
1.0,
value=0.05,
step=0.01,
label="Color variation",
)
with gr.Row():
height = gr.Slider(
384, 768, value=args.height, step=8, label="Height"
)
width = gr.Slider(
384, 768, value=args.width, step=8, label="Width"
)
precision = gr.Radio(
label="Precision",
value=args.precision,
choices=[
"fp16",
"fp32",
],
visible=False,
)
max_length = gr.Radio(
label="Max Length",
value=args.max_length,
choices=[
64,
77,
],
visible=False,
)
with gr.Row():
steps = gr.Slider(
1, 100, value=20, step=1, label="Steps"
)
with gr.Row():
with gr.Column(scale=3):
guidance_scale = gr.Slider(
0,
50,
value=args.guidance_scale,
step=0.1,
label="CFG Scale",
)
with gr.Column(scale=3):
batch_count = gr.Slider(
1,
100,
value=args.batch_count,
step=1,
label="Batch Count",
interactive=True,
)
batch_size = gr.Slider(
1,
4,
value=args.batch_size,
step=1,
label="Batch Size",
interactive=False,
visible=False,
)
stop_batch = gr.Button("Stop Batch")
with gr.Row():
seed = gr.Number(
value=args.seed, precision=0, label="Seed"
)
device = gr.Dropdown(
elem_id="device",
label="Device",
value=available_devices[0],
choices=available_devices,
)
with gr.Row():
with gr.Column(scale=2):
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
inputs=[],
outputs=[seed],
_js="() => -1",
)
with gr.Column(scale=6):
stable_diffusion = gr.Button("Generate Image(s)")
with gr.Column(scale=1, min_width=600):
with gr.Group():
outpaint_gallery = gr.Gallery(
label="Generated images",
show_label=False,
elem_id="gallery",
).style(grid=[2])
std_output = gr.Textbox(
value="Nothing to show.",
lines=1,
show_label=False,
)
output_dir = args.output_dir if args.output_dir else Path.cwd()
output_dir = Path(output_dir, "generated_imgs")
output_loc = gr.Textbox(
label="Saving Images at",
value=output_dir,
interactive=False,
)
with gr.Row():
outpaint_sendto_img2img = gr.Button(value="SendTo Img2Img")
outpaint_sendto_inpaint = gr.Button(value="SendTo Inpaint")
outpaint_sendto_upscaler = gr.Button(
value="SendTo Upscaler"
)
kwargs = dict(
fn=outpaint_inf,
inputs=[
prompt,
negative_prompt,
outpaint_init_image,
pixels,
mask_blur,
directions,
noise_q,
color_variation,
height,
width,
steps,
guidance_scale,
seed,
batch_count,
batch_size,
scheduler,
custom_model,
hf_model_id,
precision,
device,
max_length,
save_metadata_to_json,
save_metadata_to_png,
lora_weights,
lora_hf_id,
],
outputs=[outpaint_gallery, std_output],
show_progress=args.progress_bar,
)
prompt_submit = prompt.submit(**kwargs)
neg_prompt_submit = negative_prompt.submit(**kwargs)
generate_click = stable_diffusion.click(**kwargs)
stop_batch.click(
fn=cancel_sd,
cancels=[prompt_submit, neg_prompt_submit, generate_click],
)

View File

@@ -0,0 +1,279 @@
from pathlib import Path
import os
import gradio as gr
from PIL import Image
from apps.stable_diffusion.scripts import txt2img_inf
from apps.stable_diffusion.src import prompt_examples, args
from apps.stable_diffusion.web.ui.utils import (
available_devices,
nodlogo_loc,
get_custom_model_path,
get_custom_model_files,
scheduler_list_txt2img,
predefined_models,
cancel_sd,
)
with gr.Blocks(title="Text-to-Image") as txt2img_web:
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
with gr.Row():
with gr.Column(scale=1, elem_id="demo_title_outer"):
gr.Image(
value=nod_logo,
show_label=False,
interactive=False,
elem_id="top_logo",
).style(width=150, height=50)
with gr.Row(elem_id="ui_body"):
with gr.Row():
with gr.Column(scale=1, min_width=600):
with gr.Row():
with gr.Column(scale=10):
with gr.Row():
custom_model = gr.Dropdown(
label=f"Models (Custom Model path: {get_custom_model_path()})",
elem_id="custom_model",
value=os.path.basename(args.ckpt_loc)
if args.ckpt_loc
else "None",
choices=["None"]
+ get_custom_model_files()
+ predefined_models,
)
hf_model_id = gr.Textbox(
elem_id="hf_model_id",
placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3",
value="",
label="HuggingFace Model ID",
lines=3,
)
with gr.Column(scale=1, min_width=170):
png_info_img = gr.Image(
label="Import PNG info",
elem_id="txt2img_prompt_image",
type="pil",
tool="None",
visible=True,
)
with gr.Group(elem_id="prompt_box_outer"):
prompt = gr.Textbox(
label="Prompt",
value=args.prompts[0],
lines=1,
elem_id="prompt_box",
)
negative_prompt = gr.Textbox(
label="Negative Prompt",
value=args.negative_prompts[0],
lines=1,
elem_id="negative_prompt_box",
)
with gr.Accordion(label="LoRA Options", open=False):
with gr.Row():
lora_weights = gr.Dropdown(
label=f"Standlone LoRA weights (Path: {get_custom_model_path('lora')})",
elem_id="lora_weights",
value="None",
choices=["None"] + get_custom_model_files("lora"),
)
lora_hf_id = gr.Textbox(
elem_id="lora_hf_id",
placeholder="Select 'None' in the Standlone LoRA weights dropdown on the left if you want to use a standalone HuggingFace model ID for LoRA here e.g: sayakpaul/sd-model-finetuned-lora-t4",
value="",
label="HuggingFace Model ID",
lines=3,
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
scheduler = gr.Dropdown(
elem_id="scheduler",
label="Scheduler",
value=args.scheduler,
choices=scheduler_list_txt2img,
)
with gr.Group():
save_metadata_to_png = gr.Checkbox(
label="Save prompt information to PNG",
value=args.write_metadata_to_png,
interactive=True,
)
save_metadata_to_json = gr.Checkbox(
label="Save prompt information to JSON file",
value=args.save_metadata_to_json,
interactive=True,
)
with gr.Row():
height = gr.Slider(
384, 768, value=args.height, step=8, label="Height"
)
width = gr.Slider(
384, 768, value=args.width, step=8, label="Width"
)
precision = gr.Radio(
label="Precision",
value=args.precision,
choices=[
"fp16",
"fp32",
],
visible=False,
)
max_length = gr.Radio(
label="Max Length",
value=args.max_length,
choices=[
64,
77,
],
visible=False,
)
with gr.Row():
steps = gr.Slider(
1, 100, value=args.steps, step=1, label="Steps"
)
guidance_scale = gr.Slider(
0,
50,
value=args.guidance_scale,
step=0.1,
label="CFG Scale",
)
with gr.Row():
with gr.Column(scale=3):
batch_count = gr.Slider(
1,
100,
value=args.batch_count,
step=1,
label="Batch Count",
interactive=True,
)
with gr.Column(scale=3):
batch_size = gr.Slider(
1,
4,
value=args.batch_size,
step=1,
label="Batch Size",
interactive=True,
)
stop_batch = gr.Button("Stop Batch")
with gr.Row():
seed = gr.Number(
value=args.seed, precision=0, label="Seed"
)
device = gr.Dropdown(
elem_id="device",
label="Device",
value=available_devices[0],
choices=available_devices,
)
with gr.Row():
with gr.Column(scale=2):
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
inputs=[],
outputs=[seed],
_js="() => -1",
)
with gr.Column(scale=6):
stable_diffusion = gr.Button("Generate Image(s)")
with gr.Accordion(label="Prompt Examples!", open=False):
ex = gr.Examples(
examples=prompt_examples,
inputs=prompt,
cache_examples=False,
elem_id="prompt_examples",
)
with gr.Column(scale=1, min_width=600):
with gr.Group():
txt2img_gallery = gr.Gallery(
label="Generated images",
show_label=False,
elem_id="gallery",
).style(grid=[2])
std_output = gr.Textbox(
value="Nothing to show.",
lines=1,
show_label=False,
)
output_dir = args.output_dir if args.output_dir else Path.cwd()
output_dir = Path(output_dir, "generated_imgs")
output_loc = gr.Textbox(
label="Saving Images at",
value=output_dir,
interactive=False,
)
with gr.Row():
txt2img_sendto_img2img = gr.Button(value="SendTo Img2Img")
txt2img_sendto_inpaint = gr.Button(value="SendTo Inpaint")
txt2img_sendto_outpaint = gr.Button(
value="SendTo Outpaint"
)
txt2img_sendto_upscaler = gr.Button(
value="SendTo Upscaler"
)
kwargs = dict(
fn=txt2img_inf,
inputs=[
prompt,
negative_prompt,
height,
width,
steps,
guidance_scale,
seed,
batch_count,
batch_size,
scheduler,
custom_model,
hf_model_id,
precision,
device,
max_length,
save_metadata_to_json,
save_metadata_to_png,
lora_weights,
lora_hf_id,
],
outputs=[txt2img_gallery, std_output],
show_progress=args.progress_bar,
)
prompt_submit = prompt.submit(**kwargs)
neg_prompt_submit = negative_prompt.submit(**kwargs)
generate_click = stable_diffusion.click(**kwargs)
stop_batch.click(
fn=cancel_sd,
cancels=[prompt_submit, neg_prompt_submit, generate_click],
)
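# Deferred import: png_metadata imports the UI components defined above back
# from this module, so importing it any earlier would be circular.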
from apps.stable_diffusion.web.utils.png_metadata import (
import_png_metadata,
)
png_info_img.change(
fn=import_png_metadata,
inputs=[
png_info_img,
],
outputs=[
png_info_img,
prompt,
negative_prompt,
steps,
scheduler,
guidance_scale,
seed,
width,
height,
custom_model,
hf_model_id,
],
)

View File

@@ -0,0 +1,256 @@
from pathlib import Path
import os
import gradio as gr
from PIL import Image
from apps.stable_diffusion.scripts import upscaler_inf
from apps.stable_diffusion.src import args
from apps.stable_diffusion.web.ui.utils import (
available_devices,
nodlogo_loc,
get_custom_model_path,
get_custom_model_files,
scheduler_list,
predefined_upscaler_models,
)
with gr.Blocks(title="Upscaler") as upscaler_web:
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
with gr.Row():
with gr.Column(scale=1, elem_id="demo_title_outer"):
gr.Image(
value=nod_logo,
show_label=False,
interactive=False,
elem_id="top_logo",
).style(width=150, height=50)
with gr.Row(elem_id="ui_body"):
with gr.Row():
with gr.Column(scale=1, min_width=600):
with gr.Row():
custom_model = gr.Dropdown(
label=f"Models (Custom Model path: {get_custom_model_path()})",
elem_id="custom_model",
value=os.path.basename(args.ckpt_loc)
if args.ckpt_loc
else "None",
choices=["None"]
+ get_custom_model_files()
+ predefined_upscaler_models,
)
hf_model_id = gr.Textbox(
elem_id="hf_model_id",
placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3",
value="",
label="HuggingFace Model ID",
lines=3,
)
with gr.Group(elem_id="prompt_box_outer"):
prompt = gr.Textbox(
label="Prompt",
value=args.prompts[0],
lines=1,
elem_id="prompt_box",
)
negative_prompt = gr.Textbox(
label="Negative Prompt",
value=args.negative_prompts[0],
lines=1,
elem_id="negative_prompt_box",
)
upscaler_init_image = gr.Image(
label="Input Image", type="pil"
).style(height=300)
with gr.Accordion(label="LoRA Options", open=False):
with gr.Row():
lora_weights = gr.Dropdown(
label=f"Standlone LoRA weights (Path: {get_custom_model_path('lora')})",
elem_id="lora_weights",
value="None",
choices=["None"] + get_custom_model_files("lora"),
)
lora_hf_id = gr.Textbox(
elem_id="lora_hf_id",
placeholder="Select 'None' in the Standlone LoRA weights dropdown on the left if you want to use a standalone HuggingFace model ID for LoRA here e.g: sayakpaul/sd-model-finetuned-lora-t4",
value="",
label="HuggingFace Model ID",
lines=3,
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
scheduler = gr.Dropdown(
elem_id="scheduler",
label="Scheduler",
value="DDIM",
choices=scheduler_list,
)
with gr.Group():
save_metadata_to_png = gr.Checkbox(
label="Save prompt information to PNG",
value=args.write_metadata_to_png,
interactive=True,
)
save_metadata_to_json = gr.Checkbox(
label="Save prompt information to JSON file",
value=args.save_metadata_to_json,
interactive=True,
)
with gr.Row():
height = gr.Slider(
128,
512,
value=args.height,
step=128,
label="Height",
)
width = gr.Slider(
128,
512,
value=args.width,
step=128,
label="Width",
)
precision = gr.Radio(
label="Precision",
value=args.precision,
choices=[
"fp16",
"fp32",
],
visible=True,
)
max_length = gr.Radio(
label="Max Length",
value=args.max_length,
choices=[
64,
77,
],
visible=False,
)
with gr.Row():
steps = gr.Slider(
1, 100, value=args.steps, step=1, label="Steps"
)
noise_level = gr.Slider(
0,
100,
value=args.noise_level,
step=1,
label="Noise Level",
)
with gr.Row():
with gr.Column(scale=3):
guidance_scale = gr.Slider(
0,
50,
value=args.guidance_scale,
step=0.1,
label="CFG Scale",
)
with gr.Column(scale=3):
batch_count = gr.Slider(
1,
100,
value=args.batch_count,
step=1,
label="Batch Count",
interactive=True,
)
batch_size = gr.Slider(
1,
4,
value=args.batch_size,
step=1,
label="Batch Size",
interactive=False,
visible=False,
)
stop_batch = gr.Button("Stop Batch")
with gr.Row():
seed = gr.Number(
value=args.seed, precision=0, label="Seed"
)
device = gr.Dropdown(
elem_id="device",
label="Device",
value=available_devices[0],
choices=available_devices,
)
with gr.Row():
with gr.Column(scale=2):
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
inputs=[],
outputs=[seed],
_js="() => -1",
)
with gr.Column(scale=6):
stable_diffusion = gr.Button("Generate Image(s)")
with gr.Column(scale=1, min_width=600):
with gr.Group():
upscaler_gallery = gr.Gallery(
label="Generated images",
show_label=False,
elem_id="gallery",
).style(grid=[2])
std_output = gr.Textbox(
value="Nothing to show.",
lines=1,
show_label=False,
)
output_dir = args.output_dir if args.output_dir else Path.cwd()
output_dir = Path(output_dir, "generated_imgs")
output_loc = gr.Textbox(
label="Saving Images at",
value=output_dir,
interactive=False,
)
with gr.Row():
upscaler_sendto_img2img = gr.Button(value="SendTo Img2Img")
upscaler_sendto_inpaint = gr.Button(value="SendTo Inpaint")
upscaler_sendto_outpaint = gr.Button(
value="SendTo Outpaint"
)
kwargs = dict(
fn=upscaler_inf,
inputs=[
prompt,
negative_prompt,
upscaler_init_image,
height,
width,
steps,
noise_level,
guidance_scale,
seed,
batch_count,
batch_size,
scheduler,
custom_model,
hf_model_id,
precision,
device,
max_length,
save_metadata_to_json,
save_metadata_to_png,
lora_weights,
lora_hf_id,
],
outputs=[upscaler_gallery, std_output],
show_progress=args.progress_bar,
)
prompt_submit = prompt.submit(**kwargs)
neg_prompt_submit = negative_prompt.submit(**kwargs)
generate_click = stable_diffusion.click(**kwargs)
stop_batch.click(
fn=None, cancels=[prompt_submit, neg_prompt_submit, generate_click]
)

View File

@@ -0,0 +1,136 @@
import os
import sys
from apps.stable_diffusion.src import get_available_devices
import glob
from pathlib import Path
from apps.stable_diffusion.src import args
from dataclasses import dataclass
import apps.stable_diffusion.web.utils.global_obj as global_obj
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
SD_STATE_CANCEL,
)
@dataclass
class Config:
mode: str
model_id: str
ckpt_loc: str
precision: str
batch_size: int
max_length: int
height: int
width: int
device: str
use_lora: str
use_stencil: str
custom_model_filetypes = (
"*.ckpt",
"*.safetensors",
) # the tuple of file types
scheduler_list = [
"DDIM",
"PNDM",
"DPMSolverMultistep",
"EulerAncestralDiscrete",
]
scheduler_list_txt2img = [
"DDIM",
"PNDM",
"LMSDiscrete",
"KDPM2Discrete",
"DPMSolverMultistep",
"EulerDiscrete",
"EulerAncestralDiscrete",
"SharkEulerDiscrete",
]
predefined_models = [
"Linaqruf/anything-v3.0",
"prompthero/openjourney",
"wavymulder/Analog-Diffusion",
"stabilityai/stable-diffusion-2-1",
"stabilityai/stable-diffusion-2-1-base",
"CompVis/stable-diffusion-v1-4",
]
predefined_paint_models = [
"runwayml/stable-diffusion-inpainting",
"stabilityai/stable-diffusion-2-inpainting",
]
predefined_upscaler_models = [
"stabilityai/stable-diffusion-x4-upscaler",
]
def resource_path(relative_path):
"""Get absolute path to resource, works for dev and for PyInstaller"""
base_path = getattr(
sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
)
return os.path.join(base_path, relative_path)
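# e.g. resource_path("logos/nod-logo.png") resolves relative to this file during
# development and under the PyInstaller bundle dir (sys._MEIPASS) in a frozen build.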
def get_custom_model_path(model="models"):
# If `--ckpt_dir` is provided, it overrides the hierarchical folder
# structure used by the WebUI:
# model
# |___lora
# |___vae
if args.ckpt_dir:
return Path(args.ckpt_dir)
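# The match/case below requires Python 3.10 or newer.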
match model:
case "models":
return Path(Path.cwd(), "models")
case "vae":
return Path(Path.cwd(), "models/vae")
case "lora":
return Path(Path.cwd(), "models/lora")
case _:
return ""
def get_custom_model_pathfile(custom_model_name, model="models"):
return os.path.join(get_custom_model_path(model), custom_model_name)
def get_custom_model_files(model="models"):
ckpt_files = []
file_types = custom_model_filetypes
if model == "lora":
file_types = custom_model_filetypes + ("*.pt", "*.bin")
for extn in file_types:
files = [
os.path.basename(x)
for x in glob.glob(
os.path.join(get_custom_model_path(model), extn)
)
]
ckpt_files.extend(files)
return sorted(ckpt_files, key=str.casefold)
def get_custom_vae_or_lora_weights(weights, hf_id, model):
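# Precedence: a HuggingFace ID, if given, wins over a local weights file;
# "None" with an empty HF ID means no custom VAE/LoRA weights are used.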
use_weight = ""
if weights == "None" and not hf_id:
use_weight = ""
elif not hf_id:
use_weight = get_custom_model_pathfile(weights, model)
else:
use_weight = hf_id
return use_weight
def cancel_sd():
# Wrapped in try/except: gc can delete the global sd object while switching models
try:
global_obj.set_sd_status(SD_STATE_CANCEL)
except Exception:
pass
nodlogo_loc = resource_path("logos/nod-logo.png")
available_devices = get_available_devices()

View File

@@ -0,0 +1,71 @@
import gc
"""
The global objects include SD pipeline and config.
Maintaining the global objects would avoid creating extra pipeline objects when switching modes.
Also we could avoid memory leak when switching models by clearing the cache.
"""
def _init():
global _sd_obj
global _config_obj
global _schedulers
_sd_obj = None
_config_obj = None
_schedulers = None
def set_sd_obj(value):
global _sd_obj
_sd_obj = value
def set_sd_scheduler(key):
global _sd_obj
_sd_obj.scheduler = _schedulers[key]
def set_sd_status(value):
global _sd_obj
_sd_obj.status = value
def set_cfg_obj(value):
global _config_obj
_config_obj = value
def set_schedulers(value):
global _schedulers
_schedulers = value
def get_sd_obj():
return _sd_obj
def get_sd_status():
return _sd_obj.status
def get_cfg_obj():
return _config_obj
def get_scheduler(key):
return _schedulers[key]
def clear_cache():
global _sd_obj
global _config_obj
global _schedulers
del _sd_obj
del _config_obj
del _schedulers
gc.collect()
_sd_obj = None
_config_obj = None
_schedulers = None

View File

@@ -0,0 +1,31 @@
import os
import tempfile
import gradio
from os import listdir
gradio_tmp_imgs_folder = os.path.join(os.getcwd(), "shark_tmp/")
# Clear all gradio tmp images
def clear_gradio_tmp_imgs_folder():
if not os.path.exists(gradio_tmp_imgs_folder):
return
for fileName in listdir(gradio_tmp_imgs_folder):
# Delete tmp png files
if fileName.startswith("tmp") and fileName.endswith(".png"):
os.remove(gradio_tmp_imgs_folder + fileName)
# Override gradio's save_pil_to_file so that tmp images generated by gradio are saved into our own tmp folder
def save_pil_to_file(pil_image, dir=None):
if not os.path.exists(gradio_tmp_imgs_folder):
os.mkdir(gradio_tmp_imgs_folder)
file_obj = tempfile.NamedTemporaryFile(
delete=False, suffix=".png", dir=gradio_tmp_imgs_folder
)
pil_image.save(file_obj)
return file_obj
# Register save_pil_to_file override
gradio.processing_utils.save_pil_to_file = save_pil_to_file

View File

@@ -0,0 +1,148 @@
import re
from pathlib import Path
from apps.stable_diffusion.web.ui.txt2img_ui import (
png_info_img,
prompt,
negative_prompt,
steps,
scheduler,
guidance_scale,
seed,
width,
height,
custom_model,
hf_model_id,
)
from apps.stable_diffusion.web.ui.utils import (
get_custom_model_pathfile,
scheduler_list_txt2img,
predefined_models,
)
re_param_code = r'\s*([\w ]+):\s*("(?:\\"[^,]|\\"|\\|[^\"])+"|[^,]*)(?:,|$)'
re_param = re.compile(re_param_code)
re_imagesize = re.compile(r"^(\d+)x(\d+)$")
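# Illustrative example of the PNG "parameters" text this parser expects:
#   cyberpunk forest by Salvador Dali
#   Negative prompt: trees, green
#   Steps: 50, Sampler: EulerDiscrete, CFG scale: 7.5, Seed: 42, Size: 512x512, Model: stable-diffusion-2-1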
def parse_generation_parameters(x: str):
res = {}
prompt = ""
negative_prompt = ""
done_with_prompt = False
*lines, lastline = x.strip().split("\n")
if len(re_param.findall(lastline)) < 3:
lines.append(lastline)
lastline = ""
for i, line in enumerate(lines):
line = line.strip()
if line.startswith("Negative prompt:"):
done_with_prompt = True
line = line[16:].strip()
if done_with_prompt:
negative_prompt += ("" if negative_prompt == "" else "\n") + line
else:
prompt += ("" if prompt == "" else "\n") + line
res["Prompt"] = prompt
res["Negative prompt"] = negative_prompt
for k, v in re_param.findall(lastline):
v = v[1:-1] if v[0] == '"' and v[-1] == '"' else v
m = re_imagesize.match(v)
if m is not None:
res[k + "-1"] = m.group(1)
res[k + "-2"] = m.group(2)
else:
res[k] = v
# Missing CLIP skip means it was set to 1 (the default)
if "Clip skip" not in res:
res["Clip skip"] = "1"
hypernet = res.get("Hypernet", None)
if hypernet is not None:
res[
"Prompt"
] += f"""<hypernet:{hypernet}:{res.get("Hypernet strength", "1.0")}>"""
if "Hires resize-1" not in res:
res["Hires resize-1"] = 0
res["Hires resize-2"] = 0
return res
def import_png_metadata(pil_data):
try:
png_info = pil_data.info["parameters"]
metadata = parse_generation_parameters(png_info)
png_hf_model_id = ""
png_custom_model = ""
if "Model" in metadata:
# Remove extension from model info
if metadata["Model"].endswith(".safetensors") or metadata[
"Model"
].endswith(".ckpt"):
metadata["Model"] = Path(metadata["Model"]).stem
# Check for the model name match with one of the local ckpt or safetensors files
if Path(
get_custom_model_pathfile(metadata["Model"] + ".ckpt")
).is_file():
png_custom_model = metadata["Model"] + ".ckpt"
if Path(
get_custom_model_pathfile(metadata["Model"] + ".safetensors")
).is_file():
png_custom_model = metadata["Model"] + ".safetensors"
# Check for a model match with one of the default model list (ex: "Linaqruf/anything-v3.0")
if metadata["Model"] in predefined_models:
png_custom_model = metadata["Model"]
# If nothing had matched, check vendor/hf_model_id
if not png_custom_model and metadata["Model"].count("/"):
png_hf_model_id = metadata["Model"]
# No matching model was found
if not png_custom_model and not png_hf_model_id:
print(
"Import PNG info: Unable to find a matching model for %s"
% metadata["Model"]
)
outputs = {
png_info_img: None,
negative_prompt: metadata["Negative prompt"],
steps: int(metadata["Steps"]),
guidance_scale: float(metadata["CFG scale"]),
seed: int(metadata["Seed"]),
width: float(metadata["Size-1"]),
height: float(metadata["Size-2"]),
}
if "Model" in metadata and png_custom_model:
outputs[custom_model] = png_custom_model
outputs[hf_model_id] = ""
if "Model" in metadata and png_hf_model_id:
outputs[custom_model] = "None"
outputs[hf_model_id] = png_hf_model_id
if "Prompt" in metadata:
outputs[prompt] = metadata["Prompt"]
if "Sampler" in metadata:
if metadata["Sampler"] in scheduler_list_txt2img:
outputs[scheduler] = metadata["Sampler"]
else:
print(
"Import PNG info: Unable to find a scheduler for %s"
% metadata["Sampler"]
)
return outputs
except Exception as ex:
if pil_data and pil_data.info.get("parameters"):
print("import_png_metadata failed with %s" % ex)
pass
return {
png_info_img: None,
}

View File

@@ -30,9 +30,15 @@ def compare_images(new_filename, golden_filename):
diff = np.abs(new - golden)
mean = np.mean(diff)
if mean > 0.1:
subprocess.run(
["gsutil", "cp", new_filename, "gs://shark_tank/testdata/builder/"]
)
if os.name != "nt":
subprocess.run(
[
"gsutil",
"cp",
new_filename,
"gs://shark_tank/testdata/builder/",
]
)
raise SystemExit("new and golden not close")
else:
print("SUCCESS")

View File

@@ -2,4 +2,4 @@
IMPORTER=1 BENCHMARK=1 ./setup_venv.sh
source $GITHUB_WORKSPACE/shark.venv/bin/activate
python generate_sharktank.py
python tank/generate_sharktank.py

View File

@@ -1,13 +1,16 @@
import os
from sys import executable
import subprocess
from apps.stable_diffusion.src.utils.resources import (
get_json_file,
)
from datetime import datetime as dt
from shark.shark_downloader import download_public_file
from image_comparison import compare_images
import argparse
from glob import glob
import shutil
import requests
model_config_dicts = get_json_file(
os.path.join(
@@ -17,51 +20,204 @@ model_config_dicts = get_json_file(
)
def parse_sd_out(filename, command, device, use_tune, model_name, import_mlir):
with open(filename, "r+") as f:
lines = f.readlines()
metrics = {}
vals_to_read = [
"Clip Inference time",
"Average step",
"VAE Inference time",
"Total image generation",
]
for line in lines:
for val in vals_to_read:
if val in line:
metrics[val] = line.split(" ")[-1].strip("\n")
metrics["Average step"] = metrics["Average step"].strip("ms/it")
metrics["Total image generation"] = metrics[
"Total image generation"
].strip("sec")
metrics["device"] = device
metrics["use_tune"] = use_tune
metrics["model_name"] = model_name
metrics["import_mlir"] = import_mlir
metrics["command"] = command
return metrics
def get_inpaint_inputs():
os.mkdir("./test_images/inputs")
img_url = (
"https://huggingface.co/datasets/diffusers/test-arrays/resolve"
"/main/stable_diffusion_inpaint/input_bench_image.png"
)
mask_url = (
"https://huggingface.co/datasets/diffusers/test-arrays/resolve"
"/main/stable_diffusion_inpaint/input_bench_mask.png"
)
img = requests.get(img_url)
mask = requests.get(mask_url)
open("./test_images/inputs/image.png", "wb").write(img.content)
open("./test_images/inputs/mask.png", "wb").write(mask.content)
def test_loop(device="vulkan", beta=False, extra_flags=[]):
# Get golden values from tank
shutil.rmtree("./test_images", ignore_errors=True)
model_metrics = []
os.mkdir("./test_images")
os.mkdir("./test_images/golden")
get_inpaint_inputs()
hf_model_names = model_config_dicts[0].values()
tuned_options = ["--no-use_tuned", "use_tuned"]
tuned_options = ["--no-use_tuned", "--use_tuned"]
import_options = ["--import_mlir", "--no-import_mlir"]
prompt_text = "--prompt=cyberpunk forest by Salvador Dali"
inpaint_prompt_text = "--prompt=Face of a yellow cat, high resolution, sitting on a park bench"
if os.name == "nt":
prompt_text = '--prompt="cyberpunk forest by Salvador Dali"'
inpaint_prompt_text = '--prompt="Face of a yellow cat, high resolution, sitting on a park bench"'
if beta:
extra_flags.append("--beta_models=True")
for model_name in hf_model_names:
for use_tune in tuned_options:
command = [
"python",
"apps/stable_diffusion/scripts/txt2img.py",
"--device=" + device,
"--prompt=cyberpunk forest by Salvador Dali",
"--output_dir="
+ os.path.join(os.getcwd(), "test_images", model_name),
"--hf_model_id=" + model_name,
use_tune,
extra_flags.append("--no-progress_bar")
to_skip = [
"Linaqruf/anything-v3.0",
"prompthero/openjourney",
"wavymulder/Analog-Diffusion",
"dreamlike-art/dreamlike-diffusion-1.0",
]
counter = 0
for import_opt in import_options:
for model_name in hf_model_names:
if model_name in to_skip:
continue
for use_tune in tuned_options:
if (
model_name == "stabilityai/stable-diffusion-2-1"
and use_tune == tuned_options[0]
):
continue
elif (
model_name == "stabilityai/stable-diffusion-2-1-base"
and use_tune == tuned_options[1]
):
continue
command = (
[
executable, # executable is the python from the venv used to run this
"apps/stable_diffusion/scripts/txt2img.py",
"--device=" + device,
prompt_text,
"--negative_prompts=" + '""',
"--seed=42",
import_opt,
"--output_dir="
+ os.path.join(os.getcwd(), "test_images", model_name),
"--hf_model_id=" + model_name,
use_tune,
]
if "inpainting" not in model_name
else [
executable,
"apps/stable_diffusion/scripts/inpaint.py",
"--device=" + device,
inpaint_prompt_text,
"--negative_prompts=" + '""',
"--img_path=./test_images/inputs/image.png",
"--mask_path=./test_images/inputs/mask.png",
"--seed=42",
"--import_mlir",
"--output_dir="
+ os.path.join(os.getcwd(), "test_images", model_name),
"--hf_model_id=" + model_name,
use_tune,
]
)
command += extra_flags
if os.name == "nt":
command = " ".join(command)
dumpfile_name = "_".join(model_name.split("/")) + ".txt"
dumpfile_name = os.path.join(os.getcwd(), dumpfile_name)
with open(dumpfile_name, "w+") as f:
generated_image = not subprocess.call(
command,
stdout=f,
stderr=f,
)
if os.name != "nt":
command = " ".join(command)
if generated_image:
model_metrics.append(
parse_sd_out(
dumpfile_name,
command,
device,
use_tune,
model_name,
import_opt,
)
)
print(command)
print("Successfully generated image")
os.makedirs(
"./test_images/golden/" + model_name, exist_ok=True
)
download_public_file(
"gs://shark_tank/testdata/golden/" + model_name,
"./test_images/golden/" + model_name,
)
test_file_path = os.path.join(
os.getcwd(),
"test_images",
model_name,
"generated_imgs",
dt.now().strftime("%Y%m%d"),
"*.png",
)
test_file = glob(test_file_path)[0]
golden_path = (
"./test_images/golden/" + model_name + "/*.png"
)
golden_file = glob(golden_path)[0]
compare_images(test_file, golden_file)
else:
print(command)
print("failed to generate image for this configuration")
with open(dumpfile_name, "r+") as f:
output = f.readlines()
print("\n".join(output))
if model_name == "CompVis/stable-diffusion-v1-4":
print("failed a known successful model.")
exit(1)
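# On Windows, alternate between runs with and without an explicit RDNA2
# Vulkan target triple by appending the flag on every other configuration
# and removing it again on the next one.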
if os.name == "nt":
counter += 1
if counter % 2 == 0:
extra_flags.append(
"--iree_vulkan_target_triple=rdna2-unknown-windows"
)
else:
if counter != 1:
extra_flags.remove(
"--iree_vulkan_target_triple=rdna2-unknown-windows"
)
with open(os.path.join(os.getcwd(), "sd_testing_metrics.csv"), "w+") as f:
header = "model_name;device;use_tune;import_opt;Clip Inference time(ms);Average Step (ms/it);VAE Inference time(ms);total image generation(s);command\n"
f.write(header)
for metric in model_metrics:
output = [
metric["model_name"],
metric["device"],
metric["use_tune"],
metric["import_mlir"],
metric["Clip Inference time"],
metric["Average step"],
metric["VAE Inference time"],
metric["Total image generation"],
metric["command"],
]
f.write(";".join(output) + "\n")
parser = argparse.ArgumentParser()

View File

@@ -60,3 +60,19 @@ def pytest_addoption(parser):
default="gs://shark_tank/latest",
help="URL to bucket from which to download SHARK tank artifacts. Default is gs://shark_tank/latest",
)
parser.addoption(
"--benchmark_dispatches",
default=None,
help="Benchmark individual dispatch kernels produced by IREE compiler. Use 'All' for all, or specific dispatches e.g. '0 1 2 10'",
)
parser.addoption(
"--dispatch_benchmarks_dir",
default="./temp_dispatch_benchmarks",
help="Directory in which dispatch benchmarks are saved.",
)
parser.addoption(
"--batchsize",
default=1,
type=int,
help="Batch size for the tested model.",
)
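# Example invocation using the options above (test target and directory are
# hypothetical; adjust to your checkout):
#   pytest tank/test_models.py --batchsize=4 \
#       --benchmark_dispatches="0 1 2" --dispatch_benchmarks_dir=./temp_dispatch_benchmarks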

View File

@@ -40,7 +40,7 @@ cmake --build build/
*Prepare the model*
```bash
wget https://storage.googleapis.com/shark_tank/latest/resnet50_tf/resnet50_tf.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvm-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvmcpu-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
```
*Prepare the input*
@@ -65,18 +65,18 @@ A tool for benchmarking other models is built and can be invoked with a command
see `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation on the function input. For example, stable diffusion unet can be tested with the following commands:
```bash
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
```
VAE and Autoencoder are also available
```bash
# VAE
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x4x64x64xf32
# CLIP Autoencoder
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x77xi32 --function_input=1x77xi32
```

View File

@@ -0,0 +1,118 @@
# Overview
This document is intended to provide a starting point for profiling with SHARK/IREE. At its core
[SHARK](https://github.com/nod-ai/SHARK/tree/main/tank) is a Python API that links the MLIR lowerings from various
frameworks + frontends (e.g. PyTorch -> Torch-MLIR) with the compiler + runtime offered by IREE. More information
on model coverage and framework support can be found [here](https://github.com/nod-ai/SHARK/tree/main/tank). The intended
use case for SHARK is the compilation and deployment of performant, state-of-the-art AI models.
![image](https://user-images.githubusercontent.com/22101546/217151219-9bb184a3-cfb9-4788-bb7e-5b502953525c.png)
## Benchmarking with SHARK
TODO: Expand this section.
SHARK offers native benchmarking support, although because it is model focused, fine-grained profiling is
less exposed than the common "model benchmarking suite" use case that SHARK is good at.
### SharkBenchmarkRunner
SharkBenchmarkRunner is a class designed for benchmarking models against other runtimes.
TODO: List supported runtimes for comparison + example on how to benchmark with it.
## Directly profiling IREE
A number of excellent developer resources on profiling with IREE can be
found [here](https://github.com/iree-org/iree/tree/main/docs/developers/developing_iree). As a result this section will
focus on bridging the gap between the two.
- https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling.md
- https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling_with_tracy.md
- https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling_vulkan_gpu.md
- https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling_cpu_events.md
Internally, SHARK builds a pair of IREE commands to compile + run a model. At a high level the flow starts with the
model represented with a high level dialect (commonly Linalg) and is compiled to a flatbuffer (.vmfb) that
the runtime is capable of ingesting. At this point (with potentially a few runtime flags) the compiled model is then run
through the IREE runtime. This is all facilitated with the IREE Python bindings, which offer a convenient method
to capture the compile command SHARK comes up with. This is done by setting the environment variable
`IREE_SAVE_TEMPS` to point to a directory of choice, e.g. for stable diffusion
```
# Linux
$ export IREE_SAVE_TEMPS=/path/to/some/directory
# Windows
$ $env:IREE_SAVE_TEMPS="C:\path\to\some\directory"
$ python apps/stable_diffusion/scripts/txt2img.py -p "a photograph of an astronaut riding a horse" --save_vmfb
```
NOTE: Currently this will only save the compile command + input MLIR for a single model when run in a pipeline.
In the case of stable diffusion this should be UNet, so to get examples for the other models in the pipeline they
need to be extracted and tested individually.
The save temps directory should contain three files: `core-command-line.txt`, `core-input.mlir`, and `core-output.bin`.
The command line for compilation will start something like this, where the `-` needs to be replaced with the path to `core-input.mlir`.
```
/home/quinn/nod/iree-build/compiler/bindings/python/iree/compiler/tools/../_mlir_libs/iree-compile - --iree-input-type=none ...
```
The `-o output_filename.vmfb` flag can be used to specify the location to save the compiled vmfb. Note that a dump of the
dispatches that can be compiled + run in isolation can be generated by adding `--iree-hal-dump-executable-benchmarks-to=/some/directory`. Say, if they are in the `benchmarks` directory, the following compile/run commands would work for Vulkan on RDNA3.
```
iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna3-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.mlir -o benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb
iree-benchmark-module --module=benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb --function=forward --device=vulkan
```
Where `${NUM}` is the dispatch number that you want to benchmark/profile in isolation.
### Enabling Tracy for Vulkan profiling
To begin profiling with Tracy, a build of the IREE runtime with tracing enabled is needed. SHARK-Runtime builds an
instrumented version alongside the normal version nightly (.whls typically found [here](https://github.com/nod-ai/SHARK-Runtime/releases)); however, this is only available for Linux. For Windows, tracing can be enabled by setting the corresponding CMake flag when building.
```
$env:IREE_ENABLE_RUNTIME_TRACING="ON"
```
Getting a trace can then be done by setting the environment variable `TRACY_NO_EXIT=1` and running the program that is to be
traced. Then, to actually capture the trace, use the `iree-tracy-capture` tool in a different terminal. Note that to get
the capture and profiler tools the `IREE_BUILD_TRACY=ON` CMake flag needs to be set.
```
TRACY_NO_EXIT=1 python apps/stable_diffusion/scripts/txt2img.py -p "a photograph of an astronaut riding a horse"
# (in another terminal, either on the same machine or through ssh with a tunnel through port 8086)
iree-tracy-capture -o trace_filename.tracy
```
To do it over ssh, the flow looks like this
```
# From terminal 1 on local machine
ssh -L 8086:localhost:8086 <remote_server_name>
TRACY_NO_EXIT=1 python apps/stable_diffusion/scripts/txt2img.py -p "a photograph of an astronaut riding a horse"
# From terminal 2 on local machine. Requires having built IREE with the CMake flag `IREE_BUILD_TRACY=ON` to build the required tooling.
iree-tracy-capture -o /path/to/trace.tracy
```
The trace can then be viewed with
```
iree-tracy-profiler /path/to/trace.tracy
```
Capturing a runtime trace will work with any IREE tooling that uses the runtime. For example, `iree-benchmark-module`
can be used for benchmarking an individual module. Importantly this means that any SHARK script can be profiled with tracy.
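As a minimal sketch (assuming a `module.vmfb` compiled as above and a tracing-enabled runtime build), the same capture flow applies to a standalone module:
```
TRACY_NO_EXIT=1 iree-benchmark-module --module=module.vmfb --function=forward --device=vulkan
# in another terminal
iree-tracy-capture -o benchmark_trace.tracy
```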
NOTE: Not all backends have the same tracy support. This writeup is focused on CPU/Vulkan backends but there is recently added support for tracing on CUDA (requires the `--cuda_tracing` flag).
## Experimental RGP support
TODO: This section is temporary until proper RGP support is added.
Currently, for stable diffusion there is a flag for enabling UNet to be visible to RGP with `--enable_rgp`. To get a proper capture though, the `DevModeSqttPrepareFrameCount=1` flag needs to be set for the driver (done with `VkPanel` on Windows).
With these two settings, a single iteration of UNet can be captured.
(AMD only) To get a dump of the pipelines (result of compiled SPIR-V) the `EnablePipelineDump=1` driver flag can be set. The
files will typically be dumped to a directory called `spvPipeline` (on Linux `/var/tmp/spvPipeline`). The dumped files will
include header information that can be used to map back to the source dispatch/SPIR-V, e.g.
```
[Version]
version = 57
[CsSpvFile]
fileName = Shader_0x946C08DFD0C10D9A.spv
[CsInfo]
entryPoint = forward_dispatch_193_matmul_256x65536x2304
```

process_skipfiles.py Normal file
View File

@@ -0,0 +1,58 @@
# This script toggles commenting/uncommenting of a few module entries in
# `torch/_dynamo/skipfiles.py` (within shark.venv) to deal with the
# __file__ AttributeError arising for those modules.
from distutils.sysconfig import get_python_lib
import fileinput
from pathlib import Path
# Temporary workaround for transformers/__init__.py.
path_to_tranformers_hook = Path(
get_python_lib()
+ "/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-transformers.py"
)
if path_to_tranformers_hook.is_file():
pass
else:
with open(path_to_tranformers_hook, "w") as f:
f.write("module_collection_mode = 'pyz+py'")
path_to_skipfiles = Path(get_python_lib() + "/torch/_dynamo/skipfiles.py")
modules_to_comment = ["abc,", "os,", "posixpath,", "_collections_abc,"]
startMonitoring = 0
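# startMonitoring tracks where we are in skipfiles.py: 0 = before the
# "SKIP_DIRS = " line, 1/2 = inside the region where the target module
# entries are toggled; each closing "]" increments it, so after the second
# "]" lines pass through unchanged.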
for line in fileinput.input(path_to_skipfiles, inplace=True):
if "SKIP_DIRS = " in line:
startMonitoring = 1
print(line, end="")
elif startMonitoring in [1, 2]:
if "]" in line:
startMonitoring += 1
print(line, end="")
else:
flag = True
for module in modules_to_comment:
if module in line:
if not line.startswith("#"):
print(f"#{line}", end="")
else:
print(f"{line[1:]}", end="")
flag = False
break
if flag:
print(line, end="")
else:
print(line, end="")
# To get around scikit-image's packaging, lazy_loader has had a patch merged upstream but it is yet to be released.
# Refer: https://github.com/scientific-python/lazy_loader
path_to_lazy_loader = Path(get_python_lib() + "/lazy_loader/__init__.py")
for line in fileinput.input(path_to_lazy_loader, inplace=True):
if 'stubfile = filename if filename.endswith("i")' in line:
print(
' stubfile = (filename if filename.endswith("i") else f"{os.path.splitext(filename)[0]}.pyi")',
end="",
)
else:
print(line, end="")

View File

@@ -10,3 +10,8 @@ requires = [
"iree-runtime>=20221022.190",
]
build-backend = "setuptools.build_meta"
[tool.black]
line-length = 79
include = '\.pyi?$'

View File

@@ -1,9 +1,9 @@
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
--pre
numpy==1.22.4
torchvision
numpy>1.22.4
pytorch-triton
torchvision==0.16.0.dev20230322
tabulate
tqdm
@@ -15,8 +15,8 @@ iree-tools-tf
# TensorFlow and JAX.
gin-config
tensorflow==2.10.1
keras==2.10
tensorflow>2.11
keras
#tf-models-nightly
#tensorflow-text-nightly
transformers
@@ -33,6 +33,7 @@ lit
pyyaml
python-dateutil
sacremoses
sentencepiece
# web dependencies.
gradio

View File

@@ -16,13 +16,16 @@ parameterized
# Add transformers, diffusers and scipy since it most commonly used
transformers
diffusers
diffusers @ git+https://github.com/huggingface/diffusers@main
scipy
ftfy
gradio
altair
omegaconf
safetensors
opencv-python
scikit-image
pytorch_lightning # for runwayml models
# Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
pefile

View File

@@ -1,19 +1,54 @@
<#
.SYNOPSIS
A script to update and install the SHARK runtime and its dependencies.
.DESCRIPTION
This script updates and installs the SHARK runtime and its dependencies.
It checks the Python version installed and installs any required build
dependencies into a Python virtual environment.
If that environment does not exist, it creates it.
.PARAMETER update-src
git pulls latest version
.PARAMETER force
removes and recreates venv to force update of all dependencies
.EXAMPLE
.\setup_venv.ps1 --force
.EXAMPLE
.\setup_venv.ps1 --update-src
.INPUTS
None
.OUTPUTS
None
#>
param([string]$arguments)
if ($arguments -eq "--update-src"){
git pull
}
#Write-Host "Installing python"
#Start-Process winget install Python.Python.3.10 '/quiet InstallAllUsers=1 PrependPath=1' -wait -NoNewWindow
#Write-Host "python installation completed successfully"
#Write-Host "Reload environment variables"
#$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
#Write-Host "Reloaded environment variables"
if ($arguments -eq "--force"){
if (Test-Path env:VIRTUAL_ENV) {
Write-Host "deactivating..."
Deactivate
}
if (Test-Path .\shark.venv\) {
Write-Host "removing and recreating venv..."
Remove-Item .\shark.venv -Force -Recurse
if (Test-Path .\shark.venv\) {
Write-Host 'could not remove .\shark.venv - please try running ".\setup_venv.ps1 --force" again!'
exit 1
}
}
}
# redirect stderr into stdout
$p = &{python -V} 2>&1
@@ -25,19 +60,36 @@ $version = if($p -is [System.Management.Automation.ErrorRecord])
}
else
{
# otherwise return as is
$p
# otherwise return complete Python list
$ErrorActionPreference = 'SilentlyContinue'
$PyVer = py --list
}
Write-Host "Python version found is"
Write-Host $p
# deactivate any activated venvs
if ($PyVer -like "*venv*")
{
deactivate # make sure we don't update the wrong venv
$PyVer = py --list # update list
}
Write-Host "Python versions found are"
Write-Host ($PyVer | Out-String) # formatted output with line breaks
if (!($PyVer.length -ne 0)) {$p} # return Python --version String if py.exe is unavailable
if (!($PyVer -like "*3.11*") -and !($p -like "*3.11*")) # if 3.11 is not in any list
{
Write-Host "Please install Python 3.11 and try again"
exit 34
}
Write-Host "Installing Build Dependencies"
python -m venv .\shark.venv\
# make sure we really use 3.11 from list, even if it's not the default.
if ($NULL -ne $PyVer) {py -3.11 -m venv .\shark.venv\}
else {python -m venv .\shark.venv\}
.\shark.venv\Scripts\activate
python -m pip install --upgrade pip
pip install wheel
pip install -r requirements.txt
pip install --pre torch-mlir torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
pip install --pre torch-mlir torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
pip install --upgrade -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html iree-compiler iree-runtime
Write-Host "Building SHARK..."
pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html

View File

@@ -42,7 +42,7 @@ Green=`tput setaf 2`
Yellow=`tput setaf 3`
# Assume no binary torch-mlir.
# Currently available for macOS m1&intel (3.10) and Linux(3.7,3.8,3.9,3.10)
# Currently available for macOS m1&intel (3.11) and Linux(3.8,3.10,3.11)
torch_mlir_bin=false
if [[ $(uname -s) = 'Darwin' ]]; then
echo "${Yellow}Apple macOS detected"
@@ -60,12 +60,12 @@ if [[ $(uname -s) = 'Darwin' ]]; then
fi
echo "${Yellow}Run the following commands to setup your SSL certs for your Python version if you see SSL errors with tests"
echo "${Yellow}/Applications/Python\ 3.XX/Install\ Certificates.command"
if [ "$PYTHON_VERSION_X_Y" == "3.10" ]; then
if [ "$PYTHON_VERSION_X_Y" == "3.11" ]; then
torch_mlir_bin=true
fi
elif [[ $(uname -s) = 'Linux' ]]; then
echo "${Yellow}Linux detected"
if [ "$PYTHON_VERSION_X_Y" == "3.7" ] || [ "$PYTHON_VERSION_X_Y" == "3.8" ] || [ "$PYTHON_VERSION_X_Y" == "3.9" ] || [ "$PYTHON_VERSION_X_Y" == "3.10" ] ; then
if [ "$PYTHON_VERSION_X_Y" == "3.8" ] || [ "$PYTHON_VERSION_X_Y" == "3.10" ] || [ "$PYTHON_VERSION_X_Y" == "3.11" ] ; then
torch_mlir_bin=true
fi
else
@@ -78,7 +78,7 @@ $PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
if [ "$torch_mlir_bin" = true ]; then
if [[ $(uname -s) = 'Darwin' ]]; then
echo "MacOS detected. Installing torch-mlir from .whl, to avoid dependency problems with torch."
$PYTHON -m pip install --pre --no-cache-dir torch-mlir -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
$PYTHON -m pip install --pre --no-cache-dir torch-mlir -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
else
$PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
if [ $? -eq 0 ];then
@@ -89,7 +89,7 @@ if [ "$torch_mlir_bin" = true ]; then
fi
else
echo "${Red}No binaries found for Python $PYTHON_VERSION_X_Y on $(uname -s)"
echo "${Yello}Python 3.10 supported on macOS and 3.7,3.8,3.9 and 3.10 on Linux"
echo "${Yello}Python 3.11 supported on macOS and 3.8,3.10 and 3.11 on Linux"
echo "${Red}Please build torch-mlir from source in your environment"
exit 1
fi
@@ -98,11 +98,11 @@ if [[ -z "${USE_IREE}" ]]; then
RUNTIME="https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html"
else
touch ./.use-iree
RUNTIME="https://iree-org.github.io/iree/pip-release-links.html"
RUNTIME="https://openxla.github.io/iree/pip-release-links.html"
fi
if [[ -z "${NO_BACKEND}" ]]; then
echo "Installing ${RUNTIME}..."
$PYTHON -m pip install --upgrade --find-links ${RUNTIME} iree-compiler iree-runtime
$PYTHON -m pip install --pre --upgrade --find-links ${RUNTIME} iree-compiler iree-runtime
else
echo "Not installing a backend, please make sure to add your backend to PYTHONPATH"
fi
@@ -112,7 +112,7 @@ if [[ ! -z "${IMPORTER}" ]]; then
if [[ $(uname -s) = 'Linux' ]]; then
echo "${Yellow}Linux detected.. installing Linux importer tools"
#Always get the importer tools from upstream IREE
$PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer.txt" -f https://iree-org.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
$PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer.txt" -f https://openxla.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
elif [[ $(uname -s) = 'Darwin' ]]; then
echo "${Yellow}macOS detected.. installing macOS importer tools"
#Conda seems to have some problems installing these packages and hope they get resolved upstream.
@@ -129,11 +129,11 @@ if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
TV_VERSION=${TV_VER:9:18}
$PYTHON -m pip uninstall -y torch torchvision
$PYTHON -m pip install -U --pre --no-warn-conflicts triton
$PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu117/torch-${TORCH_VERSION}%2Bcu117-cp310-cp310-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu117/torchvision-${TV_VERSION}%2Bcu117-cp310-cp310-linux_x86_64.whl
$PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu118/torch-${TORCH_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu118/torchvision-${TV_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl
if [ $? -eq 0 ];then
echo "Successfully Installed torch + cu117."
echo "Successfully Installed torch + cu118."
else
echo "Could not install torch + cu117." >&2
echo "Could not install torch + cu118." >&2
fi
fi

View File

@@ -0,0 +1,18 @@
# SHARK LLaMA
## TORCH-MLIR Version
```
https://github.com/nod-ai/torch-mlir.git
```
Then check out the `complex` branch, run `git submodule update --init`, and build with `.\build_tools\python_deploy\build_windows.ps1`
### Setup & Run
```
git clone https://github.com/nod-ai/llama.git
```
Then in this repository
```
pip install -e .
python llama/shark_model.py
```

View File

@@ -0,0 +1,842 @@
####################################################################################
# Please make sure you have transformers 4.21.2 installed before running this demo
#
# -p --model_path: the directory in which you want to store the bloom files.
# -dl --device_list: the list of device indices you want to use. if you want to only use the first device, or you are running on cpu leave this blank.
# Otherwise, please give this argument in this format: "[0, 1, 2]"
# -de --device: the device you want to run bloom on, e.g. cpu, cuda
# -c, --recompile: set to true if you want to recompile to vmfb.
# -d, --download: set to true if you want to redownload the mlir files
# -cm, --create_mlirs: set to true if you want to create the mlir files from scratch. please make sure you have transformers 4.21.2 before using this option
# -t --token_count: the number of tokens you want to generate
# -pr --prompt: the prompt you want to feed to the model
# -m --model_name: the name of the model, e.g. bloom-560m
#
# If you don't specify a prompt when you run this example, you will be able to give prompts through the terminal. Run the
# example in this way if you want to run multiple examples without reinitializing the model
#####################################################################################
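# Example invocation (a sketch; the script filename, paths and devices are
# placeholders to adapt to your setup):
#   python bloom_demo.py -p ./bloom-560m-files -m bloom-560m -de cpu -t 10 \
#       -pr "The weather today is"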
import os
import io
import torch
import torch.nn as nn
from collections import OrderedDict
import torch_mlir
from torch_mlir import TensorPlaceholder
import re
from transformers.models.bloom.configuration_bloom import BloomConfig
import json
import sys
import argparse
import urllib.request
import subprocess
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_public_file
from transformers import (
BloomTokenizerFast,
BloomForSequenceClassification,
BloomForCausalLM,
)
from transformers.models.bloom.modeling_bloom import (
BloomBlock,
build_alibi_tensor,
)
IS_CUDA = False
class ShardedBloom:
def __init__(self, src_folder):
f = open(f"{src_folder}/config.json")
config = json.load(f)
f.close()
self.layers_initialized = False
self.src_folder = src_folder
try:
self.n_embed = config["n_embed"]
except KeyError:
self.n_embed = config["hidden_size"]
self.vocab_size = config["vocab_size"]
self.n_layer = config["n_layer"]
try:
self.n_head = config["num_attention_heads"]
except KeyError:
self.n_head = config["n_head"]
def _init_layer(self, layer_name, device, replace, device_idx):
if replace or not os.path.exists(
f"{self.src_folder}/{layer_name}.vmfb"
):
f_ = open(f"{self.src_folder}/{layer_name}.mlir", encoding="utf-8")
module = f_.read()
f_.close()
module = bytes(module, "utf-8")
shark_module = SharkInference(
module,
device=device,
mlir_dialect="tm_tensor",
device_idx=device_idx,
)
shark_module.save_module(
module_name=f"{self.src_folder}/{layer_name}",
extra_args=[
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-stream-resource-max-allocation-size=1000000000",
"--iree-codegen-check-ir-before-llvm-conversion=false",
],
)
else:
shark_module = SharkInference(
"",
device=device,
mlir_dialect="tm_tensor",
device_idx=device_idx,
)
return shark_module
def init_layers(self, device, replace=False, device_idx=[0]):
if device_idx is not None:
n_devices = len(device_idx)
self.word_embeddings_module = self._init_layer(
"word_embeddings",
device,
replace,
device_idx if device_idx is None else device_idx[0 % n_devices],
)
self.word_embeddings_layernorm_module = self._init_layer(
"word_embeddings_layernorm",
device,
replace,
device_idx if device_idx is None else device_idx[1 % n_devices],
)
self.ln_f_module = self._init_layer(
"ln_f",
device,
replace,
device_idx if device_idx is None else device_idx[2 % n_devices],
)
self.lm_head_module = self._init_layer(
"lm_head",
device,
replace,
device_idx if device_idx is None else device_idx[3 % n_devices],
)
self.block_modules = [
self._init_layer(
f"bloom_block_{i}",
device,
replace,
device_idx
if device_idx is None
else device_idx[(i + 4) % n_devices],
)
for i in range(self.n_layer)
]
self.layers_initialized = True
def load_layers(self):
assert self.layers_initialized
self.word_embeddings_module.load_module(
f"{self.src_folder}/word_embeddings.vmfb"
)
self.word_embeddings_layernorm_module.load_module(
f"{self.src_folder}/word_embeddings_layernorm.vmfb"
)
for block_module, i in zip(self.block_modules, range(self.n_layer)):
block_module.load_module(f"{self.src_folder}/bloom_block_{i}.vmfb")
self.ln_f_module.load_module(f"{self.src_folder}/ln_f.vmfb")
self.lm_head_module.load_module(f"{self.src_folder}/lm_head.vmfb")
def forward_pass(self, input_ids, device):
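# Each shard may have been placed on a different device; when running on
# CUDA, switch to that shard's device before invoking its module.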
if IS_CUDA:
cudaSetDevice(self.word_embeddings_module.device_idx)
input_embeds = self.word_embeddings_module(
inputs=(input_ids,), function_name="forward"
)
input_embeds = torch.tensor(input_embeds).float()
if IS_CUDA:
cudaSetDevice(self.word_embeddings_layernorm_module.device_idx)
hidden_states = self.word_embeddings_layernorm_module(
inputs=(input_embeds,), function_name="forward"
)
hidden_states = torch.tensor(hidden_states).float()
attention_mask = torch.ones(
[hidden_states.shape[0], len(input_ids[0])]
)
alibi = build_alibi_tensor(
attention_mask,
self.n_head,
hidden_states.dtype,
hidden_states.device,
)
causal_mask = _prepare_attn_mask(
attention_mask, input_ids.size(), input_embeds, 0
)
causal_mask = torch.tensor(causal_mask).float()
presents = ()
all_hidden_states = tuple(hidden_states)
for block_module, i in zip(self.block_modules, range(self.n_layer)):
if IS_CUDA:
cudaSetDevice(block_module.device_idx)
output = block_module(
inputs=(
hidden_states.detach().numpy(),
alibi.detach().numpy(),
causal_mask.detach().numpy(),
),
function_name="forward",
)
hidden_states = torch.tensor(output[0]).float()
all_hidden_states = all_hidden_states + (hidden_states,)
presents = presents + (
tuple(
(
output[1],
output[2],
)
),
)
if IS_CUDA:
cudaSetDevice(self.ln_f_module.device_idx)
hidden_states = self.ln_f_module(
inputs=(hidden_states,), function_name="forward"
)
if IS_CUDA:
cudaSetDevice(self.lm_head_module.device_idx)
logits = self.lm_head_module(
inputs=(hidden_states,), function_name="forward"
)
logits = torch.tensor(logits).float()
return torch.argmax(logits[:, -1, :], dim=-1)
def _make_causal_mask(
input_ids_shape: torch.Size,
dtype: torch.dtype,
past_key_values_length: int = 0,
):
"""
Make causal mask used for bi-directional self-attention.
"""
batch_size, target_length = input_ids_shape
mask = torch.full((target_length, target_length), torch.finfo(dtype).min)
mask_cond = torch.arange(mask.size(-1))
intermediate_mask = mask_cond < (mask_cond + 1).view(mask.size(-1), 1)
mask.masked_fill_(intermediate_mask, 0)
mask = mask.to(dtype)
if past_key_values_length > 0:
mask = torch.cat(
[
torch.zeros(
target_length, past_key_values_length, dtype=dtype
),
mask,
],
dim=-1,
)
expanded_mask = mask[None, None, :, :].expand(
batch_size, 1, target_length, target_length + past_key_values_length
)
return expanded_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
batch_size, source_length = mask.size()
tgt_len = tgt_len if tgt_len is not None else source_length
expanded_mask = (
mask[:, None, None, :]
.expand(batch_size, 1, tgt_len, source_length)
.to(dtype)
)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(
inverted_mask.to(torch.bool), torch.finfo(dtype).min
)
def _prepare_attn_mask(
attention_mask, input_shape, inputs_embeds, past_key_values_length
):
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = None
if input_shape[-1] > 1:
combined_attention_mask = _make_causal_mask(
input_shape,
inputs_embeds.dtype,
past_key_values_length=past_key_values_length,
).to(attention_mask.device)
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
expanded_attn_mask = _expand_mask(
attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
)
combined_attention_mask = (
expanded_attn_mask
if combined_attention_mask is None
else expanded_attn_mask + combined_attention_mask
)
return combined_attention_mask
def download_model(destination_folder, model_name):
download_public_file(
f"gs://shark_tank/sharded_bloom/{model_name}/", destination_folder
)
def compile_embeddings(embeddings_layer, input_ids, path):
input_ids_placeholder = torch_mlir.TensorPlaceholder.like(
input_ids, dynamic_axes=[1]
)
module = torch_mlir.compile(
embeddings_layer,
(input_ids_placeholder),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
bytecode_stream = io.BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(path, "w+")
f_.write(str(module))
f_.close()
return
def compile_word_embeddings_layernorm(
embeddings_layer_layernorm, embeds, path
):
embeds_placeholder = torch_mlir.TensorPlaceholder.like(
embeds, dynamic_axes=[1]
)
module = torch_mlir.compile(
embeddings_layer_layernorm,
(embeds_placeholder),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
bytecode_stream = io.BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(path, "w+")
f_.write(str(module))
f_.close()
return
def strip_overloads(gm):
"""
Modifies the target of graph nodes in :attr:`gm` to strip overloads.
Args:
gm(fx.GraphModule): The input Fx graph module to be modified
"""
for node in gm.graph.nodes:
if isinstance(node.target, torch._ops.OpOverload):
node.target = node.target.overloadpacket
gm.recompile()
def compile_to_mlir(
bblock,
hidden_states,
layer_past=None,
attention_mask=None,
head_mask=None,
use_cache=None,
output_attentions=False,
alibi=None,
block_index=0,
path=".",
):
fx_g = make_fx(
bblock,
decomposition_table=get_decompositions(
[
torch.ops.aten.split.Tensor,
torch.ops.aten.split_with_sizes,
]
),
tracing_mode="real",
_allow_non_fake_inputs=False,
)(hidden_states, alibi, attention_mask)
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()
strip_overloads(fx_g)
hidden_states_placeholder = TensorPlaceholder.like(
hidden_states, dynamic_axes=[1]
)
attention_mask_placeholder = TensorPlaceholder.like(
attention_mask, dynamic_axes=[2, 3]
)
alibi_placeholder = TensorPlaceholder.like(alibi, dynamic_axes=[2])
ts_g = torch.jit.script(fx_g)
module = torch_mlir.compile(
ts_g,
(
hidden_states_placeholder,
alibi_placeholder,
attention_mask_placeholder,
),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
module_placeholder = module
module_context = module_placeholder.context
def check_valid_line(line, line_n, mlir_file_len):
if "private" in line:
return False
if "attributes" in line:
return False
if mlir_file_len - line_n == 2:
return False
return True
mlir_file_len = len(str(module).split("\n"))
def remove_constant_dim(line):
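# The sample input uses a fixed sequence length of 17; rewrite those static
# dims (and the matching tensor.empty / constant uses) to dynamic ones so
# the compiled block accepts arbitrary prompt lengths.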
if "17x" in line:
line = re.sub("17x", "?x", line)
line = re.sub("tensor.empty\(\)", "tensor.empty(%dim)", line)
if "tensor.empty" in line and "?x?" in line:
line = re.sub(
"tensor.empty\(%dim\)", "tensor.empty(%dim, %dim)", line
)
if "arith.cmpi eq" in line:
line = re.sub("c17", "dim", line)
if " 17," in line:
line = re.sub(" 17,", " %dim,", line)
return line
module = "\n".join(
[
remove_constant_dim(line)
for line, line_n in zip(
str(module).split("\n"), range(mlir_file_len)
)
if check_valid_line(line, line_n, mlir_file_len)
]
)
module = module_placeholder.parse(module, context=module_context)
bytecode_stream = io.BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(path, "w+")
f_.write(str(module))
f_.close()
return
def compile_ln_f(ln_f, hidden_layers, path):
hidden_layers_placeholder = torch_mlir.TensorPlaceholder.like(
hidden_layers, dynamic_axes=[1]
)
module = torch_mlir.compile(
ln_f,
(hidden_layers_placeholder),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
bytecode_stream = io.BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(path, "w+")
f_.write(str(module))
f_.close()
return
def compile_lm_head(lm_head, hidden_layers, path):
hidden_layers_placeholder = torch_mlir.TensorPlaceholder.like(
hidden_layers, dynamic_axes=[1]
)
module = torch_mlir.compile(
lm_head,
(hidden_layers_placeholder),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
bytecode_stream = io.BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(path, "w+")
f_.write(str(module))
f_.close()
return
def create_mlirs(destination_folder, model_name):
model_config = "bigscience/" + model_name
sample_input_ids = torch.ones([1, 17], dtype=torch.int64)
urllib.request.urlretrieve(
f"https://huggingface.co/bigscience/{model_name}/resolve/main/config.json",
filename=f"{destination_folder}/config.json",
)
urllib.request.urlretrieve(
f"https://huggingface.co/bigscience/bloom/resolve/main/tokenizer.json",
filename=f"{destination_folder}/tokenizer.json",
)
class HuggingFaceLanguage(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = BloomForCausalLM.from_pretrained(model_config)
def forward(self, tokens):
return self.model.forward(tokens)[0]
class HuggingFaceBlock(torch.nn.Module):
def __init__(self, block):
super().__init__()
self.model = block
def forward(self, tokens, alibi, attention_mask):
output = self.model(
hidden_states=tokens,
alibi=alibi,
attention_mask=attention_mask,
use_cache=True,
output_attentions=False,
)
return (output[0], output[1][0], output[1][1])
model = HuggingFaceLanguage()
compile_embeddings(
model.model.transformer.word_embeddings,
sample_input_ids,
f"{destination_folder}/word_embeddings.mlir",
)
inputs_embeds = model.model.transformer.word_embeddings(sample_input_ids)
compile_word_embeddings_layernorm(
model.model.transformer.word_embeddings_layernorm,
inputs_embeds,
f"{destination_folder}/word_embeddings_layernorm.mlir",
)
hidden_states = model.model.transformer.word_embeddings_layernorm(
inputs_embeds
)
input_shape = sample_input_ids.size()
current_sequence_length = hidden_states.shape[1]
past_key_values_length = 0
past_key_values = tuple([None] * len(model.model.transformer.h))
attention_mask = torch.ones(
(hidden_states.shape[0], current_sequence_length), device="cpu"
)
alibi = build_alibi_tensor(
attention_mask,
model.model.transformer.n_head,
hidden_states.dtype,
"cpu",
)
causal_mask = _prepare_attn_mask(
attention_mask, input_shape, inputs_embeds, past_key_values_length
)
head_mask = model.model.transformer.get_head_mask(
None, model.model.transformer.config.n_layer
)
output_attentions = model.model.transformer.config.output_attentions
all_hidden_states = ()
for i, (block, layer_past) in enumerate(
zip(model.model.transformer.h, past_key_values)
):
all_hidden_states = all_hidden_states + (hidden_states,)
proxy_model = HuggingFaceBlock(block)
compile_to_mlir(
proxy_model,
hidden_states,
layer_past=layer_past,
attention_mask=causal_mask,
head_mask=head_mask[i],
use_cache=True,
output_attentions=output_attentions,
alibi=alibi,
block_index=i,
path=f"{destination_folder}/bloom_block_{i}.mlir",
)
compile_ln_f(
model.model.transformer.ln_f,
hidden_states,
f"{destination_folder}/ln_f.mlir",
)
hidden_states = model.model.transformer.ln_f(hidden_states)
compile_lm_head(
model.model.lm_head,
hidden_states,
f"{destination_folder}/lm_head.mlir",
)
def run_large_model(
token_count,
recompile,
model_path,
prompt,
device_list,
script_path,
device,
):
f = open(f"{model_path}/prompt.txt", "w+")
f.write(prompt)
f.close()
for i in range(token_count):
if i == 0:
will_compile = recompile
else:
will_compile = False
f = open(f"{model_path}/prompt.txt", "r")
prompt = f.read()
f.close()
subprocess.run(
[
"python",
script_path,
model_path,
"start",
str(will_compile),
"cpu",
"None",
prompt,
]
)
for i in range(config["n_layer"]):
if device_list is not None:
device_idx = str(device_list[i % len(device_list)])
else:
device_idx = "None"
subprocess.run(
[
"python",
script_path,
model_path,
str(i),
str(will_compile),
device,
device_idx,
prompt,
]
)
subprocess.run(
[
"python",
script_path,
model_path,
"end",
str(will_compile),
"cpu",
"None",
prompt,
]
)
f = open(f"{model_path}/prompt.txt", "r")
output = f.read()
f.close()
print(output)
if __name__ == "__main__":
parser = argparse.ArgumentParser(prog="Bloom-560m")
parser.add_argument("-p", "--model_path")
parser.add_argument("-dl", "--device_list", default=None)
parser.add_argument("-de", "--device", default="cpu")
parser.add_argument("-c", "--recompile", default=False, type=bool)
parser.add_argument("-d", "--download", default=False, type=bool)
parser.add_argument("-t", "--token_count", default=10, type=int)
parser.add_argument("-m", "--model_name", default="bloom-560m")
parser.add_argument("-cm", "--create_mlirs", default=False, type=bool)
parser.add_argument(
"-lm", "--large_model_memory_efficient", default=False, type=bool
)
parser.add_argument(
"-pr",
"--prompt",
default=None,
)
args = parser.parse_args()
if args.create_mlirs and args.large_model_memory_efficient:
print(
"Warning: If you need to use memory efficient mode, you probably want to use 'download' instead"
)
if not os.path.isdir(args.model_path):
os.mkdir(args.model_path)
if args.device_list is not None:
args.device_list = json.loads(args.device_list)
if args.device == "cuda" and args.device_list is not None:
IS_CUDA = True
from cuda.cudart import cudaSetDevice
if args.download and args.create_mlirs:
print(
"WARNING: It is not advised to turn on both download and create_mlirs"
)
if args.download:
download_model(args.model_path, args.model_name)
if args.create_mlirs:
create_mlirs(args.model_path, args.model_name)
from transformers import AutoTokenizer, AutoModelForCausalLM, BloomConfig
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
if args.prompt is not None:
input_ids = tokenizer.encode(args.prompt, return_tensors="pt")
if args.large_model_memory_efficient:
f = open(f"{args.model_path}/config.json")
config = json.load(f)
f.close()
self_path = os.path.dirname(os.path.abspath(__file__))
script_path = os.path.join(self_path, "sharded_bloom_large_models.py")
if args.prompt is not None:
run_large_model(
args.token_count,
args.recompile,
args.model_path,
args.prompt,
args.device_list,
script_path,
args.device,
)
else:
while True:
prompt = input("Enter Prompt: ")
try:
token_count = int(
input("Enter number of tokens you want to generate: ")
)
except ValueError:
print(
"Invalid integer entered. Using default value of 10"
)
token_count = 10
run_large_model(
token_count,
args.recompile,
args.model_path,
prompt,
args.device_list,
script_path,
args.device,
)
else:
shardedbloom = ShardedBloom(args.model_path)
shardedbloom.init_layers(
device=args.device,
replace=args.recompile,
device_idx=args.device_list,
)
shardedbloom.load_layers()
if args.prompt is not None:
for _ in range(args.token_count):
next_token = shardedbloom.forward_pass(
torch.tensor(input_ids), device=args.device
)
input_ids = torch.cat(
[input_ids, next_token.unsqueeze(-1)], dim=-1
)
print(tokenizer.decode(input_ids.squeeze()))
else:
while True:
prompt = input("Enter Prompt: ")
try:
token_count = int(
input("Enter number of tokens you want to generate: ")
)
except ValueError:
print(
"Invalid integer entered. Using default value of 10"
)
token_count = 10
input_ids = tokenizer.encode(prompt, return_tensors="pt")
for _ in range(token_count):
next_token = shardedbloom.forward_pass(
torch.tensor(input_ids), device=args.device
)
input_ids = torch.cat(
[input_ids, next_token.unsqueeze(-1)], dim=-1
)
print(tokenizer.decode(input_ids.squeeze()))

View File

@@ -0,0 +1,381 @@
import sys
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BloomConfig
import re
from shark.shark_inference import SharkInference
import torch
import torch.nn as nn
from collections import OrderedDict
from transformers.models.bloom.modeling_bloom import (
BloomBlock,
build_alibi_tensor,
)
import time
import json
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
batch_size, source_length = mask.size()
tgt_len = tgt_len if tgt_len is not None else source_length
expanded_mask = (
mask[:, None, None, :]
.expand(batch_size, 1, tgt_len, source_length)
.to(dtype)
)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(
inverted_mask.to(torch.bool), torch.finfo(dtype).min
)
def _prepare_attn_mask(
attention_mask, input_shape, inputs_embeds, past_key_values_length
):
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = None
if input_shape[-1] > 1:
combined_attention_mask = _make_causal_mask(
input_shape,
inputs_embeds.dtype,
past_key_values_length=past_key_values_length,
).to(attention_mask.device)
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
expanded_attn_mask = _expand_mask(
attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
)
combined_attention_mask = (
expanded_attn_mask
if combined_attention_mask is None
else expanded_attn_mask + combined_attention_mask
)
return combined_attention_mask
def _make_causal_mask(
input_ids_shape: torch.Size,
dtype: torch.dtype,
past_key_values_length: int = 0,
):
"""
Make causal mask used for bi-directional self-attention.
"""
batch_size, target_length = input_ids_shape
mask = torch.full((target_length, target_length), torch.finfo(dtype).min)
mask_cond = torch.arange(mask.size(-1))
intermediate_mask = mask_cond < (mask_cond + 1).view(mask.size(-1), 1)
mask.masked_fill_(intermediate_mask, 0)
mask = mask.to(dtype)
if past_key_values_length > 0:
mask = torch.cat(
[
torch.zeros(
target_length, past_key_values_length, dtype=dtype
),
mask,
],
dim=-1,
)
expanded_mask = mask[None, None, :, :].expand(
batch_size, 1, target_length, target_length + past_key_values_length
)
return expanded_mask
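# Example per-layer invocation (a sketch; the argv order follows the parsing
# below: working_dir, layer_name, will_compile, device, device_idx, prompt):
#   python sharded_bloom_large_models.py ./bloom-560m-files start True cpu None "Hello world"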
if __name__ == "__main__":
working_dir = sys.argv[1]
layer_name = sys.argv[2]
will_compile = sys.argv[3]
device = sys.argv[4]
device_idx = sys.argv[5]
prompt = sys.argv[6]
if device_idx.lower().strip() == "none":
device_idx = None
else:
device_idx = int(device_idx)
if will_compile.lower().strip() == "true":
will_compile = True
else:
will_compile = False
f = open(f"{working_dir}/config.json")
config = json.load(f)
f.close()
layers_initialized = False
try:
n_embed = config["n_embed"]
except KeyError:
n_embed = config["hidden_size"]
vocab_size = config["vocab_size"]
n_layer = config["n_layer"]
try:
n_head = config["num_attention_heads"]
except KeyError:
n_head = config["n_head"]
if not os.path.isdir(working_dir):
os.mkdir(working_dir)
if layer_name == "start":
tokenizer = AutoTokenizer.from_pretrained(working_dir)
input_ids = tokenizer.encode(prompt, return_tensors="pt")
mlir_str = ""
if will_compile:
f = open(f"{working_dir}/word_embeddings.mlir", encoding="utf-8")
mlir_str = f.read()
f.close()
mlir_str = bytes(mlir_str, "utf-8")
shark_module = SharkInference(
mlir_str,
device="cpu",
mlir_dialect="tm_tensor",
device_idx=None,
)
if will_compile:
shark_module.save_module(
module_name=f"{working_dir}/word_embeddings",
extra_args=[
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-stream-resource-max-allocation-size=1000000000",
"--iree-codegen-check-ir-before-llvm-conversion=false",
],
)
shark_module.load_module(f"{working_dir}/word_embeddings.vmfb")
input_embeds = shark_module(
inputs=(input_ids,), function_name="forward"
)
input_embeds = torch.tensor(input_embeds).float()
mlir_str = ""
if will_compile:
f = open(
f"{working_dir}/word_embeddings_layernorm.mlir",
encoding="utf-8",
)
mlir_str = f.read()
f.close()
shark_module = SharkInference(
mlir_str,
device="cpu",
mlir_dialect="tm_tensor",
device_idx=None,
)
if will_compile:
shark_module.save_module(
module_name=f"{working_dir}/word_embeddings_layernorm",
extra_args=[
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-stream-resource-max-allocation-size=1000000000",
"--iree-codegen-check-ir-before-llvm-conversion=false",
],
)
shark_module.load_module(
f"{working_dir}/word_embeddings_layernorm.vmfb"
)
hidden_states = shark_module(
inputs=(input_embeds,), function_name="forward"
)
hidden_states = torch.tensor(hidden_states).float()
torch.save(hidden_states, f"{working_dir}/hidden_states_0.pt")
attention_mask = torch.ones(
[hidden_states.shape[0], len(input_ids[0])]
)
attention_mask = torch.tensor(attention_mask).float()
alibi = build_alibi_tensor(
attention_mask,
n_head,
hidden_states.dtype,
device="cpu",
)
torch.save(alibi, f"{working_dir}/alibi.pt")
causal_mask = _prepare_attn_mask(
attention_mask, input_ids.size(), input_embeds, 0
)
causal_mask = torch.tensor(causal_mask).float()
torch.save(causal_mask, f"{working_dir}/causal_mask.pt")
elif layer_name in [str(x) for x in range(n_layer)]:
hidden_states = torch.load(
f"{working_dir}/hidden_states_{layer_name}.pt"
)
alibi = torch.load(f"{working_dir}/alibi.pt")
causal_mask = torch.load(f"{working_dir}/causal_mask.pt")
mlir_str = ""
if will_compile:
f = open(
f"{working_dir}/bloom_block_{layer_name}.mlir",
encoding="utf-8",
)
mlir_str = f.read()
f.close()
mlir_str = bytes(mlir_str, "utf-8")
shark_module = SharkInference(
mlir_str,
device=device,
mlir_dialect="tm_tensor",
device_idx=device_idx,
)
if will_compile:
shark_module.save_module(
module_name=f"{working_dir}/bloom_block_{layer_name}",
extra_args=[
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-stream-resource-max-allocation-size=1000000000",
"--iree-codegen-check-ir-before-llvm-conversion=false",
],
)
shark_module.load_module(
f"{working_dir}/bloom_block_{layer_name}.vmfb"
)
output = shark_module(
inputs=(
hidden_states.detach().numpy(),
alibi.detach().numpy(),
causal_mask.detach().numpy(),
),
function_name="forward",
)
hidden_states = torch.tensor(output[0]).float()
torch.save(
hidden_states,
f"{working_dir}/hidden_states_{int(layer_name) + 1}.pt",
)
elif layer_name == "end":
mlir_str = ""
if will_compile:
f = open(f"{working_dir}/ln_f.mlir", encoding="utf-8")
mlir_str = f.read()
f.close()
mlir_str = bytes(mlir_str, "utf-8")
shark_module = SharkInference(
mlir_str,
device="cpu",
mlir_dialect="tm_tensor",
device_idx=None,
)
if will_compile:
shark_module.save_module(
module_name=f"{working_dir}/ln_f",
extra_args=[
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-stream-resource-max-allocation-size=1000000000",
"--iree-codegen-check-ir-before-llvm-conversion=false",
],
)
shark_module.load_module(f"{working_dir}/ln_f.vmfb")
hidden_states = torch.load(f"{working_dir}/hidden_states_{n_layer}.pt")
hidden_states = shark_module(
inputs=(hidden_states,), function_name="forward"
)
mlir_str = ""
if will_compile:
f = open(f"{working_dir}/lm_head.mlir", encoding="utf-8")
mlir_str = f.read()
f.close()
mlir_str = bytes(mlir_str, "utf-8")
if config["n_embed"] == 14336:
def get_state_dict():
d = torch.load(
f"{working_dir}/pytorch_model_00001-of-00072.bin"
)
return OrderedDict(
(k.replace("word_embeddings.", ""), v)
for k, v in d.items()
)
def load_causal_lm_head():
linear = nn.utils.skip_init(
nn.Linear, 14336, 250880, bias=False, dtype=torch.float
)
linear.load_state_dict(get_state_dict(), strict=False)
return linear.float()
lm_head = load_causal_lm_head()
logits = lm_head(torch.tensor(hidden_states).float())
else:
shark_module = SharkInference(
mlir_str,
device="cpu",
mlir_dialect="tm_tensor",
device_idx=None,
)
if will_compile:
shark_module.save_module(
module_name=f"{working_dir}/lm_head",
extra_args=[
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-stream-resource-max-allocation-size=1000000000",
"--iree-codegen-check-ir-before-llvm-conversion=false",
],
)
shark_module.load_module(f"{working_dir}/lm_head.vmfb")
logits = shark_module(
inputs=(hidden_states,), function_name="forward"
)
logits = torch.tensor(logits).float()
tokenizer = AutoTokenizer.from_pretrained(working_dir)
next_token = tokenizer.decode(torch.argmax(logits[:, -1, :], dim=-1))
f = open(f"{working_dir}/prompt.txt", "w+")
f.write(prompt + next_token)
f.close()

View File

@@ -0,0 +1,43 @@
# Stable Diffusion Fine Tuning
## Installation (Linux)
### Activate shark.venv Virtual Environment
```shell
source shark.venv/bin/activate
# Some older pip installs may not be able to handle the recent PyTorch deps
python -m pip install --upgrade pip
```
## Install dependencies
### Run the following installation commands:
```
pip install -U git+https://github.com/huggingface/diffusers.git
pip install accelerate transformers ftfy
```
### Build torch-mlir with the following branch:
Please cherry-pick this branch of torch-mlir: https://github.com/vivekkhandelwal1/torch-mlir/tree/sd-ops
and build it locally. You can find the instructions for using a locally built Torch-MLIR
here: https://github.com/nod-ai/SHARK#how-to-use-your-locally-built-iree--torch-mlir-with-shark
## Run the Stable Diffusion fine tuning
To run the model with the default set of images and params, run:
```shell
python stable_diffusion_fine_tuning.py
```
By default the training is run through the PyTorch path. If you want to train the model using the TorchDynamo path of Torch-MLIR, you need to specify `--use_torchdynamo=True`.
The default number of training steps is `2000`, which can take many hours to complete depending on your system config. You can pass a smaller value with the `--training_steps` arg. You can specify the number of images to be sampled for the result with the `--num_inference_samples` arg, and the number of inference steps with the `--inference_steps` flag.
For example, you can run the training for a limited number of steps via the dynamo path with the following command:
```
python stable_diffusion_fine_tuning.py --training_steps=1 --inference_steps=1 --num_inference_samples=1 --train_batch_size=1 --use_torchdynamo=True
```
You can also specify the device to be used via the flag `--device`. The default value is `cpu`, for GPU execution you can specify `--device="cuda"`.

View File

@@ -0,0 +1,914 @@
# Install the required libs
# pip install -U git+https://github.com/huggingface/diffusers.git
# pip install accelerate transformers ftfy
# Import required libraries
import argparse
import itertools
import math
import os
from typing import List
import random
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.utils.data import Dataset
import PIL
import logging
import torch_mlir
from torch_mlir.dynamo import make_simple_dynamo_backend
import torch._dynamo as dynamo
from torch.fx.experimental.proxy_tensor import make_fx
from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
from shark.shark_inference import SharkInference
torch._dynamo.config.verbose = True
from diffusers import (
AutoencoderKL,
DDPMScheduler,
PNDMScheduler,
StableDiffusionPipeline,
UNet2DConditionModel,
)
from diffusers.optimization import get_scheduler
from diffusers.pipelines.stable_diffusion import (
StableDiffusionSafetyChecker,
)
from PIL import Image
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import (
CLIPFeatureExtractor,
CLIPTextModel,
CLIPTokenizer,
)
# Enter your HuggingFace Token
# Note: You can comment out this prompt and set your token directly instead of passing it through the CLI on every execution.
hf_token = input("Please enter your huggingface token here: ")
YOUR_TOKEN = hf_token
def image_grid(imgs, rows, cols):
assert len(imgs) == rows * cols
w, h = imgs[0].size
grid = Image.new("RGB", size=(cols * w, rows * h))
grid_w, grid_h = grid.size
for i, img in enumerate(imgs):
grid.paste(img, box=(i % cols * w, i // cols * h))
return grid
# `pretrained_model_name_or_path`: which Stable Diffusion checkpoint you want to use
# Options: 1.) "stabilityai/stable-diffusion-2"
# 2.) "stabilityai/stable-diffusion-2-base"
# 3.) "CompVis/stable-diffusion-v1-4"
# 4.) "runwayml/stable-diffusion-v1-5"
pretrained_model_name_or_path = "stabilityai/stable-diffusion-2"
# Add here the URLs to the images of the concept you are adding. 3-5 should be fine
urls = [
"https://huggingface.co/datasets/valhalla/images/resolve/main/2.jpeg",
"https://huggingface.co/datasets/valhalla/images/resolve/main/3.jpeg",
"https://huggingface.co/datasets/valhalla/images/resolve/main/5.jpeg",
"https://huggingface.co/datasets/valhalla/images/resolve/main/6.jpeg",
## You can add additional images here
]
# Downloading Images
import requests
import glob
from io import BytesIO
def download_image(url):
try:
response = requests.get(url)
except:
return None
return Image.open(BytesIO(response.content)).convert("RGB")
images = list(filter(None, [download_image(url) for url in urls]))
save_path = "./my_concept"
if not os.path.exists(save_path):
os.mkdir(save_path)
[image.save(f"{save_path}/{i}.jpeg") for i, image in enumerate(images)]
p = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
p.add_argument(
"--input_dir",
type=str,
default="my_concept/",
help="the directory contains the images used for fine tuning",
)
p.add_argument(
"--output_dir",
type=str,
default="sd_result",
help="the directory contains the images used for fine tuning",
)
p.add_argument(
"--training_steps",
type=int,
default=2000,
help="the maximum number of training steps",
)
p.add_argument(
"--train_batch_size",
type=int,
default=4,
help="The batch size for training",
)
p.add_argument(
"--save_steps",
type=int,
default=250,
help="the number of steps after which to save the learned concept",
)
p.add_argument("--seed", type=int, default=42, help="the random seed")
p.add_argument(
"--what_to_teach",
type=str,
choices=["object", "style"],
default="object",
help="what is it that you are teaching?",
)
p.add_argument(
"--placeholder_token",
type=str,
default="<cat-toy>",
help="It is the token you are going to use to represent your new concept",
)
p.add_argument(
"--initializer_token",
type=str,
default="toy",
help="It is a word that can summarise what is your new concept",
)
p.add_argument(
"--inference_steps",
type=int,
default=50,
help="the number of steps for inference",
)
p.add_argument(
"--num_inference_samples",
type=int,
default=4,
help="the number of samples for inference",
)
p.add_argument(
"--prompt",
type=str,
default="a grafitti in a wall with a *s on it",
help="the text prompt to use",
)
p.add_argument(
"--device",
type=str,
default="cpu",
help="The device to use",
)
p.add_argument(
"--use_torchdynamo",
type=bool,
default=False,
help="This flag is used to determine whether the training has to be done through the torchdynamo path or not.",
)
args = p.parse_args()
torch.manual_seed(args.seed)
if "*s" not in args.prompt:
raise ValueError(
f'The prompt should have a "*s" which will be replaced by a placeholder token.'
)
prompt1, prompt2 = args.prompt.split("*s")
args.prompt = prompt1 + args.placeholder_token + prompt2
# `images_path` is a path to the directory containing the training images.
images_path = args.input_dir
while not os.path.exists(str(images_path)):
print(
"The images_path specified does not exist, use the colab file explorer to copy the path :"
)
images_path = input("")
save_path = images_path
# Setup and check the images you have just added
images = []
for file_path in os.listdir(save_path):
try:
image_path = os.path.join(save_path, file_path)
images.append(Image.open(image_path).resize((512, 512)))
except:
print(
f"{image_path} is not a valid image, please make sure to remove this file from the directory otherwise the training could fail."
)
image_grid(images, 1, len(images))
########### Create Dataset ##########
# Setup the prompt templates for training
imagenet_templates_small = [
"a photo of a {}",
"a rendering of a {}",
"a cropped photo of the {}",
"the photo of a {}",
"a photo of a clean {}",
"a photo of a dirty {}",
"a dark photo of the {}",
"a photo of my {}",
"a photo of the cool {}",
"a close-up photo of a {}",
"a bright photo of the {}",
"a cropped photo of a {}",
"a photo of the {}",
"a good photo of the {}",
"a photo of one {}",
"a close-up photo of the {}",
"a rendition of the {}",
"a photo of the clean {}",
"a rendition of a {}",
"a photo of a nice {}",
"a good photo of a {}",
"a photo of the nice {}",
"a photo of the small {}",
"a photo of the weird {}",
"a photo of the large {}",
"a photo of a cool {}",
"a photo of a small {}",
]
imagenet_style_templates_small = [
"a painting in the style of {}",
"a rendering in the style of {}",
"a cropped painting in the style of {}",
"the painting in the style of {}",
"a clean painting in the style of {}",
"a dirty painting in the style of {}",
"a dark painting in the style of {}",
"a picture in the style of {}",
"a cool painting in the style of {}",
"a close-up painting in the style of {}",
"a bright painting in the style of {}",
"a cropped painting in the style of {}",
"a good painting in the style of {}",
"a close-up painting in the style of {}",
"a rendition in the style of {}",
"a nice painting in the style of {}",
"a small painting in the style of {}",
"a weird painting in the style of {}",
"a large painting in the style of {}",
]
# Setup the dataset
class TextualInversionDataset(Dataset):
def __init__(
self,
data_root,
tokenizer,
learnable_property="object", # [object, style]
size=512,
repeats=100,
interpolation="bicubic",
flip_p=0.5,
set="train",
placeholder_token="*",
center_crop=False,
):
self.data_root = data_root
self.tokenizer = tokenizer
self.learnable_property = learnable_property
self.size = size
self.placeholder_token = placeholder_token
self.center_crop = center_crop
self.flip_p = flip_p
self.image_paths = [
os.path.join(self.data_root, file_path)
for file_path in os.listdir(self.data_root)
]
self.num_images = len(self.image_paths)
self._length = self.num_images
if set == "train":
self._length = self.num_images * repeats
self.interpolation = {
"linear": PIL.Image.LINEAR,
"bilinear": PIL.Image.BILINEAR,
"bicubic": PIL.Image.BICUBIC,
"lanczos": PIL.Image.LANCZOS,
}[interpolation]
self.templates = (
imagenet_style_templates_small
if learnable_property == "style"
else imagenet_templates_small
)
self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
def __len__(self):
return self._length
def __getitem__(self, i):
example = {}
image = Image.open(self.image_paths[i % self.num_images])
if not image.mode == "RGB":
image = image.convert("RGB")
placeholder_string = self.placeholder_token
text = random.choice(self.templates).format(placeholder_string)
example["input_ids"] = self.tokenizer(
text,
padding="max_length",
truncation=True,
max_length=self.tokenizer.model_max_length,
return_tensors="pt",
).input_ids[0]
# default to score-sde preprocessing
img = np.array(image).astype(np.uint8)
if self.center_crop:
crop = min(img.shape[0], img.shape[1])
(
h,
w,
) = (
img.shape[0],
img.shape[1],
)
img = img[
(h - crop) // 2 : (h + crop) // 2,
(w - crop) // 2 : (w + crop) // 2,
]
image = Image.fromarray(img)
image = image.resize(
(self.size, self.size), resample=self.interpolation
)
image = self.flip_transform(image)
image = np.array(image).astype(np.uint8)
image = (image / 127.5 - 1.0).astype(np.float32)
example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
return example
########## Setting up the model ##########
# Load the tokenizer and add the placeholder token as an additional special token.
tokenizer = CLIPTokenizer.from_pretrained(
pretrained_model_name_or_path,
subfolder="tokenizer",
)
# Add the placeholder token in tokenizer
num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
if num_added_tokens == 0:
raise ValueError(
f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
" `placeholder_token` that is not already in the tokenizer."
)
# Get token ids for our placeholder and initializer token.
# This code block will complain if the initializer string is not a single token
# Convert the initializer_token, placeholder_token to ids
token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
# Check if initializer_token is a single token or a sequence of tokens
if len(token_ids) > 1:
raise ValueError("The initializer token must be a single token.")
initializer_token_id = token_ids[0]
placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
# Load the Stable Diffusion model
# Load models and create wrapper for stable diffusion
# pipeline = StableDiffusionPipeline.from_pretrained(pretrained_model_name_or_path)
# del pipeline
text_encoder = CLIPTextModel.from_pretrained(
pretrained_model_name_or_path, subfolder="text_encoder"
)
vae = AutoencoderKL.from_pretrained(
pretrained_model_name_or_path, subfolder="vae"
)
unet = UNet2DConditionModel.from_pretrained(
pretrained_model_name_or_path, subfolder="unet"
)
# We have added the placeholder_token to the tokenizer, so we resize the token embeddings here;
# this will add a new embedding vector in the token embeddings for our placeholder_token
text_encoder.resize_token_embeddings(len(tokenizer))
# Initialise the newly added placeholder token with the embeddings of the initializer token
token_embeds = text_encoder.get_input_embeddings().weight.data
token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]
# In Textual Inversion we only train the newly added embedding vector,
# so let's freeze the rest of the model parameters here
def freeze_params(params):
for param in params:
param.requires_grad = False
# Freeze vae and unet
freeze_params(vae.parameters())
freeze_params(unet.parameters())
# Freeze all parameters except for the token embeddings in text encoder
params_to_freeze = itertools.chain(
text_encoder.text_model.encoder.parameters(),
text_encoder.text_model.final_layer_norm.parameters(),
text_encoder.text_model.embeddings.position_embedding.parameters(),
)
freeze_params(params_to_freeze)
# Move vae and unet to device
# For the dynamo path default compilation device is `cpu`, since torch-mlir
# supports only that. Therefore, convert to device only for PyTorch path.
if not args.use_torchdynamo:
vae.to(args.device)
unet.to(args.device)
# Keep vae in eval mode as we don't train it
vae.eval()
# Keep unet in train mode to enable gradient checkpointing
unet.train()
class VaeModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.vae = vae
def forward(self, input):
x = self.vae.encode(input, return_dict=False)[0]
return x
class UnetModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.unet = unet
def forward(self, x, y, z):
return self.unet.forward(x, y, z, return_dict=False)[0]
shark_vae = VaeModel()
shark_unet = UnetModel()
####### Creating our training data ########
# Let's create the Dataset and Dataloader
train_dataset = TextualInversionDataset(
data_root=save_path,
tokenizer=tokenizer,
size=vae.sample_size,
placeholder_token=args.placeholder_token,
repeats=100,
learnable_property=args.what_to_teach, # Option selected above between object and style
center_crop=False,
set="train",
)
def create_dataloader(train_batch_size=1):
return torch.utils.data.DataLoader(
train_dataset, batch_size=train_batch_size, shuffle=True
)
# Create noise_scheduler for training
noise_scheduler = DDPMScheduler.from_config(
pretrained_model_name_or_path, subfolder="scheduler"
)
######## Training ###########
# Define hyperparameters for our training. If you are not happy with your results,
# you can tune the `learning_rate` and the `max_train_steps`
# Setting up all training args
hyperparameters = {
"learning_rate": 5e-04,
"scale_lr": True,
"max_train_steps": args.training_steps,
"save_steps": args.save_steps,
"train_batch_size": args.train_batch_size,
"gradient_accumulation_steps": 1,
"gradient_checkpointing": True,
"mixed_precision": "fp16",
"seed": 42,
"output_dir": "sd-concept-output",
}
# creating output directory
cwd = os.getcwd()
out_dir = os.path.join(cwd, hyperparameters["output_dir"])
while not os.path.exists(str(out_dir)):
try:
os.mkdir(out_dir)
except OSError as error:
print("Output directory not created")
###### Torch-MLIR Compilation ######
def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
removed_indexes = []
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, (list, tuple)):
node_arg = list(node_arg)
node_args_len = len(node_arg)
for i in range(node_args_len):
curr_index = node_args_len - (i + 1)
if node_arg[curr_index] is None:
removed_indexes.append(curr_index)
node_arg.pop(curr_index)
node.args = (tuple(node_arg),)
break
if len(removed_indexes) > 0:
fx_g.graph.lint()
fx_g.graph.eliminate_dead_code()
fx_g.recompile()
removed_indexes.sort()
return removed_indexes
def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
"""
Replace tuple with tuple element in functions that return one-element tuples.
Returns true if an unwrapping took place, and false otherwise.
"""
unwrapped_tuple = False
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
if len(node_arg) == 1:
node.args = (node_arg[0],)
unwrapped_tuple = True
break
if unwrapped_tuple:
fx_g.graph.lint()
fx_g.recompile()
return unwrapped_tuple
def _returns_nothing(fx_g: torch.fx.GraphModule) -> bool:
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
return len(node_arg) == 0
return False
def transform_fx(fx_g):
for node in fx_g.graph.nodes:
if node.op == "call_function":
if node.target in [
torch.ops.aten.empty,
]:
# aten.empty should be filled with zeros.
if node.target in [torch.ops.aten.empty]:
with fx_g.graph.inserting_after(node):
new_node = fx_g.graph.call_function(
torch.ops.aten.zero_,
args=(node,),
)
node.append(new_node)
node.replace_all_uses_with(new_node)
new_node.args = (node,)
fx_g.graph.lint()
@make_simple_dynamo_backend
def refbackend_torchdynamo_backend(
fx_graph: torch.fx.GraphModule, example_inputs: List[torch.Tensor]
):
# handle usage of empty tensors that are created without initialization
transform_fx(fx_graph)
fx_graph.recompile()
if _returns_nothing(fx_graph):
return fx_graph
removed_none_indexes = _remove_nones(fx_graph)
was_unwrapped = _unwrap_single_tuple_return(fx_graph)
mlir_module = torch_mlir.compile(
fx_graph, example_inputs, output_type="linalg-on-tensors"
)
bytecode_stream = BytesIO()
mlir_module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
shark_module = SharkInference(
mlir_module=bytecode, device=args.device, mlir_dialect="tm_tensor"
)
shark_module.compile()
def compiled_callable(*inputs):
inputs = [x.numpy() for x in inputs]
result = shark_module("forward", inputs)
if was_unwrapped:
result = [
result,
]
if not isinstance(result, list):
result = torch.from_numpy(result)
else:
result = tuple(torch.from_numpy(x) for x in result)
result = list(result)
for removed_index in removed_none_indexes:
result.insert(removed_index, None)
result = tuple(result)
return result
return compiled_callable
def predictions(torch_func, jit_func, batchA, batchB):
res = jit_func(batchA.numpy(), batchB.numpy())
if res is not None:
prediction = res
else:
prediction = None
return prediction
logger = logging.getLogger(__name__)
# def save_progress(text_encoder, placeholder_token_id, accelerator, save_path):
def save_progress(text_encoder, placeholder_token_id, save_path):
logger.info("Saving embeddings")
learned_embeds = (
# accelerator.unwrap_model(text_encoder)
text_encoder.get_input_embeddings().weight[placeholder_token_id]
)
learned_embeds_dict = {
args.placeholder_token: learned_embeds.detach().cpu()
}
torch.save(learned_embeds_dict, save_path)
train_batch_size = hyperparameters["train_batch_size"]
gradient_accumulation_steps = hyperparameters["gradient_accumulation_steps"]
learning_rate = hyperparameters["learning_rate"]
if hyperparameters["scale_lr"]:
learning_rate = (
learning_rate
* gradient_accumulation_steps
* train_batch_size
# * accelerator.num_processes
)
# Initialize the optimizer
optimizer = torch.optim.AdamW(
text_encoder.get_input_embeddings().parameters(), # only optimize the embeddings
lr=learning_rate,
)
# Training function
def train_func(batch_pixel_values, batch_input_ids):
# Convert images to latent space
latents = shark_vae(batch_pixel_values).sample().detach()
latents = latents * 0.18215
# Sample noise that we'll add to the latents
noise = torch.randn_like(latents)
bsz = latents.shape[0]
# Sample a random timestep for each image
timesteps = torch.randint(
0,
noise_scheduler.num_train_timesteps,
(bsz,),
device=latents.device,
).long()
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
# Get the text embedding for conditioning
encoder_hidden_states = text_encoder(batch_input_ids)[0]
# Predict the noise residual
noise_pred = shark_unet(
noisy_latents,
timesteps,
encoder_hidden_states,
)
# Get the target for loss depending on the prediction type
if noise_scheduler.config.prediction_type == "epsilon":
target = noise
elif noise_scheduler.config.prediction_type == "v_prediction":
target = noise_scheduler.get_velocity(latents, noise, timesteps)
else:
raise ValueError(
f"Unknown prediction type {noise_scheduler.config.prediction_type}"
)
loss = (
F.mse_loss(noise_pred, target, reduction="none").mean([1, 2, 3]).mean()
)
loss.backward()
# Zero out the gradients for all token embeddings except the newly added
# embeddings for the concept, as we only want to optimize the concept embeddings
grads = text_encoder.get_input_embeddings().weight.grad
# Get the index for tokens that we want to zero the grads for
index_grads_to_zero = torch.arange(len(tokenizer)) != placeholder_token_id
grads.data[index_grads_to_zero, :] = grads.data[
index_grads_to_zero, :
].fill_(0)
optimizer.step()
optimizer.zero_grad()
return loss
def training_function():
max_train_steps = hyperparameters["max_train_steps"]
output_dir = hyperparameters["output_dir"]
gradient_checkpointing = hyperparameters["gradient_checkpointing"]
train_dataloader = create_dataloader(train_batch_size)
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
num_update_steps_per_epoch = math.ceil(
len(train_dataloader) / gradient_accumulation_steps
)
num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
# Train!
total_batch_size = (
train_batch_size
* gradient_accumulation_steps
# train_batch_size * accelerator.num_processes * gradient_accumulation_steps
)
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Instantaneous batch size per device = {train_batch_size}")
logger.info(
f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
)
logger.info(
f" Gradient Accumulation steps = {gradient_accumulation_steps}"
)
logger.info(f" Total optimization steps = {max_train_steps}")
# Only show the progress bar once on each machine.
progress_bar = tqdm(
# range(max_train_steps), disable=not accelerator.is_local_main_process
range(max_train_steps)
)
progress_bar.set_description("Steps")
global_step = 0
params_ = [i for i in text_encoder.get_input_embeddings().parameters()]
if args.use_torchdynamo:
print("******** TRAINING STARTED - TORCHYDNAMO PATH ********")
else:
print("******** TRAINING STARTED - PYTORCH PATH ********")
print("Initial weights:")
print(params_, params_[0].shape)
for epoch in range(num_train_epochs):
text_encoder.train()
for step, batch in enumerate(train_dataloader):
if args.use_torchdynamo:
dynamo_callable = dynamo.optimize(
refbackend_torchdynamo_backend
)(train_func)
lam_func = lambda x, y: dynamo_callable(
torch.from_numpy(x), torch.from_numpy(y)
)
loss = predictions(
train_func,
lam_func,
batch["pixel_values"],
batch["input_ids"],
# params[0].detach(),
)
else:
loss = train_func(batch["pixel_values"], batch["input_ids"])
print(loss)
# Update the progress bar and global step after each optimization step
progress_bar.update(1)
global_step += 1
if global_step % hyperparameters["save_steps"] == 0:
save_path = os.path.join(
output_dir,
f"learned_embeds-step-{global_step}.bin",
)
save_progress(
text_encoder,
placeholder_token_id,
save_path,
)
logs = {"loss": loss.detach().item()}
progress_bar.set_postfix(**logs)
if global_step >= max_train_steps:
break
# Create the pipeline using the trained modules and save it.
params__ = [i for i in text_encoder.get_input_embeddings().parameters()]
print("******** TRAINING PROCESS FINISHED ********")
print("Updated weights:")
print(params__, params__[0].shape)
pipeline = StableDiffusionPipeline.from_pretrained(
pretrained_model_name_or_path,
# text_encoder=accelerator.unwrap_model(text_encoder),
text_encoder=text_encoder,
tokenizer=tokenizer,
vae=vae,
unet=unet,
)
pipeline.save_pretrained(output_dir)
# Also save the newly trained embeddings
save_path = os.path.join(output_dir, f"learned_embeds.bin")
save_progress(text_encoder, placeholder_token_id, save_path)
training_function()
for param in itertools.chain(unet.parameters(), text_encoder.parameters()):
if param.grad is not None:
del param.grad # free some memory
torch.cuda.empty_cache()
# Set up the pipeline
from diffusers import DPMSolverMultistepScheduler
pipe = StableDiffusionPipeline.from_pretrained(
hyperparameters["output_dir"],
scheduler=DPMSolverMultistepScheduler.from_pretrained(
hyperparameters["output_dir"], subfolder="scheduler"
),
)
if not args.use_torchdynamo:
pipe.to(args.device)
# Run the Stable Diffusion pipeline
# Don't forget to use the placeholder token in your prompt
all_images = []
for _ in range(args.num_inference_samples):
images = pipe(
[args.prompt],
num_inference_steps=args.inference_steps,
guidance_scale=7.5,
).images
all_images.extend(images)
output_path = os.path.abspath(os.path.join(os.getcwd(), args.output_dir))
if not os.path.isdir(args.output_dir):
os.mkdir(args.output_dir)
[
image.save(f"{args.output_dir}/{i}.jpeg")
for i, image in enumerate(all_images)
]

View File

@@ -19,10 +19,14 @@ import sys
import subprocess
def run_cmd(cmd):
def run_cmd(cmd, debug=False):
"""
Inputs: cli command string.
"""
if debug:
print("IREE run command: \n\n")
print(cmd)
print("\n\n")
try:
result = subprocess.run(
cmd,
@@ -31,8 +35,9 @@ def run_cmd(cmd):
stderr=subprocess.PIPE,
check=True,
)
result_str = result.stdout.decode()
return result_str
stdout = result.stdout.decode()
stderr = result.stderr.decode()
return stdout, stderr
except subprocess.CalledProcessError as e:
print(e.output)
sys.exit(f"Exiting program due to error running {cmd}")

View File

@@ -90,6 +90,7 @@ def build_benchmark_args(
benchmark_cl.append(f"--task_topology_max_group_count={num_cpus}")
# if time_extractor:
# benchmark_cl.append(time_extractor)
benchmark_cl.append(f"--print_statistics=true")
return benchmark_cl
@@ -129,7 +130,8 @@ def build_benchmark_args_non_tensor_input(
def run_benchmark_module(benchmark_cl):
"""
Run benchmark command, extract result and return iteration/seconds.
Run benchmark command, extract result and return iteration/seconds, host
peak memory, and device peak memory.
# TODO: Add an example of the benchmark command.
Input: benchmark command.
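As a rough illustration of such a benchmark command (the module path, entry function, and input shape below are hypothetical; the actual command line is assembled by `build_benchmark_args`):
```shell
# Hypothetical invocation; flag names may differ between IREE releases.
benchmark_module --module=model.vmfb --function=forward --input=1x3x224x224xf32 --print_statistics=true
```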
@@ -138,10 +140,22 @@ def run_benchmark_module(benchmark_cl):
assert os.path.exists(
benchmark_path
), "Cannot find benchmark_module, Please contact SHARK maintainer on discord."
bench_result = run_cmd(" ".join(benchmark_cl))
print(bench_result)
regex_split = re.compile("(\d+[.]*\d*)( *)([a-zA-Z]+)")
match = regex_split.search(bench_result)
time = float(match.group(1))
unit = match.group(3)
return 1.0 / (time * 0.001)
bench_stdout, bench_stderr = run_cmd(" ".join(benchmark_cl))
try:
regex_split = re.compile("(\d+[.]*\d*)( *)([a-zA-Z]+)")
match = regex_split.search(bench_stdout)
time_ms = float(match.group(1))
unit = match.group(3)
except AttributeError:
regex_split = re.compile("(\d+[.]*\d*)([a-zA-Z]+)")
match = regex_split.search(bench_stdout)
time_ms = float(match.group(1))
unit = match.group(2)
iter_per_second = 1.0 / (time_ms * 0.001)
# Extract peak memory.
host_regex = re.compile(r".*HOST_LOCAL:\s*([0-9]+)B peak")
host_peak_b = int(host_regex.search(bench_stderr).group(1))
device_regex = re.compile(r".*DEVICE_LOCAL:\s*([0-9]+)B peak")
device_peak_b = int(device_regex.search(bench_stderr).group(1))
return iter_per_second, host_peak_b, device_peak_b

View File

@@ -52,11 +52,11 @@ def get_iree_device_args(device, extra_args=[]):
# Get the iree-compiler arguments given frontend.
def get_iree_frontend_args(frontend):
if frontend in ["torch", "pytorch", "linalg"]:
return ["--iree-llvm-target-cpu-features=host"]
if frontend in ["torch", "pytorch", "linalg", "tm_tensor"]:
return ["--iree-llvmcpu-target-cpu-features=host"]
elif frontend in ["tensorflow", "tf", "mhlo"]:
return [
"--iree-llvm-target-cpu-features=host",
"--iree-llvmcpu-target-cpu-features=host",
"--iree-mhlo-demote-i64-to-i32=false",
"--iree-flow-demote-i64-to-i32",
]
@@ -70,7 +70,6 @@ def get_iree_common_args():
return [
"--iree-stream-resource-index-bits=64",
"--iree-vm-target-index-bits=64",
"--iree-vm-bytecode-module-strip-source-map=true",
"--iree-util-zero-fill-elided-attrs",
]
@@ -189,21 +188,23 @@ def compile_benchmark_dirs(bench_dir, device, dispatch_benchmarks):
benchmark_bash.write(" ".join(benchmark_cl))
benchmark_bash.close()
benchmark_data = run_benchmark_module(benchmark_cl)
iter_per_second, _, _ = run_benchmark_module(
benchmark_cl
)
benchmark_file = open(
f"{bench_dir}/{d_}/{d_}_data.txt", "w+"
)
benchmark_file.write(f"DISPATCH: {d_}\n")
benchmark_file.write(str(benchmark_data) + "\n")
benchmark_file.write(str(iter_per_second) + "\n")
benchmark_file.write(
"SHARK BENCHMARK RESULT: "
+ str(1 / (benchmark_data * 0.001))
+ str(1 / (iter_per_second * 0.001))
+ "\n"
)
benchmark_file.close()
benchmark_runtimes[d_] = 1 / (benchmark_data * 0.001)
benchmark_runtimes[d_] = 1 / (iter_per_second * 0.001)
elif ".mlir" in f_ and "benchmark" not in f_:
dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
@@ -294,7 +295,8 @@ def get_iree_module(flatbuffer_blob, device, device_idx=None):
haldriver = ireert.get_driver(device)
haldevice = haldriver.create_device(
haldriver.query_available_devices()[device_idx]["device_id"]
haldriver.query_available_devices()[device_idx]["device_id"],
allocators=shark_args.device_allocator,
)
config = ireert.Config(device=haldevice)
else:
@@ -403,5 +405,10 @@ def get_results(
def get_iree_runtime_config(device):
device = iree_device_map(device)
config = ireert.Config(device=ireert.get_device(device))
haldriver = ireert.get_driver(device)
haldevice = haldriver.create_device_by_uri(
device,
allocators=shark_args.device_allocator,
)
config = ireert.Config(device=haldevice)
return config

View File

@@ -44,4 +44,4 @@ def get_iree_cpu_args():
error_message = f"OS Type f{os_name} not supported and triple can't be determined, open issue to dSHARK team please :)"
raise Exception(error_message)
print(f"Target triple found:{target_triple}")
return [f"-iree-llvm-target-triple={target_triple}"]
return [f"--iree-llvmcpu-target-triple={target_triple}"]

View File

@@ -22,7 +22,7 @@ from shark.parser import shark_args
# Get the default gpu args given the architecture.
def get_iree_gpu_args():
ireert.flags.FUNCTION_INPUT_VALIDATION = False
ireert.flags.parse_flags("--cuda_allow_inline_execution", "--device_allocator=caching")
ireert.flags.parse_flags("--cuda_allow_inline_execution")
# TODO: Give the user_interface to pass the sm_arch.
sm_arch = get_cuda_sm_cc()
if (
@@ -30,11 +30,10 @@ def get_iree_gpu_args():
in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86", "sm_89"]
) and (shark_args.enable_tf32 == True):
return [
"--iree-hal-cuda-disable-loop-nounroll-wa",
f"--iree-hal-cuda-llvm-target-arch={sm_arch}",
]
else:
return ["--iree-hal-cuda-disable-loop-nounroll-wa"]
return []
# Get the default gpu args given the architecture.

View File

@@ -131,6 +131,8 @@ def get_vendor(triple):
return "ARM"
if arch == "m1":
return "Apple"
if arch in ["arc", "UHD"]:
return "Intel"
if arch in ["turing", "ampere"]:
return "NVIDIA"
if arch == "ardeno":
@@ -149,7 +151,7 @@ def get_device_type(triple):
return "Unknown"
if arch == "cpu":
return "CPU"
if arch in ["turing", "ampere"]:
if arch in ["turing", "ampere", "arc"]:
return "DiscreteGPU"
if arch in ["rdna1", "rdna2", "rdna3", "rgcn3", "rgcn5"]:
if product == "ivega10":
@@ -343,6 +345,37 @@ def get_vulkan_target_capabilities(triple):
cap["variablePointers"] = True
cap["variablePointersStorageBuffer"] = True
elif arch == "arc":
cap["maxComputeSharedMemorySize"] = 32768
cap["maxComputeWorkGroupInvocations"] = 1024
cap["maxComputeWorkGroupSize"] = [1024, 1024, 64]
cap["subgroupSize"] = 32
cap["subgroupFeatures"] = [
"Basic",
"Vote",
"Arithmetic",
"Ballot",
"Shuffle",
"ShuffleRelative",
"Clustered",
"Quad",
]
cap["shaderFloat16"] = True
cap["shaderFloat64"] = False
cap["shaderInt8"] = True
cap["shaderInt16"] = True
cap["shaderInt64"] = False
cap["storageBuffer16BitAccess"] = True
cap["storagePushConstant16"] = True
cap["uniformAndStorageBuffer16BitAccess"] = True
cap["storageBuffer8BitAccess"] = True
cap["storagePushConstant8"] = True
cap["uniformAndStorageBuffer8BitAccess"] = True
cap["variablePointers"] = True
cap["variablePointersStorageBuffer"] = True
elif arch == "cpu":
if product == "swiftshader":
cap["maxComputeSharedMemorySize"] = 16384

View File

@@ -22,7 +22,8 @@ from shark.iree_utils.vulkan_target_env_utils import get_vulkan_target_env_flag
def get_vulkan_device_name():
vulkaninfo_dump = run_cmd("vulkaninfo").split(linesep)
vulkaninfo_dump, _ = run_cmd("vulkaninfo")
vulkaninfo_dump = vulkaninfo_dump.split(linesep)
vulkaninfo_list = [s.strip() for s in vulkaninfo_dump if "deviceName" in s]
if len(vulkaninfo_list) == 0:
raise ValueError("No device name found in VulkanInfo!")
@@ -108,6 +109,9 @@ def get_vulkan_target_triple(device_name):
triple = f"rdna3-7900-{system_os}"
elif any(x in device_name for x in ("AMD", "Radeon")):
triple = f"rdna2-unknown-{system_os}"
# Intel Targets
elif any(x in device_name for x in ("A770", "A750")):
triple = f"arc-770-{system_os}"
else:
triple = None
return triple
@@ -139,8 +143,9 @@ def get_vulkan_triple_flag(device_name="", extra_args=[]):
def get_iree_vulkan_args(extra_args=[]):
res_vulkan_flag = ["--device_allocator=caching"]
# res_vulkan_flag = ["--iree-flow-demote-i64-to-i32"]
res_vulkan_flag = []
vulkan_triple_flag = None
for arg in extra_args:
if "-iree-vulkan-target-triple=" in arg:

View File

@@ -108,4 +108,14 @@ parser.add_argument(
help="Enables the --iree-flow-enable-conv-winograd-transform flag.",
)
parser.add_argument(
"--device_allocator",
type=str,
nargs="*",
default=[],
help="Specifies one or more HAL device allocator specs "
"to augment the base device allocator",
choices=["debug", "caching"],
)
shark_args, unknown = parser.parse_known_args()
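For example, the caching allocator could be enabled on any SHARK entry point that parses `shark_args` (the script name below is illustrative):
```shell
python run_model.py --device_allocator caching
```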

View File

@@ -21,9 +21,17 @@ from shark.iree_utils.benchmark_utils import (
from shark.parser import shark_args
from datetime import datetime
import time
from typing import Optional
import csv
import os
TF_CPU_DEVICE = "/CPU:0"
TF_GPU_DEVICE = "/GPU:0"
def _bytes_to_mb_str(bytes_: Optional[int]) -> str:
return "" if bytes_ is None else f"{bytes_ / 1e6:.6f}"
class OnnxFusionOptions(object):
def __init__(self):
@@ -70,6 +78,7 @@ class SharkBenchmarkRunner(SharkRunner):
self.vmfb_file = None
self.mlir_dialect = mlir_dialect
self.extra_args = extra_args
self.import_args = {}
SharkRunner.__init__(
self,
mlir_module,
@@ -104,39 +113,56 @@ class SharkBenchmarkRunner(SharkRunner):
def benchmark_torch(self, modelname):
import torch
import torch._dynamo as dynamo
from tank.model_utils import get_torch_model
if self.device == "cuda":
torch.set_default_tensor_type(torch.cuda.FloatTensor)
if self.enable_tf32:
torch.backends.cuda.matmul.allow_tf32 = True
print(
"Currently disabled TensorFloat32 calculations in pytorch benchmarks."
)
# torch.backends.cuda.matmul.allow_tf32 = True
else:
torch.set_default_tensor_type(torch.FloatTensor)
torch_device = torch.device(
"cuda:0" if self.device == "cuda" else "cpu"
)
HFmodel, input = get_torch_model(modelname)[:2]
HFmodel, input = get_torch_model(modelname, self.import_args)[:2]
frontend_model = HFmodel.model
# frontend_model = dynamo.optimize("inductor")(frontend_model)
frontend_model.to(torch_device)
input.to(torch_device)
# TODO: re-enable as soon as pytorch CUDA context issues are resolved
# try:
# frontend_model = torch.compile(
# frontend_model, mode="max-autotune", backend="inductor"
# )
# except RuntimeError:
# frontend_model = HFmodel.model
for i in range(shark_args.num_warmup_iterations):
frontend_model.forward(input)
if self.device == "cuda":
torch.cuda.reset_peak_memory_stats()
begin = time.time()
for i in range(shark_args.num_iterations):
out = frontend_model.forward(input)
if i == shark_args.num_iterations - 1:
end = time.time()
break
end = time.time()
if self.device == "cuda":
stats = torch.cuda.memory_stats()
device_peak_b = stats["allocated_bytes.all.peak"]
else:
device_peak_b = None
print(
f"Torch benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
return [
f"{shark_args.num_iterations/(end-begin)}",
f"{((end-begin)/shark_args.num_iterations)*1000}",
"", # host_peak_b (CPU usage) is not reported by PyTorch.
_bytes_to_mb_str(device_peak_b),
]
def benchmark_tf(self, modelname):
@@ -154,38 +180,55 @@ class SharkBenchmarkRunner(SharkRunner):
from tank.model_utils_tf import get_tf_model
# tf_device = "/GPU:0" if self.device == "cuda" else "/CPU:0"
tf_device = "/CPU:0"
# tf_device = TF_GPU_DEVICE if self.device == "cuda" else TF_CPU_DEVICE
tf_device = TF_CPU_DEVICE
with tf.device(tf_device):
(
model,
input,
) = get_tf_model(
modelname
modelname, self.import_args
)[:2]
frontend_model = model
for i in range(shark_args.num_warmup_iterations):
frontend_model.forward(*input)
if tf_device == TF_GPU_DEVICE:
tf.config.experimental.reset_memory_stats(tf_device)
begin = time.time()
for i in range(shark_args.num_iterations):
out = frontend_model.forward(*input)
if i == shark_args.num_iterations - 1:
end = time.time()
break
end = time.time()
if tf_device == TF_GPU_DEVICE:
memory_info = tf.config.experimental.get_memory_info(tf_device)
device_peak_b = memory_info["peak"]
else:
# tf.config.experimental does not currently support measuring
# CPU memory usage.
device_peak_b = None
print(
f"TF benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
return [
f"{shark_args.num_iterations/(end-begin)}",
f"{((end-begin)/shark_args.num_iterations)*1000}",
"", # host_peak_b (CPU usage) is not reported by TensorFlow.
_bytes_to_mb_str(device_peak_b),
]
def benchmark_c(self):
result = run_benchmark_module(self.benchmark_cl)
print(f"Shark-IREE-C benchmark:{result} iter/second")
return [f"{result}", f"{1000/result}"]
iter_per_second, host_peak_b, device_peak_b = run_benchmark_module(
self.benchmark_cl
)
print(f"Shark-IREE-C benchmark:{iter_per_second} iter/second")
return [
f"{iter_per_second}",
f"{1000/iter_per_second}",
_bytes_to_mb_str(host_peak_b),
_bytes_to_mb_str(device_peak_b),
]
def benchmark_python(self, inputs):
input_list = [x for x in inputs]
@@ -195,8 +238,7 @@ class SharkBenchmarkRunner(SharkRunner):
begin = time.time()
for i in range(shark_args.num_iterations):
out = self.run("forward", input_list)
if i == shark_args.num_iterations - 1:
end = time.time()
end = time.time()
print(
f"Shark-IREE Python benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
)
@@ -305,11 +347,19 @@ for currently supported models. Exiting benchmark ONNX."
return comp_str
def benchmark_all_csv(
self, inputs: tuple, modelname, dynamic, device_str, frontend
self,
inputs: tuple,
modelname,
dynamic,
device_str,
frontend,
import_args,
):
self.setup_cl(inputs)
self.import_args = import_args
field_names = [
"model",
"batch_size",
"engine",
"dialect",
"device",
@@ -323,7 +373,12 @@ for currently supported models. Exiting benchmark ONNX."
"tags",
"notes",
"datetime",
"host_memory_mb",
"device_memory_mb",
"measured_host_memory_mb",
"measured_device_memory_mb",
]
# "frontend" must be the first element.
engines = ["frontend", "shark_python", "shark_iree_c"]
if shark_args.onnx_bench == True:
engines.append("onnxruntime")
@@ -335,75 +390,77 @@ for currently supported models. Exiting benchmark ONNX."
with open("bench_results.csv", mode="a", newline="") as f:
writer = csv.DictWriter(f, fieldnames=field_names)
bench_result = {}
bench_result["model"] = modelname
bench_info = {}
bench_info["model"] = modelname
bench_info["batch_size"] = str(import_args["batch_size"])
bench_info["dialect"] = self.mlir_dialect
bench_info["iterations"] = shark_args.num_iterations
if dynamic == True:
bench_result["shape_type"] = "dynamic"
bench_info["shape_type"] = "dynamic"
else:
bench_result["shape_type"] = "static"
bench_result["device"] = device_str
bench_info["shape_type"] = "static"
bench_info["device"] = device_str
if "fp16" in modelname:
bench_result["data_type"] = "float16"
bench_info["data_type"] = "float16"
else:
bench_result["data_type"] = inputs[0].dtype
bench_info["data_type"] = inputs[0].dtype
for e in engines:
(
bench_result["param_count"],
bench_result["tags"],
bench_result["notes"],
) = ["", "", ""]
engine_result = {}
if e == "frontend":
bench_result["engine"] = frontend
engine_result["engine"] = frontend
if check_requirements(frontend):
(
bench_result["iter/sec"],
bench_result["ms/iter"],
engine_result["iter/sec"],
engine_result["ms/iter"],
engine_result["host_memory_mb"],
engine_result["device_memory_mb"],
) = self.benchmark_frontend(modelname)
self.frontend_result = bench_result["ms/iter"]
bench_result["vs. PyTorch/TF"] = "baseline"
self.frontend_result = engine_result["ms/iter"]
engine_result["vs. PyTorch/TF"] = "baseline"
(
bench_result["param_count"],
bench_result["tags"],
bench_result["notes"],
engine_result["param_count"],
engine_result["tags"],
engine_result["notes"],
) = self.get_metadata(modelname)
else:
self.frontend_result = None
continue
elif e == "shark_python":
bench_result["engine"] = "shark_python"
engine_result["engine"] = "shark_python"
(
bench_result["iter/sec"],
bench_result["ms/iter"],
engine_result["iter/sec"],
engine_result["ms/iter"],
) = self.benchmark_python(inputs)
bench_result[
engine_result[
"vs. PyTorch/TF"
] = self.compare_bench_results(
self.frontend_result, bench_result["ms/iter"]
self.frontend_result, engine_result["ms/iter"]
)
elif e == "shark_iree_c":
bench_result["engine"] = "shark_iree_c"
engine_result["engine"] = "shark_iree_c"
(
bench_result["iter/sec"],
bench_result["ms/iter"],
engine_result["iter/sec"],
engine_result["ms/iter"],
engine_result["host_memory_mb"],
engine_result["device_memory_mb"],
) = self.benchmark_c()
bench_result[
engine_result[
"vs. PyTorch/TF"
] = self.compare_bench_results(
self.frontend_result, bench_result["ms/iter"]
self.frontend_result, engine_result["ms/iter"]
)
elif e == "onnxruntime":
bench_result["engine"] = "onnxruntime"
engine_result["engine"] = "onnxruntime"
(
bench_result["iter/sec"],
bench_result["ms/iter"],
engine_result["iter/sec"],
engine_result["ms/iter"],
) = self.benchmark_onnx(modelname, inputs)
bench_result["dialect"] = self.mlir_dialect
bench_result["iterations"] = shark_args.num_iterations
bench_result["datetime"] = str(datetime.now())
writer.writerow(bench_result)
engine_result["datetime"] = str(datetime.now())
writer.writerow(bench_info | engine_result)

View File

@@ -99,6 +99,7 @@ else:
print(
f"shark_tank local cache is located at {WORKDIR} . You may change this by setting the --local_tank_cache= flag"
)
os.makedirs(WORKDIR, exist_ok=True)
# Checks whether the directory and files exists.
@@ -138,21 +139,35 @@ def download_model(
tank_url="gs://shark_tank/latest",
frontend=None,
tuned=None,
import_args={"batch_size": "1"},
):
model_name = model_name.replace("/", "_")
dyn_str = "_dynamic" if dynamic else ""
os.makedirs(WORKDIR, exist_ok=True)
model_dir_name = model_name + "_" + frontend
if import_args["batch_size"] != 1:
model_dir_name = (
model_name
+ "_"
+ frontend
+ "_BS"
+ str(import_args["batch_size"])
)
else:
model_dir_name = model_name + "_" + frontend
model_dir = os.path.join(WORKDIR, model_dir_name)
full_gs_url = tank_url.rstrip("/") + "/" + model_dir_name
if not check_dir_exists(
model_dir_name, frontend=frontend, dynamic=dyn_str
):
print(f"Downloading artifacts for model {model_name}...")
print(
f"Force-updating artifacts for model {model_name} from: {full_gs_url}"
)
download_public_file(full_gs_url, model_dir)
elif shark_args.force_update_tank == True:
print(f"Force-updating artifacts for model {model_name}...")
print(
f"Force-updating artifacts for model {model_name} from: {full_gs_url}"
)
download_public_file(full_gs_url, model_dir)
else:
if not _internet_connected():
@@ -189,9 +204,17 @@ def download_model(
suffix = f"{dyn_str}_{frontend}{tuned_str}.mlir"
filename = os.path.join(model_dir, model_name + suffix)
if not os.path.exists(filename):
from tank.generate_sharktank import gen_shark_files
print(
"The model data was not found. Trying to generate artifacts locally."
)
gen_shark_files(model_name, frontend, WORKDIR, import_args)
assert os.path.exists(filename), f"MLIR not found at {filename}"
with open(filename, mode="rb") as f:
mlir_file = f.read()
function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
inputs = np.load(os.path.join(model_dir, "inputs.npz"))
golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))

View File

@@ -4,6 +4,17 @@
import sys
import tempfile
import os
import hashlib
def create_hash(file_name):
with open(file_name, "rb") as f:
file_hash = hashlib.blake2b()
while chunk := f.read(2**20):
file_hash.update(chunk)
return file_hash.hexdigest()
# List of the supported frontends.
supported_frontends = {
@@ -140,6 +151,7 @@ class SharkImporter:
outputs_name = "golden_out.npz"
func_file_name = "function_name"
model_name_mlir = model_name + "_" + self.frontend + ".mlir"
print(f"saving {model_name_mlir} to {dir}")
try:
inputs = [x.cpu().detach() for x in inputs]
except AttributeError:
@@ -150,11 +162,11 @@ class SharkImporter:
np.savez(os.path.join(dir, inputs_name), *inputs)
np.savez(os.path.join(dir, outputs_name), *outputs)
np.save(os.path.join(dir, func_file_name), np.array(func_name))
if self.frontend == "torch":
with open(os.path.join(dir, model_name_mlir), "wb") as mlir_file:
mlir_file.write(mlir_data)
mlir_hash = create_hash(os.path.join(dir, model_name_mlir))
np.save(os.path.join(dir, "hash"), np.array(mlir_hash))
return
def import_debug(
@@ -285,6 +297,7 @@ def transform_fx(fx_g):
if node.target in [
torch.ops.aten.arange,
torch.ops.aten.empty,
torch.ops.aten.zeros,
]:
node.kwargs = kwargs_dict
# Inputs and outputs of aten.var.mean should be upcasted to fp32.
@@ -377,7 +390,10 @@ def import_with_fx(
golden_values = None
if debug:
golden_values = model(*inputs)
try:
golden_values = model(*inputs)
except:
golden_values = None
# TODO: Control the decompositions.
fx_g = make_fx(
model,

View File

@@ -1,28 +1,29 @@
resnet50,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
albert-base-v2,mhlo,tf,1e-2,1e-2,default,None,False,False,False,"",""
roberta-base,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
bert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
camembert-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
roberta-base,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,True,True,True,"","macos"
bert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"","enabled_windows"
camembert-base,mhlo,tf,1e-2,1e-3,default,None,True,True,True,"",""
dbmdz/convbert-base-turkish-cased,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,True,True,False,"https://github.com/iree-org/iree/issues/9971",""
distilbert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,True,True,False,"https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342",""
facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,True,True,False,"https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342","macos"
funnel-transformer/small,mhlo,tf,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/201",""
google/electra-small-discriminator,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"Fails during iree-compile",""
google/vit-base-patch16-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,False,False,False,"",""
microsoft/MiniLM-L12-H384-uncased,mhlo,tf,1e-2,1e-3,tf_hf,None,True,False,False,"Fails during iree-compile.",""
microsoft/layoutlm-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
microsoft/mpnet-base,mhlo,tf,1e-2,1e-2,default,None,False,False,False,"",""
microsoft/mpnet-base,mhlo,tf,1e-2,1e-2,default,None,True,True,True,"",""
albert-base-v2,linalg,torch,1e-2,1e-3,default,None,True,True,True,"issue with aten.tanh in torch-mlir",""
alexnet,linalg,torch,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/879",""
bert-base-cased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
bert-base-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
bert-base-uncased_fp16,linalg,torch,1e-1,1e-1,default,None,True,False,True,"",""
bert-large-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
bert-large-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"Fails during iree-compile.",""
google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/311",""
microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390",""
microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390","macos"
microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
microsoft/resnet-50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc/img2col,False,False,False,"","macos"
google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"https://github.com/nod-ai/SHARK/issues/344",""
mobilenet_v3_small,linalg,torch,1e-1,1e-2,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/388","macos"
nvidia/mit-b0,linalg,torch,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/343","macos"
@@ -33,4 +34,13 @@ resnet50_fp16,linalg,torch,1e-2,1e-2,default,nhcw-nhwc/img2col,True,False,True,"
squeezenet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
wide_resnet50_2,linalg,torch,1e-2,1e-3,default,nhcw-nhwc/img2col,False,False,False,"","macos"
efficientnet-v2-s,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,True,True,True,"","macos"
efficientnet_b0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,True,True,False,"https://github.com/nod-ai/SHARK/issues/1243",""
efficientnet_b7,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,True,False,False,"Torchvision imports issue",""
efficientnet_b0,mhlo,tf,1e-2,1e-3,default,None,nhcw-nhwc,False,False,False,"",""
efficientnet_b7,mhlo,tf,1e-2,1e-3,default,None,nhcw-nhwc,False,False,False,"",""
gpt2,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"",""
t5-base,linalg,torch,1e-2,1e-3,default,None,True,True,True,"Inputs for seq2seq models in torch currently unsupported.",""
t5-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
t5-large,linalg,torch,1e-2,1e-3,default,None,True,True,True,"Inputs for seq2seq models in torch currently unsupported",""
t5-large,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""

View File

@@ -63,14 +63,14 @@ if __name__ == "__main__":
# Compile the model using IREE
backend = "dylib-llvm-aot"
args = [
"--iree-llvm-target-cpu-features=host",
"--iree-llvmcpu-target-cpu-features=host",
"--iree-mhlo-demote-i64-to-i32=false",
"--iree-flow-demote-i64-to-i32",
]
backend_config = "dylib"
# backend = "cuda"
# backend_config = "cuda"
# args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"]
# args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-enable-fusion-with-reduction-ops"]
flatbuffer_blob = compile_str(
compiler_module,
target_backends=[backend],

View File

@@ -136,7 +136,7 @@ if __name__ == "__main__":
backend = "dylib-llvm-aot"
if backend == "dylib-llvm-aot":
args = [
"--iree-llvm-target-cpu-features=host",
"--iree-llvmcpu-target-cpu-features=host",
"--iree-mhlo-demote-i64-to-i32=false",
"--iree-flow-demote-i64-to-i32",
]
@@ -146,7 +146,6 @@ if __name__ == "__main__":
backend_config = "cuda"
args = [
"--iree-cuda-llvm-target-arch=sm_80",
"--iree-hal-cuda-disable-loop-nounroll-wa",
"--iree-enable-fusion-with-reduction-ops",
]

View File

@@ -83,7 +83,7 @@ if __name__ == "__main__":
# Compile the model using IREE
backend = "dylib-llvm-aot"
args = [
"--iree-llvm-target-cpu-features=host",
"--iree-llvmcpu-target-cpu-features=host",
"--iree-mhlo-demote-i64-to-i32=false",
"--iree-stream-resource-index-bits=64",
"--iree-vm-target-index-bits=64",
@@ -91,7 +91,7 @@ if __name__ == "__main__":
backend_config = "dylib"
# backend = "cuda"
# backend_config = "cuda"
# args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"]
# args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-enable-fusion-with-reduction-ops"]
flatbuffer_blob = compile_str(
compiler_module,
target_backends=[backend],

View File

@@ -79,14 +79,14 @@ if __name__ == "__main__":
# Compile the model using IREE
backend = "dylib-llvm-aot"
args = [
"--iree-llvm-target-cpu-features=host",
"--iree-llvmcpu-target-cpu-features=host",
"--iree-mhlo-demote-i64-to-i32=false",
"--iree-flow-demote-i64-to-i32",
]
backend_config = "dylib"
# backend = "cuda"
# backend_config = "cuda"
# args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"]
# args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-enable-fusion-with-reduction-ops"]
flatbuffer_blob = compile_str(
compiler_module,
target_backends=[backend],

View File

@@ -33,9 +33,10 @@ def create_hash(file_name):
return file_hash.hexdigest()
def save_torch_model(torch_model_list):
def save_torch_model(torch_model_list, local_tank_cache, import_args):
from tank.model_utils import (
get_hf_model,
get_hf_seq2seq_model,
get_vision_model,
get_hf_img_cls_model,
get_fp16_model,
@@ -52,14 +53,13 @@ def save_torch_model(torch_model_list):
tracing_required = False if tracing_required == "False" else True
is_dynamic = False if is_dynamic == "False" else True
print("generating artifacts for: " + torch_model_name)
model = None
input = None
if model_type == "stable_diffusion":
args.use_tuned = False
args.import_mlir = True
args.use_tuned = False
args.local_tank_cache = WORKDIR
args.local_tank_cache = local_tank_cache
precision_values = ["fp16"]
seq_lengths = [64, 77]
@@ -74,24 +74,41 @@ def save_torch_model(torch_model_list):
width=512,
height=512,
use_base_vae=False,
custom_vae="",
debug=True,
sharktank_dir=WORKDIR,
sharktank_dir=local_tank_cache,
generate_vmfb=False,
)
model()
continue
if model_type == "vision":
model, input, _ = get_vision_model(torch_model_name)
model, input, _ = get_vision_model(
torch_model_name, import_args
)
elif model_type == "hf":
model, input, _ = get_hf_model(torch_model_name)
model, input, _ = get_hf_model(torch_model_name, import_args)
elif model_type == "hf_seq2seq":
model, input, _ = get_hf_seq2seq_model(
torch_model_name, import_args
)
elif model_type == "hf_img_cls":
model, input, _ = get_hf_img_cls_model(torch_model_name)
model, input, _ = get_hf_img_cls_model(
torch_model_name, import_args
)
elif model_type == "fp16":
model, input, _ = get_fp16_model(torch_model_name)
model, input, _ = get_fp16_model(torch_model_name, import_args)
torch_model_name = torch_model_name.replace("/", "_")
torch_model_dir = os.path.join(
WORKDIR, str(torch_model_name) + "_torch"
)
if import_args["batch_size"] != 1:
torch_model_dir = os.path.join(
local_tank_cache,
str(torch_model_name)
+ "_torch"
+ f"_BS{str(import_args['batch_size'])}",
)
else:
torch_model_dir = os.path.join(
local_tank_cache, str(torch_model_name) + "_torch"
)
os.makedirs(torch_model_dir, exist_ok=True)
mlir_importer = SharkImporter(
@@ -105,12 +122,6 @@ def save_torch_model(torch_model_list):
dir=torch_model_dir,
model_name=torch_model_name,
)
mlir_hash = create_hash(
os.path.join(
torch_model_dir, torch_model_name + "_torch" + ".mlir"
)
)
np.save(os.path.join(torch_model_dir, "hash"), np.array(mlir_hash))
# Generate torch dynamic models.
if is_dynamic:
mlir_importer.import_debug(
@@ -121,12 +132,14 @@ def save_torch_model(torch_model_list):
)
def save_tf_model(tf_model_list):
def save_tf_model(tf_model_list, local_tank_cache, import_args):
from tank.model_utils_tf import (
get_causal_image_model,
get_masked_lm_model,
get_causal_lm_model,
get_keras_model,
get_TFhf_model,
get_tfhf_seq2seq_model,
)
import tensorflow as tf
@@ -151,34 +164,52 @@ def save_tf_model(tf_model_list):
input = None
print(f"Generating artifacts for model {tf_model_name}")
if model_type == "hf":
model, input, _ = get_causal_lm_model(tf_model_name)
if model_type == "img":
model, input, _ = get_causal_image_model(tf_model_name)
if model_type == "keras":
model, input, _ = get_keras_model(tf_model_name)
if model_type == "TFhf":
model, input, _ = get_TFhf_model(tf_model_name)
model, input, _ = get_masked_lm_model(
tf_model_name, import_args
)
elif model_type == "img":
model, input, _ = get_causal_image_model(
tf_model_name, import_args
)
elif model_type == "keras":
model, input, _ = get_keras_model(tf_model_name, import_args)
elif model_type == "TFhf":
model, input, _ = get_TFhf_model(tf_model_name, import_args)
elif model_type == "tfhf_seq2seq":
model, input, _ = get_tfhf_seq2seq_model(
tf_model_name, import_args
)
elif model_type == "hf_causallm":
model, input, _ = get_causal_lm_model(
tf_model_name, import_args
)
tf_model_name = tf_model_name.replace("/", "_")
tf_model_dir = os.path.join(WORKDIR, str(tf_model_name) + "_tf")
if import_args["batch_size"] != 1:
tf_model_dir = os.path.join(
local_tank_cache,
str(tf_model_name)
+ "_tf"
+ f"_BS{str(import_args['batch_size'])}",
)
else:
tf_model_dir = os.path.join(
local_tank_cache, str(tf_model_name) + "_tf"
)
os.makedirs(tf_model_dir, exist_ok=True)
mlir_importer = SharkImporter(
model,
input,
inputs=input,
frontend="tf",
)
mlir_importer.import_debug(
is_dynamic=False,
dir=tf_model_dir,
model_name=tf_model_name,
)
mlir_hash = create_hash(
os.path.join(tf_model_dir, tf_model_name + "_tf" + ".mlir")
)
np.save(os.path.join(tf_model_dir, "hash"), np.array(mlir_hash))
def save_tflite_model(tflite_model_list):
def save_tflite_model(tflite_model_list, local_tank_cache, import_args):
from shark.tflite_utils import TFLitePreprocessor
with open(tflite_model_list) as csvfile:
@@ -190,18 +221,18 @@ def save_tflite_model(tflite_model_list):
print("tflite_model_name", tflite_model_name)
print("tflite_model_link", tflite_model_link)
tflite_model_name_dir = os.path.join(
WORKDIR, str(tflite_model_name) + "_tflite"
local_tank_cache, str(tflite_model_name) + "_tflite"
)
os.makedirs(tflite_model_name_dir, exist_ok=True)
print(f"TMP_TFLITE_MODELNAME_DIR = {tflite_model_name_dir}")
# Preprocess to get SharkImporter input args
# Preprocess to get SharkImporter input import_args
tflite_preprocessor = TFLitePreprocessor(str(tflite_model_name))
raw_model_file_path = tflite_preprocessor.get_raw_model_file()
inputs = tflite_preprocessor.get_inputs()
tflite_interpreter = tflite_preprocessor.get_interpreter()
# Use SharkImporter to get SharkInference input args
# Use SharkImporter to get SharkInference input import_args
my_shark_importer = SharkImporter(
module=tflite_interpreter,
inputs=inputs,
@@ -225,6 +256,71 @@ def save_tflite_model(tflite_model_list):
)
def check_requirements(frontend):
import importlib
has_pkgs = False
if frontend == "torch":
tv_spec = importlib.util.find_spec("torchvision")
has_pkgs = tv_spec is not None
elif frontend in ["tensorflow", "tf"]:
tf_spec = importlib.util.find_spec("tensorflow")
has_pkgs = tf_spec is not None
return has_pkgs
class NoImportException(Exception):
"Raised when requirements are not met for OTF model artifact generation."
pass
def gen_shark_files(modelname, frontend, tank_dir, importer_args):
# If a model's artifacts are requested by shark_downloader but they don't exist in the cloud, we call this function to generate the artifacts on-the-fly.
# TODO: Add TFlite support.
import tempfile
import_args = importer_args
if check_requirements(frontend):
torch_model_csv = os.path.join(
os.path.dirname(__file__), "torch_model_list.csv"
)
tf_model_csv = os.path.join(
os.path.dirname(__file__), "tf_model_list.csv"
)
custom_model_csv = tempfile.NamedTemporaryFile(
dir=os.path.dirname(__file__),
delete=True,
)
# Create a temporary .csv with only the desired entry.
if frontend == "tf":
with open(tf_model_csv, mode="r") as src:
reader = csv.reader(src)
for row in reader:
if row[0] == modelname:
target = row
with open(custom_model_csv.name, mode="w") as trg:
writer = csv.writer(trg)
writer.writerow(["modelname", "src"])
writer.writerow(target)
save_tf_model(custom_model_csv.name, tank_dir, import_args)
elif frontend == "torch":
with open(torch_model_csv, mode="r") as src:
reader = csv.reader(src)
for row in reader:
if row[0] == modelname:
target = row
with open(custom_model_csv.name, mode="w") as trg:
writer = csv.writer(trg)
writer.writerow(["modelname", "src"])
writer.writerow(target)
save_torch_model(custom_model_csv.name, tank_dir, import_args)
else:
raise NoImportException
# Validates whether the file is present or not.
def is_valid_file(arg):
if not os.path.exists(arg):
@@ -234,7 +330,7 @@ def is_valid_file(arg):
if __name__ == "__main__":
# Note, all of these flags are overridden by the import of args from stable_args.py, flags are duplicated temporarily to preserve functionality
# Note, all of these flags are overridden by the import of import_args from stable_args.py, flags are duplicated temporarily to preserve functionality
# parser = argparse.ArgumentParser()
# parser.add_argument(
# "--torch_model_csv",
@@ -262,20 +358,26 @@ if __name__ == "__main__":
# )
# parser.add_argument("--upload", type=bool, default=False)
# old_args = parser.parse_args()
# old_import_args = parser.parse_import_args()
import_args = {
"batch_size": "1",
}
print(import_args)
home = str(Path.home())
WORKDIR = os.path.join(os.path.dirname(__file__), "gen_shark_tank")
WORKDIR = os.path.join(os.path.dirname(__file__), "..", "gen_shark_tank")
torch_model_csv = os.path.join(
os.path.dirname(__file__), "tank", "torch_model_list.csv"
)
tf_model_csv = os.path.join(
os.path.dirname(__file__), "tank", "tf_model_list.csv"
os.path.dirname(__file__), "torch_model_list.csv"
)
tf_model_csv = os.path.join(os.path.dirname(__file__), "tf_model_list.csv")
tflite_model_csv = os.path.join(
os.path.dirname(__file__), "tank", "tflite", "tflite_model_list.csv"
os.path.dirname(__file__), "tflite", "tflite_model_list.csv"
)
save_torch_model(torch_model_csv)
save_tf_model(tf_model_csv)
save_tflite_model(tflite_model_csv)
save_torch_model(
os.path.join(os.path.dirname(__file__), "torch_sd_list.csv"),
WORKDIR,
import_args,
)
save_torch_model(torch_model_csv, WORKDIR, import_args)
save_tf_model(tf_model_csv, WORKDIR, import_args)
save_tflite_model(tflite_model_csv, WORKDIR, import_args)

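Taken together, the generate_sharktank.py changes thread a local_tank_cache directory and an import_args dict through save_torch_model, save_tf_model, and save_tflite_model, suffix each model's artifact directory with _BS<batch_size> when the batch size is not 1, and add gen_shark_files() so a single model's artifacts can be produced on the fly when the cloud tank has none. A hedged sketch of how a caller such as shark_downloader might use this; the module path, cache location, and model name are illustrative rather than taken from the diff:

import os
from tank.generate_sharktank import NoImportException, gen_shark_files  # assumed module path

tank_dir = os.path.expanduser("~/.local/shark_tank")  # hypothetical local cache
import_args = {"batch_size": 4}

# Mirror the naming used in save_torch_model: append _BS<batch_size> unless it is 1.
suffix = f"_BS{import_args['batch_size']}" if import_args["batch_size"] != 1 else ""
model_dir = os.path.join(tank_dir, "resnet50_torch" + suffix)

if not os.path.exists(model_dir):
    try:
        # Regenerates the .mlir, input, and golden-output artifacts for just this model.
        gen_shark_files("resnet50", "torch", tank_dir, import_args)
    except NoImportException:
        print("torchvision is not installed, so artifacts cannot be generated locally.")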

@@ -31,3 +31,12 @@ xlm-roberta-base,False,False,-,-,-
facebook/convnext-tiny-224,False,False,-,-,-
efficientnet-v2-s,False,False,22M,"image-classification,cnn","Includes MBConv and Fused-MBConv"
mnasnet1_0,False,True,-,"cnn, torchvision, mobile, architecture-search","Outperforms other mobile CNNs on Accuracy vs. Latency"
bert-large-uncased,True,True,330M,"nlp;bert-variant;transformer-encoder","24 layers, 1024 hidden units, 16 attention heads"
t5-base,True,False,220M,"nlp;transformer-encoder;transformer-decoder","Text-to-Text Transfer Transformer"
t5-large,True,False,770M,"nlp;transformer-encoder;transformer-decoder","Text-to-Text Transfer Transformer"
bert-large-uncased,True,hf,True,330M,"nlp;bert-variant;transformer-encoder","24 layers, 1024 hidden units, 16 attention heads"
efficientnet_b0,True,False,5.3M,"image-classification;cnn;conv2d;depthwise-conv","Smallest EfficientNet variant with 224x224 input"
efficientnet_b7,True,False,66M,"image-classification;cnn;conv2d;depthwise-conv","Largest EfficientNet variant with 600x600 input"
gpt2,True,False,110M,"nlp;transformer-decoder;auto-regressive","12 layers, 768 hidden units, 12 attention heads"
t5-base,True,False,220M,"nlp;transformer-encoder;transformer-decoder","Text-to-Text Transfer Transformer"
t5-large,True,False,770M,"nlp;transformer-encoder;transformer-decoder","Text-to-Text Transfer Transformer"
1 model_name use_tracing dynamic param_count tags notes
31 facebook/convnext-tiny-224 False False - - -
32 efficientnet-v2-s False False 22M image-classification,cnn Includes MBConv and Fused-MBConv
33 mnasnet1_0 False True - cnn, torchvision, mobile, architecture-search Outperforms other mobile CNNs on Accuracy vs. Latency
34 bert-large-uncased True True 330M nlp;bert-variant;transformer-encoder 24 layers, 1024 hidden units, 16 attention heads
35 t5-base True False 220M nlp;transformer-encoder;transformer-decoder Text-to-Text Transfer Transformer
36 t5-large True False 770M nlp;transformer-encoder;transformer-decoder Text-to-Text Transfer Transformer
37 bert-large-uncased True hf True 330M nlp;bert-variant;transformer-encoder
38 efficientnet_b0 True False 5.3M image-classification;cnn;conv2d;depthwise-conv Smallest EfficientNet variant with 224x224 input
39 efficientnet_b7 True False 66M image-classification;cnn;conv2d;depthwise-conv Largest EfficientNet variant with 600x600 input
40 gpt2 True False 110M nlp;transformer-decoder;auto-regressive 12 layers, 768 hidden units, 12 attention heads
41 t5-base True False 220M nlp;transformer-encoder;transformer-decoder Text-to-Text Transfer Transformer
42 t5-large True False 770M nlp;transformer-encoder;transformer-decoder Text-to-Text Transfer Transformer

Some files were not shown because too many files have changed in this diff.