generate sharktank for apps dir

also adds support for the sub-models
2026-04-20 03:00:34 -04:00 · 2023-02-08 21:32:50 +00:00
87 changed files with 959 additions and 7748 deletions
--- a/.flake8
+++ b/.flake8
@@ -1,5 +0,0 @@
-[flake8]
-count = 1
-show-source = 1
-select = E9,F63,F7,F82
-exclude = lit.cfg.py
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -14,7 +14,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.11"]
+        python-version: ["3.10"]

    steps:
    - uses: actions/checkout@v2
@@ -44,20 +44,18 @@ jobs:
        body: |
          Automatic snapshot release of nod.ai SHARK.
        draft: true
-        prerelease: true
+        prerelease: false

    - name: Build Package 
      shell: powershell
      run: |
        ./setup_venv.ps1
-        python process_skipfiles.py
        pyinstaller .\apps\stable_diffusion\shark_sd.spec
        mv ./dist/shark_sd.exe ./dist/shark_sd_${{ env.package_version_ }}.exe
-        signtool sign /f c:\g\shark_02152023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_${{ env.package_version_ }}.exe
+        signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_${{ env.package_version_ }}.exe
        pyinstaller .\apps\stable_diffusion\shark_sd_cli.spec
-        python process_skipfiles.py
        mv ./dist/shark_sd_cli.exe ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
-        signtool sign /f c:\g\shark_02152023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_cli_${{ env.package_version_ }}.exe
+        signtool sign /f C:\shark_2023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/shark_sd_cli_${{ env.package_version_ }}.exe

        
    # GHA windows VM OOMs so disable for now
@@ -67,9 +65,9 @@ jobs:
    #    $env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
    #    pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html

-    #- uses: actions/upload-artifact@v2
-    #  with:
-    #    path: dist/*
+    - uses: actions/upload-artifact@v2
+      with:
+        path: dist/*
    
    - name: Upload Release Assets
      id: upload-release-assets
@@ -79,7 +77,6 @@ jobs:
      with:
        release_id: ${{ steps.create_release.outputs.id }}
        assets_path: ./dist/*
-        #asset_content_type: application/vnd.microsoft.portable-executable 

    - name: Publish Release
      id: publish_release
@@ -95,7 +92,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.11"]
+        python-version: ["3.10"]
        backend: [IREE, SHARK]

    steps:
@@ -134,7 +131,7 @@ jobs:
        source iree.venv/bin/activate
        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
        SHARK_PACKAGE_VERSION=${package_version} \
-        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://openxla.github.io/iree/pip-release-links.html
+        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://iree-org.github.io/iree/pip-release-links.html
        # Install the built wheel
        pip install ./wheelhouse/nodai*
        # Validate the Models
--- a/.github/workflows/test-models.yml
+++ b/.github/workflows/test-models.yml
@@ -31,7 +31,7 @@ jobs:
      matrix:
        os: [7950x, icelake, a100, MacStudio, ubuntu-latest]
        suite: [cpu,cuda,vulkan]
-        python-version: ["3.11"]
+        python-version: ["3.10"]
        include:
          - os: ubuntu-latest
            suite: lint
@@ -99,12 +99,11 @@ jobs:
      run: |
        # black format check
        black --version
-        black --check .
+        black --line-length 79 --check .
        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --statistics
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude lit.cfg.py
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --isolated --count --exit-zero --max-complexity=10 --max-line-length=127 \
-          --statistics --exclude lit.cfg.py
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude lit.cfg.py

    - name: Validate Models on CPU
      if: matrix.suite == 'cpu'
@@ -112,7 +111,7 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k cpu 
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank -k cpu
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv

@@ -122,7 +121,7 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k cuda
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank -k cuda
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
        # Disabled due to black image bug
@@ -137,7 +136,7 @@ jobs:
        export DYLD_LIBRARY_PATH=/usr/local/lib/
        echo $PATH
        pip list | grep -E "torch|iree"
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" --tank_url="gs://shark_tank/nightly/" -k vulkan --update_tank
+        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" -k vulkan

    - name: Validate Vulkan Models (a100)
      if: matrix.suite == 'vulkan' && matrix.os == 'a100'
@@ -145,17 +144,19 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k vulkan
+        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank -k vulkan
        python build_tools/stable_diffusion_testing.py --device=vulkan

    - name: Validate Vulkan Models (Windows)
      if: matrix.suite == 'vulkan' && matrix.os == '7950x'
      run: |
        ./setup_venv.ps1
-        pytest -k vulkan -s
+        pytest --benchmark -k vulkan -s
+        type bench_results.csv

    - name: Validate Stable Diffusion Models (Windows)
      if: matrix.suite == 'vulkan' && matrix.os == '7950x'
      run: |
        ./setup_venv.ps1
+        ./shark.venv/Scripts/activate
        python build_tools/stable_diffusion_testing.py --device=vulkan
--- a/.gitignore
+++ b/.gitignore
@@ -177,7 +177,7 @@ onnx_models/
 generated_imgs/

 # Custom model related artefacts
-variants.json
+apps/stable_diffusion/src/utils/resources/variants.json
 models/

 # models folder
--- a/.style.yapf
+++ b/.style.yapf
@@ -0,0 +1,3 @@
+[style]
+  based_on_style = google
+  column_limit = 80
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ High Performance Machine Learning Distribution
  <summary>Prerequisites - Drivers </summary>
  
 #### Install your Windows hardware drivers
-* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-23-2-1).
+* [AMD RDNA Users] Download this specific driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mril-iree). Latest drivers may not work.
 * [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work. 
 * [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
  
@@ -25,32 +25,18 @@ Other users please ensure you have your latest vendor drivers and Vulkan SDK fro
 
 ### Quick Start for SHARK Stable Diffusion for Windows 10/11 Users

-Install the Driver from [Prerequisites](https://github.com/nod-ai/SHARK#install-your-hardware-drivers) above 
+Install the driver from [Prerequisites](https://github.com/nod-ai/SHARK#install-your-hardware-drivers) above

-Download the [stable release](https://github.com/nod-ai/shark/releases/latest)
+Download the latest .exe from the [releases page](https://github.com/nod-ai/SHARK/releases).

-Double click the .exe and you should have the [UI](http://localhost:8080/) in the browser. 
+Double click the .exe and you should have the [UI](http://localhost:8080/?__theme=dark) in the browser.

-If you have custom models put them in a `models/` directory where the .exe is. 
+If you have custom models (ckpt, safetensors), put them in a `models/` directory where the .exe is.

 Enjoy. 

-<details>
-  <summary>More installation notes</summary>
-* We recommend that you download EXE in a new folder, whenever you download a new EXE version. If you download it in the same folder as a previous install, you must delete the old `*.vmfb` files with `rm *.vmfb`. You can also use `--clear_all` flag once to clean all the old files. 
-* If you recently updated the driver or this binary (EXE file), we recommend you clear all the local artifacts with `--clear_all` 
+Some known AMD driver quirks and fixes with cursors are documented [here](https://github.com/nod-ai/SHARK/blob/main/apps/stable_diffusion/stable_diffusion_amd.md).

-## Running
-
-* Open a Command Prompt or Powershell terminal, change folder (`cd`) to the .exe folder. Then run the EXE from the command prompt. That way, if an error occurs, you'll be able to cut-and-paste it to ask for help. (if it always works for you without error, you may simply double-click the EXE)
-* The first run may take few minutes when the models are downloaded and compiled. Your patience is appreciated. The download could be about 5GB.
-* You will likely see a Windows Defender message asking you to give permission to open a web server port. Accept it.
-* Open a browser to access the Stable Diffusion web server. By default, the port is 8080, so you can go to http://localhost:8080/.
-
-## Stopping
-
-* Select the command prompt that's running the EXE. Press CTRL-C and wait a moment or close the terminal. 
-</details>

 <details>
  <summary>Advanced Installation (Only for developers)</summary>
@@ -68,7 +54,7 @@ cd SHARK

 ### Windows 10/11 Users

-* Install the latest Python 3.11.x version from [here](https://www.python.org/downloads/windows/)
+* Install the latest Python 3.10.x version from [here](https://www.python.org/downloads/windows/)

 * Install Git for Windows from [here](https://git-scm.com/download/win)

@@ -119,15 +105,16 @@ source shark.venv/bin/activate

 #### Linux / macOS Users
 ```shell
-python3.11 apps/stable_diffusion/scripts/txt2img.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
+python3.10 apps/stable_diffusion/scripts/txt2img.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
 ```

 You can replace `vulkan` with `cpu` to run on your CPU or with `cuda` to run on CUDA devices. If you have multiple vulkan devices you can address them with `--device=vulkan://1` etc
 </details>

-The output on a AMD 7900XTX would look something like:
+The output on a 7900XTX would look like:

-```shell
+```shell 
+Stats for run 0:
 Average step time: 47.19188690185547ms/it
 Clip Inference time (ms) = 109.531
 VAE Inference time (ms): 78.590
@@ -153,7 +140,7 @@ Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any
 This step sets up a new VirtualEnv for Python

 ```shell
-python --version #Check you have 3.11 on Linux, macOS or Windows Powershell
+python --version #Check you have 3.10 on Linux, macOS or Windows Powershell
 python -m venv shark_venv
 source shark_venv/bin/activate   # Use shark_venv/Scripts/activate on Windows

@@ -167,7 +154,7 @@ python -m pip install --upgrade pip

 ### Install SHARK

-This step pip installs SHARK and related packages on Linux Python 3.8, 3.10 and 3.11 and macOS / Windows Python 3.11
+This step pip installs SHARK and related packages on Linux Python 3.7, 3.8, 3.9, 3.10 and macOS Python 3.10

 ```shell
 pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f  https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
@@ -202,10 +189,10 @@ python ./minilm_jit.py --device="cpu"  #use cuda or vulkan or metal
 <details>
  <summary>Development, Testing and Benchmarks</summary>

-If you want to use Python3.11 and with TF Import tools you can use the environment variables like:
+If you want to use Python3.10 and with TF Import tools you can use the environment variables like:
 Set `USE_IREE=1` to use upstream IREE
 ```
-# PYTHON=python3.11 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh 
+# PYTHON=python3.10 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh 
 ```

 ### Run any of the hundreds of SHARK tank models via the test framework
@@ -215,14 +202,14 @@ python -m  shark.examples.shark_inference.resnet50_script --device="cpu" # Use g
 pytest tank/test_models.py -k "MiniLM"
 ```
  
-### How to use your locally built IREE / Torch-MLIR with SHARK
+
 If you are a *Torch-mlir developer or an IREE developer* and want to test local changes you can uninstall
 the provided packages with `pip uninstall torch-mlir` and / or `pip uninstall iree-compiler iree-runtime` and build locally
 with Python bindings and set your PYTHONPATH as mentioned [here](https://github.com/iree-org/iree/tree/main/docs/api_docs/python#install-iree-binaries)
 for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
 for Torch-MLIR.

-How to use your locally built Torch-MLIR with SHARK:
+### How to use your locally built Torch-MLIR with SHARK
 ```shell
 1.) Run `./setup_venv.sh in SHARK` and activate `shark.venv` virtual env.
 2.) Run `pip uninstall torch-mlir`.
@@ -240,15 +227,9 @@ Now the SHARK will use your locally build Torch-MLIR repo.

 ## Benchmarking Dispatches

-To produce benchmarks of individual dispatches, you can add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your pytest command line argument.  
+To produce benchmarks of individual dispatches, you can add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your command line argument.  
 If you only want to compile specific dispatches, you can specify them with a space seperated string instead of `"All"`.  E.G. `--dispatch_benchmarks="0 1 2 10"`

-For example, to generate and run dispatch benchmarks for MiniLM on CUDA:
-```
-pytest -k "MiniLM and torch and static and cuda" --benchmark_dispatches=All -s --dispatch_benchmarks_dir=./my_dispatch_benchmarks                                                                                
-```
-The given command will populate `<dispatch_benchmarks_dir>/<model_name>/` with an `ordered_dispatches.txt` that lists and orders the dispatches and their latencies, as well as folders for each dispatch that contain .mlir, .vmfb, and results of the benchmark for that dispatch.
-
 if you want to instead incorporate this into a python script, you can pass the `dispatch_benchmarks` and `dispatch_benchmarks_dir` commands when initializing `SharkInference`, and the benchmarks will be generated when compiled.  E.G:

 ```
@@ -272,7 +253,7 @@ Output will include:
 - A .txt file containing benchmark output


-See tank/README.md for further instructions on how to run model tests and benchmarks from the SHARK tank.
+See tank/README.md for instructions on how to run model tests and benchmarks from the SHARK tank.

 </details>

--- a/apps/stable_diffusion/scripts/init.py
+++ b/apps/stable_diffusion/scripts/init.py
@@ -1,4 +1 @@
 from apps.stable_diffusion.scripts.txt2img import txt2img_inf
-from apps.stable_diffusion.scripts.img2img import img2img_inf
-from apps.stable_diffusion.scripts.inpaint import inpaint_inf
-from apps.stable_diffusion.scripts.outpaint import outpaint_inf
--- a/apps/stable_diffusion/scripts/img2img.py
+++ b/apps/stable_diffusion/scripts/img2img.py
@@ -1,332 +0,0 @@
-import sys
-import torch
-import time
-from PIL import Image
-from dataclasses import dataclass
-from apps.stable_diffusion.src import (
-    args,
-    Image2ImagePipeline,
-    StencilPipeline,
-    get_schedulers,
-    set_init_device_flags,
-    utils,
-    clear_all,
-    save_output_img,
-)
-
-
-@dataclass
-class Config:
-    model_id: str
-    ckpt_loc: str
-    precision: str
-    batch_size: int
-    max_length: int
-    height: int
-    width: int
-    device: str
-    use_stencil: str
-
-
-img2img_obj = None
-config_obj = None
-schedulers = None
-
-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir
-
-
-# Exposed to UI.
-def img2img_inf(
-    prompt: str,
-    negative_prompt: str,
-    init_image: Image,
-    height: int,
-    width: int,
-    steps: int,
-    strength: float,
-    guidance_scale: float,
-    seed: int,
-    batch_count: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    use_stencil: str,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-):
-    global img2img_obj
-    global config_obj
-    global schedulers
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.seed = seed
-    args.steps = steps
-    args.strength = strength
-    args.scheduler = scheduler
-    args.img_path = "not none"
-
-    if init_image is None:
-        return None, "An Initial Image is required"
-    image = init_image.convert("RGB")
-
-    # set ckpt_loc and hf_model_id.
-    types = (
-        ".ckpt",
-        ".safetensors",
-    )  # the tuple of file types
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = custom_model
-    else:
-        args.hf_model_id = custom_model
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    use_stencil = None if use_stencil == "None" else use_stencil
-    args.use_stencil = use_stencil
-    if use_stencil is not None:
-        args.scheduler = "DDIM"
-        args.hf_model_id = "runwayml/stable-diffusion-v1-5"
-    elif args.scheduler != "PNDM":
-        if "Shark" in args.scheduler:
-            print(
-                f"SharkEulerDiscrete scheduler not supported. Switching to PNDM scheduler"
-            )
-            args.scheduler = "PNDM"
-        else:
-            sys.exit(
-                "Img2Img works best with PNDM scheduler. Other schedulers are not supported yet."
-            )
-    cpu_scheduling = not args.scheduler.startswith("Shark")
-    args.precision = precision
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    new_config_obj = Config(
-        args.hf_model_id,
-        args.ckpt_loc,
-        precision,
-        batch_size,
-        max_length,
-        height,
-        width,
-        device,
-        use_stencil,
-    )
-    if not img2img_obj or config_obj != new_config_obj:
-        config_obj = new_config_obj
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.height = height
-        args.width = width
-        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-1-base"
-        )
-        schedulers = get_schedulers(model_id)
-        scheduler_obj = schedulers[scheduler]
-        if use_stencil is not None:
-            args.use_tuned = False
-            img2img_obj = StencilPipeline.from_pretrained(
-                scheduler_obj,
-                args.import_mlir,
-                args.hf_model_id,
-                args.ckpt_loc,
-                args.custom_vae,
-                args.precision,
-                args.max_length,
-                args.batch_size,
-                args.height,
-                args.width,
-                args.use_base_vae,
-                args.use_tuned,
-                low_cpu_mem_usage=args.low_cpu_mem_usage,
-                use_stencil=use_stencil,
-            )
-        else:
-            img2img_obj = Image2ImagePipeline.from_pretrained(
-                scheduler_obj,
-                args.import_mlir,
-                args.hf_model_id,
-                args.ckpt_loc,
-                args.custom_vae,
-                args.precision,
-                args.max_length,
-                args.batch_size,
-                args.height,
-                args.width,
-                args.use_base_vae,
-                args.use_tuned,
-                low_cpu_mem_usage=args.low_cpu_mem_usage,
-            )
-
-    img2img_obj.scheduler = schedulers[scheduler]
-
-    start_time = time.time()
-    img2img_obj.log = ""
-    generated_imgs = []
-    seeds = []
-    img_seed = utils.sanitize_seed(seed)
-    extra_info = {"STRENGTH": strength}
-    for current_batch in range(batch_count):
-        if current_batch > 0:
-            img_seed = utils.sanitize_seed(-1)
-        out_imgs = img2img_obj.generate_images(
-            prompt,
-            negative_prompt,
-            image,
-            batch_size,
-            height,
-            width,
-            steps,
-            strength,
-            guidance_scale,
-            img_seed,
-            args.max_length,
-            dtype,
-            args.use_base_vae,
-            cpu_scheduling,
-            use_stencil=use_stencil,
-        )
-        save_output_img(out_imgs[0], img_seed, extra_info)
-        generated_imgs.extend(out_imgs)
-        seeds.append(img_seed)
-        img2img_obj.log += "\n"
-
-    total_time = time.time() - start_time
-    text_output = f"prompt={args.prompts}"
-    text_output += f"\nnegative prompt={args.negative_prompts}"
-    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
-    text_output += f"\nscheduler={args.scheduler}, device={device}"
-    text_output += f"\nsteps={steps}, strength={args.strength}, guidance_scale={guidance_scale}, seed={seeds}"
-    text_output += f"\nsize={height}x{width}, batch_count={batch_count}, batch_size={batch_size}, max_length={args.max_length}"
-    text_output += img2img_obj.log
-    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
-
-    return generated_imgs, text_output
-
-
-if __name__ == "__main__":
-    if args.clear_all:
-        clear_all()
-
-    if args.img_path is None:
-        print("Flag --img_path is required.")
-        exit()
-
-    # When the models get uploaded, it should be default to False.
-    args.import_mlir = True
-
-    use_stencil = args.use_stencil
-    if use_stencil:
-        args.scheduler = "DDIM"
-        args.hf_model_id = "runwayml/stable-diffusion-v1-5"
-    elif args.scheduler != "PNDM":
-        if "Shark" in args.scheduler:
-            print(
-                f"SharkEulerDiscrete scheduler not supported. Switching to PNDM scheduler"
-            )
-            args.scheduler = "PNDM"
-        else:
-            sys.exit(
-                "Img2Img works best with PNDM scheduler. Other schedulers are not supported yet."
-            )
-    cpu_scheduling = not args.scheduler.startswith("Shark")
-    dtype = torch.float32 if args.precision == "fp32" else torch.half
-    set_init_device_flags()
-    schedulers = get_schedulers(args.hf_model_id)
-
-    scheduler_obj = schedulers[args.scheduler]
-    image = Image.open(args.img_path).convert("RGB")
-    seed = utils.sanitize_seed(args.seed)
-    # Adjust for height and width based on model
-
-    if use_stencil:
-        img2img_obj = StencilPipeline.from_pretrained(
-            scheduler_obj,
-            args.import_mlir,
-            args.hf_model_id,
-            args.ckpt_loc,
-            args.custom_vae,
-            args.precision,
-            args.max_length,
-            args.batch_size,
-            args.height,
-            args.width,
-            args.use_base_vae,
-            args.use_tuned,
-            low_cpu_mem_usage=args.low_cpu_mem_usage,
-            use_stencil=use_stencil,
-        )
-    else:
-        img2img_obj = Image2ImagePipeline.from_pretrained(
-            scheduler_obj,
-            args.import_mlir,
-            args.hf_model_id,
-            args.ckpt_loc,
-            args.custom_vae,
-            args.precision,
-            args.max_length,
-            args.batch_size,
-            args.height,
-            args.width,
-            args.use_base_vae,
-            args.use_tuned,
-            low_cpu_mem_usage=args.low_cpu_mem_usage,
-        )
-
-    start_time = time.time()
-    generated_imgs = img2img_obj.generate_images(
-        args.prompts,
-        args.negative_prompts,
-        image,
-        args.batch_size,
-        args.height,
-        args.width,
-        args.steps,
-        args.strength,
-        args.guidance_scale,
-        seed,
-        args.max_length,
-        dtype,
-        args.use_base_vae,
-        cpu_scheduling,
-        use_stencil=use_stencil,
-    )
-    total_time = time.time() - start_time
-    text_output = f"prompt={args.prompts}"
-    text_output += f"\nnegative prompt={args.negative_prompts}"
-    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
-    text_output += f"\nscheduler={args.scheduler}, device={args.device}"
-    text_output += f"\nsteps={args.steps}, strength={args.strength}, guidance_scale={args.guidance_scale}, seed={seed}, size={args.height}x{args.width}"
-    text_output += (
-        f", batch size={args.batch_size}, max_length={args.max_length}"
-    )
-    text_output += img2img_obj.log
-    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
-
-    extra_info = {"STRENGTH": args.strength}
-    save_output_img(generated_imgs[0], seed, extra_info)
-    print(text_output)
--- a/apps/stable_diffusion/scripts/inpaint.py
+++ b/apps/stable_diffusion/scripts/inpaint.py
@@ -1,269 +0,0 @@
-import sys
-import torch
-import time
-from PIL import Image
-from dataclasses import dataclass
-from apps.stable_diffusion.src import (
-    args,
-    InpaintPipeline,
-    get_schedulers,
-    set_init_device_flags,
-    utils,
-    clear_all,
-    save_output_img,
-)
-
-
-@dataclass
-class Config:
-    model_id: str
-    ckpt_loc: str
-    precision: str
-    batch_size: int
-    max_length: int
-    height: int
-    width: int
-    device: str
-
-
-inpaint_obj = None
-config_obj = None
-schedulers = None
-
-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir
-
-
-# Exposed to UI.
-def inpaint_inf(
-    prompt: str,
-    negative_prompt: str,
-    image_dict,
-    height: int,
-    width: int,
-    inpaint_full_res: bool,
-    inpaint_full_res_padding: int,
-    steps: int,
-    guidance_scale: float,
-    seed: int,
-    batch_count: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-):
-    global inpaint_obj
-    global config_obj
-    global schedulers
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.steps = steps
-    args.scheduler = scheduler
-    args.img_path = "not none"
-    args.mask_path = "not none"
-
-    # set ckpt_loc and hf_model_id.
-    types = (
-        ".ckpt",
-        ".safetensors",
-    )  # the tuple of file types
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = custom_model
-    else:
-        args.hf_model_id = custom_model
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    cpu_scheduling = not scheduler.startswith("Shark")
-    new_config_obj = Config(
-        args.hf_model_id,
-        args.ckpt_loc,
-        precision,
-        batch_size,
-        max_length,
-        height,
-        width,
-        device,
-    )
-    if not inpaint_obj or config_obj != new_config_obj:
-        config_obj = new_config_obj
-        args.precision = precision
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.height = height
-        args.width = width
-        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-inpainting"
-        )
-        schedulers = get_schedulers(model_id)
-        scheduler_obj = schedulers[scheduler]
-        inpaint_obj = InpaintPipeline.from_pretrained(
-            scheduler_obj,
-            args.import_mlir,
-            args.hf_model_id,
-            args.ckpt_loc,
-            args.custom_vae,
-            args.precision,
-            args.max_length,
-            args.batch_size,
-            args.height,
-            args.width,
-            args.use_base_vae,
-            args.use_tuned,
-        )
-
-    inpaint_obj.scheduler = schedulers[scheduler]
-
-    start_time = time.time()
-    inpaint_obj.log = ""
-    generated_imgs = []
-    seeds = []
-    img_seed = utils.sanitize_seed(seed)
-    image = image_dict["image"]
-    mask_image = image_dict["mask"]
-    for i in range(batch_count):
-        if i > 0:
-            img_seed = utils.sanitize_seed(-1)
-        out_imgs = inpaint_obj.generate_images(
-            prompt,
-            negative_prompt,
-            image,
-            mask_image,
-            batch_size,
-            height,
-            width,
-            inpaint_full_res,
-            inpaint_full_res_padding,
-            steps,
-            guidance_scale,
-            img_seed,
-            args.max_length,
-            dtype,
-            args.use_base_vae,
-            cpu_scheduling,
-        )
-        save_output_img(out_imgs[0], img_seed)
-        generated_imgs.extend(out_imgs)
-        seeds.append(img_seed)
-        inpaint_obj.log += "\n"
-
-    total_time = time.time() - start_time
-    text_output = f"prompt={args.prompts}"
-    text_output += f"\nnegative prompt={args.negative_prompts}"
-    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
-    text_output += f"\nscheduler={args.scheduler}, device={device}"
-    text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={seeds}"
-    text_output += f"\nsize={args.height}x{args.width}, batch-count={batch_count}, batch-size={args.batch_size}, max_length={args.max_length}"
-    text_output += inpaint_obj.log
-    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
-
-    return generated_imgs, text_output
-
-
-if __name__ == "__main__":
-    if args.clear_all:
-        clear_all()
-
-    if args.img_path is None:
-        print("Flag --img_path is required.")
-        exit()
-    if args.mask_path is None:
-        print("Flag --mask_path is required.")
-        exit()
-
-    dtype = torch.float32 if args.precision == "fp32" else torch.half
-    cpu_scheduling = not args.scheduler.startswith("Shark")
-    set_init_device_flags()
-    model_id = (
-        args.hf_model_id
-        if "inpaint" in args.hf_model_id
-        else "stabilityai/stable-diffusion-2-inpainting"
-    )
-    schedulers = get_schedulers(model_id)
-    scheduler_obj = schedulers[args.scheduler]
-    seed = args.seed
-    image = Image.open(args.img_path)
-    mask_image = Image.open(args.mask_path)
-
-    inpaint_obj = InpaintPipeline.from_pretrained(
-        scheduler_obj,
-        args.import_mlir,
-        args.hf_model_id,
-        args.ckpt_loc,
-        args.custom_vae,
-        args.precision,
-        args.max_length,
-        args.batch_size,
-        args.height,
-        args.width,
-        args.use_base_vae,
-        args.use_tuned,
-    )
-
-    for current_batch in range(args.batch_count):
-        if current_batch > 0:
-            seed = -1
-        seed = utils.sanitize_seed(seed)
-
-        start_time = time.time()
-        generated_imgs = inpaint_obj.generate_images(
-            args.prompts,
-            args.negative_prompts,
-            image,
-            mask_image,
-            args.batch_size,
-            args.height,
-            args.width,
-            args.inpaint_full_res,
-            args.inpaint_full_res_padding,
-            args.steps,
-            args.guidance_scale,
-            seed,
-            args.max_length,
-            dtype,
-            args.use_base_vae,
-            cpu_scheduling,
-        )
-        total_time = time.time() - start_time
-        text_output = f"prompt={args.prompts}"
-        text_output += f"\nnegative prompt={args.negative_prompts}"
-        text_output += (
-            f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
-        )
-        text_output += f"\nscheduler={args.scheduler}, device={args.device}"
-        text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={seed}, size={args.height}x{args.width}"
-        text_output += (
-            f", batch size={args.batch_size}, max_length={args.max_length}"
-        )
-        text_output += inpaint_obj.log
-        text_output += f"\nTotal image generation time: {total_time:.4f}sec"
-
-        save_output_img(generated_imgs[0], seed)
-        print(text_output)
--- a/apps/stable_diffusion/scripts/outpaint.py
+++ b/apps/stable_diffusion/scripts/outpaint.py
@@ -1,298 +0,0 @@
-import sys
-import torch
-import time
-from PIL import Image
-from dataclasses import dataclass
-from apps.stable_diffusion.src import (
-    args,
-    OutpaintPipeline,
-    get_schedulers,
-    set_init_device_flags,
-    utils,
-    clear_all,
-    save_output_img,
-)
-
-
-@dataclass
-class Config:
-    model_id: str
-    ckpt_loc: str
-    precision: str
-    batch_size: int
-    max_length: int
-    height: int
-    width: int
-    device: str
-
-
-outpaint_obj = None
-config_obj = None
-schedulers = None
-
-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir
-
-
-# Exposed to UI.
-def outpaint_inf(
-    prompt: str,
-    negative_prompt: str,
-    init_image: Image,
-    pixels: int,
-    mask_blur: int,
-    directions: list,
-    noise_q: float,
-    color_variation: float,
-    height: int,
-    width: int,
-    steps: int,
-    guidance_scale: float,
-    seed: int,
-    batch_count: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-):
-    global outpaint_obj
-    global config_obj
-    global schedulers
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.steps = steps
-    args.scheduler = scheduler
-    args.img_path = "not none"
-
-    # set ckpt_loc and hf_model_id.
-    types = (
-        ".ckpt",
-        ".safetensors",
-    )  # the tuple of file types
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = custom_model
-    else:
-        args.hf_model_id = custom_model
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    cpu_scheduling = not scheduler.startswith("Shark")
-    new_config_obj = Config(
-        args.hf_model_id,
-        args.ckpt_loc,
-        precision,
-        batch_size,
-        max_length,
-        height,
-        width,
-        device,
-    )
-    if not outpaint_obj or config_obj != new_config_obj:
-        config_obj = new_config_obj
-        args.precision = precision
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.height = height
-        args.width = width
-        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-inpainting"
-        )
-        schedulers = get_schedulers(model_id)
-        scheduler_obj = schedulers[scheduler]
-        outpaint_obj = OutpaintPipeline.from_pretrained(
-            scheduler_obj,
-            args.import_mlir,
-            args.hf_model_id,
-            args.ckpt_loc,
-            args.custom_vae,
-            args.precision,
-            args.max_length,
-            args.batch_size,
-            args.height,
-            args.width,
-            args.use_base_vae,
-            args.use_tuned,
-        )
-
-    outpaint_obj.scheduler = schedulers[scheduler]
-
-    start_time = time.time()
-    outpaint_obj.log = ""
-    generated_imgs = []
-    seeds = []
-    img_seed = utils.sanitize_seed(seed)
-
-    left = True if "left" in directions else False
-    right = True if "right" in directions else False
-    top = True if "up" in directions else False
-    bottom = True if "down" in directions else False
-
-    for i in range(batch_count):
-        if i > 0:
-            img_seed = utils.sanitize_seed(-1)
-        out_imgs = outpaint_obj.generate_images(
-            prompt,
-            negative_prompt,
-            init_image,
-            pixels,
-            mask_blur,
-            left,
-            right,
-            top,
-            bottom,
-            noise_q,
-            color_variation,
-            batch_size,
-            height,
-            width,
-            steps,
-            guidance_scale,
-            img_seed,
-            args.max_length,
-            dtype,
-            args.use_base_vae,
-            cpu_scheduling,
-        )
-        save_output_img(out_imgs[0], img_seed)
-        generated_imgs.extend(out_imgs)
-        seeds.append(img_seed)
-        outpaint_obj.log += "\n"
-
-    total_time = time.time() - start_time
-    text_output = f"prompt={args.prompts}"
-    text_output += f"\nnegative prompt={args.negative_prompts}"
-    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
-    text_output += f"\nscheduler={args.scheduler}, device={device}"
-    text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={seeds}"
-    text_output += f"\nsize={args.height}x{args.width}, batch-count={batch_count}, batch-size={args.batch_size}, max_length={args.max_length}"
-    text_output += outpaint_obj.log
-    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
-
-    return generated_imgs, text_output
-
-
-if __name__ == "__main__":
-    if args.clear_all:
-        clear_all()
-
-    if args.img_path is None:
-        print("Flag --img_path is required.")
-        exit()
-
-    dtype = torch.float32 if args.precision == "fp32" else torch.half
-    cpu_scheduling = not args.scheduler.startswith("Shark")
-    set_init_device_flags()
-    model_id = (
-        args.hf_model_id
-        if "inpaint" in args.hf_model_id
-        else "stabilityai/stable-diffusion-2-inpainting"
-    )
-    schedulers = get_schedulers(model_id)
-    scheduler_obj = schedulers[args.scheduler]
-    seed = args.seed
-    image = Image.open(args.img_path)
-
-    outpaint_obj = OutpaintPipeline.from_pretrained(
-        scheduler_obj,
-        args.import_mlir,
-        args.hf_model_id,
-        args.ckpt_loc,
-        args.custom_vae,
-        args.precision,
-        args.max_length,
-        args.batch_size,
-        args.height,
-        args.width,
-        args.use_base_vae,
-        args.use_tuned,
-    )
-
-    for current_batch in range(args.batch_count):
-        if current_batch > 0:
-            seed = -1
-        seed = utils.sanitize_seed(seed)
-
-        start_time = time.time()
-        generated_imgs = outpaint_obj.generate_images(
-            args.prompts,
-            args.negative_prompts,
-            image,
-            args.pixels,
-            args.mask_blur,
-            args.left,
-            args.right,
-            args.top,
-            args.bottom,
-            args.noise_q,
-            args.color_variation,
-            args.batch_size,
-            args.height,
-            args.width,
-            args.steps,
-            args.guidance_scale,
-            seed,
-            args.max_length,
-            dtype,
-            args.use_base_vae,
-            cpu_scheduling,
-        )
-        total_time = time.time() - start_time
-        text_output = f"prompt={args.prompts}"
-        text_output += f"\nnegative prompt={args.negative_prompts}"
-        text_output += (
-            f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
-        )
-        text_output += f"\nscheduler={args.scheduler}, device={args.device}"
-        text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={seed}, size={args.height}x{args.width}"
-        text_output += (
-            f", batch size={args.batch_size}, max_length={args.max_length}"
-        )
-        text_output += outpaint_obj.log
-        text_output += f"\nTotal image generation time: {total_time:.4f}sec"
-
-        # save this information as metadata of output generated image.
-        directions = []
-        if args.left:
-            directions.append("left")
-        if args.right:
-            directions.append("right")
-        if args.top:
-            directions.append("up")
-        if args.bottom:
-            directions.append("down")
-        extra_info = {
-            "PIXELS": args.pixels,
-            "MASK_BLUR": args.mask_blur,
-            "DIRECTIONS": directions,
-            "NOISE_Q": args.noise_q,
-            "COLOR_VARIATION": args.color_variation,
-        }
-        save_output_img(generated_imgs[0], seed, extra_info)
-        print(text_output)
--- a/apps/stable_diffusion/scripts/txt2img.py
+++ b/apps/stable_diffusion/scripts/txt2img.py
@@ -1,15 +1,24 @@
+import os
+
+if "AMD_ENABLE_LLPC" not in os.environ:
+    os.environ["AMD_ENABLE_LLPC"] = "1"
+
 import sys
+import json
 import torch
+import re
 import time
+from pathlib import Path
+from PIL import PngImagePlugin
+from datetime import datetime as dt
 from dataclasses import dataclass
+from csv import DictWriter
 from apps.stable_diffusion.src import (
    args,
    Text2ImagePipeline,
    get_schedulers,
    set_init_device_flags,
    utils,
-    clear_all,
-    save_output_img,
 )


@@ -25,15 +34,100 @@ class Config:
    device: str


+# This has to come before importing cache objects
+if args.clear_all:
+    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
+    from glob import glob
+    import shutil
+
+    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
+    for vmfb in vmfbs:
+        if os.path.exists(vmfb):
+            os.remove(vmfb)
+    # Temporary workaround of deleting yaml files to incorporate diffusers' pipeline.
+    # TODO: Remove this once we have better weight update logic.
+    inference_yaml = ["v2-inference-v.yaml", "v1-inference.yaml"]
+    for yaml in inference_yaml:
+        if os.path.exists(yaml):
+            os.remove(yaml)
+    home = os.path.expanduser("~")
+    if os.name == "nt":  # Windows
+        appdata = os.getenv("LOCALAPPDATA")
+        shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
+        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
+    elif os.name == "posix":  # Linux/macOS; os.name is never "unix"
+        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"), ignore_errors=True)
+        shutil.rmtree(os.path.join(home, ".local/shark_tank"), ignore_errors=True)
+
+
+# Save a generated image and log the inputs that produced it (CSV row, optional PNG text / JSON).
+def save_output_img(output_img, img_seed):
+    output_path = args.output_dir if args.output_dir else Path.cwd()
+    generated_imgs_path = Path(output_path, "generated_imgs")
+    generated_imgs_path.mkdir(parents=True, exist_ok=True)
+    csv_path = Path(generated_imgs_path, "imgs_details.csv")
+    # File stem: sanitized first 15 chars of the first prompt + seed + timestamp.
+    prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[0][:15])
+    out_img_name = (
+        f"{prompt_slice}_{img_seed}_{dt.now().strftime('%y%m%d_%H%M%S')}"
+    )
+
+    img_model = args.hf_model_id
+    if args.ckpt_loc:
+        img_model = os.path.basename(args.ckpt_loc)
+    # Anything other than jpg is written as png (with a warning for unknown formats).
+    if args.output_img_format == "jpg":
+        out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
+        output_img.save(out_img_path, quality=95, subsampling=0)
+    else:
+        out_img_path = Path(generated_imgs_path, f"{out_img_name}.png")
+        pngInfo = PngImagePlugin.PngInfo()
+
+        if args.write_metadata_to_png:
+            pngInfo.add_text(
+                "parameters",
+                f"{args.prompts[0]}\nNegative prompt: {args.negative_prompts[0]}\nSteps:{args.steps}, Sampler: {args.scheduler}, CFG scale: {args.guidance_scale}, Seed: {img_seed}, Size: {args.width}x{args.height}, Model: {img_model}",
+            )
+
+        output_img.save(out_img_path, "PNG", pnginfo=pngInfo)
+
+        if args.output_img_format not in ["png", "jpg"]:
+            print(
+                f"[ERROR] Format {args.output_img_format} is not supported yet. "
+                "Image saved as png instead. Supported formats: png / jpg"
+            )
+
+    new_entry = {
+        "VARIANT": img_model,
+        "SCHEDULER": args.scheduler,
+        "PROMPT": args.prompts[0],
+        "NEG_PROMPT": args.negative_prompts[0],
+        "SEED": img_seed,
+        "CFG_SCALE": args.guidance_scale,
+        "PRECISION": args.precision,
+        "STEPS": args.steps,
+        "HEIGHT": args.height,
+        "WIDTH": args.width,
+        "MAX_LENGTH": args.max_length,
+        "OUTPUT": out_img_path,
+    }
+
+    # newline="" is what the csv module expects; without it Windows gets blank rows.
+    with open(csv_path, "a", newline="") as csv_obj:
+        dictwriter_obj = DictWriter(csv_obj, fieldnames=list(new_entry.keys()))
+        dictwriter_obj.writerow(new_entry)
+
+    if args.save_metadata_to_json:
+        del new_entry["OUTPUT"]  # a Path object is not JSON serializable
+        json_path = Path(generated_imgs_path, f"{out_img_name}.json")
+        with open(json_path, "w") as f:
+            json.dump(new_entry, f, indent=4)
+
+
 txt2img_obj = None
 config_obj = None
 schedulers = None

-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir
-

 # Exposed to UI.
 def txt2img_inf(
@@ -99,7 +193,7 @@ def txt2img_inf(
        width,
        device,
    )
-    if not txt2img_obj or config_obj != new_config_obj:
+    if config_obj != new_config_obj:
        config_obj = new_config_obj
        args.precision = precision
        args.batch_size = batch_size
@@ -107,10 +201,8 @@ def txt2img_inf(
        args.height = height
        args.width = width
        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        args.img_path = None
+        args.use_tuned = True
+        args.import_mlir = False
        set_init_device_flags()
        model_id = (
            args.hf_model_id
@@ -124,7 +216,6 @@ def txt2img_inf(
            args.import_mlir,
            args.hf_model_id,
            args.ckpt_loc,
-            args.custom_vae,
            args.precision,
            args.max_length,
            args.batch_size,
@@ -132,9 +223,11 @@ def txt2img_inf(
            args.width,
            args.use_base_vae,
            args.use_tuned,
-            low_cpu_mem_usage=args.low_cpu_mem_usage,
        )

+    if not txt2img_obj:
+        sys.exit("text to image pipeline must not return a null value")
+
    txt2img_obj.scheduler = schedulers[scheduler]

    start_time = time.time()
@@ -163,27 +256,21 @@ def txt2img_inf(
        generated_imgs.extend(out_imgs)
        seeds.append(img_seed)
        txt2img_obj.log += "\n"
-        yield generated_imgs, generated_imgs[0], txt2img_obj.log

    total_time = time.time() - start_time
    text_output = f"prompt={args.prompts}"
    text_output += f"\nnegative prompt={args.negative_prompts}"
    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
    text_output += f"\nscheduler={args.scheduler}, device={device}"
-    text_output += (
-        f"\nsteps={steps}, guidance_scale={guidance_scale}, seed={seeds}"
-    )
-    text_output += f"\nsize={height}x{width}, batch_count={batch_count}, batch_size={batch_size}, max_length={args.max_length}"
-    # text_output += txt2img_obj.log
+    text_output += f"\nsteps={args.steps}, guidance_scale={args.guidance_scale}, seed={seeds}"
+    text_output += f"\nsize={args.height}x{args.width}, batch-count={batch_count}, batch-size={args.batch_size}, max_length={args.max_length}"
+    text_output += txt2img_obj.log
    text_output += f"\nTotal image generation time: {total_time:.4f}sec"

-    yield generated_imgs, text_output
+    return generated_imgs, text_output


 if __name__ == "__main__":
-    if args.clear_all:
-        clear_all()
-
    dtype = torch.float32 if args.precision == "fp32" else torch.half
    cpu_scheduling = not args.scheduler.startswith("Shark")
    set_init_device_flags()
@@ -196,7 +283,6 @@ if __name__ == "__main__":
        args.import_mlir,
        args.hf_model_id,
        args.ckpt_loc,
-        args.custom_vae,
        args.precision,
        args.max_length,
        args.batch_size,
@@ -204,11 +290,10 @@ if __name__ == "__main__":
        args.width,
        args.use_base_vae,
        args.use_tuned,
-        low_cpu_mem_usage=args.low_cpu_mem_usage,
    )

-    for current_batch in range(args.batch_count):
-        if current_batch > 0:
+    for run in range(args.runs):
+        if run > 0:
            seed = -1
        seed = utils.sanitize_seed(seed)

@@ -238,7 +323,7 @@ if __name__ == "__main__":
        text_output += (
            f", batch size={args.batch_size}, max_length={args.max_length}"
        )
-        # TODO: if using --batch_count=x txt2img_obj.log will output on each display every iteration infos from the start
+        # TODO: with --runs=x, txt2img_obj.log is printed in full on every iteration, repeating the info of all earlier runs.
        text_output += txt2img_obj.log
        text_output += f"\nTotal image generation time: {total_time:.4f}sec"

--- a/apps/stable_diffusion/shark_sd.spec
+++ b/apps/stable_diffusion/shark_sd.spec
@@ -15,12 +15,12 @@ datas += copy_metadata('filelock')
 datas += copy_metadata('numpy')
 datas += copy_metadata('tokenizers')
 datas += copy_metadata('importlib_metadata')
+datas += copy_metadata('torchvision')
 datas += copy_metadata('torch-mlir')
+datas += copy_metadata('diffusers')
+datas += copy_metadata('transformers')
 datas += copy_metadata('omegaconf')
 datas += copy_metadata('safetensors')
-datas += collect_data_files('diffusers')
-datas += collect_data_files('transformers')
-datas += collect_data_files('opencv-python')
 datas += collect_data_files('gradio')
 datas += collect_data_files('iree')
 datas += collect_data_files('google-cloud-storage')
@@ -30,8 +30,8 @@ datas += [
         ( 'src/utils/resources/model_db.json', 'resources' ),
         ( 'src/utils/resources/opt_flags.json', 'resources' ),
         ( 'src/utils/resources/base_model.json', 'resources' ),
-         ( 'web/ui/css/*', 'ui/css' ),
-         ( 'web/ui/logos/*', 'logos' )
+         ( 'web/css/*', 'css' ),
+         ( 'web/logos/*', 'logos' )
         ]

 binaries = []
@@ -44,7 +44,7 @@ a = Analysis(
    pathex=['.'],
    binaries=binaries,
    datas=datas,
-    hiddenimports=['shark', 'shark.shark_inference', 'apps'],
+    hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core', 'gradio', 'apps'],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
--- a/apps/stable_diffusion/shark_sd_cli.spec
+++ b/apps/stable_diffusion/shark_sd_cli.spec
@@ -15,12 +15,12 @@ datas += copy_metadata('filelock')
 datas += copy_metadata('numpy')
 datas += copy_metadata('tokenizers')
 datas += copy_metadata('importlib_metadata')
+datas += copy_metadata('torchvision')
 datas += copy_metadata('torch-mlir')
+datas += copy_metadata('diffusers')
+datas += copy_metadata('transformers')
 datas += copy_metadata('omegaconf')
 datas += copy_metadata('safetensors')
-datas += collect_data_files('diffusers')
-datas += collect_data_files('transformers')
-datas += collect_data_files('opencv-python')
 datas += collect_data_files('gradio')
 datas += collect_data_files('iree')
 datas += collect_data_files('google-cloud-storage')
@@ -42,7 +42,7 @@ a = Analysis(
    pathex=['.'],
    binaries=binaries,
    datas=datas,
-    hiddenimports=['shark', 'shark.shark_inference', 'apps'],
+    hiddenimports=['shark', 'shark.*', 'shark.shark_inference', 'shark_inference', 'iree.tools.core', 'gradio', 'apps'],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
--- a/apps/stable_diffusion/src/init.py
+++ b/apps/stable_diffusion/src/init.py
@@ -3,14 +3,6 @@ from apps.stable_diffusion.src.utils import (
    set_init_device_flags,
    prompt_examples,
    get_available_devices,
-    clear_all,
-    save_output_img,
-)
-from apps.stable_diffusion.src.pipelines import (
-    Text2ImagePipeline,
-    Image2ImagePipeline,
-    InpaintPipeline,
-    OutpaintPipeline,
-    StencilPipeline,
 )
+from apps.stable_diffusion.src.pipelines import Text2ImagePipeline
 from apps.stable_diffusion.src.schedulers import get_schedulers
--- a/apps/stable_diffusion/src/models/init.py
+++ b/apps/stable_diffusion/src/models/init.py
@@ -2,7 +2,6 @@ from apps.stable_diffusion.src.models.model_wrappers import (
    SharkifyStableDiffusionModel,
 )
 from apps.stable_diffusion.src.models.opt_params import (
-    get_vae_encode,
    get_vae,
    get_unet,
    get_clip,
--- a/apps/stable_diffusion/src/models/model_wrappers.py
+++ b/apps/stable_diffusion/src/models/model_wrappers.py
@@ -1,10 +1,11 @@
-from diffusers import AutoencoderKL, UNet2DConditionModel, ControlNetModel
+from diffusers import AutoencoderKL, UNet2DConditionModel
 from transformers import CLIPTextModel
 from collections import defaultdict
 import torch
-import safetensors.torch
 import traceback
+import re
 import sys
+import os
 from apps.stable_diffusion.src.utils import (
    compile_through_fx,
    get_opt_flags,
@@ -14,9 +15,6 @@ from apps.stable_diffusion.src.utils import (
    preprocessCKPT,
    get_path_to_diffusers_checkpoint,
    fetch_and_update_base_model_id,
-    get_path_stem,
-    get_extended_name,
-    get_stencil_model_id,
 )


@@ -31,19 +29,15 @@ def replace_shape_str(shape, max_len, width, height, batch_size):
        elif shape[i] == "width":
            new_shape.append(width)
        elif isinstance(shape[i], str):
-            mul_val = int(shape[i].split("*")[0])
            if "batch_size" in shape[i]:
+                mul_val = int(shape[i].split("*")[0])
                new_shape.append(batch_size * mul_val)
-            elif "height" in shape[i]:
-                new_shape.append(height * mul_val)
-            elif "width" in shape[i]:
-                new_shape.append(width * mul_val)
        else:
            new_shape.append(shape[i])
    return new_shape


-# Get the input info for various models i.e. "unet", "clip", "vae", "vae_encode".
+# Get the input info for various models i.e. "unet", "clip", "vae".
 def get_input_info(model_info, max_len, width, height, batch_size):
    dtype_config = {"f32": torch.float32, "i64": torch.int64}
    input_map = defaultdict(list)
@@ -73,7 +67,6 @@ class SharkifyStableDiffusionModel:
        self,
        model_id: str,
        custom_weights: str,
-        custom_vae: str,
        precision: str,
        max_len: int = 64,
        width: int = 512,
@@ -81,9 +74,9 @@ class SharkifyStableDiffusionModel:
        batch_size: int = 1,
        use_base_vae: bool = False,
        use_tuned: bool = False,
-        low_cpu_mem_usage: bool = False,
-        is_inpaint: bool = False,
-        use_stencil: str = None
+        debug: bool = False,
+        sharktank_dir: str = "",
+        generate_vmfb: bool = True,
    ):
        self.check_params(max_len, width, height)
        self.max_len = max_len
@@ -97,14 +90,11 @@ class SharkifyStableDiffusionModel:
            ), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
            custom_weights = get_path_to_diffusers_checkpoint(custom_weights)
        self.model_id = model_id if custom_weights == "" else custom_weights
-        # TODO: remove the following line when stable-diffusion-2-1 works
-        if self.model_id == "stabilityai/stable-diffusion-2-1":
-            self.model_id = "stabilityai/stable-diffusion-2-1-base"
-        self.custom_vae = custom_vae
        self.precision = precision
        self.base_vae = use_base_vae
        self.model_name = (
-            str(batch_size)
+            "_"
+            + str(batch_size)
            + "_"
            + str(max_len)
            + "_"
@@ -117,29 +107,20 @@ class SharkifyStableDiffusionModel:
        self.use_tuned = use_tuned
        if use_tuned:
            self.model_name = self.model_name + "_tuned"
-        self.model_name = self.model_name + "_" + get_path_stem(self.model_id)
-        self.low_cpu_mem_usage = low_cpu_mem_usage
-        self.is_inpaint = is_inpaint
-        self.use_stencil = get_stencil_model_id(use_stencil)
-
-    def get_extended_name_for_all_model(self, mask_to_fetch):
-        model_name = {}
-        sub_model_list = ["clip", "unet", "stencil_unet", "vae", "vae_encode", "stencil_adaptor"]
-        index = 0
-        for model in sub_model_list:
-            if mask_to_fetch[index] == False:
-                index += 1
-                continue
-            sub_model = model
-            model_config = self.model_name
-            if "vae" == model:
-                if self.custom_vae != "":
-                    model_config = model_config + get_path_stem(self.custom_vae)
-                if self.base_vae:
-                    sub_model = "base_vae"
-            model_name[model] = get_extended_name(sub_model + model_config)
-            index += 1
-        return model_name
+        # We need a better naming convention for the .vmfbs because despite
+        # using the custom model variant the .vmfb names remain the same and
+        # it'll always pick up the compiled .vmfb instead of compiling the
+        # custom model.
+        # So, currently, we add `self.model_id` in the `self.model_name` of
+        # .vmfb file.
+        # TODO: Have a better way of naming the vmfbs using self.model_name.
+        model_name = re.sub(r"\W+", "_", self.model_id)
+        if model_name[0] == "_":
+            model_name = model_name[1:]
+        self.model_name = self.model_name + "_" + model_name
+        self.debug = debug
+        self.sharktank_dir = sharktank_dir
+        self.generate_vmfb = generate_vmfb

    def check_params(self, max_len, width, height):
        if not (max_len >= 32 and max_len <= 77):
@@ -149,57 +130,14 @@ class SharkifyStableDiffusionModel:
        if not (height % 8 == 0 and height >= 384):
            sys.exit("height should be greater than 384 and multiple of 8")

-    def get_vae_encode(self):
-        class VaeEncodeModel(torch.nn.Module):
-            def __init__(self, model_id=self.model_id, low_cpu_mem_usage=False):
+    def get_vae(self):
+        class VaeModel(torch.nn.Module):
+            def __init__(self, model_id=self.model_id, base_vae=self.base_vae):
                super().__init__()
                self.vae = AutoencoderKL.from_pretrained(
                    model_id,
                    subfolder="vae",
-                    low_cpu_mem_usage=low_cpu_mem_usage,
                )
-
-            def forward(self, input):
-                latents = self.vae.encode(input).latent_dist.sample()
-                return 0.18215 * latents
-
-        vae_encode = VaeEncodeModel()
-        inputs = tuple(self.inputs["vae_encode"])
-        is_f16 = True if self.precision == "fp16" else False
-        shark_vae_encode = compile_through_fx(
-            vae_encode,
-            inputs,
-            is_f16=is_f16,
-            use_tuned=self.use_tuned,
-            model_name=self.model_name["vae_encode"],
-            extra_args=get_opt_flags("vae", precision=self.precision),
-        )
-        return shark_vae_encode
-
-    def get_vae(self):
-        class VaeModel(torch.nn.Module):
-            def __init__(self, model_id=self.model_id, base_vae=self.base_vae, custom_vae=self.custom_vae, low_cpu_mem_usage=False):
-                super().__init__()
-                self.vae = None
-                if custom_vae == "":
-                    self.vae = AutoencoderKL.from_pretrained(
-                        model_id,
-                        subfolder="vae",
-                        low_cpu_mem_usage=low_cpu_mem_usage,
-                    )
-                elif not isinstance(custom_vae, dict):
-                    self.vae = AutoencoderKL.from_pretrained(
-                        custom_vae,
-                        subfolder="vae",
-                        low_cpu_mem_usage=low_cpu_mem_usage,
-                    )
-                else:
-                    self.vae = AutoencoderKL.from_pretrained(
-                        model_id,
-                        subfolder="vae",
-                        low_cpu_mem_usage=low_cpu_mem_usage,
-                    )
-                    self.vae.load_state_dict(custom_vae)
                self.base_vae = base_vae

            def forward(self, input):
@@ -212,144 +150,40 @@ class SharkifyStableDiffusionModel:
                x = x * 255.0
                return x.round()

-        vae = VaeModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
+        vae = VaeModel()
        inputs = tuple(self.inputs["vae"])
        is_f16 = True if self.precision == "fp16" else False
+        vae_name = "base_vae" if self.base_vae else "vae"
+        vae_model_name = vae_name + self.model_name
+        save_dir = os.path.join(self.sharktank_dir, vae_model_name)
+        if self.debug:
+            os.makedirs(save_dir, exist_ok=True)
        shark_vae = compile_through_fx(
            vae,
            inputs,
            is_f16=is_f16,
            use_tuned=self.use_tuned,
-            model_name=self.model_name["vae"],
+            model_name=vae_model_name,
+            debug=self.debug,
+            generate_vmfb=self.generate_vmfb,
+            save_dir=save_dir,
            extra_args=get_opt_flags("vae", precision=self.precision),
        )
        return shark_vae

-    def get_controlled_unet(self):
-        class ControlledUnetModel(torch.nn.Module):
-            def __init__(
-                self, model_id=self.model_id, low_cpu_mem_usage=False
-            ):
-                super().__init__()
-                self.unet = UNet2DConditionModel.from_pretrained(
-                    model_id,
-                    subfolder="unet",
-                    low_cpu_mem_usage=low_cpu_mem_usage,
-                )
-                self.in_channels = self.unet.in_channels
-                self.train(False)
-
-            def forward( self, latent, timestep, text_embedding, guidance_scale, control1,
-                         control2, control3, control4, control5, control6, control7,
-                         control8, control9, control10, control11, control12, control13,
-            ):
-                # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-                db_res_samples = tuple([ control1, control2, control3, control4, control5, control6, control7, control8, control9, control10, control11, control12,])
-                mb_res_samples = control13
-                latents = torch.cat([latent] * 2)
-                unet_out = self.unet.forward(
-                    latents,
-                    timestep,
-                    encoder_hidden_states=text_embedding,
-                    down_block_additional_residuals=db_res_samples,
-                    mid_block_additional_residual=mb_res_samples,
-                    return_dict=False,
-                )[0]
-                noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (
-                    noise_pred_text - noise_pred_uncond
-                )
-                return noise_pred
-
-        unet = ControlledUnetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
-        is_f16 = True if self.precision == "fp16" else False
-
-        inputs = tuple(self.inputs["stencil_unet"])
-        input_mask = [True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True,]
-        shark_controlled_unet = compile_through_fx(
-            unet,
-            inputs,
-            model_name=self.model_name["stencil_unet"],
-            is_f16=is_f16,
-            f16_input_mask=input_mask,
-            use_tuned=self.use_tuned,
-            extra_args=get_opt_flags("unet", precision=self.precision),
-        )
-        return shark_controlled_unet
-
-    def get_control_net(self):
-        class StencilControlNetModel(torch.nn.Module):
-            def __init__(
-                self, model_id=self.use_stencil, low_cpu_mem_usage=False
-            ):
-                super().__init__()
-                self.cnet = ControlNetModel.from_pretrained(
-                    model_id,
-                    low_cpu_mem_usage=low_cpu_mem_usage,
-                )
-                self.in_channels = self.cnet.in_channels
-                self.train(False)
-
-            def forward(
-                self,
-                latent,
-                timestep,
-                text_embedding,
-                stencil_image_input,
-            ):
-                # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-                # TODO: guidance NOT NEEDED change in `get_input_info` later
-                latents = torch.cat(
-                    [latent] * 2
-                )  # needs to be same as controlledUNET latents
-                stencil_image = torch.cat(
-                    [stencil_image_input] * 2
-                )  # needs to be same as controlledUNET latents
-                down_block_res_samples, mid_block_res_sample = self.cnet.forward(
-                    latents,
-                    timestep,
-                    encoder_hidden_states=text_embedding,
-                    controlnet_cond=stencil_image,
-                    return_dict=False,
-                )
-                return tuple(list(down_block_res_samples) + [mid_block_res_sample])
-
-        scnet = StencilControlNetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
-        is_f16 = True if self.precision == "fp16" else False
-
-        inputs = tuple(self.inputs["stencil_adaptor"])
-        input_mask = [True, True, True, True]
-        shark_cnet = compile_through_fx(
-            scnet,
-            inputs,
-            model_name=self.model_name["stencil_adaptor"],
-            is_f16=is_f16,
-            f16_input_mask=input_mask,
-            use_tuned=self.use_tuned,
-            extra_args=get_opt_flags("unet", precision=self.precision),
-        )
-        return shark_cnet
-
    def get_unet(self):
        class UnetModel(torch.nn.Module):
-            def __init__(self, model_id=self.model_id, low_cpu_mem_usage=False):
+            def __init__(self, model_id=self.model_id):
                super().__init__()
                self.unet = UNet2DConditionModel.from_pretrained(
                    model_id,
                    subfolder="unet",
-                    low_cpu_mem_usage=low_cpu_mem_usage,
                )
                self.in_channels = self.unet.in_channels
                self.train(False)
-                if(args.attention_slicing is not None and args.attention_slicing != "none"):
-                    if(args.attention_slicing.isdigit()):
-                        self.unet.set_attention_slice(int(args.attention_slicing))
-                    else:
-                        self.unet.set_attention_slice(args.attention_slicing)

-            # TODO: Instead of flattening the `control` try to use the list.
            def forward(
-                self, latent, timestep, text_embedding, guidance_scale,
+                self, latent, timestep, text_embedding, guidance_scale
            ):
                # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
                latents = torch.cat([latent] * 2)
@@ -362,67 +196,65 @@ class SharkifyStableDiffusionModel:
                )
                return noise_pred

-        unet = UnetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
+        unet = UnetModel()
        is_f16 = True if self.precision == "fp16" else False
        inputs = tuple(self.inputs["unet"])
        input_mask = [True, True, True, False]
+        unet_model_name = "unet" + self.model_name
+        save_dir = os.path.join(self.sharktank_dir, unet_model_name)
+        if self.debug:
+            os.makedirs(
+                save_dir,
+                exist_ok=True,
+            )
        shark_unet = compile_through_fx(
            unet,
            inputs,
-            model_name=self.model_name["unet"],
+            model_name=unet_model_name,
            is_f16=is_f16,
            f16_input_mask=input_mask,
            use_tuned=self.use_tuned,
+            debug=self.debug,
+            generate_vmfb=self.generate_vmfb,
+            save_dir=save_dir,
            extra_args=get_opt_flags("unet", precision=self.precision),
        )
        return shark_unet

    def get_clip(self):
        class CLIPText(torch.nn.Module):
-            def __init__(self, model_id=self.model_id, low_cpu_mem_usage=False):
+            def __init__(self, model_id=self.model_id):
                super().__init__()
                self.text_encoder = CLIPTextModel.from_pretrained(
                    model_id,
                    subfolder="text_encoder",
-                    low_cpu_mem_usage=low_cpu_mem_usage,
                )

            def forward(self, input):
                return self.text_encoder(input)[0]

-        clip_model = CLIPText(low_cpu_mem_usage=self.low_cpu_mem_usage)
+        clip_model = CLIPText()
+        clip_model_name = "clip" + self.model_name
+        save_dir = os.path.join(self.sharktank_dir, clip_model_name)
+        if self.debug:
+            os.makedirs(
+                save_dir,
+                exist_ok=True,
+            )
        shark_clip = compile_through_fx(
            clip_model,
            tuple(self.inputs["clip"]),
-            model_name=self.model_name["clip"],
+            model_name=clip_model_name,
+            debug=self.debug,
+            generate_vmfb=self.generate_vmfb,
+            save_dir=save_dir,
            extra_args=get_opt_flags("clip", precision="fp32"),
        )
        return shark_clip

-    def process_custom_vae(self):
-        custom_vae = self.custom_vae.lower()
-        if not custom_vae.endswith((".ckpt", ".safetensors")):
-            return self.custom_vae
-        try:
-            preprocessCKPT(self.custom_vae)
-            return get_path_to_diffusers_checkpoint(self.custom_vae)
-        except:
-            print("Processing standalone Vae checkpoint")
-            vae_checkpoint = None
-            vae_ignore_keys = {"model_ema.decay", "model_ema.num_updates"}
-            if custom_vae.endswith(".ckpt"):
-                vae_checkpoint = torch.load(self.custom_vae, map_location="cpu")
-            else:
-                vae_checkpoint = safetensors.torch.load_file(self.custom_vae, device="cpu")
-            if "state_dict" in vae_checkpoint:
-                vae_checkpoint = vae_checkpoint["state_dict"]
-            vae_dict = {k: v for k, v in vae_checkpoint.items() if k[0:4] != "loss" and k not in vae_ignore_keys}
-            return vae_dict
-        
-            
    # Compiles Clip, Unet and Vae with `base_model_id` as defining their input
    # configiration.
-    def compile_all(self, base_model_id, need_vae_encode, need_stencil):
+    def compile_all(self, base_model_id):
        self.inputs = get_input_info(
            base_models[base_model_id],
            self.max_len,
@@ -430,45 +262,18 @@ class SharkifyStableDiffusionModel:
            self.height,
            self.batch_size,
        )
-        compiled_controlnet = None
-        compiled_controlled_unet = None
-        compiled_unet = None
-        if need_stencil:
-            compiled_controlnet = self.get_control_net()
-            compiled_controlled_unet = self.get_controlled_unet()
-        else:
-            compiled_unet = self.get_unet()
-        if self.custom_vae != "":
-            print("Plugging in custom Vae")
+        compiled_unet = self.get_unet()
        compiled_vae = self.get_vae()
        compiled_clip = self.get_clip()
-
-        if need_stencil:
-            return compiled_clip, compiled_controlled_unet, compiled_vae, compiled_controlnet
-        if need_vae_encode:
-            compiled_vae_encode = self.get_vae_encode()
-            return compiled_clip, compiled_unet, compiled_vae, compiled_vae_encode
-
+        
        return compiled_clip, compiled_unet, compiled_vae

    def __call__(self):
        # Step 1:
        # --  Fetch all vmfbs for the model, if present, else delete the lot.
-        need_vae_encode, need_stencil = False, False
-        if args.img_path is not None:
-            if self.use_stencil is not None:
-                need_stencil = True
-            else:
-                need_vae_encode = True
-        # `mask_to_fetch` prepares a mask to pick a combination out of :-
-        # ["clip", "unet", "stencil_unet", "vae", "vae_encode", "stencil_adaptor"]
-        mask_to_fetch = [True, True, False, True, False, False]
-        if need_vae_encode:
-            mask_to_fetch = [True, True, False, True, True, False]
-        elif need_stencil:
-            mask_to_fetch = [True, False, True, True, False, True]
-        self.model_name = self.get_extended_name_for_all_model(mask_to_fetch)
-        vmfbs = fetch_or_delete_vmfbs(self.model_name, self.precision)   
+        vmfbs = fetch_or_delete_vmfbs(
+            self.model_name, self.base_vae, self.precision
+        )   
        if vmfbs[0]:
            # -- If all vmfbs are indeed present, we also try and fetch the base
            #    model configuration for running SD with custom checkpoints.
@@ -488,18 +293,15 @@ class SharkifyStableDiffusionModel:
            assert self.custom_weights.lower().endswith(
                (".ckpt", ".safetensors")
            ), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
-            preprocessCKPT(self.custom_weights, self.is_inpaint)
+            preprocessCKPT(self.custom_weights)
        else:
            model_to_run = args.hf_model_id
-        # For custom Vae user can provide either the repo-id or a checkpoint file,
-        # and for a checkpoint file we'd need to process it via Diffusers' script.
-        self.custom_vae = self.process_custom_vae()
        base_model_fetched = fetch_and_update_base_model_id(model_to_run)
        if base_model_fetched != "":
            print("Compiling all the models with the fetched base model configuration.")
            if args.ckpt_loc != "":
                args.hf_model_id = base_model_fetched
-            return self.compile_all(base_model_fetched, need_vae_encode, need_stencil)
+            return self.compile_all(base_model_fetched)

        # Step 3:
        # -- This is the retry mechanism where the base model's configuration is not
@@ -507,13 +309,10 @@ class SharkifyStableDiffusionModel:
        print("Inferring base model configuration.")
        for model_id in base_models:
            try:
-                if need_vae_encode:
-                    compiled_clip, compiled_unet, compiled_vae, compiled_vae_encode = self.compile_all(model_id, need_vae_encode, need_stencil)
-                elif need_stencil:
-                    compiled_clip, compiled_unet, compiled_vae, compiled_controlnet = self.compile_all(model_id, need_vae_encode, need_stencil)
-                else:
-                    compiled_clip, compiled_unet, compiled_vae = self.compile_all(model_id, need_vae_encode, need_stencil)
+                compiled_clip, compiled_unet, compiled_vae = self.compile_all(model_id)
            except Exception as e:
+                if args.enable_stack_trace:
+                    traceback.print_exc()
                print("Retrying with a different base model configuration")
                continue
            # -- Once a successful compilation has taken place we'd want to store
@@ -525,21 +324,7 @@ class SharkifyStableDiffusionModel:
            # the knowledge of base model id accordingly into `args.hf_model_id`.
            if args.ckpt_loc != "":
                args.hf_model_id = model_id
-            if need_vae_encode:
-                return (
-                    compiled_clip,
-                    compiled_unet,
-                    compiled_vae,
-                    compiled_vae_encode,
-                )
-            if need_stencil:
-                return (
-                    compiled_clip,
-                    compiled_unet,
-                    compiled_vae,
-                    compiled_controlnet,
-                )
            return compiled_clip, compiled_unet, compiled_vae
        sys.exit(
-            "Cannot compile the model. Please create an issue with the detailed log at https://github.com/nod-ai/SHARK/issues"
+            "Cannot compile the model. Please re-run the command with `--enable_stack_trace` flag and create an issue with detailed log at https://github.com/nod-ai/SHARK/issues"
        )
--- a/apps/stable_diffusion/src/models/opt_params.py
+++ b/apps/stable_diffusion/src/models/opt_params.py
@@ -9,15 +9,13 @@ from apps.stable_diffusion.src.utils import (


 hf_model_variant_map = {
-    "Linaqruf/anything-v3.0": ["anythingv3", "v1_4"],
-    "dreamlike-art/dreamlike-diffusion-1.0": ["dreamlike", "v1_4"],
-    "prompthero/openjourney": ["openjourney", "v1_4"],
-    "wavymulder/Analog-Diffusion": ["analogdiffusion", "v1_4"],
+    "Linaqruf/anything-v3.0": ["anythingv3", "v2_1base"],
+    "dreamlike-art/dreamlike-diffusion-1.0": ["dreamlike", "v2_1base"],
+    "prompthero/openjourney": ["openjourney", "v2_1base"],
+    "wavymulder/Analog-Diffusion": ["analogdiffusion", "v2_1base"],
    "stabilityai/stable-diffusion-2-1": ["stablediffusion", "v2_1base"],
    "stabilityai/stable-diffusion-2-1-base": ["stablediffusion", "v2_1base"],
    "CompVis/stable-diffusion-v1-4": ["stablediffusion", "v1_4"],
-    "runwayml/stable-diffusion-inpainting": ["stablediffusion", "inpaint_v1"],
-    "stabilityai/stable-diffusion-2-inpainting": ["stablediffusion", "inpaint_v2"],
 }


@@ -54,23 +52,6 @@ def get_unet():
    return get_shark_model(bucket, model_name, iree_flags)


-def get_vae_encode():
-    variant, version = get_variant_version(args.hf_model_id)
-    # Tuned model is present only for `fp16` precision.
-    is_tuned = "tuned" if args.use_tuned else "untuned"
-    if "vulkan" not in args.device and args.use_tuned:
-        bucket_key = f"{variant}/{is_tuned}/{args.device}"
-        model_key = f"{variant}/{version}/vae_encode/{args.precision}/length_77/{is_tuned}/{args.device}"
-    else:
-        bucket_key = f"{variant}/{is_tuned}"
-        model_key = f"{variant}/{version}/vae_encode/{args.precision}/length_77/{is_tuned}"
-
-    bucket, model_name, iree_flags = get_params(
-        bucket_key, model_key, "vae", is_tuned, args.precision
-    )
-    return get_shark_model(bucket, model_name, iree_flags)
-
-
 def get_vae():
    variant, version = get_variant_version(args.hf_model_id)
    # Tuned model is present only for `fp16` precision.
--- a/apps/stable_diffusion/src/pipelines/init.py
+++ b/apps/stable_diffusion/src/pipelines/init.py
@@ -1,15 +1,3 @@
 from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_txt2img import (
    Text2ImagePipeline,
 )
-from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_img2img import (
-    Image2ImagePipeline,
-)
-from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_inpaint import (
-    InpaintPipeline,
-)
-from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_outpaint import (
-    OutpaintPipeline,
-)
-from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_stencil import (
-    StencilPipeline,
-)
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
@@ -1,172 +0,0 @@
-import torch
-import time
-import numpy as np
-from tqdm.auto import tqdm
-from random import randint
-from PIL import Image
-from transformers import CLIPTokenizer
-from typing import Union
-from shark.shark_inference import SharkInference
-from diffusers import (
-    DDIMScheduler,
-    PNDMScheduler,
-    LMSDiscreteScheduler,
-    EulerDiscreteScheduler,
-    EulerAncestralDiscreteScheduler,
-    DPMSolverMultistepScheduler,
-    DEISMultistepScheduler,
-)
-from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
-from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
-    StableDiffusionPipeline,
-)
-
-
-class Image2ImagePipeline(StableDiffusionPipeline):
-    def __init__(
-        self,
-        vae_encode: SharkInference,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
-        scheduler: Union[
-            DDIMScheduler,
-            PNDMScheduler,
-            LMSDiscreteScheduler,
-            EulerDiscreteScheduler,
-            EulerAncestralDiscreteScheduler,
-            DPMSolverMultistepScheduler,
-            SharkEulerDiscreteScheduler,
-            DEISMultistepScheduler,
-        ],
-    ):
-        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
-        self.vae_encode = vae_encode
-
-    def prepare_image_latents(
-        self,
-        image,
-        batch_size,
-        height,
-        width,
-        generator,
-        num_inference_steps,
-        strength,
-        dtype,
-    ):
-        # Pre process image -> get image encoded -> process latents
-
-        # TODO: process with variable HxW combos
-
-        # Pre process image
-        image = image.resize((width, height))
-        image_arr = np.stack([np.array(i) for i in (image,)], axis=0)
-        image_arr = image_arr / 255.0
-        image_arr = torch.from_numpy(image_arr).permute(0, 3, 1, 2).to(dtype)
-        image_arr = 2 * (image_arr - 0.5)
-
-        # set scheduler steps
-        self.scheduler.set_timesteps(num_inference_steps)
-        init_timestep = min(
-            int(num_inference_steps * strength), num_inference_steps
-        )
-        t_start = max(num_inference_steps - init_timestep, 0)
-        # timesteps reduced as per strength
-        timesteps = self.scheduler.timesteps[t_start:]
-        # new number of steps to be used as per strength will be
-        # num_inference_steps = num_inference_steps - t_start
-
-        # image encode
-        latents = self.encode_image((image_arr,))
-        latents = torch.from_numpy(latents).to(dtype)
-        # add noise to data
-        noise = torch.randn(latents.shape, generator=generator, dtype=dtype)
-        latents = self.scheduler.add_noise(
-            latents, noise, timesteps[0].repeat(1)
-        )
-
-        return latents, timesteps
-
-    def encode_image(self, input_image):
-        vae_encode_start = time.time()
-        latents = self.vae_encode("forward", input_image)
-        vae_inf_time = (time.time() - vae_encode_start) * 1000
-        self.log += f"\nVAE Encode Inference time (ms): {vae_inf_time:.3f}"
-
-        return latents
-
-    def generate_images(
-        self,
-        prompts,
-        neg_prompts,
-        image,
-        batch_size,
-        height,
-        width,
-        num_inference_steps,
-        strength,
-        guidance_scale,
-        seed,
-        max_length,
-        dtype,
-        use_base_vae,
-        cpu_scheduling,
-        use_stencil,
-    ):
-        # prompts and negative prompts must be a list.
-        if isinstance(prompts, str):
-            prompts = [prompts]
-
-        if isinstance(neg_prompts, str):
-            neg_prompts = [neg_prompts]
-
-        prompts = prompts * batch_size
-        neg_prompts = neg_prompts * batch_size
-
-        # seed generator to create the inital latent noise. Also handle out of range seeds.
-        uint32_info = np.iinfo(np.uint32)
-        uint32_min, uint32_max = uint32_info.min, uint32_info.max
-        if seed < uint32_min or seed >= uint32_max:
-            seed = randint(uint32_min, uint32_max)
-        generator = torch.manual_seed(seed)
-
-        # Get text embeddings from prompts
-        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
-
-        # guidance scale as a float32 tensor.
-        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
-
-        # Prepare input image latent
-        image_latents, final_timesteps = self.prepare_image_latents(
-            image=image,
-            batch_size=batch_size,
-            height=height,
-            width=width,
-            generator=generator,
-            num_inference_steps=num_inference_steps,
-            strength=strength,
-            dtype=dtype,
-        )
-
-        # Get Image latents
-        latents = self.produce_img_latents(
-            latents=image_latents,
-            text_embeddings=text_embeddings,
-            guidance_scale=guidance_scale,
-            total_timesteps=final_timesteps,
-            dtype=dtype,
-            cpu_scheduling=cpu_scheduling,
-        )
-
-        # Img latents -> PIL images
-        all_imgs = []
-        for i in tqdm(range(0, latents.shape[0], batch_size)):
-            imgs = self.decode_latents(
-                latents=latents[i : i + batch_size],
-                use_base_vae=use_base_vae,
-                cpu_scheduling=cpu_scheduling,
-            )
-            all_imgs.extend(imgs)
-
-        return all_imgs
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_inpaint.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_inpaint.py
@@ -1,445 +0,0 @@
-import torch
-from tqdm.auto import tqdm
-import numpy as np
-from random import randint
-from PIL import Image, ImageOps
-from transformers import CLIPTokenizer
-from typing import Union
-from shark.shark_inference import SharkInference
-from diffusers import (
-    DDIMScheduler,
-    PNDMScheduler,
-    LMSDiscreteScheduler,
-    EulerDiscreteScheduler,
-    EulerAncestralDiscreteScheduler,
-    DPMSolverMultistepScheduler,
-    DEISMultistepScheduler,
-)
-from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
-from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
-    StableDiffusionPipeline,
-)
-
-
-class InpaintPipeline(StableDiffusionPipeline):
-    def __init__(
-        self,
-        vae_encode: SharkInference,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
-        scheduler: Union[
-            DDIMScheduler,
-            PNDMScheduler,
-            LMSDiscreteScheduler,
-            EulerDiscreteScheduler,
-            EulerAncestralDiscreteScheduler,
-            DPMSolverMultistepScheduler,
-            SharkEulerDiscreteScheduler,
-            DEISMultistepScheduler,
-        ],
-    ):
-        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
-        self.vae_encode = vae_encode
-
-    def prepare_latents(
-        self,
-        batch_size,
-        height,
-        width,
-        generator,
-        num_inference_steps,
-        dtype,
-    ):
-        latents = torch.randn(
-            (
-                batch_size,
-                4,
-                height // 8,
-                width // 8,
-            ),
-            generator=generator,
-            dtype=torch.float32,
-        ).to(dtype)
-
-        self.scheduler.set_timesteps(num_inference_steps)
-        latents = latents * self.scheduler.init_noise_sigma
-        return latents
-
-    def get_crop_region(self, mask, pad=0):
-        h, w = mask.shape
-
-        crop_left = 0
-        for i in range(w):
-            if not (mask[:, i] == 0).all():
-                break
-            crop_left += 1
-
-        crop_right = 0
-        for i in reversed(range(w)):
-            if not (mask[:, i] == 0).all():
-                break
-            crop_right += 1
-
-        crop_top = 0
-        for i in range(h):
-            if not (mask[i] == 0).all():
-                break
-            crop_top += 1
-
-        crop_bottom = 0
-        for i in reversed(range(h)):
-            if not (mask[i] == 0).all():
-                break
-            crop_bottom += 1
-
-        return (
-            int(max(crop_left - pad, 0)),
-            int(max(crop_top - pad, 0)),
-            int(min(w - crop_right + pad, w)),
-            int(min(h - crop_bottom + pad, h)),
-        )
-
-    def expand_crop_region(
-        self,
-        crop_region,
-        processing_width,
-        processing_height,
-        image_width,
-        image_height,
-    ):
-        x1, y1, x2, y2 = crop_region
-
-        ratio_crop_region = (x2 - x1) / (y2 - y1)
-        ratio_processing = processing_width / processing_height
-
-        if ratio_crop_region > ratio_processing:
-            desired_height = (x2 - x1) / ratio_processing
-            desired_height_diff = int(desired_height - (y2 - y1))
-            y1 -= desired_height_diff // 2
-            y2 += desired_height_diff - desired_height_diff // 2
-            if y2 >= image_height:
-                diff = y2 - image_height
-                y2 -= diff
-                y1 -= diff
-            if y1 < 0:
-                y2 -= y1
-                y1 -= y1
-            if y2 >= image_height:
-                y2 = image_height
-        else:
-            desired_width = (y2 - y1) * ratio_processing
-            desired_width_diff = int(desired_width - (x2 - x1))
-            x1 -= desired_width_diff // 2
-            x2 += desired_width_diff - desired_width_diff // 2
-            if x2 >= image_width:
-                diff = x2 - image_width
-                x2 -= diff
-                x1 -= diff
-            if x1 < 0:
-                x2 -= x1
-                x1 -= x1
-            if x2 >= image_width:
-                x2 = image_width
-
-        return x1, y1, x2, y2
-
-    def resize_image(self, resize_mode, im, width, height):
-        """
-        resize_mode:
-            0: Resize the image to fill the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, cropping the excess.
-            1: Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, filling empty with data from image.
-        """
-
-        if resize_mode == 0:
-            ratio = width / height
-            src_ratio = im.width / im.height
-
-            src_w = (
-                width if ratio > src_ratio else im.width * height // im.height
-            )
-            src_h = (
-                height if ratio <= src_ratio else im.height * width // im.width
-            )
-
-            resized = im.resize((src_w, src_h), resample=Image.LANCZOS)
-            res = Image.new("RGB", (width, height))
-            res.paste(
-                resized,
-                box=(width // 2 - src_w // 2, height // 2 - src_h // 2),
-            )
-
-        else:
-            ratio = width / height
-            src_ratio = im.width / im.height
-
-            src_w = (
-                width if ratio < src_ratio else im.width * height // im.height
-            )
-            src_h = (
-                height if ratio >= src_ratio else im.height * width // im.width
-            )
-
-            resized = im.resize((src_w, src_h), resample=Image.LANCZOS)
-            res = Image.new("RGB", (width, height))
-            res.paste(
-                resized,
-                box=(width // 2 - src_w // 2, height // 2 - src_h // 2),
-            )
-
-            if ratio < src_ratio:
-                fill_height = height // 2 - src_h // 2
-                res.paste(
-                    resized.resize((width, fill_height), box=(0, 0, width, 0)),
-                    box=(0, 0),
-                )
-                res.paste(
-                    resized.resize(
-                        (width, fill_height),
-                        box=(0, resized.height, width, resized.height),
-                    ),
-                    box=(0, fill_height + src_h),
-                )
-            elif ratio > src_ratio:
-                fill_width = width // 2 - src_w // 2
-                res.paste(
-                    resized.resize(
-                        (fill_width, height), box=(0, 0, 0, height)
-                    ),
-                    box=(0, 0),
-                )
-                res.paste(
-                    resized.resize(
-                        (fill_width, height),
-                        box=(resized.width, 0, resized.width, height),
-                    ),
-                    box=(fill_width + src_w, 0),
-                )
-
-        return res
-
-    def prepare_mask_and_masked_image(
-        self,
-        image,
-        mask,
-        height,
-        width,
-        inpaint_full_res,
-        inpaint_full_res_padding,
-    ):
-        # preprocess image
-        image = image.resize((width, height))
-        mask = mask.resize((width, height))
-
-        paste_to = ()
-        overlay_image = None
-        if inpaint_full_res:
-            # prepare overlay image
-            overlay_image = Image.new("RGB", (image.width, image.height))
-            overlay_image.paste(
-                image.convert("RGB"),
-                mask=ImageOps.invert(mask.convert("L")),
-            )
-
-            # prepare mask
-            mask = mask.convert("L")
-            crop_region = self.get_crop_region(
-                np.array(mask), inpaint_full_res_padding
-            )
-            crop_region = self.expand_crop_region(
-                crop_region, width, height, mask.width, mask.height
-            )
-            x1, y1, x2, y2 = crop_region
-            mask = mask.crop(crop_region)
-            mask = self.resize_image(1, mask, width, height)
-            paste_to = (x1, y1, x2 - x1, y2 - y1)
-
-            # prepare image
-            image = image.crop(crop_region)
-            image = self.resize_image(1, image, width, height)
-
-        if isinstance(image, (Image.Image, np.ndarray)):
-            image = [image]
-
-        if isinstance(image, list) and isinstance(image[0], Image.Image):
-            image = [np.array(i.convert("RGB"))[None, :] for i in image]
-            image = np.concatenate(image, axis=0)
-        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
-            image = np.concatenate([i[None, :] for i in image], axis=0)
-
-        image = image.transpose(0, 3, 1, 2)
-        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-
-        # preprocess mask
-        if isinstance(mask, (Image.Image, np.ndarray)):
-            mask = [mask]
-
-        if isinstance(mask, list) and isinstance(mask[0], Image.Image):
-            mask = np.concatenate(
-                [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0
-            )
-            mask = mask.astype(np.float32) / 255.0
-        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
-            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
-
-        mask[mask < 0.5] = 0
-        mask[mask >= 0.5] = 1
-        mask = torch.from_numpy(mask)
-
-        masked_image = image * (mask < 0.5)
-
-        return mask, masked_image, paste_to, overlay_image
-
-    def prepare_mask_latents(
-        self,
-        mask,
-        masked_image,
-        batch_size,
-        height,
-        width,
-        dtype,
-    ):
-        mask = torch.nn.functional.interpolate(
-            mask, size=(height // 8, width // 8)
-        )
-        mask = mask.to(dtype)
-
-        masked_image = masked_image.to(dtype)
-        masked_image_latents = self.vae_encode("forward", (masked_image,))
-        masked_image_latents = torch.from_numpy(masked_image_latents)
-
-        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
-        if mask.shape[0] < batch_size:
-            if not batch_size % mask.shape[0] == 0:
-                raise ValueError(
-                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
-                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
-                    " of masks that you pass is divisible by the total requested batch size."
-                )
-            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
-        if masked_image_latents.shape[0] < batch_size:
-            if not batch_size % masked_image_latents.shape[0] == 0:
-                raise ValueError(
-                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
-                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
-                    " Make sure the number of images that you pass is divisible by the total requested batch size."
-                )
-            masked_image_latents = masked_image_latents.repeat(
-                batch_size // masked_image_latents.shape[0], 1, 1, 1
-            )
-        return mask, masked_image_latents
-
-    def apply_overlay(self, image, paste_loc, overlay):
-        x, y, w, h = paste_loc
-        image = self.resize_image(0, image, w, h)
-        overlay.paste(image, (x, y))
-
-        return overlay
-
-    def generate_images(
-        self,
-        prompts,
-        neg_prompts,
-        image,
-        mask_image,
-        batch_size,
-        height,
-        width,
-        inpaint_full_res,
-        inpaint_full_res_padding,
-        num_inference_steps,
-        guidance_scale,
-        seed,
-        max_length,
-        dtype,
-        use_base_vae,
-        cpu_scheduling,
-    ):
-        # prompts and negative prompts must be a list.
-        if isinstance(prompts, str):
-            prompts = [prompts]
-
-        if isinstance(neg_prompts, str):
-            neg_prompts = [neg_prompts]
-
-        prompts = prompts * batch_size
-        neg_prompts = neg_prompts * batch_size
-
-        # seed generator to create the inital latent noise. Also handle out of range seeds.
-        uint32_info = np.iinfo(np.uint32)
-        uint32_min, uint32_max = uint32_info.min, uint32_info.max
-        if seed < uint32_min or seed >= uint32_max:
-            seed = randint(uint32_min, uint32_max)
-        generator = torch.manual_seed(seed)
-
-        # Get initial latents
-        init_latents = self.prepare_latents(
-            batch_size=batch_size,
-            height=height,
-            width=width,
-            generator=generator,
-            num_inference_steps=num_inference_steps,
-            dtype=dtype,
-        )
-
-        # Get text embeddings from prompts
-        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
-
-        # guidance scale as a float32 tensor.
-        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
-
-        # Preprocess mask and image
-        (
-            mask,
-            masked_image,
-            paste_to,
-            overlay_image,
-        ) = self.prepare_mask_and_masked_image(
-            image,
-            mask_image,
-            height,
-            width,
-            inpaint_full_res,
-            inpaint_full_res_padding,
-        )
-
-        # Prepare mask latent variables
-        mask, masked_image_latents = self.prepare_mask_latents(
-            mask=mask,
-            masked_image=masked_image,
-            batch_size=batch_size,
-            height=height,
-            width=width,
-            dtype=dtype,
-        )
-
-        # Get Image latents
-        latents = self.produce_img_latents(
-            latents=init_latents,
-            text_embeddings=text_embeddings,
-            guidance_scale=guidance_scale,
-            total_timesteps=self.scheduler.timesteps,
-            dtype=dtype,
-            cpu_scheduling=cpu_scheduling,
-            mask=mask,
-            masked_image_latents=masked_image_latents,
-        )
-
-        # Img latents -> PIL images
-        all_imgs = []
-        for i in tqdm(range(0, latents.shape[0], batch_size)):
-            imgs = self.decode_latents(
-                latents=latents[i : i + batch_size],
-                use_base_vae=use_base_vae,
-                cpu_scheduling=cpu_scheduling,
-            )
-            all_imgs.extend(imgs)
-
-        if inpaint_full_res:
-            output_image = self.apply_overlay(
-                all_imgs[0], paste_to, overlay_image
-            )
-            return [output_image]
-
-        return all_imgs
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_outpaint.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_outpaint.py
@@ -1,541 +0,0 @@
-import torch
-from tqdm.auto import tqdm
-import numpy as np
-from random import randint
-from PIL import Image, ImageDraw, ImageFilter
-from transformers import CLIPTokenizer
-from typing import Union
-from shark.shark_inference import SharkInference
-from diffusers import (
-    DDIMScheduler,
-    PNDMScheduler,
-    LMSDiscreteScheduler,
-    EulerDiscreteScheduler,
-    EulerAncestralDiscreteScheduler,
-    DPMSolverMultistepScheduler,
-    DEISMultistepScheduler,
-)
-from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
-from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
-    StableDiffusionPipeline,
-)
-import math
-
-
-class OutpaintPipeline(StableDiffusionPipeline):
-    def __init__(
-        self,
-        vae_encode: SharkInference,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
-        scheduler: Union[
-            DDIMScheduler,
-            PNDMScheduler,
-            LMSDiscreteScheduler,
-            EulerDiscreteScheduler,
-            EulerAncestralDiscreteScheduler,
-            DPMSolverMultistepScheduler,
-            SharkEulerDiscreteScheduler,
-            DEISMultistepScheduler,
-        ],
-    ):
-        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
-        self.vae_encode = vae_encode
-
-    def prepare_latents(
-        self,
-        batch_size,
-        height,
-        width,
-        generator,
-        num_inference_steps,
-        dtype,
-    ):
-        latents = torch.randn(
-            (
-                batch_size,
-                4,
-                height // 8,
-                width // 8,
-            ),
-            generator=generator,
-            dtype=torch.float32,
-        ).to(dtype)
-
-        self.scheduler.set_timesteps(num_inference_steps)
-        latents = latents * self.scheduler.init_noise_sigma
-        return latents
-
-    def prepare_mask_and_masked_image(
-        self, image, mask, mask_blur, width, height
-    ):
-        if mask_blur > 0:
-            mask = mask.filter(ImageFilter.GaussianBlur(mask_blur))
-        image = image.resize((width, height))
-        mask = mask.resize((width, height))
-
-        # preprocess image
-        if isinstance(image, (Image.Image, np.ndarray)):
-            image = [image]
-
-        if isinstance(image, list) and isinstance(image[0], Image.Image):
-            image = [np.array(i.convert("RGB"))[None, :] for i in image]
-            image = np.concatenate(image, axis=0)
-        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
-            image = np.concatenate([i[None, :] for i in image], axis=0)
-
-        image = image.transpose(0, 3, 1, 2)
-        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-
-        # preprocess mask
-        if isinstance(mask, (Image.Image, np.ndarray)):
-            mask = [mask]
-
-        if isinstance(mask, list) and isinstance(mask[0], Image.Image):
-            mask = np.concatenate(
-                [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0
-            )
-            mask = mask.astype(np.float32) / 255.0
-        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
-            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
-
-        mask[mask < 0.5] = 0
-        mask[mask >= 0.5] = 1
-        mask = torch.from_numpy(mask)
-
-        masked_image = image * (mask < 0.5)
-
-        return mask, masked_image
-
-    def prepare_mask_latents(
-        self,
-        mask,
-        masked_image,
-        batch_size,
-        height,
-        width,
-        dtype,
-    ):
-        mask = torch.nn.functional.interpolate(
-            mask, size=(height // 8, width // 8)
-        )
-        mask = mask.to(dtype)
-
-        masked_image = masked_image.to(dtype)
-        masked_image_latents = self.vae_encode("forward", (masked_image,))
-        masked_image_latents = torch.from_numpy(masked_image_latents)
-
-        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
-        if mask.shape[0] < batch_size:
-            if not batch_size % mask.shape[0] == 0:
-                raise ValueError(
-                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
-                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
-                    " of masks that you pass is divisible by the total requested batch size."
-                )
-            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
-        if masked_image_latents.shape[0] < batch_size:
-            if not batch_size % masked_image_latents.shape[0] == 0:
-                raise ValueError(
-                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
-                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
-                    " Make sure the number of images that you pass is divisible by the total requested batch size."
-                )
-            masked_image_latents = masked_image_latents.repeat(
-                batch_size // masked_image_latents.shape[0], 1, 1, 1
-            )
-        return mask, masked_image_latents
-
-    def get_matched_noise(
-        self, _np_src_image, np_mask_rgb, noise_q=1, color_variation=0.05
-    ):
-        # helper fft routines that keep ortho normalization and auto-shift before and after fft
-        def _fft2(data):
-            if data.ndim > 2:  # has channels
-                out_fft = np.zeros(
-                    (data.shape[0], data.shape[1], data.shape[2]),
-                    dtype=np.complex128,
-                )
-                for c in range(data.shape[2]):
-                    c_data = data[:, :, c]
-                    out_fft[:, :, c] = np.fft.fft2(
-                        np.fft.fftshift(c_data), norm="ortho"
-                    )
-                    out_fft[:, :, c] = np.fft.ifftshift(out_fft[:, :, c])
-            else:  # one channel
-                out_fft = np.zeros(
-                    (data.shape[0], data.shape[1]), dtype=np.complex128
-                )
-                out_fft[:, :] = np.fft.fft2(
-                    np.fft.fftshift(data), norm="ortho"
-                )
-                out_fft[:, :] = np.fft.ifftshift(out_fft[:, :])
-
-            return out_fft
-
-        def _ifft2(data):
-            if data.ndim > 2:  # has channels
-                out_ifft = np.zeros(
-                    (data.shape[0], data.shape[1], data.shape[2]),
-                    dtype=np.complex128,
-                )
-                for c in range(data.shape[2]):
-                    c_data = data[:, :, c]
-                    out_ifft[:, :, c] = np.fft.ifft2(
-                        np.fft.fftshift(c_data), norm="ortho"
-                    )
-                    out_ifft[:, :, c] = np.fft.ifftshift(out_ifft[:, :, c])
-            else:  # one channel
-                out_ifft = np.zeros(
-                    (data.shape[0], data.shape[1]), dtype=np.complex128
-                )
-                out_ifft[:, :] = np.fft.ifft2(
-                    np.fft.fftshift(data), norm="ortho"
-                )
-                out_ifft[:, :] = np.fft.ifftshift(out_ifft[:, :])
-
-            return out_ifft
-
-        def _get_gaussian_window(width, height, std=3.14, mode=0):
-            window_scale_x = float(width / min(width, height))
-            window_scale_y = float(height / min(width, height))
-
-            window = np.zeros((width, height))
-            x = (np.arange(width) / width * 2.0 - 1.0) * window_scale_x
-            for y in range(height):
-                fy = (y / height * 2.0 - 1.0) * window_scale_y
-                if mode == 0:
-                    window[:, y] = np.exp(-(x**2 + fy**2) * std)
-                else:
-                    window[:, y] = (
-                        1 / ((x**2 + 1.0) * (fy**2 + 1.0))
-                    ) ** (std / 3.14)
-
-            return window
-
-        def _get_masked_window_rgb(np_mask_grey, hardness=1.0):
-            np_mask_rgb = np.zeros(
-                (np_mask_grey.shape[0], np_mask_grey.shape[1], 3)
-            )
-            if hardness != 1.0:
-                hardened = np_mask_grey[:] ** hardness
-            else:
-                hardened = np_mask_grey[:]
-            for c in range(3):
-                np_mask_rgb[:, :, c] = hardened[:]
-            return np_mask_rgb
-
-        def _match_cumulative_cdf(source, template):
-            src_values, src_unique_indices, src_counts = np.unique(
-                source.ravel(), return_inverse=True, return_counts=True
-            )
-            tmpl_values, tmpl_counts = np.unique(
-                template.ravel(), return_counts=True
-            )
-
-            # calculate normalized quantiles for each array
-            src_quantiles = np.cumsum(src_counts) / source.size
-            tmpl_quantiles = np.cumsum(tmpl_counts) / template.size
-
-            interp_a_values = np.interp(
-                src_quantiles, tmpl_quantiles, tmpl_values
-            )
-            return interp_a_values[src_unique_indices].reshape(source.shape)
-
-        def _match_histograms(image, reference):
-            if image.ndim != reference.ndim:
-                raise ValueError(
-                    "Image and reference must have the same number of channels."
-                )
-
-            if image.shape[-1] != reference.shape[-1]:
-                raise ValueError(
-                    "Number of channels in the input image and reference image must match!"
-                )
-
-            matched = np.empty(image.shape, dtype=image.dtype)
-            for channel in range(image.shape[-1]):
-                matched_channel = _match_cumulative_cdf(
-                    image[..., channel], reference[..., channel]
-                )
-                matched[..., channel] = matched_channel
-
-            matched = matched.astype(np.float64, copy=False)
-            return matched
-
-        width = _np_src_image.shape[0]
-        height = _np_src_image.shape[1]
-        num_channels = _np_src_image.shape[2]
-
-        np_src_image = _np_src_image[:] * (1.0 - np_mask_rgb)
-        np_mask_grey = np.sum(np_mask_rgb, axis=2) / 3.0
-        img_mask = np_mask_grey > 1e-6
-        ref_mask = np_mask_grey < 1e-3
-
-        # rather than leave the masked area black, we get better results from fft by filling the average unmasked color
-        windowed_image = _np_src_image * (
-            1.0 - _get_masked_window_rgb(np_mask_grey)
-        )
-        windowed_image /= np.max(windowed_image)
-        windowed_image += np.average(_np_src_image) * np_mask_rgb
-
-        src_fft = _fft2(
-            windowed_image
-        )  # get feature statistics from masked src img
-        src_dist = np.absolute(src_fft)
-        src_phase = src_fft / src_dist
-
-        # create a generator with a static seed to make outpainting deterministic / only follow global seed
-        rng = np.random.default_rng(0)
-
-        noise_window = _get_gaussian_window(
-            width, height, mode=1
-        )  # start with simple gaussian noise
-        noise_rgb = rng.random((width, height, num_channels))
-        noise_grey = np.sum(noise_rgb, axis=2) / 3.0
-        # the colorfulness of the starting noise is blended to greyscale with a parameter
-        noise_rgb *= color_variation
-        for c in range(num_channels):
-            noise_rgb[:, :, c] += (1.0 - color_variation) * noise_grey
-
-        noise_fft = _fft2(noise_rgb)
-        for c in range(num_channels):
-            noise_fft[:, :, c] *= noise_window
-        noise_rgb = np.real(_ifft2(noise_fft))
-        shaped_noise_fft = _fft2(noise_rgb)
-        shaped_noise_fft[:, :, :] = (
-            np.absolute(shaped_noise_fft[:, :, :]) ** 2
-            * (src_dist**noise_q)
-            * src_phase
-        )  # perform the actual shaping
-
-        # color_variation
-        brightness_variation = 0.0
-        contrast_adjusted_np_src = (
-            _np_src_image[:] * (brightness_variation + 1.0)
-            - brightness_variation * 2.0
-        )
-
-        shaped_noise = np.real(_ifft2(shaped_noise_fft))
-        shaped_noise -= np.min(shaped_noise)
-        shaped_noise /= np.max(shaped_noise)
-        shaped_noise[img_mask, :] = _match_histograms(
-            shaped_noise[img_mask, :] ** 1.0,
-            contrast_adjusted_np_src[ref_mask, :],
-        )
-        shaped_noise = (
-            _np_src_image[:] * (1.0 - np_mask_rgb) + shaped_noise * np_mask_rgb
-        )
-
-        matched_noise = shaped_noise[:]
-
-        return np.clip(matched_noise, 0.0, 1.0)
-
-    def generate_images(
-        self,
-        prompts,
-        neg_prompts,
-        image,
-        pixels,
-        mask_blur,
-        is_left,
-        is_right,
-        is_top,
-        is_bottom,
-        noise_q,
-        color_variation,
-        batch_size,
-        height,
-        width,
-        num_inference_steps,
-        guidance_scale,
-        seed,
-        max_length,
-        dtype,
-        use_base_vae,
-        cpu_scheduling,
-    ):
-        # prompts and negative prompts must be a list.
-        if isinstance(prompts, str):
-            prompts = [prompts]
-
-        if isinstance(neg_prompts, str):
-            neg_prompts = [neg_prompts]
-
-        prompts = prompts * batch_size
-        neg_prompts = neg_prompts * batch_size
-
-        # seed generator to create the inital latent noise. Also handle out of range seeds.
-        uint32_info = np.iinfo(np.uint32)
-        uint32_min, uint32_max = uint32_info.min, uint32_info.max
-        if seed < uint32_min or seed >= uint32_max:
-            seed = randint(uint32_min, uint32_max)
-        generator = torch.manual_seed(seed)
-
-        # Get initial latents
-        init_latents = self.prepare_latents(
-            batch_size=batch_size,
-            height=height,
-            width=width,
-            generator=generator,
-            num_inference_steps=num_inference_steps,
-            dtype=dtype,
-        )
-
-        # Get text embeddings from prompts
-        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
-
-        # guidance scale as a float32 tensor.
-        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
-
-        process_width = width
-        process_height = height
-        left = pixels if is_left else 0
-        right = pixels if is_right else 0
-        up = pixels if is_top else 0
-        down = pixels if is_bottom else 0
-        target_w = math.ceil((image.width + left + right) / 64) * 64
-        target_h = math.ceil((image.height + up + down) / 64) * 64
-
-        if left > 0:
-            left = left * (target_w - image.width) // (left + right)
-        if right > 0:
-            right = target_w - image.width - left
-        if up > 0:
-            up = up * (target_h - image.height) // (up + down)
-        if down > 0:
-            down = target_h - image.height - up
-
-        def expand(
-            init_img,
-            expand_pixels,
-            is_left=False,
-            is_right=False,
-            is_top=False,
-            is_bottom=False,
-        ):
-            is_horiz = is_left or is_right
-            is_vert = is_top or is_bottom
-            pixels_horiz = expand_pixels if is_horiz else 0
-            pixels_vert = expand_pixels if is_vert else 0
-
-            res_w = init_img.width + pixels_horiz
-            res_h = init_img.height + pixels_vert
-            process_res_w = math.ceil(res_w / 64) * 64
-            process_res_h = math.ceil(res_h / 64) * 64
-
-            img = Image.new("RGB", (process_res_w, process_res_h))
-            img.paste(
-                init_img,
-                (pixels_horiz if is_left else 0, pixels_vert if is_top else 0),
-            )
-
-            msk = Image.new("RGB", (process_res_w, process_res_h), "white")
-            draw = ImageDraw.Draw(msk)
-            draw.rectangle(
-                (
-                    expand_pixels + mask_blur if is_left else 0,
-                    expand_pixels + mask_blur if is_top else 0,
-                    msk.width - expand_pixels - mask_blur
-                    if is_right
-                    else res_w,
-                    msk.height - expand_pixels - mask_blur
-                    if is_bottom
-                    else res_h,
-                ),
-                fill="black",
-            )
-
-            np_image = (np.asarray(img) / 255.0).astype(np.float64)
-            np_mask = (np.asarray(msk) / 255.0).astype(np.float64)
-            noised = self.get_matched_noise(
-                np_image, np_mask, noise_q, color_variation
-            )
-            output_image = Image.fromarray(
-                np.clip(noised * 255.0, 0.0, 255.0).astype(np.uint8),
-                mode="RGB",
-            )
-
-            target_width = (
-                min(width, init_img.width + pixels_horiz)
-                if is_horiz
-                else img.width
-            )
-            target_height = (
-                min(height, init_img.height + pixels_vert)
-                if is_vert
-                else img.height
-            )
-            crop_region = (
-                0 if is_left else output_image.width - target_width,
-                0 if is_top else output_image.height - target_height,
-                target_width if is_left else output_image.width,
-                target_height if is_top else output_image.height,
-            )
-            mask_to_process = msk.crop(crop_region)
-            image_to_process = output_image.crop(crop_region)
-
-            # Preprocess mask and image
-            mask, masked_image = self.prepare_mask_and_masked_image(
-                image_to_process, mask_to_process, mask_blur, width, height
-            )
-
-            # Prepare mask latent variables
-            mask, masked_image_latents = self.prepare_mask_latents(
-                mask=mask,
-                masked_image=masked_image,
-                batch_size=batch_size,
-                height=height,
-                width=width,
-                dtype=dtype,
-            )
-
-            # Get Image latents
-            latents = self.produce_img_latents(
-                latents=init_latents,
-                text_embeddings=text_embeddings,
-                guidance_scale=guidance_scale,
-                total_timesteps=self.scheduler.timesteps,
-                dtype=dtype,
-                cpu_scheduling=cpu_scheduling,
-                mask=mask,
-                masked_image_latents=masked_image_latents,
-            )
-
-            # Img latents -> PIL images
-            all_imgs = []
-            for i in tqdm(range(0, latents.shape[0], batch_size)):
-                imgs = self.decode_latents(
-                    latents=latents[i : i + batch_size],
-                    use_base_vae=use_base_vae,
-                    cpu_scheduling=cpu_scheduling,
-                )
-                all_imgs.extend(imgs)
-
-            res_img = all_imgs[0].resize(
-                (image_to_process.width, image_to_process.height)
-            )
-            output_image.paste(
-                res_img,
-                (
-                    0 if is_left else output_image.width - res_img.width,
-                    0 if is_top else output_image.height - res_img.height,
-                ),
-            )
-            output_image = output_image.crop((0, 0, res_w, res_h))
-
-            return output_image
-
-        img = image.resize((width, height))
-        if left > 0:
-            img = expand(img, left, is_left=True)
-        if right > 0:
-            img = expand(img, right, is_right=True)
-        if up > 0:
-            img = expand(img, up, is_top=True)
-        if down > 0:
-            img = expand(img, down, is_bottom=True)
-
-        return [img]
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_stencil.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_stencil.py
@@ -1,150 +0,0 @@
-import torch
-import time
-import numpy as np
-from tqdm.auto import tqdm
-from random import randint
-from PIL import Image
-from transformers import CLIPTokenizer
-from typing import Union
-from shark.shark_inference import SharkInference
-from diffusers import (
-    DDIMScheduler,
-    PNDMScheduler,
-    LMSDiscreteScheduler,
-    EulerDiscreteScheduler,
-    EulerAncestralDiscreteScheduler,
-    DPMSolverMultistepScheduler,
-)
-from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
-from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
-    StableDiffusionPipeline,
-)
-from apps.stable_diffusion.src.utils import controlnet_hint_conversion
-
-
-class StencilPipeline(StableDiffusionPipeline):
-    def __init__(
-        self,
-        controlnet: SharkInference,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
-        scheduler: Union[
-            DDIMScheduler,
-            PNDMScheduler,
-            LMSDiscreteScheduler,
-            EulerDiscreteScheduler,
-            EulerAncestralDiscreteScheduler,
-            DPMSolverMultistepScheduler,
-            SharkEulerDiscreteScheduler,
-        ],
-    ):
-        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
-        self.controlnet = controlnet
-
-    def prepare_latents(
-        self,
-        batch_size,
-        height,
-        width,
-        generator,
-        num_inference_steps,
-        dtype,
-    ):
-        latents = torch.randn(
-            (
-                batch_size,
-                4,
-                height // 8,
-                width // 8,
-            ),
-            generator=generator,
-            dtype=torch.float32,
-        ).to(dtype)
-
-        self.scheduler.set_timesteps(num_inference_steps)
-        self.scheduler.is_scale_input_called = True
-        latents = latents * self.scheduler.init_noise_sigma
-        return latents
-
-    def generate_images(
-        self,
-        prompts,
-        neg_prompts,
-        image,
-        batch_size,
-        height,
-        width,
-        num_inference_steps,
-        strength,
-        guidance_scale,
-        seed,
-        max_length,
-        dtype,
-        use_base_vae,
-        cpu_scheduling,
-        use_stencil,
-    ):
-        # Control Embedding check & conversion
-        # TODO: 1. Change `num_images_per_prompt`.
-        controlnet_hint = controlnet_hint_conversion(
-            image, use_stencil, height, width, dtype, num_images_per_prompt=1
-        )
-        # prompts and negative prompts must be a list.
-        if isinstance(prompts, str):
-            prompts = [prompts]
-
-        if isinstance(neg_prompts, str):
-            neg_prompts = [neg_prompts]
-
-        prompts = prompts * batch_size
-        neg_prompts = neg_prompts * batch_size
-
-        # seed generator to create the inital latent noise. Also handle out of range seeds.
-        uint32_info = np.iinfo(np.uint32)
-        uint32_min, uint32_max = uint32_info.min, uint32_info.max
-        if seed < uint32_min or seed >= uint32_max:
-            seed = randint(uint32_min, uint32_max)
-        generator = torch.manual_seed(seed)
-
-        # Get text embeddings from prompts
-        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
-
-        # guidance scale as a float32 tensor.
-        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
-
-        # Prepare initial latent.
-        init_latents = self.prepare_latents(
-            batch_size=batch_size,
-            height=height,
-            width=width,
-            generator=generator,
-            num_inference_steps=num_inference_steps,
-            dtype=dtype,
-        )
-        final_timesteps = self.scheduler.timesteps
-
-        # Get Image latents
-        latents = self.produce_stencil_latents(
-            latents=init_latents,
-            text_embeddings=text_embeddings,
-            guidance_scale=guidance_scale,
-            total_timesteps=final_timesteps,
-            dtype=dtype,
-            cpu_scheduling=cpu_scheduling,
-            controlnet_hint=controlnet_hint,
-            controlnet=self.controlnet,
-        )
-
-        # Img latents -> PIL images
-        all_imgs = []
-        for i in tqdm(range(0, latents.shape[0], batch_size)):
-            imgs = self.decode_latents(
-                latents=latents[i : i + batch_size],
-                use_base_vae=use_base_vae,
-                cpu_scheduling=cpu_scheduling,
-            )
-            all_imgs.extend(imgs)
-
-        return all_imgs
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_txt2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_txt2img.py
@@ -9,20 +9,15 @@ from diffusers import (
    DDIMScheduler,
    PNDMScheduler,
    LMSDiscreteScheduler,
-    KDPM2DiscreteScheduler,
    EulerDiscreteScheduler,
    EulerAncestralDiscreteScheduler,
    DPMSolverMultistepScheduler,
-    DEISMultistepScheduler,
 )
 from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
 from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
    StableDiffusionPipeline,
 )

-import cv2
-from PIL import Image
-

 class Text2ImagePipeline(StableDiffusionPipeline):
    def __init__(
@@ -35,12 +30,10 @@ class Text2ImagePipeline(StableDiffusionPipeline):
            DDIMScheduler,
            PNDMScheduler,
            LMSDiscreteScheduler,
-            KDPM2DiscreteScheduler,
            EulerDiscreteScheduler,
            EulerAncestralDiscreteScheduler,
            DPMSolverMultistepScheduler,
            SharkEulerDiscreteScheduler,
-            DEISMultistepScheduler,
        ],
    ):
        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
@@ -1,5 +1,4 @@
 import torch
-import numpy as np
 from transformers import CLIPTokenizer
 from PIL import Image
 from tqdm.auto import tqdm
@@ -9,17 +8,14 @@ from diffusers import (
    DDIMScheduler,
    PNDMScheduler,
    LMSDiscreteScheduler,
-    KDPM2DiscreteScheduler,
    EulerDiscreteScheduler,
    EulerAncestralDiscreteScheduler,
    DPMSolverMultistepScheduler,
-    DEISMultistepScheduler,
 )
 from shark.shark_inference import SharkInference
 from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
 from apps.stable_diffusion.src.models import (
    SharkifyStableDiffusionModel,
-    get_vae_encode,
    get_vae,
    get_clip,
    get_unet,
@@ -42,12 +38,10 @@ class StableDiffusionPipeline:
            DDIMScheduler,
            PNDMScheduler,
            LMSDiscreteScheduler,
-            KDPM2DiscreteScheduler,
            EulerDiscreteScheduler,
            EulerAncestralDiscreteScheduler,
            DPMSolverMultistepScheduler,
            SharkEulerDiscreteScheduler,
-            DEISMultistepScheduler,
        ],
    ):
        self.vae = vae
@@ -110,118 +104,6 @@ class StableDiffusionPipeline:
        pil_images = [Image.fromarray(image) for image in images.numpy()]
        return pil_images

-    def produce_stencil_latents(
-        self,
-        latents,
-        text_embeddings,
-        guidance_scale,
-        total_timesteps,
-        dtype,
-        cpu_scheduling,
-        controlnet_hint=None,
-        controlnet=None,
-        controlnet_conditioning_scale: float = 1.0,
-        mask=None,
-        masked_image_latents=None,
-        return_all_latents=False,
-    ):
-        step_time_sum = 0
-        latent_history = [latents]
-        text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
-        text_embeddings_numpy = text_embeddings.detach().numpy()
-        for i, t in tqdm(enumerate(total_timesteps)):
-            step_start_time = time.time()
-            timestep = torch.tensor([t]).to(dtype)
-            latent_model_input = self.scheduler.scale_model_input(latents, t)
-            if mask is not None and masked_image_latents is not None:
-                latent_model_input = torch.cat(
-                    [
-                        torch.from_numpy(np.asarray(latent_model_input)),
-                        mask,
-                        masked_image_latents,
-                    ],
-                    dim=1,
-                ).to(dtype)
-            if cpu_scheduling:
-                latent_model_input = latent_model_input.detach().numpy()
-
-            if not torch.is_tensor(latent_model_input):
-                latent_model_input_1 = torch.from_numpy(
-                    np.asarray(latent_model_input)
-                ).to(dtype)
-            else:
-                latent_model_input_1 = latent_model_input
-            control = controlnet(
-                "forward",
-                (
-                    latent_model_input_1,
-                    timestep,
-                    text_embeddings,
-                    controlnet_hint,
-                ),
-                send_to_host=False,
-            )
-            down_block_res_samples = control[0:12]
-            mid_block_res_sample = control[12:]
-            down_block_res_samples = [
-                down_block_res_sample * controlnet_conditioning_scale
-                for down_block_res_sample in down_block_res_samples
-            ]
-            mid_block_res_sample = (
-                mid_block_res_sample[0] * controlnet_conditioning_scale
-            )
-            timestep = timestep.detach().numpy()
-            # Profiling Unet.
-            profile_device = start_profiling(file_path="unet.rdc")
-            # TODO: Pass `control` as it is to Unet. Same as TODO mentioned in model_wrappers.py.
-            noise_pred = self.unet(
-                "forward",
-                (
-                    latent_model_input,
-                    timestep,
-                    text_embeddings_numpy,
-                    guidance_scale,
-                    down_block_res_samples[0],
-                    down_block_res_samples[1],
-                    down_block_res_samples[2],
-                    down_block_res_samples[3],
-                    down_block_res_samples[4],
-                    down_block_res_samples[5],
-                    down_block_res_samples[6],
-                    down_block_res_samples[7],
-                    down_block_res_samples[8],
-                    down_block_res_samples[9],
-                    down_block_res_samples[10],
-                    down_block_res_samples[11],
-                    mid_block_res_sample,
-                ),
-                send_to_host=False,
-            )
-            end_profiling(profile_device)
-
-            if cpu_scheduling:
-                noise_pred = torch.from_numpy(noise_pred.to_host())
-                latents = self.scheduler.step(
-                    noise_pred, t, latents
-                ).prev_sample
-            else:
-                latents = self.scheduler.step(noise_pred, t, latents)
-
-            latent_history.append(latents)
-            step_time = (time.time() - step_start_time) * 1000
-            #  self.log += (
-            #      f"\nstep = {i} | timestep = {t} | time = {step_time:.2f}ms"
-            #  )
-            step_time_sum += step_time
-
-        avg_step_time = step_time_sum / len(total_timesteps)
-        self.log += f"\nAverage step time: {avg_step_time}ms/it"
-
-        if not return_all_latents:
-            return latents
-        all_latents = torch.cat(latent_history, dim=0)
-        return all_latents
-
    def produce_img_latents(
        self,
        latents,
@@ -230,8 +112,6 @@ class StableDiffusionPipeline:
        total_timesteps,
        dtype,
        cpu_scheduling,
-        mask=None,
-        masked_image_latents=None,
        return_all_latents=False,
    ):
        step_time_sum = 0
@@ -242,15 +122,6 @@ class StableDiffusionPipeline:
            step_start_time = time.time()
            timestep = torch.tensor([t]).to(dtype).detach().numpy()
            latent_model_input = self.scheduler.scale_model_input(latents, t)
-            if mask is not None and masked_image_latents is not None:
-                latent_model_input = torch.cat(
-                    [
-                        torch.from_numpy(np.asarray(latent_model_input)),
-                        mask,
-                        masked_image_latents,
-                    ],
-                    dim=1,
-                ).to(dtype)
            if cpu_scheduling:
                latent_model_input = latent_model_input.detach().numpy()

@@ -298,17 +169,14 @@ class StableDiffusionPipeline:
            DDIMScheduler,
            PNDMScheduler,
            LMSDiscreteScheduler,
-            KDPM2DiscreteScheduler,
            EulerDiscreteScheduler,
            EulerAncestralDiscreteScheduler,
            DPMSolverMultistepScheduler,
            SharkEulerDiscreteScheduler,
-            DEISMultistepScheduler,
        ],
        import_mlir: bool,
        model_id: str,
        ckpt_loc: str,
-        custom_vae: str,
        precision: str,
        max_length: int,
        batch_size: int,
@@ -316,18 +184,13 @@ class StableDiffusionPipeline:
        width: int,
        use_base_vae: bool,
        use_tuned: bool,
-        low_cpu_mem_usage: bool = False,
-        use_stencil: str = None,
    ):
-        is_inpaint = cls.__name__ in [
-            "InpaintPipeline",
-            "OutpaintPipeline",
-        ]
        if import_mlir:
+            # TODO: Delete this when on-the-fly tuning of models works.
+            use_tuned = False
            mlir_import = SharkifyStableDiffusionModel(
                model_id,
                ckpt_loc,
-                custom_vae,
                precision,
                max_len=max_length,
                batch_size=batch_size,
@@ -335,78 +198,9 @@ class StableDiffusionPipeline:
                width=width,
                use_base_vae=use_base_vae,
                use_tuned=use_tuned,
-                low_cpu_mem_usage=low_cpu_mem_usage,
-                is_inpaint=is_inpaint,
-                use_stencil=use_stencil,
            )
-            if cls.__name__ in [
-                "Image2ImagePipeline",
-                "InpaintPipeline",
-                "OutpaintPipeline",
-            ]:
-                clip, unet, vae, vae_encode = mlir_import()
-                return cls(
-                    vae_encode, vae, clip, get_tokenizer(), unet, scheduler
-                )
-            if cls.__name__ in ["StencilPipeline"]:
-                clip, unet, vae, controlnet = mlir_import()
-                return cls(
-                    controlnet, vae, clip, get_tokenizer(), unet, scheduler
-                )
-            clip, unet, vae = mlir_import()
-            return cls(vae, clip, get_tokenizer(), unet, scheduler)
-        try:
-            if cls.__name__ in [
-                "Image2ImagePipeline",
-                "InpaintPipeline",
-                "OutpaintPipeline",
-            ]:
-                return cls(
-                    get_vae_encode(),
-                    get_vae(),
-                    get_clip(),
-                    get_tokenizer(),
-                    get_unet(),
-                    scheduler,
-                )
-            if cls.__name__ == "StencilPipeline":
-                import sys
-
-                sys.exit(
-                    "StencilPipeline not supported with SharkTank currently."
-                )
-            return cls(
-                get_vae(), get_clip(), get_tokenizer(), get_unet(), scheduler
-            )
-        except:
-            print("download pipeline failed, falling back to import_mlir")
-            mlir_import = SharkifyStableDiffusionModel(
-                model_id,
-                ckpt_loc,
-                custom_vae,
-                precision,
-                max_len=max_length,
-                batch_size=batch_size,
-                height=height,
-                width=width,
-                use_base_vae=use_base_vae,
-                use_tuned=use_tuned,
-                low_cpu_mem_usage=low_cpu_mem_usage,
-                is_inpaint=is_inpaint,
-            )
-            if cls.__name__ in [
-                "Image2ImagePipeline",
-                "InpaintPipeline",
-                "OutpaintPipeline",
-            ]:
-                clip, unet, vae, vae_encode = mlir_import()
-                return cls(
-                    vae_encode, vae, clip, get_tokenizer(), unet, scheduler
-                )
-            if cls.__name__ == "StencilPipeline":
-                clip, unet, vae, controlnet = mlir_import()
-                return cls(
-                    controlnet, vae, clip, get_tokenizer(), unet, scheduler
-                )
            clip, unet, vae = mlir_import()
            return cls(vae, clip, get_tokenizer(), unet, scheduler)
+        return cls(
+            get_vae(), get_clip(), get_tokenizer(), get_unet(), scheduler
+        )
--- a/apps/stable_diffusion/src/schedulers/sd_schedulers.py
+++ b/apps/stable_diffusion/src/schedulers/sd_schedulers.py
@@ -3,10 +3,8 @@ from diffusers import (
    PNDMScheduler,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
-    KDPM2DiscreteScheduler,
    EulerDiscreteScheduler,
    EulerAncestralDiscreteScheduler,
-    DEISMultistepScheduler,
 )
 from apps.stable_diffusion.src.schedulers.shark_eulerdiscrete import (
    SharkEulerDiscreteScheduler,
@@ -19,10 +17,6 @@ def get_schedulers(model_id):
        model_id,
        subfolder="scheduler",
    )
-    schedulers["KDPM2Discrete"] = KDPM2DiscreteScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
    schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
@@ -47,10 +41,6 @@ def get_schedulers(model_id):
        model_id,
        subfolder="scheduler",
    )
-    schedulers["DEISMultistep"] = DEISMultistepScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
    schedulers[
        "SharkEulerDiscrete"
    ] = SharkEulerDiscreteScheduler.from_pretrained(
--- a/apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py
+++ b/apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py
@@ -87,7 +87,7 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
        if sys.platform == "darwin":
            iree_flags.append("-iree-stream-fuse-binding=false")

-        def _import(self):
+        if args.import_mlir:
            scaling_model = ScalingModel()
            self.scaling_model = compile_through_fx(
                scaling_model,
@@ -105,28 +105,15 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
                + args.precision,
                extra_args=iree_flags,
            )
-
-        if args.import_mlir:
-            _import(self)
-
        else:
-            try:
-                self.scaling_model = get_shark_model(
-                    SCHEDULER_BUCKET,
-                    "euler_scale_model_input_" + args.precision,
-                    iree_flags,
-                )
-                self.step_model = get_shark_model(
-                    SCHEDULER_BUCKET,
-                    "euler_step_" + args.precision,
-                    iree_flags,
-                )
-            except:
-                print(
-                    "failed to download model, falling back and using import_mlir"
-                )
-                args.import_mlir = True
-                _import(self)
+            self.scaling_model = get_shark_model(
+                SCHEDULER_BUCKET,
+                "euler_scale_model_input_" + args.precision,
+                iree_flags,
+            )
+            self.step_model = get_shark_model(
+                SCHEDULER_BUCKET, "euler_step_" + args.precision, iree_flags
+            )

    def scale_model_input(self, sample, timestep):
        step_index = (self.timesteps == timestep).nonzero().item()
--- a/apps/stable_diffusion/src/utils/init.py
+++ b/apps/stable_diffusion/src/utils/init.py
@@ -8,13 +8,10 @@ from apps.stable_diffusion.src.utils.resources import (
    base_models,
    opt_flags,
    resource_path,
+    fetch_and_update_base_model_id,
 )
 from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
 from apps.stable_diffusion.src.utils.stable_args import args
-from apps.stable_diffusion.src.utils.stencils.stencil_utils import (
-    controlnet_hint_conversion,
-    get_stencil_model_id,
-)
 from apps.stable_diffusion.src.utils.utils import (
    get_shark_model,
    compile_through_fx,
@@ -25,11 +22,6 @@ from apps.stable_diffusion.src.utils.utils import (
    get_opt_flags,
    preprocessCKPT,
    fetch_or_delete_vmfbs,
-    fetch_and_update_base_model_id,
    get_path_to_diffusers_checkpoint,
    sanitize_seed,
-    get_path_stem,
-    get_extended_name,
-    clear_all,
-    save_output_img,
 )
--- a/apps/stable_diffusion/src/utils/resources.py
+++ b/apps/stable_diffusion/src/utils/resources.py
@@ -35,3 +35,28 @@ base_models = get_json_file("resources/base_model.json")

 # Contains optimization flags for different models.
 opt_flags = get_json_file("resources/opt_flags.json")
+
+
+# `fetch_and_update_base_model_id` is a resource utility function that
+# helps maintain the mapping from the model to run to its base model.
+# If `base_model` is "", this function tries to fetch the base model
+# info for `model_to_run`.
+def fetch_and_update_base_model_id(model_to_run, base_model=""):
+    path = "resources/variants.json"
+    loc_json = resource_path(path)
+    data = {model_to_run: base_model}
+    json_data = {}
+    if os.path.exists(loc_json):
+        with open(loc_json, "r", encoding="utf-8") as jsonFile:
+            json_data = json.load(jsonFile)
+            # Return with base_model's info if base_model is "".
+            if base_model == "":
+                if model_to_run in json_data:
+                    base_model = json_data[model_to_run]
+                return base_model
+    elif base_model == "":
+        return base_model
+    # Update JSON data to contain an entry mapping model_to_run with base_model.
+    json_data.update(data)
+    with open(loc_json, "w", encoding="utf-8") as jsonFile:
+        json.dump(json_data, jsonFile)
--- a/apps/stable_diffusion/src/utils/resources/base_model.json
+++ b/apps/stable_diffusion/src/utils/resources/base_model.json
@@ -29,14 +29,6 @@
                "dtype": "f32"
            }
        },
-        "vae_encode": {
-            "image" : {
-                "shape" : [
-                    "1*batch_size",3,"8*height","8*width"
-                ],
-                "dtype":"f32"
-            }
-        },
        "vae": {
            "latents" : {
                "shape" : [
@@ -85,236 +77,6 @@
                "dtype": "f32"
            }
        },
-        "stencil_adaptor": {
-            "latents": {
-                "shape": [
-                    "1*batch_size",
-                    4,
-                    "height",
-                    "width"
-                ],
-                "dtype": "f32"
-            },
-            "timesteps": {
-                "shape": [
-                    1
-                ],
-                "dtype": "f32"
-            },
-            "embedding": {
-                "shape": [
-                    "2*batch_size",
-                    "max_len",
-                    768
-                ],
-                "dtype": "f32"
-            },
-            "controlnet_hint": {
-                "shape": [1, 3, 512, 512],
-                "dtype": "f32"
-            }
-        },
-        "stencil_unet": {
-            "latents": {
-                "shape": [
-                    "1*batch_size",
-                    4,
-                    "height",
-                    "width"
-                ],
-                "dtype": "f32"
-            },
-            "timesteps": {
-                "shape": [
-                    1
-                ],
-                "dtype": "f32"
-            },
-            "embedding": {
-                "shape": [
-                    "2*batch_size",
-                    "max_len",
-                    768
-                ],
-                "dtype": "f32"
-            },
-            "guidance_scale": {
-                "shape": 2,
-                "dtype": "f32"
-            },
-            "control1": {
-                "shape": [2, 320, 64, 64],
-                "dtype": "f32"
-            },
-            "control2": {
-                "shape": [2, 320, 64, 64],
-                "dtype": "f32"
-            },
-            "control3": {
-                "shape": [2, 320, 64, 64],
-                "dtype": "f32"
-            },
-            "control4": {
-                "shape": [2, 320, 32, 32],
-                "dtype": "f32"
-            },
-            "control5": {
-                "shape": [2, 640, 32, 32],
-                "dtype": "f32"
-            },
-            "control6": {
-                "shape": [2, 640, 32, 32],
-                "dtype": "f32"
-            },
-            "control7": {
-                "shape": [2, 640, 16, 16],
-                "dtype": "f32"
-            },
-            "control8": {
-                "shape": [2, 1280, 16, 16],
-                "dtype": "f32"
-            },
-            "control9": {
-                "shape": [2, 1280, 16, 16],
-                "dtype": "f32"
-            },
-            "control10": {
-                "shape": [2, 1280, 8, 8],
-                "dtype": "f32"
-            },
-            "control11": {
-                "shape": [2, 1280, 8, 8],
-                "dtype": "f32"
-            },
-            "control12": {
-                "shape": [2, 1280, 8, 8],
-                "dtype": "f32"
-            },
-            "control13": {
-                "shape": [2, 1280, 8, 8],
-                "dtype": "f32"
-            }
-        },
-        "vae_encode": {
-            "image" : {
-                "shape" : [
-                    "1*batch_size",3,"8*height","8*width"
-                ],
-                "dtype":"f32"
-            }
-        },
-        "vae": {
-            "latents" : {
-                "shape" : [
-                    "1*batch_size",4,"height","width"
-                ],
-                "dtype":"f32"
-            }
-        },
-        "clip": {
-            "token" : {
-                "shape" : [
-                    "2*batch_size",
-                    "max_len"
-                ],
-                "dtype":"i64"
-            }
-        }
-    },
-    "stabilityai/stable-diffusion-2-inpainting": {
-        "unet": {
-            "latents": {
-                "shape": [
-                    "1*batch_size",
-                    9,
-                    "height",
-                    "width"
-                ],
-                "dtype": "f32"
-            },
-            "timesteps": {
-                "shape": [
-                    1
-                ],
-                "dtype": "f32"
-            },
-            "embedding": {
-                "shape": [
-                    "2*batch_size",
-                    "max_len",
-                    1024
-                ],
-                "dtype": "f32"
-            },
-            "guidance_scale": {
-                "shape": 2,
-                "dtype": "f32"
-            }
-        },
-        "vae_encode": {
-            "image" : {
-                "shape" : [
-                    "1*batch_size",3,"8*height","8*width"
-                ],
-                "dtype":"f32"
-            }
-        },
-        "vae": {
-            "latents" : {
-                "shape" : [
-                    "1*batch_size",4,"height","width"
-                ],
-                "dtype":"f32"
-            }
-        },
-        "clip": {
-            "token" : {
-                "shape" : [
-                    "2*batch_size",
-                    "max_len"
-                ],
-                "dtype":"i64"
-            }
-        }
-    },
-    "runwayml/stable-diffusion-inpainting": {
-        "unet": {
-            "latents": {
-                "shape": [
-                    "1*batch_size",
-                    9,
-                    "height",
-                    "width"
-                ],
-                "dtype": "f32"
-            },
-            "timesteps": {
-                "shape": [
-                    1
-                ],
-                "dtype": "f32"
-            },
-            "embedding": {
-                "shape": [
-                    "2*batch_size",
-                    "max_len",
-                    768
-                ],
-                "dtype": "f32"
-            },
-            "guidance_scale": {
-                "shape": 2,
-                "dtype": "f32"
-            }
-        },
-        "vae_encode": {
-            "image" : {
-                "shape" : [
-                    "1*batch_size",3,"8*height","8*width"
-                ],
-                "dtype":"f32"
-            }
-        },
        "vae": {
            "latents" : {
                "shape" : [
@@ -333,4 +95,4 @@
            }
        }
    }
-}
+}
--- a/apps/stable_diffusion/src/utils/resources/model_config.json
+++ b/apps/stable_diffusion/src/utils/resources/model_config.json
@@ -3,8 +3,6 @@
    "stablediffusion/v1_4":"CompVis/stable-diffusion-v1-4",
    "stablediffusion/v2_1base":"stabilityai/stable-diffusion-2-1-base",
    "stablediffusion/v2_1":"stabilityai/stable-diffusion-2-1",
-    "stablediffusion/inpaint_v1":"runwayml/stable-diffusion-inpainting",
-    "stablediffusion/inpaint_v2":"stabilityai/stable-diffusion-2-inpainting",
    "anythingv3/v1_4":"Linaqruf/anything-v3.0",
    "analogdiffusion/v1_4":"wavymulder/Analog-Diffusion",
    "openjourney/v1_4":"prompthero/openjourney",
--- a/apps/stable_diffusion/src/utils/resources/model_db.json
+++ b/apps/stable_diffusion/src/utils/resources/model_db.json
@@ -22,6 +22,8 @@
    "stablediffusion/v1_4/vae/fp16/length_77/tuned":"vae_19dec_fp16_tuned",
    "stablediffusion/v1_4/vae/fp16/length_77/tuned/cuda":"vae_19dec_fp16_cuda_tuned",
    "stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
+    "stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1dec_fp32",
+    "stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
    "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
    "stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
    "stablediffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"unet2base_8dec_fp16_cuda_tuned",
@@ -40,41 +42,41 @@
    "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
    "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"vae2_8dec_fp16",
    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
-    "anythingv3/v1_4/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
-    "anythingv3/v1_4/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
-    "anythingv3/v1_4/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
-    "anythingv3/v1_4/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
-    "anythingv3/v1_4/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
-    "anythingv3/v1_4/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
-    "anythingv3/v1_4/vae/fp16/length_77/tuned/cuda":"av3_vae_19dec_fp16_cuda_tuned",
-    "anythingv3/v1_4/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
-    "anythingv3/v1_4/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
-    "anythingv3/v1_4/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
-    "anythingv3/v1_4/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
-    "analogdiffusion/v1_4/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
-    "analogdiffusion/v1_4/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
-    "analogdiffusion/v1_4/unet/fp16/length_77/tuned/cuda":"ad_unet_19dec_fp16_cuda_tuned",
-    "analogdiffusion/v1_4/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
-    "analogdiffusion/v1_4/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
-    "analogdiffusion/v1_4/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
-    "analogdiffusion/v1_4/vae/fp16/length_77/tuned/cuda":"ad_vae_19dec_fp16_cuda_tuned",
-    "analogdiffusion/v1_4/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
-    "analogdiffusion/v1_4/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
-    "analogdiffusion/v1_4/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
-    "analogdiffusion/v1_4/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
-    "openjourney/v1_4/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
-    "openjourney/v1_4/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
-    "openjourney/v1_4/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
-    "openjourney/v1_4/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
-    "openjourney/v1_4/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
-    "openjourney/v1_4/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
-    "openjourney/v1_4/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
-    "dreamlike/v1_4/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
-    "dreamlike/v1_4/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
-    "dreamlike/v1_4/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
-    "dreamlike/v1_4/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
-    "dreamlike/v1_4/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
-    "dreamlike/v1_4/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
-    "dreamlike/v1_4/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
+    "anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
+    "anythingv3/v2_1base/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
+    "anythingv3/v2_1base/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/tuned/cuda":"av3_vae_19dec_fp16_cuda_tuned",
+    "anythingv3/v2_1base/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
+    "anythingv3/v2_1base/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
+    "anythingv3/v2_1base/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"ad_unet_19dec_fp16_cuda_tuned",
+    "analogdiffusion/v2_1base/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"ad_vae_19dec_fp16_cuda_tuned",
+    "analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
+    "analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
+    "analogdiffusion/v2_1base/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
+    "openjourney/v2_1base/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
+    "openjourney/v2_1base/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
+    "openjourney/v2_1base/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
+    "openjourney/v2_1base/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
+    "openjourney/v2_1base/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
+    "dreamlike/v2_1base/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
+    "dreamlike/v2_1base/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
+    "dreamlike/v2_1base/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
+    "dreamlike/v2_1base/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
  }
 ]
--- a/apps/stable_diffusion/src/utils/resources/opt_flags.json
+++ b/apps/stable_diffusion/src/utils/resources/opt_flags.json
@@ -45,12 +45,12 @@
    "untuned": {
      "fp16": {
        "default_compilation_flags": [
-          "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-preprocessing-pad-linalg-ops{pad-size=32}))"
+          "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32}))"
        ]
      },
      "fp32": {
        "default_compilation_flags": [
-          "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-preprocessing-pad-linalg-ops{pad-size=16}))"
+          "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=16}))"
        ]
      }
    }
--- a/apps/stable_diffusion/src/utils/sd_annotation.py
+++ b/apps/stable_diffusion/src/utils/sd_annotation.py
@@ -1,5 +1,4 @@
 import os
-import io
 from shark.model_annotation import model_annotation, create_context
 from shark.iree_utils._common import iree_target_map, run_cmd
 from shark.shark_downloader import (
@@ -20,22 +19,6 @@ def get_device():
    return device


-def get_device_args():
-    device = get_device()
-    device_spec_args = []
-    if device == "cuda":
-        from shark.iree_utils.gpu_utils import get_iree_gpu_args
-
-        gpu_flags = get_iree_gpu_args()
-        for flag in gpu_flags:
-            device_spec_args.append(flag)
-    elif device == "vulkan":
-        device_spec_args.append(
-            f"--iree-vulkan-target-triple={args.iree_vulkan_target_triple} "
-        )
-    return device, device_spec_args
-
-
 # Download the model (Unet or VAE fp16) from shark_tank
 def load_model_from_tank():
    from apps.stable_diffusion.src.models import (
@@ -70,7 +53,7 @@ def load_winograd_configs():
    config_bucket = "gs://shark_tank/sd_tuned/configs/"
    config_name = f"{args.annotation_model}_winograd_{device}.json"
    full_gs_url = config_bucket + config_name
-    winograd_config_dir = os.path.join(WORKDIR, "configs", config_name)
+    winograd_config_dir = f"{WORKDIR}configs/" + config_name
    print("Loading Winograd config file from ", winograd_config_dir)
    download_public_file(full_gs_url, winograd_config_dir, True)
    return winograd_config_dir
@@ -78,48 +61,20 @@ def load_winograd_configs():

 def load_lower_configs():
+    """Fetch the lowering config for the current args; return its path."""
     from apps.stable_diffusion.src.models import get_variant_version
-    from apps.stable_diffusion.src.utils.utils import (
-        fetch_and_update_base_model_id,
-    )

-    if args.ckpt_loc != "":
-        base_model_id = fetch_and_update_base_model_id(args.ckpt_loc)
-    else:
-        base_model_id = fetch_and_update_base_model_id(args.hf_model_id)
-        if base_model_id == "":
-            base_model_id = args.hf_model_id
-
-    variant, version = get_variant_version(base_model_id)
-
-    if version == "inpaint_v1":
-        version = "v1_4"
-    elif version == "inpaint_v2":
-        version = "v2_1base"
-
-    config_bucket = "gs://shark_tank/sd_tuned_configs/"
-
-    device, device_spec_args = get_device_args()
-    spec = ""
-    if device_spec_args:
-        spec = device_spec_args[-1].split("=")[-1].strip()
-        if device == "vulkan":
-            spec = spec.split("-")[0]
+    variant, version = get_variant_version(args.hf_model_id)

+    config_bucket = "gs://shark_tank/sd_tuned/configs/"
+    if variant in ["anythingv3", "analogdiffusion"]:
+        args.max_length = 77  # side effect: overwrites the global setting
+        version = "v1_4"  # these variants look up the v1_4 config file
     if args.annotation_model == "vae":
-        if not spec or spec in ["rdna3", "sm_80"]:
-            config_name = (
-                f"{args.annotation_model}_{args.precision}_{device}.json"
-            )
-        else:
-            config_name = f"{args.annotation_model}_{args.precision}_{device}_{spec}.json"
-    else:
-        if not spec or spec in ["rdna3", "sm_80"]:
-            config_name = f"{args.annotation_model}_{version}_{args.precision}_{device}.json"
-        else:
-            config_name = f"{args.annotation_model}_{version}_{args.precision}_{device}_{spec}.json"
-
+        args.max_length = 77
+    device = get_device()
+    config_name = f"{args.annotation_model}_{version}_{args.precision}_len{args.max_length}_{device}.json"
     full_gs_url = config_bucket + config_name
     lowering_config_dir = os.path.join(WORKDIR, "configs", config_name)
     print("Loading lowering config file from ", lowering_config_dir)
     download_public_file(full_gs_url, lowering_config_dir, True)
     return lowering_config_dir
@@ -127,6 +82,13 @@ def load_lower_configs():

 # Annotate the model with Winograd attribute on selected conv ops
 def annotate_with_winograd(input_mlir, winograd_config_dir, model_name):
+    if model_name.split("_")[-1] != "tuned":
+        out_file_path = (
+            f"{args.annotation_output}/{model_name}_tuned_torch.mlir"
+        )
+    else:
+        out_file_path = f"{args.annotation_output}/{model_name}_torch.mlir"
+
    with create_context() as ctx:
        winograd_model = model_annotation(
            ctx,
@@ -135,110 +97,119 @@ def annotate_with_winograd(input_mlir, winograd_config_dir, model_name):
            search_op="conv",
            winograd=True,
        )
-
-    bytecode_stream = io.BytesIO()
-    winograd_model.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    if args.save_annotation:
-        if model_name.split("_")[-1] != "tuned":
-            out_file_path = os.path.join(
-                args.annotation_output, model_name + "_tuned_torch.mlir"
-            )
-        else:
-            out_file_path = os.path.join(
-                args.annotation_output, model_name + "_torch.mlir"
-            )
        with open(out_file_path, "w") as f:
            f.write(str(winograd_model))
            f.close()
-
-    return bytecode
-
-
-def dump_after_mlir(input_mlir, use_winograd):
-    import iree.compiler as ireec
-
-    device, device_spec_args = get_device_args()
-    if use_winograd:
-        preprocess_flag = "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32},iree-linalg-ext-convert-conv2d-to-winograd))"
-    else:
-        preprocess_flag = "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32}))"
-
-    dump_module = ireec.compile_str(
-        input_mlir,
-        target_backends=[iree_target_map(device)],
-        extra_args=device_spec_args
-        + [
-            preprocess_flag,
-            "--compile-to=preprocessing",
-        ],
-    )
-    return dump_module
+    return winograd_model, out_file_path


 # For Unet annotate the model with tuned lowering configs
 def annotate_with_lower_configs(
     input_mlir, lowering_config_dir, model_name, use_winograd
 ):
+    """Run iree-compile on `input_mlir` up to the flow stage, annotate the IR
+    dumped after the last preprocessing pass with `lowering_config_dir`, and
+    save the result. Returns (tuned_model, out_file_path)."""
+    if use_winograd:
+        dump_after = "iree-linalg-ext-convert-conv2d-to-winograd"
+        preprocess_flag = (
+            "--iree-preprocessing-pass-pipeline='builtin.module"
+            "(func.func(iree-preprocessing-convert-conv2d-to-img2col,"
+            "iree-preprocessing-pad-linalg-ops{pad-size=32},"
+            "iree-linalg-ext-convert-conv2d-to-winograd))' "
+        )
+    else:
+        dump_after = "iree-preprocessing-pad-linalg-ops"
+        preprocess_flag = (
+            "--iree-preprocessing-pass-pipeline='builtin.module"
+            "(func.func(iree-preprocessing-convert-conv2d-to-img2col,"
+            "iree-preprocessing-pad-linalg-ops{pad-size=32}))' "
+        )
+
     # Dump IR after padding/img2col/winograd passes
-    dump_module = dump_after_mlir(input_mlir, use_winograd)
+    device_spec_args = ""
+    device = get_device()
+    if device == "cuda":
+        from shark.iree_utils.gpu_utils import get_iree_gpu_args
+
+        gpu_flags = get_iree_gpu_args()
+        for flag in gpu_flags:
+            device_spec_args += flag + " "
+    elif device == "vulkan":
+        device_spec_args = (
+            f"--iree-vulkan-target-triple={args.iree_vulkan_target_triple} "
+        )
     print("Applying tuned configs on", model_name)

+    # stderr of the compile command (carrying the IR dump) is redirected here
+    dump_path = f"{args.annotation_output}/dump_after_winograd.mlir"
+    run_cmd(
+        f"iree-compile {input_mlir} "
+        "--iree-input-type=tm_tensor "
+        f"--iree-hal-target-backends={iree_target_map(device)} "
+        f"{device_spec_args}"
+        f"{preprocess_flag}"
+        "--iree-stream-resource-index-bits=64 "
+        "--iree-vm-target-index-bits=64 "
+        f"--mlir-print-ir-after={dump_after} "
+        "--compile-to=flow "
+        f"2>{dump_path} "
+    )
+
     # Annotate the model with lowering configs in the config file
     with create_context() as ctx:
         tuned_model = model_annotation(
             ctx,
-            input_contents=dump_module,
+            input_contents=dump_path,
             config_path=lowering_config_dir,
             search_op="all",
         )

-    bytecode_stream = io.BytesIO()
-    tuned_model.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    if args.save_annotation:
-        if model_name.split("_")[-1] != "tuned":
-            out_file_path = (
-                f"{args.annotation_output}/{model_name}_tuned_torch.mlir"
-            )
-        else:
-            out_file_path = f"{args.annotation_output}/{model_name}_torch.mlir"
-        with open(out_file_path, "w") as f:
-            f.write(str(tuned_model))
-            f.close()
-
-    return bytecode
+    # Remove the intermediate mlir and save the final annotated model
+    os.remove(dump_path)
+    suffix = "" if model_name.split("_")[-1] == "tuned" else "_tuned"
+    out_file_path = f"{args.annotation_output}/{model_name}{suffix}_torch.mlir"
+    with open(out_file_path, "w") as f:
+        f.write(str(tuned_model))
+    return tuned_model, out_file_path


-def sd_model_annotation(mlir_model, model_name):
+def sd_model_annotation(mlir_model, model_name, model_from_tank=False):  # returns (tuned_model, output_path)
     device = get_device()
     if args.annotation_model == "unet" and device == "vulkan":
         use_winograd = True
         winograd_config_dir = load_winograd_configs()
-        winograd_model = annotate_with_winograd(
+        winograd_model, model_path = annotate_with_winograd(  # model_path: file the annotated mlir was written to
             mlir_model, winograd_config_dir, model_name
         )
         lowering_config_dir = load_lower_configs()
-        tuned_model = annotate_with_lower_configs(
-            winograd_model, lowering_config_dir, model_name, use_winograd
+        tuned_model, output_path = annotate_with_lower_configs(
+            model_path, lowering_config_dir, model_name, use_winograd  # file path, not the in-memory module
         )
     elif args.annotation_model == "vae" and device == "vulkan":
         use_winograd = True
         winograd_config_dir = load_winograd_configs()
-        tuned_model = annotate_with_winograd(
+        tuned_model, output_path = annotate_with_winograd(
             mlir_model, winograd_config_dir, model_name
         )
     else:
         use_winograd = False
+        if model_from_tank:  # annotate_with_lower_configs puts this value on the iree-compile command line, so it must be a file path
+            mlir_model = f"{WORKDIR}{model_name}_torch/{model_name}_torch.mlir"  # NOTE(review): assumes WORKDIR ends with a path separator
+        else:
+            # Just use this function to convert bytecode to string
+            orig_model, model_path = annotate_with_winograd(  # called with an empty config path; only model_path is used below
+                mlir_model, "", model_name
+            )
+            mlir_model = model_path
         lowering_config_dir = load_lower_configs()
-        tuned_model = annotate_with_lower_configs(
+        tuned_model, output_path = annotate_with_lower_configs(
             mlir_model, lowering_config_dir, model_name, use_winograd
         )
-    return tuned_model
+    print(f"Saved the annotated mlir in {output_path}.")
+    return tuned_model, output_path


 if __name__ == "__main__":
     mlir_model, model_name = load_model_from_tank()
-    sd_model_annotation(mlir_model, model_name)
+    sd_model_annotation(mlir_model, model_name, model_from_tank=True)  # tank model: annotate its .mlir under WORKDIR
--- a/apps/stable_diffusion/src/utils/stable_args.py
+++ b/apps/stable_diffusion/src/utils/stable_args.py
@@ -1,4 +1,5 @@
 import argparse
+import os
 from pathlib import Path


@@ -6,6 +7,13 @@ def path_expand(s):
    return Path(s).expanduser().resolve()


+def is_valid_file(arg):
+    """Return `arg` unchanged if that path exists, otherwise None."""
+    if os.path.exists(arg):
+        return arg
+    return None
+
+
 p = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
 )
@@ -17,24 +25,18 @@ p = argparse.ArgumentParser(
 p.add_argument(
    "-p",
    "--prompts",
-    nargs="+",
-    default=["cyberpunk forest by Salvador Dali"],
+    action="append",
+    default=[],
    help="text of which images to be generated.",
 )

 p.add_argument(
    "--negative_prompts",
    nargs="+",
-    default=["trees, green"],
+    default=[""],
    help="text you don't want to see in the generated image.",
 )

-p.add_argument(
-    "--img_path",
-    type=str,
-    help="Path to the image input for img2img/inpainting",
-)
-
 p.add_argument(
    "--steps",
    type=int,
@@ -45,8 +47,8 @@ p.add_argument(
 p.add_argument(
    "--seed",
    type=int,
-    default=-1,
-    help="the seed to use. -1 for a random one.",
+    default=42,
+    help="the seed to use.",
 )

 p.add_argument(
@@ -54,14 +56,13 @@ p.add_argument(
    type=int,
    default=1,
    choices=range(1, 4),
-    help="the number of inferences to be made in a single `batch_count`.",
+    help="the number of inferences to be made in a single `run`.",
 )

 p.add_argument(
    "--height",
    type=int,
    default=512,
-    choices=range(384, 769, 8),
    help="the height of the output image.",
 )

@@ -69,7 +70,6 @@ p.add_argument(
    "--width",
    type=int,
    default=512,
-    choices=range(384, 769, 8),
    help="the width of the output image.",
 )

@@ -87,96 +87,6 @@ p.add_argument(
    help="max length of the tokenizer output, options are 64 and 77.",
 )

-p.add_argument(
-    "--strength",
-    type=float,
-    default=0.8,
-    help="the strength of change applied on the given input image for img2img",
-)
-
-##############################################################################
-### Inpainting and Outpainting Params
-##############################################################################
-
-p.add_argument(
-    "--mask_path",
-    type=str,
-    help="Path to the mask image input for inpainting",
-)
-
-p.add_argument(
-    "--inpaint_full_res",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="If inpaint only masked area or whole picture",
-)
-
-p.add_argument(
-    "--inpaint_full_res_padding",
-    type=int,
-    default=32,
-    choices=range(0, 257, 4),
-    help="Number of pixels for only masked padding",
-)
-
-p.add_argument(
-    "--pixels",
-    type=int,
-    default=128,
-    choices=range(8, 257, 8),
-    help="Number of expended pixels for one direction for outpainting",
-)
-
-p.add_argument(
-    "--mask_blur",
-    type=int,
-    default=8,
-    choices=range(0, 65),
-    help="Number of blur pixels for outpainting",
-)
-
-p.add_argument(
-    "--left",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="If expend left for outpainting",
-)
-
-p.add_argument(
-    "--right",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="If expend right for outpainting",
-)
-
-p.add_argument(
-    "--top",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="If expend top for outpainting",
-)
-
-p.add_argument(
-    "--bottom",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="If expend bottom for outpainting",
-)
-
-p.add_argument(
-    "--noise_q",
-    type=float,
-    default=1.0,
-    help="Fall-off exponent for outpainting (lower=higher detail) (min=0.0, max=4.0)",
-)
-
-p.add_argument(
-    "--color_variation",
-    type=float,
-    default=0.05,
-    help="Color variation for outpainting (min=0.0, max=1.0)",
-)
-
 ##############################################################################
 ### Model Config and Usage Params
 ##############################################################################
@@ -246,10 +156,10 @@ p.add_argument(
 )

 p.add_argument(
-    "--batch_count",
+    "--runs",
    type=int,
    default=1,
-    help="number of batch to be generated with random seeds in single execution",
+    help="number of images to be generated with random seeds in single execution",
 )

 p.add_argument(
@@ -259,13 +169,6 @@ p.add_argument(
    help="Path to SD's .ckpt file.",
 )

-p.add_argument(
-    "--custom_vae",
-    type=str,
-    default="",
-    help="HuggingFace repo-id or path to SD model's checkpoint whose Vae needs to be plugged in.",
-)
-
 p.add_argument(
    "--hf_model_id",
    type=str,
@@ -274,23 +177,10 @@ p.add_argument(
 )

 p.add_argument(
-    "--low_cpu_mem_usage",
+    "--enable_stack_trace",
    default=False,
    action=argparse.BooleanOptionalAction,
-    help="Use the accelerate package to reduce cpu memory consumption",
-)
-
-p.add_argument(
-    "--attention_slicing",
-    type=str,
-    default="none",
-    help="Amount of attention slicing to use (one of 'max', 'auto', 'none', or an integer)",
-)
-
-p.add_argument(
-    "--use_stencil",
-    choices=["canny"],
-    help="Enable the stencil feature.",
+    help="Enable showing the stack trace when retrying the base model configuration",
 )

 ##############################################################################
@@ -298,7 +188,7 @@ p.add_argument(
 ##############################################################################

 p.add_argument(
-    "--iree_vulkan_target_triple",
+    "--iree-vulkan-target-triple",
    type=str,
    default="",
    help="Specify target triple for vulkan",
@@ -397,7 +287,7 @@ p.add_argument(

 p.add_argument(
    "--write_metadata_to_png",
-    default=True,
+    default=False,
    action=argparse.BooleanOptionalAction,
    help="flag for whether or not to save generation information in PNG chunk text to generated images.",
 )
@@ -410,7 +300,7 @@ p.add_argument(
    "--progress_bar",
    default=True,
    action=argparse.BooleanOptionalAction,
-    help="flag for removing the progress bar animation during image generation",
+    help="flag for removing the progress bar animation during image generation",
 )

 p.add_argument(
@@ -454,10 +344,10 @@ p.add_argument(
 )

 p.add_argument(
-    "--save_annotation",
+    "--use_winograd",
    default=False,
    action=argparse.BooleanOptionalAction,
-    help="Save annotated mlir file",
+    help="Apply Winograd on selected conv ops.",
 )

 args, unknown = p.parse_known_args()
--- a/apps/stable_diffusion/src/utils/stencils/canny/init.py
+++ b/apps/stable_diffusion/src/utils/stencils/canny/init.py
@@ -1,6 +0,0 @@
-import cv2
-
-
-class CannyDetector:
-    def __call__(self, img, low_threshold, high_threshold):
-        return cv2.Canny(img, low_threshold, high_threshold)
--- a/apps/stable_diffusion/src/utils/stencils/stencil_utils.py
+++ b/apps/stable_diffusion/src/utils/stencils/stencil_utils.py
@@ -1,173 +0,0 @@
-import cv2
-import numpy as np
-from PIL import Image
-import torch
-from apps.stable_diffusion.src.utils.stencils.canny import CannyDetector
-
-stencil = {}
-
-
-def HWC3(x):
-    assert x.dtype == np.uint8
-    if x.ndim == 2:
-        x = x[:, :, None]
-    assert x.ndim == 3
-    H, W, C = x.shape
-    assert C == 1 or C == 3 or C == 4
-    if C == 3:
-        return x
-    if C == 1:
-        return np.concatenate([x, x, x], axis=2)
-    if C == 4:
-        color = x[:, :, 0:3].astype(np.float32)
-        alpha = x[:, :, 3:4].astype(np.float32) / 255.0
-        y = color * alpha + 255.0 * (1.0 - alpha)
-        y = y.clip(0, 255).astype(np.uint8)
-        return y
-
-
-def resize_image(input_image, resolution):
-    H, W, C = input_image.shape
-    H = float(H)
-    W = float(W)
-    k = float(resolution) / min(H, W)
-    H *= k
-    W *= k
-    H = int(np.round(H / 64.0)) * 64
-    W = int(np.round(W / 64.0)) * 64
-    img = cv2.resize(
-        input_image,
-        (W, H),
-        interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA,
-    )
-    return img
-
-
-def controlnet_hint_shaping(
-    controlnet_hint, height, width, dtype, num_images_per_prompt=1
-):
-    channels = 3
-    if isinstance(controlnet_hint, torch.Tensor):
-        # torch.Tensor: acceptble shape are any of chw, bchw(b==1) or bchw(b==num_images_per_prompt)
-        shape_chw = (channels, height, width)
-        shape_bchw = (1, channels, height, width)
-        shape_nchw = (num_images_per_prompt, channels, height, width)
-        if controlnet_hint.shape in [shape_chw, shape_bchw, shape_nchw]:
-            controlnet_hint = controlnet_hint.to(
-                dtype=dtype, device=torch.device("cpu")
-            )
-            if controlnet_hint.shape != shape_nchw:
-                controlnet_hint = controlnet_hint.repeat(
-                    num_images_per_prompt, 1, 1, 1
-                )
-            return controlnet_hint
-        else:
-            raise ValueError(
-                f"Acceptble shape of `stencil` are any of ({channels}, {height}, {width}),"
-                + f" (1, {channels}, {height}, {width}) or ({num_images_per_prompt}, "
-                + f"{channels}, {height}, {width}) but is {controlnet_hint.shape}"
-            )
-    elif isinstance(controlnet_hint, np.ndarray):
-        # np.ndarray: acceptable shape is any of hw, hwc, bhwc(b==1) or bhwc(b==num_images_per_promot)
-        # hwc is opencv compatible image format. Color channel must be BGR Format.
-        if controlnet_hint.shape == (height, width):
-            controlnet_hint = np.repeat(
-                controlnet_hint[:, :, np.newaxis], channels, axis=2
-            )  # hw -> hwc(c==3)
-        shape_hwc = (height, width, channels)
-        shape_bhwc = (1, height, width, channels)
-        shape_nhwc = (num_images_per_prompt, height, width, channels)
-        if controlnet_hint.shape in [shape_hwc, shape_bhwc, shape_nhwc]:
-            controlnet_hint = torch.from_numpy(controlnet_hint.copy())
-            controlnet_hint = controlnet_hint.to(
-                dtype=dtype, device=torch.device("cpu")
-            )
-            controlnet_hint /= 255.0
-            if controlnet_hint.shape != shape_nhwc:
-                controlnet_hint = controlnet_hint.repeat(
-                    num_images_per_prompt, 1, 1, 1
-                )
-            controlnet_hint = controlnet_hint.permute(
-                0, 3, 1, 2
-            )  # b h w c -> b c h w
-            return controlnet_hint
-        else:
-            raise ValueError(
-                f"Acceptble shape of `stencil` are any of ({width}, {channels}), "
-                + f"({height}, {width}, {channels}), "
-                + f"(1, {height}, {width}, {channels}) or "
-                + f"({num_images_per_prompt}, {channels}, {height}, {width}) but is {controlnet_hint.shape}"
-            )
-    elif isinstance(controlnet_hint, Image.Image):
-        if controlnet_hint.size == (width, height):
-            controlnet_hint = controlnet_hint.convert(
-                "RGB"
-            )  # make sure 3 channel RGB format
-            controlnet_hint = np.array(controlnet_hint)  # to numpy
-            controlnet_hint = controlnet_hint[:, :, ::-1]  # RGB -> BGR
-            return controlnet_hint_shaping(
-                controlnet_hint, height, width, num_images_per_prompt
-            )
-        else:
-            raise ValueError(
-                f"Acceptable image size of `stencil` is ({width}, {height}) but is {controlnet_hint.size}"
-            )
-    else:
-        raise ValueError(
-            f"Acceptable type of `stencil` are any of torch.Tensor, np.ndarray, PIL.Image.Image but is {type(controlnet_hint)}"
-        )
-
-
-def controlnet_hint_conversion(
-    image, use_stencil, height, width, dtype, num_images_per_prompt=1
-):
-    controlnet_hint = None
-    match use_stencil:
-        case "canny":
-            print("Detecting edge with canny")
-            controlnet_hint = hint_canny(image, width)
-        case _:
-            return None
-    controlnet_hint = controlnet_hint_shaping(
-        controlnet_hint, height, width, dtype, num_images_per_prompt
-    )
-    return controlnet_hint
-
-
-stencil_to_model_id_map = {
-    "canny": "lllyasviel/sd-controlnet-canny",
-    "depth": "lllyasviel/sd-controlnet-depth",
-    "hed": "lllyasviel/sd-controlnet-hed",
-    "mlsd": "lllyasviel/sd-controlnet-mlsd",
-    "normal": "lllyasviel/sd-controlnet-normal",
-    "openpose": "lllyasviel/sd-controlnet-openpose",
-    "scribble": "lllyasviel/sd-controlnet-scribble",
-    "seg": "lllyasviel/sd-controlnet-seg",
-}
-
-
-def get_stencil_model_id(use_stencil):
-    if use_stencil in stencil_to_model_id_map:
-        return stencil_to_model_id_map[use_stencil]
-    return None
-
-
-# Stencil 1. Canny
-def hint_canny(
-    image: Image.Image,
-    width=512,
-    height=512,
-    low_threshold=100,
-    high_threshold=200,
-):
-    with torch.no_grad():
-        input_image = np.array(image)
-        image_resolution = width
-
-        img = resize_image(HWC3(input_image), image_resolution)
-
-        if not "canny" in stencil:
-            stencil["canny"] = CannyDetector()
-        detected_map = stencil["canny"](img, low_threshold, high_threshold)
-        detected_map = HWC3(detected_map)
-        return detected_map
--- a/apps/stable_diffusion/src/utils/utils.py
+++ b/apps/stable_diffusion/src/utils/utils.py
@@ -1,13 +1,9 @@
 import os
 import gc
-import json
-import re
-from PIL import PngImagePlugin
-from datetime import datetime as dt
-from csv import DictWriter
 from pathlib import Path
 import numpy as np
 from random import randint
+import tempfile
 from shark.shark_inference import SharkInference
 from shark.shark_importer import import_with_fx
 from shark.iree_utils.vulkan_utils import (
@@ -18,26 +14,26 @@ from shark.iree_utils.gpu_utils import get_cuda_sm_cc
 from apps.stable_diffusion.src.utils.stable_args import args
 from apps.stable_diffusion.src.utils.resources import opt_flags
 from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
-import sys
+import sys, functools, operator
 from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
    load_pipeline_from_original_stable_diffusion_ckpt,
 )


-def get_extended_name(model_name):
-    device = args.device.split("://", 1)[0]
-    extended_name = "{}_{}".format(model_name, device)
-    return extended_name
-
-
 def get_vmfb_path_name(model_name):
-    vmfb_path = os.path.join(os.getcwd(), model_name + ".vmfb")
-    return vmfb_path
+    """Return [vmfb_path, extended_name] for `model_name` on args.device.
+
+    extended_name is "<model_name>_<device>", with any "://" in the device
+    string turned into "-"; vmfb_path is that name + ".vmfb" under the cwd.
+    """
+    device_tag = args.device.replace("://", "-")
+    extended_name = f"{model_name}_{device_tag}"
+    return [os.path.join(os.getcwd(), extended_name + ".vmfb"), extended_name]


 def _compile_module(shark_module, model_name, extra_args=[]):
    if args.load_vmfb or args.save_vmfb:
-        vmfb_path = get_vmfb_path_name(model_name)
+        [vmfb_path, extended_name] = get_vmfb_path_name(model_name)
        if args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb:
            print(f"loading existing vmfb from: {vmfb_path}")
            shark_module.load_module(vmfb_path, extra_args=extra_args)
@@ -51,7 +47,7 @@ def _compile_module(shark_module, model_name, extra_args=[]):
                    )
                )
            path = shark_module.save_module(
-                os.getcwd(), model_name, extra_args
+                os.getcwd(), extended_name, extra_args
            )
            shark_module.load_module(path, extra_args=extra_args)
    else:
@@ -61,13 +57,11 @@ def _compile_module(shark_module, model_name, extra_args=[]):

 # Downloads the model from shark_tank and returns the shark_module.
 def get_shark_model(tank_url, model_name, extra_args=[]):
+    from shark.shark_downloader import download_model
    from shark.parser import shark_args

    # Set local shark_tank cache directory.
    shark_args.local_tank_cache = args.local_tank_cache
-
-    from shark.shark_downloader import download_model
-
    if "cuda" in args.device:
        shark_args.enable_tf32 = True

@@ -90,6 +84,9 @@ def compile_through_fx(
    is_f16=False,
    f16_input_mask=None,
    use_tuned=False,
+    save_dir=tempfile.gettempdir(),
+    debug=False,
+    generate_vmfb=True,
    extra_args=[],
 ):
    from shark.parser import shark_args
@@ -97,25 +94,40 @@ def compile_through_fx(
    if "cuda" in args.device:
        shark_args.enable_tf32 = True

-    mlir_module, func_name = import_with_fx(
-        model, inputs, is_f16, f16_input_mask
-    )
-
-    if use_tuned:
-        if "vae" in model_name.split("_")[0]:
-            args.annotation_model = "vae"
-        mlir_module = sd_model_annotation(mlir_module, model_name)
-
-    shark_module = SharkInference(
+    (
        mlir_module,
-        device=args.device,
-        mlir_dialect="linalg",
+        func_name,
+    ) = import_with_fx(
+        model=model,
+        inputs=inputs,
+        is_f16=is_f16,
+        f16_input_mask=f16_input_mask,
+        debug=debug,
+        model_name=model_name,
+        save_dir=save_dir,
    )
+    if use_tuned:
+        tuned_model_path = f"{args.annotation_output}/{model_name}_torch.mlir"
+        if not os.path.exists(tuned_model_path):
+            if "vae" in model_name.split("_")[0]:
+                args.annotation_model = "vae"

-    del mlir_module
-    gc.collect()
+            tuned_model, tuned_model_path = sd_model_annotation(
+                mlir_module, model_name
+            )
+            del mlir_module, tuned_model
+            gc.collect()

-    return _compile_module(shark_module, model_name, extra_args)
+        with open(tuned_model_path, "rb") as f:
+            mlir_module = f.read()
+            f.close()
+    if generate_vmfb:
+        shark_module = SharkInference(
+            mlir_module,
+            device=args.device,
+            mlir_dialect="linalg",
+        )
+        return _compile_module(shark_module, model_name, extra_args)


 def set_iree_runtime_flags():
@@ -235,15 +247,10 @@ def set_init_device_flags():
        args.max_length = 64

    # Use tuned models in the case of fp16, vulkan rdna3 or cuda sm devices.
-    if args.ckpt_loc != "":
-        base_model_id = fetch_and_update_base_model_id(args.ckpt_loc)
-    else:
-        base_model_id = fetch_and_update_base_model_id(args.hf_model_id)
-        if base_model_id == "":
-            base_model_id = args.hf_model_id
-
    if (
-        args.precision != "fp16"
+        args.hf_model_id == "prompthero/openjourney"
+        or args.ckpt_loc != ""
+        or args.precision != "fp16"
        or args.height != 512
        or args.width != 512
        or args.batch_size != 1
@@ -251,26 +258,17 @@ def set_init_device_flags():
    ):
        args.use_tuned = False

-    elif base_model_id not in [
-        "Linaqruf/anything-v3.0",
-        "dreamlike-art/dreamlike-diffusion-1.0",
-        "prompthero/openjourney",
-        "wavymulder/Analog-Diffusion",
-        "stabilityai/stable-diffusion-2-1",
-        "stabilityai/stable-diffusion-2-1-base",
-        "CompVis/stable-diffusion-v1-4",
-        "runwayml/stable-diffusion-v1-5",
-        "runwayml/stable-diffusion-inpainting",
-        "stabilityai/stable-diffusion-2-inpainting",
-    ]:
-        args.use_tuned = False
-
-    elif "vulkan" in args.device and not any(
-        x in args.iree_vulkan_target_triple for x in ["rdna2", "rdna3"]
+    elif (
+        "vulkan" in args.device
+        and "rdna3" not in args.iree_vulkan_target_triple
    ):
        args.use_tuned = False

-    elif "cuda" in args.device and get_cuda_sm_cc() not in ["sm_80", "sm_89"]:
+    elif "cuda" in args.device and get_cuda_sm_cc() not in [
+        "sm_80",
+        "sm_84",
+        "sm_86",
+    ]:
        args.use_tuned = False

    elif args.use_base_vae and args.hf_model_id not in [
@@ -280,7 +278,7 @@ def set_init_device_flags():
        args.use_tuned = False

    if args.use_tuned:
-        print(f"Using tuned models for {base_model_id}/fp16/{args.device}.")
+        print(f"Using tuned models for {args.hf_model_id}/fp16/{args.device}.")
    else:
        print("Tuned models are currently not supported for this setting.")

@@ -302,27 +300,6 @@ def set_init_device_flags():
    elif args.height != 512 or args.width != 512 or args.batch_size != 1:
        args.import_mlir = True

-    elif args.use_tuned and args.hf_model_id in [
-        "dreamlike-art/dreamlike-diffusion-1.0",
-        "prompthero/openjourney",
-        "stabilityai/stable-diffusion-2-1",
-    ]:
-        args.import_mlir = True
-
-    elif (
-        args.use_tuned
-        and "vulkan" in args.device
-        and "rdna2" in args.iree_vulkan_target_triple
-    ):
-        args.import_mlir = True
-
-    elif (
-        args.use_tuned
-        and "cuda" in args.device
-        and get_cuda_sm_cc() == "sm_89"
-    ):
-        args.import_mlir = True
-

 # Utility to get list of devices available.
 def get_available_devices():
@@ -397,11 +374,6 @@ def get_opt_flags(model, precision="fp16"):
    return iree_flags


-def get_path_stem(path):
-    path = Path(path)
-    return path.stem
-
-
 def get_path_to_diffusers_checkpoint(custom_weights):
    path = Path(custom_weights)
    diffusers_path = path.parent.absolute()
@@ -412,7 +384,7 @@ def get_path_to_diffusers_checkpoint(custom_weights):
    return path_to_diffusers


-def preprocessCKPT(custom_weights, is_inpaint=False):
+def preprocessCKPT(custom_weights):
    path_to_diffusers = get_path_to_diffusers_checkpoint(custom_weights)
    if next(Path(path_to_diffusers).iterdir(), None):
        print("Checkpoint already loaded at : ", path_to_diffusers)
@@ -433,20 +405,17 @@ def preprocessCKPT(custom_weights, is_inpaint=False):
    print(
        "Loading diffusers' pipeline from original stable diffusion checkpoint"
    )
-    num_in_channels = 9 if is_inpaint else 4
    pipe = load_pipeline_from_original_stable_diffusion_ckpt(
        checkpoint_path=custom_weights,
        extract_ema=extract_ema,
        from_safetensors=from_safetensors,
-        num_in_channels=num_in_channels,
    )
    pipe.save_pretrained(path_to_diffusers)
    print("Loading complete")


 def load_vmfb(vmfb_path, model, precision):
-    model = "vae" if "base_vae" in model or "vae_encode" in model else model
-    model = "unet" if "stencil" in model else model
+    model = "vae" if "base_vae" in model else model
    precision = "fp32" if "clip" in model else precision
    extra_args = get_opt_flags(model, precision)
    shark_module = SharkInference(mlir_module=None, device=args.device)
@@ -454,60 +423,30 @@ def load_vmfb(vmfb_path, model, precision):
    return shark_module


-# This utility returns vmfbs of Clip, Unet, Vae and Vae_encode, in case all of them
+# This utility returns vmfbs of Clip, Unet and Vae, in case all three of them
 # are present; deletes them otherwise.
-def fetch_or_delete_vmfbs(extended_model_name, precision="fp32"):
+def fetch_or_delete_vmfbs(basic_model_name, use_base_vae, precision="fp32"):
+    model_name = ["clip", "unet", "base_vae" if use_base_vae else "vae"]
    vmfb_path = [
-        get_vmfb_path_name(extended_model_name[model])
-        for model in extended_model_name
+        get_vmfb_path_name(model + basic_model_name)[0] for model in model_name
    ]
-    number_of_vmfbs = len(vmfb_path)
    vmfb_present = [os.path.isfile(vmfb) for vmfb in vmfb_path]
-    all_vmfb_present = True
-    compiled_models = [None] * number_of_vmfbs
-
-    for i in range(number_of_vmfbs):
-        all_vmfb_present = all_vmfb_present and vmfb_present[i]
-
+    all_vmfb_present = functools.reduce(operator.__and__, vmfb_present)
+    compiled_models = [None] * 3
    # We need to delete vmfbs only if some of the models were compiled.
    if not all_vmfb_present:
-        for i in range(number_of_vmfbs):
+        for i in range(len(vmfb_path)):
            if vmfb_present[i]:
                os.remove(vmfb_path[i])
                print("Deleted: ", vmfb_path[i])
    else:
-        model_name = [model for model in extended_model_name.keys()]
-        for i in range(number_of_vmfbs):
+        for i in range(len(vmfb_path)):
            compiled_models[i] = load_vmfb(
                vmfb_path[i], model_name[i], precision
            )
    return compiled_models


-# `fetch_and_update_base_model_id` is a resource utility function which
-# helps maintaining mapping of the model to run with its base model.
-# If `base_model` is "", then this function tries to fetch the base model
-# info for the `model_to_run`.
-def fetch_and_update_base_model_id(model_to_run, base_model=""):
-    variants_path = os.path.join(os.getcwd(), "variants.json")
-    data = {model_to_run: base_model}
-    json_data = {}
-    if os.path.exists(variants_path):
-        with open(variants_path, "r", encoding="utf-8") as jsonFile:
-            json_data = json.load(jsonFile)
-            # Return with base_model's info if base_model is "".
-            if base_model == "":
-                if model_to_run in json_data:
-                    base_model = json_data[model_to_run]
-                return base_model
-    elif base_model == "":
-        return base_model
-    # Update JSON data to contain an entry mapping model_to_run with base_model.
-    json_data.update(data)
-    with open(variants_path, "w", encoding="utf-8") as jsonFile:
-        json.dump(json_data, jsonFile)
-
-
 # Generate and return a new seed if the provided one is not in the supported range (including -1)
 def sanitize_seed(seed):
    uint32_info = np.iinfo(np.uint32)
@@ -515,97 +454,3 @@ def sanitize_seed(seed):
    if seed < uint32_min or seed >= uint32_max:
        seed = randint(uint32_min, uint32_max)
    return seed
-
-
-# clear all the cached objects to recompile cleanly.
-def clear_all():
-    print("CLEARING ALL, EXPECT SEVERAL MINUTES TO RECOMPILE")
-    from glob import glob
-    import shutil
-
-    vmfbs = glob(os.path.join(os.getcwd(), "*.vmfb"))
-    for vmfb in vmfbs:
-        if os.path.exists(vmfb):
-            os.remove(vmfb)
-    # Temporary workaround of deleting yaml files to incorporate diffusers' pipeline.
-    # TODO: Remove this once we have better weight updation logic.
-    inference_yaml = ["v2-inference-v.yaml", "v1-inference.yaml"]
-    for yaml in inference_yaml:
-        if os.path.exists(yaml):
-            os.remove(yaml)
-    home = os.path.expanduser("~")
-    if os.name == "nt":  # Windows
-        appdata = os.getenv("LOCALAPPDATA")
-        shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
-        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
-    elif os.name == "unix":
-        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
-        shutil.rmtree(os.path.join(home, ".local/shark_tank"))
-
-
-# save output images and the inputs corresponding to it.
-def save_output_img(output_img, img_seed, extra_info={}):
-    output_path = args.output_dir if args.output_dir else Path.cwd()
-    generated_imgs_path = Path(
-        output_path, "generated_imgs", dt.now().strftime("%Y%m%d")
-    )
-    generated_imgs_path.mkdir(parents=True, exist_ok=True)
-    csv_path = Path(generated_imgs_path, "imgs_details.csv")
-
-    prompt_slice = re.sub("[^a-zA-Z0-9]", "_", args.prompts[0][:15])
-    out_img_name = (
-        f"{prompt_slice}_{img_seed}_{dt.now().strftime('%y%m%d_%H%M%S')}"
-    )
-
-    img_model = args.hf_model_id
-    if args.ckpt_loc:
-        img_model = os.path.basename(args.ckpt_loc)
-
-    if args.output_img_format == "jpg":
-        out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
-        output_img.save(out_img_path, quality=95, subsampling=0)
-    else:
-        out_img_path = Path(generated_imgs_path, f"{out_img_name}.png")
-        pngInfo = PngImagePlugin.PngInfo()
-
-        if args.write_metadata_to_png:
-            pngInfo.add_text(
-                "parameters",
-                f"{args.prompts[0]}\nNegative prompt: {args.negative_prompts[0]}\nSteps:{args.steps}, Sampler: {args.scheduler}, CFG scale: {args.guidance_scale}, Seed: {img_seed}, Size: {args.width}x{args.height}, Model: {img_model}",
-            )
-
-        output_img.save(out_img_path, "PNG", pnginfo=pngInfo)
-
-        if args.output_img_format not in ["png", "jpg"]:
-            print(
-                f"[ERROR] Format {args.output_img_format} is not supported yet."
-                "Image saved as png instead. Supported formats: png / jpg"
-            )
-
-    new_entry = {
-        "VARIANT": img_model,
-        "SCHEDULER": args.scheduler,
-        "PROMPT": args.prompts[0],
-        "NEG_PROMPT": args.negative_prompts[0],
-        "SEED": img_seed,
-        "CFG_SCALE": args.guidance_scale,
-        "PRECISION": args.precision,
-        "STEPS": args.steps,
-        "HEIGHT": args.height,
-        "WIDTH": args.width,
-        "MAX_LENGTH": args.max_length,
-        "OUTPUT": out_img_path,
-    }
-
-    new_entry.update(extra_info)
-
-    with open(csv_path, "a", encoding="utf-8") as csv_obj:
-        dictwriter_obj = DictWriter(csv_obj, fieldnames=list(new_entry.keys()))
-        dictwriter_obj.writerow(new_entry)
-        csv_obj.close()
-
-    if args.save_metadata_to_json:
-        del new_entry["OUTPUT"]
-        json_path = Path(generated_imgs_path, f"{out_img_name}.json")
-        with open(json_path, "w") as f:
-            json.dump(new_entry, f, indent=4)
--- a/apps/stable_diffusion/stable_diffusion_amd.md
+++ b/apps/stable_diffusion/stable_diffusion_amd.md
@@ -0,0 +1,70 @@
+# Stable Diffusion optimized for AMD RDNA2/RDNA3 GPUs
+
+Before you start, please be aware that this is beta software that relies on a special AMD driver. Like all StableDiffusion GUIs published so far, you need some technical expertise to set it up. We apologize in advance if you bump into issues. If that happens, please don't hesitate to ask our Discord community for help! Please be assured that we (Nod and AMD) are working hard to improve the user experience in coming months.
+If it works well for you, please "star" the following GitHub projects... this is one of the best ways to help and spread the word!
+
+* https://github.com/nod-ai/SHARK
+* https://github.com/iree-org/iree
+
+## Install these specific AMD drivers (the latest AMD drivers may not have all the fixes).
+
+### AMD KB Drivers for RDNA2 and RDNA3:
+
+*AMD Software: Adrenalin Edition 22.11.1 for MLIR/IREE Driver Version 22.20.29.09 for Windows® 10 and Windows® 11 (Windows Driver Store Version 31.0.12029.9003)*
+
+First, for RDNA2 users, download this special driver to a folder of your choice. We recommend you keep the installation files around, since you may need to re-install the driver later if Windows Update decides to overwrite it:
+https://www.amd.com/en/support/kb/release-notes/rn-rad-win-22-11-1-mlir-iree
+
+For RDNA3, the latest driver 23.1.2 supports MLIR/IREE as well: https://www.amd.com/en/support/kb/release-notes/rn-rad-win-23-1-2-kb
+
+KNOWN ISSUES with this special AMD driver:
+* `Windows Update` may (depending how it's configured) automatically install a new official AMD driver that overwrites this IREE-specific driver. If Stable Diffusion used to work, then a few days later, it slows down a lot or produces incorrect results (e.g. black images), this may be the cause. To fix this problem, please check the installed driver version, and re-install the special driver if needed. (TODO: document how to prevent this `Windows Update` behavior!)
+* Some people using this special driver experience mouse pointer accuracy issues, especially if using a larger-than-default mouse pointer. The clicked point isn't centered properly. One possible work-around is to reset the pointer size to "1" in "Change pointer size and color".
+
+## Installation
+
+Download the latest Windows SHARK SD binary [492 here](https://github.com/nod-ai/SHARK/releases/download/20230203.492/shark_sd_20230203_492.exe) to a folder of your choice. If you want nightly builds, you can look for them on the GitHub releases page.
+
+Notes:
+* We recommend that you download this EXE to a new folder whenever you download a new EXE version. If you download it to the same folder as a previous install, you must delete the old `*.vmfb` files. Those contain Vulkan dispatches compiled from MLIR which can be outdated if you run a new EXE from the same folder. You can use the `--clear_all` flag once to clean all the old files.
+* If you recently updated the driver or this binary (EXE file), we recommend you:
+  * clear all the local artifacts with `--clear_all` OR 
+  * clear the Vulkan shader cache: For Windows users this can be done by clearing the contents of `C:\Users\%username%\AppData\Local\AMD\VkCache\`. On Linux the same cache is typically located at `~/.cache/AMD/VkCache/`.
+  * clear the `huggingface` cache. In Windows, this is `C:\Users\%username%\.cache\huggingface`.
+
+## Running
+
+* Open a Command Prompt or Powershell terminal, change folder (`cd`) to the .exe folder. Then run the EXE from the command prompt. That way, if an error occurs, you'll be able to cut-and-paste it to ask for help. (if it always works for you without error, you may simply double-click the EXE to start the web browser)
+* The first run may take about 10-15 minutes when the models are downloaded and compiled. Your patience is appreciated. The download could be about 5GB.
+* If successful, you will likely see a Windows Defender message asking you to give permission to open a web server port. Accept it.
+* Open a browser to access the Stable Diffusion web server. By default, the port is 8080, so you can go to http://localhost:8080/?__theme=dark.
+
+## Stopping
+
+* Select the command prompt that's running the EXE. Press CTRL-C and wait a moment. The application should stop. 
+* Please make sure to do the above step before you attempt to update the EXE to a new version.
+
+# Results
+
+<img width="1607" alt="webui" src="https://user-images.githubusercontent.com/74956/204939260-b8308bc2-8dc4-47f6-9ac0-f60b66edab99.png">
+
+
+Here are some samples generated:
+
+![tajmahal, snow, sunflowers, oil on canvas_0](https://user-images.githubusercontent.com/74956/204934186-141f7e43-6eb2-4e89-a99c-4704d20444b3.jpg)
+
+![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
+
+
+The output on a 7900XTX would look like:
+
+```shell 
+Stats for run 0:
+Average step time: 47.19188690185547ms/it
+Clip Inference time (ms) = 109.531
+VAE Inference time (ms): 78.590
+
+Total image generation time: 2.5788655281066895sec
+```
+
+Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware. 
--- a/apps/stable_diffusion/web/ui/css/sd_dark_theme.css
+++ b/apps/stable_diffusion/web/ui/css/sd_dark_theme.css
@@ -144,30 +144,19 @@
    --dataset-table-border-hover: var(--color-grey-800);
 }

-/* SHARK theme */
-body {
+/* SHARK theme customization */
+
+.gradio-container {
    background-color: var(--color-background-primary);
 }

-/* display in full width for desktop devices */
-@media (min-width: 1536px)
-{
-    .gradio-container {
-        max-width: var(--size-full) !important;
-    }
-}
-
-.gradio-container .contain {
-    padding: 0 var(--size-4) !important;
-}
-
 .container {
    background-color: black !important;
-    padding-top: var(--size-5) !important;
+    padding-top: 20px !important;
 }

 #ui_title {
-    padding: var(--size-2) 0 0 var(--size-1);
+    padding: 10px !important;
 }

 #top_logo {
@@ -176,6 +165,15 @@ body {
    border: 0;
 }

+#demo_title {
+    background-color: var(--color-background-primary);
+    border-radius: 0 !important;
+    border: 0;
+    padding-top: 15px;
+    padding-bottom: 0px;
+    width: 350px !important;
+}
+
 #demo_title_outer {
    border-radius: 0;
 }
@@ -184,7 +182,7 @@ body {
    border-radius: 0 !important
 }

-#prompt_box textarea, #negative_prompt_box textarea {
+#prompt_box textarea {
    background-color: var(--color-background-primary) !important;
 }

@@ -198,7 +196,7 @@ body {

 #ui_body {
    background-color: var(--color-background-secondary) !important;
-    padding: var(--size-2) !important;
+    padding: 10px !important;
    border-radius: 0.5em !important;
 }

@@ -209,13 +207,3 @@ body {
 footer {
    display: none !important;
 }
-
-#gallery + div {
-    border-radius: 0 !important;
-}
-
-/* Prevent progress bar to block gallery navigation while building images (Gradio V3.19.0) */
-#gallery .wrap.default {
-    pointer-events: none;
-}
-
--- a/apps/stable_diffusion/web/gradio/img2img_ui.py
+++ b/apps/stable_diffusion/web/gradio/img2img_ui.py
--- a/apps/stable_diffusion/web/gradio/txt2img_ui.py
+++ b/apps/stable_diffusion/web/gradio/txt2img_ui.py
--- a/apps/stable_diffusion/web/index.py
+++ b/apps/stable_diffusion/web/index.py
@@ -1,21 +1,14 @@
 import os
 import sys
+from pathlib import Path
+import glob
+
+if "AMD_ENABLE_LLPC" not in os.environ:
+    os.environ["AMD_ENABLE_LLPC"] = "1"

 if sys.platform == "darwin":
    os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"

-import gradio as gr
-from apps.stable_diffusion.src import args, clear_all
-from apps.stable_diffusion.web.utils.gradio_configs import (
-    clear_gradio_tmp_imgs_folder,
-)
-
-# clear all gradio tmp images from the last session
-clear_gradio_tmp_imgs_folder()
-
-if args.clear_all:
-    clear_all()
-

 def resource_path(relative_path):
    """Get absolute path to resource, works for dev and for PyInstaller"""
@@ -25,114 +18,245 @@ def resource_path(relative_path):
    return os.path.join(base_path, relative_path)


-dark_theme = resource_path("ui/css/sd_dark_theme.css")
-
-from apps.stable_diffusion.web.ui import (
-    txt2img_web,
-    txt2img_gallery,
-    txt2img_sendto_img2img,
-    txt2img_sendto_inpaint,
-    txt2img_sendto_outpaint,
-    img2img_web,
-    img2img_gallery,
-    img2img_init_image,
-    img2img_sendto_inpaint,
-    img2img_sendto_outpaint,
-    inpaint_web,
-    inpaint_gallery,
-    inpaint_init_image,
-    inpaint_sendto_img2img,
-    inpaint_sendto_outpaint,
-    outpaint_web,
-    outpaint_gallery,
-    outpaint_init_image,
-    outpaint_sendto_img2img,
-    outpaint_sendto_inpaint,
+import gradio as gr
+from PIL import Image
+from apps.stable_diffusion.src import (
+    prompt_examples,
+    args,
+    get_available_devices,
 )
+from apps.stable_diffusion.scripts import txt2img_inf
+
+nodlogo_loc = resource_path("logos/nod-logo.png")
+sdlogo_loc = resource_path("logos/sd-demo-logo.png")


-def register_button_click(button, selectedid, inputs, outputs):
-    button.click(
-        lambda x: (
-            x[0]["name"] if len(x) != 0 else None,
-            gr.Tabs.update(selected=selectedid),
-        ),
-        inputs,
-        outputs,
-    )
+demo_css = resource_path("css/sd_dark_theme.css")


-with gr.Blocks(
-    css=dark_theme, analytics_enabled=False, title="Stable Diffusion"
-) as sd_web:
-    with gr.Tabs() as tabs:
-        with gr.TabItem(label="Text-to-Image", id=0):
-            txt2img_web.render()
-        with gr.TabItem(label="Image-to-Image", id=1):
-            img2img_web.render()
-        with gr.TabItem(label="Inpainting", id=2):
-            inpaint_web.render()
-        with gr.TabItem(label="Outpainting", id=3):
-            outpaint_web.render()
+with gr.Blocks(title="Stable Diffusion", css=demo_css) as shark_web:
+    with gr.Row(elem_id="ui_title"):
+        nod_logo = Image.open(nodlogo_loc)
+        logo2 = Image.open(sdlogo_loc)
+        with gr.Row():
+            with gr.Column(scale=1, elem_id="demo_title_outer"):
+                gr.Image(
+                    value=nod_logo,
+                    show_label=False,
+                    interactive=False,
+                    elem_id="top_logo",
+                ).style(width=150, height=100)
+            with gr.Column(scale=5, elem_id="demo_title_outer"):
+                gr.Image(
+                    value=logo2,
+                    show_label=False,
+                    interactive=False,
+                    elem_id="demo_title",
+                ).style(width=150, height=100)

-    register_button_click(
-        txt2img_sendto_img2img,
-        1,
-        [txt2img_gallery],
-        [img2img_init_image, tabs],
-    )
-    register_button_click(
-        txt2img_sendto_inpaint,
-        2,
-        [txt2img_gallery],
-        [inpaint_init_image, tabs],
-    )
-    register_button_click(
-        txt2img_sendto_outpaint,
-        3,
-        [txt2img_gallery],
-        [outpaint_init_image, tabs],
-    )
-    register_button_click(
-        img2img_sendto_inpaint,
-        2,
-        [img2img_gallery],
-        [inpaint_init_image, tabs],
-    )
-    register_button_click(
-        img2img_sendto_outpaint,
-        3,
-        [img2img_gallery],
-        [outpaint_init_image, tabs],
-    )
-    register_button_click(
-        inpaint_sendto_img2img,
-        1,
-        [inpaint_gallery],
-        [img2img_init_image, tabs],
-    )
-    register_button_click(
-        inpaint_sendto_outpaint,
-        3,
-        [inpaint_gallery],
-        [outpaint_init_image, tabs],
-    )
-    register_button_click(
-        outpaint_sendto_img2img,
-        1,
-        [outpaint_gallery],
-        [img2img_init_image, tabs],
-    )
-    register_button_click(
-        outpaint_sendto_inpaint,
-        2,
-        [outpaint_gallery],
-        [inpaint_init_image, tabs],
-    )
+    with gr.Row(elem_id="ui_body"):
+        with gr.Row():
+            with gr.Column(scale=1, min_width=600):
+                with gr.Row():
+                    ckpt_path = (
+                        Path(args.ckpt_dir)
+                        if args.ckpt_dir
+                        else Path(Path.cwd(), "models")
+                    )
+                    ckpt_path.mkdir(parents=True, exist_ok=True)
+                    types = (
+                        "*.ckpt",
+                        "*.safetensors",
+                    )  # the tuple of file types
+                    ckpt_files = ["None"]
+                    for extn in types:
+                        files = glob.glob(os.path.join(ckpt_path, extn))
+                        ckpt_files.extend(files)
+                    custom_model = gr.Dropdown(
+                        label=f"Models (Custom Model path: {ckpt_path})",
+                        value="None",
+                        choices=ckpt_files
+                        + [
+                            "Linaqruf/anything-v3.0",
+                            "prompthero/openjourney",
+                            "wavymulder/Analog-Diffusion",
+                            "stabilityai/stable-diffusion-2-1",
+                            "stabilityai/stable-diffusion-2-1-base",
+                            "CompVis/stable-diffusion-v1-4",
+                        ],
+                    )
+                    hf_model_id = gr.Textbox(
+                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3",
+                        value="",
+                        label="HuggingFace Model ID",
+                    )

+                with gr.Group(elem_id="prompt_box_outer"):
+                    prompt = gr.Textbox(
+                        label="Prompt",
+                        value="cyberpunk forest by Salvador Dali",
+                        lines=1,
+                        elem_id="prompt_box",
+                    )
+                    negative_prompt = gr.Textbox(
+                        label="Negative Prompt",
+                        value="trees, green",
+                        lines=1,
+                        elem_id="prompt_box",
+                    )
+                with gr.Accordion(label="Advanced Options", open=False):
+                    with gr.Row():
+                        scheduler = gr.Dropdown(
+                            label="Scheduler",
+                            value="SharkEulerDiscrete",
+                            choices=[
+                                "DDIM",
+                                "PNDM",
+                                "LMSDiscrete",
+                                "DPMSolverMultistep",
+                                "EulerDiscrete",
+                                "EulerAncestralDiscrete",
+                                "SharkEulerDiscrete",
+                            ],
+                        )
+                        with gr.Group():
+                            save_metadata_to_png = gr.Checkbox(
+                                label="Save prompt information to PNG",
+                                value=True,
+                                interactive=True,
+                            )
+                            save_metadata_to_json = gr.Checkbox(
+                                label="Save prompt information to JSON file",
+                                value=False,
+                                interactive=True,
+                            )
+                    with gr.Row():
+                        height = gr.Slider(
+                            384, 786, value=512, step=8, label="Height"
+                        )
+                        width = gr.Slider(
+                            384, 786, value=512, step=8, label="Width"
+                        )
+                        precision = gr.Radio(
+                            label="Precision",
+                            value="fp16",
+                            choices=[
+                                "fp16",
+                                "fp32",
+                            ],
+                            visible=False,
+                        )
+                        max_length = gr.Radio(
+                            label="Max Length",
+                            value=64,
+                            choices=[
+                                64,
+                                77,
+                            ],
+                            visible=False,
+                        )
+                    with gr.Row():
+                        steps = gr.Slider(
+                            1, 100, value=50, step=1, label="Steps"
+                        )
+                        guidance_scale = gr.Slider(
+                            0,
+                            50,
+                            value=7.5,
+                            step=0.1,
+                            label="CFG Scale",
+                        )
+                    with gr.Row():
+                        batch_count = gr.Slider(
+                            1,
+                            10,
+                            value=1,
+                            step=1,
+                            label="Batch Count",
+                            interactive=True,
+                        )
+                        batch_size = gr.Slider(
+                            1,
+                            4,
+                            value=1,
+                            step=1,
+                            label="Batch Size",
+                            interactive=True,
+                        )
+                with gr.Row():
+                    seed = gr.Number(value=-1, precision=0, label="Seed")
+                    available_devices = get_available_devices()
+                    device = gr.Dropdown(
+                        label="Device",
+                        value=available_devices[0],
+                        choices=available_devices,
+                    )
+                with gr.Row():
+                    random_seed = gr.Button("Randomize Seed")
+                    random_seed.click(
+                        None,
+                        inputs=[],
+                        outputs=[seed],
+                        _js="() => Math.floor(Math.random() * 4294967295)",
+                    )
+                    stable_diffusion = gr.Button("Generate Image")
+                with gr.Accordion(label="Prompt Examples!", open=False):
+                    ex = gr.Examples(
+                        examples=prompt_examples,
+                        inputs=prompt,
+                        cache_examples=False,
+                        elem_id="prompt_examples",
+                    )

-sd_web.queue()
-sd_web.launch(
+            with gr.Column(scale=1, min_width=600):
+                with gr.Group():
+                    gallery = gr.Gallery(
+                        label="Generated images",
+                        show_label=False,
+                        elem_id="gallery",
+                    ).style(grid=[2], height="auto")
+                    std_output = gr.Textbox(
+                        value="Nothing to show.",
+                        lines=4,
+                        show_label=False,
+                    )
+                output_dir = args.output_dir if args.output_dir else Path.cwd()
+                output_dir = Path(output_dir, "generated_imgs")
+                output_loc = gr.Textbox(
+                    label="Saving Images at",
+                    value=output_dir,
+                    interactive=False,
+                )
+        kwargs = dict(
+            fn=txt2img_inf,
+            inputs=[
+                prompt,
+                negative_prompt,
+                height,
+                width,
+                steps,
+                guidance_scale,
+                seed,
+                batch_count,
+                batch_size,
+                scheduler,
+                custom_model,
+                hf_model_id,
+                precision,
+                device,
+                max_length,
+                save_metadata_to_json,
+                save_metadata_to_png,
+            ],
+            outputs=[gallery, std_output],
+            show_progress=args.progress_bar,
+        )
+
+        prompt.submit(**kwargs)
+        stable_diffusion.click(**kwargs)
+
+shark_web.queue()
+shark_web.launch(
    share=args.share,
    inbrowser=True,
    server_name="0.0.0.0",
--- a/apps/stable_diffusion/web/logos/Nod_logo.png
+++ b/apps/stable_diffusion/web/logos/Nod_logo.png
--- a/apps/stable_diffusion/web/ui/logos/nod-logo.png
+++ b/apps/stable_diffusion/web/ui/logos/nod-logo.png
--- a/apps/stable_diffusion/web/logos/sd-demo-logo.png
+++ b/apps/stable_diffusion/web/logos/sd-demo-logo.png
--- a/apps/stable_diffusion/web/ui/init.py
+++ b/apps/stable_diffusion/web/ui/init.py
@@ -1,28 +0,0 @@
-from apps.stable_diffusion.web.ui.txt2img_ui import (
-    txt2img_web,
-    txt2img_gallery,
-    txt2img_sendto_img2img,
-    txt2img_sendto_inpaint,
-    txt2img_sendto_outpaint,
-)
-from apps.stable_diffusion.web.ui.img2img_ui import (
-    img2img_web,
-    img2img_gallery,
-    img2img_init_image,
-    img2img_sendto_inpaint,
-    img2img_sendto_outpaint,
-)
-from apps.stable_diffusion.web.ui.inpaint_ui import (
-    inpaint_web,
-    inpaint_gallery,
-    inpaint_init_image,
-    inpaint_sendto_img2img,
-    inpaint_sendto_outpaint,
-)
-from apps.stable_diffusion.web.ui.outpaint_ui import (
-    outpaint_web,
-    outpaint_gallery,
-    outpaint_init_image,
-    outpaint_sendto_img2img,
-    outpaint_sendto_inpaint,
-)
--- a/apps/stable_diffusion/web/ui/img2img_ui.py
+++ b/apps/stable_diffusion/web/ui/img2img_ui.py
@@ -1,247 +0,0 @@
-import os
-import sys
-import glob
-from pathlib import Path
-import gradio as gr
-from PIL import Image
-from apps.stable_diffusion.scripts import img2img_inf
-from apps.stable_diffusion.src import args
-from apps.stable_diffusion.web.ui.utils import (
-    available_devices,
-    nodlogo_loc,
-)
-
-
-with gr.Blocks(title="Image-to-Image") as img2img_web:
-    with gr.Row(elem_id="ui_title"):
-        nod_logo = Image.open(nodlogo_loc)
-        with gr.Row():
-            with gr.Column(scale=1, elem_id="demo_title_outer"):
-                gr.Image(
-                    value=nod_logo,
-                    show_label=False,
-                    interactive=False,
-                    elem_id="top_logo",
-                ).style(width=150, height=50)
-    with gr.Row(elem_id="ui_body"):
-        with gr.Row():
-            with gr.Column(scale=1, min_width=600):
-                with gr.Row():
-                    ckpt_path = (
-                        Path(args.ckpt_dir)
-                        if args.ckpt_dir
-                        else Path(Path.cwd(), "models")
-                    )
-                    ckpt_path.mkdir(parents=True, exist_ok=True)
-                    types = (
-                        "*.ckpt",
-                        "*.safetensors",
-                    )  # the tuple of file types
-                    ckpt_files = ["None"]
-                    for extn in types:
-                        files = glob.glob(os.path.join(ckpt_path, extn))
-                        ckpt_files.extend(files)
-                    custom_model = gr.Dropdown(
-                        label=f"Models (Custom Model path: {ckpt_path})",
-                        value=args.ckpt_loc if args.ckpt_loc else "None",
-                        choices=ckpt_files
-                        + [
-                            "Linaqruf/anything-v3.0",
-                            "prompthero/openjourney",
-                            "wavymulder/Analog-Diffusion",
-                            "stabilityai/stable-diffusion-2-1",
-                            "stabilityai/stable-diffusion-2-1-base",
-                            "CompVis/stable-diffusion-v1-4",
-                        ],
-                    )
-                    hf_model_id = gr.Textbox(
-                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3",
-                        value="",
-                        label="HuggingFace Model ID",
-                        lines=3,
-                    )
-
-                with gr.Group(elem_id="prompt_box_outer"):
-                    prompt = gr.Textbox(
-                        label="Prompt",
-                        value=args.prompts[0],
-                        lines=1,
-                        elem_id="prompt_box",
-                    )
-                    negative_prompt = gr.Textbox(
-                        label="Negative Prompt",
-                        value=args.negative_prompts[0],
-                        lines=1,
-                        elem_id="negative_prompt_box",
-                    )
-
-                img2img_init_image = gr.Image(
-                    label="Input Image", type="pil"
-                ).style(height=300)
-
-                with gr.Accordion(label="Stencil Options", open=False):
-                    with gr.Row():
-                        use_stencil = gr.Dropdown(
-                            label="Stencil model",
-                            value="None",
-                            choices=["None", "canny"],
-                        )
-                with gr.Accordion(label="Advanced Options", open=False):
-                    with gr.Row():
-                        scheduler = gr.Dropdown(
-                            label="Scheduler",
-                            value="PNDM",
-                            choices=[
-                                "DDIM",
-                                "PNDM",
-                                "DPMSolverMultistep",
-                                "EulerAncestralDiscrete",
-                            ],
-                        )
-                        with gr.Group():
-                            save_metadata_to_png = gr.Checkbox(
-                                label="Save prompt information to PNG",
-                                value=args.write_metadata_to_png,
-                                interactive=True,
-                            )
-                            save_metadata_to_json = gr.Checkbox(
-                                label="Save prompt information to JSON file",
-                                value=args.save_metadata_to_json,
-                                interactive=True,
-                            )
-                    with gr.Row():
-                        height = gr.Slider(
-                            384, 768, value=args.height, step=8, label="Height"
-                        )
-                        width = gr.Slider(
-                            384, 768, value=args.width, step=8, label="Width"
-                        )
-                        precision = gr.Radio(
-                            label="Precision",
-                            value=args.precision,
-                            choices=[
-                                "fp16",
-                                "fp32",
-                            ],
-                            visible=True,
-                        )
-                        max_length = gr.Radio(
-                            label="Max Length",
-                            value=args.max_length,
-                            choices=[
-                                64,
-                                77,
-                            ],
-                            visible=False,
-                        )
-                    with gr.Row():
-                        steps = gr.Slider(
-                            1, 100, value=args.steps, step=1, label="Steps"
-                        )
-                        strength = gr.Slider(
-                            0,
-                            1,
-                            value=args.strength,
-                            step=0.01,
-                            label="Strength",
-                        )
-                    with gr.Row():
-                        guidance_scale = gr.Slider(
-                            0,
-                            50,
-                            value=args.guidance_scale,
-                            step=0.1,
-                            label="CFG Scale",
-                        )
-                        batch_count = gr.Slider(
-                            1,
-                            100,
-                            value=args.batch_count,
-                            step=1,
-                            label="Batch Count",
-                            interactive=True,
-                        )
-                        batch_size = gr.Slider(
-                            1,
-                            4,
-                            value=args.batch_size,
-                            step=1,
-                            label="Batch Size",
-                            interactive=False,
-                            visible=False,
-                        )
-                with gr.Row():
-                    seed = gr.Number(
-                        value=args.seed, precision=0, label="Seed"
-                    )
-                    device = gr.Dropdown(
-                        label="Device",
-                        value=available_devices[0],
-                        choices=available_devices,
-                    )
-                with gr.Row():
-                    random_seed = gr.Button("Randomize Seed")
-                    random_seed.click(
-                        None,
-                        inputs=[],
-                        outputs=[seed],
-                        _js="() => Math.floor(Math.random() * 4294967295)",
-                    )
-                    stable_diffusion = gr.Button("Generate Image(s)")
-
-            with gr.Column(scale=1, min_width=600):
-                with gr.Group():
-                    img2img_gallery = gr.Gallery(
-                        label="Generated images",
-                        show_label=False,
-                        elem_id="gallery",
-                    ).style(grid=[2])
-                    std_output = gr.Textbox(
-                        value="Nothing to show.",
-                        lines=1,
-                        show_label=False,
-                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
-                with gr.Row():
-                    img2img_sendto_inpaint = gr.Button(value="SendTo Inpaint")
-                    img2img_sendto_outpaint = gr.Button(
-                        value="SendTo Outpaint"
-                    )
-
-        kwargs = dict(
-            fn=img2img_inf,
-            inputs=[
-                prompt,
-                negative_prompt,
-                img2img_init_image,
-                height,
-                width,
-                steps,
-                strength,
-                guidance_scale,
-                seed,
-                batch_count,
-                batch_size,
-                scheduler,
-                custom_model,
-                hf_model_id,
-                precision,
-                device,
-                max_length,
-                use_stencil,
-                save_metadata_to_json,
-                save_metadata_to_png,
-            ],
-            outputs=[img2img_gallery, std_output],
-            show_progress=args.progress_bar,
-        )
-
-        prompt.submit(**kwargs)
-        negative_prompt.submit(**kwargs)
-        stable_diffusion.click(**kwargs)
--- a/apps/stable_diffusion/web/ui/inpaint_ui.py
+++ b/apps/stable_diffusion/web/ui/inpaint_ui.py
@@ -1,246 +0,0 @@
-import os
-import sys
-import glob
-from pathlib import Path
-import gradio as gr
-from PIL import Image
-from apps.stable_diffusion.scripts import inpaint_inf
-from apps.stable_diffusion.src import args
-from apps.stable_diffusion.web.ui.utils import (
-    available_devices,
-    nodlogo_loc,
-)
-
-
-with gr.Blocks(title="Inpainting") as inpaint_web:
-    with gr.Row(elem_id="ui_title"):
-        nod_logo = Image.open(nodlogo_loc)
-        with gr.Row():
-            with gr.Column(scale=1, elem_id="demo_title_outer"):
-                gr.Image(
-                    value=nod_logo,
-                    show_label=False,
-                    interactive=False,
-                    elem_id="top_logo",
-                ).style(width=150, height=50)
-    with gr.Row(elem_id="ui_body"):
-        with gr.Row():
-            with gr.Column(scale=1, min_width=600):
-                with gr.Row():
-                    ckpt_path = (
-                        Path(args.ckpt_dir)
-                        if args.ckpt_dir
-                        else Path(Path.cwd(), "models")
-                    )
-                    ckpt_path.mkdir(parents=True, exist_ok=True)
-                    types = (
-                        "*.ckpt",
-                        "*.safetensors",
-                    )  # the tuple of file types
-                    ckpt_files = ["None"]
-                    for extn in types:
-                        files = glob.glob(os.path.join(ckpt_path, extn))
-                        ckpt_files.extend(files)
-                    custom_model = gr.Dropdown(
-                        label=f"Models (Custom Model path: {ckpt_path})",
-                        value=args.ckpt_loc if args.ckpt_loc else "None",
-                        choices=ckpt_files
-                        + [
-                            "runwayml/stable-diffusion-inpainting",
-                            "stabilityai/stable-diffusion-2-inpainting",
-                        ],
-                    )
-                    hf_model_id = gr.Textbox(
-                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: ghunkins/stable-diffusion-liberty-inpainting",
-                        value="",
-                        label="HuggingFace Model ID",
-                        lines=3,
-                    )
-
-                with gr.Group(elem_id="prompt_box_outer"):
-                    prompt = gr.Textbox(
-                        label="Prompt",
-                        value=args.prompts[0],
-                        lines=1,
-                        elem_id="prompt_box",
-                    )
-                    negative_prompt = gr.Textbox(
-                        label="Negative Prompt",
-                        value=args.negative_prompts[0],
-                        lines=1,
-                        elem_id="negative_prompt_box",
-                    )
-
-                inpaint_init_image = gr.Image(
-                    label="Masked Image",
-                    source="upload",
-                    tool="sketch",
-                    type="pil",
-                ).style(height=350)
-
-                with gr.Accordion(label="Advanced Options", open=False):
-                    with gr.Row():
-                        scheduler = gr.Dropdown(
-                            label="Scheduler",
-                            value="PNDM",
-                            choices=[
-                                "DDIM",
-                                "PNDM",
-                                "DPMSolverMultistep",
-                                "EulerAncestralDiscrete",
-                            ],
-                        )
-                        with gr.Group():
-                            save_metadata_to_png = gr.Checkbox(
-                                label="Save prompt information to PNG",
-                                value=args.write_metadata_to_png,
-                                interactive=True,
-                            )
-                            save_metadata_to_json = gr.Checkbox(
-                                label="Save prompt information to JSON file",
-                                value=args.save_metadata_to_json,
-                                interactive=True,
-                            )
-                    with gr.Row():
-                        height = gr.Slider(
-                            384, 768, value=args.height, step=8, label="Height"
-                        )
-                        width = gr.Slider(
-                            384, 768, value=args.width, step=8, label="Width"
-                        )
-                        precision = gr.Radio(
-                            label="Precision",
-                            value=args.precision,
-                            choices=[
-                                "fp16",
-                                "fp32",
-                            ],
-                            visible=False,
-                        )
-                        max_length = gr.Radio(
-                            label="Max Length",
-                            value=args.max_length,
-                            choices=[
-                                64,
-                                77,
-                            ],
-                            visible=False,
-                        )
-                    with gr.Row():
-                        inpaint_full_res = gr.Radio(
-                            choices=["Whole picture", "Only masked"],
-                            type="index",
-                            value="Whole picture",
-                            label="Inpaint area",
-                        )
-                        inpaint_full_res_padding = gr.Slider(
-                            minimum=0,
-                            maximum=256,
-                            step=4,
-                            value=32,
-                            label="Only masked padding, pixels",
-                        )
-                    with gr.Row():
-                        steps = gr.Slider(
-                            1, 100, value=args.steps, step=1, label="Steps"
-                        )
-                    with gr.Row():
-                        guidance_scale = gr.Slider(
-                            0,
-                            50,
-                            value=args.guidance_scale,
-                            step=0.1,
-                            label="CFG Scale",
-                        )
-                        batch_count = gr.Slider(
-                            1,
-                            100,
-                            value=args.batch_count,
-                            step=1,
-                            label="Batch Count",
-                            interactive=True,
-                        )
-                        batch_size = gr.Slider(
-                            1,
-                            4,
-                            value=args.batch_size,
-                            step=1,
-                            label="Batch Size",
-                            interactive=False,
-                            visible=False,
-                        )
-                with gr.Row():
-                    seed = gr.Number(
-                        value=args.seed, precision=0, label="Seed"
-                    )
-                    device = gr.Dropdown(
-                        label="Device",
-                        value=available_devices[0],
-                        choices=available_devices,
-                    )
-                with gr.Row():
-                    random_seed = gr.Button("Randomize Seed")
-                    random_seed.click(
-                        None,
-                        inputs=[],
-                        outputs=[seed],
-                        _js="() => Math.floor(Math.random() * 4294967295)",
-                    )
-                    stable_diffusion = gr.Button("Generate Image(s)")
-
-            with gr.Column(scale=1, min_width=600):
-                with gr.Group():
-                    inpaint_gallery = gr.Gallery(
-                        label="Generated images",
-                        show_label=False,
-                        elem_id="gallery",
-                    ).style(grid=[2])
-                    std_output = gr.Textbox(
-                        value="Nothing to show.",
-                        lines=1,
-                        show_label=False,
-                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
-                with gr.Row():
-                    inpaint_sendto_img2img = gr.Button(value="SendTo Img2Img")
-                    inpaint_sendto_outpaint = gr.Button(
-                        value="SendTo Outpaint"
-                    )
-
-        kwargs = dict(
-            fn=inpaint_inf,
-            inputs=[
-                prompt,
-                negative_prompt,
-                inpaint_init_image,
-                height,
-                width,
-                inpaint_full_res,
-                inpaint_full_res_padding,
-                steps,
-                guidance_scale,
-                seed,
-                batch_count,
-                batch_size,
-                scheduler,
-                custom_model,
-                hf_model_id,
-                precision,
-                device,
-                max_length,
-                save_metadata_to_json,
-                save_metadata_to_png,
-            ],
-            outputs=[inpaint_gallery, std_output],
-            show_progress=args.progress_bar,
-        )
-
-        prompt.submit(**kwargs)
-        negative_prompt.submit(**kwargs)
-        stable_diffusion.click(**kwargs)
--- a/apps/stable_diffusion/web/ui/outpaint_ui.py
+++ b/apps/stable_diffusion/web/ui/outpaint_ui.py
@@ -1,266 +0,0 @@
-import os
-import sys
-import glob
-from pathlib import Path
-import gradio as gr
-from PIL import Image
-from apps.stable_diffusion.scripts import outpaint_inf
-from apps.stable_diffusion.src import args
-from apps.stable_diffusion.web.ui.utils import (
-    available_devices,
-    nodlogo_loc,
-)
-
-
-with gr.Blocks(title="Outpainting") as outpaint_web:
-    with gr.Row(elem_id="ui_title"):
-        nod_logo = Image.open(nodlogo_loc)
-        with gr.Row():
-            with gr.Column(scale=1, elem_id="demo_title_outer"):
-                gr.Image(
-                    value=nod_logo,
-                    show_label=False,
-                    interactive=False,
-                    elem_id="top_logo",
-                ).style(width=150, height=50)
-    with gr.Row(elem_id="ui_body"):
-        with gr.Row():
-            with gr.Column(scale=1, min_width=600):
-                with gr.Row():
-                    ckpt_path = (
-                        Path(args.ckpt_dir)
-                        if args.ckpt_dir
-                        else Path(Path.cwd(), "models")
-                    )
-                    ckpt_path.mkdir(parents=True, exist_ok=True)
-                    types = (
-                        "*.ckpt",
-                        "*.safetensors",
-                    )  # the tuple of file types
-                    ckpt_files = ["None"]
-                    for extn in types:
-                        files = glob.glob(os.path.join(ckpt_path, extn))
-                        ckpt_files.extend(files)
-                    custom_model = gr.Dropdown(
-                        label=f"Models (Custom Model path: {ckpt_path})",
-                        value=args.ckpt_loc if args.ckpt_loc else "None",
-                        choices=ckpt_files
-                        + [
-                            "runwayml/stable-diffusion-inpainting",
-                            "stabilityai/stable-diffusion-2-inpainting",
-                        ],
-                    )
-                    hf_model_id = gr.Textbox(
-                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: ghunkins/stable-diffusion-liberty-inpainting",
-                        value="",
-                        label="HuggingFace Model ID",
-                        lines=3,
-                    )
-
-                with gr.Group(elem_id="prompt_box_outer"):
-                    prompt = gr.Textbox(
-                        label="Prompt",
-                        value=args.prompts[0],
-                        lines=1,
-                        elem_id="prompt_box",
-                    )
-                    negative_prompt = gr.Textbox(
-                        label="Negative Prompt",
-                        value=args.negative_prompts[0],
-                        lines=1,
-                        elem_id="negative_prompt_box",
-                    )
-
-                outpaint_init_image = gr.Image(
-                    label="Input Image", type="pil"
-                ).style(height=300)
-
-                with gr.Accordion(label="Advanced Options", open=False):
-                    with gr.Row():
-                        scheduler = gr.Dropdown(
-                            label="Scheduler",
-                            value="PNDM",
-                            choices=[
-                                "DDIM",
-                                "PNDM",
-                                "DPMSolverMultistep",
-                                "EulerAncestralDiscrete",
-                            ],
-                        )
-                        with gr.Group():
-                            save_metadata_to_png = gr.Checkbox(
-                                label="Save prompt information to PNG",
-                                value=args.write_metadata_to_png,
-                                interactive=True,
-                            )
-                            save_metadata_to_json = gr.Checkbox(
-                                label="Save prompt information to JSON file",
-                                value=args.save_metadata_to_json,
-                                interactive=True,
-                            )
-                    with gr.Row():
-                        pixels = gr.Slider(
-                            8,
-                            256,
-                            value=args.pixels,
-                            step=8,
-                            label="Pixels to expand",
-                        )
-                        mask_blur = gr.Slider(
-                            0,
-                            64,
-                            value=args.mask_blur,
-                            step=1,
-                            label="Mask blur",
-                        )
-                    with gr.Row():
-                        directions = gr.CheckboxGroup(
-                            label="Outpainting direction",
-                            choices=["left", "right", "up", "down"],
-                            value=["left", "right", "up", "down"],
-                        )
-                    with gr.Row():
-                        noise_q = gr.Slider(
-                            0.0,
-                            4.0,
-                            value=1.0,
-                            step=0.01,
-                            label="Fall-off exponent (lower=higher detail)",
-                        )
-                        color_variation = gr.Slider(
-                            0.0,
-                            1.0,
-                            value=0.05,
-                            step=0.01,
-                            label="Color variation",
-                        )
-                    with gr.Row():
-                        height = gr.Slider(
-                            384, 768, value=args.height, step=8, label="Height"
-                        )
-                        width = gr.Slider(
-                            384, 768, value=args.width, step=8, label="Width"
-                        )
-                        precision = gr.Radio(
-                            label="Precision",
-                            value=args.precision,
-                            choices=[
-                                "fp16",
-                                "fp32",
-                            ],
-                            visible=False,
-                        )
-                        max_length = gr.Radio(
-                            label="Max Length",
-                            value=args.max_length,
-                            choices=[
-                                64,
-                                77,
-                            ],
-                            visible=False,
-                        )
-                    with gr.Row():
-                        steps = gr.Slider(
-                            1, 100, value=20, step=1, label="Steps"
-                        )
-                    with gr.Row():
-                        guidance_scale = gr.Slider(
-                            0,
-                            50,
-                            value=args.guidance_scale,
-                            step=0.1,
-                            label="CFG Scale",
-                        )
-                        batch_count = gr.Slider(
-                            1,
-                            100,
-                            value=args.batch_count,
-                            step=1,
-                            label="Batch Count",
-                            interactive=True,
-                        )
-                        batch_size = gr.Slider(
-                            1,
-                            4,
-                            value=args.batch_size,
-                            step=1,
-                            label="Batch Size",
-                            interactive=False,
-                            visible=False,
-                        )
-                with gr.Row():
-                    seed = gr.Number(
-                        value=args.seed, precision=0, label="Seed"
-                    )
-                    device = gr.Dropdown(
-                        label="Device",
-                        value=available_devices[0],
-                        choices=available_devices,
-                    )
-                with gr.Row():
-                    random_seed = gr.Button("Randomize Seed")
-                    random_seed.click(
-                        None,
-                        inputs=[],
-                        outputs=[seed],
-                        _js="() => Math.floor(Math.random() * 4294967295)",
-                    )
-                    stable_diffusion = gr.Button("Generate Image(s)")
-
-            with gr.Column(scale=1, min_width=600):
-                with gr.Group():
-                    outpaint_gallery = gr.Gallery(
-                        label="Generated images",
-                        show_label=False,
-                        elem_id="gallery",
-                    ).style(grid=[2])
-                    std_output = gr.Textbox(
-                        value="Nothing to show.",
-                        lines=1,
-                        show_label=False,
-                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
-                with gr.Row():
-                    outpaint_sendto_img2img = gr.Button(value="SendTo Img2Img")
-                    outpaint_sendto_inpaint = gr.Button(value="SendTo Inpaint")
-
-        kwargs = dict(
-            fn=outpaint_inf,
-            inputs=[
-                prompt,
-                negative_prompt,
-                outpaint_init_image,
-                pixels,
-                mask_blur,
-                directions,
-                noise_q,
-                color_variation,
-                height,
-                width,
-                steps,
-                guidance_scale,
-                seed,
-                batch_count,
-                batch_size,
-                scheduler,
-                custom_model,
-                hf_model_id,
-                precision,
-                device,
-                max_length,
-                save_metadata_to_json,
-                save_metadata_to_png,
-            ],
-            outputs=[outpaint_gallery, std_output],
-            show_progress=args.progress_bar,
-        )
-
-        prompt.submit(**kwargs)
-        negative_prompt.submit(**kwargs)
-        stable_diffusion.click(**kwargs)
--- a/apps/stable_diffusion/web/ui/txt2img_ui.py
+++ b/apps/stable_diffusion/web/ui/txt2img_ui.py
@@ -1,236 +0,0 @@
-import os
-import sys
-import glob
-from pathlib import Path
-import gradio as gr
-from PIL import Image
-from apps.stable_diffusion.scripts import txt2img_inf
-from apps.stable_diffusion.src import prompt_examples, args
-from apps.stable_diffusion.web.ui.utils import (
-    available_devices,
-    nodlogo_loc,
-)
-
-
-with gr.Blocks(title="Text-to-Image") as txt2img_web:
-    with gr.Row(elem_id="ui_title"):
-        nod_logo = Image.open(nodlogo_loc)
-        with gr.Row():
-            with gr.Column(scale=1, elem_id="demo_title_outer"):
-                gr.Image(
-                    value=nod_logo,
-                    show_label=False,
-                    interactive=False,
-                    elem_id="top_logo",
-                ).style(width=150, height=50)
-    with gr.Row(elem_id="ui_body"):
-        with gr.Row():
-            with gr.Column(scale=1, min_width=600):
-                with gr.Row():
-                    ckpt_path = (
-                        Path(args.ckpt_dir)
-                        if args.ckpt_dir
-                        else Path(Path.cwd(), "models")
-                    )
-                    ckpt_path.mkdir(parents=True, exist_ok=True)
-                    types = (
-                        "*.ckpt",
-                        "*.safetensors",
-                    )  # the tuple of file types
-                    ckpt_files = ["None"]
-                    for extn in types:
-                        files = glob.glob(os.path.join(ckpt_path, extn))
-                        ckpt_files.extend(files)
-                    custom_model = gr.Dropdown(
-                        label=f"Models (Custom Model path: {ckpt_path})",
-                        value=args.ckpt_loc if args.ckpt_loc else "None",
-                        choices=ckpt_files
-                        + [
-                            "Linaqruf/anything-v3.0",
-                            "prompthero/openjourney",
-                            "wavymulder/Analog-Diffusion",
-                            "stabilityai/stable-diffusion-2-1",
-                            "stabilityai/stable-diffusion-2-1-base",
-                            "CompVis/stable-diffusion-v1-4",
-                        ],
-                    )
-                    hf_model_id = gr.Textbox(
-                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3",
-                        value="",
-                        label="HuggingFace Model ID",
-                        lines=3,
-                    )
-
-                with gr.Group(elem_id="prompt_box_outer"):
-                    prompt = gr.Textbox(
-                        label="Prompt",
-                        value=args.prompts[0],
-                        lines=1,
-                        elem_id="prompt_box",
-                    )
-                    negative_prompt = gr.Textbox(
-                        label="Negative Prompt",
-                        value=args.negative_prompts[0],
-                        lines=1,
-                        elem_id="negative_prompt_box",
-                    )
-                with gr.Accordion(label="Advanced Options", open=False):
-                    with gr.Row():
-                        scheduler = gr.Dropdown(
-                            label="Scheduler",
-                            value=args.scheduler,
-                            choices=[
-                                "DDIM",
-                                "PNDM",
-                                "LMSDiscrete",
-                                "KDPM2Discrete",
-                                "DPMSolverMultistep",
-                                "EulerDiscrete",
-                                "EulerAncestralDiscrete",
-                                "SharkEulerDiscrete",
-                            ],
-                        )
-                        with gr.Group():
-                            save_metadata_to_png = gr.Checkbox(
-                                label="Save prompt information to PNG",
-                                value=args.write_metadata_to_png,
-                                interactive=True,
-                            )
-                            save_metadata_to_json = gr.Checkbox(
-                                label="Save prompt information to JSON file",
-                                value=args.save_metadata_to_json,
-                                interactive=True,
-                            )
-                    with gr.Row():
-                        height = gr.Slider(
-                            384, 768, value=args.height, step=8, label="Height"
-                        )
-                        width = gr.Slider(
-                            384, 768, value=args.width, step=8, label="Width"
-                        )
-                        precision = gr.Radio(
-                            label="Precision",
-                            value=args.precision,
-                            choices=[
-                                "fp16",
-                                "fp32",
-                            ],
-                            visible=False,
-                        )
-                        max_length = gr.Radio(
-                            label="Max Length",
-                            value=args.max_length,
-                            choices=[
-                                64,
-                                77,
-                            ],
-                            visible=False,
-                        )
-                    with gr.Row():
-                        steps = gr.Slider(
-                            1, 100, value=args.steps, step=1, label="Steps"
-                        )
-                        guidance_scale = gr.Slider(
-                            0,
-                            50,
-                            value=args.guidance_scale,
-                            step=0.1,
-                            label="CFG Scale",
-                        )
-                    with gr.Row():
-                        batch_count = gr.Slider(
-                            1,
-                            100,
-                            value=args.batch_count,
-                            step=1,
-                            label="Batch Count",
-                            interactive=True,
-                        )
-                        batch_size = gr.Slider(
-                            1,
-                            4,
-                            value=args.batch_size,
-                            step=1,
-                            label="Batch Size",
-                            interactive=True,
-                        )
-                with gr.Row():
-                    seed = gr.Number(
-                        value=args.seed, precision=0, label="Seed"
-                    )
-                    device = gr.Dropdown(
-                        label="Device",
-                        value=available_devices[0],
-                        choices=available_devices,
-                    )
-                with gr.Row():
-                    random_seed = gr.Button("Randomize Seed")
-                    random_seed.click(
-                        None,
-                        inputs=[],
-                        outputs=[seed],
-                        _js="() => Math.floor(Math.random() * 4294967295)",
-                    )
-                    stable_diffusion = gr.Button("Generate Image(s)")
-                with gr.Accordion(label="Prompt Examples!", open=False):
-                    ex = gr.Examples(
-                        examples=prompt_examples,
-                        inputs=prompt,
-                        cache_examples=False,
-                        elem_id="prompt_examples",
-                    )
-
-            with gr.Column(scale=1, min_width=600):
-                with gr.Group():
-                    txt2img_gallery = gr.Gallery(
-                        label="Generated images",
-                        show_label=False,
-                        elem_id="gallery",
-                    ).style(grid=[2])
-                    std_output = gr.Textbox(
-                        value="Nothing to show.",
-                        lines=1,
-                        show_label=False,
-                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
-                with gr.Row():
-                    txt2img_sendto_img2img = gr.Button(value="SendTo Img2Img")
-                    txt2img_sendto_inpaint = gr.Button(value="SendTo Inpaint")
-                    txt2img_sendto_outpaint = gr.Button(
-                        value="SendTo Outpaint"
-                    )
-
-        kwargs = dict(
-            fn=txt2img_inf,
-            inputs=[
-                prompt,
-                negative_prompt,
-                height,
-                width,
-                steps,
-                guidance_scale,
-                seed,
-                batch_count,
-                batch_size,
-                scheduler,
-                custom_model,
-                hf_model_id,
-                precision,
-                device,
-                max_length,
-                save_metadata_to_json,
-                save_metadata_to_png,
-            ],
-            outputs=[txt2img_gallery, std_output],
-            show_progress=args.progress_bar,
-        )
-
-        prompt.submit(**kwargs)
-        negative_prompt.submit(**kwargs)
-        stable_diffusion.click(**kwargs)
--- a/apps/stable_diffusion/web/ui/utils.py
+++ b/apps/stable_diffusion/web/ui/utils.py
@@ -1,15 +0,0 @@
-import os
-import sys
-from apps.stable_diffusion.src import get_available_devices
-
-
-def resource_path(relative_path):
-    """Get absolute path to resource, works for dev and for PyInstaller"""
-    base_path = getattr(
-        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
-    )
-    return os.path.join(base_path, relative_path)
-
-
-nodlogo_loc = resource_path("logos/nod-logo.png")
-available_devices = get_available_devices()
--- a/apps/stable_diffusion/web/utils/gradio_configs.py
+++ b/apps/stable_diffusion/web/utils/gradio_configs.py
@@ -1,31 +0,0 @@
-import os
-import tempfile
-import gradio
-from os import listdir
-
-gradio_tmp_imgs_folder = os.path.join(os.getcwd(), "shark_tmp/")
-
-
-# Clear all gradio tmp images
-def clear_gradio_tmp_imgs_folder():
-    if not os.path.exists(gradio_tmp_imgs_folder):
-        return
-    for fileName in listdir(gradio_tmp_imgs_folder):
-        # Delete tmp png files
-        if fileName.startswith("tmp") and fileName.endswith(".png"):
-            os.remove(gradio_tmp_imgs_folder + fileName)
-
-
-# Overwrite save_pil_to_file from gradio to save tmp images generated by gradio into our own tmp folder
-def save_pil_to_file(pil_image, dir=None):
-    if not os.path.exists(gradio_tmp_imgs_folder):
-        os.mkdir(gradio_tmp_imgs_folder)
-    file_obj = tempfile.NamedTemporaryFile(
-        delete=False, suffix=".png", dir=gradio_tmp_imgs_folder
-    )
-    pil_image.save(file_obj)
-    return file_obj
-
-
-# Register save_pil_to_file override
-gradio.processing_utils.save_pil_to_file = save_pil_to_file
--- a/build_tools/image_comparison.py
+++ b/build_tools/image_comparison.py
@@ -30,15 +30,9 @@ def compare_images(new_filename, golden_filename):
    diff = np.abs(new - golden)
    mean = np.mean(diff)
    if mean > 0.1:
-        if os.name != "nt":
-            subprocess.run(
-                [
-                    "gsutil",
-                    "cp",
-                    new_filename,
-                    "gs://shark_tank/testdata/builder/",
-                ]
-            )
+        subprocess.run(
+            ["gsutil", "cp", new_filename, "gs://shark_tank/testdata/builder/"]
+        )
        raise SystemExit("new and golden not close")
    else:
        print("SUCCESS")
--- a/build_tools/stable_diffusion_testing.py
+++ b/build_tools/stable_diffusion_testing.py
@@ -1,16 +1,13 @@
 import os
-from sys import executable
 import subprocess
 from apps.stable_diffusion.src.utils.resources import (
    get_json_file,
 )
-from datetime import datetime as dt
 from shark.shark_downloader import download_public_file
 from image_comparison import compare_images
 import argparse
 from glob import glob
 import shutil
-import requests

 model_config_dicts = get_json_file(
    os.path.join(
@@ -20,179 +17,51 @@ model_config_dicts = get_json_file(
 )


-def parse_sd_out(filename, command, device, use_tune, model_name, import_mlir):
-    with open(filename, "r+") as f:
-        lines = f.readlines()
-    metrics = {}
-    vals_to_read = [
-        "Clip Inference time",
-        "Average step",
-        "VAE Inference time",
-        "Total image generation",
-    ]
-    for line in lines:
-        for val in vals_to_read:
-            if val in line:
-                metrics[val] = line.split(" ")[-1].strip("\n")
-
-    metrics["Average step"] = metrics["Average step"].strip("ms/it")
-    metrics["Total image generation"] = metrics[
-        "Total image generation"
-    ].strip("sec")
-    metrics["device"] = device
-    metrics["use_tune"] = use_tune
-    metrics["model_name"] = model_name
-    metrics["import_mlir"] = import_mlir
-    metrics["command"] = command
-    return metrics
-
-
-def get_inpaint_inputs():
-    os.mkdir("./test_images/inputs")
-    img_url = (
-        "https://huggingface.co/datasets/diffusers/test-arrays/resolve"
-        "/main/stable_diffusion_inpaint/input_bench_image.png"
-    )
-    mask_url = (
-        "https://huggingface.co/datasets/diffusers/test-arrays/resolve"
-        "/main/stable_diffusion_inpaint/input_bench_mask.png"
-    )
-    img = requests.get(img_url)
-    mask = requests.get(mask_url)
-    open("./test_images/inputs/image.png", "wb").write(img.content)
-    open("./test_images/inputs/mask.png", "wb").write(mask.content)
-
-
 def test_loop(device="vulkan", beta=False, extra_flags=[]):
    # Get golden values from tank
    shutil.rmtree("./test_images", ignore_errors=True)
-    model_metrics = []
    os.mkdir("./test_images")
    os.mkdir("./test_images/golden")
-    get_inpaint_inputs()
    hf_model_names = model_config_dicts[0].values()
-    tuned_options = ["--no-use_tuned", "--use_tuned"]
-    import_options = ["--import_mlir", "--no-import_mlir"]
-    prompt_text = "--prompt=cyberpunk forest by Salvador Dali"
-    inpaint_prompt_text = "--prompt=Face of a yellow cat, high resolution, sitting on a park bench"
-    if os.name == "nt":
-        prompt_text = '--prompt="cyberpunk forest by Salvador Dali"'
-        inpaint_prompt_text = '--prompt="Face of a yellow cat, high resolution, sitting on a park bench"'
+    tuned_options = ["--no-use_tuned", "--use_tuned"]  # appended to argv as-is
    if beta:
        extra_flags.append("--beta_models=True")
-    extra_flags.append("--no-progress_bar")
-    to_skip = [
-        "Linaqruf/anything-v3.0",
-        "prompthero/openjourney",
-        "wavymulder/Analog-Diffusion",
-        "dreamlike-art/dreamlike-diffusion-1.0",
-    ]
-    for import_opt in import_options:
-        for model_name in hf_model_names:
-            if model_name in to_skip:
-                continue
-            for use_tune in tuned_options:
-                command = (
-                    [
-                        executable,  # executable is the python from the venv used to run this
-                        "apps/stable_diffusion/scripts/txt2img.py",
-                        "--device=" + device,
-                        prompt_text,
-                        "--negative_prompts=" + '""',
-                        "--seed=42",
-                        import_opt,
-                        "--output_dir="
-                        + os.path.join(os.getcwd(), "test_images", model_name),
-                        "--hf_model_id=" + model_name,
-                        use_tune,
-                    ]
-                    if "inpainting" not in model_name
-                    else [
-                        executable,
-                        "apps/stable_diffusion/scripts/inpaint.py",
-                        "--device=" + device,
-                        inpaint_prompt_text,
-                        "--negative_prompts=" + '""',
-                        "--img_path=./test_images/inputs/image.png",
-                        "--mask_path=./test_images/inputs/mask.png",
-                        "--seed=42",
-                        "--import_mlir",
-                        "--output_dir="
-                        + os.path.join(os.getcwd(), "test_images", model_name),
-                        "--hf_model_id=" + model_name,
-                        use_tune,
-                    ]
-                )
-                command += extra_flags
-                if os.name == "nt":
-                    command = " ".join(command)
-                dumpfile_name = "_".join(model_name.split("/")) + ".txt"
-                dumpfile_name = os.path.join(os.getcwd(), dumpfile_name)
-                with open(dumpfile_name, "w+") as f:
-                    generated_image = not subprocess.call(
-                        command,
-                        stdout=f,
-                        stderr=f,
-                    )
-                if os.name != "nt":
-                    command = " ".join(command)
-                if generated_image:
-                    model_metrics.append(
-                        parse_sd_out(
-                            dumpfile_name,
-                            command,
-                            device,
-                            use_tune,
-                            model_name,
-                            import_opt,
-                        )
-                    )
-                    print(command)
-                    print("Successfully generated image")
-                    os.makedirs(
-                        "./test_images/golden/" + model_name, exist_ok=True
-                    )
-                    download_public_file(
-                        "gs://shark_tank/testdata/golden/" + model_name,
-                        "./test_images/golden/" + model_name,
-                    )
-                    test_file_path = os.path.join(
-                        os.getcwd(),
-                        "test_images",
-                        model_name,
-                        "generated_imgs",
-                        dt.now().strftime("%Y%m%d"),
-                        "*.png",
-                    )
-                    test_file = glob(test_file_path)[0]
-
-                    golden_path = (
-                        "./test_images/golden/" + model_name + "/*.png"
-                    )
-                    golden_file = glob(golden_path)[0]
-                    compare_images(test_file, golden_file)
-                else:
-                    print(command)
-                    print("failed to generate image for this configuration")
-                    if "2_1_base" in model_name:
-                        print("failed a known successful model.")
-                        exit(1)
-    with open(os.path.join(os.getcwd(), "sd_testing_metrics.csv"), "w+") as f:
-        header = "model_name;device;use_tune;import_opt;Clip Inference time(ms);Average Step (ms/it);VAE Inference time(ms);total image generation(s);command\n"
-        f.write(header)
-        for metric in model_metrics:
-            output = [
-                metric["model_name"],
-                metric["device"],
-                metric["use_tune"],
-                metric["import_mlir"],
-                metric["Clip Inference time"],
-                metric["Average step"],
-                metric["VAE Inference time"],
-                metric["Total image generation"],
-                metric["command"],
+    for model_name in hf_model_names:
+        for use_tune in tuned_options:
+            command = [
+                "python",
+                "apps/stable_diffusion/scripts/txt2img.py",
+                "--device=" + device,
+                "--prompt=cyberpunk forest by Salvador Dali",
+                "--output_dir="
+                + os.path.join(os.getcwd(), "test_images", model_name),
+                "--hf_model_id=" + model_name,
+                use_tune,
            ]
-            f.write(";".join(output) + "\n")
+            command += extra_flags
+            generated_image = not subprocess.call(
+                command, stdout=subprocess.DEVNULL
+            )
+            if generated_image:
+                print(" ".join(command))
+                print("Successfully generated image")
+                os.makedirs(
+                    "./test_images/golden/" + model_name, exist_ok=True
+                )
+                download_public_file(
+                    "gs://shark_tank/testdata/golden/" + model_name,
+                    "./test_images/golden/" + model_name,
+                )
+                test_file_path = os.path.join(
+                    os.getcwd(), "test_images", model_name, "generated_imgs"
+                )
+                test_file = glob(test_file_path + "/*.png")[0]
+                golden_path = "./test_images/golden/" + model_name + "/*.png"
+                golden_file = glob(golden_path)[0]
+                compare_images(test_file, golden_file)
+            else:
+                print(" ".join(command))
+                print("failed to generate image for this configuration")


 parser = argparse.ArgumentParser()
--- a/conftest.py
+++ b/conftest.py
@@ -60,13 +60,3 @@ def pytest_addoption(parser):
        default="gs://shark_tank/latest",
        help="URL to bucket from which to download SHARK tank artifacts. Default is gs://shark_tank/latest",
    )
-    parser.addoption(
-        "--benchmark_dispatches",
-        default=None,
-        help="Benchmark individual dispatch kernels produced by IREE compiler. Use 'All' for all, or specific dispatches e.g. '0 1 2 10'",
-    )
-    parser.addoption(
-        "--dispatch_benchmarks_dir",
-        default="./temp_dispatch_benchmarks",
-        help="Directory in which dispatch benchmarks are saved.",
-    )
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -40,7 +40,7 @@ cmake --build build/
 *Prepare the model*
 ```bash
 wget https://storage.googleapis.com/shark_tank/latest/resnet50_tf/resnet50_tf.mlir
-iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvmcpu-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
+iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvm-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
 ```
 *Prepare the input*

@@ -65,18 +65,18 @@ A tool for benchmarking other models is built and can be invoked with a command
 see `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation on the function input. For example, stable diffusion unet can be tested with the following commands:
 ```bash
 wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
-iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
+iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
 ./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
 ```
 VAE and Autoencoder are also available
 ```bash
 # VAE
 wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
-iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
+iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
 ./build/vulkan_gui/iree-vulkan-gui --module-file=vae.vmfb --function_input=1x4x64x64xf32

 # CLIP Autoencoder
 wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
-iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
+iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvm-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
 ./build/vulkan_gui/iree-vulkan-gui --module-file=clip_autoencoder.vmfb --function_input=1x77xi32 --function_input=1x77xi32
 ```
--- a/docs/shark_iree_profiling.md
+++ b/docs/shark_iree_profiling.md
@@ -1,118 +0,0 @@
-# Overview
-
-This document is intended to provide a starting point for profiling with SHARK/IREE. At its core
-[SHARK](https://github.com/nod-ai/SHARK/tree/main/tank) is a python API that links the MLIR lowerings from various
-frameworks + frontends (e.g. PyTorch -> Torch-MLIR) with the compiler + runtime offered by IREE. More information
-on model coverage and framework support can be found [here](https://github.com/nod-ai/SHARK/tree/main/tank). The intended
-use case for SHARK is for compilation and deployment of performant state of the art AI models.
-
-![image](https://user-images.githubusercontent.com/22101546/217151219-9bb184a3-cfb9-4788-bb7e-5b502953525c.png)
-
-## Benchmarking with SHARK
-
-TODO: Expand this section.
-
-SHARK offers native benchmarking support, although because it is model focused, fine grain profiling is
-hidden when compared against the common "model benchmarking suite" use case SHARK is good at.
-
-### SharkBenchmarkRunner
-
-SharkBenchmarkRunner is a class designed for benchmarking models against other runtimes.
-TODO: List supported runtimes for comparison + example on how to benchmark with it.
-
-## Directly profiling IREE
-
-A number of excellent developer resources on profiling with IREE can be
-found [here](https://github.com/iree-org/iree/tree/main/docs/developers/developing_iree). As a result this section will
-focus on bridging the gap between the two.
- - https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling.md
- - https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling_with_tracy.md
- - https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling_vulkan_gpu.md
- - https://github.com/iree-org/iree/blob/main/docs/developers/developing_iree/profiling_cpu_events.md
-
-Internally, SHARK builds a pair of IREE commands to compile + run a model. At a high level the flow starts with the
-model represented with a high level dialect (commonly Linalg) and is compiled to a flatbuffer (.vmfb) that
-the runtime is capable of ingesting. At this point (with potentially a few runtime flags) the compiled model is then run
-through the IREE runtime. This is all facilitated with the IREE python bindings, which offers a convenient method
-to capture the compile command SHARK comes up with. This is done by setting the environment variable
-`IREE_SAVE_TEMPS` to point to a directory of choice, e.g. for stable diffusion
-```
-# Linux
-$ export IREE_SAVE_TEMPS=/path/to/some/directory
-# Windows
-$ $env:IREE_SAVE_TEMPS="C:\path\to\some\directory"
-$ python apps/stable_diffusion/scripts/txt2img.py -p "a photograph of an astronaut riding a horse" --save_vmfb
-```
-NOTE: Currently this will only save the compile command + input MLIR for a single model if run in a pipeline.
-In the case of stable diffusion this (should) be UNet so to get examples for other models in the pipeline they
-need to be extracted and tested individually.
-
-The save temps directory should contain three files: `core-command-line.txt`, `core-input.mlir`, and `core-output.bin`.
-The command line for compilation will start something like this, where the `-` needs to be replaced with the path to `core-input.mlir`.
-```
-/home/quinn/nod/iree-build/compiler/bindings/python/iree/compiler/tools/../_mlir_libs/iree-compile - --iree-input-type=none ...
-```
-The `-o output_filename.vmfb` flag can be used to specify the location to save the compiled vmfb. Note that a dump of the
-dispatches that can be compiled + run in isolation can be generated by adding `--iree-hal-dump-executable-benchmarks-to=/some/directory`. Say, if they are in the `benchmarks` directory, the following compile/run commands would work for Vulkan on RDNA3.
-```
-iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna3-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.mlir -o benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb
-
-iree-benchmark-module --module=benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb --function=forward --device=vulkan
-```
-Where `${NUM}` is the dispatch number that you want to benchmark/profile in isolation.
-
-### Enabling Tracy for Vulkan profiling
-
-To begin profiling with Tracy, a build of IREE runtime with tracing enabled is needed. SHARK-Runtime builds an
-instrumented version alongside the normal version nightly (.whls typically found [here](https://github.com/nod-ai/SHARK-Runtime/releases)), however this is only available for Linux. For Windows, tracing can be enabled by enabling a CMake flag.
-```
-$env:IREE_ENABLE_RUNTIME_TRACING="ON"
-```
-Getting a trace can then be done by setting environment variable `TRACY_NO_EXIT=1` and running the program that is to be
-traced. Then, to actually capture the trace, use the `iree-tracy-capture` tool in a different terminal. Note that to get
-the capture and profiler tools the `IREE_BUILD_TRACY=ON` CMake flag needs to be set.
-```
-TRACY_NO_EXIT=1 python apps/stable_diffusion/scripts/txt2img.py -p "a photograph of an astronaut riding a horse"
-
-# (in another terminal, either on the same machine or through ssh with a tunnel through port 8086)
-iree-tracy-capture -o trace_filename.tracy
-```
-To do it over ssh, the flow looks like this
-```
-# From terminal 1 on local machine
-ssh -L 8086:localhost:8086 <remote_server_name>
-TRACY_NO_EXIT=1 python apps/stable_diffusion/scripts/txt2img.py -p "a photograph of an astronaut riding a horse"
-
-# From terminal 2 on local machine. Requires having built IREE with the CMake flag `IREE_BUILD_TRACY=ON` to build the required tooling.
-iree-tracy-capture -o /path/to/trace.tracy
-```
-
-The trace can then be viewed with
-```
-iree-tracy-profiler /path/to/trace.tracy
-```
-Capturing a runtime trace will work with any IREE tooling that uses the runtime. For example, `iree-benchmark-module`
-can be used for benchmarking an individual module. Importantly this means that any SHARK script can be profiled with tracy.
-
-NOTE: Not all backends have the same tracy support. This writeup is focused on CPU/Vulkan backends but there is recently added support for tracing on CUDA (requires the `--cuda_tracing` flag).
-
-## Experimental RGP support
-
-TODO: This section is temporary until proper RGP support is added.
-
-Currently, for stable diffusion there is a flag for enabling UNet to be visible to RGP with `--enable_rgp`. To get a proper capture though, the `DevModeSqttPrepareFrameCount=1` flag needs to be set for the driver (done with `VkPanel` on Windows).
-With these two settings, a single iteration of UNet can be captured.
-
-(AMD only) To get a dump of the pipelines (result of compiled SPIR-V) the `EnablePipelineDump=1` driver flag can be set. The
-files will typically be dumped to a directory called `spvPipeline` (on Linux `/var/tmp/spvPipeline`). The dumped files will
-include header information that can be used to map back to the source dispatch/SPIR-V, e.g.
-```
-[Version]
-version = 57 
-
-[CsSpvFile]
-fileName = Shader_0x946C08DFD0C10D9A.spv
-
-[CsInfo]
-entryPoint = forward_dispatch_193_matmul_256x65536x2304
-```
--- a/generate_sharktank.py
+++ b/generate_sharktank.py
@@ -2,10 +2,11 @@
 """SHARK Tank"""
 # python generate_sharktank.py, you have to give a csv tile with [model_name, model_download_url]
 # will generate local shark tank folder like this:
-#   /SHARK
-#     /gen_shark_tank
-#       /albert_lite_base
-#       /...model_name...
+#   HOME
+#     /.local
+#       /shark_tank
+#           /albert_lite_base
+#           /...model_name...
 #

 import os
@@ -105,12 +106,6 @@ def save_torch_model(torch_model_list):
                dir=torch_model_dir,
                model_name=torch_model_name,
            )
-            mlir_hash = create_hash(
-                os.path.join(
-                    torch_model_dir, torch_model_name + "_torch" + ".mlir"
-                )
-            )
-            np.save(os.path.join(torch_model_dir, "hash"), np.array(mlir_hash))
            # Generate torch dynamic models.
            if is_dynamic:
                mlir_importer.import_debug(
@@ -162,13 +157,13 @@ def save_tf_model(tf_model_list):
            tf_model_name = tf_model_name.replace("/", "_")
            tf_model_dir = os.path.join(WORKDIR, str(tf_model_name) + "_tf")
            os.makedirs(tf_model_dir, exist_ok=True)
+
            mlir_importer = SharkImporter(
                model,
-                inputs=input,
+                input,
                frontend="tf",
            )
            mlir_importer.import_debug(
-                is_dynamic=False,
                dir=tf_model_dir,
                model_name=tf_model_name,
            )
@@ -277,5 +272,8 @@ if __name__ == "__main__":
    )

    save_torch_model(torch_model_csv)
+    save_torch_model(
+        os.path.join(os.path.dirname(__file__), "tank", "torch_sd_list.csv")
+    )
    save_tf_model(tf_model_csv)
    save_tflite_model(tflite_model_csv)
--- a/process_skipfiles.py
+++ b/process_skipfiles.py
@@ -1,44 +0,0 @@
-# This script will toggle the comment/uncommenting aspect for dealing
-# with __file__ AttributeError arising in case of a few modules in
-# `torch/_dynamo/skipfiles.py` (within shark.venv)
-
-from distutils.sysconfig import get_python_lib
-import fileinput
-from pathlib import Path
-
-# Diffusers 0.13.1 fails with transformers __init.py errros in BLIP. So remove it for now until we fork it
-pix2pix_file = Path(
-    get_python_lib()
-    + "/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py"
-)
-if pix2pix_file.exists():
-    print("Removing..%s", pix2pix_file)
-    pix2pix_file.unlink()
-
-
-path_to_skipfiles = Path(get_python_lib() + "/torch/_dynamo/skipfiles.py")
-
-modules_to_comment = ["abc,", "os,", "posixpath,", "_collections_abc,"]
-startMonitoring = 0
-for line in fileinput.input(path_to_skipfiles, inplace=True):
-    if "SKIP_DIRS = " in line:
-        startMonitoring = 1
-        print(line, end="")
-    elif startMonitoring in [1, 2]:
-        if "]" in line:
-            startMonitoring += 1
-            print(line, end="")
-        else:
-            flag = True
-            for module in modules_to_comment:
-                if module in line:
-                    if not line.startswith("#"):
-                        print(f"#{line}", end="")
-                    else:
-                        print(f"{line[1:]}", end="")
-                    flag = False
-                    break
-            if flag:
-                print(line, end="")
-    else:
-        print(line, end="")
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,3 @@ requires = [
    "iree-runtime>=20221022.190",
 ]
 build-backend = "setuptools.build_meta"
-
-[tool.black]
-line-length = 79
--- a/requirements-importer.txt
+++ b/requirements-importer.txt
@@ -1,7 +1,7 @@
 -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
 --pre

-numpy>1.22.4
+numpy==1.22.4
 torchvision
 pytorch-triton
 tabulate
@@ -15,8 +15,8 @@ iree-tools-tf

 # TensorFlow and JAX.
 gin-config
-tf-nightly
-keras>=2.10
+tensorflow==2.10.1
+keras==2.10
 #tf-models-nightly
 #tensorflow-text-nightly
 transformers
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,14 +16,13 @@ parameterized

 # Add transformers, diffusers and scipy since it most commonly used
 transformers
-diffusers @ git+https://github.com/nod-ai/diffusers@stable_stencil
+diffusers
 scipy
 ftfy
 gradio
 altair
 omegaconf
 safetensors
-opencv-python

 # Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
 pefile
--- a/setup_venv.ps1
+++ b/setup_venv.ps1
@@ -1,54 +1,19 @@
-<#
-.SYNOPSIS
-  A script to update and install the SHARK runtime and its dependencies.
-
-.DESCRIPTION
-  This script updates and installs the SHARK runtime and its dependencies.
-  It checks the Python version installed and installs any required build
-  dependencies into a Python virtual environment.
-  If that environment does not exist, it creates it.
-  
-.PARAMETER update-src
-  git pulls latest version
-
-.PARAMETER force
-  removes and recreates venv to force update of all dependencies
-  
-.EXAMPLE
-  .\setup_venv.ps1 --force
-
-.EXAMPLE
-  .\setup_venv.ps1 --update-src
-
-.INPUTS
-  None
-
-.OUTPUTS
-  None
-
-#>
-
 param([string]$arguments)

 if ($arguments -eq "--update-src"){
 	git pull
 }

-if ($arguments -eq "--force"){
-	if (Test-Path env:VIRTUAL_ENV) {
-        Write-Host "deactivating..."
-        Deactivate
-    }
-    
-    if (Test-Path .\shark.venv\) {
-        Write-Host "removing and recreating venv..."
-        Remove-Item .\shark.venv -Force -Recurse
-        if (Test-Path .\shark.venv\) {
-            Write-Host 'could not remove .\shark-venv - please try running ".\setup_venv.ps1 --force" again!'
-            break
-        }
-    }
-}
+#Write-Host "Installing python"
+
+#Start-Process winget install Python.Python.3.10 '/quiet InstallAllUsers=1 PrependPath=1' -wait -NoNewWindow
+
+#Write-Host "python installation completed successfully"
+
+#Write-Host "Reload environment variables"
+#$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
+#Write-Host "Reloaded environment variables"
+

 # redirect stderr into stdout
 $p = &{python -V} 2>&1
@@ -60,36 +25,19 @@ $version = if($p -is [System.Management.Automation.ErrorRecord])
 }
 else
 {
-    # otherwise return complete Python list
-    $ErrorActionPreference = 'SilentlyContinue'
-    $PyVer = py --list
+    # otherwise return as is
+    $p
 }

-# deactivate any activated venvs
-if ($PyVer -like "*venv*")
-{
-  deactivate # make sure we don't update the wrong venv
-  $PyVer = py --list # update list
-}
+Write-Host "Python version found is"
+Write-Host $p

-Write-Host "Python versions found are"
-Write-Host ($PyVer | Out-String) # formatted output with line breaks
-if (!($PyVer.length -ne 0)) {$p} # return Python --version String if py.exe is unavailable
-if (!($PyVer -like "*3.11*") -and !($p -like "*3.11*")) # if 3.11 is not in any list
-{
-    Write-Host "Please install Python 3.11 and try again"
-    break
-}

 Write-Host "Installing Build Dependencies"
-# make sure we really use 3.11 from list, even if it's not the default.
-if (!($PyVer.length -ne 0)) {py -3.11 -m venv .\shark.venv\}
-else {python -m venv .\shark.venv\}
+python -m venv .\shark.venv\
 .\shark.venv\Scripts\activate
-python -m pip install --upgrade pip
-pip install wheel
 pip install -r requirements.txt
-pip install --pre torch-mlir==20230228.763 torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
+pip install --pre torch-mlir torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
 pip install --upgrade -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html iree-compiler iree-runtime
 Write-Host "Building SHARK..."
 pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -42,7 +42,7 @@ Green=`tput setaf 2`
 Yellow=`tput setaf 3`

 # Assume no binary torch-mlir.
-# Currently available for macOS m1&intel (3.11) and Linux(3.8,3.10,3.11)
+# Currently available for macOS M1 & Intel (3.10) and Linux (3.7, 3.8, 3.9, 3.10)
 torch_mlir_bin=false
 if [[ $(uname -s) = 'Darwin' ]]; then
  echo "${Yellow}Apple macOS detected"
@@ -60,12 +60,12 @@ if [[ $(uname -s) = 'Darwin' ]]; then
  fi
  echo "${Yellow}Run the following commands to setup your SSL certs for your Python version if you see SSL errors with tests"
  echo "${Yellow}/Applications/Python\ 3.XX/Install\ Certificates.command"
-  if [ "$PYTHON_VERSION_X_Y" == "3.11" ]; then
+  if [ "$PYTHON_VERSION_X_Y" == "3.10" ]; then
    torch_mlir_bin=true
  fi
 elif [[ $(uname -s) = 'Linux' ]]; then
  echo "${Yellow}Linux detected"
-  if [ "$PYTHON_VERSION_X_Y" == "3.8" ]  || [ "$PYTHON_VERSION_X_Y" == "3.10" ] || [ "$PYTHON_VERSION_X_Y" == "3.11" ] ; then
+  if [ "$PYTHON_VERSION_X_Y" == "3.7" ] || [ "$PYTHON_VERSION_X_Y" == "3.8" ]  || [ "$PYTHON_VERSION_X_Y" == "3.9" ] || [ "$PYTHON_VERSION_X_Y" == "3.10" ] ; then
    torch_mlir_bin=true
  fi
 else
@@ -78,9 +78,9 @@ $PYTHON -m pip install --upgrade -r "$TD/requirements.txt"
 if [ "$torch_mlir_bin" = true ]; then
  if [[ $(uname -s) = 'Darwin' ]]; then
    echo "MacOS detected. Installing torch-mlir from .whl, to avoid dependency problems with torch."
-    $PYTHON -m pip install --pre --no-cache-dir torch-mlir==20230228.763 -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
+    $PYTHON -m pip install --pre --no-cache-dir  torch-mlir -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
  else
-    $PYTHON -m pip install --pre torch-mlir==20230228.763 -f https://llvm.github.io/torch-mlir/package-index/
+    $PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
    if [ $? -eq 0 ];then
      echo "Successfully Installed torch-mlir"
    else
@@ -89,7 +89,7 @@ if [ "$torch_mlir_bin" = true ]; then
  fi
 else
  echo "${Red}No binaries found for Python $PYTHON_VERSION_X_Y on $(uname -s)"
-  echo "${Yello}Python 3.11 supported on macOS and 3.8,3.10 and 3.11 on Linux"
+  echo "${Yellow}Python 3.10 supported on macOS and 3.7,3.8,3.9 and 3.10 on Linux"
  echo "${Red}Please build torch-mlir from source in your environment"
  exit 1
 fi
@@ -98,11 +98,11 @@ if [[ -z "${USE_IREE}" ]]; then
  RUNTIME="https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html"
 else
  touch ./.use-iree
-  RUNTIME="https://openxla.github.io/iree/pip-release-links.html"
+  RUNTIME="https://iree-org.github.io/iree/pip-release-links.html"
 fi
 if [[ -z "${NO_BACKEND}" ]]; then
  echo "Installing ${RUNTIME}..."
-  $PYTHON -m pip install --pre --upgrade --find-links ${RUNTIME} iree-compiler iree-runtime
+  $PYTHON -m pip install --upgrade --find-links ${RUNTIME} iree-compiler iree-runtime
 else
  echo "Not installing a backend, please make sure to add your backend to PYTHONPATH"
 fi
@@ -112,7 +112,7 @@ if [[ ! -z "${IMPORTER}" ]]; then
  if [[ $(uname -s) = 'Linux' ]]; then
    echo "${Yellow}Linux detected.. installing Linux importer tools"
    #Always get the importer tools from upstream IREE
-    $PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer.txt" -f https://openxla.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+    $PYTHON -m pip install --no-warn-conflicts --upgrade -r "$TD/requirements-importer.txt" -f https://iree-org.github.io/iree/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
  elif [[ $(uname -s) = 'Darwin' ]]; then
    echo "${Yellow}macOS detected.. installing macOS importer tools"
    #Conda seems to have some problems installing these packages and hope they get resolved upstream.
@@ -129,7 +129,7 @@ if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
  TV_VERSION=${TV_VER:9:18}
  $PYTHON -m pip uninstall -y torch torchvision
  $PYTHON -m pip install -U --pre --no-warn-conflicts triton
-  $PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu117/torch-${TORCH_VERSION}%2Bcu117-cp311-cp311-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu117/torchvision-${TV_VERSION}%2Bcu117-cp311-cp311-linux_x86_64.whl
+  $PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu117/torch-${TORCH_VERSION}%2Bcu117-cp310-cp310-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu117/torchvision-${TV_VERSION}%2Bcu117-cp310-cp310-linux_x86_64.whl
  if [ $? -eq 0 ];then
    echo "Successfully Installed torch + cu117."
  else
--- a/shark/examples/shark_dynamo/basic_examples.py
+++ b/shark/examples/shark_dynamo/basic_examples.py
@@ -1,6 +1,6 @@
+import torchdynamo
 import torch
 import torch_mlir
-import torch._dynamo as torchdynamo
 from shark.sharkdynamo.utils import make_shark_compiler


--- a/shark/examples/shark_inference/sharded_bloom.py
+++ b/shark/examples/shark_inference/sharded_bloom.py
@@ -1,842 +0,0 @@
-####################################################################################
-# Please make sure you have transformers 4.21.2 installed before running this demo
-#
-# -p --model_path: the directory in which you want to store the bloom files.
-# -dl --device_list: the list of device indices you want to use.  if you want to only use the first device, or you are running on cpu leave this blank.
-#                     Otherwise, please give this argument in this format: "[0, 1, 2]"
-# -de --device: the device you want to run bloom on.  E.G. cpu, cuda
-# -c, --recompile: set to true if you want to recompile to vmfb.
-# -d, --download: set to true if you want to redownload the mlir files
-# -cm, --create_mlirs: set to true if you want to create the mlir files from scratch.  please make sure you have transformers 4.21.2 before using this option
-# -t --token_count: the number of tokens you want to generate
-# -pr --prompt: the prompt you want to feed to the model
-# -m --model_name: the name of the model, e.g. bloom-560m
-#
-# If you don't specify a prompt when you run this example, you will be able to give prompts through the terminal.  Run the
-# example in this way if you want to run multiple examples without reinitializing the model
-#####################################################################################
-
-import os
-import io
-import torch
-import torch.nn as nn
-from collections import OrderedDict
-import torch_mlir
-from torch_mlir import TensorPlaceholder
-import re
-from transformers.models.bloom.configuration_bloom import BloomConfig
-import json
-import sys
-import argparse
-import json
-import urllib.request
-import subprocess
-
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch._decomp import get_decompositions
-from shark.shark_inference import SharkInference
-from shark.shark_downloader import download_public_file
-from transformers import (
-    BloomTokenizerFast,
-    BloomForSequenceClassification,
-    BloomForCausalLM,
-)
-from transformers.models.bloom.modeling_bloom import (
-    BloomBlock,
-    build_alibi_tensor,
-)
-
-IS_CUDA = False
-
-
-class ShardedBloom:
-    def __init__(self, src_folder):
-        f = open(f"{src_folder}/config.json")
-        config = json.load(f)
-        f.close()
-
-        self.layers_initialized = False
-
-        self.src_folder = src_folder
-        try:
-            self.n_embed = config["n_embed"]
-        except KeyError:
-            self.n_embed = config["hidden_size"]
-        self.vocab_size = config["vocab_size"]
-        self.n_layer = config["n_layer"]
-        try:
-            self.n_head = config["num_attention_heads"]
-        except KeyError:
-            self.n_head = config["n_head"]
-
-    def _init_layer(self, layer_name, device, replace, device_idx):
-        if replace or not os.path.exists(
-            f"{self.src_folder}/{layer_name}.vmfb"
-        ):
-            f_ = open(f"{self.src_folder}/{layer_name}.mlir", encoding="utf-8")
-            module = f_.read()
-            f_.close()
-            module = bytes(module, "utf-8")
-            shark_module = SharkInference(
-                module,
-                device=device,
-                mlir_dialect="tm_tensor",
-                device_idx=device_idx,
-            )
-            shark_module.save_module(
-                module_name=f"{self.src_folder}/{layer_name}",
-                extra_args=[
-                    "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                    "--iree-stream-resource-max-allocation-size=1000000000",
-                    "--iree-codegen-check-ir-before-llvm-conversion=false",
-                ],
-            )
-        else:
-            shark_module = SharkInference(
-                "",
-                device=device,
-                mlir_dialect="tm_tensor",
-                device_idx=device_idx,
-            )
-
-        return shark_module
-
-    def init_layers(self, device, replace=False, device_idx=[0]):
-        if device_idx is not None:
-            n_devices = len(device_idx)
-
-        self.word_embeddings_module = self._init_layer(
-            "word_embeddings",
-            device,
-            replace,
-            device_idx if device_idx is None else device_idx[0 % n_devices],
-        )
-        self.word_embeddings_layernorm_module = self._init_layer(
-            "word_embeddings_layernorm",
-            device,
-            replace,
-            device_idx if device_idx is None else device_idx[1 % n_devices],
-        )
-        self.ln_f_module = self._init_layer(
-            "ln_f",
-            device,
-            replace,
-            device_idx if device_idx is None else device_idx[2 % n_devices],
-        )
-        self.lm_head_module = self._init_layer(
-            "lm_head",
-            device,
-            replace,
-            device_idx if device_idx is None else device_idx[3 % n_devices],
-        )
-        self.block_modules = [
-            self._init_layer(
-                f"bloom_block_{i}",
-                device,
-                replace,
-                device_idx
-                if device_idx is None
-                else device_idx[(i + 4) % n_devices],
-            )
-            for i in range(self.n_layer)
-        ]
-
-        self.layers_initialized = True
-
-    def load_layers(self):
-        assert self.layers_initialized
-
-        self.word_embeddings_module.load_module(
-            f"{self.src_folder}/word_embeddings.vmfb"
-        )
-        self.word_embeddings_layernorm_module.load_module(
-            f"{self.src_folder}/word_embeddings_layernorm.vmfb"
-        )
-        for block_module, i in zip(self.block_modules, range(self.n_layer)):
-            block_module.load_module(f"{self.src_folder}/bloom_block_{i}.vmfb")
-        self.ln_f_module.load_module(f"{self.src_folder}/ln_f.vmfb")
-        self.lm_head_module.load_module(f"{self.src_folder}/lm_head.vmfb")
-
-    def forward_pass(self, input_ids, device):
-        if IS_CUDA:
-            cudaSetDevice(self.word_embeddings_module.device_idx)
-
-        input_embeds = self.word_embeddings_module(
-            inputs=(input_ids,), function_name="forward"
-        )
-
-        input_embeds = torch.tensor(input_embeds).float()
-        if IS_CUDA:
-            cudaSetDevice(self.word_embeddings_layernorm_module.device_idx)
-        hidden_states = self.word_embeddings_layernorm_module(
-            inputs=(input_embeds,), function_name="forward"
-        )
-
-        hidden_states = torch.tensor(hidden_states).float()
-
-        attention_mask = torch.ones(
-            [hidden_states.shape[0], len(input_ids[0])]
-        )
-        alibi = build_alibi_tensor(
-            attention_mask,
-            self.n_head,
-            hidden_states.dtype,
-            hidden_states.device,
-        )
-
-        causal_mask = _prepare_attn_mask(
-            attention_mask, input_ids.size(), input_embeds, 0
-        )
-        causal_mask = torch.tensor(causal_mask).float()
-
-        presents = ()
-        all_hidden_states = tuple(hidden_states)
-
-        for block_module, i in zip(self.block_modules, range(self.n_layer)):
-            if IS_CUDA:
-                cudaSetDevice(block_module.device_idx)
-
-            output = block_module(
-                inputs=(
-                    hidden_states.detach().numpy(),
-                    alibi.detach().numpy(),
-                    causal_mask.detach().numpy(),
-                ),
-                function_name="forward",
-            )
-            hidden_states = torch.tensor(output[0]).float()
-            all_hidden_states = all_hidden_states + (hidden_states,)
-            presents = presents + (
-                tuple(
-                    (
-                        output[1],
-                        output[2],
-                    )
-                ),
-            )
-        if IS_CUDA:
-            cudaSetDevice(self.ln_f_module.device_idx)
-
-        hidden_states = self.ln_f_module(
-            inputs=(hidden_states,), function_name="forward"
-        )
-        if IS_CUDA:
-            cudaSetDevice(self.lm_head_module.device_idx)
-
-        logits = self.lm_head_module(
-            inputs=(hidden_states,), function_name="forward"
-        )
-        logits = torch.tensor(logits).float()
-
-        return torch.argmax(logits[:, -1, :], dim=-1)
-
-
-def _make_causal_mask(
-    input_ids_shape: torch.Size,
-    dtype: torch.dtype,
-    past_key_values_length: int = 0,
-):
-    """
-    Make causal mask used for bi-directional self-attention.
-    """
-    batch_size, target_length = input_ids_shape
-    mask = torch.full((target_length, target_length), torch.finfo(dtype).min)
-    mask_cond = torch.arange(mask.size(-1))
-    intermediate_mask = mask_cond < (mask_cond + 1).view(mask.size(-1), 1)
-    mask.masked_fill_(intermediate_mask, 0)
-    mask = mask.to(dtype)
-
-    if past_key_values_length > 0:
-        mask = torch.cat(
-            [
-                torch.zeros(
-                    target_length, past_key_values_length, dtype=dtype
-                ),
-                mask,
-            ],
-            dim=-1,
-        )
-    expanded_mask = mask[None, None, :, :].expand(
-        batch_size, 1, target_length, target_length + past_key_values_length
-    )
-    return expanded_mask
-
-
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int = None):
-    """
-    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-    """
-    batch_size, source_length = mask.size()
-    tgt_len = tgt_len if tgt_len is not None else source_length
-
-    expanded_mask = (
-        mask[:, None, None, :]
-        .expand(batch_size, 1, tgt_len, source_length)
-        .to(dtype)
-    )
-
-    inverted_mask = 1.0 - expanded_mask
-
-    return inverted_mask.masked_fill(
-        inverted_mask.to(torch.bool), torch.finfo(dtype).min
-    )
-
-
-def _prepare_attn_mask(
-    attention_mask, input_shape, inputs_embeds, past_key_values_length
-):
-    # create causal mask
-    # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-    combined_attention_mask = None
-    if input_shape[-1] > 1:
-        combined_attention_mask = _make_causal_mask(
-            input_shape,
-            inputs_embeds.dtype,
-            past_key_values_length=past_key_values_length,
-        ).to(attention_mask.device)
-
-    if attention_mask is not None:
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        expanded_attn_mask = _expand_mask(
-            attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
-        )
-        combined_attention_mask = (
-            expanded_attn_mask
-            if combined_attention_mask is None
-            else expanded_attn_mask + combined_attention_mask
-        )
-
-    return combined_attention_mask
-
-
-def download_model(destination_folder, model_name):
-    download_public_file(
-        f"gs://shark_tank/sharded_bloom/{model_name}/", destination_folder
-    )
-
-
-def compile_embeddings(embeddings_layer, input_ids, path):
-    input_ids_placeholder = torch_mlir.TensorPlaceholder.like(
-        input_ids, dynamic_axes=[1]
-    )
-    module = torch_mlir.compile(
-        embeddings_layer,
-        (input_ids_placeholder),
-        torch_mlir.OutputType.LINALG_ON_TENSORS,
-        use_tracing=False,
-        verbose=False,
-    )
-
-    bytecode_stream = io.BytesIO()
-    module.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    f_ = open(path, "w+")
-    f_.write(str(module))
-    f_.close()
-    return
-
-
-def compile_word_embeddings_layernorm(
-    embeddings_layer_layernorm, embeds, path
-):
-    embeds_placeholder = torch_mlir.TensorPlaceholder.like(
-        embeds, dynamic_axes=[1]
-    )
-    module = torch_mlir.compile(
-        embeddings_layer_layernorm,
-        (embeds_placeholder),
-        torch_mlir.OutputType.LINALG_ON_TENSORS,
-        use_tracing=False,
-        verbose=False,
-    )
-
-    bytecode_stream = io.BytesIO()
-    module.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    f_ = open(path, "w+")
-    f_.write(str(module))
-    f_.close()
-    return
-
-
-def strip_overloads(gm):
-    """
-    Modifies the target of graph nodes in :attr:`gm` to strip overloads.
-    Args:
-        gm(fx.GraphModule): The input Fx graph module to be modified
-    """
-    for node in gm.graph.nodes:
-        if isinstance(node.target, torch._ops.OpOverload):
-            node.target = node.target.overloadpacket
-    gm.recompile()
-
-
-def compile_to_mlir(
-    bblock,
-    hidden_states,
-    layer_past=None,
-    attention_mask=None,
-    head_mask=None,
-    use_cache=None,
-    output_attentions=False,
-    alibi=None,
-    block_index=0,
-    path=".",
-):
-    fx_g = make_fx(
-        bblock,
-        decomposition_table=get_decompositions(
-            [
-                torch.ops.aten.split.Tensor,
-                torch.ops.aten.split_with_sizes,
-            ]
-        ),
-        tracing_mode="real",
-        _allow_non_fake_inputs=False,
-    )(hidden_states, alibi, attention_mask)
-
-    fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
-    fx_g.recompile()
-
-    strip_overloads(fx_g)
-
-    hidden_states_placeholder = TensorPlaceholder.like(
-        hidden_states, dynamic_axes=[1]
-    )
-    attention_mask_placeholder = TensorPlaceholder.like(
-        attention_mask, dynamic_axes=[2, 3]
-    )
-    alibi_placeholder = TensorPlaceholder.like(alibi, dynamic_axes=[2])
-
-    ts_g = torch.jit.script(fx_g)
-
-    module = torch_mlir.compile(
-        ts_g,
-        (
-            hidden_states_placeholder,
-            alibi_placeholder,
-            attention_mask_placeholder,
-        ),
-        torch_mlir.OutputType.LINALG_ON_TENSORS,
-        use_tracing=False,
-        verbose=False,
-    )
-
-    module_placeholder = module
-    module_context = module_placeholder.context
-
-    def check_valid_line(line, line_n, mlir_file_len):
-        if "private" in line:
-            return False
-        if "attributes" in line:
-            return False
-        if mlir_file_len - line_n == 2:
-            return False
-
-        return True
-
-    mlir_file_len = len(str(module).split("\n"))
-
-    def remove_constant_dim(line):
-        if "17x" in line:
-            line = re.sub("17x", "?x", line)
-            line = re.sub("tensor.empty\(\)", "tensor.empty(%dim)", line)
-        if "tensor.empty" in line and "?x?" in line:
-            line = re.sub(
-                "tensor.empty\(%dim\)", "tensor.empty(%dim, %dim)", line
-            )
-        if "arith.cmpi eq" in line:
-            line = re.sub("c17", "dim", line)
-        if " 17," in line:
-            line = re.sub(" 17,", " %dim,", line)
-        return line
-
-    module = "\n".join(
-        [
-            remove_constant_dim(line)
-            for line, line_n in zip(
-                str(module).split("\n"), range(mlir_file_len)
-            )
-            if check_valid_line(line, line_n, mlir_file_len)
-        ]
-    )
-
-    module = module_placeholder.parse(module, context=module_context)
-    bytecode_stream = io.BytesIO()
-    module.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    f_ = open(path, "w+")
-    f_.write(str(module))
-    f_.close()
-    return
-
-
-def compile_ln_f(ln_f, hidden_layers, path):
-    hidden_layers_placeholder = torch_mlir.TensorPlaceholder.like(
-        hidden_layers, dynamic_axes=[1]
-    )
-    module = torch_mlir.compile(
-        ln_f,
-        (hidden_layers_placeholder),
-        torch_mlir.OutputType.LINALG_ON_TENSORS,
-        use_tracing=False,
-        verbose=False,
-    )
-
-    bytecode_stream = io.BytesIO()
-    module.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    f_ = open(path, "w+")
-    f_.write(str(module))
-    f_.close()
-    return
-
-
-def compile_lm_head(lm_head, hidden_layers, path):
-    hidden_layers_placeholder = torch_mlir.TensorPlaceholder.like(
-        hidden_layers, dynamic_axes=[1]
-    )
-    module = torch_mlir.compile(
-        lm_head,
-        (hidden_layers_placeholder),
-        torch_mlir.OutputType.LINALG_ON_TENSORS,
-        use_tracing=False,
-        verbose=False,
-    )
-
-    bytecode_stream = io.BytesIO()
-    module.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    f_ = open(path, "w+")
-    f_.write(str(module))
-    f_.close()
-    return
-
-
-def create_mlirs(destination_folder, model_name):
-    model_config = "bigscience/" + model_name
-    sample_input_ids = torch.ones([1, 17], dtype=torch.int64)
-
-    urllib.request.urlretrieve(
-        f"https://huggingface.co/bigscience/{model_name}/resolve/main/config.json",
-        filename=f"{destination_folder}/config.json",
-    )
-    urllib.request.urlretrieve(
-        f"https://huggingface.co/bigscience/bloom/resolve/main/tokenizer.json",
-        filename=f"{destination_folder}/tokenizer.json",
-    )
-
-    class HuggingFaceLanguage(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-            self.model = BloomForCausalLM.from_pretrained(model_config)
-
-        def forward(self, tokens):
-            return self.model.forward(tokens)[0]
-
-    class HuggingFaceBlock(torch.nn.Module):
-        def __init__(self, block):
-            super().__init__()
-            self.model = block
-
-        def forward(self, tokens, alibi, attention_mask):
-            output = self.model(
-                hidden_states=tokens,
-                alibi=alibi,
-                attention_mask=attention_mask,
-                use_cache=True,
-                output_attentions=False,
-            )
-            return (output[0], output[1][0], output[1][1])
-
-    model = HuggingFaceLanguage()
-
-    compile_embeddings(
-        model.model.transformer.word_embeddings,
-        sample_input_ids,
-        f"{destination_folder}/word_embeddings.mlir",
-    )
-
-    inputs_embeds = model.model.transformer.word_embeddings(sample_input_ids)
-
-    compile_word_embeddings_layernorm(
-        model.model.transformer.word_embeddings_layernorm,
-        inputs_embeds,
-        f"{destination_folder}/word_embeddings_layernorm.mlir",
-    )
-
-    hidden_states = model.model.transformer.word_embeddings_layernorm(
-        inputs_embeds
-    )
-
-    input_shape = sample_input_ids.size()
-
-    current_sequence_length = hidden_states.shape[1]
-    past_key_values_length = 0
-    past_key_values = tuple([None] * len(model.model.transformer.h))
-
-    attention_mask = torch.ones(
-        (hidden_states.shape[0], current_sequence_length), device="cpu"
-    )
-
-    alibi = build_alibi_tensor(
-        attention_mask,
-        model.model.transformer.n_head,
-        hidden_states.dtype,
-        "cpu",
-    )
-
-    causal_mask = _prepare_attn_mask(
-        attention_mask, input_shape, inputs_embeds, past_key_values_length
-    )
-
-    head_mask = model.model.transformer.get_head_mask(
-        None, model.model.transformer.config.n_layer
-    )
-    output_attentions = model.model.transformer.config.output_attentions
-
-    all_hidden_states = ()
-
-    for i, (block, layer_past) in enumerate(
-        zip(model.model.transformer.h, past_key_values)
-    ):
-        all_hidden_states = all_hidden_states + (hidden_states,)
-
-        proxy_model = HuggingFaceBlock(block)
-
-        compile_to_mlir(
-            proxy_model,
-            hidden_states,
-            layer_past=layer_past,
-            attention_mask=causal_mask,
-            head_mask=head_mask[i],
-            use_cache=True,
-            output_attentions=output_attentions,
-            alibi=alibi,
-            block_index=i,
-            path=f"{destination_folder}/bloom_block_{i}.mlir",
-        )
-
-    compile_ln_f(
-        model.model.transformer.ln_f,
-        hidden_states,
-        f"{destination_folder}/ln_f.mlir",
-    )
-    hidden_states = model.model.transformer.ln_f(hidden_states)
-    compile_lm_head(
-        model.model.lm_head,
-        hidden_states,
-        f"{destination_folder}/lm_head.mlir",
-    )
-
-
-def run_large_model(
-    token_count,
-    recompile,
-    model_path,
-    prompt,
-    device_list,
-    script_path,
-    device,
-):
-    f = open(f"{model_path}/prompt.txt", "w+")
-    f.write(prompt)
-    f.close()
-    for i in range(token_count):
-        if i == 0:
-            will_compile = recompile
-        else:
-            will_compile = False
-            f = open(f"{model_path}/prompt.txt", "r")
-            prompt = f.read()
-            f.close()
-
-        subprocess.run(
-            [
-                "python",
-                script_path,
-                model_path,
-                "start",
-                str(will_compile),
-                "cpu",
-                "None",
-                prompt,
-            ]
-        )
-        for i in range(config["n_layer"]):
-            if device_list is not None:
-                device_idx = str(device_list[i % len(device_list)])
-            else:
-                device_idx = "None"
-            subprocess.run(
-                [
-                    "python",
-                    script_path,
-                    model_path,
-                    str(i),
-                    str(will_compile),
-                    device,
-                    device_idx,
-                    prompt,
-                ]
-            )
-        subprocess.run(
-            [
-                "python",
-                script_path,
-                model_path,
-                "end",
-                str(will_compile),
-                "cpu",
-                "None",
-                prompt,
-            ]
-        )
-
-    f = open(f"{model_path}/prompt.txt", "r")
-    output = f.read()
-    f.close()
-    print(output)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(prog="Bloom-560m")
-    parser.add_argument("-p", "--model_path")
-    parser.add_argument("-dl", "--device_list", default=None)
-    parser.add_argument("-de", "--device", default="cpu")
-    parser.add_argument("-c", "--recompile", default=False, type=bool)
-    parser.add_argument("-d", "--download", default=False, type=bool)
-    parser.add_argument("-t", "--token_count", default=10, type=int)
-    parser.add_argument("-m", "--model_name", default="bloom-560m")
-    parser.add_argument("-cm", "--create_mlirs", default=False, type=bool)
-
-    parser.add_argument(
-        "-lm", "--large_model_memory_efficient", default=False, type=bool
-    )
-
-    parser.add_argument(
-        "-pr",
-        "--prompt",
-        default=None,
-    )
-    args = parser.parse_args()
-
-    if args.create_mlirs and args.large_model_memory_efficient:
-        print(
-            "Warning: If you need to use memory efficient mode, you probably want to use 'download' instead"
-        )
-
-    if not os.path.isdir(args.model_path):
-        os.mkdir(args.model_path)
-
-    if args.device_list is not None:
-        args.device_list = json.loads(args.device_list)
-
-    if args.device == "cuda" and args.device_list is not None:
-        IS_CUDA = True
-        from cuda.cudart import cudaSetDevice
-    if args.download and args.create_mlirs:
-        print(
-            "WARNING: It is not advised to turn on both download and create_mlirs"
-        )
-    if args.download:
-        download_model(args.model_path, args.model_name)
-    if args.create_mlirs:
-        create_mlirs(args.model_path, args.model_name)
-    from transformers import AutoTokenizer, AutoModelForCausalLM, BloomConfig
-
-    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
-    if args.prompt is not None:
-        input_ids = tokenizer.encode(args.prompt, return_tensors="pt")
-
-    if args.large_model_memory_efficient:
-        f = open(f"{args.model_path}/config.json")
-        config = json.load(f)
-        f.close()
-
-        self_path = os.path.dirname(os.path.abspath(__file__))
-        script_path = os.path.join(self_path, "sharded_bloom_large_models.py")
-
-        if args.prompt is not None:
-            run_large_model(
-                args.token_count,
-                args.recompile,
-                args.model_path,
-                args.prompt,
-                args.device_list,
-                script_path,
-                args.device,
-            )
-
-        else:
-            while True:
-                prompt = input("Enter Prompt: ")
-                try:
-                    token_count = int(
-                        input("Enter number of tokens you want to generate: ")
-                    )
-                except:
-                    print(
-                        "Invalid integer entered.  Using default value of 10"
-                    )
-                    token_count = 10
-
-                run_large_model(
-                    token_count,
-                    args.recompile,
-                    args.model_path,
-                    prompt,
-                    args.device_list,
-                    script_path,
-                    args.device,
-                )
-
-    else:
-        shardedbloom = ShardedBloom(args.model_path)
-        shardedbloom.init_layers(
-            device=args.device,
-            replace=args.recompile,
-            device_idx=args.device_list,
-        )
-        shardedbloom.load_layers()
-
-        if args.prompt is not None:
-            for _ in range(args.token_count):
-                next_token = shardedbloom.forward_pass(
-                    torch.tensor(input_ids), device=args.device
-                )
-                input_ids = torch.cat(
-                    [input_ids, next_token.unsqueeze(-1)], dim=-1
-                )
-
-            print(tokenizer.decode(input_ids.squeeze()))
-
-        else:
-            while True:
-                prompt = input("Enter Prompt: ")
-                try:
-                    token_count = int(
-                        input("Enter number of tokens you want to generate: ")
-                    )
-                except:
-                    print(
-                        "Invalid integer entered.  Using default value of 10"
-                    )
-                    token_count = 10
-
-                input_ids = tokenizer.encode(prompt, return_tensors="pt")
-
-                for _ in range(token_count):
-                    next_token = shardedbloom.forward_pass(
-                        torch.tensor(input_ids), device=args.device
-                    )
-                    input_ids = torch.cat(
-                        [input_ids, next_token.unsqueeze(-1)], dim=-1
-                    )
-
-                print(tokenizer.decode(input_ids.squeeze()))
--- a/shark/examples/shark_inference/sharded_bloom_large_models.py
+++ b/shark/examples/shark_inference/sharded_bloom_large_models.py
@@ -1,381 +0,0 @@
-import sys
-import os
-from transformers import AutoTokenizer, AutoModelForCausalLM, BloomConfig
-import re
-from shark.shark_inference import SharkInference
-import torch
-import torch.nn as nn
-from collections import OrderedDict
-from transformers.models.bloom.modeling_bloom import (
-    BloomBlock,
-    build_alibi_tensor,
-)
-import time
-import json
-
-
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int = None):
-    """
-    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-    """
-    batch_size, source_length = mask.size()
-    tgt_len = tgt_len if tgt_len is not None else source_length
-
-    expanded_mask = (
-        mask[:, None, None, :]
-        .expand(batch_size, 1, tgt_len, source_length)
-        .to(dtype)
-    )
-
-    inverted_mask = 1.0 - expanded_mask
-
-    return inverted_mask.masked_fill(
-        inverted_mask.to(torch.bool), torch.finfo(dtype).min
-    )
-
-
-def _prepare_attn_mask(
-    attention_mask, input_shape, inputs_embeds, past_key_values_length
-):
-    # create causal mask
-    # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-    combined_attention_mask = None
-    if input_shape[-1] > 1:
-        combined_attention_mask = _make_causal_mask(
-            input_shape,
-            inputs_embeds.dtype,
-            past_key_values_length=past_key_values_length,
-        ).to(attention_mask.device)
-
-    if attention_mask is not None:
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        expanded_attn_mask = _expand_mask(
-            attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
-        )
-        combined_attention_mask = (
-            expanded_attn_mask
-            if combined_attention_mask is None
-            else expanded_attn_mask + combined_attention_mask
-        )
-
-    return combined_attention_mask
-
-
-def _make_causal_mask(
-    input_ids_shape: torch.Size,
-    dtype: torch.dtype,
-    past_key_values_length: int = 0,
-):
-    """
-    Make causal mask used for bi-directional self-attention.
-    """
-    batch_size, target_length = input_ids_shape
-    mask = torch.full((target_length, target_length), torch.finfo(dtype).min)
-    mask_cond = torch.arange(mask.size(-1))
-    intermediate_mask = mask_cond < (mask_cond + 1).view(mask.size(-1), 1)
-    mask.masked_fill_(intermediate_mask, 0)
-    mask = mask.to(dtype)
-
-    if past_key_values_length > 0:
-        mask = torch.cat(
-            [
-                torch.zeros(
-                    target_length, past_key_values_length, dtype=dtype
-                ),
-                mask,
-            ],
-            dim=-1,
-        )
-    expanded_mask = mask[None, None, :, :].expand(
-        batch_size, 1, target_length, target_length + past_key_values_length
-    )
-    return expanded_mask
-
-
-if __name__ == "__main__":
-    working_dir = sys.argv[1]
-    layer_name = sys.argv[2]
-    will_compile = sys.argv[3]
-    device = sys.argv[4]
-    device_idx = sys.argv[5]
-    prompt = sys.argv[6]
-
-    if device_idx.lower().strip() == "none":
-        device_idx = None
-    else:
-        device_idx = int(device_idx)
-
-    if will_compile.lower().strip() == "true":
-        will_compile = True
-    else:
-        will_compile = False
-
-    f = open(f"{working_dir}/config.json")
-    config = json.load(f)
-    f.close()
-
-    layers_initialized = False
-    try:
-        n_embed = config["n_embed"]
-    except KeyError:
-        n_embed = config["hidden_size"]
-    vocab_size = config["vocab_size"]
-    n_layer = config["n_layer"]
-    try:
-        n_head = config["num_attention_heads"]
-    except KeyError:
-        n_head = config["n_head"]
-
-    if not os.path.isdir(working_dir):
-        os.mkdir(working_dir)
-
-    if layer_name == "start":
-        tokenizer = AutoTokenizer.from_pretrained(working_dir)
-        input_ids = tokenizer.encode(prompt, return_tensors="pt")
-
-        mlir_str = ""
-
-        if will_compile:
-            f = open(f"{working_dir}/word_embeddings.mlir", encoding="utf-8")
-            mlir_str = f.read()
-            f.close()
-
-            mlir_str = bytes(mlir_str, "utf-8")
-
-        shark_module = SharkInference(
-            mlir_str,
-            device="cpu",
-            mlir_dialect="tm_tensor",
-            device_idx=None,
-        )
-
-        if will_compile:
-            shark_module.save_module(
-                module_name=f"{working_dir}/word_embeddings",
-                extra_args=[
-                    "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                    "--iree-stream-resource-max-allocation-size=1000000000",
-                    "--iree-codegen-check-ir-before-llvm-conversion=false",
-                ],
-            )
-
-        shark_module.load_module(f"{working_dir}/word_embeddings.vmfb")
-        input_embeds = shark_module(
-            inputs=(input_ids,), function_name="forward"
-        )
-        input_embeds = torch.tensor(input_embeds).float()
-
-        mlir_str = ""
-
-        if will_compile:
-            f = open(
-                f"{working_dir}/word_embeddings_layernorm.mlir",
-                encoding="utf-8",
-            )
-            mlir_str = f.read()
-            f.close()
-
-        shark_module = SharkInference(
-            mlir_str,
-            device="cpu",
-            mlir_dialect="tm_tensor",
-            device_idx=None,
-        )
-
-        if will_compile:
-            shark_module.save_module(
-                module_name=f"{working_dir}/word_embeddings_layernorm",
-                extra_args=[
-                    "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                    "--iree-stream-resource-max-allocation-size=1000000000",
-                    "--iree-codegen-check-ir-before-llvm-conversion=false",
-                ],
-            )
-
-        shark_module.load_module(
-            f"{working_dir}/word_embeddings_layernorm.vmfb"
-        )
-        hidden_states = shark_module(
-            inputs=(input_embeds,), function_name="forward"
-        )
-        hidden_states = torch.tensor(hidden_states).float()
-
-        torch.save(hidden_states, f"{working_dir}/hidden_states_0.pt")
-
-        attention_mask = torch.ones(
-            [hidden_states.shape[0], len(input_ids[0])]
-        )
-
-        attention_mask = torch.tensor(attention_mask).float()
-
-        alibi = build_alibi_tensor(
-            attention_mask,
-            n_head,
-            hidden_states.dtype,
-            device="cpu",
-        )
-
-        torch.save(alibi, f"{working_dir}/alibi.pt")
-
-        causal_mask = _prepare_attn_mask(
-            attention_mask, input_ids.size(), input_embeds, 0
-        )
-        causal_mask = torch.tensor(causal_mask).float()
-
-        torch.save(causal_mask, f"{working_dir}/causal_mask.pt")
-
-    elif layer_name in [str(x) for x in range(n_layer)]:
-        hidden_states = torch.load(
-            f"{working_dir}/hidden_states_{layer_name}.pt"
-        )
-        alibi = torch.load(f"{working_dir}/alibi.pt")
-        causal_mask = torch.load(f"{working_dir}/causal_mask.pt")
-
-        mlir_str = ""
-
-        if will_compile:
-            f = open(
-                f"{working_dir}/bloom_block_{layer_name}.mlir",
-                encoding="utf-8",
-            )
-            mlir_str = f.read()
-            f.close()
-
-            mlir_str = bytes(mlir_str, "utf-8")
-
-        shark_module = SharkInference(
-            mlir_str,
-            device=device,
-            mlir_dialect="tm_tensor",
-            device_idx=device_idx,
-        )
-
-        if will_compile:
-            shark_module.save_module(
-                module_name=f"{working_dir}/bloom_block_{layer_name}",
-                extra_args=[
-                    "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                    "--iree-stream-resource-max-allocation-size=1000000000",
-                    "--iree-codegen-check-ir-before-llvm-conversion=false",
-                ],
-            )
-
-        shark_module.load_module(
-            f"{working_dir}/bloom_block_{layer_name}.vmfb"
-        )
-
-        output = shark_module(
-            inputs=(
-                hidden_states.detach().numpy(),
-                alibi.detach().numpy(),
-                causal_mask.detach().numpy(),
-            ),
-            function_name="forward",
-        )
-
-        hidden_states = torch.tensor(output[0]).float()
-
-        torch.save(
-            hidden_states,
-            f"{working_dir}/hidden_states_{int(layer_name) + 1}.pt",
-        )
-
-    elif layer_name == "end":
-        mlir_str = ""
-
-        if will_compile:
-            f = open(f"{working_dir}/ln_f.mlir", encoding="utf-8")
-            mlir_str = f.read()
-            f.close()
-
-            mlir_str = bytes(mlir_str, "utf-8")
-
-        shark_module = SharkInference(
-            mlir_str,
-            device="cpu",
-            mlir_dialect="tm_tensor",
-            device_idx=None,
-        )
-
-        if will_compile:
-            shark_module.save_module(
-                module_name=f"{working_dir}/ln_f",
-                extra_args=[
-                    "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                    "--iree-stream-resource-max-allocation-size=1000000000",
-                    "--iree-codegen-check-ir-before-llvm-conversion=false",
-                ],
-            )
-
-        shark_module.load_module(f"{working_dir}/ln_f.vmfb")
-
-        hidden_states = torch.load(f"{working_dir}/hidden_states_{n_layer}.pt")
-
-        hidden_states = shark_module(
-            inputs=(hidden_states,), function_name="forward"
-        )
-
-        mlir_str = ""
-
-        if will_compile:
-            f = open(f"{working_dir}/lm_head.mlir", encoding="utf-8")
-            mlir_str = f.read()
-            f.close()
-
-            mlir_str = bytes(mlir_str, "utf-8")
-
-        if config["n_embed"] == 14336:
-
-            def get_state_dict():
-                d = torch.load(
-                    f"{working_dir}/pytorch_model_00001-of-00072.bin"
-                )
-                return OrderedDict(
-                    (k.replace("word_embeddings.", ""), v)
-                    for k, v in d.items()
-                )
-
-            def load_causal_lm_head():
-                linear = nn.utils.skip_init(
-                    nn.Linear, 14336, 250880, bias=False, dtype=torch.float
-                )
-                linear.load_state_dict(get_state_dict(), strict=False)
-                return linear.float()
-
-            lm_head = load_causal_lm_head()
-
-            logits = lm_head(torch.tensor(hidden_states).float())
-
-        else:
-            shark_module = SharkInference(
-                mlir_str,
-                device="cpu",
-                mlir_dialect="tm_tensor",
-                device_idx=None,
-            )
-
-            if will_compile:
-                shark_module.save_module(
-                    module_name=f"{working_dir}/lm_head",
-                    extra_args=[
-                        "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
-                        "--iree-stream-resource-max-allocation-size=1000000000",
-                        "--iree-codegen-check-ir-before-llvm-conversion=false",
-                    ],
-                )
-
-            shark_module.load_module(f"{working_dir}/lm_head.vmfb")
-
-            logits = shark_module(
-                inputs=(hidden_states,), function_name="forward"
-            )
-
-        logits = torch.tensor(logits).float()
-
-        tokenizer = AutoTokenizer.from_pretrained(working_dir)
-
-        next_token = tokenizer.decode(torch.argmax(logits[:, -1, :], dim=-1))
-
-        f = open(f"{working_dir}/prompt.txt", "w+")
-        f.write(prompt + next_token)
-        f.close()
--- a/shark/examples/shark_training/stable_diffusion/README.md
+++ b/shark/examples/shark_training/stable_diffusion/README.md
@@ -1,43 +0,0 @@
-# Stable Diffusion Fine Tuning
-
-## Installation (Linux)
-
-### Activate shark.venv Virtual Environment
-
-```shell
-source shark.venv/bin/activate
-
-# Some older pip installs may not be able to handle the recent PyTorch deps
-python -m pip install --upgrade pip
-```
-
-## Install dependencies
-
-### Run the following installation commands:
-```
-pip install -U git+https://github.com/huggingface/diffusers.git
-pip install accelerate transformers ftfy
-```
-
-### Build torch-mlir with the following branch:
-
-Please cherry-pick this branch of torch-mlir: https://github.com/vivekkhandelwal1/torch-mlir/tree/sd-ops
-and build it locally. You can find the instructions for using locally build Torch-MLIR,
-here: https://github.com/nod-ai/SHARK#how-to-use-your-locally-built-iree--torch-mlir-with-shark
-
-## Run the Stable diffusion fine tuning
-
-To run the model with the default set of images and params, run:
-```shell
-python stable_diffusion_fine_tuning.py
-```
-By default the training is run through the PyTorch path. If you want to train the model using the Torchdynamo path of Torch-MLIR, you need to specify `--use_torchdynamo=True`.
-
-The default number of training steps are `2000`, which would take many hours to complete based on your system config. You can pass the smaller value with the arg `--training_steps`. You can specify the number of images to be sampled for the result with the `--num_inference_samples` arg. For the number of inference steps you can use `--inference_steps` flag.
-
-For example, you can run the training for a limited set of steps via the dynamo path by using the following command:
-```
-python stable_diffusion_fine_tuning.py --training_steps=1 --inference_steps=1 --num_inference_samples=1 --train_batch_size=1 --use_torchdynamo=True
-```
-
-You can also specify the device to be used via the flag `--device`. The default value is `cpu`, for GPU execution you can specify `--device="cuda"`.
--- a/shark/examples/shark_training/stable_diffusion/stable_diffusion_fine_tuning.py
+++ b/shark/examples/shark_training/stable_diffusion/stable_diffusion_fine_tuning.py
@@ -1,914 +0,0 @@
-# Install the required libs
-# pip install -U git+https://github.com/huggingface/diffusers.git
-# pip install accelerate transformers ftfy
-
-# Import required libraries
-import argparse
-import itertools
-import math
-import os
-from typing import List
-import random
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from torch.utils.data import Dataset
-
-import PIL
-import logging
-
-import torch_mlir
-from torch_mlir.dynamo import make_simple_dynamo_backend
-import torch._dynamo as dynamo
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
-from shark.shark_inference import SharkInference
-
-torch._dynamo.config.verbose = True
-
-from diffusers import (
-    AutoencoderKL,
-    DDPMScheduler,
-    PNDMScheduler,
-    StableDiffusionPipeline,
-    UNet2DConditionModel,
-)
-from diffusers.optimization import get_scheduler
-from diffusers.pipelines.stable_diffusion import (
-    StableDiffusionSafetyChecker,
-)
-from PIL import Image
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import (
-    CLIPFeatureExtractor,
-    CLIPTextModel,
-    CLIPTokenizer,
-)
-
-
-# Enter your HuggingFace Token
-# Note: You can comment this prompt and just set your token instead of passing it through cli for every execution.
-hf_token = input("Please enter your huggingface token here: ")
-YOUR_TOKEN = hf_token
-
-
-def image_grid(imgs, rows, cols):
-    assert len(imgs) == rows * cols
-
-    w, h = imgs[0].size
-    grid = Image.new("RGB", size=(cols * w, rows * h))
-    grid_w, grid_h = grid.size
-
-    for i, img in enumerate(imgs):
-        grid.paste(img, box=(i % cols * w, i // cols * h))
-    return grid
-
-
-# `pretrained_model_name_or_path` which Stable Diffusion checkpoint you want to use
-# Options: 1.) "stabilityai/stable-diffusion-2"
-#          2.) "stabilityai/stable-diffusion-2-base"
-#          3.) "CompVis/stable-diffusion-v1-4"
-#          4.) "runwayml/stable-diffusion-v1-5"
-pretrained_model_name_or_path = "stabilityai/stable-diffusion-2"
-
-# Add here the URLs to the images of the concept you are adding. 3-5 should be fine
-urls = [
-    "https://huggingface.co/datasets/valhalla/images/resolve/main/2.jpeg",
-    "https://huggingface.co/datasets/valhalla/images/resolve/main/3.jpeg",
-    "https://huggingface.co/datasets/valhalla/images/resolve/main/5.jpeg",
-    "https://huggingface.co/datasets/valhalla/images/resolve/main/6.jpeg",
-    ## You can add additional images here
-]
-
-# Downloading Images
-import requests
-import glob
-from io import BytesIO
-
-
-def download_image(url):
-    try:
-        response = requests.get(url)
-    except:
-        return None
-    return Image.open(BytesIO(response.content)).convert("RGB")
-
-
-images = list(filter(None, [download_image(url) for url in urls]))
-save_path = "./my_concept"
-if not os.path.exists(save_path):
-    os.mkdir(save_path)
-[image.save(f"{save_path}/{i}.jpeg") for i, image in enumerate(images)]
-
-p = argparse.ArgumentParser(
-    description=__doc__,
-    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-)
-p.add_argument(
-    "--input_dir",
-    type=str,
-    default="my_concept/",
-    help="the directory contains the images used for fine tuning",
-)
-p.add_argument(
-    "--output_dir",
-    type=str,
-    default="sd_result",
-    help="the directory contains the images used for fine tuning",
-)
-p.add_argument(
-    "--training_steps",
-    type=int,
-    default=2000,
-    help="the maximum number of training steps",
-)
-p.add_argument(
-    "--train_batch_size",
-    type=int,
-    default=4,
-    help="The batch size for training",
-)
-p.add_argument(
-    "--save_steps",
-    type=int,
-    default=250,
-    help="the number of steps after which to save the learned concept",
-)
-p.add_argument("--seed", type=int, default=42, help="the random seed")
-p.add_argument(
-    "--what_to_teach",
-    type=str,
-    choices=["object", "style"],
-    default="object",
-    help="what is it that you are teaching?",
-)
-p.add_argument(
-    "--placeholder_token",
-    type=str,
-    default="<cat-toy>",
-    help="It is the token you are going to use to represent your new concept",
-)
-p.add_argument(
-    "--initializer_token",
-    type=str,
-    default="toy",
-    help="It is a word that can summarise what is your new concept",
-)
-p.add_argument(
-    "--inference_steps",
-    type=int,
-    default=50,
-    help="the number of steps for inference",
-)
-p.add_argument(
-    "--num_inference_samples",
-    type=int,
-    default=4,
-    help="the number of samples for inference",
-)
-p.add_argument(
-    "--prompt",
-    type=str,
-    default="a grafitti in a wall with a *s on it",
-    help="the text prompt to use",
-)
-p.add_argument(
-    "--device",
-    type=str,
-    default="cpu",
-    help="The device to use",
-)
-p.add_argument(
-    "--use_torchdynamo",
-    type=bool,
-    default=False,
-    help="This flag is used to determine whether the training has to be done through the torchdynamo path or not.",
-)
-args = p.parse_args()
-torch.manual_seed(args.seed)
-
-if "*s" not in args.prompt:
-    raise ValueError(
-        f'The prompt should have a "*s" which will be replaced by a placeholder token.'
-    )
-
-prompt1, prompt2 = args.prompt.split("*s")
-args.prompt = prompt1 + args.placeholder_token + prompt2
-
-# `images_path` is a path to directory containing the training images.
-images_path = args.input_dir
-while not os.path.exists(str(images_path)):
-    print(
-        "The images_path specified does not exist, use the colab file explorer to copy the path :"
-    )
-    images_path = input("")
-save_path = images_path
-
-# Setup and check the images you have just added
-images = []
-for file_path in os.listdir(save_path):
-    try:
-        image_path = os.path.join(save_path, file_path)
-        images.append(Image.open(image_path).resize((512, 512)))
-    except:
-        print(
-            f"{image_path} is not a valid image, please make sure to remove this file from the directory otherwise the training could fail."
-        )
-image_grid(images, 1, len(images))
-
-########### Create Dataset ##########
-
-# Setup the prompt templates for training
-imagenet_templates_small = [
-    "a photo of a {}",
-    "a rendering of a {}",
-    "a cropped photo of the {}",
-    "the photo of a {}",
-    "a photo of a clean {}",
-    "a photo of a dirty {}",
-    "a dark photo of the {}",
-    "a photo of my {}",
-    "a photo of the cool {}",
-    "a close-up photo of a {}",
-    "a bright photo of the {}",
-    "a cropped photo of a {}",
-    "a photo of the {}",
-    "a good photo of the {}",
-    "a photo of one {}",
-    "a close-up photo of the {}",
-    "a rendition of the {}",
-    "a photo of the clean {}",
-    "a rendition of a {}",
-    "a photo of a nice {}",
-    "a good photo of a {}",
-    "a photo of the nice {}",
-    "a photo of the small {}",
-    "a photo of the weird {}",
-    "a photo of the large {}",
-    "a photo of a cool {}",
-    "a photo of a small {}",
-]
-
-imagenet_style_templates_small = [
-    "a painting in the style of {}",
-    "a rendering in the style of {}",
-    "a cropped painting in the style of {}",
-    "the painting in the style of {}",
-    "a clean painting in the style of {}",
-    "a dirty painting in the style of {}",
-    "a dark painting in the style of {}",
-    "a picture in the style of {}",
-    "a cool painting in the style of {}",
-    "a close-up painting in the style of {}",
-    "a bright painting in the style of {}",
-    "a cropped painting in the style of {}",
-    "a good painting in the style of {}",
-    "a close-up painting in the style of {}",
-    "a rendition in the style of {}",
-    "a nice painting in the style of {}",
-    "a small painting in the style of {}",
-    "a weird painting in the style of {}",
-    "a large painting in the style of {}",
-]
-
-
-# Setup the dataset
-class TextualInversionDataset(Dataset):
-    def __init__(
-        self,
-        data_root,
-        tokenizer,
-        learnable_property="object",  # [object, style]
-        size=512,
-        repeats=100,
-        interpolation="bicubic",
-        flip_p=0.5,
-        set="train",
-        placeholder_token="*",
-        center_crop=False,
-    ):
-        self.data_root = data_root
-        self.tokenizer = tokenizer
-        self.learnable_property = learnable_property
-        self.size = size
-        self.placeholder_token = placeholder_token
-        self.center_crop = center_crop
-        self.flip_p = flip_p
-
-        self.image_paths = [
-            os.path.join(self.data_root, file_path)
-            for file_path in os.listdir(self.data_root)
-        ]
-
-        self.num_images = len(self.image_paths)
-        self._length = self.num_images
-
-        if set == "train":
-            self._length = self.num_images * repeats
-
-        self.interpolation = {
-            "linear": PIL.Image.LINEAR,
-            "bilinear": PIL.Image.BILINEAR,
-            "bicubic": PIL.Image.BICUBIC,
-            "lanczos": PIL.Image.LANCZOS,
-        }[interpolation]
-
-        self.templates = (
-            imagenet_style_templates_small
-            if learnable_property == "style"
-            else imagenet_templates_small
-        )
-        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
-
-    def __len__(self):
-        return self._length
-
-    def __getitem__(self, i):
-        example = {}
-        image = Image.open(self.image_paths[i % self.num_images])
-
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
-
-        placeholder_string = self.placeholder_token
-        text = random.choice(self.templates).format(placeholder_string)
-
-        example["input_ids"] = self.tokenizer(
-            text,
-            padding="max_length",
-            truncation=True,
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
-        ).input_ids[0]
-
-        # default to score-sde preprocessing
-        img = np.array(image).astype(np.uint8)
-
-        if self.center_crop:
-            crop = min(img.shape[0], img.shape[1])
-            (
-                h,
-                w,
-            ) = (
-                img.shape[0],
-                img.shape[1],
-            )
-            img = img[
-                (h - crop) // 2 : (h + crop) // 2,
-                (w - crop) // 2 : (w + crop) // 2,
-            ]
-
-        image = Image.fromarray(img)
-        image = image.resize(
-            (self.size, self.size), resample=self.interpolation
-        )
-
-        image = self.flip_transform(image)
-        image = np.array(image).astype(np.uint8)
-        image = (image / 127.5 - 1.0).astype(np.float32)
-
-        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
-        return example
-
-
-########## Setting up the model ##########
-
-# Load the tokenizer and add the placeholder token as a additional special token.
-tokenizer = CLIPTokenizer.from_pretrained(
-    pretrained_model_name_or_path,
-    subfolder="tokenizer",
-)
-
-# Add the placeholder token in tokenizer
-num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
-if num_added_tokens == 0:
-    raise ValueError(
-        f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
-        " `placeholder_token` that is not already in the tokenizer."
-    )
-
-# Get token ids for our placeholder and initializer token.
-# This code block will complain if initializer string is not a single token
-# Convert the initializer_token, placeholder_token to ids
-token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
-# Check if initializer_token is a single token or a sequence of tokens
-if len(token_ids) > 1:
-    raise ValueError("The initializer token must be a single token.")
-
-initializer_token_id = token_ids[0]
-placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
-
-# Load the Stable Diffusion model
-# Load models and create wrapper for stable diffusion
-# pipeline = StableDiffusionPipeline.from_pretrained(pretrained_model_name_or_path)
-# del pipeline
-text_encoder = CLIPTextModel.from_pretrained(
-    pretrained_model_name_or_path, subfolder="text_encoder"
-)
-vae = AutoencoderKL.from_pretrained(
-    pretrained_model_name_or_path, subfolder="vae"
-)
-unet = UNet2DConditionModel.from_pretrained(
-    pretrained_model_name_or_path, subfolder="unet"
-)
-
-# We have added the placeholder_token in the tokenizer so we resize the token embeddings here
-# this will a new embedding vector in the token embeddings for our placeholder_token
-text_encoder.resize_token_embeddings(len(tokenizer))
-
-# Initialise the newly added placeholder token with the embeddings of the initializer token
-token_embeds = text_encoder.get_input_embeddings().weight.data
-token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]
-
-# In Textual-Inversion we only train the newly added embedding vector
-#  so lets freeze rest of the model parameters here
-
-
-def freeze_params(params):
-    for param in params:
-        param.requires_grad = False
-
-
-# Freeze vae and unet
-freeze_params(vae.parameters())
-freeze_params(unet.parameters())
-# Freeze all parameters except for the token embeddings in text encoder
-params_to_freeze = itertools.chain(
-    text_encoder.text_model.encoder.parameters(),
-    text_encoder.text_model.final_layer_norm.parameters(),
-    text_encoder.text_model.embeddings.position_embedding.parameters(),
-)
-freeze_params(params_to_freeze)
-
-
-# Move vae and unet to device
-# For the dynamo path default compilation device is `cpu`, since torch-mlir
-# supports only that. Therefore, convert to device only for PyTorch path.
-if not args.use_torchdynamo:
-    vae.to(args.device)
-    unet.to(args.device)
-
-# Keep vae in eval mode as we don't train it
-vae.eval()
-# Keep unet in train mode to enable gradient checkpointing
-unet.train()
-
-
-class VaeModel(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.vae = vae
-
-    def forward(self, input):
-        x = self.vae.encode(input, return_dict=False)[0]
-        return x
-
-
-class UnetModel(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.unet = unet
-
-    def forward(self, x, y, z):
-        return self.unet.forward(x, y, z, return_dict=False)[0]
-
-
-shark_vae = VaeModel()
-shark_unet = UnetModel()
-
-####### Creating our training data ########
-
-# Let's create the Dataset and Dataloader
-train_dataset = TextualInversionDataset(
-    data_root=save_path,
-    tokenizer=tokenizer,
-    size=vae.sample_size,
-    placeholder_token=args.placeholder_token,
-    repeats=100,
-    learnable_property=args.what_to_teach,  # Option selected above between object and style
-    center_crop=False,
-    set="train",
-)
-
-
-def create_dataloader(train_batch_size=1):
-    return torch.utils.data.DataLoader(
-        train_dataset, batch_size=train_batch_size, shuffle=True
-    )
-
-
-# Create noise_scheduler for training
-noise_scheduler = DDPMScheduler.from_config(
-    pretrained_model_name_or_path, subfolder="scheduler"
-)
-
-######## Training ###########
-
-# Define hyperparameters for our training. If you are not happy with your results,
-# you can tune the `learning_rate` and the `max_train_steps`
-
-# Setting up all training args
-hyperparameters = {
-    "learning_rate": 5e-04,
-    "scale_lr": True,
-    "max_train_steps": args.training_steps,
-    "save_steps": args.save_steps,
-    "train_batch_size": args.train_batch_size,
-    "gradient_accumulation_steps": 1,
-    "gradient_checkpointing": True,
-    "mixed_precision": "fp16",
-    "seed": 42,
-    "output_dir": "sd-concept-output",
-}
-# creating output directory
-cwd = os.getcwd()
-out_dir = os.path.join(cwd, hyperparameters["output_dir"])
-while not os.path.exists(str(out_dir)):
-    try:
-        os.mkdir(out_dir)
-    except OSError as error:
-        print("Output directory not created")
-
-###### Torch-MLIR Compilation ######
-
-
-def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
-    removed_indexes = []
-    for node in fx_g.graph.nodes:
-        if node.op == "output":
-            assert (
-                len(node.args) == 1
-            ), "Output node must have a single argument"
-            node_arg = node.args[0]
-            if isinstance(node_arg, (list, tuple)):
-                node_arg = list(node_arg)
-                node_args_len = len(node_arg)
-                for i in range(node_args_len):
-                    curr_index = node_args_len - (i + 1)
-                    if node_arg[curr_index] is None:
-                        removed_indexes.append(curr_index)
-                        node_arg.pop(curr_index)
-                node.args = (tuple(node_arg),)
-                break
-
-    if len(removed_indexes) > 0:
-        fx_g.graph.lint()
-        fx_g.graph.eliminate_dead_code()
-        fx_g.recompile()
-    removed_indexes.sort()
-    return removed_indexes
-
-
-def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
-    """
-    Replace tuple with tuple element in functions that return one-element tuples.
-    Returns true if an unwrapping took place, and false otherwise.
-    """
-    unwrapped_tuple = False
-    for node in fx_g.graph.nodes:
-        if node.op == "output":
-            assert (
-                len(node.args) == 1
-            ), "Output node must have a single argument"
-            node_arg = node.args[0]
-            if isinstance(node_arg, tuple):
-                if len(node_arg) == 1:
-                    node.args = (node_arg[0],)
-                    unwrapped_tuple = True
-                    break
-
-    if unwrapped_tuple:
-        fx_g.graph.lint()
-        fx_g.recompile()
-    return unwrapped_tuple
-
-
-def _returns_nothing(fx_g: torch.fx.GraphModule) -> bool:
-    for node in fx_g.graph.nodes:
-        if node.op == "output":
-            assert (
-                len(node.args) == 1
-            ), "Output node must have a single argument"
-            node_arg = node.args[0]
-            if isinstance(node_arg, tuple):
-                return len(node_arg) == 0
-    return False
-
-
-def transform_fx(fx_g):
-    for node in fx_g.graph.nodes:
-        if node.op == "call_function":
-            if node.target in [
-                torch.ops.aten.empty,
-            ]:
-                # aten.empty should be filled with zeros.
-                if node.target in [torch.ops.aten.empty]:
-                    with fx_g.graph.inserting_after(node):
-                        new_node = fx_g.graph.call_function(
-                            torch.ops.aten.zero_,
-                            args=(node,),
-                        )
-                        node.append(new_node)
-                        node.replace_all_uses_with(new_node)
-                        new_node.args = (node,)
-
-    fx_g.graph.lint()
-
-
-@make_simple_dynamo_backend
-def refbackend_torchdynamo_backend(
-    fx_graph: torch.fx.GraphModule, example_inputs: List[torch.Tensor]
-):
-    # handling usage of empty tensor without initializing
-    transform_fx(fx_graph)
-    fx_graph.recompile()
-    if _returns_nothing(fx_graph):
-        return fx_graph
-    removed_none_indexes = _remove_nones(fx_graph)
-    was_unwrapped = _unwrap_single_tuple_return(fx_graph)
-
-    mlir_module = torch_mlir.compile(
-        fx_graph, example_inputs, output_type="linalg-on-tensors"
-    )
-
-    bytecode_stream = BytesIO()
-    mlir_module.operation.write_bytecode(bytecode_stream)
-    bytecode = bytecode_stream.getvalue()
-
-    shark_module = SharkInference(
-        mlir_module=bytecode, device=args.device, mlir_dialect="tm_tensor"
-    )
-    shark_module.compile()
-
-    def compiled_callable(*inputs):
-        inputs = [x.numpy() for x in inputs]
-        result = shark_module("forward", inputs)
-        if was_unwrapped:
-            result = [
-                result,
-            ]
-        if not isinstance(result, list):
-            result = torch.from_numpy(result)
-        else:
-            result = tuple(torch.from_numpy(x) for x in result)
-            result = list(result)
-            for removed_index in removed_none_indexes:
-                result.insert(removed_index, None)
-            result = tuple(result)
-        return result
-
-    return compiled_callable
-
-
-def predictions(torch_func, jit_func, batchA, batchB):
-    res = jit_func(batchA.numpy(), batchB.numpy())
-    if res is not None:
-        prediction = res
-    else:
-        prediction = None
-    return prediction
-
-
-logger = logging.getLogger(__name__)
-
-
-# def save_progress(text_encoder, placeholder_token_id, accelerator, save_path):
-def save_progress(text_encoder, placeholder_token_id, save_path):
-    logger.info("Saving embeddings")
-    learned_embeds = (
-        # accelerator.unwrap_model(text_encoder)
-        text_encoder.get_input_embeddings().weight[placeholder_token_id]
-    )
-    learned_embeds_dict = {
-        args.placeholder_token: learned_embeds.detach().cpu()
-    }
-    torch.save(learned_embeds_dict, save_path)
-
-
-train_batch_size = hyperparameters["train_batch_size"]
-gradient_accumulation_steps = hyperparameters["gradient_accumulation_steps"]
-learning_rate = hyperparameters["learning_rate"]
-if hyperparameters["scale_lr"]:
-    learning_rate = (
-        learning_rate
-        * gradient_accumulation_steps
-        * train_batch_size
-        # * accelerator.num_processes
-    )
-
-# Initialize the optimizer
-optimizer = torch.optim.AdamW(
-    text_encoder.get_input_embeddings().parameters(),  # only optimize the embeddings
-    lr=learning_rate,
-)
-
-
-# Training function
-def train_func(batch_pixel_values, batch_input_ids):
-    # Convert images to latent space
-    latents = shark_vae(batch_pixel_values).sample().detach()
-    latents = latents * 0.18215
-
-    # Sample noise that we'll add to the latents
-    noise = torch.randn_like(latents)
-    bsz = latents.shape[0]
-    # Sample a random timestep for each image
-    timesteps = torch.randint(
-        0,
-        noise_scheduler.num_train_timesteps,
-        (bsz,),
-        device=latents.device,
-    ).long()
-
-    # Add noise to the latents according to the noise magnitude at each timestep
-    # (this is the forward diffusion process)
-    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
-
-    # Get the text embedding for conditioning
-    encoder_hidden_states = text_encoder(batch_input_ids)[0]
-
-    # Predict the noise residual
-    noise_pred = shark_unet(
-        noisy_latents,
-        timesteps,
-        encoder_hidden_states,
-    )
-
-    # Get the target for loss depending on the prediction type
-    if noise_scheduler.config.prediction_type == "epsilon":
-        target = noise
-    elif noise_scheduler.config.prediction_type == "v_prediction":
-        target = noise_scheduler.get_velocity(latents, noise, timesteps)
-    else:
-        raise ValueError(
-            f"Unknown prediction type {noise_scheduler.config.prediction_type}"
-        )
-
-    loss = (
-        F.mse_loss(noise_pred, target, reduction="none").mean([1, 2, 3]).mean()
-    )
-    loss.backward()
-
-    # Zero out the gradients for all token embeddings except the newly added
-    # embeddings for the concept, as we only want to optimize the concept embeddings
-    grads = text_encoder.get_input_embeddings().weight.grad
-    # Get the index for tokens that we want to zero the grads for
-    index_grads_to_zero = torch.arange(len(tokenizer)) != placeholder_token_id
-    grads.data[index_grads_to_zero, :] = grads.data[
-        index_grads_to_zero, :
-    ].fill_(0)
-
-    optimizer.step()
-    optimizer.zero_grad()
-
-    return loss
-
-
-def training_function():
-    max_train_steps = hyperparameters["max_train_steps"]
-    output_dir = hyperparameters["output_dir"]
-    gradient_checkpointing = hyperparameters["gradient_checkpointing"]
-
-    train_dataloader = create_dataloader(train_batch_size)
-
-    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(
-        len(train_dataloader) / gradient_accumulation_steps
-    )
-    num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
-
-    # Train!
-    total_batch_size = (
-        train_batch_size
-        * gradient_accumulation_steps
-        # train_batch_size * accelerator.num_processes * gradient_accumulation_steps
-    )
-
-    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(train_dataset)}")
-    logger.info(f"  Instantaneous batch size per device = {train_batch_size}")
-    logger.info(
-        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
-    )
-    logger.info(
-        f"  Gradient Accumulation steps = {gradient_accumulation_steps}"
-    )
-    logger.info(f"  Total optimization steps = {max_train_steps}")
-    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(
-        # range(max_train_steps), disable=not accelerator.is_local_main_process
-        range(max_train_steps)
-    )
-    progress_bar.set_description("Steps")
-    global_step = 0
-
-    params_ = [i for i in text_encoder.get_input_embeddings().parameters()]
-    if args.use_torchdynamo:
-        print("******** TRAINING STARTED - TORCHYDNAMO PATH ********")
-    else:
-        print("******** TRAINING STARTED - PYTORCH PATH ********")
-    print("Initial weights:")
-    print(params_, params_[0].shape)
-
-    for epoch in range(num_train_epochs):
-        text_encoder.train()
-        for step, batch in enumerate(train_dataloader):
-            if args.use_torchdynamo:
-                dynamo_callable = dynamo.optimize(
-                    refbackend_torchdynamo_backend
-                )(train_func)
-                lam_func = lambda x, y: dynamo_callable(
-                    torch.from_numpy(x), torch.from_numpy(y)
-                )
-                loss = predictions(
-                    train_func,
-                    lam_func,
-                    batch["pixel_values"],
-                    batch["input_ids"],
-                    # params[0].detach(),
-                )
-            else:
-                loss = train_func(batch["pixel_values"], batch["input_ids"])
-            print(loss)
-
-            # Checks if the accelerator has performed an optimization step behind the scenes
-            progress_bar.update(1)
-            global_step += 1
-            if global_step % hyperparameters["save_steps"] == 0:
-                save_path = os.path.join(
-                    output_dir,
-                    f"learned_embeds-step-{global_step}.bin",
-                )
-                save_progress(
-                    text_encoder,
-                    placeholder_token_id,
-                    save_path,
-                )
-
-            logs = {"loss": loss.detach().item()}
-            progress_bar.set_postfix(**logs)
-
-            if global_step >= max_train_steps:
-                break
-
-    # Create the pipeline using using the trained modules and save it.
-    params__ = [i for i in text_encoder.get_input_embeddings().parameters()]
-    print("******** TRAINING PROCESS FINISHED ********")
-    print("Updated weights:")
-    print(params__, params__[0].shape)
-    pipeline = StableDiffusionPipeline.from_pretrained(
-        pretrained_model_name_or_path,
-        # text_encoder=accelerator.unwrap_model(text_encoder),
-        text_encoder=text_encoder,
-        tokenizer=tokenizer,
-        vae=vae,
-        unet=unet,
-    )
-    pipeline.save_pretrained(output_dir)
-    # Also save the newly trained embeddings
-    save_path = os.path.join(output_dir, f"learned_embeds.bin")
-    save_progress(text_encoder, placeholder_token_id, save_path)
-
-
-training_function()
-
-for param in itertools.chain(unet.parameters(), text_encoder.parameters()):
-    if param.grad is not None:
-        del param.grad  # free some memory
-    torch.cuda.empty_cache()
-
-# Set up the pipeline
-from diffusers import DPMSolverMultistepScheduler
-
-pipe = StableDiffusionPipeline.from_pretrained(
-    hyperparameters["output_dir"],
-    scheduler=DPMSolverMultistepScheduler.from_pretrained(
-        hyperparameters["output_dir"], subfolder="scheduler"
-    ),
-)
-if not args.use_torchdynamo:
-    pipe.to(args.device)
-
-# Run the Stable Diffusion pipeline
-# Don't forget to use the placeholder token in your prompt
-
-all_images = []
-for _ in range(args.num_inference_samples):
-    images = pipe(
-        [args.prompt],
-        num_inference_steps=args.inference_steps,
-        guidance_scale=7.5,
-    ).images
-    all_images.extend(images)
-
-output_path = os.path.abspath(os.path.join(os.getcwd(), args.output_dir))
-if not os.path.isdir(args.output_dir):
-    os.mkdir(args.output_dir)
-
-[
-    image.save(f"{args.output_dir}/{i}.jpeg")
-    for i, image in enumerate(all_images)
-]
--- a/shark/iree_utils/benchmark_utils.py
+++ b/shark/iree_utils/benchmark_utils.py
@@ -139,14 +139,9 @@ def run_benchmark_module(benchmark_cl):
        benchmark_path
    ), "Cannot find benchmark_module, Please contact SHARK maintainer on discord."
    bench_result = run_cmd(" ".join(benchmark_cl))
-    try:
-        regex_split = re.compile("(\d+[.]*\d*)(  *)([a-zA-Z]+)")
-        match = regex_split.search(bench_result)
-        time = float(match.group(1))
-        unit = match.group(3)
-    except AttributeError:
-        regex_split = re.compile("(\d+[.]*\d*)([a-zA-Z]+)")
-        match = regex_split.search(bench_result)
-        time = float(match.group(1))
-        unit = match.group(2)
+    print(bench_result)
+    regex_split = re.compile("(\d+[.]*\d*)(  *)([a-zA-Z]+)")
+    match = regex_split.search(bench_result)
+    time = float(match.group(1))
+    unit = match.group(3)
    return 1.0 / (time * 0.001)
--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -53,10 +53,10 @@ def get_iree_device_args(device, extra_args=[]):
 # Get the iree-compiler arguments given frontend.
 def get_iree_frontend_args(frontend):
    if frontend in ["torch", "pytorch", "linalg"]:
-        return ["--iree-llvmcpu-target-cpu-features=host"]
+        return ["--iree-llvm-target-cpu-features=host"]
    elif frontend in ["tensorflow", "tf", "mhlo"]:
        return [
-            "--iree-llvmcpu-target-cpu-features=host",
+            "--iree-llvm-target-cpu-features=host",
            "--iree-mhlo-demote-i64-to-i32=false",
            "--iree-flow-demote-i64-to-i32",
        ]
--- a/shark/iree_utils/cpu_utils.py
+++ b/shark/iree_utils/cpu_utils.py
@@ -44,4 +44,4 @@ def get_iree_cpu_args():
        error_message = f"OS Type f{os_name} not supported and triple can't be determined, open issue to dSHARK team please :)"
        raise Exception(error_message)
    print(f"Target triple found:{target_triple}")
-    return [f"--iree-llvmcpu-target-triple={target_triple}"]
+    return [f"-iree-llvm-target-triple={target_triple}"]
--- a/shark/shark_benchmark_runner.py
+++ b/shark/shark_benchmark_runner.py
@@ -118,11 +118,10 @@ class SharkBenchmarkRunner(SharkRunner):
        )
        HFmodel, input = get_torch_model(modelname)[:2]
        frontend_model = HFmodel.model
+        # frontend_model = dynamo.optimize("inductor")(frontend_model)
        frontend_model.to(torch_device)
        input.to(torch_device)

-        # frontend_model = torch.compile(frontend_model, mode="max-autotune", backend="inductor")
-
        for i in range(shark_args.num_warmup_iterations):
            frontend_model.forward(input)

--- a/shark/shark_downloader.py
+++ b/shark/shark_downloader.py
@@ -89,7 +89,7 @@ if custom_path is not None:

    print(f"Using {WORKDIR} as local shark_tank cache directory.")

-elif os.path.exists(alt_path):
+if os.path.exists(alt_path):
    WORKDIR = alt_path
    print(
        f"Using {WORKDIR} as shark_tank directory. Delete this directory if you aren't working from locally generated shark_tank."
@@ -99,7 +99,6 @@ else:
    print(
        f"shark_tank local cache is located at {WORKDIR} . You may change this by setting the --local_tank_cache= flag"
    )
-os.makedirs(WORKDIR, exist_ok=True)


 # Checks whether the directory and files exists.
--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -4,6 +4,17 @@
 import sys
 import tempfile
 import os
+import hashlib
+
+
+def create_hash(file_name):
+    with open(file_name, "rb") as f:
+        file_hash = hashlib.blake2b()
+        while chunk := f.read(2**20):
+            file_hash.update(chunk)
+
+    return file_hash.hexdigest()
+

 # List of the supported frontends.
 supported_frontends = {
@@ -150,11 +161,11 @@ class SharkImporter:
        np.savez(os.path.join(dir, inputs_name), *inputs)
        np.savez(os.path.join(dir, outputs_name), *outputs)
        np.save(os.path.join(dir, func_file_name), np.array(func_name))
-
        if self.frontend == "torch":
            with open(os.path.join(dir, model_name_mlir), "wb") as mlir_file:
                mlir_file.write(mlir_data)
-
+            mlir_hash = create_hash(os.path.join(dir, model_name_mlir))
+            np.save(os.path.join(dir, "hash"), np.array(mlir_hash))
        return

    def import_debug(
--- a/shark/sharkdynamo/utils.py
+++ b/shark/sharkdynamo/utils.py
@@ -3,7 +3,7 @@ import time
 from typing import List, Optional
 import torch
 from torch.fx.experimental.proxy_tensor import make_fx
-from torch._functorch.compile_utils import strip_overloads
+from functorch._src.compile_utils import strip_overloads
 from shark.shark_inference import SharkInference
 from torch._decomp import get_decompositions

@@ -119,19 +119,14 @@ def make_shark_compiler(use_tracing: bool, device: str, verbose=False):
            example_inputs,
            output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
        )
-        import io
-
-        bytecode_stream = io.BytesIO()
-        linalg_module.operation.write_bytecode(bytecode_stream)
-        mlir_module = bytecode_stream.getvalue()

        shark_module = SharkInference(
-            mlir_module, mlir_dialect="linalg", device=device
+            linalg_module, "forward", mlir_dialect="linalg", device=device
        )
        shark_module.compile()

        def forward(*inputs):
-            result = shark_module("forward", inputs)
+            result = shark_module.forward(inputs)
            result = tuple() if result is None else result
            return (result,) if was_unwrapped else result

--- a/tank/all_models.csv
+++ b/tank/all_models.csv
@@ -1,29 +1,28 @@
 resnet50,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
 albert-base-v2,mhlo,tf,1e-2,1e-2,default,None,False,False,False,"",""
-roberta-base,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,True,True,True,"","macos"
-bert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"","enabled_windows"
-camembert-base,mhlo,tf,1e-2,1e-3,default,None,True,True,True,"",""
+roberta-base,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
+bert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
+camembert-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
 dbmdz/convbert-base-turkish-cased,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,True,True,False,"https://github.com/iree-org/iree/issues/9971",""
 distilbert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,True,True,False,"https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342","macos"
+facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,True,True,False,"https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342",""
 funnel-transformer/small,mhlo,tf,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/201",""
 google/electra-small-discriminator,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
 google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"Fails during iree-compile",""
 google/vit-base-patch16-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,False,False,False,"",""
 microsoft/MiniLM-L12-H384-uncased,mhlo,tf,1e-2,1e-3,tf_hf,None,True,False,False,"Fails during iree-compile.",""
 microsoft/layoutlm-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-microsoft/mpnet-base,mhlo,tf,1e-2,1e-2,default,None,True,True,True,"",""
+microsoft/mpnet-base,mhlo,tf,1e-2,1e-2,default,None,False,False,False,"",""
 albert-base-v2,linalg,torch,1e-2,1e-3,default,None,True,True,True,"issue with aten.tanh in torch-mlir",""
 alexnet,linalg,torch,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/879",""
 bert-base-cased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
 bert-base-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
 bert-base-uncased_fp16,linalg,torch,1e-1,1e-1,default,None,True,False,True,"",""
-bert-large-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
-bert-large-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
 facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"Fails during iree-compile.",""
 google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/311",""
 microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390",""
 microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
+microsoft/resnet-50,linalg,torch,1e-2,1e-3,default,nhcw-nhwc/img2col,False,False,False,"","macos"
 google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"https://github.com/nod-ai/SHARK/issues/344",""
 mobilenet_v3_small,linalg,torch,1e-1,1e-2,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/388","macos"
 nvidia/mit-b0,linalg,torch,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/343","macos"
@@ -34,4 +33,4 @@ resnet50_fp16,linalg,torch,1e-2,1e-2,default,nhcw-nhwc/img2col,True,False,True,"
 squeezenet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
 wide_resnet50_2,linalg,torch,1e-2,1e-3,default,nhcw-nhwc/img2col,False,False,False,"","macos"
 efficientnet-v2-s,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
-mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,True,True,True,"","macos"
+mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
--- a/tank/examples/MiniLM_tf/huggingface_MiniLM_run.py
+++ b/tank/examples/MiniLM_tf/huggingface_MiniLM_run.py
@@ -63,7 +63,7 @@ if __name__ == "__main__":
    # Compile the model using IREE
    backend = "dylib-llvm-aot"
    args = [
-        "--iree-llvmcpu-target-cpu-features=host",
+        "--iree-llvm-target-cpu-features=host",
        "--iree-mhlo-demote-i64-to-i32=false",
        "--iree-flow-demote-i64-to-i32",
    ]
--- a/tank/examples/bert_fine_tuning/bert_fine_tune_tf.py
+++ b/tank/examples/bert_fine_tuning/bert_fine_tune_tf.py
@@ -136,7 +136,7 @@ if __name__ == "__main__":
    backend = "dylib-llvm-aot"
    if backend == "dylib-llvm-aot":
        args = [
-            "--iree-llvmcpu-target-cpu-features=host",
+            "--iree-llvm-target-cpu-features=host",
            "--iree-mhlo-demote-i64-to-i32=false",
            "--iree-flow-demote-i64-to-i32",
        ]
--- a/tank/examples/bert_tf/bert_large_run.py
+++ b/tank/examples/bert_tf/bert_large_run.py
@@ -83,7 +83,7 @@ if __name__ == "__main__":
    # Compile the model using IREE
    backend = "dylib-llvm-aot"
    args = [
-        "--iree-llvmcpu-target-cpu-features=host",
+        "--iree-llvm-target-cpu-features=host",
        "--iree-mhlo-demote-i64-to-i32=false",
        "--iree-stream-resource-index-bits=64",
        "--iree-vm-target-index-bits=64",
--- a/tank/examples/bert_tf/bert_small_run.py
+++ b/tank/examples/bert_tf/bert_small_run.py
@@ -79,7 +79,7 @@ if __name__ == "__main__":
    # Compile the model using IREE
    backend = "dylib-llvm-aot"
    args = [
-        "--iree-llvmcpu-target-cpu-features=host",
+        "--iree-llvm-target-cpu-features=host",
        "--iree-mhlo-demote-i64-to-i32=false",
        "--iree-flow-demote-i64-to-i32",
    ]
--- a/tank/model_metadata.csv
+++ b/tank/model_metadata.csv
@@ -31,4 +31,3 @@ xlm-roberta-base,False,False,-,-,-
 facebook/convnext-tiny-224,False,False,-,-,-
 efficientnet-v2-s,False,False,22M,"image-classification,cnn","Includes MBConv and Fused-MBConv"
 mnasnet1_0,False,True,-,"cnn, torchvision, mobile, architecture-search","Outperforms other mobile CNNs on Accuracy vs. Latency"
-bert-large-uncased,True,hf,True,330M,"nlp;bert-variant;transformer-encoder","24 layers, 1024 hidden units, 16 attention heads"
--- a/tank/model_utils_tf.py
+++ b/tank/model_utils_tf.py
@@ -15,7 +15,6 @@ keras_models = ["resnet50", "efficientnet-v2-s"]
 maskedlm_models = [
    "albert-base-v2",
    "bert-base-uncased",
-    "bert-large-uncased",
    "camembert-base",
    "dbmdz/convbert-base-turkish-cased",
    "deberta-base",
--- a/tank/test_models.py
+++ b/tank/test_models.py
@@ -137,19 +137,6 @@ class SharkModuleTester:
    def create_and_check_module(self, dynamic, device):
        shark_args.local_tank_cache = self.local_tank_cache
        shark_args.force_update_tank = self.update_tank
-        shark_args.dispatch_benchmarks = self.benchmark_dispatches
-        if self.benchmark_dispatches is not None:
-            _m = self.config["model_name"].split("/")
-            _m.extend([self.config["framework"], str(dynamic), device])
-            _m = "_".join(_m)
-            shark_args.dispatch_benchmarks_dir = os.path.join(
-                self.dispatch_benchmarks_dir,
-                _m,
-            )
-            if not os.path.exists(self.dispatch_benchmarks_dir):
-                os.mkdir(self.dispatch_benchmarks_dir)
-            if not os.path.exists(shark_args.dispatch_benchmarks_dir):
-                os.mkdir(shark_args.dispatch_benchmarks_dir)
        if "nhcw-nhwc" in self.config["flags"] and not os.path.isfile(
            ".use-iree"
        ):
@@ -291,12 +278,6 @@ class SharkModuleTest(unittest.TestCase):
            "update_tank"
        )
        self.module_tester.tank_url = self.pytestconfig.getoption("tank_url")
-        self.module_tester.benchmark_dispatches = self.pytestconfig.getoption(
-            "benchmark_dispatches"
-        )
-        self.module_tester.dispatch_benchmarks_dir = (
-            self.pytestconfig.getoption("dispatch_benchmarks_dir")
-        )

        if config["xfail_cpu"] == "True" and device == "cpu":
            pytest.xfail(reason=config["xfail_reason"])
@@ -307,9 +288,6 @@ class SharkModuleTest(unittest.TestCase):
        if config["xfail_vkm"] == "True" and device in ["metal", "vulkan"]:
            pytest.xfail(reason=config["xfail_reason"])

-        if os.name == "nt" and "enabled_windows" not in config["xfail_other"]:
-            pytest.xfail(reason="this model skipped on windows")
-
        # Special cases that need to be marked.
        if "macos" in config["xfail_other"] and device in [
            "metal",
--- a/tank/tf_model_list.csv
+++ b/tank/tf_model_list.csv
@@ -18,4 +18,3 @@ microsoft/mpnet-base,hf
 facebook/convnext-tiny-224,img
 google/vit-base-patch16-224,img
 efficientnet-v2-s,keras
-bert-large-uncased,hf
--- a/tank/torch_model_list.csv
+++ b/tank/torch_model_list.csv
@@ -18,4 +18,3 @@ nvidia/mit-b0,True,hf_img_cls,False,3.7M,"image-classification,transformer-encod
 mnasnet1_0,False,vision,True,-,"cnn, torchvision, mobile, architecture-search","Outperforms other mobile CNNs on Accuracy vs. Latency"
 resnet50_fp16,False,vision,True,23M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
 bert-base-uncased_fp16,True,fp16,False,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
-bert-large-uncased,True,hf,True,330M,"nlp;bert-variant;transformer-encoder","24 layers, 1024 hidden units, 16 attention heads"
--- a/tank/torch_sd_list.csv
+++ b/tank/torch_sd_list.csv
@@ -0,0 +1,6 @@
+model_name, use_tracing, model_type, dynamic, param_count, tags, notes
+stabilityai/stable-diffusion-2-1-base,True,stable_diffusion,False,??M,"stable diffusion 2.1 base, LLM, Text to image", N/A
+stabilityai/stable-diffusion-2-1,True,stable_diffusion,False,??M,"stable diffusion 2.1 base, LLM, Text to image", N/A
+anythingv3/v1_4,True,stable_diffusion,False,??M,"stable diffusion 2.1 base, LLM, Text to image", N/A
+analogdiffusion/v1_4,True,stable_diffusion,False,??M,"stable diffusion 2.1 base, LLM, Text to image", N/A
+openjourney/v1_4,True,stable_diffusion,False,??M,"stable diffusion 2.1 base, LLM, Text to image", N/A