[Web] remove txt2img ui dependencies from png import metadata (#1275)

Add auto-tuner to SD apps (#1291)
Add documentation for using SHARK with AI-Render (#1296)
2026-04-20 03:00:34 -04:00 · 2023-04-12 07:32:47 -10:00 · 2023-04-12 09:21:17 -07:00 · 2023-04-12 03:09:34 -10:00 · 2023-04-11 17:59:09 -07:00 · 2023-04-11 15:34:25 -05:00
69 changed files with 3103 additions and 1810 deletions
--- a/.github/workflows/test-models.yml
+++ b/.github/workflows/test-models.yml
@@ -112,7 +112,7 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k cpu 
+        pytest --forked --benchmark=native --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k cpu 
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv

@@ -120,9 +120,9 @@ jobs:
      if: matrix.suite == 'cuda'
      run: |
        cd $GITHUB_WORKSPACE
-        PYTHON=python${{ matrix.python-version }} BENCHMARK=1 IMPORTER=1 ./setup_venv.sh
+        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k cuda
+        pytest --forked --benchmark=native --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k cuda
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
        # Disabled due to black image bug
@@ -145,17 +145,19 @@ jobs:
        cd $GITHUB_WORKSPACE
        PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
        source shark.venv/bin/activate
-        pytest --forked --benchmark --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k vulkan
+        pytest --forked --benchmark="native" --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k vulkan
        python build_tools/stable_diffusion_testing.py --device=vulkan

    - name: Validate Vulkan Models (Windows)
      if: matrix.suite == 'vulkan' && matrix.os == '7950x'
      run: |
        ./setup_venv.ps1
-        pytest -k vulkan -s
+        pytest -k vulkan -s --ci

    - name: Validate Stable Diffusion Models (Windows)
      if: matrix.suite == 'vulkan' && matrix.os == '7950x'
      run: |
        ./setup_venv.ps1
+        python process_skipfiles.py
+        pyinstaller .\apps\stable_diffusion\shark_sd.spec
        python build_tools/stable_diffusion_testing.py --device=vulkan
--- a/README.md
+++ b/README.md
@@ -114,12 +114,12 @@ source shark.venv/bin/activate

 #### Windows 10/11 Users
 ```powershell
-(shark.venv) PS C:\g\shark> python .\apps\stable_diffusion\scripts\txt2img.py --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
+(shark.venv) PS C:\g\shark> python .\apps\stable_diffusion\scripts\main.py --app="txt2img" --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
 ```

 #### Linux / macOS Users
 ```shell
-python3.11 apps/stable_diffusion/scripts/txt2img.py --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
+python3.11 apps/stable_diffusion/scripts/main.py --app=txt2img --precision=fp16 --device=vulkan --prompt="tajmahal, oil on canvas, sunflowers, 4k, uhd"
 ```

 You can replace `vulkan` with `cpu` to run on your CPU or with `cuda` to run on CUDA devices. If you have multiple vulkan devices you can address them with `--device=vulkan://1` etc
--- a/apps/stable_diffusion/scripts/init.py
+++ b/apps/stable_diffusion/scripts/init.py
@@ -1,5 +1,3 @@
-from apps.stable_diffusion.scripts.txt2img import txt2img_inf
-from apps.stable_diffusion.scripts.img2img import img2img_inf
 from apps.stable_diffusion.scripts.inpaint import inpaint_inf
 from apps.stable_diffusion.scripts.outpaint import outpaint_inf
 from apps.stable_diffusion.scripts.upscaler import upscaler_inf
--- a/apps/stable_diffusion/scripts/img2img.py
+++ b/apps/stable_diffusion/scripts/img2img.py
@@ -2,10 +2,12 @@ import sys
 import torch
 import time
 from PIL import Image
+import transformers
 from apps.stable_diffusion.src import (
    args,
    Image2ImagePipeline,
    StencilPipeline,
+    resize_stencil,
    get_schedulers,
    set_init_device_flags,
    utils,
@@ -15,258 +17,7 @@ from apps.stable_diffusion.src import (
 from apps.stable_diffusion.src.utils import get_generation_text_info


-schedulers = None
-
-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir
-
-
-# For stencil, the input image can be of any size but we need to ensure that
-# it conforms with our model contraints :-
-#   Both width and height should be > 384 and multiple of 8.
-# This utility function performs the transformation on the input image while
-# also maintaining the aspect ratio before sending it to the stencil pipeline.
-def resize_stencil(image: Image.Image):
-    width, height = image.size
-    aspect_ratio = width / height
-    min_size = min(width, height)
-    if min_size < 384:
-        n_size = 384
-        if width == min_size:
-            width = n_size
-            height = n_size / aspect_ratio
-        else:
-            height = n_size
-            width = n_size * aspect_ratio
-    width = int(width)
-    height = int(height)
-    n_width = width // 8
-    n_height = height // 8
-    n_width *= 8
-    n_height *= 8
-    new_image = image.resize((n_width, n_height))
-    return new_image, n_width, n_height
-
-
-# Exposed to UI.
-def img2img_inf(
-    prompt: str,
-    negative_prompt: str,
-    init_image,
-    height: int,
-    width: int,
-    steps: int,
-    strength: float,
-    guidance_scale: float,
-    seed: int,
-    batch_count: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    use_stencil: str,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-    lora_weights: str,
-    lora_hf_id: str,
-):
-    from apps.stable_diffusion.web.ui.utils import (
-        get_custom_model_pathfile,
-        get_custom_vae_or_lora_weights,
-        Config,
-    )
-    import apps.stable_diffusion.web.utils.global_obj as global_obj
-    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
-        SD_STATE_CANCEL,
-    )
-
-    global schedulers
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.seed = seed
-    args.steps = steps
-    args.strength = strength
-    args.scheduler = scheduler
-    args.img_path = "not none"
-
-    if init_image is None:
-        return None, "An Initial Image is required"
-    image = init_image.convert("RGB")
-
-    # set ckpt_loc and hf_model_id.
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = get_custom_model_pathfile(custom_model)
-    else:
-        args.hf_model_id = custom_model
-
-    args.use_lora = get_custom_vae_or_lora_weights(
-        lora_weights, lora_hf_id, "lora"
-    )
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    use_stencil = None if use_stencil == "None" else use_stencil
-    args.use_stencil = use_stencil
-    if use_stencil is not None:
-        args.scheduler = "DDIM"
-        args.hf_model_id = "runwayml/stable-diffusion-v1-5"
-        image, width, height = resize_stencil(image)
-    elif args.scheduler != "PNDM":
-        if "Shark" in args.scheduler:
-            print(
-                f"SharkEulerDiscrete scheduler not supported. Switching to PNDM scheduler"
-            )
-            args.scheduler = "PNDM"
-        else:
-            sys.exit(
-                "Img2Img works best with PNDM scheduler. Other schedulers are not supported yet."
-            )
-    cpu_scheduling = not args.scheduler.startswith("Shark")
-    args.precision = precision
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    new_config_obj = Config(
-        "img2img",
-        args.hf_model_id,
-        args.ckpt_loc,
-        precision,
-        batch_size,
-        max_length,
-        height,
-        width,
-        device,
-        use_lora=args.use_lora,
-        use_stencil=use_stencil,
-    )
-    if (
-        not global_obj.get_sd_obj()
-        or global_obj.get_cfg_obj() != new_config_obj
-    ):
-        global_obj.clear_cache()
-        global_obj.set_cfg_obj(new_config_obj)
-        args.batch_count = batch_count
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.height = height
-        args.width = width
-        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-1-base"
-        )
-        schedulers = get_schedulers(model_id)
-        scheduler_obj = schedulers[scheduler]
-
-        if use_stencil is not None:
-            args.use_tuned = False
-            global_obj.set_sd_obj(
-                StencilPipeline.from_pretrained(
-                    scheduler_obj,
-                    args.import_mlir,
-                    args.hf_model_id,
-                    args.ckpt_loc,
-                    args.custom_vae,
-                    args.precision,
-                    args.max_length,
-                    args.batch_size,
-                    args.height,
-                    args.width,
-                    args.use_base_vae,
-                    args.use_tuned,
-                    low_cpu_mem_usage=args.low_cpu_mem_usage,
-                    use_stencil=use_stencil,
-                    debug=args.import_debug if args.import_mlir else False,
-                    use_lora=args.use_lora,
-                )
-            )
-        else:
-            global_obj.set_sd_obj(
-                Image2ImagePipeline.from_pretrained(
-                    scheduler_obj,
-                    args.import_mlir,
-                    args.hf_model_id,
-                    args.ckpt_loc,
-                    args.custom_vae,
-                    args.precision,
-                    args.max_length,
-                    args.batch_size,
-                    args.height,
-                    args.width,
-                    args.use_base_vae,
-                    args.use_tuned,
-                    low_cpu_mem_usage=args.low_cpu_mem_usage,
-                    debug=args.import_debug if args.import_mlir else False,
-                    use_lora=args.use_lora,
-                )
-            )
-
-    global_obj.set_schedulers(schedulers[scheduler])
-
-    start_time = time.time()
-    global_obj.get_sd_obj().log = ""
-    generated_imgs = []
-    seeds = []
-    img_seed = utils.sanitize_seed(seed)
-    extra_info = {"STRENGTH": strength}
-    text_output = ""
-    for current_batch in range(batch_count):
-        if current_batch > 0:
-            img_seed = utils.sanitize_seed(-1)
-        out_imgs = global_obj.get_sd_obj().generate_images(
-            prompt,
-            negative_prompt,
-            image,
-            batch_size,
-            height,
-            width,
-            steps,
-            strength,
-            guidance_scale,
-            img_seed,
-            args.max_length,
-            dtype,
-            args.use_base_vae,
-            cpu_scheduling,
-            use_stencil=use_stencil,
-        )
-        seeds.append(img_seed)
-        total_time = time.time() - start_time
-        text_output = get_generation_text_info(seeds, device)
-        text_output += "\n" + global_obj.get_sd_obj().log
-        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
-
-        if global_obj.get_sd_status() == SD_STATE_CANCEL:
-            break
-        else:
-            save_output_img(out_imgs[0], img_seed, extra_info)
-            generated_imgs.extend(out_imgs)
-            yield generated_imgs, text_output
-
-    return generated_imgs, text_output
-
-
-if __name__ == "__main__":
+def main():
    if args.clear_all:
        clear_all()

@@ -319,6 +70,7 @@ if __name__ == "__main__":
            use_stencil=use_stencil,
            debug=args.import_debug if args.import_mlir else False,
            use_lora=args.use_lora,
+            ondemand=args.ondemand,
        )
    else:
        img2img_obj = Image2ImagePipeline.from_pretrained(
@@ -337,6 +89,7 @@ if __name__ == "__main__":
            low_cpu_mem_usage=args.low_cpu_mem_usage,
            debug=args.import_debug if args.import_mlir else False,
            use_lora=args.use_lora,
+            ondemand=args.ondemand,
        )

    start_time = time.time()
@@ -372,3 +125,7 @@ if __name__ == "__main__":
    extra_info = {"STRENGTH": args.strength}
    save_output_img(generated_imgs[0], seed, extra_info)
    print(text_output)
+
+
+if __name__ == "__main__":
+    main()
--- a/apps/stable_diffusion/scripts/inpaint.py
+++ b/apps/stable_diffusion/scripts/inpaint.py
@@ -1,6 +1,7 @@
 import torch
 import time
 from PIL import Image
+import transformers
 from apps.stable_diffusion.src import (
    args,
    InpaintPipeline,
@@ -13,8 +14,6 @@ from apps.stable_diffusion.src import (
 from apps.stable_diffusion.src.utils import get_generation_text_info


-schedulers = None
-
 # set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
 init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
 init_use_tuned = args.use_tuned
@@ -45,6 +44,7 @@ def inpaint_inf(
    save_metadata_to_png: bool,
    lora_weights: str,
    lora_hf_id: str,
+    ondemand: bool,
 ):
    from apps.stable_diffusion.web.ui.utils import (
        get_custom_model_pathfile,
@@ -56,8 +56,6 @@ def inpaint_inf(
        SD_STATE_CANCEL,
    )

-    global schedulers
-
    args.prompts = [prompt]
    args.negative_prompts = [negative_prompt]
    args.guidance_scale = guidance_scale
@@ -65,6 +63,10 @@ def inpaint_inf(
    args.scheduler = scheduler
    args.img_path = "not none"
    args.mask_path = "not none"
+    args.ondemand = ondemand
+    if ondemand and batch_count > 1:
+        print("Low VRAM mode currently only supports 1 batch count.")
+        batch_count = 1

    # set ckpt_loc and hf_model_id.
    args.ckpt_loc = ""
@@ -102,9 +104,11 @@ def inpaint_inf(
        device,
        use_lora=args.use_lora,
        use_stencil=None,
+        ondemand=ondemand,
    )
    if (
-        not global_obj.get_sd_obj()
+        args.ondemand
+        or not global_obj.get_sd_obj()
        or global_obj.get_cfg_obj() != new_config_obj
    ):
        global_obj.clear_cache()
@@ -125,14 +129,15 @@ def inpaint_inf(
            if args.hf_model_id
            else "stabilityai/stable-diffusion-2-inpainting"
        )
-        schedulers = get_schedulers(model_id)
-        scheduler_obj = schedulers[scheduler]
+        global_obj.set_schedulers(get_schedulers(model_id))
+        scheduler_obj = global_obj.get_scheduler(scheduler)
        global_obj.set_sd_obj(
            InpaintPipeline.from_pretrained(
                scheduler=scheduler_obj,
                import_mlir=args.import_mlir,
                model_id=args.hf_model_id,
                ckpt_loc=args.ckpt_loc,
+                custom_vae=args.custom_vae,
                precision=args.precision,
                max_length=args.max_length,
                batch_size=args.batch_size,
@@ -143,10 +148,11 @@ def inpaint_inf(
                low_cpu_mem_usage=args.low_cpu_mem_usage,
                debug=args.import_debug if args.import_mlir else False,
                use_lora=args.use_lora,
+                ondemand=args.ondemand,
            )
        )

-    global_obj.set_schedulers(schedulers[scheduler])
+    global_obj.set_sd_scheduler(scheduler)

    start_time = time.time()
    global_obj.get_sd_obj().log = ""
@@ -193,7 +199,7 @@ def inpaint_inf(
    return generated_imgs, text_output


-if __name__ == "__main__":
+def main():
    if args.clear_all:
        clear_all()

@@ -223,6 +229,7 @@ if __name__ == "__main__":
        import_mlir=args.import_mlir,
        model_id=args.hf_model_id,
        ckpt_loc=args.ckpt_loc,
+        custom_vae=args.custom_vae,
        precision=args.precision,
        max_length=args.max_length,
        batch_size=args.batch_size,
@@ -233,6 +240,7 @@ if __name__ == "__main__":
        low_cpu_mem_usage=args.low_cpu_mem_usage,
        debug=args.import_debug if args.import_mlir else False,
        use_lora=args.use_lora,
+        ondemand=args.ondemand,
    )

    for current_batch in range(args.batch_count):
@@ -275,3 +283,7 @@ if __name__ == "__main__":

        save_output_img(generated_imgs[0], seed)
        print(text_output)
+
+
+if __name__ == "__main__":
+    main()
--- a/apps/stable_diffusion/scripts/main.py
+++ b/apps/stable_diffusion/scripts/main.py
@@ -0,0 +1,19 @@
+from apps.stable_diffusion.src import args
+from apps.stable_diffusion.scripts import (
+    img2img,
+    txt2img,
+    #    inpaint,
+    #    outpaint,
+)
+
+if __name__ == "__main__":
+    if args.app == "txt2img":
+        txt2img.main()
+    elif args.app == "img2img":
+        img2img.main()
+    #   elif args.app == "inpaint":
+    #       inpaint.main()
+    #   elif args.app == "outpaint":
+    #       outpaint.main()
+    else:
+        print(f"args.app value is {args.app} but this isn't supported")
--- a/apps/stable_diffusion/scripts/outpaint.py
+++ b/apps/stable_diffusion/scripts/outpaint.py
@@ -1,6 +1,7 @@
 import torch
 import time
 from PIL import Image
+import transformers
 from apps.stable_diffusion.src import (
    args,
    OutpaintPipeline,
@@ -13,8 +14,6 @@ from apps.stable_diffusion.src import (
 from apps.stable_diffusion.src.utils import get_generation_text_info


-schedulers = None
-
 # set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
 init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
 init_use_tuned = args.use_tuned
@@ -48,6 +47,7 @@ def outpaint_inf(
    save_metadata_to_png: bool,
    lora_weights: str,
    lora_hf_id: str,
+    ondemand: bool,
 ):
    from apps.stable_diffusion.web.ui.utils import (
        get_custom_model_pathfile,
@@ -59,14 +59,16 @@ def outpaint_inf(
        SD_STATE_CANCEL,
    )

-    global schedulers
-
    args.prompts = [prompt]
    args.negative_prompts = [negative_prompt]
    args.guidance_scale = guidance_scale
    args.steps = steps
    args.scheduler = scheduler
    args.img_path = "not none"
+    if ondemand:
+        print("Outpainting is not supporting ondemand yet.")
+        ondemand = False
+    args.ondemand = ondemand

    # set ckpt_loc and hf_model_id.
    args.ckpt_loc = ""
@@ -127,8 +129,8 @@ def outpaint_inf(
            if args.hf_model_id
            else "stabilityai/stable-diffusion-2-inpainting"
        )
-        schedulers = get_schedulers(model_id)
-        scheduler_obj = schedulers[scheduler]
+        global_obj.set_schedulers(get_schedulers(model_id))
+        scheduler_obj = global_obj.get_scheduler(scheduler)
        global_obj.set_sd_obj(
            OutpaintPipeline.from_pretrained(
                scheduler_obj,
@@ -144,10 +146,11 @@ def outpaint_inf(
                args.use_base_vae,
                args.use_tuned,
                use_lora=args.use_lora,
+                ondemand=args.ondemand,
            )
        )

-    global_obj.set_schedulers(schedulers[scheduler])
+    global_obj.set_sd_scheduler(scheduler)

    start_time = time.time()
    global_obj.get_sd_obj().log = ""
@@ -203,7 +206,7 @@ def outpaint_inf(
    return generated_imgs, text_output


-if __name__ == "__main__":
+def main():
    if args.clear_all:
        clear_all()

@@ -238,6 +241,7 @@ if __name__ == "__main__":
        args.use_base_vae,
        args.use_tuned,
        use_lora=args.use_lora,
+        ondemand=args.ondemand,
    )

    for current_batch in range(args.batch_count):
@@ -302,3 +306,7 @@ if __name__ == "__main__":
        }
        save_output_img(generated_imgs[0], seed, extra_info)
        print(text_output)
+
+
+if __name__ == "__main__":
+    main()
--- a/apps/stable_diffusion/scripts/train_lora_word.py
+++ b/apps/stable_diffusion/scripts/train_lora_word.py
@@ -159,9 +159,6 @@ class LoraDataset(Dataset):
        return example


-schedulers = None
-
-
 ########## Setting up the model ##########
 def lora_train(
    prompt: str,
@@ -187,8 +184,6 @@ def lora_train(
    )
    import apps.stable_diffusion.web.utils.global_obj as global_obj

-    global schedulers
-
    print(
        "Note LoRA training is not compatible with the latest torch-mlir branch"
    )
@@ -227,7 +222,12 @@ def lora_train(
    args.max_length = max_length
    args.height = height
    args.width = width
-    args.device = device.split("=>", 1)[1].strip()
+    device_str = device.split("=>", 1)[1].strip().split("://")
+    if len(device_str) > 1:
+        device_str = device_str[0] + ":" + device_str[1]
+    else:
+        device_str = device_str[0]
+    args.device = device_str

    # Load the Stable Diffusion model
    text_encoder = CLIPTextModel.from_pretrained(
--- a/apps/stable_diffusion/scripts/tuner.py
+++ b/apps/stable_diffusion/scripts/tuner.py
@@ -0,0 +1,114 @@
+import os
+from pathlib import Path
+from shark_tuner.codegen_tuner import SharkCodegenTuner
+from shark_tuner.iree_utils import (
+    dump_dispatches,
+    create_context,
+    export_module_to_mlir_file,
+)
+from shark_tuner.model_annotation import model_annotation
+from apps.stable_diffusion.src.utils.stable_args import args
+from apps.stable_diffusion.src.utils.utils import set_init_device_flags
+from apps.stable_diffusion.src.utils.sd_annotation import (
+    get_device_args,
+    load_winograd_configs,
+)
+from apps.stable_diffusion.src.models import SharkifyStableDiffusionModel
+
+
+def load_mlir_module():
+    sd_model = SharkifyStableDiffusionModel(
+        args.hf_model_id,
+        args.ckpt_loc,
+        args.custom_vae,
+        args.precision,
+        max_len=args.max_length,
+        batch_size=args.batch_size,
+        height=args.height,
+        width=args.width,
+        use_base_vae=args.use_base_vae,
+        use_tuned=False,
+        low_cpu_mem_usage=args.low_cpu_mem_usage,
+        return_mlir=True,
+    )
+
+    if args.annotation_model == "unet":
+        mlir_module = sd_model.unet()
+        model_name = sd_model.model_name["unet"]
+    elif args.annotation_model == "vae":
+        mlir_module = sd_model.vae()
+        model_name = sd_model.model_name["vae"]
+    else:
+        raise ValueError(
+            f"{args.annotation_model} is not supported for tuning."
+        )
+
+    return mlir_module, model_name
+
+
+def main():
+    args.use_tuned = False
+    set_init_device_flags()
+    mlir_module, model_name = load_mlir_module()
+
+    # Get device and device specific arguments
+    device, device_spec_args = get_device_args()
+    device_spec = ""
+    if device_spec_args:
+        device_spec = device_spec_args[-1].split("=")[-1].strip()
+        if device == "vulkan":
+            device_spec = device_spec.split("-")[0]
+
+    # Add winograd annotation for vulkan device
+    winograd_config = (
+        load_winograd_configs()
+        if device == "vulkan" and args.annotation_model in ["unet", "vae"]
+        else ""
+    )
+    with create_context() as ctx:
+        input_module = model_annotation(
+            ctx,
+            input_contents=mlir_module,
+            config_path=winograd_config,
+            search_op="conv",
+            winograd=True,
+        )
+
+    # Dump model dispatches
+    if device == "vulkan" and device_spec == "rdna3":
+        device = "vulkan/RX 7900"
+    generates_dir = Path.home() / "tmp"
+    if not os.path.exists(generates_dir):
+        os.makedirs(generates_dir)
+    dump_mlir = generates_dir / "temp.mlir"
+    dispatch_dir = generates_dir / f"{model_name}_{device_spec}_dispatches"
+    export_module_to_mlir_file(input_module, dump_mlir)
+    dump_dispatches(dump_mlir, device, dispatch_dir, False)
+
+    # Tune each dispatch
+    dtype = "f16" if args.precision == "fp16" else "f32"
+    config_filename = f"{model_name}_{device_spec}_configs.json"
+
+    for f_path in os.listdir(dispatch_dir):
+        if not f_path.endswith(".mlir"):
+            continue
+
+        model_dir = os.path.join(dispatch_dir, f_path)
+
+        tuner = SharkCodegenTuner(
+            model_dir,
+            device,
+            "random",
+            args.num_iters,
+            args.tuned_config_dir,
+            dtype,
+            args.search_op,
+            batch_size=1,
+            config_filename=config_filename,
+            use_dispatch=True,
+        )
+        tuner.tune()
+
+
+if __name__ == "__main__":
+    main()
--- a/apps/stable_diffusion/scripts/txt2img.py
+++ b/apps/stable_diffusion/scripts/txt2img.py
@@ -1,4 +1,5 @@
 import torch
+import transformers
 import time
 from apps.stable_diffusion.src import (
    args,
@@ -9,180 +10,9 @@ from apps.stable_diffusion.src import (
    clear_all,
    save_output_img,
 )
-from apps.stable_diffusion.src.utils import get_generation_text_info
-
-schedulers = None
-
-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir


-# Exposed to UI.
-def txt2img_inf(
-    prompt: str,
-    negative_prompt: str,
-    height: int,
-    width: int,
-    steps: int,
-    guidance_scale: float,
-    seed: int,
-    batch_count: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-    lora_weights: str,
-    lora_hf_id: str,
-):
-    from apps.stable_diffusion.web.ui.utils import (
-        get_custom_model_pathfile,
-        get_custom_vae_or_lora_weights,
-        Config,
-    )
-    import apps.stable_diffusion.web.utils.global_obj as global_obj
-    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
-        SD_STATE_CANCEL,
-    )
-
-    global schedulers
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.steps = steps
-    args.scheduler = scheduler
-
-    # set ckpt_loc and hf_model_id.
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = get_custom_model_pathfile(custom_model)
-    else:
-        args.hf_model_id = custom_model
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    args.use_lora = get_custom_vae_or_lora_weights(
-        lora_weights, lora_hf_id, "lora"
-    )
-
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    cpu_scheduling = not scheduler.startswith("Shark")
-    new_config_obj = Config(
-        "txt2img",
-        args.hf_model_id,
-        args.ckpt_loc,
-        precision,
-        batch_size,
-        max_length,
-        height,
-        width,
-        device,
-        use_lora=args.use_lora,
-        use_stencil=None,
-    )
-    if (
-        not global_obj.get_sd_obj()
-        or global_obj.get_cfg_obj() != new_config_obj
-    ):
-        global_obj.clear_cache()
-        global_obj.set_cfg_obj(new_config_obj)
-        args.precision = precision
-        args.batch_count = batch_count
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.height = height
-        args.width = width
-        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        args.img_path = None
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-1-base"
-        )
-        schedulers = get_schedulers(model_id)
-        scheduler_obj = schedulers[scheduler]
-        global_obj.set_sd_obj(
-            Text2ImagePipeline.from_pretrained(
-                scheduler=scheduler_obj,
-                import_mlir=args.import_mlir,
-                model_id=args.hf_model_id,
-                ckpt_loc=args.ckpt_loc,
-                precision=args.precision,
-                max_length=args.max_length,
-                batch_size=args.batch_size,
-                height=args.height,
-                width=args.width,
-                use_base_vae=args.use_base_vae,
-                use_tuned=args.use_tuned,
-                custom_vae=args.custom_vae,
-                low_cpu_mem_usage=args.low_cpu_mem_usage,
-                debug=args.import_debug if args.import_mlir else False,
-                use_lora=args.use_lora,
-            )
-        )
-
-    global_obj.set_schedulers(schedulers[scheduler])
-
-    start_time = time.time()
-    global_obj.get_sd_obj().log = ""
-    generated_imgs = []
-    seeds = []
-    img_seed = utils.sanitize_seed(seed)
-    text_output = ""
-    for i in range(batch_count):
-        if i > 0:
-            img_seed = utils.sanitize_seed(-1)
-        out_imgs = global_obj.get_sd_obj().generate_images(
-            prompt,
-            negative_prompt,
-            batch_size,
-            height,
-            width,
-            steps,
-            guidance_scale,
-            img_seed,
-            args.max_length,
-            dtype,
-            args.use_base_vae,
-            cpu_scheduling,
-        )
-        seeds.append(img_seed)
-        total_time = time.time() - start_time
-        text_output = get_generation_text_info(seeds, device)
-        text_output += "\n" + global_obj.get_sd_obj().log
-        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
-
-        if global_obj.get_sd_status() == SD_STATE_CANCEL:
-            break
-        else:
-            save_output_img(out_imgs[0], img_seed)
-            generated_imgs.extend(out_imgs)
-            yield generated_imgs, text_output
-
-    return generated_imgs, text_output
-
-
-if __name__ == "__main__":
+def main():
    if args.clear_all:
        clear_all()

@@ -208,6 +38,8 @@ if __name__ == "__main__":
        low_cpu_mem_usage=args.low_cpu_mem_usage,
        debug=args.import_debug if args.import_mlir else False,
        use_lora=args.use_lora,
+        use_quantize=args.use_quantize,
+        ondemand=args.ondemand,
    )

    for current_batch in range(args.batch_count):
@@ -247,3 +79,7 @@ if __name__ == "__main__":

        save_output_img(generated_imgs[0], seed)
        print(text_output)
+
+
+if __name__ == "__main__":
+    main()
--- a/apps/stable_diffusion/scripts/upscaler.py
+++ b/apps/stable_diffusion/scripts/upscaler.py
@@ -1,6 +1,7 @@
 import torch
 import time
 from PIL import Image
+import transformers
 from apps.stable_diffusion.src import (
    args,
    UpscalerPipeline,
@@ -12,8 +13,6 @@ from apps.stable_diffusion.src import (
 )


-schedulers = None
-
 # set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
 init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
 init_use_tuned = args.use_tuned
@@ -43,6 +42,7 @@ def upscaler_inf(
    save_metadata_to_png: bool,
    lora_weights: str,
    lora_hf_id: str,
+    ondemand: bool,
 ):
    from apps.stable_diffusion.web.ui.utils import (
        get_custom_model_pathfile,
@@ -51,14 +51,16 @@ def upscaler_inf(
    )
    import apps.stable_diffusion.web.utils.global_obj as global_obj

-    global schedulers
-
    args.prompts = [prompt]
    args.negative_prompts = [negative_prompt]
    args.guidance_scale = guidance_scale
    args.seed = seed
    args.steps = steps
    args.scheduler = scheduler
+    args.ondemand = ondemand
+    if ondemand and batch_count > 1:
+        print("Low VRAM mode currently only supports 1 batch count.")
+        batch_count = 1

    if init_image is None:
        return None, "An Initial Image is required"
@@ -102,9 +104,11 @@ def upscaler_inf(
        device,
        use_lora=args.use_lora,
        use_stencil=None,
+        ondemand=ondemand,
    )
    if (
-        not global_obj.get_sd_obj()
+        args.ondemand
+        or not global_obj.get_sd_obj()
        or global_obj.get_cfg_obj() != new_config_obj
    ):
        global_obj.clear_cache()
@@ -121,8 +125,8 @@ def upscaler_inf(
            if args.hf_model_id
            else "stabilityai/stable-diffusion-2-1-base"
        )
-        schedulers = get_schedulers(model_id)
-        scheduler_obj = schedulers[scheduler]
+        global_obj.set_schedulers(get_schedulers(model_id))
+        scheduler_obj = global_obj.get_scheduler(scheduler)
        global_obj.set_sd_obj(
            UpscalerPipeline.from_pretrained(
                scheduler_obj,
@@ -139,11 +143,14 @@ def upscaler_inf(
                args.use_tuned,
                low_cpu_mem_usage=args.low_cpu_mem_usage,
                use_lora=args.use_lora,
+                ondemand=args.ondemand,
            )
        )

-    global_obj.set_schedulers(schedulers[scheduler])
-    global_obj.get_sd_obj().low_res_scheduler = schedulers["DDPM"]
+    global_obj.set_sd_scheduler(scheduler)
+    global_obj.get_sd_obj().low_res_scheduler = global_obj.get_scheduler(
+        "DDPM"
+    )

    start_time = time.time()
    global_obj.get_sd_obj().log = ""
@@ -238,6 +245,7 @@ if __name__ == "__main__":
        low_cpu_mem_usage=args.low_cpu_mem_usage,
        use_lora=args.use_lora,
        ddpm_scheduler=schedulers["DDPM"],
+        ondemand=args.ondemand,
    )

    start_time = time.time()
--- a/apps/stable_diffusion/shark_sd.spec
+++ b/apps/stable_diffusion/shark_sd.spec
@@ -25,6 +25,7 @@ datas += collect_data_files('pytorch_lightning')
 datas += collect_data_files('opencv-python')
 datas += collect_data_files('skimage')
 datas += collect_data_files('gradio')
+datas += collect_data_files('gradio_client')
 datas += collect_data_files('iree')
 datas += collect_data_files('google-cloud-storage')
 datas += collect_data_files('shark')
--- a/apps/stable_diffusion/shark_sd_cli.spec
+++ b/apps/stable_diffusion/shark_sd_cli.spec
@@ -25,6 +25,7 @@ datas += collect_data_files('opencv-python')
 datas += collect_data_files('pytorch_lightning')
 datas += collect_data_files('skimage')
 datas += collect_data_files('gradio')
+datas += collect_data_files('gradio_client')
 datas += collect_data_files('iree')
 datas += collect_data_files('google-cloud-storage')
 datas += collect_data_files('shark')
@@ -43,7 +44,7 @@ hiddenimports = ['shark', 'shark.shark_inference', 'apps']
 hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]

 a = Analysis(
-    ['scripts/txt2img.py'],
+    ['scripts/main.py'],
    pathex=['.'],
    binaries=binaries,
    datas=datas,
--- a/apps/stable_diffusion/src/init.py
+++ b/apps/stable_diffusion/src/init.py
@@ -5,6 +5,7 @@ from apps.stable_diffusion.src.utils import (
    get_available_devices,
    clear_all,
    save_output_img,
+    resize_stencil,
 )
 from apps.stable_diffusion.src.pipelines import (
    Text2ImagePipeline,
--- a/apps/stable_diffusion/src/models/model_wrappers.py
+++ b/apps/stable_diffusion/src/models/model_wrappers.py
@@ -11,7 +11,7 @@ from apps.stable_diffusion.src.utils import (
    get_opt_flags,
    base_models,
    args,
-    fetch_or_delete_vmfbs,
+    fetch_vmfb,
    preprocessCKPT,
    get_path_to_diffusers_checkpoint,
    fetch_and_update_base_model_id,
@@ -55,29 +55,9 @@ def replace_shape_str(shape, max_len, width, height, batch_size):
    return new_shape


-# Get the input info for various models i.e. "unet", "clip", "vae", "vae_encode".
-def get_input_info(model_info, max_len, width, height, batch_size):
-    dtype_config = {"f32": torch.float32, "i64": torch.int64}
-    input_map = defaultdict(list)
-    for k in model_info:
-        for inp in model_info[k]:
-            shape = model_info[k][inp]["shape"]
-            dtype = dtype_config[model_info[k][inp]["dtype"]]
-            tensor = None
-            if isinstance(shape, list):
-                clean_shape = replace_shape_str(
-                    shape, max_len, width, height, batch_size
-                )
-                if dtype == torch.int64:
-                    tensor = torch.randint(1, 3, tuple(clean_shape))
-                else:
-                    tensor = torch.randn(*clean_shape).to(dtype)
-            elif isinstance(shape, int):
-                tensor = torch.tensor(shape).to(dtype)
-            else:
-                sys.exit("shape isn't specified correctly.")
-            input_map[k].append(tensor)
-    return input_map
+# Guard run after a compile step: raises when `model` is falsy, naming the
+# sub-model (`model_name`) in the error so the failing stage is identifiable.
+def check_compilation(model, model_name):
+    if not model:
+        raise Exception(f"Could not compile {model_name}. Please create an issue with the detailed log at https://github.com/nod-ai/SHARK/issues")


 class SharkifyStableDiffusionModel:
@@ -100,7 +80,9 @@ class SharkifyStableDiffusionModel:
        is_inpaint: bool = False,
        is_upscaler: bool = False,
        use_stencil: str = None,
-        use_lora: str = ""
+        use_lora: str = "",
+        use_quantize: str = None,
+        return_mlir: bool = False,
    ):
        self.check_params(max_len, width, height)
        self.max_len = max_len
@@ -108,6 +90,7 @@ class SharkifyStableDiffusionModel:
        self.width = width // 8
        self.batch_size = batch_size
        self.custom_weights = custom_weights
+        self.use_quantize = use_quantize
        if custom_weights != "":
            assert custom_weights.lower().endswith(
                (".ckpt", ".safetensors")
@@ -146,18 +129,32 @@ class SharkifyStableDiffusionModel:
        self.use_lora = use_lora

        print(self.model_name)
+        self.model_name = self.get_extended_name_for_all_model()
        self.debug = debug
        self.sharktank_dir = sharktank_dir
        self.generate_vmfb = generate_vmfb

-    def get_extended_name_for_all_model(self, mask_to_fetch):
+        self.inputs = dict()
+        self.model_to_run = ""
+        if self.custom_weights != "":
+            self.model_to_run = self.custom_weights
+            assert self.custom_weights.lower().endswith(
+                (".ckpt", ".safetensors")
+            ), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
+            preprocessCKPT(self.custom_weights, self.is_inpaint)
+        else:
+            self.model_to_run = args.hf_model_id
+        self.custom_vae = self.process_custom_vae()
+        self.base_model_id = fetch_and_update_base_model_id(self.model_to_run)
+        if self.base_model_id != "" and args.ckpt_loc != "":
+            args.hf_model_id = self.base_model_id
+        self.return_mlir = return_mlir
+
+    def get_extended_name_for_all_model(self):
        model_name = {}
        sub_model_list = ["clip", "unet", "stencil_unet", "vae", "vae_encode", "stencil_adaptor"]
        index = 0
        for model in sub_model_list:
-            if mask_to_fetch[index] == False:
-                index += 1
-                continue
            sub_model = model
            model_config = self.model_name
            if "vae" == model:
@@ -177,6 +174,29 @@ class SharkifyStableDiffusionModel:
        if not (height % 8 == 0 and height >= 128):
            sys.exit("height should be greater than 128 and multiple of 8")

+    # Get the input info for a model i.e. "unet", "clip", "vae", etc.
+    # `model_info` maps each input name to a dict holding a "shape" entry
+    # (a list or an int) and a "dtype" entry ("f32" or "i64").
+    # Returns a list with one placeholder tensor per input, in the iteration
+    # order of `model_info`.
+    def get_input_info_for(self, model_info):
+        dtype_config = {"f32": torch.float32, "i64": torch.int64}
+        input_map = []
+        for inp in model_info:
+            shape = model_info[inp]["shape"]
+            dtype = dtype_config[model_info[inp]["dtype"]]
+            tensor = None
+            if isinstance(shape, list):
+                # List shapes are resolved against this instance's max_len,
+                # width, height and batch_size by replace_shape_str.
+                clean_shape = replace_shape_str(
+                    shape, self.max_len, self.width, self.height, self.batch_size
+                )
+                if dtype == torch.int64:
+                    # Integer inputs get random values drawn from {1, 2}.
+                    tensor = torch.randint(1, 3, tuple(clean_shape))
+                else:
+                    tensor = torch.randn(*clean_shape).to(dtype)
+            elif isinstance(shape, int):
+                # An int "shape" is used as the tensor's value, not its size.
+                tensor = torch.tensor(shape).to(dtype)
+            else:
+                # Anything other than a list or an int terminates the process.
+                sys.exit("shape isn't specified correctly.")
+            input_map.append(tensor)
+        return input_map
+    
    def get_vae_encode(self):
        class VaeEncodeModel(torch.nn.Module):
            def __init__(self, model_id=self.model_id, low_cpu_mem_usage=False):
@@ -193,8 +213,8 @@ class SharkifyStableDiffusionModel:

        vae_encode = VaeEncodeModel()
        inputs = tuple(self.inputs["vae_encode"])
-        is_f16 = True if self.precision == "fp16" else False
-        shark_vae_encode = compile_through_fx(
+        is_f16 = True if not self.is_upscaler and self.precision == "fp16" else False
+        shark_vae_encode, vae_encode_mlir = compile_through_fx(
            vae_encode,
            inputs,
            is_f16=is_f16,
@@ -203,7 +223,7 @@ class SharkifyStableDiffusionModel:
            extra_args=get_opt_flags("vae", precision=self.precision),
            base_model_id=self.base_model_id,
        )
-        return shark_vae_encode
+        return shark_vae_encode, vae_encode_mlir

    def get_vae(self):
        class VaeModel(torch.nn.Module):
@@ -247,7 +267,7 @@ class SharkifyStableDiffusionModel:
        save_dir = os.path.join(self.sharktank_dir, self.model_name["vae"])
        if self.debug:
            os.makedirs(save_dir, exist_ok=True)
-        shark_vae = compile_through_fx(
+        shark_vae, vae_mlir = compile_through_fx(
            vae,
            inputs,
            is_f16=is_f16,
@@ -259,34 +279,7 @@ class SharkifyStableDiffusionModel:
            extra_args=get_opt_flags("vae", precision=self.precision),
            base_model_id=self.base_model_id,
        )
-        return shark_vae
-
-    def get_vae_upscaler(self):
-        class VaeModel(torch.nn.Module):
-            def __init__(self, model_id=self.model_id, low_cpu_mem_usage=False):
-                super().__init__()
-                self.vae = AutoencoderKL.from_pretrained(
-                    model_id,
-                    subfolder="vae",
-                    low_cpu_mem_usage=low_cpu_mem_usage,
-                )
-
-            def forward(self, input):
-                x = self.vae.decode(input, return_dict=False)[0]
-                x = (x / 2 + 0.5).clamp(0, 1)
-                return x
-
-        vae = VaeModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
-        inputs = tuple(self.inputs["vae"])
-        shark_vae = compile_through_fx(
-            vae,
-            inputs,
-            use_tuned=self.use_tuned,
-            model_name=self.model_name["vae"],
-            extra_args=get_opt_flags("vae", precision="fp32"),
-            base_model_id=self.base_model_id,
-        )
-        return shark_vae
+        return shark_vae, vae_mlir

    def get_controlled_unet(self):
        class ControlledUnetModel(torch.nn.Module):
@@ -329,9 +322,9 @@ class SharkifyStableDiffusionModel:
        unet = ControlledUnetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
        is_f16 = True if self.precision == "fp16" else False

-        inputs = tuple(self.inputs["stencil_unet"])
+        inputs = tuple(self.inputs["unet"])
        input_mask = [True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True,]
-        shark_controlled_unet = compile_through_fx(
+        shark_controlled_unet, controlled_unet_mlir = compile_through_fx(
            unet,
            inputs,
            model_name=self.model_name["stencil_unet"],
@@ -341,7 +334,7 @@ class SharkifyStableDiffusionModel:
            extra_args=get_opt_flags("unet", precision=self.precision),
            base_model_id=self.base_model_id,
        )
-        return shark_controlled_unet
+        return shark_controlled_unet, controlled_unet_mlir

    def get_control_net(self):
        class StencilControlNetModel(torch.nn.Module):
@@ -385,7 +378,7 @@ class SharkifyStableDiffusionModel:

        inputs = tuple(self.inputs["stencil_adaptor"])
        input_mask = [True, True, True, True]
-        shark_cnet = compile_through_fx(
+        shark_cnet, cnet_mlir = compile_through_fx(
            scnet,
            inputs,
            model_name=self.model_name["stencil_adaptor"],
@@ -395,7 +388,7 @@ class SharkifyStableDiffusionModel:
            extra_args=get_opt_flags("unet", precision=self.precision),
            base_model_id=self.base_model_id,
        )
-        return shark_cnet
+        return shark_cnet, cnet_mlir

    def get_unet(self):
        class UnetModel(torch.nn.Module):
@@ -441,7 +434,7 @@ class SharkifyStableDiffusionModel:
                save_dir,
                exist_ok=True,
            )
-        shark_unet = compile_through_fx(
+        shark_unet, unet_mlir = compile_through_fx(
            unet,
            inputs,
            model_name=self.model_name["unet"],
@@ -454,7 +447,7 @@ class SharkifyStableDiffusionModel:
            extra_args=get_opt_flags("unet", precision=self.precision),
            base_model_id=self.base_model_id,
        )
-        return shark_unet
+        return shark_unet, unet_mlir

    def get_unet_upscaler(self):
        class UnetModel(torch.nn.Module):
@@ -482,7 +475,7 @@ class SharkifyStableDiffusionModel:
        is_f16 = True if self.precision == "fp16" else False
        inputs = tuple(self.inputs["unet"])
        input_mask = [True, True, True, False]
-        shark_unet = compile_through_fx(
+        shark_unet, unet_mlir = compile_through_fx(
            unet,
            inputs,
            model_name=self.model_name["unet"],
@@ -492,7 +485,7 @@ class SharkifyStableDiffusionModel:
            extra_args=get_opt_flags("unet", precision=self.precision),
            base_model_id=self.base_model_id,
        )
-        return shark_unet
+        return shark_unet, unet_mlir

    def get_clip(self):
        class CLIPText(torch.nn.Module):
@@ -516,7 +509,7 @@ class SharkifyStableDiffusionModel:
                save_dir,
                exist_ok=True,
            )
-        shark_clip = compile_through_fx(
+        shark_clip, clip_mlir = compile_through_fx(
            clip_model,
            tuple(self.inputs["clip"]),
            model_name=self.model_name["clip"],
@@ -526,7 +519,7 @@ class SharkifyStableDiffusionModel:
            extra_args=get_opt_flags("clip", precision="fp32"),
            base_model_id=self.base_model_id,
        )
-        return shark_clip
+        return shark_clip, clip_mlir

    def process_custom_vae(self):
        custom_vae = self.custom_vae.lower()
@@ -547,133 +540,130 @@ class SharkifyStableDiffusionModel:
                vae_checkpoint = vae_checkpoint["state_dict"]
            vae_dict = {k: v for k, v in vae_checkpoint.items() if k[0:4] != "loss" and k not in vae_ignore_keys}
            return vae_dict
-        
-            
-    # Compiles Clip, Unet and Vae with `base_model_id` as defining their input
-    # configiration.
-    def compile_all(self, base_model_id, need_vae_encode, need_stencil):
-        self.base_model_id = base_model_id
-        self.inputs = get_input_info(
-            base_models[base_model_id],
-            self.max_len,
-            self.width,
-            self.height,
-            self.batch_size,
-        )
-        if self.is_upscaler:
-            return self.get_clip(), self.get_unet_upscaler(), self.get_vae_upscaler()

-        compiled_controlnet = None
-        compiled_controlled_unet = None
-        compiled_unet = None
-        if need_stencil:
-            compiled_controlnet = self.get_control_net()
-            compiled_controlled_unet = self.get_controlled_unet()
-        else:
-            compiled_unet = self.get_unet()
-        if self.custom_vae != "":
-            print("Plugging in custom Vae")
-        compiled_vae = self.get_vae()
-        compiled_clip = self.get_clip()
-
-        if need_stencil:
-            return compiled_clip, compiled_controlled_unet, compiled_vae, compiled_controlnet
-        if need_vae_encode:
-            compiled_vae_encode = self.get_vae_encode()
-            return compiled_clip, compiled_unet, compiled_vae, compiled_vae_encode
-
-        return compiled_clip, compiled_unet, compiled_vae
-
-    def __call__(self):
-        # Step 1:
-        # --  Fetch all vmfbs for the model, if present, else delete the lot.
-        need_vae_encode, need_stencil = False, False
-        if not self.is_upscaler and args.img_path is not None:
-            if self.use_stencil is not None:
-                need_stencil = True
+    def compile_unet_variants(self, model):
+        if model == "unet":
+            if self.is_upscaler:
+                return self.get_unet_upscaler()
+            # TODO: Plug the experimental "int8" support at right place.
+            elif self.use_quantize == "int8":
+                from apps.stable_diffusion.src.models.opt_params import get_unet
+                return get_unet()
            else:
-                need_vae_encode = True
-        # `mask_to_fetch` prepares a mask to pick a combination out of :-
-        # ["clip", "unet", "stencil_unet", "vae", "vae_encode", "stencil_adaptor"]
-        mask_to_fetch = [True, True, False, True, False, False]
-        if need_vae_encode:
-            mask_to_fetch = [True, True, False, True, True, False]
-        elif need_stencil:
-            mask_to_fetch = [True, False, True, True, False, True]
-        self.model_name = self.get_extended_name_for_all_model(mask_to_fetch)
-        vmfbs = fetch_or_delete_vmfbs(self.model_name, self.precision)   
-        if vmfbs[0]:
-            # -- If all vmfbs are indeed present, we also try and fetch the base
-            #    model configuration for running SD with custom checkpoints.
-            if self.custom_weights != "":
-                args.hf_model_id = fetch_and_update_base_model_id(self.custom_weights)
-            if args.hf_model_id == "":
-                sys.exit("Base model configuration for the custom model is missing. Use `--clear_all` and re-run.")
-            print("Loaded vmfbs from cache and successfully fetched base model configuration.")
-            return vmfbs
-
-        # Step 2:
-        # -- If vmfbs weren't found, we try to see if the base model configuration
-        #    for the required SD run is known to us and bypass the retry mechanism.
-        model_to_run = ""
-        if self.custom_weights != "":
-            model_to_run = self.custom_weights
-            assert self.custom_weights.lower().endswith(
-                (".ckpt", ".safetensors")
-            ), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
-            preprocessCKPT(self.custom_weights, self.is_inpaint)
+                return self.get_unet()
        else:
-            model_to_run = args.hf_model_id
-        # For custom Vae user can provide either the repo-id or a checkpoint file,
-        # and for a checkpoint file we'd need to process it via Diffusers' script.
-        self.custom_vae = self.process_custom_vae()
-        base_model_fetched = fetch_and_update_base_model_id(model_to_run)
-        if base_model_fetched != "":
-            print("Compiling all the models with the fetched base model configuration.")
-            if args.ckpt_loc != "":
-                args.hf_model_id = base_model_fetched
-            return self.compile_all(base_model_fetched, need_vae_encode, need_stencil)
+            return self.get_controlled_unet()

-        # Step 3:
-        # -- This is the retry mechanism where the base model's configuration is not
-        #    known to us and figure that out by trial and error.
-        print("Inferring base model configuration.")
-        for model_id in base_models:
-            try:
-                if need_vae_encode:
-                    compiled_clip, compiled_unet, compiled_vae, compiled_vae_encode = self.compile_all(model_id, need_vae_encode, need_stencil)
-                elif need_stencil:
-                    compiled_clip, compiled_unet, compiled_vae, compiled_controlnet = self.compile_all(model_id, need_vae_encode, need_stencil)
-                else:
-                    compiled_clip, compiled_unet, compiled_vae = self.compile_all(model_id, need_vae_encode, need_stencil)
-            except Exception as e:
-                print(e)
-                print("Retrying with a different base model configuration")
-                continue
-            # -- Once a successful compilation has taken place we'd want to store
-            #    the base model's configuration inferred.
-            fetch_and_update_base_model_id(model_to_run, model_id)
-            # This is done just because in main.py we are basing the choice of tokenizer and scheduler
-            # on `args.hf_model_id`. Since now, we don't maintain 1:1 mapping of variants and the base
-            # model and rely on retrying method to find the input configuration, we should also update
-            # the knowledge of base model id accordingly into `args.hf_model_id`.
-            if args.ckpt_loc != "":
-                args.hf_model_id = model_id
-            if need_vae_encode:
-                return (
-                    compiled_clip,
-                    compiled_unet,
-                    compiled_vae,
-                    compiled_vae_encode,
-                )
-            if need_stencil:
-                return (
-                    compiled_clip,
-                    compiled_unet,
-                    compiled_vae,
-                    compiled_controlnet,
-                )
-            return compiled_clip, compiled_unet, compiled_vae
-        sys.exit(
-            "Cannot compile the model. Please create an issue with the detailed log at https://github.com/nod-ai/SHARK/issues"
-        )
+    # Returns the VAE-encode module for this model configuration.
+    # A truthy result from fetch_vmfb is returned as-is unless `return_mlir`
+    # is set; otherwise the model is compiled through get_vae_encode() and
+    # either the compiled module or its MLIR is returned.
+    # Any exception raised while compiling ends the process via sys.exit.
+    def vae_encode(self):
+        # Fetch vmfb for the model if present
+        vmfb = fetch_vmfb("vae_encode", self.model_name["vae_encode"], self.precision)
+        if vmfb and not self.return_mlir:
+            return vmfb
+
+        try:
+            # get_vae_encode() reads its example inputs from self.inputs["vae_encode"].
+            self.inputs["vae_encode"] = self.get_input_info_for(base_models["vae_encode"])
+            compiled_vae_encode, vae_encode_mlir = self.get_vae_encode()
+
+            # Raises (and therefore exits below) when compilation produced nothing.
+            check_compilation(compiled_vae_encode, "Vae Encode")
+            if self.return_mlir:
+                return vae_encode_mlir
+            return compiled_vae_encode
+        except Exception as e:
+            sys.exit(e)
+
+    # Returns the CLIP text-encoder module for this model configuration.
+    # A truthy result from fetch_vmfb is returned as-is unless `return_mlir`
+    # is set; otherwise the model is compiled through get_clip() and either
+    # the compiled module or its MLIR is returned.
+    # Any exception raised while compiling ends the process via sys.exit.
+    def clip(self):
+        vmfb = fetch_vmfb("clip", self.model_name["clip"], self.precision)
+        if vmfb and not self.return_mlir:
+            return vmfb
+
+        try:
+            # get_clip() reads its example inputs from self.inputs["clip"].
+            self.inputs["clip"] = self.get_input_info_for(base_models["clip"])
+            compiled_clip, clip_mlir = self.get_clip()
+
+            # Raises (and therefore exits below) when compilation produced nothing.
+            check_compilation(compiled_clip, "Clip")
+            if self.return_mlir:
+                return clip_mlir
+            return compiled_clip
+        except Exception as e:
+            sys.exit(e)
+
+    # Returns the UNet module ("stencil_unet" when `use_stencil` is set,
+    # plain "unet" otherwise).
+    # A truthy result from fetch_vmfb is returned as-is unless `return_mlir`
+    # is set. Otherwise the model is compiled: directly when `base_model_id`
+    # is already known, or by trying every base model configuration in turn
+    # until one compiles. Returns the compiled module, or its MLIR when
+    # `return_mlir` is set.
+    # Any exception that escapes the compile step ends the process via sys.exit.
+    def unet(self):
+        model = "stencil_unet" if self.use_stencil is not None else "unet"
+        vmfb = fetch_vmfb(model, self.model_name[model], self.precision)
+        if vmfb and not self.return_mlir:
+            return vmfb
+
+        try:
+            compiled_unet = None
+            # Per-base-model input descriptions for the selected variant.
+            unet_inputs = base_models[model]
+
+            if self.base_model_id != "":
+                # Inputs are stored under "unet" for both variants.
+                self.inputs["unet"] = self.get_input_info_for(unet_inputs[self.base_model_id])
+                compiled_unet, unet_mlir = self.compile_unet_variants(model)
+            else:
+                # Base model unknown: probe each candidate configuration.
+                # NOTE(review): self.base_model_id is overwritten on every
+                # attempt and keeps the last candidate if all of them fail.
+                for model_id in unet_inputs:
+                    self.base_model_id = model_id
+                    self.inputs["unet"] = self.get_input_info_for(unet_inputs[model_id])
+
+                    try:
+                        compiled_unet, unet_mlir = self.compile_unet_variants(model)
+                    except Exception as e:
+                        print(e)
+                        print("Retrying with a different base model configuration")
+                        continue
+
+                    # -- Once a successful compilation has taken place we'd want to store
+                    #    the base model's configuration inferred.
+                    fetch_and_update_base_model_id(self.model_to_run, model_id)
+                    # This is done just because in main.py we are basing the choice of tokenizer and scheduler
+                    # on `args.hf_model_id`. Since now, we don't maintain 1:1 mapping of variants and the base
+                    # model and rely on retrying method to find the input configuration, we should also update
+                    # the knowledge of base model id accordingly into `args.hf_model_id`.
+                    if args.ckpt_loc != "":
+                        args.hf_model_id = model_id
+                    break
+
+            # compiled_unet is still None here when every candidate failed,
+            # so this raises before unet_mlir could be read unbound.
+            check_compilation(compiled_unet, "Unet")
+            if self.return_mlir:
+                return unet_mlir
+            return compiled_unet
+        except Exception as e:
+            sys.exit(e)
+
+    # Returns the VAE (decode) module for this model configuration.
+    # A truthy result from fetch_vmfb is returned as-is unless `return_mlir`
+    # is set; otherwise the model is compiled through get_vae() and either
+    # the compiled module or its MLIR is returned.
+    # Any exception raised while compiling ends the process via sys.exit.
+    def vae(self):
+        vmfb = fetch_vmfb("vae", self.model_name["vae"], self.precision)
+        if vmfb and not self.return_mlir:
+            return vmfb
+
+        try:
+            # The upscaler uses its own input description ("vae_upscaler").
+            vae_input = base_models["vae"]["vae_upscaler"] if self.is_upscaler else base_models["vae"]["vae"]
+            self.inputs["vae"] = self.get_input_info_for(vae_input)
+
+            # For the upscaler, base_vae is forced to True only for the
+            # duration of get_vae() and restored right after.
+            # NOTE(review): the restore is skipped if get_vae() raises, but
+            # the process exits in that case.
+            is_base_vae = self.base_vae
+            if self.is_upscaler:
+                self.base_vae = True
+            compiled_vae, vae_mlir = self.get_vae()
+            self.base_vae = is_base_vae
+
+            # Raises (and therefore exits below) when compilation produced nothing.
+            check_compilation(compiled_vae, "Vae")
+            if self.return_mlir:
+                return vae_mlir
+            return compiled_vae
+        except Exception as e:
+            sys.exit(e)
+
+    # Returns the ControlNet ("stencil_adaptor") module for this configuration.
+    # A truthy result from fetch_vmfb is returned as-is unless `return_mlir`
+    # is set; otherwise the model is compiled through get_control_net() and
+    # either the compiled module or its MLIR is returned.
+    # Any exception raised while compiling ends the process via sys.exit.
+    def controlnet(self):
+        vmfb = fetch_vmfb("stencil_adaptor", self.model_name["stencil_adaptor"], self.precision)
+        if vmfb and not self.return_mlir:
+            return vmfb
+
+        try:
+            # get_control_net() reads its example inputs from self.inputs["stencil_adaptor"].
+            self.inputs["stencil_adaptor"] = self.get_input_info_for(base_models["stencil_adaptor"])
+            compiled_stencil_adaptor, controlnet_mlir = self.get_control_net()
+
+            # Raises (and therefore exits below) when compilation produced nothing.
+            check_compilation(compiled_stencil_adaptor, "Stencil")
+            if self.return_mlir:
+                return controlnet_mlir
+            return compiled_stencil_adaptor
+        except Exception as e:
+            sys.exit(e)
--- a/apps/stable_diffusion/src/models/opt_params.py
+++ b/apps/stable_diffusion/src/models/opt_params.py
@@ -20,6 +20,15 @@ hf_model_variant_map = {
    "stabilityai/stable-diffusion-2-inpainting": ["stablediffusion", "inpaint_v2"],
 }

+# TODO: Add the quantized model as a part of model_db.json.
+# This is currently in experimental phase.
+def get_quantize_model():
+    # Returns (bucket_key, model_key, iree_flags) for the experimental int8
+    # quantized unet. Exits the process when the requested configuration is
+    # not the single one the quantized model supports.
+    bucket_key = "gs://shark_tank/prashant_nod"
+    model_key = "unet_int8"
+    iree_flags = get_opt_flags("unet", precision="fp16")
+    # Every one of height, width and max_length must match, so reject the
+    # request as soon as ANY of them differs. The previous `and` chain only
+    # rejected it when all three were wrong at once.
+    if args.height != 512 or args.width != 512 or args.max_length != 77:
+        sys.exit("The int8 quantized model currently requires the height and width to be 512, and max_length to be 77")
+    return bucket_key, model_key, iree_flags

 def get_variant_version(hf_model_id):
    return hf_model_variant_map[hf_model_id]
@@ -41,6 +50,12 @@ def get_unet():
    variant, version = get_variant_version(args.hf_model_id)
    # Tuned model is present only for `fp16` precision.
    is_tuned = "tuned" if args.use_tuned else "untuned"
+
+    # TODO: Get the quantize model from model_db.json
+    if args.use_quantize == "int8":
+        bk, mk, flags = get_quantize_model()
+        return get_shark_model(bk, mk, flags)
+
    if "vulkan" not in args.device and args.use_tuned:
        bucket_key = f"{variant}/{is_tuned}/{args.device}"
        model_key = f"{variant}/{version}/unet/{args.precision}/length_{args.max_length}/{is_tuned}/{args.device}"
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
@@ -20,16 +20,15 @@ from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
 from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
    StableDiffusionPipeline,
 )
+from apps.stable_diffusion.src.models import (
+    SharkifyStableDiffusionModel,
+    get_vae_encode,
+)


 class Image2ImagePipeline(StableDiffusionPipeline):
    def __init__(
        self,
-        vae_encode: SharkInference,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
        scheduler: Union[
            DDIMScheduler,
            PNDMScheduler,
@@ -40,9 +39,30 @@ class Image2ImagePipeline(StableDiffusionPipeline):
            SharkEulerDiscreteScheduler,
            DEISMultistepScheduler,
        ],
+        sd_model: SharkifyStableDiffusionModel,
+        import_mlir: bool,
+        use_lora: str,
+        ondemand: bool,
    ):
-        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
-        self.vae_encode = vae_encode
+        super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
+        self.vae_encode = None
+
+    # Populates self.vae_encode on first use; does nothing when it is already
+    # loaded.
+    def load_vae_encode(self):
+        if self.vae_encode is not None:
+            return
+
+        # With import_mlir or LoRA enabled, always go through the sd_model path.
+        if self.import_mlir or self.use_lora:
+            self.vae_encode = self.sd_model.vae_encode()
+        else:
+            try:
+                self.vae_encode = get_vae_encode()
+            except:
+                # NOTE(review): a bare except also traps KeyboardInterrupt and
+                # SystemExit — confirm that falling back in those cases is intended.
+                print("download pipeline failed, falling back to import_mlir")
+                self.vae_encode = self.sd_model.vae_encode()
+
+    # Drops this pipeline's reference to the VAE-encode module and resets the
+    # attribute to None so load_vae_encode() will load it again on next use.
+    def unload_vae_encode(self):
+        del self.vae_encode
+        self.vae_encode = None

    def prepare_image_latents(
        self,
@@ -89,9 +109,12 @@ class Image2ImagePipeline(StableDiffusionPipeline):
        return latents, timesteps

    def encode_image(self, input_image):
+        self.load_vae_encode()
        vae_encode_start = time.time()
        latents = self.vae_encode("forward", input_image)
        vae_inf_time = (time.time() - vae_encode_start) * 1000
+        if self.ondemand:
+            self.unload_vae_encode()
        self.log += f"\nVAE Encode Inference time (ms): {vae_inf_time:.3f}"

        return latents
@@ -131,8 +154,10 @@ class Image2ImagePipeline(StableDiffusionPipeline):
            seed = randint(uint32_min, uint32_max)
        generator = torch.manual_seed(seed)

-        # Get text embeddings from prompts
-        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
+        # Get text embeddings with weight emphasis from prompts
+        text_embeddings = self.encode_prompts_weight(
+            prompts, neg_prompts, max_length
+        )

        # guidance scale as a float32 tensor.
        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
@@ -161,6 +186,7 @@ class Image2ImagePipeline(StableDiffusionPipeline):

        # Img latents -> PIL images
        all_imgs = []
+        self.load_vae()
        for i in tqdm(range(0, latents.shape[0], batch_size)):
            imgs = self.decode_latents(
                latents=latents[i : i + batch_size],
@@ -168,5 +194,7 @@ class Image2ImagePipeline(StableDiffusionPipeline):
                cpu_scheduling=cpu_scheduling,
            )
            all_imgs.extend(imgs)
+        if self.ondemand:
+            self.unload_vae()

        return all_imgs
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_inpaint.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_inpaint.py
@@ -19,16 +19,15 @@ from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
 from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
    StableDiffusionPipeline,
 )
+from apps.stable_diffusion.src.models import (
+    SharkifyStableDiffusionModel,
+    get_vae_encode,
+)


 class InpaintPipeline(StableDiffusionPipeline):
    def __init__(
        self,
-        vae_encode: SharkInference,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
        scheduler: Union[
            DDIMScheduler,
            PNDMScheduler,
@@ -39,9 +38,30 @@ class InpaintPipeline(StableDiffusionPipeline):
            SharkEulerDiscreteScheduler,
            DEISMultistepScheduler,
        ],
+        sd_model: SharkifyStableDiffusionModel,
+        import_mlir: bool,
+        use_lora: str,
+        ondemand: bool,
    ):
-        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
-        self.vae_encode = vae_encode
+        super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
+        self.vae_encode = None
+
+    def load_vae_encode(self):  # lazily populate self.vae_encode; no-op when already set
+        if self.vae_encode is not None:
+            return
+
+        if self.import_mlir or self.use_lora:  # either flag forces building via sd_model
+            self.vae_encode = self.sd_model.vae_encode()
+        else:
+            try:
+                self.vae_encode = get_vae_encode()
+            except Exception:  # not bare: Ctrl-C / SystemExit must abort, not start the fallback
+                print("download pipeline failed, falling back to import_mlir")
+                self.vae_encode = self.sd_model.vae_encode()
+
+    def unload_vae_encode(self):
+        # Rebinding to None drops this object's reference; a separate `del` first is redundant.
+        self.vae_encode = None

    def prepare_latents(
        self,
@@ -305,9 +325,12 @@ class InpaintPipeline(StableDiffusionPipeline):
        )
        mask = mask.to(dtype)

+        self.load_vae_encode()
        masked_image = masked_image.to(dtype)
        masked_image_latents = self.vae_encode("forward", (masked_image,))
        masked_image_latents = torch.from_numpy(masked_image_latents)
+        if self.ondemand:
+            self.unload_vae_encode()

        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
        if mask.shape[0] < batch_size:
@@ -383,8 +406,10 @@ class InpaintPipeline(StableDiffusionPipeline):
            dtype=dtype,
        )

-        # Get text embeddings from prompts
-        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
+        # Get text embeddings with weight emphasis from prompts
+        text_embeddings = self.encode_prompts_weight(
+            prompts, neg_prompts, max_length
+        )

        # guidance scale as a float32 tensor.
        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
@@ -428,6 +453,7 @@ class InpaintPipeline(StableDiffusionPipeline):

        # Img latents -> PIL images
        all_imgs = []
+        self.load_vae()
        for i in tqdm(range(0, latents.shape[0], batch_size)):
            imgs = self.decode_latents(
                latents=latents[i : i + batch_size],
@@ -435,6 +461,8 @@ class InpaintPipeline(StableDiffusionPipeline):
                cpu_scheduling=cpu_scheduling,
            )
            all_imgs.extend(imgs)
+        if self.ondemand:
+            self.unload_vae()

        if inpaint_full_res:
            output_image = self.apply_overlay(
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_outpaint.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_outpaint.py
@@ -20,16 +20,15 @@ from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils i
    StableDiffusionPipeline,
 )
 import math
+from apps.stable_diffusion.src.models import (
+    SharkifyStableDiffusionModel,
+    get_vae_encode,
+)


 class OutpaintPipeline(StableDiffusionPipeline):
    def __init__(
        self,
-        vae_encode: SharkInference,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
        scheduler: Union[
            DDIMScheduler,
            PNDMScheduler,
@@ -40,9 +39,30 @@ class OutpaintPipeline(StableDiffusionPipeline):
            SharkEulerDiscreteScheduler,
            DEISMultistepScheduler,
        ],
+        sd_model: SharkifyStableDiffusionModel,
+        import_mlir: bool,
+        use_lora: str,
+        ondemand: bool,
    ):
-        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
-        self.vae_encode = vae_encode
+        super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
+        self.vae_encode = None
+
+    def load_vae_encode(self):  # lazily populate self.vae_encode; no-op when already set
+        if self.vae_encode is not None:
+            return
+
+        if self.import_mlir or self.use_lora:  # either flag forces building via sd_model
+            self.vae_encode = self.sd_model.vae_encode()
+        else:
+            try:
+                self.vae_encode = get_vae_encode()
+            except Exception:  # not bare: Ctrl-C / SystemExit must abort, not start the fallback
+                print("download pipeline failed, falling back to import_mlir")
+                self.vae_encode = self.sd_model.vae_encode()
+
+    def unload_vae_encode(self):
+        # Rebinding to None drops this object's reference; a separate `del` first is redundant.
+        self.vae_encode = None

    def prepare_latents(
        self,
@@ -123,9 +143,12 @@ class OutpaintPipeline(StableDiffusionPipeline):
        )
        mask = mask.to(dtype)

+        self.load_vae_encode()
        masked_image = masked_image.to(dtype)
        masked_image_latents = self.vae_encode("forward", (masked_image,))
        masked_image_latents = torch.from_numpy(masked_image_latents)
+        if self.ondemand:
+            self.unload_vae_encode()

        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
        if mask.shape[0] < batch_size:
@@ -384,8 +407,10 @@ class OutpaintPipeline(StableDiffusionPipeline):
            dtype=dtype,
        )

-        # Get text embeddings from prompts
-        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
+        # Get text embeddings with weight emphasis from prompts
+        text_embeddings = self.encode_prompts_weight(
+            prompts, neg_prompts, max_length
+        )

        # guidance scale as a float32 tensor.
        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
@@ -506,6 +531,7 @@ class OutpaintPipeline(StableDiffusionPipeline):

            # Img latents -> PIL images
            all_imgs = []
+            self.load_vae()
            for i in tqdm(range(0, latents.shape[0], batch_size)):
                imgs = self.decode_latents(
                    latents=latents[i : i + batch_size],
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_stencil.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_stencil.py
@@ -20,16 +20,16 @@ from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils i
    StableDiffusionPipeline,
 )
 from apps.stable_diffusion.src.utils import controlnet_hint_conversion
+from apps.stable_diffusion.src.utils import (
+    start_profiling,
+    end_profiling,
+)
+from apps.stable_diffusion.src.models import SharkifyStableDiffusionModel


 class StencilPipeline(StableDiffusionPipeline):
    def __init__(
        self,
-        controlnet: SharkInference,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
        scheduler: Union[
            DDIMScheduler,
            PNDMScheduler,
@@ -39,9 +39,22 @@ class StencilPipeline(StableDiffusionPipeline):
            DPMSolverMultistepScheduler,
            SharkEulerDiscreteScheduler,
        ],
+        sd_model: SharkifyStableDiffusionModel,
+        import_mlir: bool,
+        use_lora: str,
+        ondemand: bool,
    ):
-        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
-        self.controlnet = controlnet
+        super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
+        self.controlnet = None
+
+    def load_controlnet(self):  # lazily populate self.controlnet from sd_model
+        if self.controlnet is not None:  # already loaded; keep the existing module
+            return
+        self.controlnet = self.sd_model.controlnet()  # only source here; no download path
+
+    def unload_controlnet(self):
+        # Rebinding to None drops this object's reference; a separate `del` first is redundant.
+        self.controlnet = None

    def prepare_latents(
        self,
@@ -68,6 +81,113 @@ class StencilPipeline(StableDiffusionPipeline):
        latents = latents * self.scheduler.init_noise_sigma
        return latents

+    def produce_stencil_latents(  # denoising loop: controlnet -> unet -> scheduler.step per timestep
+        self,
+        latents,
+        text_embeddings,
+        guidance_scale,
+        total_timesteps,
+        dtype,
+        cpu_scheduling,
+        controlnet_hint=None,
+        controlnet_conditioning_scale: float = 1.0,  # NOTE: accepted but never read in this method
+        mask=None,
+        masked_image_latents=None,
+        return_all_latents=False,
+    ):
+        step_time_sum = 0
+        latent_history = [latents]
+        text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
+        text_embeddings_numpy = text_embeddings.detach().numpy()
+        self.load_unet()  # both modules are loaded once, before the loop
+        self.load_controlnet()
+        for i, t in tqdm(enumerate(total_timesteps)):
+            step_start_time = time.time()
+            timestep = torch.tensor([t]).to(dtype)
+            latent_model_input = self.scheduler.scale_model_input(latents, t)
+            if mask is not None and masked_image_latents is not None:
+                latent_model_input = torch.cat(
+                    [
+                        torch.from_numpy(np.asarray(latent_model_input)),
+                        mask,
+                        masked_image_latents,
+                    ],
+                    dim=1,
+                ).to(dtype)
+            if cpu_scheduling:
+                latent_model_input = latent_model_input.detach().numpy()
+
+            if not torch.is_tensor(latent_model_input):  # controlnet is fed a torch tensor
+                latent_model_input_1 = torch.from_numpy(
+                    np.asarray(latent_model_input)
+                ).to(dtype)
+            else:
+                latent_model_input_1 = latent_model_input
+            control = self.controlnet(
+                "forward",
+                (
+                    latent_model_input_1,
+                    timestep,
+                    text_embeddings,
+                    controlnet_hint,
+                ),
+                send_to_host=False,
+            )
+            timestep = timestep.detach().numpy()
+            # Profiling Unet.
+            profile_device = start_profiling(file_path="unet.rdc")
+            # TODO: Pass `control` as it is to Unet. Same as TODO mentioned in model_wrappers.py.
+            noise_pred = self.unet(
+                "forward",
+                (
+                    latent_model_input,
+                    timestep,
+                    text_embeddings_numpy,
+                    guidance_scale,
+                    control[0],
+                    control[1],
+                    control[2],
+                    control[3],
+                    control[4],
+                    control[5],
+                    control[6],
+                    control[7],
+                    control[8],
+                    control[9],
+                    control[10],
+                    control[11],
+                    control[12],
+                ),
+                send_to_host=False,
+            )
+            end_profiling(profile_device)
+
+            if cpu_scheduling:
+                noise_pred = torch.from_numpy(noise_pred.to_host())
+                latents = self.scheduler.step(
+                    noise_pred, t, latents
+                ).prev_sample
+            else:
+                latents = self.scheduler.step(noise_pred, t, latents)
+
+            latent_history.append(latents)
+            step_time = (time.time() - step_start_time) * 1000
+            #  self.log += (
+            #      f"\nstep = {i} | timestep = {t} | time = {step_time:.2f}ms"
+            #  )
+            step_time_sum += step_time
+
+        if self.ondemand:  # release both modules after the loop when running on demand
+            self.unload_unet()
+            self.unload_controlnet()
+        avg_step_time = step_time_sum / max(len(total_timesteps), 1)  # avoid ZeroDivisionError on an empty schedule
+        self.log += f"\nAverage step time: {avg_step_time}ms/it"
+
+        if not return_all_latents:
+            return latents
+        all_latents = torch.cat(latent_history, dim=0)  # initial latents plus one entry per step
+        return all_latents
+
    def generate_images(
        self,
        prompts,
@@ -108,8 +228,10 @@ class StencilPipeline(StableDiffusionPipeline):
            seed = randint(uint32_min, uint32_max)
        generator = torch.manual_seed(seed)

-        # Get text embeddings from prompts
-        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
+        # Get text embeddings with weight emphasis from prompts
+        text_embeddings = self.encode_prompts_weight(
+            prompts, neg_prompts, max_length
+        )

        # guidance scale as a float32 tensor.
        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
@@ -134,11 +256,11 @@ class StencilPipeline(StableDiffusionPipeline):
            dtype=dtype,
            cpu_scheduling=cpu_scheduling,
            controlnet_hint=controlnet_hint,
-            controlnet=self.controlnet,
        )

        # Img latents -> PIL images
        all_imgs = []
+        self.load_vae()
        for i in tqdm(range(0, latents.shape[0], batch_size)):
            imgs = self.decode_latents(
                latents=latents[i : i + batch_size],
@@ -146,5 +268,7 @@ class StencilPipeline(StableDiffusionPipeline):
                cpu_scheduling=cpu_scheduling,
            )
            all_imgs.extend(imgs)
+        if self.ondemand:
+            self.unload_vae()

        return all_imgs
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_txt2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_txt2img.py
@@ -1,5 +1,4 @@
 import torch
-from tqdm.auto import tqdm
 import numpy as np
 from random import randint
 from transformers import CLIPTokenizer
@@ -19,15 +18,12 @@ from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
 from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
    StableDiffusionPipeline,
 )
+from apps.stable_diffusion.src.models import SharkifyStableDiffusionModel


 class Text2ImagePipeline(StableDiffusionPipeline):
    def __init__(
        self,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
        scheduler: Union[
            DDIMScheduler,
            PNDMScheduler,
@@ -39,8 +35,12 @@ class Text2ImagePipeline(StableDiffusionPipeline):
            SharkEulerDiscreteScheduler,
            DEISMultistepScheduler,
        ],
+        sd_model: SharkifyStableDiffusionModel,
+        import_mlir: bool,
+        use_lora: str,
+        ondemand: bool,
    ):
-        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
+        super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)

    def prepare_latents(
        self,
@@ -110,8 +110,10 @@ class Text2ImagePipeline(StableDiffusionPipeline):
            dtype=dtype,
        )

-        # Get text embeddings from prompts
-        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
+        # Get text embeddings with weight emphasis from prompts
+        text_embeddings = self.encode_prompts_weight(
+            prompts, neg_prompts, max_length
+        )

        # guidance scale as a float32 tensor.
        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
@@ -128,12 +130,15 @@ class Text2ImagePipeline(StableDiffusionPipeline):

        # Img latents -> PIL images
        all_imgs = []
-        for i in tqdm(range(0, latents.shape[0], batch_size)):
+        self.load_vae()
+        for i in range(0, latents.shape[0], batch_size):
            imgs = self.decode_latents(
                latents=latents[i : i + batch_size],
                use_base_vae=use_base_vae,
                cpu_scheduling=cpu_scheduling,
            )
            all_imgs.extend(imgs)
+        if self.ondemand:
+            self.unload_vae()

        return all_imgs
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_upscaler.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_upscaler.py
@@ -27,6 +27,7 @@ from apps.stable_diffusion.src.utils import (
    end_profiling,
 )
 from PIL import Image
+from apps.stable_diffusion.src.models import SharkifyStableDiffusionModel


 def preprocess(image):
@@ -55,10 +56,6 @@ def preprocess(image):
 class UpscalerPipeline(StableDiffusionPipeline):
    def __init__(
        self,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
        scheduler: Union[
            DDIMScheduler,
            PNDMScheduler,
@@ -80,8 +77,12 @@ class UpscalerPipeline(StableDiffusionPipeline):
            SharkEulerDiscreteScheduler,
            DEISMultistepScheduler,
        ],
+        sd_model: SharkifyStableDiffusionModel,
+        import_mlir: bool,
+        use_lora: str,
+        ondemand: bool,
    ):
-        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
+        super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
        self.low_res_scheduler = low_res_scheduler

    def prepare_extra_step_kwargs(self, generator, eta):
@@ -163,6 +164,7 @@ class UpscalerPipeline(StableDiffusionPipeline):
        latent_history = [latents]
        text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
        text_embeddings_numpy = text_embeddings.detach().numpy()
+        self.load_unet()
        for i, t in tqdm(enumerate(total_timesteps)):
            step_start_time = time.time()
            latent_model_input = torch.cat([latents] * 2)
@@ -208,6 +210,8 @@ class UpscalerPipeline(StableDiffusionPipeline):
            #  )
            step_time_sum += step_time

+        if self.ondemand:
+            self.unload_unet()
        avg_step_time = step_time_sum / len(total_timesteps)
        self.log += f"\nAverage step time: {avg_step_time}ms/it"

@@ -251,8 +255,10 @@ class UpscalerPipeline(StableDiffusionPipeline):
            seed = randint(uint32_min, uint32_max)
        generator = torch.manual_seed(seed)

-        # Get text embeddings from prompts
-        text_embeddings = self.encode_prompts(prompts, neg_prompts, max_length)
+        # Get text embeddings with weight emphasis from prompts
+        text_embeddings = self.encode_prompts_weight(
+            prompts, neg_prompts, max_length
+        )

        # 4. Preprocess image
        image = preprocess(image).to(dtype)
@@ -299,6 +305,7 @@ class UpscalerPipeline(StableDiffusionPipeline):

        # Img latents -> PIL images
        all_imgs = []
+        self.load_vae()
        for i in tqdm(range(0, latents.shape[0], batch_size)):
            imgs = self.decode_latents(
                latents=latents[i : i + batch_size],
@@ -306,5 +313,7 @@ class UpscalerPipeline(StableDiffusionPipeline):
                cpu_scheduling=cpu_scheduling,
            )
            all_imgs.extend(imgs)
+        if self.ondemand:
+            self.unload_vae()

        return all_imgs
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
@@ -20,7 +20,6 @@ from shark.shark_inference import SharkInference
 from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
 from apps.stable_diffusion.src.models import (
    SharkifyStableDiffusionModel,
-    get_vae_encode,
    get_vae,
    get_clip,
    get_unet,
@@ -30,6 +29,7 @@ from apps.stable_diffusion.src.utils import (
    start_profiling,
    end_profiling,
 )
+import sys

 SD_STATE_IDLE = "idle"
 SD_STATE_CANCEL = "cancel"
@@ -38,10 +38,6 @@ SD_STATE_CANCEL = "cancel"
 class StableDiffusionPipeline:
    def __init__(
        self,
-        vae: SharkInference,
-        text_encoder: SharkInference,
-        tokenizer: CLIPTokenizer,
-        unet: SharkInference,
        scheduler: Union[
            DDIMScheduler,
            PNDMScheduler,
@@ -53,15 +49,85 @@ class StableDiffusionPipeline:
            SharkEulerDiscreteScheduler,
            DEISMultistepScheduler,
        ],
+        sd_model: SharkifyStableDiffusionModel,
+        import_mlir: bool,
+        use_lora: str,
+        ondemand: bool,
    ):
-        self.vae = vae
-        self.text_encoder = text_encoder
-        self.tokenizer = tokenizer
-        self.unet = unet
+        self.vae = None
+        self.text_encoder = None
+        self.unet = None
+        self.model_max_length = 77
        self.scheduler = scheduler
        # TODO: Implement using logging python utility.
        self.log = ""
        self.status = SD_STATE_IDLE
+        self.sd_model = sd_model
+        self.import_mlir = import_mlir
+        self.use_lora = use_lora
+        self.ondemand = ondemand
+        # TODO: Find a better workaround for fetching base_model_id early enough for CLIPTokenizer.
+        try:
+            self.tokenizer = get_tokenizer()
+        except:
+            self.load_unet()
+            self.unload_unet()
+            self.tokenizer = get_tokenizer()
+
+    def load_clip(self):  # lazily populate self.text_encoder; no-op when already set
+        if self.text_encoder is not None:
+            return
+
+        if self.import_mlir or self.use_lora:  # either flag forces building via sd_model
+            if not self.import_mlir:
+                print(
+                    "Warning: LoRA provided but import_mlir not specified. Importing MLIR anyways."
+                )
+            self.text_encoder = self.sd_model.clip()
+        else:
+            try:
+                self.text_encoder = get_clip()
+            except Exception:  # not bare: Ctrl-C / SystemExit must abort, not start the fallback
+                print("download pipeline failed, falling back to import_mlir")
+                self.text_encoder = self.sd_model.clip()
+
+    def unload_clip(self):
+        # Rebinding to None drops this object's reference; a separate `del` first is redundant.
+        self.text_encoder = None
+
+    def load_unet(self):  # lazily populate self.unet; no-op when already set
+        if self.unet is not None:
+            return
+
+        if self.import_mlir or self.use_lora:  # either flag forces building via sd_model
+            self.unet = self.sd_model.unet()
+        else:
+            try:
+                self.unet = get_unet()
+            except Exception:  # not bare: Ctrl-C / SystemExit must abort, not start the fallback
+                print("download pipeline failed, falling back to import_mlir")
+                self.unet = self.sd_model.unet()
+
+    def unload_unet(self):
+        # Rebinding to None drops this object's reference; a separate `del` first is redundant.
+        self.unet = None
+
+    def load_vae(self):  # lazily populate self.vae; no-op when already set
+        if self.vae is not None:
+            return
+
+        if self.import_mlir or self.use_lora:  # either flag forces building via sd_model
+            self.vae = self.sd_model.vae()
+        else:
+            try:
+                self.vae = get_vae()
+            except Exception:  # not bare: Ctrl-C / SystemExit must abort, not start the fallback
+                print("download pipeline failed, falling back to import_mlir")
+                self.vae = self.sd_model.vae()
+
+    def unload_vae(self):
+        # Rebinding to None drops this object's reference; a separate `del` first is redundant.
+        self.vae = None

    def encode_prompts(self, prompts, neg_prompts, max_length):
        # Tokenize text and get embeddings
@@ -81,12 +147,14 @@ class StableDiffusionPipeline:
            truncation=True,
            return_tensors="pt",
        )
-
        text_input = torch.cat([uncond_input.input_ids, text_input.input_ids])

+        self.load_clip()
        clip_inf_start = time.time()
        text_embeddings = self.text_encoder("forward", (text_input,))
        clip_inf_time = (time.time() - clip_inf_start) * 1000
+        if self.ondemand:
+            self.unload_clip()
        self.log += f"\nClip Inference time (ms) = {clip_inf_time:.3f}"

        return text_embeddings
@@ -115,109 +183,6 @@ class StableDiffusionPipeline:
        pil_images = [Image.fromarray(image) for image in images.numpy()]
        return pil_images

-    def produce_stencil_latents(
-        self,
-        latents,
-        text_embeddings,
-        guidance_scale,
-        total_timesteps,
-        dtype,
-        cpu_scheduling,
-        controlnet_hint=None,
-        controlnet=None,
-        controlnet_conditioning_scale: float = 1.0,
-        mask=None,
-        masked_image_latents=None,
-        return_all_latents=False,
-    ):
-        step_time_sum = 0
-        latent_history = [latents]
-        text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
-        text_embeddings_numpy = text_embeddings.detach().numpy()
-        for i, t in tqdm(enumerate(total_timesteps)):
-            step_start_time = time.time()
-            timestep = torch.tensor([t]).to(dtype)
-            latent_model_input = self.scheduler.scale_model_input(latents, t)
-            if mask is not None and masked_image_latents is not None:
-                latent_model_input = torch.cat(
-                    [
-                        torch.from_numpy(np.asarray(latent_model_input)),
-                        mask,
-                        masked_image_latents,
-                    ],
-                    dim=1,
-                ).to(dtype)
-            if cpu_scheduling:
-                latent_model_input = latent_model_input.detach().numpy()
-
-            if not torch.is_tensor(latent_model_input):
-                latent_model_input_1 = torch.from_numpy(
-                    np.asarray(latent_model_input)
-                ).to(dtype)
-            else:
-                latent_model_input_1 = latent_model_input
-            control = controlnet(
-                "forward",
-                (
-                    latent_model_input_1,
-                    timestep,
-                    text_embeddings,
-                    controlnet_hint,
-                ),
-                send_to_host=False,
-            )
-            timestep = timestep.detach().numpy()
-            # Profiling Unet.
-            profile_device = start_profiling(file_path="unet.rdc")
-            # TODO: Pass `control` as it is to Unet. Same as TODO mentioned in model_wrappers.py.
-            noise_pred = self.unet(
-                "forward",
-                (
-                    latent_model_input,
-                    timestep,
-                    text_embeddings_numpy,
-                    guidance_scale,
-                    control[0],
-                    control[1],
-                    control[2],
-                    control[3],
-                    control[4],
-                    control[5],
-                    control[6],
-                    control[7],
-                    control[8],
-                    control[9],
-                    control[10],
-                    control[11],
-                    control[12],
-                ),
-                send_to_host=False,
-            )
-            end_profiling(profile_device)
-
-            if cpu_scheduling:
-                noise_pred = torch.from_numpy(noise_pred.to_host())
-                latents = self.scheduler.step(
-                    noise_pred, t, latents
-                ).prev_sample
-            else:
-                latents = self.scheduler.step(noise_pred, t, latents)
-
-            latent_history.append(latents)
-            step_time = (time.time() - step_start_time) * 1000
-            #  self.log += (
-            #      f"\nstep = {i} | timestep = {t} | time = {step_time:.2f}ms"
-            #  )
-            step_time_sum += step_time
-
-        avg_step_time = step_time_sum / len(total_timesteps)
-        self.log += f"\nAverage step time: {avg_step_time}ms/it"
-
-        if not return_all_latents:
-            return latents
-        all_latents = torch.cat(latent_history, dim=0)
-        return all_latents
-
    def produce_img_latents(
        self,
        latents,
@@ -235,6 +200,7 @@ class StableDiffusionPipeline:
        latent_history = [latents]
        text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
        text_embeddings_numpy = text_embeddings.detach().numpy()
+        self.load_unet()
        for i, t in tqdm(enumerate(total_timesteps)):
            step_start_time = time.time()
            timestep = torch.tensor([t]).to(dtype).detach().numpy()
@@ -283,6 +249,8 @@ class StableDiffusionPipeline:
            if self.status == SD_STATE_CANCEL:
                break

+        if self.ondemand:
+            self.unload_unet()
        avg_step_time = step_time_sum / len(total_timesteps)
        self.log += f"\nAverage step time: {avg_step_time}ms/it"

@@ -316,115 +284,556 @@ class StableDiffusionPipeline:
        width: int,
        use_base_vae: bool,
        use_tuned: bool,
+        ondemand: bool,
        low_cpu_mem_usage: bool = False,
        debug: bool = False,
        use_stencil: str = None,
        use_lora: str = "",
        ddpm_scheduler: DDPMScheduler = None,
+        use_quantize=None,
    ):
+        if (
+            not import_mlir
+            and not use_lora
+            and cls.__name__ == "StencilPipeline"
+        ):
+            sys.exit("StencilPipeline not supported with SharkTank currently.")
+
        is_inpaint = cls.__name__ in [
            "InpaintPipeline",
            "OutpaintPipeline",
        ]
        is_upscaler = cls.__name__ in ["UpscalerPipeline"]
-        if import_mlir or use_lora:
-            if not import_mlir:
-                print(
-                    "Warning: LoRA provided but import_mlir not specified. Importing MLIR anyways."
-                )
-            mlir_import = SharkifyStableDiffusionModel(
-                model_id,
-                ckpt_loc,
-                custom_vae,
-                precision,
-                max_len=max_length,
-                batch_size=batch_size,
-                height=height,
-                width=width,
-                use_base_vae=use_base_vae,
-                use_tuned=use_tuned,
-                low_cpu_mem_usage=low_cpu_mem_usage,
-                debug=debug,
-                is_inpaint=is_inpaint,
-                is_upscaler=is_upscaler,
-                use_stencil=use_stencil,
-                use_lora=use_lora,
-            )
-            if cls.__name__ in [
-                "Image2ImagePipeline",
-                "InpaintPipeline",
-                "OutpaintPipeline",
-            ]:
-                clip, unet, vae, vae_encode = mlir_import()
-                return cls(
-                    vae_encode, vae, clip, get_tokenizer(), unet, scheduler
-                )
-            if cls.__name__ in ["StencilPipeline"]:
-                clip, unet, vae, controlnet = mlir_import()
-                return cls(
-                    controlnet, vae, clip, get_tokenizer(), unet, scheduler
-                )
-            if cls.__name__ in ["UpscalerPipeline"]:
-                clip, unet, vae = mlir_import()
-                return cls(
-                    vae, clip, get_tokenizer(), unet, scheduler, ddpm_scheduler
-                )

-            clip, unet, vae = mlir_import()
-            return cls(vae, clip, get_tokenizer(), unet, scheduler)
-        try:
-            if cls.__name__ in [
-                "Image2ImagePipeline",
-                "InpaintPipeline",
-                "OutpaintPipeline",
-            ]:
-                return cls(
-                    get_vae_encode(),
-                    get_vae(),
-                    get_clip(),
-                    get_tokenizer(),
-                    get_unet(),
-                    scheduler,
-                )
-            if cls.__name__ == "StencilPipeline":
-                import sys
+        sd_model = SharkifyStableDiffusionModel(
+            model_id,
+            ckpt_loc,
+            custom_vae,
+            precision,
+            max_len=max_length,
+            batch_size=batch_size,
+            height=height,
+            width=width,
+            use_base_vae=use_base_vae,
+            use_tuned=use_tuned,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            debug=debug,
+            is_inpaint=is_inpaint,
+            is_upscaler=is_upscaler,
+            use_stencil=use_stencil,
+            use_lora=use_lora,
+            use_quantize=use_quantize,
+        )

-                sys.exit(
-                    "StencilPipeline not supported with SharkTank currently."
-                )
+        if cls.__name__ in ["UpscalerPipeline"]:
            return cls(
-                get_vae(), get_clip(), get_tokenizer(), get_unet(), scheduler
+                scheduler,
+                ddpm_scheduler,
+                sd_model,
+                import_mlir,
+                use_lora,
+                ondemand,
            )
-        except:
-            print("download pipeline failed, falling back to import_mlir")
-            mlir_import = SharkifyStableDiffusionModel(
-                model_id,
-                ckpt_loc,
-                custom_vae,
-                precision,
-                max_len=max_length,
-                batch_size=batch_size,
-                height=height,
-                width=width,
-                use_base_vae=use_base_vae,
-                use_tuned=use_tuned,
-                low_cpu_mem_usage=low_cpu_mem_usage,
-                is_inpaint=is_inpaint,
-                is_upscaler=is_upscaler,
+
+        return cls(scheduler, sd_model, import_mlir, use_lora, ondemand)
+
+    # #####################################################
+    # Implements text embeddings with weights from prompts
+    # https://huggingface.co/AlanB/lpw_stable_diffusion_mod
+    # #####################################################
+    def encode_prompts_weight(
+        self,
+        prompt,
+        negative_prompt,
+        model_max_length,
+        do_classifier_free_guidance=True,
+        max_embeddings_multiples=1,
+        num_images_per_prompt=1,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+        Args:
+            prompt (`str` or `List[str]`):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            model_max_length (int):
+                SHARK: pass the max length instead of relying on pipe.tokenizer.model_max_length
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not,
+                SHARK: must be set to True as we always expect neg embeddings (defaulted to True)
+            max_embeddings_multiples (`int`, *optional*, defaults to `1`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+                SHARK: max_embeddings_multiples>1 produces a tensor shape error (defaulted to 1)
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+                SHARK: num_images_per_prompt is not used (defaulted to 1)
+        """
+
+        # SHARK: Save model_max_length, load the clip and init inference time
+        self.model_max_length = model_max_length
+        self.load_clip()
+        clip_inf_start = time.time()
+
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        if negative_prompt is None:
+            negative_prompt = [""] * batch_size
+        elif isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt] * batch_size
+        if batch_size != len(negative_prompt):
+            raise ValueError(
+                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                " the batch size of `prompt`."
            )
-            if cls.__name__ in [
-                "Image2ImagePipeline",
-                "InpaintPipeline",
-                "OutpaintPipeline",
-            ]:
-                clip, unet, vae, vae_encode = mlir_import()
-                return cls(
-                    vae_encode, vae, clip, get_tokenizer(), unet, scheduler
-                )
-            if cls.__name__ == "StencilPipeline":
-                clip, unet, vae, controlnet = mlir_import()
-                return cls(
-                    controlnet, vae, clip, get_tokenizer(), unet, scheduler
-                )
-            clip, unet, vae = mlir_import()
-            return cls(vae, clip, get_tokenizer(), unet, scheduler)
+
+        text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
+            pipe=self,
+            prompt=prompt,
+            uncond_prompt=negative_prompt
+            if do_classifier_free_guidance
+            else None,
+            max_embeddings_multiples=max_embeddings_multiples,
+        )
+        # SHARK: we are not using num_images_per_prompt
+        # bs_embed, seq_len, _ = text_embeddings.shape
+        # text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        # text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            # SHARK: we are not using num_images_per_prompt
+            # bs_embed, seq_len, _ = uncond_embeddings.shape
+            # uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            # uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # SHARK: Report clip inference time
+        clip_inf_time = (time.time() - clip_inf_start) * 1000
+        if self.ondemand:
+            self.unload_clip()
+        self.log += f"\nClip Inference time (ms) = {clip_inf_time:.3f}"
+
+        return text_embeddings.numpy()
+
+
+from typing import List, Optional, Union
+import re
+
+re_attention = re.compile(
+    r"""
+\\\(|
+\\\)|
+\\\[|
+\\]|
+\\\\|
+\\|
+\(|
+\[|
+:([+-]?[.\d]+)\)|
+\)|
+]|
+[^\\()\[\]:]+|
+:
+""",
+    re.X,
+)
+
+
+def parse_prompt_attention(text):
+    r"""
+    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
+    Accepted tokens are:
+      (abc) - increases attention to abc by a multiplier of 1.1
+      (abc:3.12) - increases attention to abc by a multiplier of 3.12
+      [abc] - decreases attention to abc by a factor of 1.1 (multiplies by 1 / 1.1)
+      \( - literal character '('
+      \[ - literal character '['
+      \) - literal character ')'
+      \] - literal character ']'
+      \\ - literal character '\'
+      anything else - just text
+    >>> parse_prompt_attention('normal text')
+    [['normal text', 1.0]]
+    >>> parse_prompt_attention('an (important) word')
+    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
+    >>> parse_prompt_attention('(unbalanced')
+    [['unbalanced', 1.1]]
+    >>> parse_prompt_attention('\(literal\]')
+    [['(literal]', 1.0]]
+    >>> parse_prompt_attention('(unnecessary)(parens)')
+    [['unnecessaryparens', 1.1]]
+    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
+    [['a ', 1.0],
+     ['house', 1.5730000000000004],
+     [' ', 1.1],
+     ['on', 1.0],
+     [' a ', 1.1],
+     ['hill', 0.55],
+     [', sun, ', 1.1],
+     ['sky', 1.4641000000000006],
+     ['.', 1.1]]
+    """
+
+    res = []
+    round_brackets = []
+    square_brackets = []
+
+    round_bracket_multiplier = 1.1
+    square_bracket_multiplier = 1 / 1.1
+
+    def multiply_range(start_position, multiplier):
+        for p in range(start_position, len(res)):
+            res[p][1] *= multiplier
+
+    for m in re_attention.finditer(text):
+        piece = m.group(0)
+        weight = m.group(1)
+
+        if piece.startswith("\\"):
+            res.append([piece[1:], 1.0])
+        elif piece == "(":
+            round_brackets.append(len(res))
+        elif piece == "[":
+            square_brackets.append(len(res))
+        elif weight is not None and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), float(weight))
+        elif piece == ")" and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), round_bracket_multiplier)
+        elif piece == "]" and len(square_brackets) > 0:
+            multiply_range(square_brackets.pop(), square_bracket_multiplier)
+        else:
+            res.append([piece, 1.0])
+
+    for pos in round_brackets:
+        multiply_range(pos, round_bracket_multiplier)
+
+    for pos in square_brackets:
+        multiply_range(pos, square_bracket_multiplier)
+
+    if len(res) == 0:
+        res = [["", 1.0]]
+
+    # merge runs of identical weights
+    i = 0
+    while i + 1 < len(res):
+        if res[i][1] == res[i + 1][1]:
+            res[i][0] += res[i + 1][0]
+            res.pop(i + 1)
+        else:
+            i += 1
+
+    return res
+
+
+def get_prompts_with_weights(
+    pipe: StableDiffusionPipeline, prompt: List[str], max_length: int
+):
+    r"""
+    Tokenize each prompt and return its token ids plus one weight per token (parallel lists).
+    No padding, starting or ending token is included; each prompt is cut to max_length tokens.
+    """
+    tokens = []
+    weights = []
+    truncated = False
+    for text in prompt:
+        texts_and_weights = parse_prompt_attention(text)
+        text_token = []
+        text_weight = []
+        for word, weight in texts_and_weights:
+            # tokenize and discard the starting and the ending token
+            token = pipe.tokenizer(word).input_ids[1:-1]
+            text_token += token
+            # copy the weight by length of token
+            text_weight += [weight] * len(token)
+            # stop if the text is too long (longer than truncation limit)
+            if len(text_token) > max_length:
+                truncated = True
+                break
+        # truncate
+        if len(text_token) > max_length:
+            truncated = True
+            text_token = text_token[:max_length]
+            text_weight = text_weight[:max_length]
+        tokens.append(text_token)
+        weights.append(text_weight)
+    if truncated:
+        print(
+            "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples"
+        )
+    return tokens, weights
+
+
+def pad_tokens_and_weights(
+    tokens,
+    weights,
+    max_length,
+    bos,
+    eos,
+    no_boseos_middle=True,
+    chunk_length=77,
+):
+    r"""
+    Pad each token list to max_length (bos first, then eos fill) and each weight list with 1.0; both input lists are updated in place and returned.
+    """
+    max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
+    weights_length = (
+        max_length
+        if no_boseos_middle
+        else max_embeddings_multiples * chunk_length
+    )
+    for i in range(len(tokens)):
+        tokens[i] = (
+            [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
+        )
+        if no_boseos_middle:
+            weights[i] = (
+                [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
+            )
+        else:
+            w = []
+            if len(weights[i]) == 0:
+                w = [1.0] * weights_length
+            else:
+                for j in range(max_embeddings_multiples):
+                    w.append(1.0)  # weight for starting token in this chunk
+                    w += weights[i][
+                        j
+                        * (chunk_length - 2) : min(
+                            len(weights[i]), (j + 1) * (chunk_length - 2)
+                        )
+                    ]
+                    w.append(1.0)  # weight for ending token in this chunk
+                w += [1.0] * (weights_length - len(w))
+            weights[i] = w[:]
+
+    return tokens, weights
+
+
+def get_unweighted_text_embeddings(
+    pipe: StableDiffusionPipeline,
+    text_input: torch.Tensor,
+    chunk_length: int,
+    no_boseos_middle: Optional[bool] = True,
+):
+    """
+    When the length of tokens is a multiple of the capacity of the text encoder,
+    it should be split into chunks and sent to the text encoder individually.
+    """
+    max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
+    if max_embeddings_multiples > 1:
+        text_embeddings = []
+        for i in range(max_embeddings_multiples):
+            # extract the i-th chunk
+            text_input_chunk = text_input[
+                :, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2
+            ].clone()
+
+            # cover the head and the tail by the starting and the ending tokens
+            text_input_chunk[:, 0] = text_input[0, 0]
+            text_input_chunk[:, -1] = text_input[0, -1]
+            # text_embedding = pipe.text_encoder(text_input_chunk)[0]
+            # SHARK: duplicate the text_input as Shark runner expects tokens and neg tokens
+            formatted_text_input_chunk = torch.cat(
+                [text_input_chunk, text_input_chunk]
+            )
+            text_embedding = pipe.text_encoder(
+                "forward", (formatted_text_input_chunk,)
+            )[0]
+
+            if no_boseos_middle:
+                if i == 0:
+                    # discard the ending token
+                    text_embedding = text_embedding[:, :-1]
+                elif i == max_embeddings_multiples - 1:
+                    # discard the starting token
+                    text_embedding = text_embedding[:, 1:]
+                else:
+                    # discard both starting and ending tokens
+                    text_embedding = text_embedding[:, 1:-1]
+
+            text_embeddings.append(text_embedding)
+        # SHARK: Convert the result to tensor
+        # text_embeddings = torch.concat(text_embeddings, axis=1)
+        text_embeddings_np = np.concatenate(np.array(text_embeddings))
+        text_embeddings = torch.from_numpy(text_embeddings_np)[None, :]
+    else:
+        # SHARK: duplicate the text_input as Shark runner expects tokens and neg tokens
+        # Convert the result to tensor
+        # text_embeddings = pipe.text_encoder(text_input)[0]
+        formatted_text_input = torch.cat([text_input, text_input])
+        text_embeddings = pipe.text_encoder(
+            "forward", (formatted_text_input,)
+        )[0]
+        text_embeddings = torch.from_numpy(text_embeddings)[None, :]
+    return text_embeddings
+
+
+def get_weighted_text_embeddings(
+    pipe: StableDiffusionPipeline,
+    prompt: Union[str, List[str]],
+    uncond_prompt: Optional[Union[str, List[str]]] = None,
+    max_embeddings_multiples: Optional[int] = 3,
+    no_boseos_middle: Optional[bool] = False,
+    skip_parsing: Optional[bool] = False,
+    skip_weighting: Optional[bool] = False,
+):
+    r"""
+    Prompts can be assigned with local weights using brackets. For example,
+    prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
+    and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
+    Also, to regularize the embedding, the weighted embedding is rescaled to preserve the original mean.
+    Args:
+        pipe (`StableDiffusionPipeline`):
+            Pipe to provide access to the tokenizer and the text encoder.
+        prompt (`str` or `List[str]`):
+            The prompt or prompts to guide the image generation.
+        uncond_prompt (`str` or `List[str]`):
+            The unconditional prompt or prompts to guide the image generation. If an unconditional prompt
+            is provided, its embeddings are returned as the second element of the result tuple.
+        max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+            The max multiple length of prompt embeddings compared to the max output length of text encoder.
+        no_boseos_middle (`bool`, *optional*, defaults to `False`):
+            If the length of text tokens is a multiple of the capacity of the text encoder, whether to drop the
+            starting and ending tokens of the chunks in the middle.
+        skip_parsing (`bool`, *optional*, defaults to `False`):
+            Skip the parsing of brackets.
+        skip_weighting (`bool`, *optional*, defaults to `False`):
+            Skip the weighting. When the parsing is skipped, it is forced True.
+    """
+    max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2
+    if isinstance(prompt, str):
+        prompt = [prompt]
+
+    if not skip_parsing:
+        prompt_tokens, prompt_weights = get_prompts_with_weights(
+            pipe, prompt, max_length - 2
+        )
+        if uncond_prompt is not None:
+            if isinstance(uncond_prompt, str):
+                uncond_prompt = [uncond_prompt]
+            uncond_tokens, uncond_weights = get_prompts_with_weights(
+                pipe, uncond_prompt, max_length - 2
+            )
+    else:
+        prompt_tokens = [
+            token[1:-1]
+            for token in pipe.tokenizer(
+                prompt, max_length=max_length, truncation=True
+            ).input_ids
+        ]
+        prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
+        if uncond_prompt is not None:
+            if isinstance(uncond_prompt, str):
+                uncond_prompt = [uncond_prompt]
+            uncond_tokens = [
+                token[1:-1]
+                for token in pipe.tokenizer(
+                    uncond_prompt, max_length=max_length, truncation=True
+                ).input_ids
+            ]
+            uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
+
+    # round up the longest length of tokens to a multiple of (model_max_length - 2)
+    max_length = max([len(token) for token in prompt_tokens])
+    if uncond_prompt is not None:
+        max_length = max(
+            max_length, max([len(token) for token in uncond_tokens])
+        )
+
+    max_embeddings_multiples = min(
+        max_embeddings_multiples,
+        (max_length - 1) // (pipe.model_max_length - 2) + 1,
+    )
+    max_embeddings_multiples = max(1, max_embeddings_multiples)
+    max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2
+
+    # pad the length of tokens and weights
+    bos = pipe.tokenizer.bos_token_id
+    eos = pipe.tokenizer.eos_token_id
+    prompt_tokens, prompt_weights = pad_tokens_and_weights(
+        prompt_tokens,
+        prompt_weights,
+        max_length,
+        bos,
+        eos,
+        no_boseos_middle=no_boseos_middle,
+        chunk_length=pipe.model_max_length,
+    )
+    # prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=pipe.device)
+    prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device="cpu")
+    if uncond_prompt is not None:
+        uncond_tokens, uncond_weights = pad_tokens_and_weights(
+            uncond_tokens,
+            uncond_weights,
+            max_length,
+            bos,
+            eos,
+            no_boseos_middle=no_boseos_middle,
+            chunk_length=pipe.model_max_length,
+        )
+        # uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=pipe.device)
+        uncond_tokens = torch.tensor(
+            uncond_tokens, dtype=torch.long, device="cpu"
+        )
+
+    # get the embeddings
+    text_embeddings = get_unweighted_text_embeddings(
+        pipe,
+        prompt_tokens,
+        pipe.model_max_length,
+        no_boseos_middle=no_boseos_middle,
+    )
+    # prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=pipe.device)
+    prompt_weights = torch.tensor(
+        prompt_weights, dtype=torch.float, device="cpu"
+    )
+    if uncond_prompt is not None:
+        uncond_embeddings = get_unweighted_text_embeddings(
+            pipe,
+            uncond_tokens,
+            pipe.model_max_length,
+            no_boseos_middle=no_boseos_middle,
+        )
+        # uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=pipe.device)
+        uncond_weights = torch.tensor(
+            uncond_weights, dtype=torch.float, device="cpu"
+        )
+
+    # assign weights to the prompts and normalize in the sense of mean
+    # TODO: should we normalize by chunk or in a whole (current implementation)?
+    if (not skip_parsing) and (not skip_weighting):
+        previous_mean = (
+            text_embeddings.float()
+            .mean(axis=[-2, -1])
+            .to(text_embeddings.dtype)
+        )
+        text_embeddings *= prompt_weights.unsqueeze(-1)
+        current_mean = (
+            text_embeddings.float()
+            .mean(axis=[-2, -1])
+            .to(text_embeddings.dtype)
+        )
+        text_embeddings *= (
+            (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
+        )
+        if uncond_prompt is not None:
+            previous_mean = (
+                uncond_embeddings.float()
+                .mean(axis=[-2, -1])
+                .to(uncond_embeddings.dtype)
+            )
+            uncond_embeddings *= uncond_weights.unsqueeze(-1)
+            current_mean = (
+                uncond_embeddings.float()
+                .mean(axis=[-2, -1])
+                .to(uncond_embeddings.dtype)
+            )
+            uncond_embeddings *= (
+                (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
+            )
+
+    if uncond_prompt is not None:
+        return text_embeddings, uncond_embeddings
+    return text_embeddings, None
--- a/apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py
+++ b/apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py
@@ -89,7 +89,7 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):

        def _import(self):
            scaling_model = ScalingModel()
-            self.scaling_model = compile_through_fx(
+            self.scaling_model, _ = compile_through_fx(
                model=scaling_model,
                inputs=(example_latent, example_sigma),
                model_name=f"euler_scale_model_input_{BATCH_SIZE}_{args.height}_{args.width}"
@@ -98,7 +98,7 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
            )

            step_model = SchedulerStepModel()
-            self.step_model = compile_through_fx(
+            self.step_model, _ = compile_through_fx(
                step_model,
                (example_output, example_sigma, example_latent, example_dt),
                model_name=f"euler_step_{BATCH_SIZE}_{args.height}_{args.width}"
--- a/apps/stable_diffusion/src/utils/init.py
+++ b/apps/stable_diffusion/src/utils/init.py
@@ -24,7 +24,7 @@ from apps.stable_diffusion.src.utils.utils import (
    get_available_devices,
    get_opt_flags,
    preprocessCKPT,
-    fetch_or_delete_vmfbs,
+    fetch_vmfb,
    fetch_and_update_base_model_id,
    get_path_to_diffusers_checkpoint,
    sanitize_seed,
@@ -34,4 +34,5 @@ from apps.stable_diffusion.src.utils.utils import (
    save_output_img,
    get_generation_text_info,
    update_lora_weight,
+    resize_stencil,
 )
--- a/apps/stable_diffusion/src/utils/resources/base_model.json
+++ b/apps/stable_diffusion/src/utils/resources/base_model.json
@@ -1,6 +1,157 @@
 {
-    "stabilityai/stable-diffusion-x4-upscaler": {
-        "unet": {
+    "clip": {
+        "token" : {
+            "shape" : [
+                "2*batch_size",
+                "max_len"
+            ],
+            "dtype":"i64"
+        }
+    },
+    "vae_encode": {
+        "image" : {
+            "shape" : [
+                "1*batch_size",3,"8*height","8*width"
+            ],
+            "dtype":"f32"
+        }
+    },
+    "vae": {
+        "vae": {
+            "latents" : {
+                "shape" : [
+                    "1*batch_size",4,"height","width"
+                ],
+                "dtype":"f32"
+            }
+        },
+        "vae_upscaler": {
+            "latents" : {
+                "shape" : [
+                    "1*batch_size",4,"8*height","8*width"
+                ],
+                "dtype":"f32"
+            }
+        }
+    },
+    "unet": {
+        "stabilityai/stable-diffusion-2-1": {
+            "latents": {
+                "shape": [
+                    "1*batch_size",
+                    4,
+                    "height",
+                    "width"
+                ],
+                "dtype": "f32"
+            },
+            "timesteps": {
+                "shape": [
+                    1
+                ],
+                "dtype": "f32"
+            },
+            "embedding": {
+                "shape": [
+                    "2*batch_size",
+                    "max_len",
+                    1024
+                ],
+                "dtype": "f32"
+            },
+            "guidance_scale": {
+                "shape": 2,
+                "dtype": "f32"
+            }
+        },
+        "CompVis/stable-diffusion-v1-4": {
+            "latents": {
+                "shape": [
+                    "1*batch_size",
+                    4,
+                    "height",
+                    "width"
+                ],
+                "dtype": "f32"
+            },
+            "timesteps": {
+                "shape": [
+                    1
+                ],
+                "dtype": "f32"
+            },
+            "embedding": {
+                "shape": [
+                    "2*batch_size",
+                    "max_len",
+                    768
+                ],
+                "dtype": "f32"
+            },
+            "guidance_scale": {
+                "shape": 2,
+                "dtype": "f32"
+            }
+        },
+        "stabilityai/stable-diffusion-2-inpainting": {
+            "latents": {
+                "shape": [
+                    "1*batch_size",
+                    9,
+                    "height",
+                    "width"
+                ],
+                "dtype": "f32"
+            },
+            "timesteps": {
+                "shape": [
+                    1
+                ],
+                "dtype": "f32"
+            },
+            "embedding": {
+                "shape": [
+                    "2*batch_size",
+                    "max_len",
+                    1024
+                ],
+                "dtype": "f32"
+            },
+            "guidance_scale": {
+                "shape": 2,
+                "dtype": "f32"
+            }
+        },
+        "runwayml/stable-diffusion-inpainting": {
+            "latents": {
+                "shape": [
+                    "1*batch_size",
+                    9,
+                    "height",
+                    "width"
+                ],
+                "dtype": "f32"
+            },
+            "timesteps": {
+                "shape": [
+                    1
+                ],
+                "dtype": "f32"
+            },
+            "embedding": {
+                "shape": [
+                    "2*batch_size",
+                    "max_len",
+                    768
+                ],
+                "dtype": "f32"
+            },
+            "guidance_scale": {
+                "shape": 2,
+                "dtype": "f32"
+            }
+        },
+        "stabilityai/stable-diffusion-x4-upscaler": {
            "latents": {
                "shape": [
                    "2*batch_size",
@@ -28,141 +179,39 @@
                "shape": [2],
                "dtype": "i64"
            }
-        },
-        "vae": {
-            "latents" : {
-                "shape" : [
-                    "1*batch_size",4,"8*height","8*width"
-                ],
-                "dtype":"f32"
-            }
-        },
-        "clip": {
-            "token" : {
-                "shape" : [
-                    "2*batch_size",
-                    "max_len"
-                ],
-                "dtype":"i64"
-            }
        }
    },
-    "stabilityai/stable-diffusion-2-1": {
-        "unet": {
-            "latents": {
-                "shape": [
-                    "1*batch_size",
-                    4,
-                    "height",
-                    "width"
-                ],
-                "dtype": "f32"
-            },
-            "timesteps": {
-                "shape": [
-                    1
-                ],
-                "dtype": "f32"
-            },
-            "embedding": {
-                "shape": [
-                    "2*batch_size",
-                    "max_len",
-                    1024
-                ],
-                "dtype": "f32"
-            },
-            "guidance_scale": {
-                "shape": 2,
-                "dtype": "f32"
-            }
+    "stencil_adaptor": {
+        "latents": {
+            "shape": [
+                "1*batch_size",
+                4,
+                "height",
+                "width"
+            ],
+            "dtype": "f32"
        },
-        "vae_encode": {
-            "image" : {
-                "shape" : [
-                    "1*batch_size",3,"8*height","8*width"
-                ],
-                "dtype":"f32"
-            }
+        "timesteps": {
+            "shape": [
+                1
+            ],
+            "dtype": "f32"
        },
-        "vae": {
-            "latents" : {
-                "shape" : [
-                    "1*batch_size",4,"height","width"
-                ],
-                "dtype":"f32"
-            }
+        "embedding": {
+            "shape": [
+                "2*batch_size",
+                "max_len",
+                768
+            ],
+            "dtype": "f32"
        },
-        "clip": {
-            "token" : {
-                "shape" : [
-                    "2*batch_size",
-                    "max_len"
-                ],
-                "dtype":"i64"
-            }
+        "controlnet_hint": {
+            "shape": [1, 3, "8*height", "8*width"],
+            "dtype": "f32"
        }
    },
-    "CompVis/stable-diffusion-v1-4": {
-        "unet": {
-            "latents": {
-                "shape": [
-                    "1*batch_size",
-                    4,
-                    "height",
-                    "width"
-                ],
-                "dtype": "f32"
-            },
-            "timesteps": {
-                "shape": [
-                    1
-                ],
-                "dtype": "f32"
-            },
-            "embedding": {
-                "shape": [
-                    "2*batch_size",
-                    "max_len",
-                    768
-                ],
-                "dtype": "f32"
-            },
-            "guidance_scale": {
-                "shape": 2,
-                "dtype": "f32"
-            }
-        },
-        "stencil_adaptor": {
-            "latents": {
-                "shape": [
-                    "1*batch_size",
-                    4,
-                    "height",
-                    "width"
-                ],
-                "dtype": "f32"
-            },
-            "timesteps": {
-                "shape": [
-                    1
-                ],
-                "dtype": "f32"
-            },
-            "embedding": {
-                "shape": [
-                    "2*batch_size",
-                    "max_len",
-                    768
-                ],
-                "dtype": "f32"
-            },
-            "controlnet_hint": {
-                "shape": [1, 3, "8*height", "8*width"],
-                "dtype": "f32"
-            }
-        },
-        "stencil_unet": {
+    "stencil_unet": {
+        "CompVis/stable-diffusion-v1-4": {
            "latents": {
                "shape": [
                    "1*batch_size",
@@ -242,143 +291,6 @@
                "shape": [2, 1280, "height/8", "width/8"],
                "dtype": "f32"
            }
-        },
-        "vae_encode": {
-            "image" : {
-                "shape" : [
-                    "1*batch_size",3,"8*height","8*width"
-                ],
-                "dtype":"f32"
-            }
-        },
-        "vae": {
-            "latents" : {
-                "shape" : [
-                    "1*batch_size",4,"height","width"
-                ],
-                "dtype":"f32"
-            }
-        },
-        "clip": {
-            "token" : {
-                "shape" : [
-                    "2*batch_size",
-                    "max_len"
-                ],
-                "dtype":"i64"
-            }
-        }
-    },
-    "stabilityai/stable-diffusion-2-inpainting": {
-        "unet": {
-            "latents": {
-                "shape": [
-                    "1*batch_size",
-                    9,
-                    "height",
-                    "width"
-                ],
-                "dtype": "f32"
-            },
-            "timesteps": {
-                "shape": [
-                    1
-                ],
-                "dtype": "f32"
-            },
-            "embedding": {
-                "shape": [
-                    "2*batch_size",
-                    "max_len",
-                    1024
-                ],
-                "dtype": "f32"
-            },
-            "guidance_scale": {
-                "shape": 2,
-                "dtype": "f32"
-            }
-        },
-        "vae_encode": {
-            "image" : {
-                "shape" : [
-                    "1*batch_size",3,"8*height","8*width"
-                ],
-                "dtype":"f32"
-            }
-        },
-        "vae": {
-            "latents" : {
-                "shape" : [
-                    "1*batch_size",4,"height","width"
-                ],
-                "dtype":"f32"
-            }
-        },
-        "clip": {
-            "token" : {
-                "shape" : [
-                    "2*batch_size",
-                    "max_len"
-                ],
-                "dtype":"i64"
-            }
-        }
-    },
-    "runwayml/stable-diffusion-inpainting": {
-        "unet": {
-            "latents": {
-                "shape": [
-                    "1*batch_size",
-                    9,
-                    "height",
-                    "width"
-                ],
-                "dtype": "f32"
-            },
-            "timesteps": {
-                "shape": [
-                    1
-                ],
-                "dtype": "f32"
-            },
-            "embedding": {
-                "shape": [
-                    "2*batch_size",
-                    "max_len",
-                    768
-                ],
-                "dtype": "f32"
-            },
-            "guidance_scale": {
-                "shape": 2,
-                "dtype": "f32"
-            }
-        },
-        "vae_encode": {
-            "image" : {
-                "shape" : [
-                    "1*batch_size",3,"8*height","8*width"
-                ],
-                "dtype":"f32"
-            }
-        },
-        "vae": {
-            "latents" : {
-                "shape" : [
-                    "1*batch_size",4,"height","width"
-                ],
-                "dtype":"f32"
-            }
-        },
-        "clip": {
-            "token" : {
-                "shape" : [
-                    "2*batch_size",
-                    "max_len"
-                ],
-                "dtype":"i64"
-            }
        }
    }
-}
+}
--- a/apps/stable_diffusion/src/utils/resources/model_db.json
+++ b/apps/stable_diffusion/src/utils/resources/model_db.json
@@ -1,85 +1,19 @@
 [
  {
-    "stablediffusion/untuned":"gs://shark_tank/sd_untuned",
-    "stablediffusion/tuned":"gs://shark_tank/sd_tuned",
-    "stablediffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
-    "anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
-    "anythingv3/tuned":"gs://shark_tank/sd_tuned",
-    "anythingv3/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
-    "analogdiffusion/untuned":"gs://shark_tank/sd_analog_diffusion",
-    "analogdiffusion/tuned":"gs://shark_tank/sd_tuned",
-    "analogdiffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
-    "openjourney/untuned":"gs://shark_tank/sd_openjourney",
-    "openjourney/tuned":"gs://shark_tank/sd_tuned",
-    "dreamlike/untuned":"gs://shark_tank/sd_dreamlike_diffusion"
+    "stablediffusion/untuned":"gs://shark_tank/nightly"
  },
  {
-    "stablediffusion/v1_4/unet/fp16/length_77/untuned":"unet_8dec_fp16",
-    "stablediffusion/v1_4/unet/fp16/length_77/tuned":"unet_8dec_fp16_tuned",
-    "stablediffusion/v1_4/unet/fp16/length_77/tuned/cuda":"unet_8dec_fp16_cuda_tuned",
-    "stablediffusion/v1_4/unet/fp32/length_77/untuned":"unet_1dec_fp32",
-    "stablediffusion/v1_4/unet/fp32/length_64/untuned":"unet_1_64_512_512_fp32_CompVis_stable_diffusion_v1_4",
-    "stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_19dec_fp16",
-    "stablediffusion/v1_4/vae/fp16/length_77/tuned":"vae_19dec_fp16_tuned",
-    "stablediffusion/v1_4/vae/fp16/length_77/tuned/cuda":"vae_19dec_fp16_cuda_tuned",
-    "stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
-    "stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1_64_512_512_fp32_CompVis_stable_diffusion_v1_4",
-    "stablediffusion/v1_4/vae/fp32/length_64/untuned":"vae_1_64_512_512_fp32_CompVis_stable_diffusion_v1_4",
-    "stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
-    "stablediffusion/v1_4/clip/fp32/length_64/untuned":"clip_1_64_512_512_fp32_CompVis_stable_diffusion_v1_4",
-    "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
-    "stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
-    "stablediffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"unet2base_8dec_fp16_cuda_tuned",
-    "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet64_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
-    "stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
-    "stablediffusion/v2_1base/unet/fp16/length_64/tuned/cuda":"unet_19dec_v2p1base_fp16_64_cuda_tuned",
-    "stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
-    "stablediffusion/v2_1base/vae/fp16/length_77/tuned":"vae2base_19dec_fp16_tuned",
-    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"vae2base_19dec_fp16_cuda_tuned",
-    "stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae2base_8dec_fp16",
-    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base":"vae2base_8dec_fp16_tuned",
-    "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base/cuda":"vae2base_8dec_fp16_cuda_tuned",
-    "stablediffusion/v2_1base/clip/fp32/length_77/untuned":"clip77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
-    "stablediffusion/v2_1base/clip/fp32/length_64/untuned":"clip64_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
-    "stablediffusion/v2_1/unet/fp16/length_77/untuned":"unet77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
-    "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
-    "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"vae2_8dec_fp16",
-    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
-    "anythingv3/v1_4/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
-    "anythingv3/v1_4/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
-    "anythingv3/v1_4/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
-    "anythingv3/v1_4/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
-    "anythingv3/v1_4/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
-    "anythingv3/v1_4/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
-    "anythingv3/v1_4/vae/fp16/length_77/tuned/cuda":"av3_vae_19dec_fp16_cuda_tuned",
-    "anythingv3/v1_4/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
-    "anythingv3/v1_4/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
-    "anythingv3/v1_4/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
-    "anythingv3/v1_4/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
-    "analogdiffusion/v1_4/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
-    "analogdiffusion/v1_4/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
-    "analogdiffusion/v1_4/unet/fp16/length_77/tuned/cuda":"ad_unet_19dec_fp16_cuda_tuned",
-    "analogdiffusion/v1_4/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
-    "analogdiffusion/v1_4/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
-    "analogdiffusion/v1_4/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
-    "analogdiffusion/v1_4/vae/fp16/length_77/tuned/cuda":"ad_vae_19dec_fp16_cuda_tuned",
-    "analogdiffusion/v1_4/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
-    "analogdiffusion/v1_4/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
-    "analogdiffusion/v1_4/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
-    "analogdiffusion/v1_4/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
-    "openjourney/v1_4/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
-    "openjourney/v1_4/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
-    "openjourney/v1_4/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
-    "openjourney/v1_4/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
-    "openjourney/v1_4/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
-    "openjourney/v1_4/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
-    "openjourney/v1_4/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
-    "dreamlike/v1_4/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
-    "dreamlike/v1_4/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
-    "dreamlike/v1_4/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
-    "dreamlike/v1_4/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
-    "dreamlike/v1_4/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
-    "dreamlike/v1_4/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
-    "dreamlike/v1_4/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
+    "stablediffusion/v1_4/unet/fp16/length_64/untuned":"unet_1_64_512_512_fp16_stable-diffusion-v1-4_vulkan",
+    "stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_1_64_512_512_fp16_stable-diffusion-v1-4_vulkan",
+    "stablediffusion/v1_4/vae/fp16/length_64/untuned":"vae_1_64_512_512_fp16_stable-diffusion-v1-4_vulkan",
+    "stablediffusion/v1_4/clip/fp32/length_64/untuned":"clip_1_64_512_512_fp16_stable-diffusion-v1-4_vulkan",
+    "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet_1_77_512_512_fp16_stable-diffusion-2-1-base_vulkan",
+    "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet_1_64_512_512_fp16_stable-diffusion-2-1-base_vulkan",
+    "stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae_1_64_512_512_fp16_stable-diffusion-2-1-base_vulkan",
+    "stablediffusion/v2_1base/clip/fp32/length_77/untuned":"clip_1_77_512_512_fp16_stable-diffusion-2-1-base_vulkan",
+    "stablediffusion/v2_1base/clip/fp32/length_64/untuned":"clip_1_64_512_512_fp16_stable-diffusion-2-1-base_vulkan",
+    "stablediffusion/v2_1/unet/fp16/length_77/untuned":"unet_1_77_512_512_fp16_stable-diffusion-2-1-base_vulkan",
+    "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae_1_64_512_512_fp16_stable-diffusion-2-1-base_vulkan",
+    "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip_1_64_512_512_fp16_stable-diffusion-2-1-base_vulkan"
  }
 ]
--- a/apps/stable_diffusion/src/utils/sd_annotation.py
+++ b/apps/stable_diffusion/src/utils/sd_annotation.py
@@ -115,7 +115,14 @@ def load_lower_configs(base_model_id=None):
            config_name = f"{args.annotation_model}_{args.precision}_{device}_{spec}.json"
    else:
        if not spec or spec in ["rdna3", "sm_80"]:
-            config_name = f"{args.annotation_model}_{version}_{args.precision}_{device}.json"
+            if (
+                version in ["v2_1", "v2_1base"]
+                and args.height == 768
+                and args.width == 768
+            ):
+                config_name = f"{args.annotation_model}_v2_1_768_{args.precision}_{device}.json"
+            else:
+                config_name = f"{args.annotation_model}_{version}_{args.precision}_{device}.json"
        else:
            config_name = f"{args.annotation_model}_{version}_{args.precision}_{device}_{spec}.json"

--- a/apps/stable_diffusion/src/utils/stable_args.py
+++ b/apps/stable_diffusion/src/utils/stable_args.py
@@ -22,6 +22,12 @@ p = argparse.ArgumentParser(
 ### Stable Diffusion Params
 ##############################################################################

+p.add_argument(
+    "-a",
+    "--app",
+    default="txt2img",
+    help="which app to use, one of: txt2img, img2img, outpaint, inpaint",
+)
 p.add_argument(
    "-p",
    "--prompts",
@@ -340,6 +346,21 @@ p.add_argument(
    help="Use standalone LoRA weight using a HF ID or a checkpoint file (~3 MB)",
 )

+p.add_argument(
+    "--use_quantize",
+    type=str,
+    default="none",
+    help="""Runs the quantized version of stable diffusion model. This is currently in experimental phase.
+            Currently, only runs the stable-diffusion-2-1-base model in int8 quantization.""",
+)
+
+p.add_argument(
+    "--ondemand",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Load and unload models for low VRAM",
+)
+
 ##############################################################################
 ### IREE - Vulkan supported flags
 ##############################################################################
@@ -488,6 +509,12 @@ p.add_argument(
    help="flag for setting server port",
 )

+p.add_argument(
+    "--api",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for enabling rest API",
+)
 ##############################################################################
 ### SD model auto-annotation flags
 ##############################################################################
@@ -512,6 +539,31 @@ p.add_argument(
    action=argparse.BooleanOptionalAction,
    help="Save annotated mlir file",
 )
+##############################################################################
+### SD model auto-tuner flags
+##############################################################################
+
+p.add_argument(
+    "--tuned_config_dir",
+    type=path_expand,
+    default="./",
+    help="Directory to save the tuned config file",
+)
+
+p.add_argument(
+    "--num_iters",
+    type=int,
+    default=400,
+    help="Number of iterations for tuning",
+)
+
+p.add_argument(
+    "--search_op",
+    type=str,
+    default="all",
+    help="Op to be optimized, options are matmul, bmm, conv and all",
+)
+

 args, unknown = p.parse_known_args()
 if args.import_debug:
--- a/apps/stable_diffusion/src/utils/utils.py
+++ b/apps/stable_diffusion/src/utils/utils.py
@@ -3,6 +3,7 @@ import gc
 import json
 import re
 from PIL import PngImagePlugin
+from PIL import Image
 from datetime import datetime as dt
 from csv import DictWriter
 from pathlib import Path
@@ -135,9 +136,10 @@ def compile_through_fx(
            device=args.device,
            mlir_dialect="tm_tensor",
        )
-        del mlir_module
-        gc.collect()
-        return _compile_module(shark_module, model_name, extra_args)
+        return (
+            _compile_module(shark_module, model_name, extra_args),
+            mlir_module,
+        )

    del mlir_module
    gc.collect()
@@ -269,8 +271,9 @@ def set_init_device_flags():

    if (
        args.precision != "fp16"
-        or args.height != 512
-        or args.width != 512
+        or args.height not in [512, 768]
+        or (args.height == 512 and args.width != 512)
+        or (args.height == 768 and args.width != 768)
        or args.batch_size != 1
        or ("vulkan" not in args.device and "cuda" not in args.device)
    ):
@@ -304,6 +307,20 @@ def set_init_device_flags():
    ]:
        args.use_tuned = False

+    elif (
+        args.height == 768
+        and args.width == 768
+        and (
+            base_model_id
+            not in [
+                "stabilityai/stable-diffusion-2-1",
+                "stabilityai/stable-diffusion-2-1-base",
+            ]
+            or "rdna3" not in args.iree_vulkan_target_triple
+        )
+    ):
+        args.use_tuned = False
+
    if args.use_tuned:
        print(f"Using tuned models for {base_model_id}/fp16/{args.device}.")
    else:
@@ -373,7 +390,7 @@ def get_available_devices():
    available_devices.extend(vulkan_devices)
    cuda_devices = get_devices_by_name("cuda")
    available_devices.extend(cuda_devices)
-    available_devices.append("cpu")
+    available_devices.append("device => cpu")
    return available_devices


@@ -588,34 +605,14 @@ def load_vmfb(vmfb_path, model, precision):
    return shark_module


-# This utility returns vmfbs of Clip, Unet, Vae and Vae_encode, in case all of them
-# are present; deletes them otherwise.
-def fetch_or_delete_vmfbs(extended_model_name, precision="fp32"):
-    vmfb_path = [
-        get_vmfb_path_name(extended_model_name[model])
-        for model in extended_model_name
-    ]
-    number_of_vmfbs = len(vmfb_path)
-    vmfb_present = [os.path.isfile(vmfb) for vmfb in vmfb_path]
-    all_vmfb_present = True
-    compiled_models = [None] * number_of_vmfbs
-
-    for i in range(number_of_vmfbs):
-        all_vmfb_present = all_vmfb_present and vmfb_present[i]
-
-    # We need to delete vmfbs only if some of the models were compiled.
-    if not all_vmfb_present:
-        for i in range(number_of_vmfbs):
-            if vmfb_present[i]:
-                os.remove(vmfb_path[i])
-                print("Deleted: ", vmfb_path[i])
-    else:
-        model_name = [model for model in extended_model_name.keys()]
-        for i in range(number_of_vmfbs):
-            compiled_models[i] = load_vmfb(
-                vmfb_path[i], model_name[i], precision
-            )
-    return compiled_models
+# Return the compiled vmfb of one SD pipeline sub-model if it is on disk.
+def fetch_vmfb(model, extended_model_name, precision="fp32"):
+    """Load the vmfb for `model`, or return None when no vmfb file exists."""
+    vmfb_path = get_vmfb_path_name(extended_model_name)
+    if not os.path.isfile(vmfb_path):
+        # No vmfb file at the expected path for this sub-model.
+        return None
+    return load_vmfb(vmfb_path, model, precision)


 # `fetch_and_update_base_model_id` is a resource utility function which
@@ -754,3 +751,46 @@ def get_generation_text_info(seeds, device):
    text_output += f"\nsize={args.height}x{args.width}, batch_count={args.batch_count}, batch_size={args.batch_size}, max_length={args.max_length}"

    return text_output
+
+
+# For stencil, the input image can be of any size but we need to ensure that
+# it conforms with our model constraints :-
+#   Both width and height should be in the range of [128, 768] and multiple of 8.
+# This utility function performs the transformation on the input image while
+# also maintaining the aspect ratio before sending it to the stencil pipeline.
+def resize_stencil(image: Image.Image):
+    """Resize `image` so both sides are multiples of 8 within [128, 768].
+
+    The aspect ratio is kept unless it is more extreme than 6:1, in which
+    case both bounds cannot hold at once and the short side is pinned to 128.
+
+    Args:
+        image: PIL image of any size.
+
+    Returns:
+        Tuple `(new_image, n_width, n_height)`: the resized image followed by
+        its final width and height.
+    """
+    lo, hi = 128, 768
+    width, height = image.size
+    aspect_ratio = width / height
+
+    # Upscale so that the shorter side reaches the lower bound.
+    if min(width, height) < lo:
+        if width <= height:
+            width, height = lo, lo / aspect_ratio
+        else:
+            width, height = lo * aspect_ratio, lo
+    # Downscale so that the LONGER side fits the upper bound; testing the
+    # shorter side here would let e.g. a 500x2000 image through unclamped.
+    if max(width, height) > hi:
+        if width <= height:
+            width, height = hi * aspect_ratio, hi
+        else:
+            width, height = hi, hi / aspect_ratio
+
+    # Clamp (only matters for extreme ratios), then floor to a multiple of 8.
+    n_width = min(hi, max(lo, int(width))) // 8 * 8
+    n_height = min(hi, max(lo, int(height))) // 8 * 8
+    new_image = image.resize((n_width, n_height))
+    return new_image, n_width, n_height
--- a/apps/stable_diffusion/web/index.py
+++ b/apps/stable_diffusion/web/index.py
@@ -1,204 +1,219 @@
 import os
 import sys
+import transformers
+from apps.stable_diffusion.src import args, clear_all
+import apps.stable_diffusion.web.utils.global_obj as global_obj

 if sys.platform == "darwin":
    os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"

-import gradio as gr
-import apps.stable_diffusion.web.utils.global_obj as global_obj
-from apps.stable_diffusion.src import args, clear_all
-from apps.stable_diffusion.web.utils.gradio_configs import (
-    clear_gradio_tmp_imgs_folder,
-)
-from apps.stable_diffusion.web.ui.utils import get_custom_model_path
-
-# Clear all gradio tmp images from the last session
-clear_gradio_tmp_imgs_folder()
-# Create the custom model folder if it doesn't already exist
-get_custom_model_path().mkdir(parents=True, exist_ok=True)
-
 if args.clear_all:
    clear_all()

+if __name__ == "__main__":
+    if args.api:
+        from apps.stable_diffusion.web.ui import txt2img_inf, img2img_api
+        from fastapi import FastAPI, APIRouter
+        import uvicorn

-def resource_path(relative_path):
-    """Get absolute path to resource, works for dev and for PyInstaller"""
-    base_path = getattr(
-        sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
+        # init global sd pipeline and config
+        global_obj._init()
+
+        app = FastAPI()
+        app.add_api_route("/sdapi/v1/txt2img", txt2img_inf, methods=["post"])
+        app.add_api_route("/sdapi/v1/img2img", img2img_api, methods=["post"])
+        app.include_router(APIRouter())
+        uvicorn.run(app, host="127.0.0.1", port=args.server_port)
+        sys.exit(0)
+
+    import gradio as gr
+    from apps.stable_diffusion.web.utils.gradio_configs import (
+        clear_gradio_tmp_imgs_folder,
    )
-    return os.path.join(base_path, relative_path)
+    from apps.stable_diffusion.web.ui.utils import get_custom_model_path

+    # Clear all gradio tmp images from the last session
+    clear_gradio_tmp_imgs_folder()
+    # Create the custom model folder if it doesn't already exist
+    dir = ["models", "vae", "lora"]
+    for root in dir:
+        get_custom_model_path(root).mkdir(parents=True, exist_ok=True)

-dark_theme = resource_path("ui/css/sd_dark_theme.css")
+    def resource_path(relative_path):
+        """Get absolute path to resource, works for dev and for PyInstaller"""
+        base_path = getattr(
+            sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
+        )
+        return os.path.join(base_path, relative_path)

-from apps.stable_diffusion.web.ui import (
-    txt2img_web,
-    txt2img_gallery,
-    txt2img_sendto_img2img,
-    txt2img_sendto_inpaint,
-    txt2img_sendto_outpaint,
-    txt2img_sendto_upscaler,
-    img2img_web,
-    img2img_gallery,
-    img2img_init_image,
-    img2img_sendto_inpaint,
-    img2img_sendto_outpaint,
-    img2img_sendto_upscaler,
-    inpaint_web,
-    inpaint_gallery,
-    inpaint_init_image,
-    inpaint_sendto_img2img,
-    inpaint_sendto_outpaint,
-    inpaint_sendto_upscaler,
-    outpaint_web,
-    outpaint_gallery,
-    outpaint_init_image,
-    outpaint_sendto_img2img,
-    outpaint_sendto_inpaint,
-    outpaint_sendto_upscaler,
-    upscaler_web,
-    upscaler_gallery,
-    upscaler_init_image,
-    upscaler_sendto_img2img,
-    upscaler_sendto_inpaint,
-    upscaler_sendto_outpaint,
-    lora_train_web,
-)
+    dark_theme = resource_path("ui/css/sd_dark_theme.css")

-# init global sd pipeline and config
-global_obj._init()
-
-
-def register_button_click(button, selectedid, inputs, outputs):
-    button.click(
-        lambda x: (
-            x[0]["name"] if len(x) != 0 else None,
-            gr.Tabs.update(selected=selectedid),
-        ),
-        inputs,
-        outputs,
-    )
-
-
-with gr.Blocks(
-    css=dark_theme, analytics_enabled=False, title="Stable Diffusion"
-) as sd_web:
-    with gr.Tabs() as tabs:
-        with gr.TabItem(label="Text-to-Image", id=0):
-            txt2img_web.render()
-        with gr.TabItem(label="Image-to-Image", id=1):
-            img2img_web.render()
-        with gr.TabItem(label="Inpainting", id=2):
-            inpaint_web.render()
-        with gr.TabItem(label="Outpainting", id=3):
-            outpaint_web.render()
-        with gr.TabItem(label="Upscaler", id=4):
-            upscaler_web.render()
-        with gr.TabItem(label="LoRA Training", id=5):
-            lora_train_web.render()
-
-    register_button_click(
+    from apps.stable_diffusion.web.ui import (
+        txt2img_web,
+        txt2img_gallery,
        txt2img_sendto_img2img,
-        1,
-        [txt2img_gallery],
-        [img2img_init_image, tabs],
-    )
-    register_button_click(
        txt2img_sendto_inpaint,
-        2,
-        [txt2img_gallery],
-        [inpaint_init_image, tabs],
-    )
-    register_button_click(
        txt2img_sendto_outpaint,
-        3,
-        [txt2img_gallery],
-        [outpaint_init_image, tabs],
-    )
-    register_button_click(
        txt2img_sendto_upscaler,
-        4,
-        [txt2img_gallery],
-        [upscaler_init_image, tabs],
-    )
-    register_button_click(
+        img2img_web,
+        img2img_gallery,
+        img2img_init_image,
        img2img_sendto_inpaint,
-        2,
-        [img2img_gallery],
-        [inpaint_init_image, tabs],
-    )
-    register_button_click(
        img2img_sendto_outpaint,
-        3,
-        [img2img_gallery],
-        [outpaint_init_image, tabs],
-    )
-    register_button_click(
        img2img_sendto_upscaler,
-        4,
-        [img2img_gallery],
-        [upscaler_init_image, tabs],
-    )
-    register_button_click(
+        inpaint_web,
+        inpaint_gallery,
+        inpaint_init_image,
        inpaint_sendto_img2img,
-        1,
-        [inpaint_gallery],
-        [img2img_init_image, tabs],
-    )
-    register_button_click(
        inpaint_sendto_outpaint,
-        3,
-        [inpaint_gallery],
-        [outpaint_init_image, tabs],
-    )
-    register_button_click(
        inpaint_sendto_upscaler,
-        4,
-        [inpaint_gallery],
-        [upscaler_init_image, tabs],
-    )
-    register_button_click(
+        outpaint_web,
+        outpaint_gallery,
+        outpaint_init_image,
        outpaint_sendto_img2img,
-        1,
-        [outpaint_gallery],
-        [img2img_init_image, tabs],
-    )
-    register_button_click(
        outpaint_sendto_inpaint,
-        2,
-        [outpaint_gallery],
-        [inpaint_init_image, tabs],
-    )
-    register_button_click(
        outpaint_sendto_upscaler,
-        4,
-        [outpaint_gallery],
-        [upscaler_init_image, tabs],
-    )
-    register_button_click(
+        upscaler_web,
+        upscaler_gallery,
+        upscaler_init_image,
        upscaler_sendto_img2img,
-        1,
-        [upscaler_gallery],
-        [img2img_init_image, tabs],
-    )
-    register_button_click(
        upscaler_sendto_inpaint,
-        2,
-        [upscaler_gallery],
-        [inpaint_init_image, tabs],
-    )
-    register_button_click(
        upscaler_sendto_outpaint,
-        3,
-        [upscaler_gallery],
-        [outpaint_init_image, tabs],
+        lora_train_web,
    )

+    # init global sd pipeline and config
+    global_obj._init()

-sd_web.queue()
-sd_web.launch(
-    share=args.share,
-    inbrowser=True,
-    server_name="0.0.0.0",
-    server_port=args.server_port,
-)
+    def register_button_click(button, selectedid, inputs, outputs):
+        """Hook a "send to" button up to a tab switch.
+
+        On click, the first entry of the gallery payload (if any) is
+        forwarded to ``outputs`` together with a ``gr.Tabs`` update that
+        selects the tab whose id is ``selectedid``.
+        """
+
+        def _forward_first_image(gallery_items):
+            # Entries are indexed as mappings carrying a "name" key; an
+            # empty payload forwards None instead of an image.
+            first_image = None
+            if len(gallery_items) != 0:
+                first_image = gallery_items[0]["name"]
+            return first_image, gr.Tabs.update(selected=selectedid)
+
+        button.click(_forward_first_image, inputs, outputs)
+
+    # Assemble the combined app: one tab per mode, a hidden experimental tab
+    # group, and the "send to" buttons that move a gallery image between tabs.
+    with gr.Blocks(
+        css=dark_theme, analytics_enabled=False, title="Stable Diffusion"
+    ) as sd_web:
+        with gr.Tabs() as tabs:
+            # (tab label, tab id, pre-built Blocks to render inside the tab)
+            main_tabs = [
+                ("Text-to-Image", 0, txt2img_web),
+                ("Image-to-Image", 1, img2img_web),
+                ("Inpainting", 2, inpaint_web),
+                ("Outpainting", 3, outpaint_web),
+                ("Upscaler", 4, upscaler_web),
+            ]
+            for tab_label, tab_id, tab_ui in main_tabs:
+                with gr.TabItem(label=tab_label, id=tab_id):
+                    tab_ui.render()
+
+        with gr.Tabs(visible=False) as experimental_tabs:
+            with gr.TabItem(label="LoRA Training", id=5):
+                lora_train_web.render()
+
+        # (button, destination tab id, source gallery, destination init image)
+        # The destination tab ids match the TabItem ids declared above.
+        sendto_routes = [
+            (txt2img_sendto_img2img, 1, txt2img_gallery, img2img_init_image),
+            (txt2img_sendto_inpaint, 2, txt2img_gallery, inpaint_init_image),
+            (txt2img_sendto_outpaint, 3, txt2img_gallery, outpaint_init_image),
+            (txt2img_sendto_upscaler, 4, txt2img_gallery, upscaler_init_image),
+            (img2img_sendto_inpaint, 2, img2img_gallery, inpaint_init_image),
+            (img2img_sendto_outpaint, 3, img2img_gallery, outpaint_init_image),
+            (img2img_sendto_upscaler, 4, img2img_gallery, upscaler_init_image),
+            (inpaint_sendto_img2img, 1, inpaint_gallery, img2img_init_image),
+            (inpaint_sendto_outpaint, 3, inpaint_gallery, outpaint_init_image),
+            (inpaint_sendto_upscaler, 4, inpaint_gallery, upscaler_init_image),
+            (outpaint_sendto_img2img, 1, outpaint_gallery, img2img_init_image),
+            (outpaint_sendto_inpaint, 2, outpaint_gallery, inpaint_init_image),
+            (outpaint_sendto_upscaler, 4, outpaint_gallery, upscaler_init_image),
+            (upscaler_sendto_img2img, 1, upscaler_gallery, img2img_init_image),
+            (upscaler_sendto_inpaint, 2, upscaler_gallery, inpaint_init_image),
+            (upscaler_sendto_outpaint, 3, upscaler_gallery, outpaint_init_image),
+        ]
+        for sendto_button, target_tab, gallery, target_image in sendto_routes:
+            register_button_click(
+                sendto_button,
+                target_tab,
+                [gallery],
+                [target_image, tabs],
+            )
+    sd_web.queue()
+    launch_options = dict(
+        share=args.share,
+        inbrowser=True,
+        server_name="0.0.0.0",
+        server_port=args.server_port,
+    )
+    sd_web.launch(**launch_options)
--- a/apps/stable_diffusion/web/ui/init.py
+++ b/apps/stable_diffusion/web/ui/init.py
@@ -1,4 +1,5 @@
 from apps.stable_diffusion.web.ui.txt2img_ui import (
+    txt2img_inf,
    txt2img_web,
    txt2img_gallery,
    txt2img_sendto_img2img,
@@ -7,6 +8,8 @@ from apps.stable_diffusion.web.ui.txt2img_ui import (
    txt2img_sendto_upscaler,
 )
 from apps.stable_diffusion.web.ui.img2img_ui import (
+    img2img_api,
+    img2img_inf,
    img2img_web,
    img2img_gallery,
    img2img_init_image,
--- a/apps/stable_diffusion/web/ui/img2img_ui.py
+++ b/apps/stable_diffusion/web/ui/img2img_ui.py
@@ -1,8 +1,13 @@
 from pathlib import Path
 import os
+import torch
+import time
+import sys
 import gradio as gr
 from PIL import Image
-from apps.stable_diffusion.scripts import img2img_inf
+import base64
+from io import BytesIO
+from fastapi.exceptions import HTTPException
 from apps.stable_diffusion.src import args
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
@@ -13,6 +18,314 @@ from apps.stable_diffusion.web.ui.utils import (
    predefined_models,
    cancel_sd,
 )
+from apps.stable_diffusion.src import (
+    args,
+    Image2ImagePipeline,
+    StencilPipeline,
+    resize_stencil,
+    get_schedulers,
+    set_init_device_flags,
+    utils,
+    clear_all,
+    save_output_img,
+)
+from apps.stable_diffusion.src.utils import get_generation_text_info
+
+
+# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
+init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
+init_use_tuned = args.use_tuned
+init_import_mlir = args.import_mlir
+
+
+# Exposed to UI.
+def img2img_inf(
+    prompt: str,
+    negative_prompt: str,
+    init_image,
+    height: int,
+    width: int,
+    steps: int,
+    strength: float,
+    guidance_scale: float,
+    seed: int,
+    batch_count: int,
+    batch_size: int,
+    scheduler: str,
+    custom_model: str,
+    hf_model_id: str,
+    precision: str,
+    device: str,
+    max_length: int,
+    use_stencil: str,
+    save_metadata_to_json: bool,
+    save_metadata_to_png: bool,
+    lora_weights: str,
+    lora_hf_id: str,
+    ondemand: bool,
+):
+    """Run image-to-image generation for the web UI / REST API.
+
+    The request values are copied onto the shared ``args`` namespace, the
+    pipeline cached in ``global_obj`` is rebuilt when required, and
+    ``batch_count`` generation passes are executed.
+
+    Returns:
+        ``(generated_imgs, text_output)`` on success, or ``(None, message)``
+        when the request is rejected (no initial image, no model selected,
+        or a scheduler this mode does not support).
+
+    Note:
+        ``args`` is mutated before validation completes, so a rejected
+        request may still leave some of its values on ``args``.
+    """
+    from apps.stable_diffusion.web.ui.utils import (
+        get_custom_model_pathfile,
+        get_custom_vae_or_lora_weights,
+        Config,
+    )
+    import apps.stable_diffusion.web.utils.global_obj as global_obj
+    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
+        SD_STATE_CANCEL,
+    )
+
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.guidance_scale = guidance_scale
+    args.seed = seed
+    args.steps = steps
+    args.strength = strength
+    args.scheduler = scheduler
+    args.img_path = "not none"
+    args.ondemand = ondemand
+    if ondemand and batch_count > 1:
+        print("Low VRAM mode currently only supports 1 batch count.")
+        batch_count = 1
+
+    if init_image is None:
+        return None, "An Initial Image is required"
+    image = init_image.convert("RGB")
+
+    # set ckpt_loc and hf_model_id.
+    args.ckpt_loc = ""
+    args.hf_model_id = ""
+    if custom_model == "None":
+        if not hf_model_id:
+            return (
+                None,
+                "Please provide either custom model or huggingface model ID, both must not be empty",
+            )
+        args.hf_model_id = hf_model_id
+    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
+        args.ckpt_loc = get_custom_model_pathfile(custom_model)
+    else:
+        args.hf_model_id = custom_model
+
+    args.use_lora = get_custom_vae_or_lora_weights(
+        lora_weights, lora_hf_id, "lora"
+    )
+
+    args.save_metadata_to_json = save_metadata_to_json
+    args.write_metadata_to_png = save_metadata_to_png
+
+    use_stencil = None if use_stencil == "None" else use_stencil
+    args.use_stencil = use_stencil
+    if use_stencil is not None:
+        # Stencil mode overrides the requested scheduler and model, and takes
+        # its working size from the resized image.
+        args.scheduler = "DDIM"
+        args.hf_model_id = "runwayml/stable-diffusion-v1-5"
+        image, width, height = resize_stencil(image)
+    elif args.scheduler != "PNDM":
+        if "Shark" in args.scheduler:
+            print(
+                "SharkEulerDiscrete scheduler not supported. Switching to PNDM scheduler"
+            )
+            args.scheduler = "PNDM"
+        else:
+            # Report the problem to the caller like the other validation
+            # failures; exiting the interpreter from a request handler would
+            # take the whole web server down with it.
+            return (
+                None,
+                "Img2Img works best with PNDM scheduler. Other schedulers are not supported yet.",
+            )
+    cpu_scheduling = not args.scheduler.startswith("Shark")
+    args.precision = precision
+    dtype = torch.float32 if precision == "fp32" else torch.half
+    new_config_obj = Config(
+        "img2img",
+        args.hf_model_id,
+        args.ckpt_loc,
+        precision,
+        batch_size,
+        max_length,
+        height,
+        width,
+        device,
+        use_lora=args.use_lora,
+        use_stencil=use_stencil,
+        ondemand=ondemand,
+    )
+    # Rebuild the pipeline in low-VRAM mode, when nothing is cached yet, or
+    # when the cached configuration differs from this request.
+    if (
+        args.ondemand
+        or not global_obj.get_sd_obj()
+        or global_obj.get_cfg_obj() != new_config_obj
+    ):
+        global_obj.clear_cache()
+        global_obj.set_cfg_obj(new_config_obj)
+        args.batch_count = batch_count
+        args.batch_size = batch_size
+        args.max_length = max_length
+        args.height = height
+        args.width = width
+        # The device string is split on "=>" and the part after it is used.
+        args.device = device.split("=>", 1)[1].strip()
+        # Restore the values captured at import time before the device flags
+        # are initialised again.
+        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
+        args.use_tuned = init_use_tuned
+        args.import_mlir = init_import_mlir
+        set_init_device_flags()
+        model_id = (
+            args.hf_model_id
+            if args.hf_model_id
+            else "stabilityai/stable-diffusion-2-1-base"
+        )
+        global_obj.set_schedulers(get_schedulers(model_id))
+        scheduler_obj = global_obj.get_scheduler(args.scheduler)
+
+        if use_stencil is not None:
+            args.use_tuned = False
+            global_obj.set_sd_obj(
+                StencilPipeline.from_pretrained(
+                    scheduler_obj,
+                    args.import_mlir,
+                    args.hf_model_id,
+                    args.ckpt_loc,
+                    args.custom_vae,
+                    args.precision,
+                    args.max_length,
+                    args.batch_size,
+                    args.height,
+                    args.width,
+                    args.use_base_vae,
+                    args.use_tuned,
+                    low_cpu_mem_usage=args.low_cpu_mem_usage,
+                    use_stencil=use_stencil,
+                    debug=args.import_debug if args.import_mlir else False,
+                    use_lora=args.use_lora,
+                    ondemand=args.ondemand,
+                )
+            )
+        else:
+            global_obj.set_sd_obj(
+                Image2ImagePipeline.from_pretrained(
+                    scheduler_obj,
+                    args.import_mlir,
+                    args.hf_model_id,
+                    args.ckpt_loc,
+                    args.custom_vae,
+                    args.precision,
+                    args.max_length,
+                    args.batch_size,
+                    args.height,
+                    args.width,
+                    args.use_base_vae,
+                    args.use_tuned,
+                    low_cpu_mem_usage=args.low_cpu_mem_usage,
+                    debug=args.import_debug if args.import_mlir else False,
+                    use_lora=args.use_lora,
+                    ondemand=args.ondemand,
+                )
+            )
+
+    global_obj.set_sd_scheduler(args.scheduler)
+
+    start_time = time.time()
+    global_obj.get_sd_obj().log = ""
+    generated_imgs = []
+    seeds = []
+    img_seed = utils.sanitize_seed(seed)
+    extra_info = {"STRENGTH": strength}
+    text_output = ""
+    for current_batch in range(batch_count):
+        # Only the first batch uses the requested seed; later batches ask
+        # sanitize_seed for a seed using -1.
+        if current_batch > 0:
+            img_seed = utils.sanitize_seed(-1)
+        out_imgs = global_obj.get_sd_obj().generate_images(
+            prompt,
+            negative_prompt,
+            image,
+            batch_size,
+            height,
+            width,
+            steps,
+            strength,
+            guidance_scale,
+            img_seed,
+            args.max_length,
+            dtype,
+            args.use_base_vae,
+            cpu_scheduling,
+            use_stencil=use_stencil,
+        )
+        seeds.append(img_seed)
+        total_time = time.time() - start_time
+        text_output = get_generation_text_info(seeds, device)
+        text_output += "\n" + global_obj.get_sd_obj().log
+        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
+
+        if global_obj.get_sd_status() == SD_STATE_CANCEL:
+            break
+        else:
+            # Only the first image of the batch is written out; every image
+            # of the batch is returned.
+            save_output_img(out_imgs[0], img_seed, extra_info)
+            generated_imgs.extend(out_imgs)
+            #  yield generated_imgs, text_output
+
+    return generated_imgs, text_output
+
+
+def decode_base64_to_image(encoding):
+    """Decode a base64 string (optionally a ``data:image/`` URI) to an image.
+
+    Args:
+        encoding: base64 text, either bare or prefixed with a
+            ``data:image/...;base64,`` style header.
+
+    Returns:
+        The object returned by ``Image.open`` for the decoded bytes.
+
+    Raises:
+        HTTPException: status 500 with detail "Invalid encoded image" when
+            the input cannot be parsed or decoded.
+    """
+    try:
+        # Header stripping happens inside the try so that a malformed data
+        # URI (missing ";" or ",") or a non-string input is reported the
+        # same way as undecodable image data.
+        if encoding.startswith("data:image/"):
+            encoding = encoding.split(";", 1)[1].split(",", 1)[1]
+        image = Image.open(BytesIO(base64.b64decode(encoding)))
+        return image
+    except Exception as err:
+        print(err)
+        raise HTTPException(
+            status_code=500, detail="Invalid encoded image"
+        ) from err
+
+
+def encode_pil_to_base64(images):
+    """Serialize each image and return the base64-encoded bytes as a list.
+
+    The container format comes from ``args.output_img_format`` (matched
+    case-insensitively): "png" saves as PNG, "jpg"/"jpeg" save as JPEG.
+    Any other value raises ``HTTPException`` (status 500) as soon as the
+    first image is processed.
+    """
+    save_formats = {"png": "PNG", "jpg": "JPEG", "jpeg": "JPEG"}
+    payloads = []
+    for picture in images:
+        with BytesIO() as buffer:
+            # Looked up per image, so an empty input never raises.
+            target_format = save_formats.get(args.output_img_format.lower())
+            if target_format is None:
+                raise HTTPException(
+                    status_code=500, detail="Invalid image format"
+                )
+            picture.save(buffer, format=target_format)
+            payloads.append(base64.b64encode(buffer.getvalue()))
+    return payloads
+
+
+# Img2Img Rest API.
+def img2img_api(
+    InputData: dict,
+):
+    """REST entry point: run ``img2img_inf`` from a request dictionary.
+
+    Reads "init_images" (first element, base64), "prompt",
+    "negative_prompt", "height", "width", "steps", "denoising_strength",
+    "cfg_scale" and "seed" from ``InputData``; every other generation
+    setting is fixed below.
+
+    Returns a dict with "images" (base64-encoded results), an empty
+    "parameters" dict, and "info" (the text output of the run).
+    """
+    print(InputData)
+    source_image = decode_base64_to_image(InputData["init_images"][0])
+
+    # Pull the request-controlled values out in the order they are passed on.
+    req_prompt = InputData["prompt"]
+    req_negative_prompt = InputData["negative_prompt"]
+    req_height = InputData["height"]
+    req_width = InputData["width"]
+    req_steps = InputData["steps"]
+    req_strength = InputData["denoising_strength"]
+    req_cfg_scale = InputData["cfg_scale"]
+    req_seed = InputData["seed"]
+
+    result = img2img_inf(
+        prompt=req_prompt,
+        negative_prompt=req_negative_prompt,
+        init_image=source_image,
+        height=req_height,
+        width=req_width,
+        steps=req_steps,
+        strength=req_strength,
+        guidance_scale=req_cfg_scale,
+        seed=req_seed,
+        # Settings the request cannot override.
+        batch_count=1,
+        batch_size=1,
+        scheduler="PNDM",
+        custom_model="None",
+        hf_model_id="stabilityai/stable-diffusion-2-1-base",
+        precision="fp16",
+        device=available_devices[0],
+        max_length=64,
+        use_stencil="None",
+        save_metadata_to_json=False,
+        save_metadata_to_png=False,
+        lora_weights="None",
+        lora_hf_id="",
+        ondemand=False,
+    )
+
+    response = {
+        "images": encode_pil_to_base64(result[0]),
+        "parameters": {},
+        "info": result[1],
+    }
+    return response


 with gr.Blocks(title="Image-to-Image") as img2img_web:
@@ -144,6 +457,11 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                            step=0.01,
                            label="Denoising Strength",
                        )
+                        ondemand = gr.Checkbox(
+                            value=args.ondemand,
+                            label="Low VRAM",
+                            interactive=True,
+                        )
                    with gr.Row():
                        with gr.Column(scale=3):
                            guidance_scale = gr.Slider(
@@ -247,6 +565,7 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                save_metadata_to_png,
                lora_weights,
                lora_hf_id,
+                ondemand,
            ],
            outputs=[img2img_gallery, std_output],
            show_progress=args.progress_bar,
--- a/apps/stable_diffusion/web/ui/inpaint_ui.py
+++ b/apps/stable_diffusion/web/ui/inpaint_ui.py
@@ -146,6 +146,11 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
                        steps = gr.Slider(
                            1, 100, value=args.steps, step=1, label="Steps"
                        )
+                        ondemand = gr.Checkbox(
+                            value=args.ondemand,
+                            label="Low VRAM",
+                            interactive=True,
+                        )
                    with gr.Row():
                        with gr.Column(scale=3):
                            guidance_scale = gr.Slider(
@@ -249,6 +254,7 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
                save_metadata_to_png,
                lora_weights,
                lora_hf_id,
+                ondemand,
            ],
            outputs=[inpaint_gallery, std_output],
            show_progress=args.progress_bar,
--- a/apps/stable_diffusion/web/ui/outpaint_ui.py
+++ b/apps/stable_diffusion/web/ui/outpaint_ui.py
@@ -165,6 +165,11 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
                        steps = gr.Slider(
                            1, 100, value=20, step=1, label="Steps"
                        )
+                        ondemand = gr.Checkbox(
+                            value=args.ondemand,
+                            label="Low VRAM",
+                            interactive=True,
+                        )
                    with gr.Row():
                        with gr.Column(scale=3):
                            guidance_scale = gr.Slider(
@@ -269,6 +274,7 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
                save_metadata_to_png,
                lora_weights,
                lora_hf_id,
+                ondemand,
            ],
            outputs=[outpaint_gallery, std_output],
            show_progress=args.progress_bar,
--- a/apps/stable_diffusion/web/ui/txt2img_ui.py
+++ b/apps/stable_diffusion/web/ui/txt2img_ui.py
@@ -1,9 +1,9 @@
 from pathlib import Path
 import os
+import torch
+import time
 import gradio as gr
 from PIL import Image
-from apps.stable_diffusion.scripts import txt2img_inf
-from apps.stable_diffusion.src import prompt_examples, args
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
    nodlogo_loc,
@@ -13,6 +13,191 @@ from apps.stable_diffusion.web.ui.utils import (
    predefined_models,
    cancel_sd,
 )
+from apps.stable_diffusion.web.utils.png_metadata import import_png_metadata
+from apps.stable_diffusion.src import (
+    args,
+    Text2ImagePipeline,
+    get_schedulers,
+    set_init_device_flags,
+    utils,
+    save_output_img,
+    prompt_examples,
+)
+from apps.stable_diffusion.src.utils import get_generation_text_info
+
+# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
+init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
+init_use_tuned = args.use_tuned
+init_import_mlir = args.import_mlir
+
+
+def txt2img_inf(
+    prompt: str,
+    negative_prompt: str,
+    height: int,
+    width: int,
+    steps: int,
+    guidance_scale: float,
+    seed: int,
+    batch_count: int,
+    batch_size: int,
+    scheduler: str,
+    custom_model: str,
+    hf_model_id: str,
+    precision: str,
+    device: str,
+    max_length: int,
+    save_metadata_to_json: bool,
+    save_metadata_to_png: bool,
+    lora_weights: str,
+    lora_hf_id: str,
+    ondemand: bool,
+):
+    """Run text-to-image generation, streaming results batch by batch.
+
+    This is a generator: after every completed batch it yields
+    ``(generated_imgs, text_output)`` with all images produced so far.
+    When no model is selected it yields a single ``(None, message)`` pair
+    and stops.
+
+    The request values are copied onto the shared ``args`` namespace and
+    the pipeline cached in ``global_obj`` is rebuilt when required.
+    """
+    from apps.stable_diffusion.web.ui.utils import (
+        get_custom_model_pathfile,
+        get_custom_vae_or_lora_weights,
+        Config,
+    )
+    import apps.stable_diffusion.web.utils.global_obj as global_obj
+    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
+        SD_STATE_CANCEL,
+    )
+
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.guidance_scale = guidance_scale
+    args.steps = steps
+    args.scheduler = scheduler
+    args.ondemand = ondemand
+    if ondemand and batch_count > 1:
+        print("Low VRAM mode currently only supports 1 batch count.")
+        batch_count = 1
+
+    # set ckpt_loc and hf_model_id.
+    args.ckpt_loc = ""
+    args.hf_model_id = ""
+    if custom_model == "None":
+        if not hf_model_id:
+            # Because this function is a generator, a value passed to
+            # `return` is not seen by code iterating over it; the error has
+            # to be yielded to reach the outputs.
+            yield (
+                None,
+                "Please provide either custom model or huggingface model ID, both must not be empty",
+            )
+            return
+        args.hf_model_id = hf_model_id
+    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
+        args.ckpt_loc = get_custom_model_pathfile(custom_model)
+    else:
+        args.hf_model_id = custom_model
+
+    args.save_metadata_to_json = save_metadata_to_json
+    args.write_metadata_to_png = save_metadata_to_png
+
+    args.use_lora = get_custom_vae_or_lora_weights(
+        lora_weights, lora_hf_id, "lora"
+    )
+
+    dtype = torch.float32 if precision == "fp32" else torch.half
+    cpu_scheduling = not scheduler.startswith("Shark")
+    new_config_obj = Config(
+        "txt2img",
+        args.hf_model_id,
+        args.ckpt_loc,
+        precision,
+        batch_size,
+        max_length,
+        height,
+        width,
+        device,
+        use_lora=args.use_lora,
+        use_stencil=None,
+        ondemand=ondemand,
+    )
+    # Rebuild the pipeline in low-VRAM mode, when nothing is cached yet, or
+    # when the cached configuration differs from this request.
+    if (
+        args.ondemand
+        or not global_obj.get_sd_obj()
+        or global_obj.get_cfg_obj() != new_config_obj
+    ):
+        global_obj.clear_cache()
+        global_obj.set_cfg_obj(new_config_obj)
+        args.precision = precision
+        args.batch_count = batch_count
+        args.batch_size = batch_size
+        args.max_length = max_length
+        args.height = height
+        args.width = width
+        # The device string is split on "=>" and the part after it is used.
+        args.device = device.split("=>", 1)[1].strip()
+        # Restore the values captured at import time before the device flags
+        # are initialised again.
+        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
+        args.use_tuned = init_use_tuned
+        args.import_mlir = init_import_mlir
+        args.img_path = None
+        set_init_device_flags()
+        model_id = (
+            args.hf_model_id
+            if args.hf_model_id
+            else "stabilityai/stable-diffusion-2-1-base"
+        )
+        global_obj.set_schedulers(get_schedulers(model_id))
+        scheduler_obj = global_obj.get_scheduler(scheduler)
+        global_obj.set_sd_obj(
+            Text2ImagePipeline.from_pretrained(
+                scheduler=scheduler_obj,
+                import_mlir=args.import_mlir,
+                model_id=args.hf_model_id,
+                ckpt_loc=args.ckpt_loc,
+                precision=args.precision,
+                max_length=args.max_length,
+                batch_size=args.batch_size,
+                height=args.height,
+                width=args.width,
+                use_base_vae=args.use_base_vae,
+                use_tuned=args.use_tuned,
+                custom_vae=args.custom_vae,
+                low_cpu_mem_usage=args.low_cpu_mem_usage,
+                debug=args.import_debug if args.import_mlir else False,
+                use_lora=args.use_lora,
+                ondemand=args.ondemand,
+            )
+        )
+
+    global_obj.set_sd_scheduler(scheduler)
+
+    start_time = time.time()
+    global_obj.get_sd_obj().log = ""
+    generated_imgs = []
+    seeds = []
+    img_seed = utils.sanitize_seed(seed)
+    text_output = ""
+    for i in range(batch_count):
+        # Only the first batch uses the requested seed; later batches ask
+        # sanitize_seed for a seed using -1.
+        if i > 0:
+            img_seed = utils.sanitize_seed(-1)
+        out_imgs = global_obj.get_sd_obj().generate_images(
+            prompt,
+            negative_prompt,
+            batch_size,
+            height,
+            width,
+            steps,
+            guidance_scale,
+            img_seed,
+            args.max_length,
+            dtype,
+            args.use_base_vae,
+            cpu_scheduling,
+        )
+        seeds.append(img_seed)
+        total_time = time.time() - start_time
+        text_output = get_generation_text_info(seeds, device)
+        text_output += "\n" + global_obj.get_sd_obj().log
+        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
+
+        if global_obj.get_sd_status() == SD_STATE_CANCEL:
+            break
+        else:
+            # Only the first image of the batch is written out; every image
+            # of the batch is streamed to the caller.
+            save_output_img(out_imgs[0], img_seed)
+            generated_imgs.extend(out_imgs)
+            yield generated_imgs, text_output
+
+    # Kept for callers that read the generator's return value; code that
+    # simply iterates only sees the yielded pairs above.
+    return generated_imgs, text_output
+

 with gr.Blocks(title="Text-to-Image") as txt2img_web:
    with gr.Row(elem_id="ui_title"):
@@ -106,10 +291,18 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                            )
                    with gr.Row():
                        height = gr.Slider(
-                            384, 768, value=args.height, step=8, label="Height"
+                            384,
+                            768,
+                            value=args.height,
+                            step=8,
+                            label="Height",
                        )
                        width = gr.Slider(
-                            384, 768, value=args.width, step=8, label="Width"
+                            384,
+                            768,
+                            value=args.width,
+                            step=8,
+                            label="Width",
                        )
                        precision = gr.Radio(
                            label="Precision",
@@ -140,6 +333,11 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                            step=0.1,
                            label="CFG Scale",
                        )
+                        ondemand = gr.Checkbox(
+                            value=args.ondemand,
+                            label="Low VRAM",
+                            interactive=True,
+                        )
                    with gr.Row():
                        with gr.Column(scale=3):
                            batch_count = gr.Slider(
@@ -241,6 +439,7 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                save_metadata_to_png,
                lora_weights,
                lora_hf_id,
+                ondemand,
            ],
            outputs=[txt2img_gallery, std_output],
            show_progress=args.progress_bar,
@@ -254,14 +453,20 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
            cancels=[prompt_submit, neg_prompt_submit, generate_click],
        )

-        from apps.stable_diffusion.web.utils.png_metadata import (
-            import_png_metadata,
-        )
-
        png_info_img.change(
            fn=import_png_metadata,
            inputs=[
                png_info_img,
+                prompt,
+                negative_prompt,
+                steps,
+                scheduler,
+                guidance_scale,
+                seed,
+                width,
+                height,
+                custom_model,
+                hf_model_id,
            ],
            outputs=[
                png_info_img,
--- a/apps/stable_diffusion/web/ui/upscaler_ui.py
+++ b/apps/stable_diffusion/web/ui/upscaler_ui.py
@@ -143,6 +143,11 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
                            step=1,
                            label="Noise Level",
                        )
+                        ondemand = gr.Checkbox(
+                            value=args.ondemand,
+                            label="Low VRAM",
+                            interactive=True,
+                        )
                    with gr.Row():
                        with gr.Column(scale=3):
                            guidance_scale = gr.Slider(
@@ -243,6 +248,7 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
                save_metadata_to_png,
                lora_weights,
                lora_hf_id,
+                ondemand,
            ],
            outputs=[upscaler_gallery, std_output],
            show_progress=args.progress_bar,
--- a/apps/stable_diffusion/web/ui/utils.py
+++ b/apps/stable_diffusion/web/ui/utils.py
@@ -24,6 +24,7 @@ class Config:
    device: str
    use_lora: str
    use_stencil: str
+    ondemand: str


 custom_model_filetypes = (
@@ -75,6 +76,13 @@ def resource_path(relative_path):


 def get_custom_model_path(model="models"):
+    # If `--ckpt_dir` is provided it'd override the hierarchical folder
+    # structure in WebUI :-
+    #       model
+    #         |___lora
+    #         |___vae
+    if args.ckpt_dir:
+        return Path(args.ckpt_dir)
    match model:
        case "models":
            return Path(Path.cwd(), "models")
--- a/apps/stable_diffusion/web/utils/global_obj.py
+++ b/apps/stable_diffusion/web/utils/global_obj.py
@@ -11,8 +11,10 @@ Also we could avoid memory leak when switching models by clearing the cache.
 def _init():
    global _sd_obj
    global _config_obj
+    global _schedulers
    _sd_obj = None
    _config_obj = None
+    _schedulers = None


 def set_sd_obj(value):
@@ -20,9 +22,9 @@ def set_sd_obj(value):
    _sd_obj = value


-def set_schedulers(value):
+def set_sd_scheduler(key):
    global _sd_obj
-    _sd_obj.scheduler = value
+    _sd_obj.scheduler = _schedulers[key]


 def set_sd_status(value):
@@ -35,23 +37,39 @@ def set_cfg_obj(value):
    _config_obj = value


+def set_schedulers(value):
+    # Replace the module-level scheduler collection with `value`.
+    # `get_scheduler` and `set_sd_scheduler` later index it by key.
+    global _schedulers
+    _schedulers = value
+
+
 def get_sd_obj():
+    global _sd_obj
    return _sd_obj


 def get_sd_status():
+    global _sd_obj
    return _sd_obj.status


 def get_cfg_obj():
+    global _config_obj
    return _config_obj


+def get_scheduler(key):
+    # Return the entry stored under `key` in the module-level scheduler
+    # collection. The collection is None after `_init()` / `clear_cache()`,
+    # so `set_schedulers` must have been called first or the lookup fails.
+    global _schedulers
+    return _schedulers[key]
+
+
 def clear_cache():
    global _sd_obj
    global _config_obj
+    global _schedulers
    del _sd_obj
    del _config_obj
+    del _schedulers
    gc.collect()
    _sd_obj = None
    _config_obj = None
+    _schedulers = None
--- a/apps/stable_diffusion/web/utils/png_metadata.py
+++ b/apps/stable_diffusion/web/utils/png_metadata.py
@@ -1,18 +1,5 @@
 import re
 from pathlib import Path
-from apps.stable_diffusion.web.ui.txt2img_ui import (
-    png_info_img,
-    prompt,
-    negative_prompt,
-    steps,
-    scheduler,
-    guidance_scale,
-    seed,
-    width,
-    height,
-    custom_model,
-    hf_model_id,
-)
 from apps.stable_diffusion.web.ui.utils import (
    get_custom_model_pathfile,
    scheduler_list_txt2img,
@@ -75,7 +62,19 @@ def parse_generation_parameters(x: str):
    return res


-def import_png_metadata(pil_data):
+def import_png_metadata(
+    pil_data,
+    prompt,
+    negative_prompt,
+    steps,
+    sampler,
+    cfg_scale,
+    seed,
+    width,
+    height,
+    custom_model,
+    hf_model_id,
+):
    try:
        png_info = pil_data.info["parameters"]
        metadata = parse_generation_parameters(png_info)
@@ -110,39 +109,44 @@ def import_png_metadata(pil_data):
                    % metadata["Model"]
                )

-        outputs = {
-            png_info_img: None,
-            negative_prompt: metadata["Negative prompt"],
-            steps: int(metadata["Steps"]),
-            guidance_scale: float(metadata["CFG scale"]),
-            seed: int(metadata["Seed"]),
-            width: float(metadata["Size-1"]),
-            height: float(metadata["Size-2"]),
-        }
+        negative_prompt = metadata["Negative prompt"]
+        steps = int(metadata["Steps"])
+        cfg_scale = float(metadata["CFG scale"])
+        seed = int(metadata["Seed"])
+        width = float(metadata["Size-1"])
+        height = float(metadata["Size-2"])
        if "Model" in metadata and png_custom_model:
-            outputs[custom_model] = png_custom_model
-            outputs[hf_model_id] = ""
+            custom_model = png_custom_model
+            hf_model_id = ""
        if "Model" in metadata and png_hf_model_id:
-            outputs[custom_model] = "None"
-            outputs[hf_model_id] = png_hf_model_id
+            custom_model = "None"
+            hf_model_id = png_hf_model_id
        if "Prompt" in metadata:
-            outputs[prompt] = metadata["Prompt"]
+            prompt = metadata["Prompt"]
        if "Sampler" in metadata:
            if metadata["Sampler"] in scheduler_list_txt2img:
-                outputs[scheduler] = metadata["Sampler"]
+                sampler = metadata["Sampler"]
            else:
                print(
                    "Import PNG info: Unable to find a scheduler for %s"
                    % metadata["Sampler"]
                )

-        return outputs
-
    except Exception as ex:
        if pil_data and pil_data.info.get("parameters"):
            print("import_png_metadata failed with %s" % ex)
        pass

-    return {
-        png_info_img: None,
-    }
+    return (
+        None,
+        prompt,
+        negative_prompt,
+        steps,
+        sampler,
+        cfg_scale,
+        seed,
+        width,
+        height,
+        custom_model,
+        hf_model_id,
+    )
--- a/build_tools/stable_diffusion_testing.py
+++ b/build_tools/stable_diffusion_testing.py
@@ -87,11 +87,22 @@ def test_loop(device="vulkan", beta=False, extra_flags=[]):
        "wavymulder/Analog-Diffusion",
        "dreamlike-art/dreamlike-diffusion-1.0",
    ]
+    counter = 0
    for import_opt in import_options:
        for model_name in hf_model_names:
            if model_name in to_skip:
                continue
            for use_tune in tuned_options:
+                if (
+                    model_name == "stabilityai/stable-diffusion-2-1"
+                    and use_tune == tuned_options[0]
+                ):
+                    continue
+                elif (
+                    model_name == "stabilityai/stable-diffusion-2-1-base"
+                    and use_tune == tuned_options[1]
+                ):
+                    continue
                command = (
                    [
                        executable,  # executable is the python from the venv used to run this
@@ -174,9 +185,21 @@ def test_loop(device="vulkan", beta=False, extra_flags=[]):
                else:
                    print(command)
                    print("failed to generate image for this configuration")
-                    if "2_1_base" in model_name:
-                        print("failed a known successful model.")
-                        exit(1)
+                    with open(dumpfile_name, "r+") as f:
+                        output = f.readlines()
+                        print("\n".join(output))
+                    exit(1)
+                if os.name == "nt":
+                    counter += 1
+                    if counter % 2 == 0:
+                        extra_flags.append(
+                            "--iree_vulkan_target_triple=rdna2-unknown-windows"
+                        )
+                    else:
+                        if counter != 1:
+                            extra_flags.remove(
+                                "--iree_vulkan_target_triple=rdna2-unknown-windows"
+                            )
    with open(os.path.join(os.getcwd(), "sd_testing_metrics.csv"), "w+") as f:
        header = "model_name;device;use_tune;import_opt;Clip Inference time(ms);Average Step (ms/it);VAE Inference time(ms);total image generation(s);command\n"
        f.write(header)
--- a/conftest.py
+++ b/conftest.py
@@ -2,9 +2,11 @@ def pytest_addoption(parser):
    # Attaches SHARK command-line arguments to the pytest machinery.
    parser.addoption(
        "--benchmark",
-        action="store_true",
-        default="False",
-        help="Pass option to benchmark and write results.csv",
+        action="store",
+        type=str,
+        default=None,
+        choices=("baseline", "native", "all"),
+        help="Benchmarks specified engine(s) and writes bench_results.csv.",
    )
    parser.addoption(
        "--onnx_bench",
@@ -40,7 +42,13 @@ def pytest_addoption(parser):
        "--update_tank",
        action="store_true",
        default="False",
-        help="Update local shark tank with latest artifacts.",
+        help="Update local shark tank with latest artifacts if model artifact hash mismatched.",
+    )
+    parser.addoption(
+        "--force_update_tank",
+        action="store_true",
+        default="False",
+        help="Force-update local shark tank with artifacts from specified shark_tank URL (defaults to nightly).",
    )
    parser.addoption(
        "--ci_sha",
@@ -51,15 +59,21 @@ def pytest_addoption(parser):
    parser.addoption(
        "--local_tank_cache",
        action="store",
-        default="",
+        default=None,
        help="Specify the directory in which all downloaded shark_tank artifacts will be cached.",
    )
    parser.addoption(
        "--tank_url",
        type=str,
-        default="gs://shark_tank/latest",
+        default="gs://shark_tank/nightly",
        help="URL to bucket from which to download SHARK tank artifacts. Default is gs://shark_tank/latest",
    )
+    parser.addoption(
+        "--tank_prefix",
+        type=str,
+        default=None,
+        help="Prefix to gs://shark_tank/ model directories from which to download SHARK tank artifacts. Default is nightly.",
+    )
    parser.addoption(
        "--benchmark_dispatches",
        default=None,
@@ -70,3 +84,9 @@ def pytest_addoption(parser):
        default="./temp_dispatch_benchmarks",
        help="Directory in which dispatch benchmarks are saved.",
    )
+    parser.addoption(
+        "--batchsize",
+        default=1,
+        type=int,
+        help="Batch size for the tested model.",
+    )
--- a/docs/shark_sd_blender.md
+++ b/docs/shark_sd_blender.md
@@ -0,0 +1,75 @@
+# Overview
+
+This document is intended to provide a starting point for using SHARK stable diffusion with Blender. 
+
+We currently make use of the [AI-Render Plugin](https://github.com/benrugg/AI-Render) to integrate with Blender.
+
+## Set up SHARK and prerequisites:
+
+ * Download the latest SHARK SD webui .exe from [here](https://github.com/nod-ai/SHARK/releases) or follow instructions on the [README](https://github.com/nod-ai/SHARK#readme)
+ * Once the .exe is in the directory where you would like SHARK to be installed, run it from a terminal/PowerShell with the `--api` flag:
+```
+## Run the .exe in API mode:
+.\shark_sd_<date>_<ver>.exe --api
+
+## For example:
+.\shark_sd_20230411_671.exe --api --server_port=8082
+
+## From the base directory of a source clone of SHARK:
+./setup_venv.ps1
+python apps\stable_diffusion\web\index.py --api
+
+```
+
+Your local SD server should start and look something like this:
+![image](https://user-images.githubusercontent.com/87458719/231369758-e2c3c45a-eccc-4fe5-a788-4a3bf1ace1d1.png)
+
+ * Note: When running in api mode with `--api`, the .exe will not function as a webUI. Thus, the address in the terminal output will only be useful for API requests.
+
+### Install AI Render
+
+- Get AI Render on [Blender Market](https://blendermarket.com/products/ai-render) or [Gumroad](https://airender.gumroad.com/l/ai-render)
+- Open Blender, then go to Edit > Preferences > Add-ons > Install and then find the zip file
+- We will be using the Automatic1111 SD backend for the AI-Render plugin. Follow the instructions [here](https://github.com/benrugg/AI-Render/wiki/Local-Installation) to set up the local SD backend.
+
+Your AI-Render preferences should be configured as shown; the highlighted part should match your terminal output:
+![image](https://user-images.githubusercontent.com/87458719/231390322-59a54a09-520a-4a08-b658-6e37bd63e932.png)
+
+
+The [AI-Render README](https://github.com/benrugg/AI-Render/blob/main/README.md) has more details on installation and usage, as well as video tutorials.
+
+## Using AI-Render + SHARK in your Blender project
+
+- In the Render Properties tab, in the AI-Render dropdown, enable AI-Render.
+
+![image](https://user-images.githubusercontent.com/87458719/231392843-9bd51744-3ce2-464e-843a-0c4d4c96df0c.png)
+
+- Select an image size (it's usually better to upscale later than go high on the img2img resolution here.)
+
+![image](https://user-images.githubusercontent.com/87458719/231394288-0c4ab8c5-dc30-4dbe-8bc1-7520ded5efe8.png)
+
+- From here, you can enter a prompt and configure img2img Stable Diffusion parameters, and AI-Render will run SHARK SD img2img on the rendered scene.
+- AI-Render has useful presets for aesthetic styles, so you should be able to keep your subject prompt simple and focus on creating a decent Blender scene to start from.
+
+![image](https://user-images.githubusercontent.com/87458719/231440729-2fe69586-41cb-4274-9ce7-f6c08def600b.png)
+
+## Examples:
+Scene (Input image):
+
+![blender-sample-2](https://user-images.githubusercontent.com/87458719/231450408-0e680086-3e52-4962-a5c1-c703a94d1583.png)
+
+Prompt:
+"A bowl of tangerines in front of rocks, masterpiece, oil on canvas, by Georgia O'Keefe, trending on artstation, landscape painting by Caspar David Friedrich"
+
+Negative Prompt (default):
+"ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"
+
+Example output:
+
+![blender-sample-2_out](https://user-images.githubusercontent.com/87458719/231451145-a0b56897-a7d0-4add-bbed-7e8af21a65df.png)
+
+
+
+
+
+
--- a/process_skipfiles.py
+++ b/process_skipfiles.py
@@ -6,36 +6,16 @@ from distutils.sysconfig import get_python_lib
 import fileinput
 from pathlib import Path

-# Diffusers 0.13.1 fails with transformers __init.py errros in BLIP. So remove it for now until we fork it
-pix2pix_init = Path(get_python_lib() + "/diffusers/__init__.py")
-for line in fileinput.input(pix2pix_init, inplace=True):
-    if "Pix2Pix" in line:
-        if not line.startswith("#"):
-            print(f"#{line}", end="")
-        else:
-            print(f"{line[1:]}", end="")
-    else:
-        print(line, end="")
-pix2pix_init = Path(get_python_lib() + "/diffusers/pipelines/__init__.py")
-for line in fileinput.input(pix2pix_init, inplace=True):
-    if "Pix2Pix" in line:
-        if not line.startswith("#"):
-            print(f"#{line}", end="")
-        else:
-            print(f"{line[1:]}", end="")
-    else:
-        print(line, end="")
-pix2pix_init = Path(
-    get_python_lib() + "/diffusers/pipelines/stable_diffusion/__init__.py"
+# Temporary workaround for transformers/__init__.py.
+path_to_tranformers_hook = Path(
+    get_python_lib()
+    + "/_pyinstaller_hooks_contrib/hooks/stdhooks/hook-transformers.py"
 )
-for line in fileinput.input(pix2pix_init, inplace=True):
-    if "StableDiffusionPix2PixZeroPipeline" in line:
-        if not line.startswith("#"):
-            print(f"#{line}", end="")
-        else:
-            print(f"{line[1:]}", end="")
-    else:
-        print(line, end="")
+if path_to_tranformers_hook.is_file():
+    pass
+else:
+    with open(path_to_tranformers_hook, "w") as f:
+        f.write("module_collection_mode = 'pyz+py'")

 path_to_skipfiles = Path(get_python_lib() + "/torch/_dynamo/skipfiles.py")

--- a/pytest.ini
+++ b/pytest.ini
@@ -1,3 +1,3 @@
 [pytest]
-addopts = --verbose -p no:warnings
+addopts = --verbose -s -p no:warnings
 norecursedirs = inference tank/tflite examples benchmarks shark 
--- a/requirements-importer.txt
+++ b/requirements-importer.txt
@@ -2,8 +2,8 @@
 --pre

 numpy>1.22.4
-torchvision
 pytorch-triton
+torchvision==0.16.0.dev20230322 
 tabulate

 tqdm
@@ -15,8 +15,8 @@ iree-tools-tf

 # TensorFlow and JAX.
 gin-config
-tf-nightly
-keras>=2.10
+tensorflow>2.11
+keras
 #tf-models-nightly
 #tensorflow-text-nightly
 transformers
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,7 @@ parameterized

 # Add transformers, diffusers and scipy since it most commonly used
 transformers
-diffusers @ git+https://github.com/huggingface/diffusers@main
+diffusers @ git+https://github.com/huggingface/diffusers@e47459c80f6f6a5a1c19d32c3fd74edf94f47aa2
 scipy
 ftfy
 gradio
--- a/setup_venv.ps1
+++ b/setup_venv.ps1
@@ -45,7 +45,7 @@ if ($arguments -eq "--force"){
        Remove-Item .\shark.venv -Force -Recurse
        if (Test-Path .\shark.venv\) {
            Write-Host 'could not remove .\shark-venv - please try running ".\setup_venv.ps1 --force" again!'
-            break
+            exit 1
        }
    }
 }
@@ -78,12 +78,12 @@ if (!($PyVer.length -ne 0)) {$p} # return Python --version String if py.exe is u
 if (!($PyVer -like "*3.11*") -and !($p -like "*3.11*")) # if 3.11 is not in any list
 {
    Write-Host "Please install Python 3.11 and try again"
-    break
+    exit 34
 }

 Write-Host "Installing Build Dependencies"
 # make sure we really use 3.11 from list, even if it's not the default.
-if (!($PyVer.length -ne 0)) {py -3.11 -m venv .\shark.venv\}
+if ($NULL -ne $PyVer) {py -3.11 -m venv .\shark.venv\}
 else {python -m venv .\shark.venv\}
 .\shark.venv\Scripts\activate
 python -m pip install --upgrade pip
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -129,11 +129,11 @@ if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
  TV_VERSION=${TV_VER:9:18}
  $PYTHON -m pip uninstall -y torch torchvision
  $PYTHON -m pip install -U --pre --no-warn-conflicts triton
-  $PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu117/torch-${TORCH_VERSION}%2Bcu117-cp311-cp311-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu117/torchvision-${TV_VERSION}%2Bcu117-cp311-cp311-linux_x86_64.whl
+  $PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu118/torch-${TORCH_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu118/torchvision-${TV_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl
  if [ $? -eq 0 ];then
-    echo "Successfully Installed torch + cu117."
+    echo "Successfully Installed torch + cu118."
  else
-    echo "Could not install torch + cu117." >&2
+    echo "Could not install torch + cu118." >&2
  fi
 fi

--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -70,6 +70,7 @@ def get_iree_common_args():
    return [
        "--iree-stream-resource-index-bits=64",
        "--iree-vm-target-index-bits=64",
+        "--iree-vm-bytecode-module-strip-source-map=true",
        "--iree-util-zero-fill-elided-attrs",
    ]

--- a/shark/iree_utils/gpu_utils.py
+++ b/shark/iree_utils/gpu_utils.py
@@ -30,11 +30,10 @@ def get_iree_gpu_args():
        in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86", "sm_89"]
    ) and (shark_args.enable_tf32 == True):
        return [
-            "--iree-hal-cuda-disable-loop-nounroll-wa",
            f"--iree-hal-cuda-llvm-target-arch={sm_arch}",
        ]
    else:
-        return ["--iree-hal-cuda-disable-loop-nounroll-wa"]
+        return []


 # Get the default gpu args given the architecture.
--- a/shark/iree_utils/vulkan_target_env_utils.py
+++ b/shark/iree_utils/vulkan_target_env_utils.py
@@ -131,6 +131,8 @@ def get_vendor(triple):
        return "ARM"
    if arch == "m1":
        return "Apple"
+    if arch in ["arc", "UHD"]:
+        return "Intel"
    if arch in ["turing", "ampere"]:
        return "NVIDIA"
    if arch == "ardeno":
@@ -149,7 +151,7 @@ def get_device_type(triple):
        return "Unknown"
    if arch == "cpu":
        return "CPU"
-    if arch in ["turing", "ampere"]:
+    if arch in ["turing", "ampere", "arc"]:
        return "DiscreteGPU"
    if arch in ["rdna1", "rdna2", "rdna3", "rgcn3", "rgcn5"]:
        if product == "ivega10":
@@ -343,6 +345,37 @@ def get_vulkan_target_capabilities(triple):
        cap["variablePointers"] = True
        cap["variablePointersStorageBuffer"] = True

+    elif arch == "arc":
+        cap["maxComputeSharedMemorySize"] = 32768
+        cap["maxComputeWorkGroupInvocations"] = 1024
+        cap["maxComputeWorkGroupSize"] = [1024, 1024, 64]
+
+        cap["subgroupSize"] = 32
+        cap["subgroupFeatures"] = [
+            "Basic",
+            "Vote",
+            "Arithmetic",
+            "Ballot",
+            "Shuffle",
+            "ShuffleRelative",
+            "Clustered",
+            "Quad",
+        ]
+
+        cap["shaderFloat16"] = True
+        cap["shaderFloat64"] = False
+        cap["shaderInt8"] = True
+        cap["shaderInt16"] = True
+        cap["shaderInt64"] = False
+        cap["storageBuffer16BitAccess"] = True
+        cap["storagePushConstant16"] = True
+        cap["uniformAndStorageBuffer16BitAccess"] = True
+        cap["storageBuffer8BitAccess"] = True
+        cap["storagePushConstant8"] = True
+        cap["uniformAndStorageBuffer8BitAccess"] = True
+        cap["variablePointers"] = True
+        cap["variablePointersStorageBuffer"] = True
+
    elif arch == "cpu":
        if product == "swiftshader":
            cap["maxComputeSharedMemorySize"] = 16384
--- a/shark/iree_utils/vulkan_utils.py
+++ b/shark/iree_utils/vulkan_utils.py
@@ -109,6 +109,9 @@ def get_vulkan_target_triple(device_name):
        triple = f"rdna3-7900-{system_os}"
    elif any(x in device_name for x in ("AMD", "Radeon")):
        triple = f"rdna2-unknown-{system_os}"
+    # Intel Targets
+    elif any(x in device_name for x in ("A770", "A750")):
+        triple = f"arc-770-{system_os}"
    else:
        triple = None
    return triple
@@ -140,7 +143,7 @@ def get_vulkan_triple_flag(device_name="", extra_args=[]):


 def get_iree_vulkan_args(extra_args=[]):
-    # vulkan_flag = ["--iree-flow-demote-i64-to-i32"]
+    # res_vulkan_flag = ["--iree-flow-demote-i64-to-i32"]

    res_vulkan_flag = []
    vulkan_triple_flag = None
--- a/shark/parser.py
+++ b/shark/parser.py
@@ -14,8 +14,10 @@

 import argparse
 import os
+import subprocess

 parser = argparse.ArgumentParser(description="SHARK runner.")
+
 parser.add_argument(
    "--device",
    type=str,
@@ -54,7 +56,7 @@ parser.add_argument(
 )
 parser.add_argument(
    "--shark_prefix",
-    default="latest",
+    default=None,
    help="gs://shark_tank/<this_flag>/model_directories",
 )
 parser.add_argument(
--- a/shark/shark_benchmark_runner.py
+++ b/shark/shark_benchmark_runner.py
@@ -78,6 +78,7 @@ class SharkBenchmarkRunner(SharkRunner):
        self.vmfb_file = None
        self.mlir_dialect = mlir_dialect
        self.extra_args = extra_args
+        self.import_args = {}
        SharkRunner.__init__(
            self,
            mlir_module,
@@ -112,7 +113,6 @@ class SharkBenchmarkRunner(SharkRunner):

    def benchmark_torch(self, modelname):
        import torch
-        import torch._dynamo as dynamo
        from tank.model_utils import get_torch_model

        if self.device == "cuda":
@@ -124,12 +124,18 @@ class SharkBenchmarkRunner(SharkRunner):
        torch_device = torch.device(
            "cuda:0" if self.device == "cuda" else "cpu"
        )
-        HFmodel, input = get_torch_model(modelname)[:2]
+        HFmodel, input = get_torch_model(modelname, self.import_args)[:2]
        frontend_model = HFmodel.model
        frontend_model.to(torch_device)
        input.to(torch_device)

-        # frontend_model = torch.compile(frontend_model, mode="max-autotune", backend="inductor")
+        # TODO: re-enable as soon as pytorch CUDA context issues are resolved
+        try:
+            frontend_model = torch.compile(
+                frontend_model, mode="max-autotune", backend="inductor"
+            )
+        except RuntimeError:
+            frontend_model = HFmodel.model

        for i in range(shark_args.num_warmup_iterations):
            frontend_model.forward(input)
@@ -143,12 +149,18 @@ class SharkBenchmarkRunner(SharkRunner):
        if self.device == "cuda":
            stats = torch.cuda.memory_stats()
            device_peak_b = stats["allocated_bytes.all.peak"]
+            frontend_model.to(torch.device("cpu"))
+            input.to(torch.device("cpu"))
+            torch.cuda.empty_cache()
        else:
            device_peak_b = None

        print(
            f"Torch benchmark:{shark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{shark_args.num_iterations}"
        )
+        if self.device == "cuda":
+            # Set device to CPU so we don't run into segfaults exiting pytest subprocesses.
+            torch_device = torch.device("cpu")
        return [
            f"{shark_args.num_iterations/(end-begin)}",
            f"{((end-begin)/shark_args.num_iterations)*1000}",
@@ -157,6 +169,9 @@ class SharkBenchmarkRunner(SharkRunner):
        ]

    def benchmark_tf(self, modelname):
+        import os
+
+        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
        import tensorflow as tf

        visible_default = tf.config.list_physical_devices("GPU")
@@ -178,7 +193,7 @@ class SharkBenchmarkRunner(SharkRunner):
                model,
                input,
            ) = get_tf_model(
-                modelname
+                modelname, self.import_args
            )[:2]
            frontend_model = model

@@ -338,11 +353,21 @@ for currently supported models. Exiting benchmark ONNX."
        return comp_str

    def benchmark_all_csv(
-        self, inputs: tuple, modelname, dynamic, device_str, frontend
+        self,
+        inputs: tuple,
+        modelname,
+        dynamic,
+        device_str,
+        frontend,
+        import_args,
+        mode="native",
    ):
        self.setup_cl(inputs)
+        self.import_args = import_args
+        self.mode = mode
        field_names = [
            "model",
+            "batch_size",
            "engine",
            "dialect",
            "device",
@@ -362,7 +387,13 @@ for currently supported models. Exiting benchmark ONNX."
            "measured_device_memory_mb",
        ]
        # "frontend" must be the first element.
-        engines = ["frontend", "shark_python", "shark_iree_c"]
+        if self.mode == "native":
+            engines = ["shark_python", "shark_iree_c"]
+        if self.mode == "baseline":
+            engines = ["frontend"]
+        if self.mode == "all":
+            engines = ["frontend", "shark_python", "shark_iree_c"]
+
        if shark_args.onnx_bench == True:
            engines.append("onnxruntime")

@@ -375,6 +406,7 @@ for currently supported models. Exiting benchmark ONNX."
            writer = csv.DictWriter(f, fieldnames=field_names)
            bench_info = {}
            bench_info["model"] = modelname
+            bench_info["batch_size"] = str(import_args["batch_size"])
            bench_info["dialect"] = self.mlir_dialect
            bench_info["iterations"] = shark_args.num_iterations
            if dynamic == True:
@@ -389,6 +421,7 @@ for currently supported models. Exiting benchmark ONNX."

            for e in engines:
                engine_result = {}
+                self.frontend_result = None
                if e == "frontend":
                    engine_result["engine"] = frontend
                    if check_requirements(frontend):
--- a/shark/shark_downloader.py
+++ b/shark/shark_downloader.py
@@ -127,33 +127,105 @@ def check_dir_exists(model_name, frontend="torch", dynamic=""):
            and os.path.isfile(os.path.join(model_dir, "golden_out.npz"))
            and os.path.isfile(os.path.join(model_dir, "hash.npy"))
        ):
-            print(f"""Using cached models from {WORKDIR}...""")
+            print(
+                f"""Model artifacts for {model_name} found at {WORKDIR}..."""
+            )
            return True
    return False


+def _internet_connected():
+    import requests as req
+
+    try:
+        req.get("http://1.1.1.1")
+        return True
+    except:
+        return False
+
+
+def get_git_revision_short_hash() -> str:
+    import subprocess
+
+    if shark_args.shark_prefix is not None:
+        prefix_kw = shark_args.shark_prefix
+    else:
+        import json
+
+        dir_path = os.path.dirname(os.path.realpath(__file__))
+        src = os.path.join(dir_path, "..", "tank_version.json")
+        with open(src, "r") as f:
+            data = json.loads(f.read())
+            prefix_kw = data["version"]
+    print(f"Checking for updates from gs://shark_tank/{prefix_kw}")
+    return prefix_kw
+
+
+def get_sharktank_prefix():
+    tank_prefix = ""
+    if not _internet_connected():
+        print(
+            "No internet connection. Using the model already present in the tank."
+        )
+        tank_prefix = "none"
+    else:
+        desired_prefix = get_git_revision_short_hash()
+        storage_client_a = storage.Client.create_anonymous_client()
+        base_bucket_name = "shark_tank"
+        base_bucket = storage_client_a.bucket(base_bucket_name)
+        dir_blobs = base_bucket.list_blobs(prefix=f"{desired_prefix}")
+        for blob in dir_blobs:
+            dir_blob_name = blob.name.split("/")
+            if desired_prefix in dir_blob_name[0]:
+                tank_prefix = dir_blob_name[0]
+                break
+            else:
+                continue
+        if tank_prefix == "":
+            print(
+                f"shark_tank bucket not found matching ({desired_prefix}). Defaulting to nightly."
+            )
+            tank_prefix = "nightly"
+    return tank_prefix
+
+
 # Downloads the torch model from gs://shark_tank dir.
 def download_model(
    model_name,
    dynamic=False,
-    tank_url="gs://shark_tank/latest",
+    tank_url=None,
    frontend=None,
    tuned=None,
+    import_args={"batch_size": "1"},
 ):
    model_name = model_name.replace("/", "_")
    dyn_str = "_dynamic" if dynamic else ""
    os.makedirs(WORKDIR, exist_ok=True)
-    model_dir_name = model_name + "_" + frontend
+    shark_args.shark_prefix = get_sharktank_prefix()
+    if import_args["batch_size"] and import_args["batch_size"] != 1:
+        model_dir_name = (
+            model_name
+            + "_"
+            + frontend
+            + "_BS"
+            + str(import_args["batch_size"])
+        )
+    else:
+        model_dir_name = model_name + "_" + frontend
    model_dir = os.path.join(WORKDIR, model_dir_name)
-    full_gs_url = tank_url.rstrip("/") + "/" + model_dir_name

+    if not tank_url:
+        tank_url = "gs://shark_tank/" + shark_args.shark_prefix
+
+    full_gs_url = tank_url.rstrip("/") + "/" + model_dir_name
    if not check_dir_exists(
        model_dir_name, frontend=frontend, dynamic=dyn_str
    ):
        print(
-            f"Force-updating artifacts for model {model_name} from: {full_gs_url}"
+            f"Downloading artifacts for model {model_name} from: {full_gs_url}"
        )
        download_public_file(full_gs_url, model_dir)
+
    elif shark_args.force_update_tank == True:
        print(
            f"Force-updating artifacts for model {model_name} from: {full_gs_url}"
@@ -179,6 +251,7 @@ def download_model(
                    np.load(os.path.join(model_dir, "upstream_hash.npy"))
                )
            except FileNotFoundError:
+                print(f"Model artifact hash not found at {model_dir}.")
                upstream_hash = None
            if local_hash != upstream_hash and shark_args.update_tank == True:
                print(f"Updating artifacts for model {model_name}...")
@@ -186,23 +259,28 @@ def download_model(

            elif local_hash != upstream_hash:
                print(
-                    "Hash does not match upstream in gs://shark_tank/latest. If you want to use locally generated artifacts, this is working as intended. Otherwise, run with --update_tank."
+                    "Hash does not match upstream in gs://shark_tank/. If you want to use locally generated artifacts, this is working as intended. Otherwise, run with --update_tank."
+                )
+            else:
+                print(
+                    "Local and upstream hashes match. Using cached model artifacts."
                )

    model_dir = os.path.join(WORKDIR, model_dir_name)
    tuned_str = "" if tuned is None else "_" + tuned
    suffix = f"{dyn_str}_{frontend}{tuned_str}.mlir"
    filename = os.path.join(model_dir, model_name + suffix)
-
-    try:
-        with open(filename, mode="rb") as f:
-            mlir_file = f.read()
-    except FileNotFoundError:
+    if not os.path.exists(filename):
        from tank.generate_sharktank import gen_shark_files

-        tank_dir = WORKDIR
-        gen_shark_files(model_name, frontend, tank_dir)
+        print(
+            "The model data was not found. Trying to generate artifacts locally."
+        )
+        gen_shark_files(model_name, frontend, WORKDIR, import_args)

+    assert os.path.exists(filename), f"MLIR not found at {filename}"
+    with open(filename, mode="rb") as f:
+        mlir_file = f.read()
    function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
    inputs = np.load(os.path.join(model_dir, "inputs.npz"))
    golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))
@@ -210,13 +288,3 @@ def download_model(
    inputs_tuple = tuple([inputs[key] for key in inputs])
    golden_out_tuple = tuple([golden_out[key] for key in golden_out])
    return mlir_file, function_name, inputs_tuple, golden_out_tuple
-
-
-def _internet_connected():
-    import requests as req
-
-    try:
-        req.get("http://1.1.1.1")
-        return True
-    except:
-        return False
--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -9,8 +9,8 @@ import hashlib

 def create_hash(file_name):
    with open(file_name, "rb") as f:
-        file_hash = hashlib.blake2b()
-        while chunk := f.read(2**20):
+        file_hash = hashlib.blake2b(digest_size=64)
+        while chunk := f.read(2**10):
            file_hash.update(chunk)

    return file_hash.hexdigest()
@@ -165,8 +165,17 @@ class SharkImporter:
        if self.frontend == "torch":
            with open(os.path.join(dir, model_name_mlir), "wb") as mlir_file:
                mlir_file.write(mlir_data)
-            mlir_hash = create_hash(os.path.join(dir, model_name_mlir))
-            np.save(os.path.join(dir, "hash"), np.array(mlir_hash))
+        hash_gen_attempts = 2
+        for i in range(hash_gen_attempts):
+            try:
+                mlir_hash = create_hash(os.path.join(dir, model_name_mlir))
+            except FileNotFoundError as err:
+                if i < hash_gen_attempts:
+                    continue
+                else:
+                    raise err
+
+        np.save(os.path.join(dir, "hash"), np.array(mlir_hash))
        return

    def import_debug(
--- a/tank/all_models.csv
+++ b/tank/all_models.csv
@@ -22,7 +22,7 @@ bert-large-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
 bert-large-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
 facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"Fails during iree-compile.",""
 google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/311",""
-microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390",""
+microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390","macos"
 microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
 google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"https://github.com/nod-ai/SHARK/issues/344",""
 mobilenet_v3_small,linalg,torch,1e-1,1e-2,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/388","macos"
@@ -35,18 +35,12 @@ squeezenet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","mac
 wide_resnet50_2,linalg,torch,1e-2,1e-3,default,nhcw-nhwc/img2col,False,False,False,"","macos"
 efficientnet-v2-s,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
 mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,True,True,True,"","macos"
-t5-base,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
+efficientnet_b0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,True,True,False,"https://github.com/nod-ai/SHARK/issues/1243",""
+efficientnet_b7,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"Fails on MacOS builder, VK device lost","macos"
+efficientnet_b0,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"",""
+efficientnet_b7,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"Fails on MacOS builder, VK device lost","macos"
+gpt2,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"",""
+t5-base,linalg,torch,1e-2,1e-3,default,None,True,True,True,"Inputs for seq2seq models in torch currently unsupported.",""
 t5-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-t5-large,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
-t5-large,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-efficientnet_b0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"",""
-efficientnet_b7,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"",""
-efficientnet_b0,mhlo,tf,1e-2,1e-3,default,None,nhcw-nhwc,False,False,False,"",""
-efficientnet_b7,mhlo,tf,1e-2,1e-3,default,None,nhcw-nhwc,False,False,False,"",""
-efficientnet_b0,mhlo,tf,1e-2,1e-3,default,None,nhcw-nhwc,False,False,"",""
-efficientnet_b7,mhlo,tf,1e-2,1e-3,default,None,nhcw-nhwc,False,False,"",""
-gpt2,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-t5-base,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
-t5-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-t5-large,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
+t5-large,linalg,torch,1e-2,1e-3,default,None,True,True,True,"Inputs for seq2seq models in torch currently unsupported",""
 t5-large,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
--- a/tank/examples/MiniLM_tf/huggingface_MiniLM_run.py
+++ b/tank/examples/MiniLM_tf/huggingface_MiniLM_run.py
@@ -70,7 +70,7 @@ if __name__ == "__main__":
    backend_config = "dylib"
    # backend = "cuda"
    # backend_config = "cuda"
-    # args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"]
+    # args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-enable-fusion-with-reduction-ops"]
    flatbuffer_blob = compile_str(
        compiler_module,
        target_backends=[backend],
--- a/tank/examples/bert_fine_tuning/bert_fine_tune_tf.py
+++ b/tank/examples/bert_fine_tuning/bert_fine_tune_tf.py
@@ -146,7 +146,6 @@ if __name__ == "__main__":
        backend_config = "cuda"
        args = [
            "--iree-cuda-llvm-target-arch=sm_80",
-            "--iree-hal-cuda-disable-loop-nounroll-wa",
            "--iree-enable-fusion-with-reduction-ops",
        ]

--- a/tank/examples/bert_tf/bert_large_run.py
+++ b/tank/examples/bert_tf/bert_large_run.py
@@ -91,7 +91,7 @@ if __name__ == "__main__":
    backend_config = "dylib"
    # backend = "cuda"
    # backend_config = "cuda"
-    # args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"]
+    # args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-enable-fusion-with-reduction-ops"]
    flatbuffer_blob = compile_str(
        compiler_module,
        target_backends=[backend],
--- a/tank/examples/bert_tf/bert_small_run.py
+++ b/tank/examples/bert_tf/bert_small_run.py
@@ -86,7 +86,7 @@ if __name__ == "__main__":
    backend_config = "dylib"
    # backend = "cuda"
    # backend_config = "cuda"
-    # args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-hal-cuda-disable-loop-nounroll-wa", "--iree-enable-fusion-with-reduction-ops"]
+    # args = ["--iree-cuda-llvm-target-arch=sm_80", "--iree-enable-fusion-with-reduction-ops"]
    flatbuffer_blob = compile_str(
        compiler_module,
        target_backends=[backend],
--- a/tank/generate_sharktank.py
+++ b/tank/generate_sharktank.py
@@ -26,14 +26,14 @@ from apps.stable_diffusion.src.utils.stable_args import (

 def create_hash(file_name):
    with open(file_name, "rb") as f:
-        file_hash = hashlib.blake2b()
-        while chunk := f.read(2**20):
+        file_hash = hashlib.blake2b(digest_size=64)
+        while chunk := f.read(2**10):
            file_hash.update(chunk)

    return file_hash.hexdigest()


-def save_torch_model(torch_model_list, local_tank_cache):
+def save_torch_model(torch_model_list, local_tank_cache, import_args):
    from tank.model_utils import (
        get_hf_model,
        get_hf_seq2seq_model,
@@ -59,7 +59,6 @@ def save_torch_model(torch_model_list, local_tank_cache):
            if model_type == "stable_diffusion":
                args.use_tuned = False
                args.import_mlir = True
-                args.use_tuned = False
                args.local_tank_cache = local_tank_cache

                precision_values = ["fp16"]
@@ -75,6 +74,7 @@ def save_torch_model(torch_model_list, local_tank_cache):
                            width=512,
                            height=512,
                            use_base_vae=False,
+                            custom_vae="",
                            debug=True,
                            sharktank_dir=local_tank_cache,
                            generate_vmfb=False,
@@ -82,19 +82,33 @@ def save_torch_model(torch_model_list, local_tank_cache):
                        model()
                continue
            if model_type == "vision":
-                model, input, _ = get_vision_model(torch_model_name)
+                model, input, _ = get_vision_model(
+                    torch_model_name, import_args
+                )
            elif model_type == "hf":
-                model, input, _ = get_hf_model(torch_model_name)
+                model, input, _ = get_hf_model(torch_model_name, import_args)
            elif model_type == "hf_seq2seq":
-                model, input, _ = get_hf_seq2seq_model(torch_model_name)
+                model, input, _ = get_hf_seq2seq_model(
+                    torch_model_name, import_args
+                )
            elif model_type == "hf_img_cls":
-                model, input, _ = get_hf_img_cls_model(torch_model_name)
+                model, input, _ = get_hf_img_cls_model(
+                    torch_model_name, import_args
+                )
            elif model_type == "fp16":
-                model, input, _ = get_fp16_model(torch_model_name)
+                model, input, _ = get_fp16_model(torch_model_name, import_args)
            torch_model_name = torch_model_name.replace("/", "_")
-            torch_model_dir = os.path.join(
-                local_tank_cache, str(torch_model_name) + "_torch"
-            )
+            # batch_size may arrive as a str ("1") or an int; normalize first.
+            tank_batch_size = int(import_args["batch_size"])
+            if tank_batch_size != 1:
+                torch_model_dir = os.path.join(
+                    local_tank_cache,
+                    f"{torch_model_name}_torch_BS{tank_batch_size}",
+                )
+            else:
+                torch_model_dir = os.path.join(
+                    local_tank_cache, str(torch_model_name) + "_torch"
+                )
            os.makedirs(torch_model_dir, exist_ok=True)

            mlir_importer = SharkImporter(
@@ -118,7 +132,7 @@ def save_torch_model(torch_model_list, local_tank_cache):
                )


-def save_tf_model(tf_model_list, local_tank_cache):
+def save_tf_model(tf_model_list, local_tank_cache, import_args):
    from tank.model_utils_tf import (
        get_causal_image_model,
        get_masked_lm_model,
@@ -127,6 +141,9 @@ def save_tf_model(tf_model_list, local_tank_cache):
        get_TFhf_model,
        get_tfhf_seq2seq_model,
    )
+    import os
+
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
    import tensorflow as tf

    visible_default = tf.config.list_physical_devices("GPU")
@@ -150,20 +167,38 @@ def save_tf_model(tf_model_list, local_tank_cache):
            input = None
            print(f"Generating artifacts for model {tf_model_name}")
            if model_type == "hf":
-                model, input, _ = get_causal_lm_model(tf_model_name)
+                model, input, _ = get_masked_lm_model(
+                    tf_model_name, import_args
+                )
            elif model_type == "img":
-                model, input, _ = get_causal_image_model(tf_model_name)
+                model, input, _ = get_causal_image_model(
+                    tf_model_name, import_args
+                )
            elif model_type == "keras":
-                model, input, _ = get_keras_model(tf_model_name)
+                model, input, _ = get_keras_model(tf_model_name, import_args)
            elif model_type == "TFhf":
-                model, input, _ = get_TFhf_model(tf_model_name)
+                model, input, _ = get_TFhf_model(tf_model_name, import_args)
            elif model_type == "tfhf_seq2seq":
-                model, input, _ = get_tfhf_seq2seq_model(tf_model_name)
+                model, input, _ = get_tfhf_seq2seq_model(
+                    tf_model_name, import_args
+                )
+            elif model_type == "hf_causallm":
+                model, input, _ = get_causal_lm_model(
+                    tf_model_name, import_args
+                )

            tf_model_name = tf_model_name.replace("/", "_")
-            tf_model_dir = os.path.join(
-                local_tank_cache, str(tf_model_name) + "_tf"
-            )
+            # batch_size may arrive as a str ("1") or an int; normalize first.
+            tank_batch_size = int(import_args["batch_size"])
+            if tank_batch_size != 1:
+                tf_model_dir = os.path.join(
+                    local_tank_cache,
+                    f"{tf_model_name}_tf_BS{tank_batch_size}",
+                )
+            else:
+                tf_model_dir = os.path.join(
+                    local_tank_cache, str(tf_model_name) + "_tf"
+                )
            os.makedirs(tf_model_dir, exist_ok=True)
            mlir_importer = SharkImporter(
                model,
@@ -175,13 +210,9 @@ def save_tf_model(tf_model_list, local_tank_cache):
                dir=tf_model_dir,
                model_name=tf_model_name,
            )
-            mlir_hash = create_hash(
-                os.path.join(tf_model_dir, tf_model_name + "_tf" + ".mlir")
-            )
-            np.save(os.path.join(tf_model_dir, "hash"), np.array(mlir_hash))


-def save_tflite_model(tflite_model_list, local_tank_cache):
+def save_tflite_model(tflite_model_list, local_tank_cache, import_args):
    from shark.tflite_utils import TFLitePreprocessor

    with open(tflite_model_list) as csvfile:
@@ -198,13 +229,13 @@ def save_tflite_model(tflite_model_list, local_tank_cache):
            os.makedirs(tflite_model_name_dir, exist_ok=True)
            print(f"TMP_TFLITE_MODELNAME_DIR = {tflite_model_name_dir}")

-            # Preprocess to get SharkImporter input args
+            # Preprocess to get the SharkImporter input arguments
            tflite_preprocessor = TFLitePreprocessor(str(tflite_model_name))
            raw_model_file_path = tflite_preprocessor.get_raw_model_file()
            inputs = tflite_preprocessor.get_inputs()
            tflite_interpreter = tflite_preprocessor.get_interpreter()

-            # Use SharkImporter to get SharkInference input args
+            # Use SharkImporter to get the SharkInference input arguments
            my_shark_importer = SharkImporter(
                module=tflite_interpreter,
                inputs=inputs,
@@ -228,43 +259,69 @@ def save_tflite_model(tflite_model_list, local_tank_cache):
            )


-def gen_shark_files(modelname, frontend, tank_dir):
+def check_requirements(frontend):
+    """Return True when the package needed to import `frontend` models is installed."""
+    import importlib.util  # submodule; `import importlib` alone may not load it
+
+    has_pkgs = False  # stays False for any frontend other than torch/tensorflow
+    if frontend == "torch":
+        tv_spec = importlib.util.find_spec("torchvision")
+        has_pkgs = tv_spec is not None
+    elif frontend in ["tensorflow", "tf"]:
+        tf_spec = importlib.util.find_spec("tensorflow")
+        has_pkgs = tf_spec is not None
+
+    return has_pkgs
+
+
+class NoImportException(Exception):
+    "Raised when requirements are not met for OTF model artifact generation."
+    pass
+
+
+def gen_shark_files(modelname, frontend, tank_dir, importer_args):
    # If a model's artifacts are requested by shark_downloader but they don't exist in the cloud, we call this function to generate the artifacts on-the-fly.
    # TODO: Add TFlite support.
    import tempfile

-    torch_model_csv = os.path.join(
-        os.path.dirname(__file__), "torch_model_list.csv"
-    )
-    tf_model_csv = os.path.join(os.path.dirname(__file__), "tf_model_list.csv")
-    custom_model_csv = tempfile.NamedTemporaryFile(
-        dir=os.path.dirname(__file__),
-        delete=True,
-    )
-    # Create a temporary .csv with only the desired entry.
-    if frontend == "tf":
-        with open(tf_model_csv, mode="r") as src:
-            reader = csv.reader(src)
-            for row in reader:
-                if row[0] == modelname:
-                    target = row
-        with open(custom_model_csv.name, mode="w") as trg:
-            writer = csv.writer(trg)
-            writer.writerow(["modelname", "src"])
-            writer.writerow(target)
-        save_tf_model(custom_model_csv.name, tank_dir)
+    # Raises NoImportException when the frontend's packages are missing and
+    # ValueError when modelname is not listed in that frontend's model csv.
+    if not check_requirements(frontend):
+        raise NoImportException
+    tank_src_dir = os.path.dirname(__file__)
+    # Supported frontends: (model list csv, artifact generator).
+    generators = {
+        "tf": (os.path.join(tank_src_dir, "tf_model_list.csv"), save_tf_model),
+        "torch": (
+            os.path.join(tank_src_dir, "torch_model_list.csv"),
+            save_torch_model,
+        ),
+    }
+    if frontend not in generators:
+        return  # e.g. "tensorflow": passes the check but has no generator
+    model_csv, save_model = generators[frontend]
+    # Find the requested entry; the last matching row wins.
+    target = None
+    with open(model_csv, mode="r") as src:
+        for row in csv.reader(src):
+            if row and row[0] == modelname:
+                target = row
+    if target is None:
+        raise ValueError(f"{modelname} is not listed in {model_csv}")

-    if frontend == "torch":
-        with open(torch_model_csv, mode="r") as src:
-            reader = csv.reader(src)
-            for row in reader:
-                if row[0] == modelname:
-                    target = row
-        with open(custom_model_csv.name, mode="w") as trg:
-            writer = csv.writer(trg)
-            writer.writerow(["modelname", "src"])
-            writer.writerow(target)
-        save_torch_model(custom_model_csv.name, tank_dir)
+    # Create a temporary .csv with only the desired entry. delete=False lets
+    # the generator reopen it by name (an open NamedTemporaryFile cannot be
+    # reopened on Windows); it is removed explicitly below.
+    with tempfile.NamedTemporaryFile(
+        mode="w", dir=tank_src_dir, delete=False, newline=""
+    ) as trg:
+        writer = csv.writer(trg)
+        writer.writerow(["modelname", "src"])
+        writer.writerow(target)
+    try:
+        save_model(trg.name, tank_dir, importer_args)
+    finally:
+        os.remove(trg.name)


 # Validates whether the file is present or not.
@@ -276,7 +333,7 @@ def is_valid_file(arg):


 if __name__ == "__main__":
-    # Note, all of these flags are overridden by the import of args from stable_args.py, flags are duplicated temporarily to preserve functionality
+    # Note, all of these flags are overridden by the args imported from stable_args.py; the flags are duplicated temporarily to preserve functionality
    # parser = argparse.ArgumentParser()
    # parser.add_argument(
    #    "--torch_model_csv",
@@ -304,8 +361,11 @@ if __name__ == "__main__":
    # )
    # parser.add_argument("--upload", type=bool, default=False)

-    # old_args = parser.parse_args()
-
+    # old_args = parser.parse_args()
+    import_args = {
+        "batch_size": 1,  # int: the save_* functions compare this against 1
+    }
+    print(import_args)
    home = str(Path.home())
    WORKDIR = os.path.join(os.path.dirname(__file__), "..", "gen_shark_tank")
    torch_model_csv = os.path.join(
@@ -319,7 +379,8 @@ if __name__ == "__main__":
    save_torch_model(
        os.path.join(os.path.dirname(__file__), "torch_sd_list.csv"),
        WORKDIR,
+        import_args,
    )
-    save_torch_model(torch_model_csv, WORKDIR)
-    save_tf_model(tf_model_csv, WORKDIR)
-    save_tflite_model(tflite_model_csv, WORKDIR)
+    save_torch_model(torch_model_csv, WORKDIR, import_args)
+    save_tf_model(tf_model_csv, WORKDIR, import_args)
+    save_tflite_model(tflite_model_csv, WORKDIR, import_args)
--- a/tank/model_utils.py
+++ b/tank/model_utils.py
@@ -1,5 +1,4 @@
 from shark.shark_inference import SharkInference
-from shark.parser import shark_args

 import torch
 import numpy as np
@@ -35,17 +34,17 @@ hf_seq2seq_models = [
 ]


-def get_torch_model(modelname):
+def get_torch_model(modelname, import_args):
    if modelname in vision_models:
-        return get_vision_model(modelname)
+        return get_vision_model(modelname, import_args)
    elif modelname in hf_img_cls_models:
-        return get_hf_img_cls_model(modelname)
+        return get_hf_img_cls_model(modelname, import_args)
    elif modelname in hf_seq2seq_models:
-        return get_hf_seq2seq_model(modelname)
+        return get_hf_seq2seq_model(modelname, import_args)
    elif "fp16" in modelname:
-        return get_fp16_model(modelname)
+        return get_fp16_model(modelname, import_args)
    else:
-        return get_hf_model(modelname)
+        return get_hf_model(modelname, import_args)


 ##################### Hugging Face Image Classification Models ###################################
@@ -88,14 +87,14 @@ class HuggingFaceImageClassification(torch.nn.Module):
        return self.model.forward(inputs)[0]


-def get_hf_img_cls_model(name):
+def get_hf_img_cls_model(name, import_args):
    model = HuggingFaceImageClassification(name)
    # you can use preprocess_input_image to get the test_input or just random value.
    test_input = preprocess_input_image(name)
    # test_input = torch.FloatTensor(1, 3, 224, 224).uniform_(-1, 1)
    # print("test_input.shape: ", test_input.shape)
    # test_input.shape:  torch.Size([1, 3, 224, 224])
-    test_input = test_input.repeat(BATCH_SIZE, 1, 1, 1)
+    test_input = test_input.repeat(int(import_args["batch_size"]), 1, 1, 1)
    actual_out = model(test_input)
    # print("actual_out.shape： ", actual_out.shape)
    # actual_out.shape：  torch.Size([1, 1000])
@@ -125,14 +124,13 @@ class HuggingFaceLanguage(torch.nn.Module):
        return self.model.forward(tokens)[0]


-def get_hf_model(name):
+def get_hf_model(name, import_args):
    from transformers import (
        BertTokenizer,
    )

    model = HuggingFaceLanguage(name)
-    # TODO: Currently the test input is set to (1,128)
-    test_input = torch.randint(2, (BATCH_SIZE, 128))
+    test_input = torch.randint(2, (int(import_args["batch_size"]), 128))
    actual_out = model(test_input)
    return model, test_input, actual_out

@@ -165,7 +163,7 @@ class HFSeq2SeqLanguageModel(torch.nn.Module):
        )[0]


-def get_hf_seq2seq_model(name):
+def get_hf_seq2seq_model(name, import_args):
    m = HFSeq2SeqLanguageModel(name)
    encoded_input_ids = m.preprocess_input(
        "Studies have been shown that owning a dog is good for you"
@@ -193,53 +191,55 @@ class VisionModule(torch.nn.Module):
        return self.model.forward(input)


-def get_vision_model(torch_model):
+def get_vision_model(torch_model, import_args):
    import torchvision.models as models

    default_image_size = (224, 224)
+    modelname = torch_model
+    # Exactly one branch runs, so only the requested model is instantiated.
+    if modelname == "alexnet":
+        torch_model = models.alexnet(weights="DEFAULT")
+        input_image_size = default_image_size
+    elif modelname == "resnet18":
+        torch_model = models.resnet18(weights="DEFAULT")
+        input_image_size = default_image_size
+    elif modelname == "resnet50":
+        torch_model = models.resnet50(weights="DEFAULT")
+        input_image_size = default_image_size
+    elif modelname == "resnet50_fp16":
+        torch_model = models.resnet50(weights="DEFAULT")
+        input_image_size = default_image_size
+    elif modelname == "resnet101":
+        torch_model = models.resnet101(weights="DEFAULT")
+        input_image_size = default_image_size
+    elif modelname == "squeezenet1_0":
+        torch_model = models.squeezenet1_0(weights="DEFAULT")
+        input_image_size = default_image_size
+    elif modelname == "wide_resnet50_2":
+        torch_model = models.wide_resnet50_2(weights="DEFAULT")
+        input_image_size = default_image_size
+    elif modelname == "mobilenet_v3_small":
+        torch_model = models.mobilenet_v3_small(weights="DEFAULT")
+        input_image_size = default_image_size
+    elif modelname == "mnasnet1_0":
+        torch_model = models.mnasnet1_0(weights="DEFAULT")
+        input_image_size = default_image_size
+    elif modelname == "efficientnet_b0":
+        torch_model = models.efficientnet_b0(weights="DEFAULT")
+        input_image_size = (224, 224)
+    elif modelname == "efficientnet_b7":
+        torch_model = models.efficientnet_b7(weights="DEFAULT")
+        input_image_size = (600, 600)
+    else:
+        raise ValueError(f"Unsupported vision model: {modelname}")

-    vision_models_dict = {
-        "alexnet": (models.alexnet(weights="DEFAULT"), default_image_size),
-        "resnet18": (models.resnet18(weights="DEFAULT"), default_image_size),
-        "resnet50": (models.resnet50(weights="DEFAULT"), default_image_size),
-        "resnet50_fp16": (
-            models.resnet50(weights="DEFAULT"),
-            default_image_size,
-        ),
-        "resnet101": (models.resnet101(weights="DEFAULT"), default_image_size),
-        "squeezenet1_0": (
-            models.squeezenet1_0(weights="DEFAULT"),
-            default_image_size,
-        ),
-        "wide_resnet50_2": (
-            models.wide_resnet50_2(weights="DEFAULT"),
-            default_image_size,
-        ),
-        "mobilenet_v3_small": (
-            models.mobilenet_v3_small(weights="DEFAULT"),
-            default_image_size,
-        ),
-        "mnasnet1_0": (
-            models.mnasnet1_0(weights="DEFAULT"),
-            default_image_size,
-        ),
-        # EfficientNet input image size varies on the size of the model.
-        "efficientnet_b0": (
-            models.efficientnet_b0(weights="DEFAULT"),
-            (224, 224),
-        ),
-        "efficientnet_b7": (
-            models.efficientnet_b7(weights="DEFAULT"),
-            (600, 600),
-        ),
-    }
-    if isinstance(torch_model, str):
-        fp16_model = None
-        if "fp16" in torch_model:
-            fp16_model = True
-        torch_model, input_image_size = vision_models_dict[torch_model]
+    # Leave this as None (not False) for non-fp16 models, because the
+    # conversion guard further down tests `fp16_model is not None`.
+    fp16_model = True if "fp16" in modelname else None
    model = VisionModule(torch_model)
-    test_input = torch.randn(BATCH_SIZE, 3, 224, 224)
+    test_input = torch.randn(
+        int(import_args["batch_size"]), 3, *input_image_size
+    )
    actual_out = model(test_input)
    if fp16_model is not None:
        test_input_fp16 = test_input.to(
@@ -280,14 +280,14 @@ class BertHalfPrecisionModel(torch.nn.Module):
        return self.model.forward(tokens)[0]


-def get_fp16_model(torch_model):
+def get_fp16_model(torch_model, import_args):
    from transformers import AutoTokenizer

    modelname = torch_model.replace("_fp16", "")
    model = BertHalfPrecisionModel(modelname)
    tokenizer = AutoTokenizer.from_pretrained(modelname)
    text = "Replace me by any text you like."
-    text = [text] * BATCH_SIZE
+    text = [text] * int(import_args["batch_size"])
    test_input_fp16 = tokenizer(
        text,
        truncation=True,
--- a/tank/model_utils_tf.py
+++ b/tank/model_utils_tf.py
@@ -1,10 +1,8 @@
+import os
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
 import tensorflow as tf
 import numpy as np
-from transformers import (
-    AutoModelForSequenceClassification,
-    BertTokenizer,
-    TFBertModel,
-)

 BATCH_SIZE = 1

@@ -52,19 +50,19 @@ img_models = [
 ]


-def get_tf_model(name):
+def get_tf_model(name, import_args):
    if name in keras_models:
-        return get_keras_model(name)
+        return get_keras_model(name, import_args)
    elif name in maskedlm_models:
-        return get_masked_lm_model(name)
+        return get_masked_lm_model(name, import_args)
    elif name in causallm_models:
-        return get_causal_lm_model(name)
+        return get_causal_lm_model(name, import_args)
    elif name in tfhf_models:
-        return get_TFhf_model(name)
+        return get_TFhf_model(name, import_args)
    elif name in img_models:
-        return get_causal_image_model(name)
+        return get_causal_image_model(name, import_args)
    elif name in tfhf_seq2seq_models:
-        return get_tfhf_seq2seq_model(name)
+        return get_tfhf_seq2seq_model(name, import_args)
    else:
        raise Exception(
            "TF model not found! Please check that the modelname has been input correctly."
@@ -72,6 +70,12 @@ def get_tf_model(name):


 ##################### Tensorflow Hugging Face Bert Models ###################################
+from transformers import (
+    AutoModelForSequenceClassification,
+    BertTokenizer,
+    TFBertModel,
+)
+
 BERT_MAX_SEQUENCE_LENGTH = 128

 # Create a set of 2-dimensional inputs
@@ -104,7 +108,7 @@ class TFHuggingFaceLanguage(tf.Module):
        return self.m.predict(input_ids, attention_mask, token_type_ids)


-def get_TFhf_model(name):
+def get_TFhf_model(name, import_args):
    model = TFHuggingFaceLanguage(name)
    tokenizer = BertTokenizer.from_pretrained(
        "microsoft/MiniLM-L12-H384-uncased"
@@ -166,7 +170,6 @@ def preprocess_input(

 ##################### Tensorflow Hugging Face Masked LM Models ###################################
 from transformers import TFAutoModelForMaskedLM, AutoTokenizer
-import tensorflow as tf

 MASKED_LM_MAX_SEQUENCE_LENGTH = 128

@@ -196,7 +199,9 @@ class MaskedLM(tf.Module):
        return self.m.predict(input_ids, attention_mask)


-def get_masked_lm_model(hf_name, text="Hello, this is the default text."):
+def get_masked_lm_model(
+    hf_name, import_args, text="Hello, this is the default text."
+):
    model = MaskedLM(hf_name)
    encoded_input = preprocess_input(
        hf_name, MASKED_LM_MAX_SEQUENCE_LENGTH, text
@@ -251,7 +256,9 @@ class CausalLM(tf.Module):
        return self.model.predict(input_ids, attention_mask)


-def get_causal_lm_model(hf_name, text="Hello, this is the default text."):
+def get_causal_lm_model(
+    hf_name, import_args, text="Hello, this is the default text."
+):
    model = CausalLM(hf_name)
    batched_text = [text] * BATCH_SIZE
    encoded_input = model.preprocess_input(batched_text)
@@ -306,7 +313,7 @@ class TFHFSeq2SeqLanguageModel(tf.Module):
        return self.model.predict(input_ids, decoder_input_ids)


-def get_tfhf_seq2seq_model(name):
+def get_tfhf_seq2seq_model(name, import_args):
    m = TFHFSeq2SeqLanguageModel(name)
    text = "Studies have been shown that owning a dog is good for you"
    batched_text = [text] * BATCH_SIZE
@@ -442,7 +449,7 @@ def load_image(path_to_image, width, height, channels):
    return image


-def get_keras_model(modelname):
+def get_keras_model(modelname, import_args):
    if modelname == "efficientnet-v2-s":
        model = EfficientNetV2SModule()
    elif modelname == "efficientnet_b0":
@@ -530,7 +537,7 @@ def preprocess_input_image(model_name):
    return [inputs[str(*inputs)]]


-def get_causal_image_model(hf_name):
+def get_causal_image_model(hf_name, import_args):
    model = AutoModelImageClassfication(hf_name)
    test_input = preprocess_input_image(hf_name)
    # TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 1000), dtype=float32, numpy=
--- a/tank/test_models.py
+++ b/tank/test_models.py
@@ -4,10 +4,8 @@ from shark.iree_utils._common import (
    get_supported_device_list,
 )
 from shark.iree_utils.vulkan_utils import get_vulkan_triple_flag
-from parameterized import parameterized
-from shark.shark_downloader import download_model
-from shark.shark_inference import SharkInference
 from shark.parser import shark_args
+from parameterized import parameterized
 import iree.compiler as ireec
 import pytest
 import unittest
@@ -15,8 +13,8 @@ import numpy as np
 import csv
 import tempfile
 import os
+import sys
 import shutil
-import multiprocessing


 def load_csv_and_convert(filename, gen=False):
@@ -48,7 +46,9 @@ def load_csv_and_convert(filename, gen=False):
            )
    # This is a pytest workaround
    if gen:
-        with open("tank/dict_configs.py", "w+") as out:
+        with open(
+            os.path.join(os.path.dirname(__file__), "dict_configs.py"), "w+"
+        ) as out:
            out.write("ALL = [\n")
            for c in model_configs:
                out.write(str(c) + ",\n")
@@ -68,7 +68,9 @@ def get_valid_test_params():
    dynamic_list = (True, False)
    # TODO: This is soooo ugly, but for some reason creating the dict at runtime
    # results in strange pytest failures.
-    load_csv_and_convert("tank/all_models.csv", True)
+    load_csv_and_convert(
+        os.path.join(os.path.dirname(__file__), "all_models.csv"), True
+    )
    from tank.dict_configs import ALL

    config_list = ALL
@@ -135,9 +137,12 @@ class SharkModuleTester:
        self.config = config

    def create_and_check_module(self, dynamic, device):
+        shark_args.update_tank = self.update_tank
+        shark_args.force_update_tank = self.force_update_tank
+        shark_args.shark_prefix = self.shark_tank_prefix
        shark_args.local_tank_cache = self.local_tank_cache
-        shark_args.force_update_tank = self.update_tank
        shark_args.dispatch_benchmarks = self.benchmark_dispatches
+
        if self.benchmark_dispatches is not None:
            _m = self.config["model_name"].split("/")
            _m.extend([self.config["framework"], str(dynamic), device])
@@ -161,17 +166,40 @@ class SharkModuleTester:
        if "winograd" in self.config["flags"]:
            shark_args.use_winograd = True

-        model, func_name, inputs, golden_out = download_model(
-            self.config["model_name"],
-            tank_url=self.tank_url,
-            frontend=self.config["framework"],
-        )
+        import_config = {
+            "batch_size": self.batch_size,
+        }

+        from shark.shark_downloader import download_model
+        from shark.shark_inference import SharkInference
+        from tank.generate_sharktank import NoImportException
+
+        dl_gen_attempts = 2
+        for i in range(dl_gen_attempts):
+            try:
+                model, func_name, inputs, golden_out = download_model(
+                    self.config["model_name"],
+                    frontend=self.config["framework"],
+                    import_args=import_config,
+                )
+            except NoImportException as err:
+                pytest.xfail(
+                    reason=f"Artifacts for this model/config must be generated locally. Please make sure {self.config['framework']} is installed."
+                )
+            except AssertionError as err:
+                if i < dl_gen_attempts - 1:
+                    continue
+                else:
+                    pytest.xfail(
+                        "Generating OTF may require exiting the subprocess for files to be available."
+                    )
+            break
+        is_bench = True if self.benchmark is not None else False
        shark_module = SharkInference(
            model,
            device=device,
            mlir_dialect=self.config["dialect"],
-            is_benchmark=self.benchmark,
+            is_benchmark=is_bench,
        )

        try:
@@ -185,6 +213,10 @@ class SharkModuleTester:

        result = shark_module(func_name, inputs)
        golden_out, result = self.postprocess_outputs(golden_out, result)
+        if self.tf32 == "true":
+            print("Validating with relaxed tolerances.")
+            atol = 1e-02
+            rtol = 1e-03
        try:
            np.testing.assert_allclose(
                golden_out,
@@ -197,23 +229,31 @@ class SharkModuleTester:
                self.save_reproducers()
            if self.ci == True:
                self.upload_repro()
-            if self.benchmark == True:
-                self.benchmark_module(shark_module, inputs, dynamic, device)
+            if self.benchmark is not None:
+                self.benchmark_module(
+                    shark_module, inputs, dynamic, device, mode=self.benchmark
+                )
                print(msg)
                pytest.xfail(
                    reason=f"Numerics Mismatch: Use -s flag to print stderr during pytests."
                )
-        if self.benchmark == True:
-            self.benchmark_module(shark_module, inputs, dynamic, device)
+        if self.benchmark is not None:
+            self.benchmark_module(
+                shark_module, inputs, dynamic, device, mode=self.benchmark
+            )

        if self.save_repro == True:
            self.save_reproducers()

-    def benchmark_module(self, shark_module, inputs, dynamic, device):
+    def benchmark_module(
+        self, shark_module, inputs, dynamic, device, mode="native"
+    ):
+        model_config = {
+            "batch_size": self.batch_size,
+        }
        shark_args.enable_tf32 = self.tf32
        if shark_args.enable_tf32 == True:
            shark_module.compile()
-            shark_args.enable_tf32 = False

        shark_args.onnx_bench = self.onnx_bench
        shark_module.shark_runner.benchmark_all_csv(
@@ -222,6 +262,8 @@ class SharkModuleTester:
            dynamic,
            device,
            self.config["framework"],
+            import_args=model_config,
+            mode=mode,
        )

    def save_reproducers(self):
@@ -271,6 +313,9 @@ class SharkModuleTest(unittest.TestCase):
    @parameterized.expand(param_list, name_func=shark_test_name_func)
    def test_module(self, dynamic, device, config):
        self.module_tester = SharkModuleTester(config)
+        self.module_tester.batch_size = self.pytestconfig.getoption(
+            "batchsize"
+        )
        self.module_tester.benchmark = self.pytestconfig.getoption("benchmark")
        self.module_tester.save_repro = self.pytestconfig.getoption(
            "save_repro"
@@ -290,7 +335,12 @@ class SharkModuleTest(unittest.TestCase):
        self.module_tester.update_tank = self.pytestconfig.getoption(
            "update_tank"
        )
-        self.module_tester.tank_url = self.pytestconfig.getoption("tank_url")
+        self.module_tester.force_update_tank = self.pytestconfig.getoption(
+            "force_update_tank"
+        )
+        self.module_tester.shark_tank_prefix = self.pytestconfig.getoption(
+            "tank_prefix"
+        )
        self.module_tester.benchmark_dispatches = self.pytestconfig.getoption(
            "benchmark_dispatches"
        )
@@ -307,19 +357,26 @@ class SharkModuleTest(unittest.TestCase):
        if config["xfail_vkm"] == "True" and device in ["metal", "vulkan"]:
            pytest.xfail(reason=config["xfail_reason"])

-        if os.name == "nt" and "enabled_windows" not in config["xfail_other"]:
+        if (
+            self.pytestconfig.getoption("ci") == True
+            and os.name == "nt"
+            and "enabled_windows" not in config["xfail_other"]
+        ):
            pytest.xfail(reason="this model skipped on windows")

        # Special cases that need to be marked.
-        if "macos" in config["xfail_other"] and device in [
-            "metal",
-            "vulkan",
-        ]:
-            if get_vulkan_triple_flag() is not None:
-                if "m1-moltenvk-macos" in get_vulkan_triple_flag():
-                    pytest.xfail(
-                        reason="conv-related issue on MacStudio, returns VK_ERROR_DEVICE_LOST."
-                    )
+        if (
+            "macos" in config["xfail_other"]
+            and device
+            in [
+                "metal",
+                "vulkan",
+            ]
+            and sys.platform == "darwin"
+        ):
+            pytest.skip(
+                reason="conv-related issue on MacStudio, returns VK_ERROR_DEVICE_LOST."
+            )
        if (
            config["model_name"]
            in [
@@ -342,6 +399,10 @@ class SharkModuleTest(unittest.TestCase):
            pytest.xfail(
                reason="Numerics issues: https://github.com/nod-ai/SHARK/issues/476"
            )
+        if config["framework"] == "tf" and self.module_tester.batch_size != 1:
+            pytest.xfail(
+                reason="Configurable batch sizes temp. unavailable for tensorflow models."
+            )
        safe_name = (
            f"{config['model_name']}_{config['framework']}_{dynamic}_{device}"
        )
--- a/tank/torch_model_list.csv
+++ b/tank/torch_model_list.csv
@@ -1,4 +1,6 @@
 model_name, use_tracing, model_type, dynamic, param_count, tags, notes
+efficientnet_b0,True,vision,False,5.3M,"image-classification;cnn;conv2d;depthwise-conv","Smallest EfficientNet variant with 224x224 input"
+efficientnet_b7,True,vision,False,66M,"image-classification;cnn;conv2d;depthwise-conv","Largest EfficientNet variant with 600x600 input"
 microsoft/MiniLM-L12-H384-uncased,True,hf,True,66M,"nlp;bert-variant;transformer-encoder","Large version has 12 layers; 384 hidden size; Smaller than BERTbase (66M params vs 109M params)"
 bert-base-uncased,True,hf,True,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
 bert-base-cased,True,hf,True,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
@@ -19,9 +21,3 @@ mnasnet1_0,False,vision,True,-,"cnn, torchvision, mobile, architecture-search","
 resnet50_fp16,False,vision,True,23M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
 bert-base-uncased_fp16,True,fp16,False,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
 bert-large-uncased,True,hf,True,330M,"nlp;bert-variant;transformer-encoder","24 layers, 1024 hidden units, 16 attention heads"
-t5-base,True,hf_seq2seq,True,220M,"nlp;transformer-encoder;transformer-decoder","Text-to-Text Transfer Transformer"
-t5-large,True,hf_seq2seq,True,770M,"nlp;transformer-encoder;transformer-decoder","Text-to-Text Transfer Transformer"
-efficientnet_b0,True,vision,False,5.3M,"image-classification;cnn;conv2d;depthwise-conv","Smallest EfficientNet variant with 224x224 input"
-efficientnet_b7,True,vision,False,66M,"image-classification;cnn;conv2d;depthwise-conv","Largest EfficientNet variant with 600x600 input"
-t5-base,True,hf_seq2seq,True,220M,"nlp;transformer-encoder;transformer-decoder","Text-to-Text Transfer Transformer"
-t5-large,True,hf_seq2seq,True,770M,"nlp;transformer-encoder;transformer-decoder","Text-to-Text Transfer Transformer"
--- a/tank/torch_sd_list.csv
+++ b/tank/torch_sd_list.csv
@@ -1,4 +1,3 @@
 model_name, use_tracing, model_type, dynamic, param_count, tags, notes
 stabilityai/stable-diffusion-2-1-base,True,stable_diffusion,False,??M,"stable diffusion 2.1 base, LLM, Text to image", N/A
 stabilityai/stable-diffusion-2-1,True,stable_diffusion,False,??M,"stable diffusion 2.1 base, LLM, Text to image", N/A
-prompthero/openjourney,True,stable_diffusion,False,??M,"stable diffusion 2.1 base, LLM, Text to image", N/A
--- a/tank_version.json
+++ b/tank_version.json
@@ -0,0 +1,3 @@
+{
+	"version": "2023-03-31_02d52bb"
+}
Author	SHA1	Message	Date
m68k-fr	35de7e27fa	[Web] remove txt2img ui dependencies from png import metadata (#1275 )	2023-04-12 07:32:47 -10:00
yzhang93	467f900759	Add auto-tuner to SD apps (#1291 )	2023-04-12 09:21:17 -07:00
Ean Garvey	0bd9d582c7	Add documentation for using SHARK with AI-Render (#1296 )	2023-04-12 03:09:34 -10:00
jinchen62	428cfe8dae	Fix low vram mode issues (#1295 ) - add ondemand back to img2img - workaround memory leak for batch count	2023-04-11 17:59:09 -07:00
Ean Garvey	f17915bedc	Fix batch size appending to model name. (#1294 ) * Update shark_downloader.py * Update shark_downloader.py	2023-04-11 15:34:25 -05:00
Gaurav Shukla	1b49b5149a	[SD] Add Img2Img rest API Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-04-11 23:06:58 +05:30
jinchen62	3002793301	Unload clip on demand and workaround memory leak (#1283 )	2023-04-10 16:59:03 -07:00
Phaneesh Barwaria	d25ef5529f	Add fix for vae fp32 Upscalar (#1284 ) - fixes size mismatch error for upscalar vae	2023-04-07 14:36:40 -05:00
Ean Garvey	308856a947	Touch unet if base cfg needed for SD pipeline init (#1281 )	2023-04-05 03:02:29 -05:00
m68k-fr	151b4e142f	[SD] Fix encoder error for model_max_length not beeing 77 (#1278 ) Co-authored-by: powderluv <powderluv@users.noreply.github.com>	2023-04-04 22:39:29 -07:00
Ean Garvey	e5a69a7c36	pin diffusers to e47459c (#1279 )	2023-04-04 18:29:21 -07:00
m68k-fr	450b6cafc4	[SD] Add weight emphasis to prompts encoder (#1276 )	2023-04-04 09:47:04 -07:00
Daniel Garvey	237d26baa2	update model db to reflect changes (#1277 ) * remove 1/1 tqdm progress bar * update model_db to reflect changes	2023-04-04 11:46:55 -05:00
Daniel Garvey	67d6ee1104	remove 1/1 tqdm progress bar (#1274 )	2023-04-03 22:30:09 -05:00
Ean Garvey	98b069488e	Add tank_version.json (#1272 )	2023-04-03 18:36:23 -07:00
jinchen62	e0f227643a	Fix webui circular import issue (#1271 )	2023-04-03 16:00:10 -07:00
jinchen62	a0af3bb0cb	xload and unload models (#1242 )	2023-04-03 14:42:18 -07:00
powderluv	2cd61a5b96	strip source map (#1270 )	2023-04-03 14:41:32 -07:00
Gaurav Shukla	f49d41a807	[SD] Add Stable diffusion text2image rest API (#1265 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-04-03 12:02:24 -07:00
Ean Garvey	2191fc8952	Separate pytest benchmark modes and fix model updates for SHARK downloader / pytest. (#1264 ) * Only xfail windows models in CI * downloader: make model updates more robust. * Separate baseline and native benchmarks in pytest. * Fix native benchmarks * Fix torchvision model utils.	2023-04-03 08:24:21 -07:00
PhaneeshB	aea7796e60	add gradio client to spec	2023-04-03 18:57:19 +05:30
Abhishek Varma	a376619f1e	[SD] Improve vmfb caching algo and retry mechanism (#1248 ) -- This commit gets rid of the all-or-nothing vmfb caching mechanism and improves the retry mechanism by providing lower-level granularity for compiling each model units. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> Co-authored-by: Abhishek Varma <abhishek@nod-labs.com> Co-authored-by: Ean Garvey <87458719+monorimet@users.noreply.github.com>	2023-03-31 09:38:14 -07:00
powderluv	02d52bb626	Add Intel ARC A770 target triple (#1263 ) This just enables the plumbing. It generates black images.	2023-03-29 14:49:05 -07:00
Abhishek Varma	3b63645f79	[SD] Fix custom model path for WebUI (#1260 ) -- This commit fixes custom model path for WebUI. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>	2023-03-29 09:48:11 -07:00
Ean Garvey	d6f740b998	allow pytest to retry getting model artifacts + disable autotuning for pytorch benchmarks (#1257 ) * Adds a few xfails to enable macOS builder * Convert string batch sizes to ints where needed. * allow pytest to retry getting model artifacts * Reduce attempts and add assert msg.	2023-03-28 23:38:45 -05:00
Daniel Garvey	594c6b8ea2	fix ckpt dir (#1258 )	2023-03-28 14:31:01 -07:00
Ean Garvey	96b1560da5	Make batch size configurable via pytest and fix sharktank generation. (#1227 ) * Fix sharktank generation and add batch_size pytest option for torch. * Disable torch dynamo until py3.11 supported * Compile torchmodel without dynamo if torch.compile fails * Use release versions of TF/Keras for importer. * Pin torchvision and remove debug prints. * Remove duplicates from torch model list. * Update generate_sharktank.py * xfail a few models that fail sharktank generation/ numerics	2023-03-28 14:33:39 -05:00
Abhishek Varma	0ef6a0e234	[SD] Fix Stencil scribble crash by updating image resize (#1255 ) -- This commit updates Stencil resize feature to cap the size of images within [128,768] as supported by the SD pipeline. -- This solves the issue of scribble crashing on larger image. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>	2023-03-28 10:13:11 -07:00
Gaurav Shukla	641d535f44	[SD] Fix device path issue for cpu (#1256 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-03-28 10:09:49 -07:00
Daniel Garvey	5bb7846227	single entry point exe for all cli apps (#1158 ) usage: add --app="img2img" (or "inpaint" "outpaint" "txt2img")	2023-03-28 11:15:21 -05:00
yzhang93	8f84258fb8	Fix check for use_tuned conditions (#1252 )	2023-03-27 11:21:25 -07:00
Ean Garvey	7619e76bbd	Disable and xfail some models that fail validation/compilation. (#1251 ) * Rollback T5 models for torch as the inputs give some issues that aren't trivial to resolve * xfail efficientnet-b0 on torch+cuda -- see CUDA requesting shared memory size larger than allowed size openxla/iree#12771	2023-03-27 12:42:53 -05:00
Daniel Garvey	9267eadbfa	disable openjourney gen for nightly (#1249 )	2023-03-27 11:55:34 -05:00
Phaneesh Barwaria	431132b8ee	Fix img2img mode switch (#1247 ) * add updated scheduler value in global config * clear scheduler global variable with others	2023-03-27 07:01:22 -07:00
cstueckrath	fb35e13e7a	fix Python version detection bug (#1246 ) * fix Python version detection bug * Update setup_venv.ps1	2023-03-27 07:00:40 -07:00
yzhang93	17a67897d1	Add SD v2.1 768x768 tuned model (#1244 ) Co-authored-by: powderluv <powderluv@users.noreply.github.com>	2023-03-24 10:39:15 -07:00
Gaurav Shukla	da449b73aa	[SD] Disable lora training tab for now (#1241 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-03-24 09:16:24 -07:00
Kyle Herndon	0b0526699a	Fix incorrect device argument initialization for LoRA training by extracting the device type and number and formatting it for pytorch (#1237 ) Co-authored-by: Kyle Herndon <kyle@nod-labs.com>	2023-03-24 01:10:50 -07:00
Boian Petkantchin	4fac46f7bb	In models testing fix paths to be relative to the script dir not cwd (#1128 ) authored-by: Boian Petkantchin <boian@nod-labs.com>	2023-03-22 15:26:52 -05:00
Daniel Garvey	49925950f1	fix false positives (#1193 )	2023-03-22 15:25:39 -05:00
Thomas	807947c0c8	Remove deprecated cli option iree-hal-cuda-disable-loop-nounroll-wa (#1235 )	2023-03-22 12:05:15 -05:00
Abhishek Varma	593428bda4	[SD] Fix for transformers/__init__.py issue in PyInstaller (#1233 ) -- This commit fixes the transformers/__init__.py issue in PyInstaller. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>	2023-03-22 08:43:53 -07:00
Abhishek Varma	cede9b4fec	[SD] Fix custom_vae as a required parameter in inpaint (#1232 )	2023-03-22 04:30:17 -07:00
Prashant Kumar	c2360303f0	Add the int8 quantized model.	2023-03-22 16:28:13 +05:30
jinchen62	420366c1b8	Move schedulers to global obj (#1225 )	2023-03-21 22:40:43 -07:00