add generate_sharktank for stable_diffusion model defaults (#742)
Co-authored-by: dan <dan@nod-labs.com>
Co-authored-by: powderluv <powderluv@users.noreply.github.com>
build_tools/stable_diff_main_test.sh (new file, +7 lines)
@@ -0,0 +1,7 @@
rm -rf ./test_images
mkdir test_images
python shark/examples/shark_inference/stable_diffusion/main.py --device=vulkan --output_dir=./test_images --no-load_vmfb --no-use_tuned
python shark/examples/shark_inference/stable_diffusion/main.py --device=vulkan --output_dir=./test_images --no-load_vmfb --no-use_tuned --beta_models=True

python build_tools/image_comparison.py -n ./test_images/*.png
exit $?
@@ -18,6 +18,12 @@ import subprocess as sp
 import hashlib
 import numpy as np
 from pathlib import Path
+from shark.examples.shark_inference.stable_diffusion import (
+    model_wrappers as mw,
+)
+from shark.examples.shark_inference.stable_diffusion.stable_args import (
+    args,
+)


 def create_hash(file_name):
@@ -51,6 +57,32 @@ def save_torch_model(torch_model_list):

         model = None
         input = None
+        if model_type == "stable_diffusion":
+
+            args.use_tuned = False
+            args.import_mlir = True
+            args.use_tuned = False
+            args.local_tank_cache = WORKDIR
+
+            precision_values = ["fp16"]
+            seq_lengths = [64, 77]
+            for precision_value in precision_values:
+                args.precision = precision_value
+                for length in seq_lengths:
+                    model = mw.SharkifyStableDiffusionModel(
+                        model_id=torch_model_name,
+                        custom_weights="",
+                        precision=precision_value,
+                        max_len=length,
+                        width=512,
+                        height=512,
+                        use_base_vae=False,
+                        debug=True,
+                        sharktank_dir=WORKDIR,
+                        generate_vmfb=False,
+                    )
+                    model()
+            continue
         if model_type == "vision":
             model, input, _ = get_vision_model(torch_model_name)
         elif model_type == "hf":
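Note: the new stable_diffusion branch above sweeps a small configuration grid instead of loading a single model. A minimal sketch of the equivalent flat enumeration, with the values taken from the diff:

# Equivalent flat enumeration of the sweep above. Each configuration
# instantiates SharkifyStableDiffusionModel with debug=True and
# generate_vmfb=False, so only debug artifacts (MLIR plus golden values)
# are written under WORKDIR; no .vmfb is compiled.
configs = [
    (precision, length)
    for precision in ["fp16"]
    for length in [64, 77]
]
print(configs)  # [('fp16', 64), ('fp16', 77)]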
@@ -205,34 +237,35 @@ def is_valid_file(arg):


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--torch_model_csv",
-        type=lambda x: is_valid_file(x),
-        default="./tank/torch_model_list.csv",
-        help="""Contains the file with torch_model name and args.
-             Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
-    )
-    parser.add_argument(
-        "--tf_model_csv",
-        type=lambda x: is_valid_file(x),
-        default="./tank/tf_model_list.csv",
-        help="Contains the file with tf model name and args.",
-    )
-    parser.add_argument(
-        "--tflite_model_csv",
-        type=lambda x: is_valid_file(x),
-        default="./tank/tflite/tflite_model_list.csv",
-        help="Contains the file with tf model name and args.",
-    )
-    parser.add_argument(
-        "--ci_tank_dir",
-        type=bool,
-        default=False,
-    )
-    parser.add_argument("--upload", type=bool, default=False)
+    # Note: all of these flags are overridden by the import of args from stable_args.py; the flags are duplicated here temporarily to preserve functionality
+    # parser = argparse.ArgumentParser()
+    # parser.add_argument(
+    #     "--torch_model_csv",
+    #     type=lambda x: is_valid_file(x),
+    #     default="./tank/torch_model_list.csv",
+    #     help="""Contains the file with torch_model name and args.
+    #          Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
+    # )
+    # parser.add_argument(
+    #     "--tf_model_csv",
+    #     type=lambda x: is_valid_file(x),
+    #     default="./tank/tf_model_list.csv",
+    #     help="Contains the file with tf model name and args.",
+    # )
+    # parser.add_argument(
+    #     "--tflite_model_csv",
+    #     type=lambda x: is_valid_file(x),
+    #     default="./tank/tflite/tflite_model_list.csv",
+    #     help="Contains the file with tf model name and args.",
+    # )
+    # parser.add_argument(
+    #     "--ci_tank_dir",
+    #     type=bool,
+    #     default=False,
+    # )
+    # parser.add_argument("--upload", type=bool, default=False)

-    args = parser.parse_args()
+    # old_args = parser.parse_args()

     home = str(Path.home())
     if args.ci_tank_dir == True:
@@ -1,10 +1,13 @@
 import sys
+import os
+
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 from diffusers import AutoencoderKL, UNet2DConditionModel
 from transformers import CLIPTextModel
 from utils import compile_through_fx, get_opt_flags
 from resources import base_models
 from collections import defaultdict
 import torch
-import sys
+

 # These shapes are parameter dependent.
@@ -63,6 +66,9 @@ class SharkifyStableDiffusionModel:
         batch_size: int = 1,
         use_base_vae: bool = False,
         use_tuned: bool = False,
+        debug: bool = False,
+        sharktank_dir: str = "",
+        generate_vmfb: bool = True,
     ):
         self.check_params(max_len, width, height)
         self.max_len = max_len
@@ -73,7 +79,8 @@ class SharkifyStableDiffusionModel:
         self.precision = precision
         self.base_vae = use_base_vae
         self.model_name = (
-            str(batch_size)
+            "_"
+            + str(batch_size)
             + "_"
             + str(max_len)
             + "_"
@@ -84,6 +91,9 @@ class SharkifyStableDiffusionModel:
             + precision
         )
         self.use_tuned = use_tuned
+        self.debug = debug
+        self.sharktank_dir = sharktank_dir
+        self.generate_vmfb = generate_vmfb
         # We need a better naming convention for the .vmfbs because despite
         # using the custom model variant the .vmfb names remain the same and
         # it'll always pick up the compiled .vmfb instead of compiling the
@@ -130,13 +140,20 @@ class SharkifyStableDiffusionModel:
         inputs = tuple(self.inputs["vae"])
         is_f16 = True if self.precision == "fp16" else False
         vae_name = "base_vae" if self.base_vae else "vae"
+        vae_model_name = vae_name + self.model_name
+        if self.debug:
+            os.makedirs(
+                os.path.join(self.sharktank_dir, vae_model_name), exist_ok=True
+            )
         shark_vae = compile_through_fx(
             vae,
             inputs,
             is_f16=is_f16,
-            model_name=vae_name + self.model_name,
             use_tuned=self.use_tuned,
+            model_name=vae_model_name,
             extra_args=get_opt_flags("vae", precision=self.precision),
+            debug=self.debug,
+            generate_vmfb=self.generate_vmfb,
         )
         return shark_vae

@@ -169,14 +186,22 @@ class SharkifyStableDiffusionModel:
         is_f16 = True if self.precision == "fp16" else False
         inputs = tuple(self.inputs["unet"])
         input_mask = [True, True, True, False]
+        unet_model_name = "unet" + self.model_name
+        if self.debug:
+            os.makedirs(
+                os.path.join(self.sharktank_dir, unet_model_name),
+                exist_ok=True,
+            )
         shark_unet = compile_through_fx(
             unet,
             inputs,
-            model_name="unet" + self.model_name,
+            model_name=unet_model_name,
             is_f16=is_f16,
             f16_input_mask=input_mask,
             use_tuned=self.use_tuned,
             extra_args=get_opt_flags("unet", precision=self.precision),
+            debug=self.debug,
+            generate_vmfb=self.generate_vmfb,
         )
         return shark_unet

@@ -193,12 +218,20 @@ class SharkifyStableDiffusionModel:
                 return self.text_encoder(input)[0]

         clip_model = CLIPText()
+        clip_model_name = "clip" + self.model_name
+        if self.debug:
+            os.makedirs(
+                os.path.join(self.sharktank_dir, clip_model_name),
+                exist_ok=True,
+            )

         shark_clip = compile_through_fx(
             clip_model,
             tuple(self.inputs["clip"]),
-            model_name="clip" + self.model_name,
+            model_name=clip_model_name,
             extra_args=get_opt_flags("clip", precision="fp32"),
+            debug=self.debug,
+            generate_vmfb=self.generate_vmfb,
         )
         return shark_clip

@@ -1,8 +1,11 @@
 import sys
-from resources import models_db
+import resources
 from stable_args import args
 from utils import get_shark_model

+models_db = (
+    resources.beta_models_db if args.beta_models else resources.models_db
+)
 BATCH_SIZE = len(args.prompts)
 if BATCH_SIZE != 1:
     sys.exit("Only batch size 1 is supported.")
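Note: models_db here is the JSON parsed in resources.py, and judging by the beta_model_db.json added later in this diff it is a three-element list (bucket map, artifact-name map, compilation-flag map). A minimal lookup sketch under that assumption, with illustrative keys copied from the db:

# Sketch only; assumes the three-element db layout described above and
# continues from the models_db selection shown in the hunk.
bucket_map, name_map, flag_map = models_db
tank_url = bucket_map["stablediffusion/untuned"]  # "gs://shark_tank/latest"
artifact = name_map["stablediffusion/v1_4/unet/fp16/length_77/untuned"]  # "unet_8dec_fp16"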
@@ -28,6 +28,7 @@ def get_json_file(path):
 # it will run all the global vars.
 prompts_examples = get_json_file("resources/prompts.json")
 models_db = get_json_file("resources/model_db.json")
+beta_models_db = get_json_file("resources/beta_model_db.json")

 # The base_model contains the input configuration for the different
 # models and also helps in providing information for the variants.
@@ -0,0 +1,177 @@
[
    {
        "stablediffusion/untuned":"gs://shark_tank/latest",
        "stablediffusion/tuned":"gs://shark_tank/sd_tuned",
        "stablediffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
        "anythingv3/untuned":"gs://shark_tank/sd_anythingv3",
        "anythingv3/tuned":"gs://shark_tank/sd_tuned",
        "anythingv3/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
        "analogdiffusion/untuned":"gs://shark_tank/sd_analog_diffusion",
        "analogdiffusion/tuned":"gs://shark_tank/sd_tuned",
        "analogdiffusion/tuned/cuda":"gs://shark_tank/sd_tuned/cuda",
        "openjourney/untuned":"gs://shark_tank/sd_openjourney",
        "openjourney/tuned":"gs://shark_tank/sd_tuned",
        "dreamlike/untuned":"gs://shark_tank/sd_dreamlike_diffusion"
    },
    {
        "stablediffusion/v1_4/unet/fp16/length_77/untuned":"unet_8dec_fp16",
        "stablediffusion/v1_4/unet/fp16/length_77/tuned":"unet_8dec_fp16_tuned",
        "stablediffusion/v1_4/unet/fp16/length_77/tuned/cuda":"unet_8dec_fp16_cuda_tuned",
        "stablediffusion/v1_4/unet/fp32/length_77/untuned":"unet_1dec_fp32",
        "stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_19dec_fp16",
        "stablediffusion/v1_4/vae/fp16/length_77/tuned":"vae_19dec_fp16_tuned",
        "stablediffusion/v1_4/vae/fp16/length_77/tuned/cuda":"vae_19dec_fp16_cuda_tuned",
        "stablediffusion/v1_4/vae/fp16/length_77/untuned/base":"vae_8dec_fp16",
        "stablediffusion/v1_4/vae/fp32/length_77/untuned":"vae_1dec_fp32",
        "stablediffusion/v1_4/clip/fp32/length_77/untuned":"clip_18dec_fp32",
        "stablediffusion/v2_1base/unet/fp16/length_77/untuned":"unet77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
        "stablediffusion/v2_1base/unet/fp16/length_77/tuned":"unet2base_8dec_fp16_tuned_v2",
        "stablediffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"unet2base_8dec_fp16_cuda_tuned",
        "stablediffusion/v2_1base/unet/fp16/length_64/untuned":"unet64_512_512_fp16_stabilityai_stable_diffusion_2_1_basec",
        "stablediffusion/v2_1base/unet/fp16/length_64/tuned":"unet_19dec_v2p1base_fp16_64_tuned",
        "stablediffusion/v2_1base/unet/fp16/length_64/tuned/cuda":"unet_19dec_v2p1base_fp16_64_cuda_tuned",
        "stablediffusion/v2_1base/vae/fp16/length_77/untuned":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
        "stablediffusion/v2_1base/vae/fp16/length_77/tuned":"vae2base_19dec_fp16_tuned",
        "stablediffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"vae2base_19dec_fp16_cuda_tuned",
        "stablediffusion/v2_1base/vae/fp16/length_77/untuned/base":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
        "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base":"vae2base_8dec_fp16_tuned",
        "stablediffusion/v2_1base/vae/fp16/length_77/tuned/base/cuda":"vae2base_8dec_fp16_cuda_tuned",
        "stablediffusion/v2_1base/clip/fp32/length_77/untuned":"clip77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
        "stablediffusion/v2_1base/clip/fp32/length_64/untuned":"clip64_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
        "stablediffusion/v2_1/unet/fp16/length_77/untuned":"unet77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
        "stablediffusion/v2_1/vae/fp16/length_77/untuned":"vae77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
        "stablediffusion/v2_1/vae/fp16/length_77/untuned/base":"77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
        "stablediffusion/v2_1/clip/fp32/length_77/untuned":"clip77_512_512_fp16_stabilityai_stable_diffusion_2_1_base",
        "anythingv3/v2_1base/unet/fp16/length_77/untuned":"av3_unet_19dec_fp16",
        "anythingv3/v2_1base/unet/fp16/length_77/tuned":"av3_unet_19dec_fp16_tuned",
        "anythingv3/v2_1base/unet/fp16/length_77/tuned/cuda":"av3_unet_19dec_fp16_cuda_tuned",
        "anythingv3/v2_1base/unet/fp32/length_77/untuned":"av3_unet_19dec_fp32",
        "anythingv3/v2_1base/vae/fp16/length_77/untuned":"av3_vae_19dec_fp16",
        "anythingv3/v2_1base/vae/fp16/length_77/tuned":"av3_vae_19dec_fp16_tuned",
        "anythingv3/v2_1base/vae/fp16/length_77/tuned/cuda":"av3_vae_19dec_fp16_cuda_tuned",
        "anythingv3/v2_1base/vae/fp16/length_77/untuned/base":"av3_vaebase_22dec_fp16",
        "anythingv3/v2_1base/vae/fp32/length_77/untuned":"av3_vae_19dec_fp32",
        "anythingv3/v2_1base/vae/fp32/length_77/untuned/base":"av3_vaebase_22dec_fp32",
        "anythingv3/v2_1base/clip/fp32/length_77/untuned":"av3_clip_19dec_fp32",
        "analogdiffusion/v2_1base/unet/fp16/length_77/untuned":"ad_unet_19dec_fp16",
        "analogdiffusion/v2_1base/unet/fp16/length_77/tuned":"ad_unet_19dec_fp16_tuned",
        "analogdiffusion/v2_1base/unet/fp16/length_77/tuned/cuda":"ad_unet_19dec_fp16_cuda_tuned",
        "analogdiffusion/v2_1base/unet/fp32/length_77/untuned":"ad_unet_19dec_fp32",
        "analogdiffusion/v2_1base/vae/fp16/length_77/untuned":"ad_vae_19dec_fp16",
        "analogdiffusion/v2_1base/vae/fp16/length_77/tuned":"ad_vae_19dec_fp16_tuned",
        "analogdiffusion/v2_1base/vae/fp16/length_77/tuned/cuda":"ad_vae_19dec_fp16_cuda_tuned",
        "analogdiffusion/v2_1base/vae/fp16/length_77/untuned/base":"ad_vaebase_22dec_fp16",
        "analogdiffusion/v2_1base/vae/fp32/length_77/untuned":"ad_vae_19dec_fp32",
        "analogdiffusion/v2_1base/vae/fp32/length_77/untuned/base":"ad_vaebase_22dec_fp32",
        "analogdiffusion/v2_1base/clip/fp32/length_77/untuned":"ad_clip_19dec_fp32",
        "openjourney/v2_1base/unet/fp16/length_64/untuned":"oj_unet_22dec_fp16_64",
        "openjourney/v2_1base/unet/fp32/length_64/untuned":"oj_unet_22dec_fp32_64",
        "openjourney/v2_1base/vae/fp16/length_77/untuned":"oj_vae_22dec_fp16",
        "openjourney/v2_1base/vae/fp16/length_77/untuned/base":"oj_vaebase_22dec_fp16",
        "openjourney/v2_1base/vae/fp32/length_77/untuned":"oj_vae_22dec_fp32",
        "openjourney/v2_1base/vae/fp32/length_77/untuned/base":"oj_vaebase_22dec_fp32",
        "openjourney/v2_1base/clip/fp32/length_64/untuned":"oj_clip_22dec_fp32_64",
        "dreamlike/v2_1base/unet/fp16/length_77/untuned":"dl_unet_23dec_fp16_77",
        "dreamlike/v2_1base/unet/fp32/length_77/untuned":"dl_unet_23dec_fp32_77",
        "dreamlike/v2_1base/vae/fp16/length_77/untuned":"dl_vae_23dec_fp16",
        "dreamlike/v2_1base/vae/fp16/length_77/untuned/base":"dl_vaebase_23dec_fp16",
        "dreamlike/v2_1base/vae/fp32/length_77/untuned":"dl_vae_23dec_fp32",
        "dreamlike/v2_1base/vae/fp32/length_77/untuned/base":"dl_vaebase_23dec_fp32",
        "dreamlike/v2_1base/clip/fp32/length_77/untuned":"dl_clip_23dec_fp32_77"
    },
    {
        "unet": {
            "tuned": {
                "fp16": {
                    "default_compilation_flags": []
                },
                "fp32": {
                    "default_compilation_flags": []
                }
            },
            "untuned": {
                "fp16": {
                    "default_compilation_flags": [
                        "--iree-flow-enable-padding-linalg-ops",
                        "--iree-flow-linalg-ops-padding-size=32"
                    ],
                    "specified_compilation_flags": {
                        "cuda": ["--iree-flow-enable-conv-nchw-to-nhwc-transform"],
                        "default_device": ["--iree-flow-enable-conv-img2col-transform"]
                    }
                },
                "fp32": {
                    "default_compilation_flags": [
                        "--iree-flow-enable-conv-nchw-to-nhwc-transform",
                        "--iree-flow-enable-padding-linalg-ops",
                        "--iree-flow-linalg-ops-padding-size=16"
                    ]
                }
            }
        },
        "vae": {
            "tuned": {
                "fp16": {
                    "default_compilation_flags": [
                        "--iree-flow-enable-padding-linalg-ops",
                        "--iree-flow-linalg-ops-padding-size=32",
                        "--iree-flow-enable-conv-img2col-transform"
                    ]
                },
                "fp32": {
                    "default_compilation_flags": [
                        "--iree-flow-enable-padding-linalg-ops",
                        "--iree-flow-linalg-ops-padding-size=32",
                        "--iree-flow-enable-conv-img2col-transform"
                    ]
                }
            },
            "untuned": {
                "fp16": {
                    "default_compilation_flags": [
                        "--iree-flow-enable-padding-linalg-ops",
                        "--iree-flow-linalg-ops-padding-size=32",
                        "--iree-flow-enable-conv-img2col-transform"
                    ]
                },
                "fp32": {
                    "default_compilation_flags": [
                        "--iree-flow-enable-conv-nchw-to-nhwc-transform",
                        "--iree-flow-enable-padding-linalg-ops",
                        "--iree-flow-linalg-ops-padding-size=16"
                    ]
                }
            }
        },
        "clip": {
            "tuned": {
                "fp16": {
                    "default_compilation_flags": [
                        "--iree-flow-linalg-ops-padding-size=16",
                        "--iree-flow-enable-padding-linalg-ops"
                    ]
                },
                "fp32": {
                    "default_compilation_flags": [
                        "--iree-flow-linalg-ops-padding-size=16",
                        "--iree-flow-enable-padding-linalg-ops"
                    ]
                }
            },
            "untuned": {
                "fp16": {
                    "default_compilation_flags": [
                        "--iree-flow-linalg-ops-padding-size=16",
                        "--iree-flow-enable-padding-linalg-ops"
                    ]
                },
                "fp32": {
                    "default_compilation_flags": [
                        "--iree-flow-linalg-ops-padding-size=16",
                        "--iree-flow-enable-padding-linalg-ops"
                    ]
                }
            }
        }
    }
]
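Note: the third element of the list above drives compilation flags. A minimal sketch of how it can be queried; get_opt_flags in utils is the real accessor per the imports earlier in this diff, and the helper below is hypothetical:

import json

def opt_flags(db_path, model, tuned, precision, device="default_device"):
    # db[2] is the flag map: model -> tuned/untuned -> precision -> flags.
    with open(db_path) as f:
        db = json.load(f)
    entry = db[2][model]["tuned" if tuned else "untuned"][precision]
    flags = list(entry["default_compilation_flags"])
    # Some entries add device-specific flags under specified_compilation_flags.
    flags += entry.get("specified_compilation_flags", {}).get(device, [])
    return flags

# e.g. opt_flags("resources/beta_model_db.json", "unet", False, "fp16")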
@@ -1,3 +1,4 @@
+import os
 import argparse
 from pathlib import Path

@@ -6,6 +7,13 @@ def path_expand(s):
     return Path(s).expanduser().resolve()


+def is_valid_file(arg):
+    if not os.path.exists(arg):
+        return None
+    else:
+        return arg
+
+
 p = argparse.ArgumentParser(
     description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
 )
@@ -174,7 +182,12 @@ p.add_argument(
     action=argparse.BooleanOptionalAction,
     help="Enable showing the stack trace when retrying the base model configuration",
 )

+p.add_argument(
+    "--beta_models",
+    default=False,
+    type=bool,
+    help="(False/True), use beta model files",
+)
 ##############################################################################
 ### IREE - Vulkan supported flags
 ##############################################################################
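One caveat worth noting for --beta_models (and for the type=bool CI flags added further down): argparse does not convert the string "False" to False, because bool() on any non-empty string is True. BooleanOptionalAction, already used for other flags in this file, avoids the surprise:

import argparse

p = argparse.ArgumentParser()
p.add_argument("--beta_models", default=False, type=bool)
args = p.parse_args(["--beta_models", "False"])
print(args.beta_models)  # True, not False: bool("False") is True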
@@ -299,4 +312,47 @@ p.add_argument(
     help="Options are unet and vae.",
 )

+p.add_argument(
+    "--use_winograd",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Apply Winograd on selected conv ops.",
+)
+##############################################################################
+### CI generation tags
+##############################################################################
+
+# TODO: remove from here once argparse is not required by half of sd, none of these are relevant to main.py
+
+p.add_argument(
+    "--ci_tank_dir",
+    default=True,
+    type=bool,
+    help="used for CI generation purposes only.",
+)
+p.add_argument(
+    "--upload",
+    default=False,
+    type=bool,
+    help="upload generated models to shark tank (builder only), irrelevant to main.py",
+)
+p.add_argument(
+    "--torch_model_csv",
+    type=lambda x: is_valid_file(x),
+    default="./tank/torch_model_list.csv",
+    help="""Contains the file with torch_model name and args.
+         Please see: https://github.com/nod-ai/SHARK/blob/main/tank/torch_model_list.csv""",
+)
+p.add_argument(
+    "--tf_model_csv",
+    type=lambda x: is_valid_file(x),
+    default="./tank/tf_model_list.csv",
+    help="Contains the file with tf model name and args.",
+)
+p.add_argument(
+    "--tflite_model_csv",
+    type=lambda x: is_valid_file(x),
+    default="./tank/tflite/tflite_model_list.csv",
+    help="Contains the file with tf model name and args.",
+)
 args = p.parse_args()
@@ -1,7 +1,9 @@
+import os
 import gc
 import tempfile
 import torch
 from shark.shark_inference import SharkInference
-from stable_args import args
+from shark.examples.shark_inference.stable_diffusion.stable_args import args
+from shark.shark_importer import import_with_fx
 from shark.iree_utils.vulkan_utils import (
     set_iree_vulkan_runtime_flags,
@@ -81,6 +83,9 @@ def compile_through_fx(
     f16_input_mask=None,
     use_tuned=False,
     extra_args=[],
+    save_dir=tempfile.gettempdir(),
+    debug=False,
+    generate_vmfb=True,
 ):

     from shark.parser import shark_args
@@ -109,13 +114,25 @@ def compile_through_fx(
         mlir_module = f.read()
         f.close()

-    shark_module = SharkInference(
-        mlir_module,
-        device=args.device,
-        mlir_dialect="linalg",
-    )
+    save_dir = os.path.join(args.local_tank_cache, model_name)

-    return _compile_module(shark_module, model_name, extra_args)
+    mlir_module, func_name, = import_with_fx(
+        model=model,
+        inputs=inputs,
+        is_f16=is_f16,
+        f16_input_mask=f16_input_mask,
+        debug=debug,
+        model_name=model_name,
+        save_dir=save_dir,
+    )
+    if generate_vmfb:
+        shark_module = SharkInference(
+            mlir_module,
+            device=args.device,
+            mlir_dialect="linalg",
+        )
+
+        return _compile_module(shark_module, model_name, extra_args)


 def set_iree_runtime_flags():
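Net effect of the compile_through_fx changes above: MLIR import (and, in debug mode, artifact dumping) always runs, while .vmfb compilation is now gated on generate_vmfb. A runnable control-flow sketch with stand-in callables; import_fn and compile_fn are hypothetical placeholders for import_with_fx and SharkInference plus _compile_module:

def compile_flow(model_name, generate_vmfb, import_fn, compile_fn):
    mlir_module = import_fn(model_name)  # always import; debug artifacts saved here
    if generate_vmfb:
        return compile_fn(mlir_module)   # build the .vmfb only when asked
    return None                          # the generate_sharktank path stops after import

assert compile_flow("unet", False, lambda n: "mlir", lambda m: "vmfb") is None
assert compile_flow("unet", True, lambda n: "mlir", lambda m: "vmfb") == "vmfb"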
@@ -267,6 +284,23 @@ def set_init_device_flags():
     ]:
         args.use_tuned = False

+    # Use tuned model in the case of stablediffusion/fp16 and cuda device sm_80
+    if (
+        args.hf_model_id
+        in [
+            "stabilityai/stable-diffusion-2-1-base",
+            "Linaqruf/anything-v3.0",
+            "wavymulder/Analog-Diffusion",
+        ]
+        and args.precision == "fp16"
+        and "cuda" in args.device
+        and get_cuda_sm_cc() in ["sm_80", "sm_89"]
+        and args.use_tuned  # required to avoid always forcing true on these cards
+    ):
+        args.use_tuned = True
+    else:
+        args.use_tuned = False
+
     if args.use_tuned:
         print(f"Using {args.device} tuned models for stablediffusion/fp16.")
     else:
@@ -164,6 +164,7 @@ class SharkImporter:
         func_name="forward",
         dir=tempfile.gettempdir(),
         model_name="model",
+        golden_values=None,
     ):
         if self.inputs == None:
             print(
@@ -183,7 +184,11 @@ class SharkImporter:
         if self.frontend in ["torch", "pytorch"]:
             import torch

-            golden_out = self.module(*self.inputs)
+            golden_out = None
+            if golden_values is not None:
+                golden_out = golden_values
+            else:
+                golden_out = self.module(*self.inputs)
             if torch.is_tensor(golden_out):
                 golden_out = tuple(
                     golden_out.detach().cpu().numpy(),
@@ -364,11 +369,16 @@ def import_with_fx(
     debug=False,
     training=False,
     return_str=False,
+    save_dir=tempfile.gettempdir(),
+    model_name="model",
 ):
     import torch
     from torch.fx.experimental.proxy_tensor import make_fx
     from torch._decomp import get_decompositions

+    golden_values = None
+    if debug:
+        golden_values = model(*inputs)
     # TODO: Control the decompositions.
     fx_g = make_fx(
         model,
@@ -422,8 +432,10 @@ def import_with_fx(
         return_str=return_str,
     )

-    if debug and not is_f16:
-        (mlir_module, func_name), _, _ = mlir_importer.import_debug()
+    if debug:  # and not is_f16:
+        (mlir_module, func_name), _, _ = mlir_importer.import_debug(
+            dir=save_dir, model_name=model_name, golden_values=golden_values
+        )
         return mlir_module, func_name

     mlir_module, func_name = mlir_importer.import_mlir()
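The golden_values plumbing above lets reference outputs be captured from the original model before fx tracing and any fp16 conversion, then handed to import_debug for saving; previously the debug path was skipped entirely when is_f16 was set. A small sketch of the capture-first pattern, with a toy model standing in for the real pipeline:

import torch

def capture_golden(model, inputs, debug):
    # Run the untouched model once so reference outputs exist even if the
    # traced module is later converted to fp16.
    return model(*inputs) if debug else None

model = torch.nn.Linear(4, 2)
inputs = (torch.randn(1, 4),)
golden = capture_golden(model, inputs, debug=True)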