Compare commits

..

26 Commits

Author SHA1 Message Date
Ean Garvey
dd40a3fafe Add simple OPT tuning script. 2023-06-21 20:02:42 -05:00
Ean Garvey
bf6fcc353a fix formatting 2023-06-21 09:26:33 -05:00
Ean Garvey
918eba6524 Exclude non-square sizes from use_tuned on rdna2 2023-06-21 08:42:19 -05:00
AyaanShah2204
d61b6641fb Rest API: Resolved Generator Object not Subscriptable error (#1556) 2023-06-20 19:27:41 -07:00
Phaneesh Barwaria
88cc2423cc Enable Vicuna fp16 cpu (#1562)
* fix second vic mlir gen

* fp16 mlir/vmfb download from shark_tank
2023-06-20 13:43:21 -05:00
Ean Garvey
ccf944c1bd Enable tuner for upscaler unet. (#1563) 2023-06-20 13:40:13 -05:00
Ean Garvey
0def74f520 [SD] Update unet in_channels API and add PIL metadata to spec. (#1560)
* Fix deprecation warning for unet config.

* Include PIL metadata instead of hidden imports in SD spec.
2023-06-20 10:26:36 -07:00
Abhishek Varma
3fb72e192e Add patch for making compile API work for both MEGABYTE and MiniGPT4 (#1559)
-- It also modifies the mega_test.py script

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-06-20 10:04:17 -07:00
Vivek Khandelwal
855435ee24 Fix for the user input for Falcon pipeline 2023-06-20 18:09:32 +05:30
Elias Joseph
6f9f868fc0 fixed a bug where designating device for vicuna didn't work 2023-06-20 17:09:32 +05:30
powderluv
fb865f1b99 Move to checkout@v3
This will break Windows again, but we have to fix it up since the old Node.js is now deprecated.
2023-06-19 18:44:36 -07:00
rprasad2
3e5c50f07b changes for tuning (#1542)
* Add tuning sizes for rdna3
2023-06-19 15:29:08 -05:00
powderluv
a544f30a8f Move mega to the shark examples (#1555) 2023-06-19 11:10:51 -07:00
Abhishek Varma
1fe56d460a [MEGABYTE] Add script to compile MEGABYTE through SHARK (#1553)
-- Usage: `python mega_test.py`.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-06-19 11:00:35 -07:00
Vivek Khandelwal
fafd713141 Minor change to falcon pipeline 2023-06-19 22:36:32 +05:30
Vivek Khandelwal
015d0132c3 Modify falcon pipeline to add fp16 support (#1551) 2023-06-19 09:57:13 -07:00
powderluv
20ddd96ef7 unpin diffusers (#1550) 2023-06-18 13:45:55 -07:00
powderluv
ee33cfd2d1 Add PIL in main index.py (#1549)
* Add PIL in main index.py

This is to ensure pyinstaller picks it up

* Update index.py
2023-06-18 11:51:44 -07:00
Stefan Kapusniak
a3cba21d5b Fix load of unet512 vmfb fail on get of iree opts (#1546)
* Change retrieval of the IREE options used when loading an existing
unet512 vmfb to look up the "unet" options rather than attempting to
find a non-existent set of options for "unet512"

Co-authored-by: powderluv <powderluv@users.noreply.github.com>
2023-06-18 06:42:20 -07:00
Stefan Kapusniak
a7b6ec4095 Fix unet512 always being used when --max_length=77 (#1547)
* Switch a few places in the SD pipeline that assumed max_length=64
to use the actual max_length passed into the pipeline. This prevents
unet512 from always being used and producing different images than
before when --max_length=77
2023-06-18 06:41:25 -07:00
Ean Garvey
d80b087d95 Add PIL hidden imports to sd spec. (#1544)
Co-authored-by: powderluv <powderluv@users.noreply.github.com>
2023-06-18 06:39:08 -07:00
Stefan Kapusniak
297a209608 Remove workarounds for gradio tempfile bugs (#1548) 2023-06-17 19:50:36 -07:00
gpetters94
b204113563 Add UNet512 (#1504)
Co-authored-by: Ean Garvey <87458719+monorimet@users.noreply.github.com>
2023-06-17 03:46:25 -04:00
Chi_Liu
f60ab1f4fa Add Deberta to stablehlo in shark tank (#1545) 2023-06-16 13:24:44 -07:00
Surya Jasper
b203779462 Added Adreno target triples to vulkan_utils (#1543) 2023-06-15 16:42:59 -07:00
Stefan Kapusniak
38570a9bbb Some Fixes for update to gradio 3.34.0 (#1538)
* Fixes randomize seed buttons that stopped working.
* Update the now-deprecated method for setting initial columns of the output
gallery to the newer, undeprecated one.
2023-06-15 01:10:36 -07:00
32 changed files with 631 additions and 229 deletions

View File

@@ -61,7 +61,6 @@ jobs:
steps:
- uses: actions/checkout@v3
if: matrix.os != '7950x'
- name: Set Environment Variables
if: matrix.os != '7950x'
@@ -84,9 +83,6 @@ jobs:
#cache-dependency-path: |
# **/requirements-importer.txt
# **/requirements.txt
- uses: actions/checkout@v2
if: matrix.os == '7950x'
- name: Install dependencies
if: matrix.suite == 'lint'

View File

@@ -28,8 +28,9 @@ parser = argparse.ArgumentParser(
description="runs a falcon model",
)
parser.add_argument("--falcon_variant_to_use", default="7b", help="7b, 40b")
parser.add_argument(
"--precision", "-p", default="fp32", help="fp32, fp16, int8, int4"
"--precision", "-p", default="fp16", help="fp32, fp16, int8, int4"
)
parser.add_argument("--device", "-d", default="cuda", help="vulkan, cpu, cuda")
parser.add_argument(
@@ -40,7 +41,12 @@ parser.add_argument(
default=None,
help="path to falcon's mlir file",
)
parser.add_argument(
"--use_precompiled_model",
default=True,
action=argparse.BooleanOptionalAction,
help="use the precompiled vmfb",
)
parser.add_argument(
"--load_mlir_from_shark_tank",
default=False,
@@ -59,12 +65,12 @@ class Falcon(SharkLLMBase):
def __init__(
self,
model_name,
hf_model_path="tiiuae/falcon-7b-instruct",
hf_model_path,
max_num_tokens=150,
device="cuda",
precision="fp32",
falcon_mlir_path=Path("falcon.mlir"),
falcon_vmfb_path=Path("falcon.vmfb"),
falcon_mlir_path=None,
falcon_vmfb_path=None,
) -> None:
super().__init__(model_name, hf_model_path, max_num_tokens)
self.max_padding_length = 100
@@ -85,7 +91,7 @@ class Falcon(SharkLLMBase):
return tokenizer
def get_src_model(self):
print("Loading src model")
print("Loading src model: ", self.model_name)
kwargs = {"torch_dtype": torch.float, "trust_remote_code": True}
falcon_model = AutoModelForCausalLM.from_pretrained(
self.hf_model_path, **kwargs
@@ -93,9 +99,26 @@ class Falcon(SharkLLMBase):
return falcon_model
def compile_falcon(self):
vmfb = get_vmfb_from_path(self.falcon_vmfb_path, self.device, "linalg")
if vmfb is not None:
return vmfb
if args.use_precompiled_model:
if not self.falcon_vmfb_path.exists():
# Downloading VMFB from shark_tank
download_public_file(
"gs://shark_tank/falcon/"
+ "falcon_"
+ args.falcon_variant_to_use
+ "_"
+ self.precision
+ "_"
+ self.device
+ ".vmfb",
self.falcon_vmfb_path.absolute(),
single_file=True,
)
vmfb = get_vmfb_from_path(
self.falcon_vmfb_path, self.device, "linalg"
)
if vmfb is not None:
return vmfb
print(
f"[DEBUG] vmfb not found at {self.falcon_vmfb_path.absolute()}. Trying to work with"
@@ -106,27 +129,26 @@ class Falcon(SharkLLMBase):
bytecode = f.read()
else:
mlir_generated = False
if args.load_mlir_from_shark_tank:
if self.precision == "fp32":
# download MLIR from shark_tank for fp32
download_public_file(
"gs://shark_tank/falcon/7b/cuda/falcon.mlir",
self.falcon_mlir_path.absolute(),
single_file=True,
)
if self.falcon_mlir_path.exists():
with open(self.falcon_mlir_path, "rb") as f:
bytecode = f.read()
mlir_generated = True
else:
raise ValueError(
f"MLIR not found at {self.falcon_mlir_path.absolute()}"
" after downloading! Please check path and try again"
)
else:
print(
"Only fp32 mlir added to tank, generating mlir on device."
)
# Downloading MLIR from shark_tank
download_public_file(
"gs://shark_tank/falcon/"
+ "falcon_"
+ args.falcon_variant_to_use
+ "_"
+ self.precision
+ ".mlir",
self.falcon_mlir_path.absolute(),
single_file=True,
)
if self.falcon_mlir_path.exists():
with open(self.falcon_mlir_path, "rb") as f:
bytecode = f.read()
mlir_generated = True
else:
raise ValueError(
f"MLIR not found at {self.falcon_mlir_path.absolute()}"
" after downloading! Please check path and try again"
)
if not mlir_generated:
compilation_input_ids = torch.randint(
@@ -184,6 +206,7 @@ class Falcon(SharkLLMBase):
"--iree-vm-target-truncate-unsupported-floats",
"--iree-codegen-check-ir-before-llvm-conversion=false",
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-spirv-index-bits=64",
],
)
print("Saved falcon vmfb at ", str(path))
@@ -192,17 +215,6 @@ class Falcon(SharkLLMBase):
return shark_module
def compile(self):
if (
not self.falcon_vmfb_path.exists()
and self.device == "cuda"
and self.precision == "fp32"
):
download_public_file(
"gs://shark_tank/falcon/7b/cuda/falcon.vmfb",
self.falcon_vmfb_path.absolute(),
single_file=True,
)
falcon_shark_model = self.compile_falcon()
return falcon_shark_model
@@ -375,6 +387,8 @@ class Falcon(SharkLLMBase):
(model_inputs["input_ids"], model_inputs["attention_mask"]),
)
)
if self.precision == "fp16":
outputs = outputs.to(dtype=torch.float32)
next_token_logits = outputs
# pre-process distribution
@@ -428,18 +442,35 @@ if __name__ == "__main__":
args = parser.parse_args()
falcon_mlir_path = (
Path("falcon.mlir")
Path(
"falcon_"
+ args.falcon_variant_to_use
+ "_"
+ args.precision
+ ".mlir"
)
if args.falcon_mlir_path is None
else Path(args.falcon_mlir_path)
)
falcon_vmfb_path = (
Path("falcon.vmfb")
Path(
"falcon_"
+ args.falcon_variant_to_use
+ "_"
+ args.precision
+ "_"
+ args.device
+ ".vmfb"
)
if args.falcon_vmfb_path is None
else Path(args.falcon_vmfb_path)
)
falcon = Falcon(
"falcon",
"falcon_" + args.falcon_variant_to_use,
hf_model_path="tiiuae/falcon-"
+ args.falcon_variant_to_use
+ "-instruct",
device=args.device,
precision=args.precision,
falcon_mlir_path=falcon_mlir_path,
@@ -451,11 +482,16 @@ if __name__ == "__main__":
default_prompt_text = "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:"
continue_execution = True
print("\n-----\nScript executing for the following config: \n")
print("Falcon Model: ", falcon.model_name)
print("Precision: ", args.precision)
print("Device: ", args.device)
while continue_execution:
use_default_prompt = input(
"\nDo you wish to use the default prompt text? True or False?: "
"\nDo you wish to use the default prompt text? Y/N ?: "
)
if use_default_prompt:
if use_default_prompt in ["Y", "y"]:
prompt = default_prompt_text
else:
prompt = input("Please enter the prompt text: ")
@@ -469,5 +505,8 @@ if __name__ == "__main__":
res_str,
)
continue_execution = input(
"\nDo you wish to run script one more time? True or False?: "
"\nDo you wish to run script one more time? Y/N ?: "
)
continue_execution = (
True if continue_execution in ["Y", "y"] else False
)
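
For orientation, here is a minimal sketch (not part of the diff; falcon_artifact_names is a hypothetical helper) of how the falcon artifact names above are assembled before download_public_file() fetches them from shark_tank:

from pathlib import Path

def falcon_artifact_names(variant="7b", precision="fp16", device="cuda"):
    # Local filenames now encode variant, precision and (for the vmfb) device,
    # matching the objects published under gs://shark_tank/falcon/.
    mlir_path = Path(f"falcon_{variant}_{precision}.mlir")
    vmfb_path = Path(f"falcon_{variant}_{precision}_{device}.vmfb")
    bucket = "gs://shark_tank/falcon/"
    return mlir_path, vmfb_path, bucket + mlir_path.name, bucket + vmfb_path.name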

View File

@@ -10,7 +10,7 @@ from apps.language_models.utils import (
from io import BytesIO
from pathlib import Path
from shark.shark_downloader import download_public_file
from shark.shark_importer import import_with_fx
from shark.shark_importer import import_with_fx, get_f16_inputs
from shark.shark_inference import SharkInference
from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -78,10 +78,10 @@ class Vicuna(SharkLLMBase):
else:
mlir_generated = False
if self.load_mlir_from_shark_tank:
if self.precision == "fp32":
# download MLIR from shark_tank for fp32
if self.precision in ["fp32", "fp16"]:
# download MLIR from shark_tank for fp32/fp16
download_public_file(
"gs://shark_tank/vicuna/unsharded/mlir/first_vicuna.mlir",
f"gs://shark_tank/vicuna/unsharded/mlir/{self.first_vicuna_mlir_path.name}",
self.first_vicuna_mlir_path.absolute(),
single_file=True,
)
@@ -96,7 +96,7 @@ class Vicuna(SharkLLMBase):
)
else:
print(
"Only fp32 mlir added to tank, generating mlir on device."
f"Only fp32 and fp16 mlir added to tank, generating {self.precision} mlir on device."
)
if not mlir_generated:
@@ -220,10 +220,10 @@ class Vicuna(SharkLLMBase):
else:
mlir_generated = False
if self.load_mlir_from_shark_tank:
if self.precision == "fp32":
# download MLIR from shark_tank for fp32
if self.precision in ["fp32", "fp16"]:
# download MLIR from shark_tank for fp32/fp16
download_public_file(
"gs://shark_tank/vicuna/unsharded/mlir/second_vicuna.mlir",
f"gs://shark_tank/vicuna/unsharded/mlir/{self.second_vicuna_mlir_path.name}",
self.second_vicuna_mlir_path.absolute(),
single_file=True,
)
@@ -253,9 +253,15 @@ class Vicuna(SharkLLMBase):
model,
secondVicunaCompileInput,
is_f16=self.precision == "fp16",
f16_input_mask=[False, False],
f16_input_mask=[False] + [True] * 64,
mlir_type="torchscript",
)
if self.precision == "fp16":
secondVicunaCompileInput = get_f16_inputs(
secondVicunaCompileInput,
True,
f16_input_mask=[False] + [True] * 64,
)
secondVicunaCompileInput = list(secondVicunaCompileInput)
for i in range(len(secondVicunaCompileInput)):
if i != 0:
@@ -307,7 +313,7 @@ class Vicuna(SharkLLMBase):
if "%c19_i64 = arith.constant 19 : i64" in line:
new_lines.append("%c2 = arith.constant 2 : index")
new_lines.append(
"%dim_4_int = tensor.dim %arg1, %c2 : tensor<1x32x?x128xf32>"
f"%dim_4_int = tensor.dim %arg1, %c2 : tensor<1x32x?x128x{'f16' if self.precision == 'fp16' else 'f32'}>"
)
new_lines.append(
"%dim_i64 = arith.index_cast %dim_4_int : index to i64"
@@ -365,14 +371,19 @@ class Vicuna(SharkLLMBase):
# download vmfbs for A100
if (
not self.first_vicuna_vmfb_path.exists()
and self.device == "cuda"
and self.precision == "fp32"
and self.device in ["cuda", "cpu"]
and self.precision in ["fp32", "fp16"]
):
download_public_file(
"gs://shark_tank/vicuna/unsharded/first_vicuna.vmfb",
self.first_vicuna_vmfb_path.absolute(),
single_file=True,
)
# combinations that are still in the works
if not (self.device == "cuda" and self.precision == "fp16"):
# Will generate vmfb on device
pass
else:
download_public_file(
f"gs://shark_tank/vicuna/unsharded/vmfb/{self.first_vicuna_vmfb_path.name}",
self.first_vicuna_vmfb_path.absolute(),
single_file=True,
)
else:
# get first vic
# TODO: Remove after testing to avoid memory overload
@@ -380,26 +391,25 @@ class Vicuna(SharkLLMBase):
pass
if (
not self.second_vicuna_vmfb_path.exists()
and self.device == "cuda"
and self.precision == "fp32"
and self.device in ["cuda", "cpu"]
and self.precision in ["fp32", "fp16"]
):
download_public_file(
"gs://shark_tank/vicuna/unsharded/second_vicuna.vmfb",
self.second_vicuna_vmfb_path.absolute(),
single_file=True,
)
# combinations that are still in the works
if not (self.device == "cuda" and self.precision == "fp16"):
# Will generate vmfb on device
pass
else:
download_public_file(
f"gs://shark_tank/vicuna/unsharded/vmfb/{self.second_vicuna_vmfb_path.name}",
self.second_vicuna_vmfb_path.absolute(),
single_file=True,
)
else:
# get second vic
# TODO: Remove after testing to avoid memory overload
# svic_shark_model = self.compile_second_vicuna()
pass
# get first vic
# fvic_shark_model = self.compile_first_vicuna()
# get second vic
# svic_shark_model = self.compile_second_vicuna()
# return tuple of shark_modules
# return fvic_shark_model, svic_shark_model
return None
# return tuple of shark_modules once mem is supported
# return fvic_shark_model, svic_shark_model
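
A hedged illustration of what the new f16_input_mask above expresses (the helper below is hypothetical; the real conversion is shark.shark_importer.get_f16_inputs): the first input, the token ids, is left untouched, while the other 64 inputs, presumably the past key/value tensors of the second Vicuna, are cast to fp16.

import torch

def to_f16_by_mask(inputs, f16_input_mask):
    # Cast only the inputs whose mask entry is True.
    return tuple(
        t.half() if use_f16 else t for t, use_f16 in zip(inputs, f16_input_mask)
    )

second_vicuna_mask = [False] + [True] * 64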

View File

@@ -35,7 +35,7 @@ class Vicuna(SharkLLMBase):
self.device = device
self.precision = precision
self.tokenizer = self.get_tokenizer()
self.shark_model = self.compile()
self.shark_model = self.compile(device=device)
def get_tokenizer(self):
tokenizer = AutoTokenizer.from_pretrained(
@@ -126,7 +126,7 @@ class Vicuna(SharkLLMBase):
)
return mlir_bytecode
def compile_to_vmfb(self, inputs, layers, is_first=True):
def compile_to_vmfb(self, inputs, layers, device="cpu", is_first=True):
mlirs, modules = [], []
for idx, layer in tqdm(enumerate(layers), desc="Getting mlirs"):
if is_first:
@@ -210,19 +210,6 @@ class Vicuna(SharkLLMBase):
else:
module = self.write_in_dynamic_inputs1(str(module), 138)
if idx in [0, 5, 6, 7]:
module_str = module
module_str = module_str.splitlines()
new_lines = []
for line in module_str:
if len(line) < 1000:
new_lines.append(line)
else:
new_lines.append(line[:999])
module_str = "\n".join(new_lines)
f1_ = open(f"{idx}_1_test.mlir", "w+")
f1_.write(module_str)
f1_.close()
bytecode = module.encode("UTF-8")
bytecode_stream = BytesIO(bytecode)
@@ -236,20 +223,22 @@ class Vicuna(SharkLLMBase):
for idx, layer in tqdm(enumerate(layers), desc="compiling modules"):
if is_first:
vmfb_path = Path(f"{idx}_0.vmfb")
if idx < 25:
device = "cpu"
else:
device = "cpu"
if vmfb_path.exists():
# print(f"Found layer {idx} vmfb")
module = SharkInference(
None, device=device, mlir_dialect="tm_tensor"
None,
device=device,
device_idx=idx % 1,
mlir_dialect="tm_tensor",
)
module.load_module(vmfb_path)
else:
print(f"Compiling layer {idx} vmfb")
module = SharkInference(
mlirs[idx], device=device, mlir_dialect="tm_tensor"
mlirs[idx],
device=device,
device_idx=idx % 1,
mlir_dialect="tm_tensor",
)
module.save_module(
module_name=f"{idx}_0",
@@ -264,20 +253,22 @@ class Vicuna(SharkLLMBase):
modules.append(module)
else:
vmfb_path = Path(f"{idx}_1.vmfb")
if idx < 25:
device = "cpu"
else:
device = "cpu"
if vmfb_path.exists():
# print(f"Found layer {idx} vmfb")
module = SharkInference(
None, device=device, mlir_dialect="tm_tensor"
None,
device=device,
device_idx=idx % 1,
mlir_dialect="tm_tensor",
)
module.load_module(vmfb_path)
else:
print(f"Compiling layer {idx} vmfb")
module = SharkInference(
mlirs[idx], device=device, mlir_dialect="tm_tensor"
mlirs[idx],
device=device,
device_idx=idx % 1,
mlir_dialect="tm_tensor",
)
module.save_module(
module_name=f"{idx}_1",
@@ -293,7 +284,7 @@ class Vicuna(SharkLLMBase):
return mlirs, modules
def get_sharded_model(self):
def get_sharded_model(self, device="cpu"):
# SAMPLE_INPUT_LEN is used for creating mlir with dynamic inputs, which is currently an incredibly hacky process
# please don't change it
SAMPLE_INPUT_LEN = 137
@@ -316,7 +307,10 @@ class Vicuna(SharkLLMBase):
FirstVicunaLayer(layer) for layer in vicuna_model.model.layers
]
_, modules0 = self.compile_to_vmfb(
placeholder_input0, layers0, is_first=True
placeholder_input0,
layers0,
is_first=True,
device=device,
)
shark_layers0 = [CompiledFirstVicunaLayer(m) for m in modules0]
@@ -324,7 +318,7 @@ class Vicuna(SharkLLMBase):
SecondVicunaLayer(layer) for layer in vicuna_model.model.layers
]
_, modules1 = self.compile_to_vmfb(
placeholder_input1, layers1, is_first=False
placeholder_input1, layers1, is_first=False, device=device
)
shark_layers1 = [CompiledSecondVicunaLayer(m) for m in modules1]
@@ -333,8 +327,8 @@ class Vicuna(SharkLLMBase):
)
return sharded_model
def compile(self):
return self.get_sharded_model()
def compile(self, device="cpu"):
return self.get_sharded_model(device=device)
def generate(self, prompt, cli=False):
# TODO: refactor for cleaner integration
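
As a reading aid (not part of the change; make_shark_module stands in for the SharkInference construction), a minimal sketch of the per-layer caching pattern used in compile_to_vmfb above: an existing {idx}_{0|1}.vmfb is loaded, otherwise the layer is compiled and saved.

from pathlib import Path

def load_or_compile_layer(idx, mlir, device, make_shark_module, is_first=True):
    suffix = 0 if is_first else 1
    vmfb_path = Path(f"{idx}_{suffix}.vmfb")
    if vmfb_path.exists():
        # Reuse the cached vmfb for this layer.
        module = make_shark_module(None, device=device, device_idx=idx % 1)
        module.load_module(vmfb_path)
    else:
        # Compile the layer's MLIR and save the resulting vmfb.
        module = make_shark_module(mlir, device=device, device_idx=idx % 1)
        module.save_module(module_name=f"{idx}_{suffix}")
    return module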

View File

@@ -17,6 +17,10 @@ from apps.stable_diffusion.src.models import SharkifyStableDiffusionModel
def load_mlir_module():
if "upscaler" in args.hf_model_id:
is_upscaler = True
else:
is_upscaler = False
sd_model = SharkifyStableDiffusionModel(
args.hf_model_id,
args.ckpt_loc,
@@ -27,6 +31,7 @@ def load_mlir_module():
height=args.height,
width=args.width,
use_base_vae=args.use_base_vae,
is_upscaler=is_upscaler,
use_tuned=False,
low_cpu_mem_usage=args.low_cpu_mem_usage,
return_mlir=True,

View File

@@ -61,6 +61,7 @@ def main():
dtype,
args.use_base_vae,
cpu_scheduling,
args.max_embeddings_multiples,
)
total_time = time.time() - start_time
text_output = f"prompt={args.prompts}"

View File

@@ -19,6 +19,7 @@ datas += copy_metadata('importlib_metadata')
datas += copy_metadata('torch-mlir')
datas += copy_metadata('omegaconf')
datas += copy_metadata('safetensors')
datas += copy_metadata('Pillow')
datas += collect_data_files('diffusers')
datas += collect_data_files('transformers')
datas += collect_data_files('pytorch_lightning')

View File

@@ -163,7 +163,7 @@ class SharkifyStableDiffusionModel:
def get_extended_name_for_all_model(self):
model_name = {}
sub_model_list = ["clip", "unet", "stencil_unet", "vae", "vae_encode", "stencil_adaptor"]
sub_model_list = ["clip", "unet", "unet512", "stencil_unet", "vae", "vae_encode", "stencil_adaptor"]
index = 0
for model in sub_model_list:
sub_model = model
@@ -415,7 +415,7 @@ class SharkifyStableDiffusionModel:
)
return shark_cnet, cnet_mlir
def get_unet(self):
def get_unet(self, use_large=False):
class UnetModel(torch.nn.Module):
def __init__(self, model_id=self.model_id, low_cpu_mem_usage=False, use_lora=self.use_lora):
super().__init__()
@@ -426,7 +426,7 @@ class SharkifyStableDiffusionModel:
)
if use_lora != "":
update_lora_weight(self.unet, use_lora, "unet")
self.in_channels = self.unet.in_channels
self.in_channels = self.unet.config.in_channels
self.train(False)
if(args.attention_slicing is not None and args.attention_slicing != "none"):
if(args.attention_slicing.isdigit()):
@@ -452,17 +452,27 @@ class SharkifyStableDiffusionModel:
unet = UnetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["unet"])
if(use_large):
pad = (0, 0) * (len(inputs[2].shape) - 2)
pad = pad + (0, 512 - inputs[2].shape[1])
inputs = (inputs[0],
inputs[1],
torch.nn.functional.pad(inputs[2], pad),
inputs[3])
save_dir = os.path.join(self.sharktank_dir, self.model_name["unet512"])
else:
save_dir = os.path.join(self.sharktank_dir, self.model_name["unet"])
input_mask = [True, True, True, False]
save_dir = os.path.join(self.sharktank_dir, self.model_name["unet"])
if self.debug:
os.makedirs(
save_dir,
exist_ok=True,
)
model_name = "unet512" if use_large else "unet"
shark_unet, unet_mlir = compile_through_fx(
unet,
inputs,
extended_model_name=self.model_name["unet"],
extended_model_name=self.model_name[model_name],
is_f16=is_f16,
f16_input_mask=input_mask,
use_tuned=self.use_tuned,
@@ -471,13 +481,13 @@ class SharkifyStableDiffusionModel:
save_dir=save_dir,
extra_args=get_opt_flags("unet", precision=self.precision),
base_model_id=self.base_model_id,
model_name="unet",
model_name=model_name,
precision=self.precision,
return_mlir=self.return_mlir,
)
return shark_unet, unet_mlir
def get_unet_upscaler(self):
def get_unet_upscaler(self, use_large=False):
class UnetModel(torch.nn.Module):
def __init__(self, model_id=self.model_id, low_cpu_mem_usage=False):
super().__init__()
@@ -502,6 +512,13 @@ class SharkifyStableDiffusionModel:
unet = UnetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["unet"])
if(use_large):
pad = (0, 0) * (len(inputs[2].shape) - 2)
pad = pad + (0, 512 - inputs[2].shape[1])
inputs = (inputs[0],
inputs[1],
torch.nn.functional.pad(inputs[2], pad),
inputs[3])
input_mask = [True, True, True, False]
shark_unet, unet_mlir = compile_through_fx(
unet,
@@ -579,16 +596,16 @@ class SharkifyStableDiffusionModel:
vae_dict = {k: v for k, v in vae_checkpoint.items() if k[0:4] != "loss" and k not in vae_ignore_keys}
return vae_dict
def compile_unet_variants(self, model):
def compile_unet_variants(self, model, use_large=False):
if model == "unet":
if self.is_upscaler:
return self.get_unet_upscaler()
return self.get_unet_upscaler(use_large=use_large)
# TODO: Plug the experimental "int8" support at right place.
elif self.use_quantize == "int8":
from apps.stable_diffusion.src.models.opt_params import get_unet
return get_unet()
else:
return self.get_unet()
return self.get_unet(use_large=use_large)
else:
return self.get_controlled_unet()
@@ -616,7 +633,7 @@ class SharkifyStableDiffusionModel:
except Exception as e:
sys.exit(e)
def unet(self):
def unet(self, use_large=False):
try:
model = "stencil_unet" if self.use_stencil is not None else "unet"
compiled_unet = None
@@ -624,14 +641,14 @@ class SharkifyStableDiffusionModel:
if self.base_model_id != "":
self.inputs["unet"] = self.get_input_info_for(unet_inputs[self.base_model_id])
compiled_unet, unet_mlir = self.compile_unet_variants(model)
compiled_unet, unet_mlir = self.compile_unet_variants(model, use_large=use_large)
else:
for model_id in unet_inputs:
self.base_model_id = model_id
self.inputs["unet"] = self.get_input_info_for(unet_inputs[model_id])
try:
compiled_unet, unet_mlir = self.compile_unet_variants(model)
compiled_unet, unet_mlir = self.compile_unet_variants(model, use_large=use_large)
except Exception as e:
print(e)
print("Retrying with a different base model configuration")

View File

@@ -81,6 +81,7 @@ class Text2ImagePipeline(StableDiffusionPipeline):
dtype,
use_base_vae,
cpu_scheduling,
max_embeddings_multiples,
):
# prompts and negative prompts must be a list.
if isinstance(prompts, str):
@@ -112,7 +113,10 @@ class Text2ImagePipeline(StableDiffusionPipeline):
# Get text embeddings with weight emphasis from prompts
text_embeddings = self.encode_prompts_weight(
prompts, neg_prompts, max_length
prompts,
neg_prompts,
max_length,
max_embeddings_multiples=max_embeddings_multiples,
)
# guidance scale as a float32 tensor.

View File

@@ -57,6 +57,7 @@ class StableDiffusionPipeline:
self.vae = None
self.text_encoder = None
self.unet = None
self.unet_512 = None
self.model_max_length = 77
self.scheduler = scheduler
# TODO: Implement using logging python utility.
@@ -114,6 +115,24 @@ class StableDiffusionPipeline:
del self.unet
self.unet = None
def load_unet_512(self):
if self.unet_512 is not None:
return
if self.import_mlir or self.use_lora:
self.unet_512 = self.sd_model.unet(use_large=True)
else:
try:
self.unet_512 = get_unet(use_large=True)
except Exception as e:
print(e)
print("download pipeline failed, falling back to import_mlir")
self.unet_512 = self.sd_model.unet(use_large=True)
def unload_unet_512(self):
del self.unet_512
self.unet_512 = None
def load_vae(self):
if self.vae is not None:
return
@@ -203,7 +222,10 @@ class StableDiffusionPipeline:
latent_history = [latents]
text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
text_embeddings_numpy = text_embeddings.detach().numpy()
self.load_unet()
if text_embeddings.shape[1] <= self.model_max_length:
self.load_unet()
else:
self.load_unet_512()
for i, t in tqdm(enumerate(total_timesteps)):
step_start_time = time.time()
timestep = torch.tensor([t]).to(dtype).detach().numpy()
@@ -222,16 +244,28 @@ class StableDiffusionPipeline:
# Profiling Unet.
profile_device = start_profiling(file_path="unet.rdc")
noise_pred = self.unet(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
),
send_to_host=False,
)
if text_embeddings.shape[1] <= self.model_max_length:
noise_pred = self.unet(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
),
send_to_host=False,
)
else:
noise_pred = self.unet_512(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
),
send_to_host=False,
)
end_profiling(profile_device)
if cpu_scheduling:
@@ -254,6 +288,7 @@ class StableDiffusionPipeline:
if self.ondemand:
self.unload_unet()
self.unload_unet_512()
avg_step_time = step_time_sum / len(total_timesteps)
self.log += f"\nAverage step time: {avg_step_time}ms/it"
@@ -412,6 +447,11 @@ class StableDiffusionPipeline:
# uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
if text_embeddings.shape[1] > model_max_length:
pad = (0, 0) * (len(text_embeddings.shape) - 2)
pad = pad + (0, 512 - text_embeddings.shape[1])
text_embeddings = torch.nn.functional.pad(text_embeddings, pad)
# SHARK: Report clip inference time
clip_inf_time = (time.time() - clip_inf_start) * 1000
if self.ondemand:
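
Putting the hunks in this file together, a hedged sketch of the new routing (pad_and_pick is illustrative, not the repo's API): embeddings that fit model_max_length keep using the standard unet, while longer weighted-prompt embeddings are padded to 512 tokens and dispatched to unet_512.

import torch

def pad_and_pick(text_embeddings, model_max_length=77):
    if text_embeddings.shape[1] <= model_max_length:
        return text_embeddings, "unet"
    # Pad the sequence dimension up to 512, as in the prompt-encoding hunk above.
    pad = (0, 0) * (len(text_embeddings.shape) - 2)
    pad = pad + (0, 512 - text_embeddings.shape[1])
    return torch.nn.functional.pad(text_embeddings, pad), "unet512"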

View File

@@ -37,4 +37,5 @@ from apps.stable_diffusion.src.utils.utils import (
get_generation_text_info,
update_lora_weight,
resize_stencil,
_compile_module,
)

View File

@@ -116,7 +116,7 @@ def load_lower_configs(base_model_id=None):
else:
config_name = f"{args.annotation_model}_{args.precision}_{device}_{spec}.json"
else:
if not spec or spec in ["rdna3", "sm_80"]:
if not spec or spec in ["sm_80"]:
if (
version in ["v2_1", "v2_1base"]
and args.height == 768
@@ -125,6 +125,13 @@ def load_lower_configs(base_model_id=None):
config_name = f"{args.annotation_model}_v2_1_768_{args.precision}_{device}.json"
else:
config_name = f"{args.annotation_model}_{version}_{args.precision}_{device}.json"
elif spec in ["rdna3"] and version in [
"v2_1",
"v2_1base",
"v1_4",
"v1_5",
]:
config_name = f"{args.annotation_model}_{version}_{args.max_length}_{args.precision}_{device}_{spec}_{args.width}x{args.height}.json"
elif spec in ["rdna2"] and version in ["v2_1", "v2_1base", "v1_4"]:
config_name = f"{args.annotation_model}_{version}_{args.precision}_{device}_{spec}_{args.width}x{args.height}.json"
else:
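
For illustration only (all argument values below are assumptions; only the naming pattern comes from the diff), the new rdna3 branch resolves to a tuned-config filename such as:

annotation_model, version, max_length = "unet", "v1_4", 77
precision, device, spec, width, height = "fp16", "vulkan", "rdna3", 512, 512
config_name = f"{annotation_model}_{version}_{max_length}_{precision}_{device}_{spec}_{width}x{height}.json"
# -> "unet_v1_4_77_fp16_vulkan_rdna3_512x512.json"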

View File

@@ -108,6 +108,13 @@ p.add_argument(
help="max length of the tokenizer output, options are 64 and 77.",
)
p.add_argument(
"--max_embeddings_multiples",
type=int,
default=5,
help="The max multiple length of prompt embeddings compared to the max output length of text encoder.",
)
p.add_argument(
"--strength",
type=float,

View File

@@ -47,6 +47,7 @@ def get_vmfb_path_name(model_name):
def _load_vmfb(shark_module, vmfb_path, model, precision):
model = "vae" if "base_vae" in model or "vae_encode" in model else model
model = "unet" if "stencil" in model else model
model = "unet" if "unet512" in model else model
precision = "fp32" if "clip" in model else precision
extra_args = get_opt_flags(model, precision)
shark_module.load_module(vmfb_path, extra_args=extra_args)
@@ -115,6 +116,7 @@ def compile_through_fx(
model_name=None,
precision=None,
return_mlir=False,
device=None,
):
if not return_mlir and model_name is not None:
vmfb_path = get_vmfb_path_name(extended_model_name)
@@ -145,7 +147,10 @@ def compile_through_fx(
if use_tuned:
if "vae" in extended_model_name.split("_")[0]:
args.annotation_model = "vae"
if "unet" in model_name.split("_")[0]:
if (
"unet" in model_name.split("_")[0]
or "unet_512" in model_name.split("_")[0]
):
args.annotation_model = "unet"
mlir_module = sd_model_annotation(
mlir_module, extended_model_name, base_model_id
@@ -153,7 +158,7 @@ def compile_through_fx(
shark_module = SharkInference(
mlir_module,
device=args.device,
device=args.device if device is None else device,
mlir_dialect="tm_tensor",
)
if generate_vmfb:
@@ -293,13 +298,18 @@ def set_init_device_flags():
if (
args.precision != "fp16"
or args.height not in [512, 768]
or (args.height == 512 and args.width != 512)
or (args.height == 768 and args.width != 768)
or (args.height == 512 and args.width not in [512, 768])
or (args.height == 768 and args.width not in [512, 768])
or args.batch_size != 1
or ("vulkan" not in args.device and "cuda" not in args.device)
):
args.use_tuned = False
elif (
args.height != args.width and "rdna2" in args.iree_vulkan_target_triple
):
args.use_tuned = False
elif base_model_id not in [
"Linaqruf/anything-v3.0",
"dreamlike-art/dreamlike-diffusion-1.0",

View File

@@ -1,7 +1,8 @@
from multiprocessing import Process, freeze_support
import os
import sys
import transformers # ensures inclusion in pyinstaller exe generation
import shutil
import PIL, transformers # ensures inclusion in pyinstaller exe generation
from apps.stable_diffusion.src import args, clear_all
import apps.stable_diffusion.web.utils.global_obj as global_obj
@@ -57,15 +58,19 @@ if __name__ == "__main__":
uvicorn.run(app, host="127.0.0.1", port=args.server_port)
sys.exit(0)
import gradio as gr
# Setup to use shark_tmp for gradio's temporary image files and clear any
# existing temporary images there if they exist. Then we can import gradio.
# It has to be in this order or gradio ignores what we've set up.
from apps.stable_diffusion.web.utils.gradio_configs import (
clear_gradio_tmp_imgs_folder,
config_gradio_tmp_imgs_folder,
)
config_gradio_tmp_imgs_folder()
import gradio as gr
# Create custom models folders if they don't exist
from apps.stable_diffusion.web.ui.utils import create_custom_models_folders
# Clear all gradio tmp images from the last session
clear_gradio_tmp_imgs_folder()
# Create custom models folders if they don't exist
create_custom_models_folders()
def resource_path(relative_path):
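
A minimal sketch of the import-order constraint the comment above describes (the paths are assumptions): GRADIO_TEMP_DIR has to be exported before gradio is imported, otherwise gradio keeps its default temporary directory.

import os

# Point gradio's temporary image cache at shark_tmp unless the user already chose one.
os.environ.setdefault(
    "GRADIO_TEMP_DIR", os.path.join(os.getcwd(), "shark_tmp", "gradio")
)
import gradio as gr  # must come after the environment variable is set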

View File

@@ -340,6 +340,10 @@ def img2img_api(
lora_hf_id="",
ondemand=False,
)
# Converts generator type to subscriptable
res = list(res)[0]
return {
"images": encode_pil_to_base64(res[0]),
"parameters": {},
@@ -585,10 +589,10 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
with gr.Column(scale=2):
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
lambda: -1,
inputs=[],
outputs=[seed],
_js="() => -1",
queue=False,
)
with gr.Column(scale=6):
stable_diffusion = gr.Button("Generate Image(s)")

View File

@@ -484,10 +484,10 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
with gr.Column(scale=2):
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
lambda: -1,
inputs=[],
outputs=[seed],
_js="() => -1",
queue=False,
)
with gr.Column(scale=6):
stable_diffusion = gr.Button("Generate Image(s)")

View File

@@ -159,10 +159,10 @@ with gr.Blocks(title="Lora Training") as lora_train_web:
with gr.Column(scale=2):
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
lambda: -1,
inputs=[],
outputs=[seed],
_js="() => -1",
queue=False,
)
with gr.Column(scale=6):
train_lora = gr.Button("Train LoRA")

View File

@@ -512,10 +512,10 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
with gr.Column(scale=2):
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
lambda: -1,
inputs=[],
outputs=[seed],
_js="() => -1",
queue=False,
)
with gr.Column(scale=6):
stable_diffusion = gr.Button("Generate Image(s)")

View File

@@ -9,9 +9,6 @@ from apps.stable_diffusion.src.utils import (
get_generated_imgs_todays_subdir,
)
from apps.stable_diffusion.web.ui.utils import nodlogo_loc
from apps.stable_diffusion.web.utils.gradio_configs import (
gradio_tmp_galleries_folder,
)
from apps.stable_diffusion.web.utils.metadata import displayable_metadata
# -- Functions for file, directory and image info querying
@@ -63,19 +60,6 @@ def output_subdirs() -> list[str]:
return result_paths
# clear zero length temporary files that gradio 3.22.0 buggily creates
# TODO: remove once gradio is upgraded to or past 3.32.0
def clear_zero_length_temps():
zero_length_temps = [
os.path.join(root, file)
for root, dirs, files in os.walk(gradio_tmp_galleries_folder)
for file in files
if os.path.getsize(os.path.join(root, file)) == 0
]
for file in zero_length_temps:
os.remove(file)
# --- Define UI layout for Gradio
with gr.Blocks() as outputgallery_web:
@@ -104,8 +88,7 @@ with gr.Blocks() as outputgallery_web:
value=gallery_files.value,
visible=False,
show_label=True,
).style(grid=4)
gallery.DEFAULT_TEMP_DIR = gradio_tmp_galleries_folder
).style(columns=4)
with gr.Column(scale=4):
with gr.Box():
@@ -179,7 +162,6 @@ with gr.Blocks() as outputgallery_web:
# --- Event handlers
def on_clear_gallery():
clear_zero_length_temps()
return [
gr.Gallery.update(
value=[],
@@ -247,7 +229,6 @@ with gr.Blocks() as outputgallery_web:
# only update if the current subdir is the most recent one as new images only go there
if subdir_paths[0] == subdir:
clear_zero_length_temps()
new_images = outputgallery_filenames(subdir)
new_label = f"{len(new_images)} images in {os.path.join(output_dir, subdir)} - {status}"

View File

@@ -193,6 +193,7 @@ def txt2img_inf(
dtype,
args.use_base_vae,
cpu_scheduling,
args.max_embeddings_multiples,
)
seeds.append(img_seed)
total_time = time.time() - start_time
@@ -451,10 +452,10 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
with gr.Column(scale=2):
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
lambda: -1,
inputs=[],
outputs=[seed],
_js="() => -1",
queue=False,
)
with gr.Column(scale=6):
stable_diffusion = gr.Button("Generate Image(s)")

View File

@@ -299,6 +299,9 @@ def upscaler_api(
lora_hf_id="",
ondemand=False,
)
# Converts generator type to subscriptable
res = list(res)[0]
return {
"images": encode_pil_to_base64(res[0]),
"parameters": {},
@@ -492,10 +495,10 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
with gr.Column(scale=2):
random_seed = gr.Button("Randomize Seed")
random_seed.click(
None,
lambda: -1,
inputs=[],
outputs=[seed],
_js="() => -1",
queue=False,
)
with gr.Column(scale=6):
stable_diffusion = gr.Button("Generate Image(s)")

View File

@@ -1,60 +1,54 @@
import os
import shutil
import tempfile
import gradio
from time import time
gradio_tmp_imgs_folder = os.path.join(os.getcwd(), "shark_tmp/")
gradio_tmp_galleries_folder = os.path.join(gradio_tmp_imgs_folder, "galleries")
shark_tmp = os.path.join(os.getcwd(), "shark_tmp/")
# Clear all gradio tmp images
def clear_gradio_tmp_imgs_folder():
if not os.path.exists(gradio_tmp_imgs_folder):
return
def config_gradio_tmp_imgs_folder():
# create shark_tmp if it does not exist
if not os.path.exists(shark_tmp):
os.mkdir(shark_tmp)
# tell gradio to use a directory under shark_tmp for its temporary
# image files unless somewhere else has been set
if "GRADIO_TEMP_DIR" not in os.environ:
os.environ["GRADIO_TEMP_DIR"] = os.path.join(shark_tmp, "gradio")
# clear all gradio tmp files created by generation galleries
print(
"Clearing gradio temporary image files from a prior run. This may take some time..."
f"gradio temporary image cache located at {os.environ['GRADIO_TEMP_DIR']}. "
+ "You may change this by setting the GRADIO_TEMP_DIR environment variable."
)
image_files = [
filename
for filename in os.listdir(gradio_tmp_imgs_folder)
if os.path.isfile(os.path.join(gradio_tmp_imgs_folder, filename))
and filename.startswith("tmp")
and filename.endswith(".png")
]
if len(image_files) > 0:
# Clear all gradio tmp images from the last session
if os.path.exists(os.environ["GRADIO_TEMP_DIR"]):
cleanup_start = time()
for filename in image_files:
os.remove(gradio_tmp_imgs_folder + filename)
print(
f"Clearing generation temporary image files took {time() - cleanup_start:4f} seconds"
"Clearing gradio UI temporary image files from a prior run. This may take some time..."
)
else:
print("no generation temporary files to clear")
# Clear all gradio tmp files created by output galleries
if os.path.exists(gradio_tmp_galleries_folder):
cleanup_start = time()
shutil.rmtree(gradio_tmp_galleries_folder, ignore_errors=True)
shutil.rmtree(os.environ["GRADIO_TEMP_DIR"], ignore_errors=True)
print(
f"Clearing output gallery temporary image files took {time() - cleanup_start:4f} seconds"
f"Clearing gradio UI temporary image files took {time() - cleanup_start:.4f} seconds."
)
# older SHARK versions had to work around gradio bugs and stored things differently
else:
print("no output gallery temporary files to clear")
# Overwrite save_pil_to_file from gradio to save tmp images generated by gradio into our own tmp folder
def save_pil_to_file(pil_image, dir=None):
if not os.path.exists(gradio_tmp_imgs_folder):
os.mkdir(gradio_tmp_imgs_folder)
file_obj = tempfile.NamedTemporaryFile(
delete=False, suffix=".png", dir=gradio_tmp_imgs_folder
)
pil_image.save(file_obj)
return file_obj
# Register save_pil_to_file override
gradio.processing_utils.save_pil_to_file = save_pil_to_file
image_files = [
filename
for filename in os.listdir(shark_tmp)
if os.path.isfile(os.path.join(shark_tmp, filename))
and filename.startswith("tmp")
and filename.endswith(".png")
]
if len(image_files) > 0:
print(
"Clearing temporary image files of a prior run of a previous SHARK version. This may take some time..."
)
cleanup_start = time()
for filename in image_files:
os.remove(shark_tmp + filename)
print(
f"Clearing temporary image files took {time() - cleanup_start:.4f} seconds."
)
else:
print("No temporary images files to clear.")

View File

@@ -16,7 +16,7 @@ parameterized
# Add transformers, diffusers and scipy since it most commonly used
transformers
diffusers @ git+https://github.com/huggingface/diffusers@e47459c80f6f6a5a1c19d32c3fd74edf94f47aa2
diffusers
scipy
ftfy
gradio==3.34.0

rest_api_tests/api_test.py (new file, 109 lines)
View File

@@ -0,0 +1,109 @@
import requests
from PIL import Image
import base64
from io import BytesIO
def upscaler_test():
# Define values here
prompt = ""
negative_prompt = ""
seed = 2121991605
height = 512
width = 512
steps = 50
noise_level = 10
cfg_scale = 7
image_path = r"./rest_api_tests/dog.png"
# Converting Image to base64
img_file = open(image_path, "rb")
init_images = [
"data:image/png;base64," + base64.b64encode(img_file.read()).decode()
]
url = "http://127.0.0.1:8080/sdapi/v1/upscaler"
headers = {
"User-Agent": "PythonTest",
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate, br",
}
data = {
"prompt": prompt,
"negative_prompt": negative_prompt,
"seed": seed,
"height": height,
"width": width,
"steps": steps,
"noise_level": noise_level,
"cfg_scale": cfg_scale,
"init_images": init_images,
}
res = requests.post(url=url, json=data, headers=headers, timeout=1000)
print(f"response from server was : {res.status_code}")
def img2img_test():
# Define values here
prompt = "Paint a rabbit riding on the dog"
negative_prompt = "ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"
seed = 2121991605
height = 512
width = 512
steps = 50
denoising_strength = 0.75
cfg_scale = 7
image_path = r"./rest_api_tests/dog.png"
# Converting Image to Base64
img_file = open(image_path, "rb")
init_images = [
"data:image/png;base64," + base64.b64encode(img_file.read()).decode()
]
url = "http://127.0.0.1:8080/sdapi/v1/img2img"
headers = {
"User-Agent": "PythonTest",
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate, br",
}
data = {
"prompt": prompt,
"negative_prompt": negative_prompt,
"init_images": init_images,
"height": height,
"width": width,
"steps": steps,
"denoising_strength": denoising_strength,
"cfg_scale": cfg_scale,
"seed": seed,
}
res = requests.post(url=url, json=data, headers=headers, timeout=1000)
print(f"response from server was : {res.status_code}")
print("Extracting response object")
# Uncomment below to save the picture
response_obj = res.json()
img_b64 = response_obj.get("images", [False])[0] or response_obj.get(
"image"
)
img_b2 = base64.b64decode(img_b64.replace("data:image/png;base64,", ""))
im_file = BytesIO(img_b2)
response_img = Image.open(im_file)
print("Saving Response Image to: response_img")
response_img.save(r"rest_api_tests/response_img.png")
if __name__ == "__main__":
img2img_test()
upscaler_test()

rest_api_tests/dog.png — new binary file (4.5 KiB), not shown.

View File

@@ -0,0 +1,76 @@
import torch
import torch_mlir
from shark.shark_inference import SharkInference
from apps.stable_diffusion.src.utils import (
compile_through_fx,
args,
)
from MEGABYTE_pytorch import MEGABYTE
import os
class MegaModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = MEGABYTE(
num_tokens=16000, # number of tokens
dim=(
512,
256,
), # transformer model dimension (512 for coarsest, 256 for fine in this example)
max_seq_len=(
1024,
4,
), # sequence length for global and then local. this can be more than 2
depth=(
6,
4,
), # number of layers for global and then local. this can be more than 2, but length must match the max_seq_len's
dim_head=64, # dimension per head
heads=8, # number of attention heads
flash_attn=True, # use flash attention
)
def forward(self, input):
return self.model(input)
megaModel = MegaModel()
input = [torch.randint(0, 16000, (1, 1024, 4))]
# CURRENTLY IT BAILS OUT HERE BECAUSE OF MISSING OP LOWERINGS :-
# 1. aten.alias
shark_module, _ = compile_through_fx(
megaModel,
inputs=input,
extended_model_name="mega_shark",
debug=False,
generate_vmfb=True,
save_dir=os.getcwd(),
extra_args=[],
base_model_id=None,
model_name="mega_shark",
precision=None,
return_mlir=True,
device="cuda",
)
# logits = model(x)
def print_output_info(output, msg):
print("\n", msg)
print("\n\t", output.shape)
ans = shark_module("forward", input)
print_output_info(torch.from_numpy(ans), "SHARK's output")
ans = megaModel.forward(*input)
print_output_info(ans, "ORIGINAL Model's output")
# and sample from the logits accordingly
# or you can use the generate function
# NEED TO LOOK AT THIS LATER IF REQUIRED IN SHARK.
# sampled = model.generate(temperature = 0.9, filter_thres = 0.9) # (1, 1024, 4)

View File

@@ -136,7 +136,7 @@ def get_vendor(triple):
return "Intel"
if arch in ["turing", "ampere", "pascal"]:
return "NVIDIA"
if arch == "ardeno":
if arch == "adreno":
return "Qualcomm"
if arch == "cpu":
if product == "swiftshader":

View File

@@ -114,6 +114,11 @@ def get_vulkan_target_triple(device_name):
# Intel Targets
elif any(x in device_name for x in ("A770", "A750")):
triple = f"arc-770-{system_os}"
# Adreno Targets
elif all(x in device_name for x in ("Adreno", "740")):
triple = f"adreno-a740-{system_os}"
else:
triple = None
return triple
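
A small usage sketch of the new mapping (the device-name string is an assumption; the triple format comes from the diff): a Vulkan device reporting both "Adreno" and "740" now resolves to an adreno-a740 triple, and get_vendor() recognizes the "adreno" architecture as Qualcomm.

# Assumed device name for illustration; system_os is filled in by the helper.
triple = get_vulkan_target_triple("Adreno (TM) 740")
# e.g. "adreno-a740-linux", where previously no triple was found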

View File

@@ -370,6 +370,7 @@ def transform_fx(fx_g):
torch.ops.aten.arange,
torch.ops.aten.empty,
torch.ops.aten.zeros,
torch.ops.aten.zeros_like,
]:
if node.kwargs.get("dtype") == torch.float32:
node.kwargs = kwargs_dict
@@ -525,6 +526,8 @@ def import_with_fx(
torch.ops.aten.split.Tensor,
torch.ops.aten.split_with_sizes,
torch.ops.aten.native_layer_norm,
torch.ops.aten.masked_fill.Tensor,
torch.ops.aten.masked_fill.Scalar,
]
),
)(*inputs)

View File

@@ -0,0 +1,88 @@
import os
from pathlib import Path
from shark_tuner.codegen_tuner import SharkCodegenTuner
from shark_tuner.iree_utils import (
dump_dispatches,
create_context,
export_module_to_mlir_file,
)
from shark_tuner.model_annotation import model_annotation
from shark_opt_wrapper import OPTForCausalLMModel
from transformers import AutoTokenizer, OPTForCausalLM
from shark.shark_importer import import_with_fx
NUM_ITERS = 400
MODEL_NAME = "facebook/opt-1.3b"
MODEL_FNAME = "opt-1_3b-causallm"
def load_mlir_module():
hf_model = OPTForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
opt_model = OPTForCausalLMModel(hf_model)
prompt = "What is the meaning of life?"
model_inputs = tokenizer(prompt, return_tensors="pt")
inputs = (
model_inputs["input_ids"],
model_inputs["attention_mask"],
)
(
mlir_module,
func_name,
) = import_with_fx(
model=opt_model,
inputs=inputs,
is_f16=False,
model_name=MODEL_NAME.split("/")[1],
)
return mlir_module, MODEL_NAME.split("/")[1]
def main():
#mlir_module, model_name = load_mlir_module()
# Get device and device specific arguments
device = "cpu"
# Dump model dispatches
model_name = MODEL_NAME
#generates_dir = "."
#if not os.path.exists(generates_dir):
# os.makedirs(generates_dir)
#dump_mlir = generates_dir / "temp.mlir"
dispatch_dir = f"./{MODEL_FNAME}_{device}_dispatches"
#export_module_to_mlir_file(mlir_module, dump_mlir)
#dump_dispatches(
# dump_mlir,
# device,
# dispatch_dir,
#)
# Tune each dispatch
dtype = "f32"
config_filename = f"{MODEL_FNAME}_{device}_configs.json"
for f_path in os.listdir(dispatch_dir):
if not f_path.endswith(".mlir"):
continue
model_dir = os.path.join(dispatch_dir, f_path)
tuner = SharkCodegenTuner(
model_dir,
device,
"random",
NUM_ITERS,
".",
dtype,
search_op="all",
batch_size=1,
config_filename=config_filename,
use_dispatch=True,
)
tuner.tune()
if __name__ == "__main__":
main()

View File

@@ -24,4 +24,5 @@ bert-large-uncased,True,hf,True,linalg,False,330M,"nlp;bert-variant;transformer-
bert-base-uncased,True,hf,False,stablehlo,False,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
gpt2,True,hf_causallm,False,stablehlo,True,125M,"nlp;transformer-encoder","-"
facebook/opt-125m,True,hf,False,stablehlo,True,125M,"nlp;transformer-encoder","-"
distilgpt2,True,hf,False,stablehlo,True,88M,"nlp;transformer-encoder","-"
distilgpt2,True,hf,False,stablehlo,True,88M,"nlp;transformer-encoder","-"
microsoft/deberta-v3-base,True,hf,False,stablehlo,True,88M,"nlp;transformer-encoder","-"
Rendered table (CSV row numbers; row 1 is the header):
row  model_name                 use_tracing  model_type   dynamic  mlir_type  decompose  param_count  tags                                   notes
24   bert-base-uncased          True         hf           False    stablehlo  False      109M         nlp;bert-variant;transformer-encoder   12 layers; 768 hidden; 12 attention heads
25   gpt2                       True         hf_causallm  False    stablehlo  True       125M         nlp;transformer-encoder                -
26   facebook/opt-125m          True         hf           False    stablehlo  True       125M         nlp;transformer-encoder                -
27   distilgpt2                 True         hf           False    stablehlo  True       88M          nlp;transformer-encoder                -
28   microsoft/deberta-v3-base  True         hf           False    stablehlo  True       88M          nlp;transformer-encoder                -