Remove nightly signing again

2026-01-11 14:58:11 -05:00 · 2024-05-16 05:43:56 -04:00
31 changed files with 618 additions and 685 deletions
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -56,7 +56,7 @@ jobs:
        pip freeze -l
        pyinstaller .\apps\shark_studio\shark_studio.spec
        mv ./dist/nodai_shark_studio.exe ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
-        signtool sign /f c:\g\shark_02152023.cer /fd certHash /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
+        #signtool sign /f c:\g\shark_02152023.cer /fd certHash /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
  
    - name: Upload Release Assets
      id: upload-release-assets
--- a/.github/workflows/test-studio.yml
+++ b/.github/workflows/test-studio.yml
@@ -81,5 +81,6 @@ jobs:
        source shark.venv/bin/activate
        pip install -r requirements.txt --no-cache-dir
        pip install -e .
-        # Disabled due to hang when exporting test llama2
-        # python apps/shark_studio/tests/api_test.py
+        pip uninstall -y torch
+        pip install torch==2.1.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+        python apps/shark_studio/tests/api_test.py
--- a/.gitignore
+++ b/.gitignore
@@ -164,7 +164,7 @@ cython_debug/
 # vscode related
 .vscode

-# Shark related artifacts
+# Shark related artefacts
 *venv/
 shark_tmp/
 *.vmfb
@@ -172,7 +172,6 @@ shark_tmp/
 tank/dict_configs.py
 *.csv
 reproducers/
-apps/shark_studio/web/configs

 # ORT related artefacts
 cache_models/
@@ -189,11 +188,6 @@ variants.json
 # models folder
 apps/stable_diffusion/web/models/

-# model artifacts (SHARK)
-*.tempfile
-*.mlir
-*.vmfb
-
 # Stencil annotators.
 stencil_annotator/

--- a/apps/shark_studio/api/llm.py
+++ b/apps/shark_studio/api/llm.py
@@ -3,13 +3,8 @@ from turbine_models.model_runner import vmfbRunner
 from turbine_models.gen_external_params.gen_external_params import gen_external_params
 import time
 from shark.iree_utils.compile_utils import compile_module_to_flatbuffer
-from apps.shark_studio.web.utils.file_utils import (
-    get_resource_path,
-    get_checkpoints_path,
-)
+from apps.shark_studio.web.utils.file_utils import get_resource_path
 from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
-from apps.shark_studio.api.utils import parse_device
-from urllib.request import urlopen
 import iree.runtime as ireert
 from itertools import chain
 import gc
@@ -70,7 +65,6 @@ class LanguageModel:
        use_system_prompt=True,
        streaming_llm=False,
    ):
-        _, _, self.triple = parse_device(device)
        self.hf_model_name = llm_model_map[model_name]["hf_model_name"]
        self.device = device.split("=>")[-1].strip()
        self.backend = self.device.split("://")[0]
@@ -161,9 +155,7 @@ class LanguageModel:
                use_auth_token=hf_auth_token,
            )
        elif not os.path.exists(self.tempfile_name):
-            self.torch_ir, self.tokenizer = llm_model_map[self.hf_model_name][
-                "initializer"
-            ](
+            self.torch_ir, self.tokenizer = llm_model_map[model_name]["initializer"](
                self.hf_model_name,
                hf_auth_token,
                compile_to="torch",
@@ -171,7 +163,6 @@ class LanguageModel:
                precision=self.precision,
                quantization=self.quantization,
                streaming_llm=self.streaming_llm,
-                decomp_attn=True,
            )
            with open(self.tempfile_name, "w+") as f:
                f.write(self.torch_ir)
@@ -201,27 +192,11 @@ class LanguageModel:
            )
        elif self.backend == "vulkan":
            flags.extend(["--iree-stream-resource-max-allocation-size=4294967296"])
-        elif self.backend == "rocm":
-            flags.extend(
-                [
-                    "--iree-codegen-llvmgpu-enable-transform-dialect-jit=false",
-                    "--iree-llvmgpu-enable-prefetch=true",
-                    "--iree-opt-outer-dim-concat=true",
-                    "--iree-flow-enable-aggressive-fusion",
-                ]
-            )
-            if "gfx9" in self.triple:
-                flags.extend(
-                    [
-                        f"--iree-codegen-transform-dialect-library={get_mfma_spec_path(self.triple, get_checkpoints_path())}",
-                        "--iree-codegen-llvmgpu-use-vector-distribution=true",
-                    ]
-                )
        flags.extend(llm_model_map[self.hf_model_name]["compile_flags"])
        flatbuffer_blob = compile_module_to_flatbuffer(
            self.tempfile_name,
            device=self.device,
-            frontend="auto",
+            frontend="torch",
            model_config_path=None,
            extra_args=flags,
            write_to=self.vmfb_name,
@@ -283,7 +258,8 @@ class LanguageModel:

            history.append(format_out(token))
            while (
-                format_out(token) != llm_model_map[self.hf_model_name]["stop_token"]
+                format_out(token)
+                != llm_model_map["meta-llama/Llama-2-7b-chat-hf"]["stop_token"]
                and len(history) < self.max_tokens
            ):
                dec_time = time.time()
@@ -297,7 +273,10 @@ class LanguageModel:

            self.prev_token_len = token_len + len(history)

-            if format_out(token) == llm_model_map[self.hf_model_name]["stop_token"]:
+            if (
+                format_out(token)
+                == llm_model_map["meta-llama/Llama-2-7b-chat-hf"]["stop_token"]
+            ):
                break

        for i in range(len(history)):
@@ -331,7 +310,7 @@ class LanguageModel:
                self.first_input = False

            history.append(int(token))
-            while token != llm_model_map[self.hf_model_name]["stop_token"]:
+            while token != llm_model_map["meta-llama/Llama-2-7b-chat-hf"]["stop_token"]:
                dec_time = time.time()
                result = self.hf_mod(token.reshape([1, 1]), past_key_values=pkv)
                history.append(int(token))
@@ -342,7 +321,7 @@ class LanguageModel:

            self.prev_token_len = token_len + len(history)

-            if token == llm_model_map[self.hf_model_name]["stop_token"]:
+            if token == llm_model_map["meta-llama/Llama-2-7b-chat-hf"]["stop_token"]:
                break
        for i in range(len(history)):
            if type(history[i]) != int:
@@ -352,17 +331,6 @@ class LanguageModel:
        return result_output, total_time


-def get_mfma_spec_path(target_chip, save_dir):
-    url = "https://raw.githubusercontent.com/iree-org/iree/main/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir"
-    attn_spec = urlopen(url).read().decode("utf-8")
-    spec_path = os.path.join(save_dir, "attention_and_matmul_spec_mfma.mlir")
-    if os.path.exists(spec_path):
-        return spec_path
-    with open(spec_path, "w") as f:
-        f.write(attn_spec)
-    return spec_path
-
-
 def llm_chat_api(InputData: dict):
    from datetime import datetime as dt

--- a/apps/shark_studio/api/sd.py
+++ b/apps/shark_studio/api/sd.py
@@ -1,82 +1,54 @@
 import gc
 import torch
-import gradio as gr
 import time
 import os
 import json
 import numpy as np
-import copy
-import importlib.util
-import sys
 from tqdm.auto import tqdm

 from pathlib import Path
 from random import randint
-from turbine_models.custom_models.sd_inference.sd_pipeline import SharkSDPipeline
-from turbine_models.custom_models.sdxl_inference.sdxl_compiled_pipeline import (
-    SharkSDXLPipeline,
-)
-
-
+from turbine_models.custom_models.sd_inference import clip, unet, vae
 from apps.shark_studio.api.controlnet import control_adapter_map
-from apps.shark_studio.api.utils import parse_device
 from apps.shark_studio.web.utils.state import status_label
 from apps.shark_studio.web.utils.file_utils import (
    safe_name,
    get_resource_path,
    get_checkpoints_path,
 )
-
+from apps.shark_studio.modules.pipeline import SharkPipelineBase
+from apps.shark_studio.modules.schedulers import get_schedulers
+from apps.shark_studio.modules.prompt_encoding import (
+    get_weighted_text_embeddings,
+)
 from apps.shark_studio.modules.img_processing import (
+    resize_stencil,
    save_output_img,
+    resamplers,
+    resampler_list,
 )

 from apps.shark_studio.modules.ckpt_processing import (
    preprocessCKPT,
-    save_irpa,
+    process_custom_pipe_weights,
 )
+from transformers import CLIPTokenizer
+from diffusers.image_processor import VaeImageProcessor

-EMPTY_SD_MAP = {
-    "clip": None,
-    "scheduler": None,
-    "unet": None,
-    "vae_decode": None,
-}
-
-EMPTY_SDXL_MAP = {
-    "prompt_encoder": None,
-    "scheduled_unet": None,
-    "vae_decode": None,
-    "pipeline": None,
-    "full_pipeline": None,
-}
-
-EMPTY_FLAGS = {
-    "clip": None,
-    "unet": None,
-    "vae": None,
-    "pipeline": None,
+sd_model_map = {
+    "clip": {
+        "initializer": clip.export_clip_model,
+    },
+    "unet": {
+        "initializer": unet.export_unet_model,
+    },
+    "vae_decode": {
+        "initializer": vae.export_vae_model,
+    },
 }


-def load_script(source, module_name):
-    """
-    reads file source and loads it as a module
-
-    :param source: file to load
-    :param module_name: name of module to register in sys.modules
-    :return: loaded module
-    """
-
-    spec = importlib.util.spec_from_file_location(module_name, source)
-    module = importlib.util.module_from_spec(spec)
-    sys.modules[module_name] = module
-    spec.loader.exec_module(module)
-
-    return module
-
-
-class StableDiffusion:
+class StableDiffusion(SharkPipelineBase):
    # This class is responsible for executing image generation and creating
    # /managing a set of compiled modules to run Stable Diffusion. The init
    # aims to be as general as possible, and the class will infer and compile
@@ -89,45 +61,66 @@ class StableDiffusion:
        height: int,
        width: int,
        batch_size: int,
-        steps: int,
-        scheduler: str,
        precision: str,
        device: str,
-        target_triple: str = None,
        custom_vae: str = None,
        num_loras: int = 0,
        import_ir: bool = True,
        is_controlled: bool = False,
-        external_weights: str = "safetensors",
+        hf_auth_token=None,
    ):
+        self.model_max_length = 77
+        self.batch_size = batch_size
        self.precision = precision
-        self.compiled_pipeline = False
-        self.base_model_id = base_model_id
-        self.custom_vae = custom_vae
-        self.is_sdxl = "xl" in self.base_model_id.lower()
-        self.is_custom = ".py" in self.base_model_id.lower()
-        if self.is_custom:
-            custom_module = load_script(
-                os.path.join(get_checkpoints_path("scripts"), self.base_model_id),
-                "custom_pipeline",
-            )
-            self.turbine_pipe = custom_module.StudioPipeline
-            self.model_map = custom_module.MODEL_MAP
-        elif self.is_sdxl:
-            self.turbine_pipe = SharkSDXLPipeline
-            self.model_map = EMPTY_SDXL_MAP
-        else:
-            self.turbine_pipe = SharkSDPipeline
-            self.model_map = EMPTY_SD_MAP
-        max_length = 64
-        target_backend, self.rt_device, triple = parse_device(device, target_triple)
+        self.dtype = torch.float16 if precision == "fp16" else torch.float32
+        self.height = height
+        self.width = width
+        self.scheduler_obj = {}
+        static_kwargs = {
+            "pipe": {
+                "external_weights": "safetensors",
+            },
+            "clip": {"hf_model_name": base_model_id},
+            "unet": {
+                "hf_model_name": base_model_id,
+                "unet_model": unet.UnetModel(hf_model_name=base_model_id),
+                "batch_size": batch_size,
+                # "is_controlled": is_controlled,
+                # "num_loras": num_loras,
+                "height": height,
+                "width": width,
+                "precision": precision,
+                "max_length": self.model_max_length,
+            },
+            "vae_encode": {
+                "hf_model_name": base_model_id,
+                "vae_model": vae.VaeModel(
+                    hf_model_name=custom_vae if custom_vae else base_model_id,
+                ),
+                "batch_size": batch_size,
+                "height": height,
+                "width": width,
+                "precision": precision,
+            },
+            "vae_decode": {
+                "hf_model_name": base_model_id,
+                "vae_model": vae.VaeModel(
+                    hf_model_name=custom_vae if custom_vae else base_model_id,
+                ),
+                "batch_size": batch_size,
+                "height": height,
+                "width": width,
+                "precision": precision,
+            },
+        }
+        super().__init__(sd_model_map, base_model_id, static_kwargs, device, import_ir)
        pipe_id_list = [
            safe_name(base_model_id),
            str(batch_size),
-            str(max_length),
+            str(self.model_max_length),
            f"{str(height)}x{str(width)}",
            precision,
-            triple,
+            self.device,
        ]
        if num_loras > 0:
            pipe_id_list.append(str(num_loras) + "lora")
@@ -136,147 +129,305 @@ class StableDiffusion:
        if custom_vae:
            pipe_id_list.append(custom_vae)
        self.pipe_id = "_".join(pipe_id_list)
-        self.pipeline_dir = Path(os.path.join(get_checkpoints_path(), self.pipe_id))
-        self.weights_path = Path(
-            os.path.join(
-                get_checkpoints_path(), safe_name(self.base_model_id + "_" + precision)
-            )
+        print(f"\n[LOG] Pipeline initialized with pipe_id: {self.pipe_id}.")
+        del static_kwargs
+        gc.collect()
+
+    def prepare_pipe(self, custom_weights, adapters, embeddings, is_img2img):
+        print(f"\n[LOG] Preparing pipeline...")
+        self.is_img2img = is_img2img
+        self.schedulers = get_schedulers(self.base_model_id)
+
+        self.weights_path = os.path.join(
+            get_checkpoints_path(), self.safe_name(self.base_model_id)
        )
        if not os.path.exists(self.weights_path):
            os.mkdir(self.weights_path)

-        decomp_attn = True
-        attn_spec = None
-        if triple in ["gfx940", "gfx942", "gfx90a"]:
-            decomp_attn = False
-            attn_spec = "mfma"
-        elif triple in ["gfx1100", "gfx1103", "gfx1150"]:
-            decomp_attn = False
-            attn_spec = "wmma"
-            if triple in ["gfx1103", "gfx1150"]:
-                # external weights have issues on igpu
-                external_weights = None
-        elif target_backend == "llvm-cpu":
-            decomp_attn = False
+        for model in adapters:
+            self.model_map[model] = adapters[model]

-        self.sd_pipe = self.turbine_pipe(
-            hf_model_name=base_model_id,
-            scheduler_id=scheduler,
-            height=height,
-            width=width,
-            precision=precision,
-            max_length=max_length,
-            batch_size=batch_size,
-            num_inference_steps=steps,
-            device=target_backend,
-            iree_target_triple=triple,
-            ireec_flags=EMPTY_FLAGS,
-            attn_spec=attn_spec,
-            decomp_attn=decomp_attn,
-            pipeline_dir=self.pipeline_dir,
-            external_weights_dir=self.weights_path,
-            external_weights=external_weights,
-            custom_vae=custom_vae,
-        )
-        print(f"\n[LOG] Pipeline initialized with pipe_id: {self.pipe_id}.")
-        gc.collect()
-
-    def prepare_pipe(
-        self, custom_weights, adapters, embeddings, is_img2img, compiled_pipeline
-    ):
-        print(f"\n[LOG] Preparing pipeline...")
-        self.is_img2img = False
-        mlirs = copy.deepcopy(self.model_map)
-        vmfbs = copy.deepcopy(self.model_map)
-        weights = copy.deepcopy(self.model_map)
-        if not self.is_sdxl:
-            compiled_pipeline = False
-        self.compiled_pipeline = compiled_pipeline
-
-        if custom_weights:
-            custom_weights = os.path.join(
-                get_checkpoints_path("checkpoints"),
-                safe_name(self.base_model_id.split("/")[-1]),
-                custom_weights,
-            )
-            diffusers_weights_path = preprocessCKPT(custom_weights, self.precision)
-            for key in weights:
-                if key in ["scheduled_unet", "unet"]:
-                    unet_weights_path = os.path.join(
-                        diffusers_weights_path,
-                        "unet",
-                        "diffusion_pytorch_model.safetensors",
+        for submodel in self.static_kwargs:
+            if custom_weights:
+                custom_weights_params, _ = process_custom_pipe_weights(custom_weights)
+                if submodel not in ["clip", "clip2"]:
+                    self.static_kwargs[submodel][
+                        "external_weights"
+                    ] = custom_weights_params
+                else:
+                    self.static_kwargs[submodel]["external_weight_path"] = os.path.join(
+                        self.weights_path, submodel + ".safetensors"
                    )
-                    weights[key] = save_irpa(unet_weights_path, "unet.")
+            else:
+                self.static_kwargs[submodel]["external_weight_path"] = os.path.join(
+                    self.weights_path, submodel + ".safetensors"
+                )

-                elif key in ["clip", "prompt_encoder"]:
-                    if not self.is_sdxl:
-                        sd1_path = os.path.join(
-                            diffusers_weights_path, "text_encoder", "model.safetensors"
-                        )
-                        weights[key] = save_irpa(sd1_path, "text_encoder_model.")
-                    else:
-                        clip_1_path = os.path.join(
-                            diffusers_weights_path, "text_encoder", "model.safetensors"
-                        )
-                        clip_2_path = os.path.join(
-                            diffusers_weights_path,
-                            "text_encoder_2",
-                            "model.safetensors",
-                        )
-                        weights[key] = [
-                            save_irpa(clip_1_path, "text_encoder_model_1."),
-                            save_irpa(clip_2_path, "text_encoder_model_2."),
-                        ]
-
-                elif key in ["vae_decode"] and weights[key] is None:
-                    vae_weights_path = os.path.join(
-                        diffusers_weights_path,
-                        "vae",
-                        "diffusion_pytorch_model.safetensors",
-                    )
-                    weights[key] = save_irpa(vae_weights_path, "vae.")
-
-        vmfbs, weights = self.sd_pipe.check_prepared(
-            mlirs, vmfbs, weights, interactive=False
-        )
-        print(f"\n[LOG] Loading pipeline to device {self.rt_device}.")
-        self.sd_pipe.load_pipeline(
-            vmfbs, weights, self.rt_device, self.compiled_pipeline
-        )
-        print(
-            "\n[LOG] Pipeline successfully prepared for runtime. Generating images..."
-        )
+        self.get_compiled_map(pipe_id=self.pipe_id)
+        print("\n[LOG] Pipeline successfully prepared for runtime.")
        return

+    def encode_prompts_weight(
+        self,
+        prompt,
+        negative_prompt,
+        do_classifier_free_guidance=True,
+    ):
+        # Encodes the prompt into text encoder hidden states.
+        self.load_submodels(["clip"])
+        self.tokenizer = CLIPTokenizer.from_pretrained(
+            self.base_model_id,
+            subfolder="tokenizer",
+        )
+        clip_inf_start = time.time()
+
+        text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
+            pipe=self,
+            prompt=prompt,
+            uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
+        )
+
+        if do_classifier_free_guidance:
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        pad = (0, 0) * (len(text_embeddings.shape) - 2)
+        pad = pad + (
+            0,
+            self.static_kwargs["unet"]["max_length"] - text_embeddings.shape[1],
+        )
+        text_embeddings = torch.nn.functional.pad(text_embeddings, pad)
+
+        # SHARK: Report clip inference time
+        clip_inf_time = (time.time() - clip_inf_start) * 1000
+        if self.ondemand:
+            self.unload_submodels(["clip"])
+            gc.collect()
+        print(f"\n[LOG] Clip Inference time (ms) = {clip_inf_time:.3f}")
+
+        return text_embeddings.numpy().astype(np.float16)
+
+    def prepare_latents(
+        self,
+        generator,
+        num_inference_steps,
+        image,
+        strength,
+    ):
+        noise = torch.randn(
+            (
+                self.batch_size,
+                4,
+                self.height // 8,
+                self.width // 8,
+            ),
+            generator=generator,
+            dtype=self.dtype,
+        ).to("cpu")
+
+        self.scheduler.set_timesteps(num_inference_steps)
+        if self.is_img2img:
+            init_timestep = min(
+                int(num_inference_steps * strength), num_inference_steps
+            )
+            t_start = max(num_inference_steps - init_timestep, 0)
+            timesteps = self.scheduler.timesteps[t_start:]
+            latents = self.encode_image(image)
+            latents = self.scheduler.add_noise(latents, noise, timesteps[0].repeat(1))
+            return latents, [timesteps]
+        else:
+            self.scheduler.is_scale_input_called = True
+            latents = noise * self.scheduler.init_noise_sigma
+            return latents, self.scheduler.timesteps
+
+    def encode_image(self, input_image):
+        self.load_submodels(["vae_encode"])
+        vae_encode_start = time.time()
+        latents = self.run("vae_encode", input_image)
+        vae_inf_time = (time.time() - vae_encode_start) * 1000
+        if self.ondemand:
+            self.unload_submodels(["vae_encode"])
+        print(f"\n[LOG] VAE Encode Inference time (ms): {vae_inf_time:.3f}")
+
+        return latents
+
+    def produce_img_latents(
+        self,
+        latents,
+        text_embeddings,
+        guidance_scale,
+        total_timesteps,
+        cpu_scheduling,
+        mask=None,
+        masked_image_latents=None,
+        return_all_latents=False,
+    ):
+        # self.status = SD_STATE_IDLE
+        step_time_sum = 0
+        latent_history = [latents]
+        text_embeddings = torch.from_numpy(text_embeddings).to(self.dtype)
+        text_embeddings_numpy = text_embeddings.detach().numpy()
+        guidance_scale = torch.Tensor([guidance_scale]).to(self.dtype)
+        self.load_submodels(["unet"])
+        for i, t in tqdm(enumerate(total_timesteps)):
+            step_start_time = time.time()
+            timestep = torch.tensor([t]).to(self.dtype).detach().numpy()
+            latent_model_input = self.scheduler.scale_model_input(latents, t).to(
+                self.dtype
+            )
+            if mask is not None and masked_image_latents is not None:
+                latent_model_input = torch.cat(
+                    [
+                        torch.from_numpy(np.asarray(latent_model_input)).to(self.dtype),
+                        mask,
+                        masked_image_latents,
+                    ],
+                    dim=1,
+                ).to(self.dtype)
+            if cpu_scheduling:
+                latent_model_input = latent_model_input.detach().numpy()
+
+            # Profiling Unet.
+            # profile_device = start_profiling(file_path="unet.rdc")
+            noise_pred = self.run(
+                "unet",
+                [
+                    latent_model_input,
+                    timestep,
+                    text_embeddings_numpy,
+                    guidance_scale,
+                ],
+            )
+            # end_profiling(profile_device)
+
+            if cpu_scheduling:
+                noise_pred = torch.from_numpy(noise_pred.to_host())
+                latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+            else:
+                latents = self.run("scheduler_step", (noise_pred, t, latents))
+
+            latent_history.append(latents)
+            step_time = (time.time() - step_start_time) * 1000
+            # print(
+            #     f"\n [LOG] step = {i} | timestep = {t} | time = {step_time:.2f}ms"
+            # )
+            step_time_sum += step_time
+
+            # if self.status == SD_STATE_CANCEL:
+            #    break
+
+        if self.ondemand:
+            self.unload_submodels(["unet"])
+            gc.collect()
+
+        avg_step_time = step_time_sum / len(total_timesteps)
+        print(f"\n[LOG] Average step time: {avg_step_time}ms/it")
+
+        if not return_all_latents:
+            return latents
+        all_latents = torch.cat(latent_history, dim=0)
+        return all_latents
+
+    def decode_latents(self, latents, cpu_scheduling=True):
+        latents_numpy = latents.to(self.dtype)
+        if cpu_scheduling:
+            latents_numpy = latents.detach().numpy()
+
+        # profile_device = start_profiling(file_path="vae.rdc")
+        vae_start = time.time()
+        images = self.run("vae_decode", latents_numpy).to_host()
+        vae_inf_time = (time.time() - vae_start) * 1000
+        # end_profiling(profile_device)
+        print(f"\n[LOG] VAE Inference time (ms): {vae_inf_time:.3f}")
+
+        images = torch.from_numpy(images).permute(0, 2, 3, 1).float().numpy()
+        pil_images = self.image_processor.numpy_to_pil(images)
+        return pil_images
+
    def generate_images(
        self,
        prompt,
        negative_prompt,
        image,
+        scheduler,
+        steps,
        strength,
        guidance_scale,
        seed,
        ondemand,
+        repeatable_seeds,
        resample_type,
        control_mode,
        hints,
    ):
-        img = self.sd_pipe.generate_images(
+        # TODO: Batched args (loop below only rebinds `arg`; results are unused)
+        self.image_processor = VaeImageProcessor(do_convert_rgb=True)
+        self.scheduler = self.schedulers[scheduler]
+        self.ondemand = ondemand
+        if self.is_img2img:
+            image, _ = self.image_processor.preprocess(image, resample_type)
+        else:
+            image = None
+
+        print("\n[LOG] Generating images...")
+        batched_args = [
+            prompt,
+            negative_prompt,
+            image,
+        ]
+        for arg in batched_args:
+            if not isinstance(arg, list):
+                arg = [arg] * self.batch_size
+            if len(arg) < self.batch_size:
+                arg = arg * self.batch_size
+            else:
+                arg = [arg[i] for i in range(self.batch_size)]
+
+        text_embeddings = self.encode_prompts_weight(
            prompt,
            negative_prompt,
-            1,
-            guidance_scale,
-            seed,
-            return_imgs=True,
        )
-        return img
+
+        uint32_info = np.iinfo(np.uint32)
+        uint32_min, uint32_max = uint32_info.min, uint32_info.max
+        if seed < uint32_min or seed >= uint32_max:
+            seed = randint(uint32_min, uint32_max)
+
+        generator = torch.manual_seed(seed)
+
+        init_latents, final_timesteps = self.prepare_latents(
+            generator=generator,
+            num_inference_steps=steps,
+            image=image,
+            strength=strength,
+        )
+
+        latents = self.produce_img_latents(
+            latents=init_latents,
+            text_embeddings=text_embeddings,
+            guidance_scale=guidance_scale,
+            total_timesteps=final_timesteps,
+            cpu_scheduling=True,  # until we have schedulers through Turbine
+        )
+
+        # Img latents -> PIL images
+        all_imgs = []
+        self.load_submodels(["vae_decode"])
+        for i in tqdm(range(0, latents.shape[0], self.batch_size)):
+            imgs = self.decode_latents(
+                latents=latents[i : i + self.batch_size],
+                cpu_scheduling=True,
+            )
+            all_imgs.extend(imgs)
+        if self.ondemand:
+            self.unload_submodels(["vae_decode"])
+
+        return all_imgs


 def shark_sd_fn_dict_input(
    sd_kwargs: dict,
 ):
-    print("\n[LOG] Submitting Request...")
+    print("[LOG] Submitting Request...")

    for key in sd_kwargs:
        if sd_kwargs[key] in [None, []]:
@@ -286,34 +437,9 @@ def shark_sd_fn_dict_input(
        if key == "seed":
            sd_kwargs[key] = int(sd_kwargs[key])

-    # TODO: move these checks into the UI code so we don't have gradio warnings in a generalized dict input function.
-    if not sd_kwargs["device"]:
-        gr.Warning("No device specified. Please specify a device.")
-        return None, ""
-    if sd_kwargs["height"] not in [512, 1024]:
-        gr.Warning("Height must be 512 or 1024. This is a temporary limitation.")
-        return None, ""
-    if sd_kwargs["height"] != sd_kwargs["width"]:
-        gr.Warning("Height and width must be the same. This is a temporary limitation.")
-        return None, ""
-    if sd_kwargs["base_model_id"] == "stabilityai/sdxl-turbo":
-        if sd_kwargs["steps"] > 10:
-            gr.Warning("Max steps for sdxl-turbo is 10. 1 to 4 steps are recommended.")
-            return None, ""
-        if sd_kwargs["guidance_scale"] > 3:
-            gr.Warning(
-                "sdxl-turbo CFG scale should be less than 2.0 if using negative prompt, 0 otherwise."
-            )
-            return None, ""
-    if sd_kwargs["target_triple"] == "":
-        if parse_device(sd_kwargs["device"], sd_kwargs["target_triple"])[2] == "":
-            gr.Warning(
-                "Target device architecture could not be inferred. Please specify a target triple, e.g. 'gfx1100' for a Radeon 7900xtx."
-            )
-            return None, ""
-
-    generated_imgs = yield from shark_sd_fn(**sd_kwargs)
-    return generated_imgs
+    for i in range(1):
+        generated_imgs = yield from shark_sd_fn(**sd_kwargs)
+        yield generated_imgs


 def shark_sd_fn(
@@ -334,9 +460,8 @@ def shark_sd_fn(
    custom_vae: str,
    precision: str,
    device: str,
-    target_triple: str,
    ondemand: bool,
-    compiled_pipeline: bool,
+    repeatable_seeds: bool,
    resample_type: str,
    controlnets: dict,
    embeddings: dict,
@@ -346,6 +471,8 @@ def shark_sd_fn(
        sd_init_image = [sd_init_image]
    is_img2img = True if sd_init_image[0] is not None else False

+    print("\n[LOG] Performing Stable Diffusion Pipeline setup...")
+
    from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
    import apps.shark_studio.web.utils.globals as global_obj

@@ -354,7 +481,6 @@ def shark_sd_fn(
    control_mode = None
    hints = []
    num_loras = 0
-    import_ir = True
    for i in embeddings:
        num_loras += 1 if embeddings[i] else 0
    if "model" in controlnets:
@@ -386,29 +512,28 @@ def shark_sd_fn(
        "batch_size": batch_size,
        "precision": precision,
        "device": device,
-        "target_triple": target_triple,
        "custom_vae": custom_vae,
        "num_loras": num_loras,
-        "import_ir": import_ir,
+        "import_ir": cmd_opts.import_mlir,
        "is_controlled": is_controlled,
-        "steps": steps,
-        "scheduler": scheduler,
    }
    submit_prep_kwargs = {
        "custom_weights": custom_weights,
        "adapters": adapters,
        "embeddings": embeddings,
        "is_img2img": is_img2img,
-        "compiled_pipeline": compiled_pipeline,
    }
    submit_run_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "image": sd_init_image,
+        "steps": steps,
+        "scheduler": scheduler,
        "strength": strength,
        "guidance_scale": guidance_scale,
        "seed": seed,
        "ondemand": ondemand,
+        "repeatable_seeds": repeatable_seeds,
        "resample_type": resample_type,
        "control_mode": control_mode,
        "hints": hints,
@@ -441,35 +566,22 @@ def shark_sd_fn(
    for current_batch in range(batch_count):
        start_time = time.time()
        out_imgs = global_obj.get_sd_obj().generate_images(**submit_run_kwargs)
-        if not isinstance(out_imgs, list):
-            out_imgs = [out_imgs]
-        # total_time = time.time() - start_time
-        # text_output = f"Total image(s) generation time: {total_time:.4f}sec"
-        # print(f"\n[LOG] {text_output}")
+        total_time = time.time() - start_time
+        text_output = f"Total image(s) generation time: {total_time:.4f}sec"
+        print(f"\n[LOG] {text_output}")
        # if global_obj.get_sd_status() == SD_STATE_CANCEL:
        #     break
        # else:
-        for batch in range(batch_size):
-            save_output_img(
-                out_imgs[batch],
-                seed,
-                sd_kwargs,
-            )
+        save_output_img(
+            out_imgs[current_batch],
+            seed,
+            sd_kwargs,
+        )
        generated_imgs.extend(out_imgs)
-        # TODO: make seed changes over batch counts more configurable.
-        submit_run_kwargs["seed"] = submit_run_kwargs["seed"] + 1
        yield generated_imgs, status_label(
            "Stable Diffusion", current_batch + 1, batch_count, batch_size
        )
-    return (generated_imgs, "")
-
-
-def unload_sd():
-    print("Unloading models.")
-    import apps.shark_studio.web.utils.globals as global_obj
-
-    global_obj.clear_cache()
-    gc.collect()
+    return generated_imgs, ""


 def cancel_sd():
@@ -484,10 +596,6 @@ def view_json_file(file_path):
    return content


-def safe_name(name):
-    return name.replace("/", "_").replace("\\", "_").replace(".", "_")
-
-
 if __name__ == "__main__":
    from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
    import apps.shark_studio.web.utils.globals as global_obj
--- a/apps/shark_studio/api/utils.py
+++ b/apps/shark_studio/api/utils.py
@@ -52,13 +52,6 @@ def get_available_devices():
    set_iree_runtime_flags()

    available_devices = []
-    rocm_devices = get_devices_by_name("rocm")
-    available_devices.extend(rocm_devices)
-    cpu_device = get_devices_by_name("cpu-sync")
-    available_devices.extend(cpu_device)
-    cpu_device = get_devices_by_name("cpu-task")
-    available_devices.extend(cpu_device)
-
    from shark.iree_utils.vulkan_utils import (
        get_all_vulkan_devices,
    )
@@ -71,28 +64,17 @@ def get_available_devices():
        id += 1
    if id != 0:
        print(f"vulkan devices are available.")
-
    available_devices.extend(vulkan_devices)
    metal_devices = get_devices_by_name("metal")
    available_devices.extend(metal_devices)
    cuda_devices = get_devices_by_name("cuda")
    available_devices.extend(cuda_devices)
-    hip_devices = get_devices_by_name("hip")
-    available_devices.extend(hip_devices)
-
-    for idx, device_str in enumerate(available_devices):
-        if "AMD Radeon(TM) Graphics =>" in device_str:
-            igpu_id_candidates = [
-                x.split("w/")[-1].split("=>")[0]
-                for x in available_devices
-                if "M Graphics" in x
-            ]
-            for igpu_name in igpu_id_candidates:
-                if igpu_name:
-                    available_devices[idx] = device_str.replace(
-                        "AMD Radeon(TM) Graphics", igpu_name
-                    )
-                break
+    rocm_devices = get_devices_by_name("rocm")
+    available_devices.extend(rocm_devices)
+    cpu_device = get_devices_by_name("cpu-sync")
+    available_devices.extend(cpu_device)
+    cpu_device = get_devices_by_name("cpu-task")
+    available_devices.extend(cpu_device)
    return available_devices


@@ -145,57 +127,6 @@ def set_iree_runtime_flags():
    set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)


-def parse_device(device_str, target_override=""):
-    from shark.iree_utils.compile_utils import (
-        clean_device_info,
-        get_iree_target_triple,
-        iree_target_map,
-    )
-
-    rt_driver, device_id = clean_device_info(device_str)
-    target_backend = iree_target_map(rt_driver)
-    if device_id:
-        rt_device = f"{rt_driver}://{device_id}"
-    else:
-        rt_device = rt_driver
-
-    if target_override:
-        return target_backend, rt_device, target_override
-    match target_backend:
-        case "vulkan-spirv":
-            triple = get_iree_target_triple(device_str)
-            return target_backend, rt_device, triple
-        case "rocm":
-            triple = get_rocm_target_chip(device_str)
-            return target_backend, rt_device, triple
-        case "llvm-cpu":
-            return "llvm-cpu", "local-task", "x86_64-linux-gnu"
-
-
-def get_rocm_target_chip(device_str):
-    # TODO: Use a data file to map device_str to target chip.
-    rocm_chip_map = {
-        "6700": "gfx1031",
-        "6800": "gfx1030",
-        "6900": "gfx1030",
-        "7900": "gfx1100",
-        "MI300X": "gfx942",
-        "MI300A": "gfx940",
-        "MI210": "gfx90a",
-        "MI250": "gfx90a",
-        "MI100": "gfx908",
-        "MI50": "gfx906",
-        "MI60": "gfx906",
-        "780M": "gfx1103",
-    }
-    for key in rocm_chip_map:
-        if key in device_str:
-            return rocm_chip_map[key]
-    raise AssertionError(
-        f"Device {device_str} not recognized. Please file an issue at https://github.com/nod-ai/SHARK/issues."
-    )
-
-
 def get_all_devices(driver_name):
    """
    Inputs: driver_name
--- a/apps/shark_studio/modules/ckpt_processing.py
+++ b/apps/shark_studio/modules/ckpt_processing.py
@@ -2,11 +2,6 @@ import os
 import json
 import re
 import requests
-import torch
-import safetensors
-from shark_turbine.aot.params import (
-    ParameterArchiveBuilder,
-)
 from io import BytesIO
 from pathlib import Path
 from tqdm import tqdm
@@ -20,21 +15,21 @@ from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
 )


-def get_path_to_diffusers_checkpoint(custom_weights, precision="fp16"):
+def get_path_to_diffusers_checkpoint(custom_weights):
    path = Path(custom_weights)
    diffusers_path = path.parent.absolute()
-    diffusers_directory_name = os.path.join("diffusers", path.stem + f"_{precision}")
+    diffusers_directory_name = os.path.join("diffusers", path.stem)
    complete_path_to_diffusers = diffusers_path / diffusers_directory_name
    complete_path_to_diffusers.mkdir(parents=True, exist_ok=True)
    path_to_diffusers = complete_path_to_diffusers.as_posix()
    return path_to_diffusers


-def preprocessCKPT(custom_weights, precision="fp16", is_inpaint=False):
-    path_to_diffusers = get_path_to_diffusers_checkpoint(custom_weights, precision)
+def preprocessCKPT(custom_weights, is_inpaint=False):
+    path_to_diffusers = get_path_to_diffusers_checkpoint(custom_weights)
    if next(Path(path_to_diffusers).iterdir(), None):
        print("Checkpoint already loaded at : ", path_to_diffusers)
-        return path_to_diffusers
+        return
    else:
        print(
            "Diffusers' checkpoint will be identified here : ",
@@ -56,24 +51,8 @@ def preprocessCKPT(custom_weights, precision="fp16", is_inpaint=False):
        from_safetensors=from_safetensors,
        num_in_channels=num_in_channels,
    )
-    if precision == "fp16":
-        pipe.to(dtype=torch.float16)
    pipe.save_pretrained(path_to_diffusers)
-    del pipe
    print("Loading complete")
-    return path_to_diffusers
-
-
-def save_irpa(weights_path, prepend_str):
-    weights = safetensors.torch.load_file(weights_path)
-    archive = ParameterArchiveBuilder()
-    for key in weights.keys():
-        new_key = prepend_str + key
-        archive.add_tensor(new_key, weights[key])
-
-    irpa_file = weights_path.replace(".safetensors", ".irpa")
-    archive.save(irpa_file)
-    return irpa_file


 def convert_original_vae(vae_checkpoint):
--- a/apps/shark_studio/modules/schedulers.py
+++ b/apps/shark_studio/modules/schedulers.py
@@ -24,47 +24,47 @@ def get_schedulers(model_id):
        model_id,
        subfolder="scheduler",
    )
-    # schedulers["DDPM"] = DDPMScheduler.from_pretrained(
-    #     model_id,
-    #     subfolder="scheduler",
-    # )
-    # schedulers["KDPM2Discrete"] = KDPM2DiscreteScheduler.from_pretrained(
-    #     model_id,
-    #     subfolder="scheduler",
-    # )
-    # schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
-    #     model_id,
-    #     subfolder="scheduler",
-    # )
-    # schedulers["DDIM"] = DDIMScheduler.from_pretrained(
-    #     model_id,
-    #     subfolder="scheduler",
-    # )
-    # schedulers["LCMScheduler"] = LCMScheduler.from_pretrained(
-    #     model_id,
-    #     subfolder="scheduler",
-    # )
-    # schedulers["DPMSolverMultistep"] = DPMSolverMultistepScheduler.from_pretrained(
-    #     model_id, subfolder="scheduler", algorithm_type="dpmsolver"
-    # )
-    # schedulers["DPMSolverMultistep++"] = DPMSolverMultistepScheduler.from_pretrained(
-    #     model_id, subfolder="scheduler", algorithm_type="dpmsolver++"
-    # )
-    # schedulers["DPMSolverMultistepKarras"] = (
-    #     DPMSolverMultistepScheduler.from_pretrained(
-    #         model_id,
-    #         subfolder="scheduler",
-    #         use_karras_sigmas=True,
-    #     )
-    # )
-    # schedulers["DPMSolverMultistepKarras++"] = (
-    #     DPMSolverMultistepScheduler.from_pretrained(
-    #         model_id,
-    #         subfolder="scheduler",
-    #         algorithm_type="dpmsolver++",
-    #         use_karras_sigmas=True,
-    #     )
-    # )
+    schedulers["DDPM"] = DDPMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers["KDPM2Discrete"] = KDPM2DiscreteScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers["DDIM"] = DDIMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers["LCMScheduler"] = LCMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers["DPMSolverMultistep"] = DPMSolverMultistepScheduler.from_pretrained(
+        model_id, subfolder="scheduler", algorithm_type="dpmsolver"
+    )
+    schedulers["DPMSolverMultistep++"] = DPMSolverMultistepScheduler.from_pretrained(
+        model_id, subfolder="scheduler", algorithm_type="dpmsolver++"
+    )
+    schedulers["DPMSolverMultistepKarras"] = (
+        DPMSolverMultistepScheduler.from_pretrained(
+            model_id,
+            subfolder="scheduler",
+            use_karras_sigmas=True,
+        )
+    )
+    schedulers["DPMSolverMultistepKarras++"] = (
+        DPMSolverMultistepScheduler.from_pretrained(
+            model_id,
+            subfolder="scheduler",
+            algorithm_type="dpmsolver++",
+            use_karras_sigmas=True,
+        )
+    )
    schedulers["EulerDiscrete"] = EulerDiscreteScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
@@ -75,24 +75,24 @@ def get_schedulers(model_id):
            subfolder="scheduler",
        )
    )
-    # schedulers["DEISMultistep"] = DEISMultistepScheduler.from_pretrained(
-    #     model_id,
-    #     subfolder="scheduler",
-    # )
-    # schedulers["DPMSolverSinglestep"] = DPMSolverSinglestepScheduler.from_pretrained(
-    #     model_id,
-    #     subfolder="scheduler",
-    # )
-    # schedulers["KDPM2AncestralDiscrete"] = (
-    #     KDPM2AncestralDiscreteScheduler.from_pretrained(
-    #         model_id,
-    #         subfolder="scheduler",
-    #     )
-    # )
-    # schedulers["HeunDiscrete"] = HeunDiscreteScheduler.from_pretrained(
-    #     model_id,
-    #     subfolder="scheduler",
-    # )
+    schedulers["DEISMultistep"] = DEISMultistepScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers["DPMSolverSinglestep"] = DPMSolverSinglestepScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
+    schedulers["KDPM2AncestralDiscrete"] = (
+        KDPM2AncestralDiscreteScheduler.from_pretrained(
+            model_id,
+            subfolder="scheduler",
+        )
+    )
+    schedulers["HeunDiscrete"] = HeunDiscreteScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+    )
    return schedulers


@@ -101,18 +101,17 @@ def export_scheduler_model(model):


 scheduler_model_map = {
-    "PNDM": export_scheduler_model("PNDMScheduler"),
-    # "DPMSolverSDE": export_scheduler_model("DpmSolverSDEScheduler"),
    "EulerDiscrete": export_scheduler_model("EulerDiscreteScheduler"),
    "EulerAncestralDiscrete": export_scheduler_model("EulerAncestralDiscreteScheduler"),
-    # "LCM": export_scheduler_model("LCMScheduler"),
-    # "LMSDiscrete": export_scheduler_model("LMSDiscreteScheduler"),
-    # "DDPM": export_scheduler_model("DDPMScheduler"),
-    # "DDIM": export_scheduler_model("DDIMScheduler"),
-    # "DPMSolverMultistep": export_scheduler_model("DPMSolverMultistepScheduler"),
-    # "KDPM2Discrete": export_scheduler_model("KDPM2DiscreteScheduler"),
-    # "DEISMultistep": export_scheduler_model("DEISMultistepScheduler"),
-    # "DPMSolverSinglestep": export_scheduler_model("DPMSolverSingleStepScheduler"),
-    # "KDPM2AncestralDiscrete": export_scheduler_model("KDPM2AncestralDiscreteScheduler"),
-    # "HeunDiscrete": export_scheduler_model("HeunDiscreteScheduler"),
+    "LCM": export_scheduler_model("LCMScheduler"),
+    "LMSDiscrete": export_scheduler_model("LMSDiscreteScheduler"),
+    "PNDM": export_scheduler_model("PNDMScheduler"),
+    "DDPM": export_scheduler_model("DDPMScheduler"),
+    "DDIM": export_scheduler_model("DDIMScheduler"),
+    "DPMSolverMultistep": export_scheduler_model("DPMSolverMultistepScheduler"),
+    "KDPM2Discrete": export_scheduler_model("KDPM2DiscreteScheduler"),
+    "DEISMultistep": export_scheduler_model("DEISMultistepScheduler"),
+    "DPMSolverSinglestep": export_scheduler_model("DPMSolverSingleStepScheduler"),
+    "KDPM2AncestralDiscrete": export_scheduler_model("KDPM2AncestralDiscreteScheduler"),
+    "HeunDiscrete": export_scheduler_model("HeunDiscreteScheduler"),
 }
--- a/apps/shark_studio/tests/api_test.py
+++ b/apps/shark_studio/tests/api_test.py
@@ -36,7 +36,6 @@ class LLMAPITest(unittest.TestCase):
            device="cpu",
            precision="fp32",
            quantization="None",
-            streaming_llm=True,
        )
        count = 0
        label = "Turkishoure Turkish"
--- a/apps/shark_studio/web/configs/default_sd_config.json
+++ b/apps/shark_studio/web/configs/default_sd_config.json
@@ -0,0 +1,28 @@
+{
+  "prompt": [
+    "a photo taken of the front of a super-car drifting on a road near mountains at high speeds with smoke coming off the tires, front angle, front point of view, trees in the mountains of the background, ((sharp focus))"
+  ],
+  "negative_prompt": [
+    "watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
+  ],
+  "sd_init_image": [null],
+  "height": 512,
+  "width": 512,
+  "steps": 50,
+  "strength": 0.8,
+  "guidance_scale": 7.5,
+  "seed": "-1",
+  "batch_count": 1,
+  "batch_size": 1,
+  "scheduler": "EulerDiscrete",
+  "base_model_id": "stabilityai/stable-diffusion-2-1-base",
+  "custom_weights": null,
+  "custom_vae": null,
+  "precision": "fp16",
+  "device": "AMD Radeon RX 7900 XTX => vulkan://0",
+  "ondemand": false,
+  "repeatable_seeds": false,
+  "resample_type": "Nearest Neighbor",
+  "controlnets": {},
+  "embeddings": {}
+}
--- a/apps/shark_studio/web/index.py
+++ b/apps/shark_studio/web/index.py
@@ -76,8 +76,8 @@ def launch_webui(address):
 def webui():
    from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
    from apps.shark_studio.web.ui.utils import (
-        amdicon_loc,
-        amdlogo_loc,
+        nodicon_loc,
+        nodlogo_loc,
    )

    launch_api = cmd_opts.api
@@ -172,9 +172,9 @@ def webui():
        analytics_enabled=False,
        title="Shark Studio 2.0 Beta",
    ) as studio_web:
-        amd_logo = Image.open(amdlogo_loc)
+        nod_logo = Image.open(nodlogo_loc)
        gr.Image(
-            value=amd_logo,
+            value=nod_logo,
            show_label=False,
            interactive=False,
            elem_id="tab_bar_logo",
@@ -209,7 +209,7 @@ def webui():
        inbrowser=True,
        server_name="0.0.0.0",
        server_port=cmd_opts.server_port,
-        favicon_path=amdicon_loc,
+        favicon_path=nodicon_loc,
    )


--- a/apps/shark_studio/web/ui/chat.py
+++ b/apps/shark_studio/web/ui/chat.py
@@ -137,8 +137,7 @@ with gr.Blocks(title="Chat") as chat_element:
            streaming_llm = gr.Checkbox(
                label="Run in streaming mode (requires recompilation)",
                value=True,
-                interactive=False,
-                visible=False,
+                interactive=True,
            )
            prompt_prefix = gr.Checkbox(
                label="Add System Prompt",
--- a/apps/shark_studio/web/ui/css/sd_dark_theme.css
+++ b/apps/shark_studio/web/ui/css/sd_dark_theme.css
@@ -367,7 +367,7 @@ footer {
 #tab_bar_logo .image-container {
    object-fit: scale-down;
    position: absolute !important;
-    top: 10px;
+    top: 14px;
    right: 0px;
    height: 36px;
-}
+}
--- a/apps/shark_studio/web/ui/logos/amd-icon.jpg
+++ b/apps/shark_studio/web/ui/logos/amd-icon.jpg
--- a/apps/shark_studio/web/ui/logos/amd-logo.jpg
+++ b/apps/shark_studio/web/ui/logos/amd-logo.jpg
--- a/apps/shark_studio/web/ui/logos/nod-icon.png
+++ b/apps/shark_studio/web/ui/logos/nod-icon.png
--- a/apps/shark_studio/web/ui/logos/nod-logo.png
+++ b/apps/shark_studio/web/ui/logos/nod-logo.png
--- a/apps/shark_studio/web/ui/outputgallery.py
+++ b/apps/shark_studio/web/ui/outputgallery.py
@@ -10,7 +10,7 @@ from apps.shark_studio.web.utils.file_utils import (
    get_generated_imgs_path,
    get_generated_imgs_todays_subdir,
 )
-from apps.shark_studio.web.ui.utils import amdlogo_loc
+from apps.shark_studio.web.ui.utils import nodlogo_loc
 from apps.shark_studio.web.utils.metadata import displayable_metadata

 # -- Functions for file, directory and image info querying
@@ -60,7 +60,7 @@ def output_subdirs() -> list[str]:
 # --- Define UI layout for Gradio

 with gr.Blocks() as outputgallery_element:
-    amd_logo = Image.open(amdlogo_loc)
+    nod_logo = Image.open(nodlogo_loc)

    with gr.Row(elem_id="outputgallery_gallery"):
        # needed to workaround gradio issue:
@@ -73,7 +73,7 @@ with gr.Blocks() as outputgallery_element:
        with gr.Column(scale=6):
            logo = gr.Image(
                label="Getting subdirectories...",
-                value=amd_logo,
+                value=nod_logo,
                interactive=False,
                visible=True,
                show_label=True,
--- a/apps/shark_studio/web/ui/sd.py
+++ b/apps/shark_studio/web/ui/sd.py
@@ -14,12 +14,12 @@ from apps.shark_studio.web.utils.file_utils import (
    get_checkpoints_path,
    get_checkpoints,
    get_configs_path,
-    write_default_sd_configs,
+    write_default_sd_config,
 )
 from apps.shark_studio.api.sd import (
+    sd_model_map,
    shark_sd_fn_dict_input,
    cancel_sd,
-    unload_sd,
 )
 from apps.shark_studio.api.controlnet import (
    cnet_preview,
@@ -33,7 +33,7 @@ from apps.shark_studio.modules.img_processing import (
 )
 from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
 from apps.shark_studio.web.ui.utils import (
-    amdlogo_loc,
+    nodlogo_loc,
    none_to_str_none,
    str_none_to_none,
 )
@@ -45,10 +45,11 @@ from apps.shark_studio.modules import logger
 import apps.shark_studio.web.utils.globals as global_obj

 sd_default_models = [
+    "CompVis/stable-diffusion-v1-4",
    "runwayml/stable-diffusion-v1-5",
    "stabilityai/stable-diffusion-2-1-base",
    "stabilityai/stable-diffusion-2-1",
-    "stabilityai/stable-diffusion-xl-base-1.0",
+    "stabilityai/stable-diffusion-xl-1.0",
    "stabilityai/sdxl-turbo",
 ]

@@ -118,9 +119,8 @@ def pull_sd_configs(
    custom_vae,
    precision,
    device,
-    target_triple,
    ondemand,
-    compiled_pipeline,
+    repeatable_seeds,
    resample_type,
    controlnets,
    embeddings,
@@ -177,9 +177,8 @@ def load_sd_cfg(sd_json: dict, load_sd_config: str):
        sd_json["custom_vae"],
        sd_json["precision"],
        sd_json["device"],
-        sd_json["target_triple"],
        sd_json["ondemand"],
-        sd_json["compiled_pipeline"],
+        sd_json["repeatable_seeds"],
        sd_json["resample_type"],
        sd_json["controlnets"],
        sd_json["embeddings"],
@@ -232,9 +231,14 @@ def import_original(original_img, width, height):


 def base_model_changed(base_model_id):
-    new_choices = get_checkpoints(
-        os.path.join("checkpoints", os.path.basename(str(base_model_id)))
-    ) + get_checkpoints(model_type="checkpoints")
+    ckpt_path = Path(
+        os.path.join(
+            cmd_opts.model_dir, "checkpoints", os.path.basename(str(base_model_id))
+        )
+    )
+    ckpt_path.mkdir(parents=True, exist_ok=True)
+
+    new_choices = get_checkpoints(ckpt_path) + get_checkpoints(model_type="checkpoints")

    return gr.Dropdown(
        value=new_choices[0] if len(new_choices) > 0 else "None",
@@ -256,11 +260,6 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                        choices=global_obj.get_device_list(),
                        allow_custom_value=False,
                    )
-                    target_triple = gr.Textbox(
-                        elem_id="target_triple",
-                        label="Architecture",
-                        value="",
-                    )
                    with gr.Row():
                        ondemand = gr.Checkbox(
                            value=cmd_opts.lowvram,
@@ -283,19 +282,18 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                    elem_id="custom_model",
                    value="stabilityai/stable-diffusion-2-1-base",
                    choices=sd_default_models,
-                    allow_custom_value=True,
                )  # base_model_id
                with gr.Row():
                    height = gr.Slider(
                        384,
-                        1024,
+                        768,
                        value=cmd_opts.height,
                        step=8,
                        label="\U00002195\U0000FE0F Height",
                    )
                    width = gr.Slider(
                        384,
-                        1024,
+                        768,
                        value=cmd_opts.width,
                        step=8,
                        label="\U00002194\U0000FE0F Width",
@@ -606,34 +604,35 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                                    interactive=True,
                                    visible=True,
                                )
-                                compiled_pipeline = gr.Checkbox(
-                                    False,
-                                    label="Faster txt2img (SDXL only)",
+                                repeatable_seeds = gr.Checkbox(
+                                    cmd_opts.repeatable_seeds,
+                                    label="Use Repeatable Seeds for Batches",
                                )
                            with gr.Row():
                                stable_diffusion = gr.Button("Start")
-                                unload = gr.Button("Unload Models")
-                                unload.click(
-                                    fn=unload_sd,
+                                random_seed = gr.Button("Randomize Seed")
+                                random_seed.click(
+                                    lambda: -1,
+                                    inputs=[],
+                                    outputs=[seed],
                                    queue=False,
                                    show_progress=False,
                                )
                                stop_batch = gr.Button("Stop")
                    with gr.Tab(label="Config", id=102) as sd_tab_config:
                        with gr.Column(elem_classes=["sd-right-panel"]):
-                            with gr.Row(elem_classes=["fill"]):
-                                Path(get_configs_path()).mkdir(
-                                    parents=True, exist_ok=True
-                                )
-                                default_config_file = os.path.join(
-                                    get_configs_path(),
-                                    "default_sd_config.json",
-                                )
-                                write_default_sd_configs(get_configs_path())
-                                sd_json = gr.JSON(
-                                    elem_classes=["fill"],
-                                    value=view_json_file(default_config_file),
-                                )
+                            Path(get_configs_path()).mkdir(parents=True, exist_ok=True)
+                            default_config_file = os.path.join(
+                                get_configs_path(),
+                                "default_sd_config.json",
+                            )
+                            write_default_sd_config(default_config_file)
+                            sd_json = gr.JSON(
+                                label="SD Config",
+                                elem_classes=["fill"],
+                                value=view_json_file(default_config_file),
+                                render=False,
+                            )
                            with gr.Row():
                                with gr.Column(scale=3):
                                    load_sd_config = gr.FileExplorer(
@@ -683,9 +682,8 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                                        custom_vae,
                                        precision,
                                        device,
-                                        target_triple,
                                        ondemand,
-                                        compiled_pipeline,
+                                        repeatable_seeds,
                                        resample_type,
                                        cnet_config,
                                        embeddings_config,
@@ -697,6 +695,8 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                                    inputs=[sd_json, sd_config_name],
                                    outputs=[sd_config_name],
                                )
+                            with gr.Row(elem_classes=["fill"]):
+                                sd_json.render()
                        save_sd_config.click(
                            fn=save_sd_cfg,
                            inputs=[sd_json, sd_config_name],
@@ -708,7 +708,6 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                                value=f"{sd_model_info}\n"
                                f"Images will be saved at "
                                f"{get_generated_imgs_path()}",
-                                lines=2,
                                elem_id="std_output",
                                show_label=True,
                                label="Log",
@@ -718,6 +717,8 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                                logger.read_sd_logs, None, std_output, every=1
                            )
                            sd_status = gr.Textbox(visible=False)
+                    with gr.Tab(label="Automation", id=104) as sd_tab_automation:
+                        pass

    pull_kwargs = dict(
        fn=pull_sd_configs,
@@ -739,9 +740,8 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
            custom_vae,
            precision,
            device,
-            target_triple,
            ondemand,
-            compiled_pipeline,
+            repeatable_seeds,
            resample_type,
            cnet_config,
            embeddings_config,
--- a/apps/shark_studio/web/ui/utils.py
+++ b/apps/shark_studio/web/ui/utils.py
@@ -10,8 +10,8 @@ def resource_path(relative_path):
    return os.path.join(base_path, relative_path)


-amdlogo_loc = resource_path("logos/amd-logo.jpg")
-amdicon_loc = resource_path("logos/amd-icon.jpg")
+nodlogo_loc = resource_path("logos/nod-logo.png")
+nodicon_loc = resource_path("logos/nod-icon.png")


 class HSLHue(IntEnum):
--- a/apps/shark_studio/web/utils/default_configs.py
+++ b/apps/shark_studio/web/utils/default_configs.py
@@ -1,95 +0,0 @@
-default_sd_config = r"""{
-  "prompt": [
-    "a photo taken of the front of a super-car drifting on a road near mountains at high speeds with smoke coming off the tires, front angle, front point of view, trees in the mountains of the background, ((sharp focus))"
-  ],
-  "negative_prompt": [
-    "watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
-  ],
-  "sd_init_image": [null],
-  "height": 512,
-  "width": 512,
-  "steps": 50,
-  "strength": 0.8,
-  "guidance_scale": 7.5,
-  "seed": "-1",
-  "batch_count": 1,
-  "batch_size": 1,
-  "scheduler": "EulerDiscrete",
-  "base_model_id": "stabilityai/stable-diffusion-2-1-base",
-  "custom_weights": null,
-  "custom_vae": null,
-  "precision": "fp16",
-  "device": "",
-  "target_triple": "",
-  "ondemand": false,
-  "compiled_pipeline": false,
-  "resample_type": "Nearest Neighbor",
-  "controlnets": {},
-  "embeddings": {}
-}"""
-
-sdxl_30steps = r"""{
-  "prompt": [
-    "a cat under the snow with blue eyes, covered by snow, cinematic style, medium shot, professional photo, animal"
-  ],
-  "negative_prompt": [
-    "watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
-  ],
-  "sd_init_image": [null],
-  "height": 1024,
-  "width": 1024,
-  "steps": 30,
-  "strength": 0.8,
-  "guidance_scale": 7.5,
-  "seed": "-1",
-  "batch_count": 1,
-  "batch_size": 1,
-  "scheduler": "EulerDiscrete",
-  "base_model_id": "stabilityai/stable-diffusion-xl-base-1.0",
-  "custom_weights": null,
-  "custom_vae": null,
-  "precision": "fp16",
-  "device": "",
-  "target_triple": "",
-  "ondemand": false,
-  "compiled_pipeline": true,
-  "resample_type": "Nearest Neighbor",
-  "controlnets": {},
-  "embeddings": {}
-}"""
-
-sdxl_turbo = r"""{
-  "prompt": [
-    "A cat wearing a hat that says 'TURBO' on it. The cat is sitting on a skateboard."
-  ],
-  "negative_prompt": [
-    ""
-  ],
-  "sd_init_image": [null],
-  "height": 512,
-  "width": 512,
-  "steps": 2,
-  "strength": 0.8,
-  "guidance_scale": 0,
-  "seed": "-1",
-  "batch_count": 1,
-  "batch_size": 1,
-  "scheduler": "EulerAncestralDiscrete",
-  "base_model_id": "stabilityai/sdxl-turbo",
-  "custom_weights": null,
-  "custom_vae": null,
-  "precision": "fp16",
-  "device": "",
-  "target_triple": "",
-  "ondemand": false,
-  "compiled_pipeline": true,
-  "resample_type": "Nearest Neighbor",
-  "controlnets": {},
-  "embeddings": {}
-}"""
-
-default_sd_configs = {
-    "default_sd_config.json": default_sd_config,
-    "sdxl-30steps.json": sdxl_30steps,
-    "sdxl-turbo.json": sdxl_turbo,
-}
--- a/apps/shark_studio/web/utils/file_utils.py
+++ b/apps/shark_studio/web/utils/file_utils.py
@@ -11,18 +11,43 @@ checkpoints_filetypes = (
    "*.safetensors",
 )

-from apps.shark_studio.web.utils.default_configs import default_sd_configs
+default_sd_config = r"""{
+  "prompt": [
+    "a photo taken of the front of a super-car drifting on a road near mountains at high speeds with smoke coming off the tires, front angle, front point of view, trees in the mountains of the background, ((sharp focus))"
+  ],
+  "negative_prompt": [
+    "watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
+  ],
+  "sd_init_image": [null],
+  "height": 512,
+  "width": 512,
+  "steps": 50,
+  "strength": 0.8,
+  "guidance_scale": 7.5,
+  "seed": "-1",
+  "batch_count": 1,
+  "batch_size": 1,
+  "scheduler": "EulerDiscrete",
+  "base_model_id": "stabilityai/stable-diffusion-2-1-base",
+  "custom_weights": null,
+  "custom_vae": null,
+  "precision": "fp16",
+  "device": "AMD Radeon RX 7900 XTX => vulkan://0",
+  "ondemand": false,
+  "repeatable_seeds": false,
+  "resample_type": "Nearest Neighbor",
+  "controlnets": {},
+  "embeddings": {}
+}"""


-def write_default_sd_configs(path):
-    for key in default_sd_configs.keys():
-        config_fpath = os.path.join(path, key)
-        with open(config_fpath, "w") as f:
-            f.write(default_sd_configs[key])
+def write_default_sd_config(path):
+    with open(path, "w") as f:
+        f.write(default_sd_config)


 def safe_name(name):
-    return name.split("/")[-1].replace("-", "_")
+    return name.replace("/", "_").replace("-", "_")


 def get_path_stem(path):
--- a/apps/shark_studio/web/utils/metadata/png_metadata.py
+++ b/apps/shark_studio/web/utils/metadata/png_metadata.py
@@ -3,8 +3,9 @@ from pathlib import Path
 from apps.shark_studio.web.utils.file_utils import (
    get_checkpoint_pathfile,
 )
-from apps.shark_studio.api.sd import EMPTY_SD_MAP as sd_model_map
-
+from apps.shark_studio.api.sd import (
+    sd_model_map,
+)
 from apps.shark_studio.modules.schedulers import (
    scheduler_model_map,
 )
--- a/apps/shark_studio/web/utils/tmp_configs.py
+++ b/apps/shark_studio/web/utils/tmp_configs.py
@@ -17,7 +17,7 @@ def clear_tmp_mlir():
        and filename.endswith(".mlir")
    ]
    for filename in mlir_files:
-        os.remove(os.path.join(shark_tmp, filename))
+        os.remove(shark_tmp + filename)
    print(f"Clearing .mlir temporary files took {time() - cleanup_start:.4f} seconds.")


--- a/dataset/annotation_tool.py
+++ b/dataset/annotation_tool.py
@@ -10,7 +10,7 @@ from utils import get_datasets

 shark_root = Path(__file__).parent.parent
 demo_css = shark_root.joinpath("web/demo.css").resolve()
-nodlogo_loc = shark_root.joinpath("web/models/stable_diffusion/logos/amd-logo.jpg")
+nodlogo_loc = shark_root.joinpath("web/models/stable_diffusion/logos/nod-logo.png")


 with gr.Blocks(title="Dataset Annotation Tool", css=demo_css) as shark_web:
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +1,13 @@
-f https://download.pytorch.org/whl/nightly/cpu
+-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
 -f https://iree.dev/pip-release-links.html
 --pre

 setuptools
 wheel

-
 torch==2.3.0
 shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main
-turbine-models @ git+https://github.com/nod-ai/SHARK-Turbine.git@deprecated-constraints#subdirectory=models
-diffusers @ git+https://github.com/nod-ai/diffusers@0.29.0.dev0-shark
-brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b
+turbine-models @ git+https://github.com/nod-ai/SHARK-Turbine.git@main#subdirectory=models

 # SHARK Runner
 tqdm
@@ -20,6 +17,8 @@ google-cloud-storage

 # Testing
 pytest
+pytest-xdist
+pytest-forked
 Pillow
 parameterized

@@ -27,10 +26,8 @@ parameterized
 #accelerate is now required for diffusers import from ckpt.
 accelerate
 scipy
-transformers==4.37.1
-torchsde # Required for Stable Diffusion SDE schedulers.
 ftfy
-gradio==4.29.0
+gradio==4.19.2
 altair
 omegaconf
 # 0.3.2 doesn't have binaries for arm64
--- a/setup_venv.ps1
+++ b/setup_venv.ps1
@@ -88,8 +88,5 @@ else {python -m venv .\shark.venv\}
 .\shark.venv\Scripts\activate
 python -m pip install --upgrade pip
 pip install wheel
-pip install --pre -r requirements.txt
-pip install --force-reinstall https://github.com/nod-ai/SRT/releases/download/candidate-20240528.279/iree_compiler-20240528.279-cp311-cp311-win_amd64.whl https://github.com/nod-ai/SRT/releases/download/candidate-20240528.279/iree_runtime-20240528.279-cp311-cp311-win_amd64.whl 
-pip install -e .
-
+pip install -r requirements.txt
 Write-Host "Source your venv with ./shark.venv/Scripts/activate"
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -84,7 +84,21 @@ else
  PYTORCH_URL=https://download.pytorch.org/whl/nightly/cpu/
 fi

-$PYTHON -m pip install --no-warn-conflicts -e . -f ${RUNTIME} -f ${PYTORCH_URL}
+$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f ${PYTORCH_URL}
+
+if [[ $(uname -s) = 'Linux' && -n "${IMPORTER}" ]]; then  # only on Linux when IMPORTER is set
+  T_VER=$($PYTHON -m pip show torch | grep '^Version:')
+  T_VER_MIN=${T_VER:14:12}  # 12 chars after "Version: X.Y.Z" (presumably the .devYYYYMMDD tag; assumes a 5-char X.Y.Z)
+  TV_VER=$($PYTHON -m pip show torchvision | grep '^Version:')
+  TV_VER_MAJ=${TV_VER:9:6}  # 6 chars after "Version: " (assumes a 6-char release such as 0.16.0)
+  $PYTHON -m pip uninstall -y torchvision
+  $PYTHON -m pip install torchvision==${TV_VER_MAJ}${T_VER_MIN} --no-deps -f https://download.pytorch.org/whl/nightly/cpu/torchvision/
+  if [ $? -eq 0 ];then
+    echo "Successfully installed torchvision ${TV_VER_MAJ}${T_VER_MIN}."
+  else
+    echo "Could not install torchvision ${TV_VER_MAJ}${T_VER_MIN}." >&2
+  fi
+fi

 if [[ -z "${NO_BREVITAS}" ]]; then
  $PYTHON -m pip install git+https://github.com/Xilinx/brevitas.git@dev
--- a/shark/iree_utils/_common.py
+++ b/shark/iree_utils/_common.py
@@ -76,7 +76,6 @@ _IREE_DEVICE_MAP = {
    "vulkan": "vulkan",
    "metal": "metal",
    "rocm": "rocm",
-    "hip": "hip",
    "intel-gpu": "level_zero",
 }

@@ -95,7 +94,6 @@ _IREE_TARGET_MAP = {
    "vulkan": "vulkan-spirv",
    "metal": "metal",
    "rocm": "rocm",
-    "hip": "rocm",
    "intel-gpu": "opencl-spirv",
 }

--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -62,16 +62,13 @@ def get_iree_device_args(device, extra_args=[]):
        from shark.iree_utils.gpu_utils import get_iree_rocm_args

        return get_iree_rocm_args(device_num=device_num, extra_args=extra_args)
-    if device == "hip":
-        from shark.iree_utils.gpu_utils import get_iree_rocm_args
-        return get_iree_rocm_args(device_num=device_num, extra_args=extra_args, hip_driver=True)
    return []

 def get_iree_target_triple(device):
    args = get_iree_device_args(device)
    for flag in args:
-        if "triple" in flag:
-            triple = flag.split("=")[-1]
+        if "triple" in flag.split("=")[0]:  # test the flag name only; "-"-splitting never matches "triple=<value>"
+            triple = flag.split("=", 1)[-1]  # keep the value as a str so both return paths yield strings
            return triple
    return ""

@@ -92,9 +89,9 @@ def clean_device_info(raw_device):
        if len(device_id) <= 2:
            device_id = int(device_id)

-    if device not in ["hip", "rocm", "vulkan"]:
+    if device not in ["rocm", "vulkan"]:
        device_id = None
-    if device in ["hip", "rocm", "vulkan"] and device_id == None:
+    if device in ["rocm", "vulkan"] and device_id == None:
        device_id = 0
    return device, device_id

--- a/shark/iree_utils/gpu_utils.py
+++ b/shark/iree_utils/gpu_utils.py
@@ -52,7 +52,7 @@ def check_rocm_device_arch_in_args(extra_args):
    return None


-def get_rocm_device_arch(device_num=0, extra_args=[], hip_driver=False):
+def get_rocm_device_arch(device_num=0, extra_args=[]):
    # ROCM Device Arch selection:
    # 1 : User given device arch using `--iree-rocm-target-chip` flag
    # 2 : Device arch from `iree-run-module --dump_devices=rocm` for device on index <device_num>
@@ -68,23 +68,15 @@ def get_rocm_device_arch(device_num=0, extra_args=[], hip_driver=False):
    arch_in_device_dump = None

    # get rocm arch from iree dump devices
-    def get_devices_info_from_dump(dump, driver):
+    def get_devices_info_from_dump(dump):
        from os import linesep
-        
-        if driver == "hip":
-            dump_clean = list(
-                filter(
-                    lambda s: "AMD" in s,
-                    dump.split(linesep),
-                )
-            )
-        else:
-            dump_clean = list(
-                filter(
-                    lambda s: f"--device={driver}" in s or "gpu-arch-name:" in s,
-                    dump.split(linesep),
-                )
+
+        dump_clean = list(
+            filter(
+                lambda s: "--device=rocm" in s or "gpu-arch-name:" in s,
+                dump.split(linesep),
            )
+        )
        arch_pairs = [
            (
                dump_clean[i].split("=")[1].strip(),
@@ -95,17 +87,16 @@ def get_rocm_device_arch(device_num=0, extra_args=[], hip_driver=False):
        return arch_pairs

    dump_device_info = None
-    driver = "hip" if hip_driver else "rocm"
    try:
        dump_device_info = run_cmd(
-            "iree-run-module --dump_devices=" + driver, raise_err=True
+            "iree-run-module --dump_devices=rocm", raise_err=True
        )
    except Exception as e:
-        print("could not execute `iree-run-module --dump_devices=" + driver + "`")
+        print("could not execute `iree-run-module --dump_devices=rocm`")

    if dump_device_info is not None:
        device_num = 0 if device_num is None else device_num
-        device_arch_pairs = get_devices_info_from_dump(dump_device_info[0], driver)
+        device_arch_pairs = get_devices_info_from_dump(dump_device_info[0])
        if len(device_arch_pairs) > device_num:  # can find arch in the list
            arch_in_device_dump = device_arch_pairs[device_num][1]

@@ -116,22 +107,24 @@ def get_rocm_device_arch(device_num=0, extra_args=[], hip_driver=False):
    default_rocm_arch = "gfx1100"
    print(
        "Did not find ROCm architecture from `--iree-rocm-target-chip` flag"
-        "\n or from `iree-run-module --dump_devices` command."
+        "\n or from `iree-run-module --dump_devices=rocm` command."
        f"\nUsing {default_rocm_arch} as ROCm arch for compilation."
    )
    return default_rocm_arch


 # Get the default gpu args given the architecture.
-def get_iree_rocm_args(device_num=0, extra_args=[], hip_driver=False):
+def get_iree_rocm_args(device_num=0, extra_args=[]):
    ireert.flags.FUNCTION_INPUT_VALIDATION = False
-    rocm_flags = []
+    rocm_flags = ["--iree-rocm-link-bc=true"]
+
    if check_rocm_device_arch_in_args(extra_args) is None:
-        rocm_arch = get_rocm_device_arch(device_num, extra_args, hip_driver=hip_driver)
+        rocm_arch = get_rocm_device_arch(device_num, extra_args)
        rocm_flags.append(f"--iree-rocm-target-chip={rocm_arch}")

    return rocm_flags

+
 # Some constants taken from cuda.h
 CUDA_SUCCESS = 0
 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16