Update requirements.txt

Fixes to UI config defaults, config loading, and warnings. (#2153)
Fix batch count and tweaks to chatbot. (#2151)
2026-01-11 14:58:11 -05:00 · 2024-06-18 11:42:12 -07:00 · 2024-05-31 18:14:27 -04:00 · 2024-05-31 18:48:28 +05:30 · 2024-05-30 11:40:42 -05:00 · 2024-05-30 21:43:15 +05:30
37 changed files with 753 additions and 723 deletions
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -50,10 +50,11 @@ jobs:
      shell: powershell
      run: |
        ./setup_venv.ps1
-        $env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
-        pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html
        python process_skipfiles.py
-        pyinstaller .\apps\stable_diffusion\shark_sd.spec
+        $env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
+        pip install -e .
+        pip freeze -l
+        pyinstaller .\apps\shark_studio\shark_studio.spec
        mv ./dist/nodai_shark_studio.exe ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
        signtool sign /f c:\g\shark_02152023.cer /fd certHash /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
  
@@ -74,80 +75,3 @@ jobs:
        GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
      with:
        release_id: ${{ steps.create_release.outputs.id }}
-
-  linux-build:
-
-    runs-on: a100
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.11"]
-        backend: [IREE, SHARK]
-
-    steps:
-    - uses: actions/checkout@v3
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
-      with:
-        python-version: ${{ matrix.python-version }}
-    
-    - name: Setup pip cache
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/pip
-        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
-        restore-keys: |
-          ${{ runner.os }}-pip-
-
-    - name: Install dependencies
-      run: |
-        echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
-        python -m pip install --upgrade pip
-        python -m pip install flake8 pytest toml
-        if [ -f requirements.txt ]; then pip install -r requirements.txt -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html; fi
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude shark.venv,lit.cfg.py 
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py 
-    - name: Build and validate the IREE package
-      if: ${{ matrix.backend == 'IREE' }}
-      continue-on-error: true
-      run: |
-        cd $GITHUB_WORKSPACE
-        USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
-        source iree.venv/bin/activate
-        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
-        SHARK_PACKAGE_VERSION=${package_version} \
-        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://openxla.github.io/iree/pip-release-links.html
-        # Install the built wheel
-        pip install ./wheelhouse/nodai*
-        # Validate the Models
-        /bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" -k "not metal" |
-          tail -n 1 |
-          tee -a pytest_results.txt
-        if !(grep -Fxq " failed" pytest_results.txt) 
-          then 
-            export SHA=$(git log -1 --format='%h')
-            gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/${DATE}_$SHA
-            gsutil -m cp -r gs://shark_tank/${DATE}_$SHA/* gs://shark_tank/nightly/
-        fi
-        rm -rf ./wheelhouse/nodai*
-
-    - name: Build and validate the SHARK Runtime package
-      if: ${{ matrix.backend == 'SHARK' }}
-      run: |
-        cd $GITHUB_WORKSPACE
-        ./setup_venv.sh
-        source shark.venv/bin/activate
-        package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
-        SHARK_PACKAGE_VERSION=${package_version} \
-        pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html
-        # Install the built wheel
-        pip install ./wheelhouse/nodai*
-        # Validate the Models
-        pytest --ci --ci_sha=${SHORT_SHA} -k "not metal" |
-          tail -n 1 |
-          tee -a pytest_results.txt
--- a/.github/workflows/test-studio.yml
+++ b/.github/workflows/test-studio.yml
@@ -81,6 +81,5 @@ jobs:
        source shark.venv/bin/activate
        pip install -r requirements.txt --no-cache-dir
        pip install -e .
-        pip uninstall -y torch
-        pip install torch==2.1.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
-        python apps/shark_studio/tests/api_test.py
+        # Disabled due to hang when exporting test llama2
+        # python apps/shark_studio/tests/api_test.py
--- a/.gitignore
+++ b/.gitignore
@@ -164,7 +164,7 @@ cython_debug/
 # vscode related
 .vscode

-# Shark related artefacts
+# Shark related artifacts
 *venv/
 shark_tmp/
 *.vmfb
@@ -172,6 +172,7 @@ shark_tmp/
 tank/dict_configs.py
 *.csv
 reproducers/
+apps/shark_studio/web/configs

 # ORT related artefacts
 cache_models/
@@ -188,6 +189,11 @@ variants.json
 # models folder
 apps/stable_diffusion/web/models/

+# model artifacts (SHARK)
+*.tempfile
+*.mlir
+*.vmfb
+
 # Stencil annotators.
 stencil_annotator/

--- a/README.md
+++ b/README.md
@@ -372,7 +372,7 @@ For a complete list of the models supported in SHARK, please refer to [tank/READ

 *   [Upstream IREE issues](https://github.com/google/iree/issues): Feature requests,
    bugs, and other work tracking
-*   [Upstream IREE Discord server](https://discord.gg/26P4xW4): Daily development
+*   [Upstream IREE Discord server](https://discord.gg/wEWh6Z9nMU): Daily development
    discussions with the core team and collaborators
 *   [iree-discuss email list](https://groups.google.com/forum/#!forum/iree-discuss):
    Announcements, general and low-priority discussion
--- a/apps/shark_studio/api/initializers.py
+++ b/apps/shark_studio/api/initializers.py
@@ -53,11 +53,11 @@ def initialize():
    clear_tmp_imgs()

    from apps.shark_studio.web.utils.file_utils import (
-        create_checkpoint_folders,
+        create_model_folders,
    )

    # Create custom models folders if they don't exist
-    create_checkpoint_folders()
+    create_model_folders()

    import gradio as gr

--- a/apps/shark_studio/api/llm.py
+++ b/apps/shark_studio/api/llm.py
@@ -3,8 +3,13 @@ from turbine_models.model_runner import vmfbRunner
 from turbine_models.gen_external_params.gen_external_params import gen_external_params
 import time
 from shark.iree_utils.compile_utils import compile_module_to_flatbuffer
-from apps.shark_studio.web.utils.file_utils import get_resource_path
+from apps.shark_studio.web.utils.file_utils import (
+    get_resource_path,
+    get_checkpoints_path,
+)
 from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
+from apps.shark_studio.api.utils import parse_device
+from urllib.request import urlopen
 import iree.runtime as ireert
 from itertools import chain
 import gc
@@ -13,7 +18,7 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM

 llm_model_map = {
-    "llama2_7b": {
+    "meta-llama/Llama-2-7b-chat-hf": {
        "initializer": stateless_llama.export_transformer_model,
        "hf_model_name": "meta-llama/Llama-2-7b-chat-hf",
        "compile_flags": ["--iree-opt-const-expr-hoisting=False"],
@@ -65,6 +70,7 @@ class LanguageModel:
        use_system_prompt=True,
        streaming_llm=False,
    ):
+        _, _, self.triple = parse_device(device)
        self.hf_model_name = llm_model_map[model_name]["hf_model_name"]
        self.device = device.split("=>")[-1].strip()
        self.backend = self.device.split("://")[0]
@@ -155,7 +161,9 @@ class LanguageModel:
                use_auth_token=hf_auth_token,
            )
        elif not os.path.exists(self.tempfile_name):
-            self.torch_ir, self.tokenizer = llm_model_map[model_name]["initializer"](
+            self.torch_ir, self.tokenizer = llm_model_map[self.hf_model_name][
+                "initializer"
+            ](
                self.hf_model_name,
                hf_auth_token,
                compile_to="torch",
@@ -163,6 +171,7 @@ class LanguageModel:
                precision=self.precision,
                quantization=self.quantization,
                streaming_llm=self.streaming_llm,
+                decomp_attn=True,
            )
            with open(self.tempfile_name, "w+") as f:
                f.write(self.torch_ir)
@@ -192,11 +201,27 @@ class LanguageModel:
            )
        elif self.backend == "vulkan":
            flags.extend(["--iree-stream-resource-max-allocation-size=4294967296"])
+        elif self.backend == "rocm":
+            flags.extend(
+                [
+                    "--iree-codegen-llvmgpu-enable-transform-dialect-jit=false",
+                    "--iree-llvmgpu-enable-prefetch=true",
+                    "--iree-opt-outer-dim-concat=true",
+                    "--iree-flow-enable-aggressive-fusion",
+                ]
+            )
+            if "gfx9" in self.triple:
+                flags.extend(
+                    [
+                        f"--iree-codegen-transform-dialect-library={get_mfma_spec_path(self.triple, get_checkpoints_path())}",
+                        "--iree-codegen-llvmgpu-use-vector-distribution=true",
+                    ]
+                )
        flags.extend(llm_model_map[self.hf_model_name]["compile_flags"])
        flatbuffer_blob = compile_module_to_flatbuffer(
            self.tempfile_name,
            device=self.device,
-            frontend="torch",
+            frontend="auto",
            model_config_path=None,
            extra_args=flags,
            write_to=self.vmfb_name,
@@ -258,7 +283,7 @@ class LanguageModel:

            history.append(format_out(token))
            while (
-                format_out(token) != llm_model_map["llama2_7b"]["stop_token"]
+                format_out(token) != llm_model_map[self.hf_model_name]["stop_token"]
                and len(history) < self.max_tokens
            ):
                dec_time = time.time()
@@ -272,7 +297,7 @@ class LanguageModel:

            self.prev_token_len = token_len + len(history)

-            if format_out(token) == llm_model_map["llama2_7b"]["stop_token"]:
+            if format_out(token) == llm_model_map[self.hf_model_name]["stop_token"]:
                break

        for i in range(len(history)):
@@ -306,7 +331,7 @@ class LanguageModel:
                self.first_input = False

            history.append(int(token))
-            while token != llm_model_map["llama2_7b"]["stop_token"]:
+            while token != llm_model_map[self.hf_model_name]["stop_token"]:
                dec_time = time.time()
                result = self.hf_mod(token.reshape([1, 1]), past_key_values=pkv)
                history.append(int(token))
@@ -317,7 +342,7 @@ class LanguageModel:

            self.prev_token_len = token_len + len(history)

-            if token == llm_model_map["llama2_7b"]["stop_token"]:
+            if token == llm_model_map[self.hf_model_name]["stop_token"]:
                break
        for i in range(len(history)):
            if type(history[i]) != int:
@@ -327,6 +352,17 @@ class LanguageModel:
        return result_output, total_time


+def get_mfma_spec_path(target_chip, save_dir):
+    """Return the path of the attention/matmul spec in save_dir, downloading it on first use (target_chip is unused)."""
+    url = "https://raw.githubusercontent.com/iree-org/iree/main/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir"
+    spec_path = os.path.join(save_dir, "attention_and_matmul_spec_mfma.mlir")
+    if not os.path.exists(spec_path):  # fetch only when no cached copy exists, so reuse works offline
+        attn_spec = urlopen(url).read().decode("utf-8")
+        with open(spec_path, "w") as f:
+            f.write(attn_spec)
+    return spec_path
+
+
 def llm_chat_api(InputData: dict):
    from datetime import datetime as dt

@@ -347,7 +383,11 @@ def llm_chat_api(InputData: dict):
    else:
        print(f"prompt : {InputData['prompt']}")

-    model_name = InputData["model"] if "model" in InputData.keys() else "llama2_7b"
+    model_name = (
+        InputData["model"]
+        if "model" in InputData.keys()
+        else "meta-llama/Llama-2-7b-chat-hf"
+    )
    model_path = llm_model_map[model_name]
    device = InputData["device"] if "device" in InputData.keys() else "cpu"
    precision = "fp16"
--- a/apps/shark_studio/api/sd.py
+++ b/apps/shark_studio/api/sd.py
@@ -1,54 +1,82 @@
 import gc
 import torch
+import gradio as gr
 import time
 import os
 import json
 import numpy as np
+import copy
+import importlib.util
+import sys
 from tqdm.auto import tqdm

 from pathlib import Path
 from random import randint
-from turbine_models.custom_models.sd_inference import clip, unet, vae
+from turbine_models.custom_models.sd_inference.sd_pipeline import SharkSDPipeline
+from turbine_models.custom_models.sdxl_inference.sdxl_compiled_pipeline import (
+    SharkSDXLPipeline,
+)
+
+
 from apps.shark_studio.api.controlnet import control_adapter_map
+from apps.shark_studio.api.utils import parse_device
 from apps.shark_studio.web.utils.state import status_label
 from apps.shark_studio.web.utils.file_utils import (
    safe_name,
    get_resource_path,
    get_checkpoints_path,
 )
-from apps.shark_studio.modules.pipeline import SharkPipelineBase
-from apps.shark_studio.modules.schedulers import get_schedulers
-from apps.shark_studio.modules.prompt_encoding import (
-    get_weighted_text_embeddings,
-)
+
 from apps.shark_studio.modules.img_processing import (
-    resize_stencil,
    save_output_img,
-    resamplers,
-    resampler_list,
 )

 from apps.shark_studio.modules.ckpt_processing import (
    preprocessCKPT,
-    process_custom_pipe_weights,
+    save_irpa,
 )
-from transformers import CLIPTokenizer
-from diffusers.image_processor import VaeImageProcessor

-sd_model_map = {
-    "clip": {
-        "initializer": clip.export_clip_model,
-    },
-    "unet": {
-        "initializer": unet.export_unet_model,
-    },
-    "vae_decode": {
-        "initializer": vae.export_vae_model,
-    },
+EMPTY_SD_MAP = {
+    "clip": None,
+    "scheduler": None,
+    "unet": None,
+    "vae_decode": None,
+}
+
+EMPTY_SDXL_MAP = {
+    "prompt_encoder": None,
+    "scheduled_unet": None,
+    "vae_decode": None,
+    "pipeline": None,
+    "full_pipeline": None,
+}
+
+EMPTY_FLAGS = {
+    "clip": None,
+    "unet": None,
+    "vae": None,
+    "pipeline": None,
 }


-class StableDiffusion(SharkPipelineBase):
+def load_script(source, module_name):
+    """
+    reads file source and loads it as a module
+    :param source: file to load
+    :param module_name: name of module to register in sys.modules
+    :return: loaded module
+    :raises ImportError: if no import spec/loader can be built for source
+    """
+    spec = importlib.util.spec_from_file_location(module_name, source)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load module {module_name!r} from {source}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+class StableDiffusion:
    # This class is responsible for executing image generation and creating
    # /managing a set of compiled modules to run Stable Diffusion. The init
    # aims to be as general as possible, and the class will infer and compile
@@ -61,66 +89,45 @@ class StableDiffusion(SharkPipelineBase):
        height: int,
        width: int,
        batch_size: int,
+        steps: int,
+        scheduler: str,
        precision: str,
        device: str,
+        target_triple: str = None,
        custom_vae: str = None,
        num_loras: int = 0,
        import_ir: bool = True,
        is_controlled: bool = False,
-        hf_auth_token=None,
+        external_weights: str = "safetensors",
    ):
-        self.model_max_length = 77
-        self.batch_size = batch_size
        self.precision = precision
-        self.dtype = torch.float16 if precision == "fp16" else torch.float32
-        self.height = height
-        self.width = width
-        self.scheduler_obj = {}
-        static_kwargs = {
-            "pipe": {
-                "external_weights": "safetensors",
-            },
-            "clip": {"hf_model_name": base_model_id},
-            "unet": {
-                "hf_model_name": base_model_id,
-                "unet_model": unet.UnetModel(hf_model_name=base_model_id),
-                "batch_size": batch_size,
-                # "is_controlled": is_controlled,
-                # "num_loras": num_loras,
-                "height": height,
-                "width": width,
-                "precision": precision,
-                "max_length": self.model_max_length,
-            },
-            "vae_encode": {
-                "hf_model_name": base_model_id,
-                "vae_model": vae.VaeModel(
-                    hf_model_name=custom_vae if custom_vae else base_model_id,
-                ),
-                "batch_size": batch_size,
-                "height": height,
-                "width": width,
-                "precision": precision,
-            },
-            "vae_decode": {
-                "hf_model_name": base_model_id,
-                "vae_model": vae.VaeModel(
-                    hf_model_name=custom_vae if custom_vae else base_model_id,
-                ),
-                "batch_size": batch_size,
-                "height": height,
-                "width": width,
-                "precision": precision,
-            },
-        }
-        super().__init__(sd_model_map, base_model_id, static_kwargs, device, import_ir)
+        self.compiled_pipeline = False
+        self.base_model_id = base_model_id
+        self.custom_vae = custom_vae
+        self.is_sdxl = "xl" in self.base_model_id.lower()
+        self.is_custom = ".py" in self.base_model_id.lower()
+        if self.is_custom:
+            custom_module = load_script(
+                os.path.join(get_checkpoints_path("scripts"), self.base_model_id),
+                "custom_pipeline",
+            )
+            self.turbine_pipe = custom_module.StudioPipeline
+            self.model_map = custom_module.MODEL_MAP
+        elif self.is_sdxl:
+            self.turbine_pipe = SharkSDXLPipeline
+            self.model_map = EMPTY_SDXL_MAP
+        else:
+            self.turbine_pipe = SharkSDPipeline
+            self.model_map = EMPTY_SD_MAP
+        max_length = 64
+        target_backend, self.rt_device, triple = parse_device(device, target_triple)
        pipe_id_list = [
            safe_name(base_model_id),
            str(batch_size),
-            str(self.model_max_length),
+            str(max_length),
            f"{str(height)}x{str(width)}",
            precision,
-            self.device,
+            triple,
        ]
        if num_loras > 0:
            pipe_id_list.append(str(num_loras) + "lora")
@@ -129,305 +136,147 @@ class StableDiffusion(SharkPipelineBase):
        if custom_vae:
            pipe_id_list.append(custom_vae)
        self.pipe_id = "_".join(pipe_id_list)
-        print(f"\n[LOG] Pipeline initialized with pipe_id: {self.pipe_id}.")
-        del static_kwargs
-        gc.collect()
-
-    def prepare_pipe(self, custom_weights, adapters, embeddings, is_img2img):
-        print(f"\n[LOG] Preparing pipeline...")
-        self.is_img2img = is_img2img
-        self.schedulers = get_schedulers(self.base_model_id)
-
-        self.weights_path = os.path.join(
-            get_checkpoints_path(), self.safe_name(self.base_model_id)
+        self.pipeline_dir = Path(os.path.join(get_checkpoints_path(), self.pipe_id))
+        self.weights_path = Path(
+            os.path.join(
+                get_checkpoints_path(), safe_name(self.base_model_id + "_" + precision)
+            )
        )
        if not os.path.exists(self.weights_path):
            os.mkdir(self.weights_path)

-        for model in adapters:
-            self.model_map[model] = adapters[model]
+        decomp_attn = True
+        attn_spec = None
+        if triple in ["gfx940", "gfx942", "gfx90a"]:
+            decomp_attn = False
+            attn_spec = "mfma"
+        elif triple in ["gfx1100", "gfx1103", "gfx1150"]:
+            decomp_attn = False
+            attn_spec = "wmma"
+            if triple in ["gfx1103", "gfx1150"]:
+                # external weights have issues on igpu
+                external_weights = None
+        elif target_backend == "llvm-cpu":
+            decomp_attn = False

-        for submodel in self.static_kwargs:
-            if custom_weights:
-                custom_weights_params, _ = process_custom_pipe_weights(custom_weights)
-                if submodel not in ["clip", "clip2"]:
-                    self.static_kwargs[submodel][
-                        "external_weights"
-                    ] = custom_weights_params
-                else:
-                    self.static_kwargs[submodel]["external_weight_path"] = os.path.join(
-                        self.weights_path, submodel + ".safetensors"
+        self.sd_pipe = self.turbine_pipe(
+            hf_model_name=base_model_id,
+            scheduler_id=scheduler,
+            height=height,
+            width=width,
+            precision=precision,
+            max_length=max_length,
+            batch_size=batch_size,
+            num_inference_steps=steps,
+            device=target_backend,
+            iree_target_triple=triple,
+            ireec_flags=EMPTY_FLAGS,
+            attn_spec=attn_spec,
+            decomp_attn=decomp_attn,
+            pipeline_dir=self.pipeline_dir,
+            external_weights_dir=self.weights_path,
+            external_weights=external_weights,
+            custom_vae=custom_vae,
+        )
+        print(f"\n[LOG] Pipeline initialized with pipe_id: {self.pipe_id}.")
+        gc.collect()
+
+    def prepare_pipe(
+        self, custom_weights, adapters, embeddings, is_img2img, compiled_pipeline
+    ):
+        print(f"\n[LOG] Preparing pipeline...")
+        self.is_img2img = False
+        mlirs = copy.deepcopy(self.model_map)
+        vmfbs = copy.deepcopy(self.model_map)
+        weights = copy.deepcopy(self.model_map)
+        if not self.is_sdxl:
+            compiled_pipeline = False
+        self.compiled_pipeline = compiled_pipeline
+
+        if custom_weights:
+            custom_weights = os.path.join(
+                get_checkpoints_path("checkpoints"),
+                safe_name(self.base_model_id.split("/")[-1]),
+                custom_weights,
+            )
+            diffusers_weights_path = preprocessCKPT(custom_weights, self.precision)
+            for key in weights:
+                if key in ["scheduled_unet", "unet"]:
+                    unet_weights_path = os.path.join(
+                        diffusers_weights_path,
+                        "unet",
+                        "diffusion_pytorch_model.safetensors",
                    )
-            else:
-                self.static_kwargs[submodel]["external_weight_path"] = os.path.join(
-                    self.weights_path, submodel + ".safetensors"
-                )
+                    weights[key] = save_irpa(unet_weights_path, "unet.")

-        self.get_compiled_map(pipe_id=self.pipe_id)
-        print("\n[LOG] Pipeline successfully prepared for runtime.")
+                elif key in ["clip", "prompt_encoder"]:
+                    if not self.is_sdxl:
+                        sd1_path = os.path.join(
+                            diffusers_weights_path, "text_encoder", "model.safetensors"
+                        )
+                        weights[key] = save_irpa(sd1_path, "text_encoder_model.")
+                    else:
+                        clip_1_path = os.path.join(
+                            diffusers_weights_path, "text_encoder", "model.safetensors"
+                        )
+                        clip_2_path = os.path.join(
+                            diffusers_weights_path,
+                            "text_encoder_2",
+                            "model.safetensors",
+                        )
+                        weights[key] = [
+                            save_irpa(clip_1_path, "text_encoder_model_1."),
+                            save_irpa(clip_2_path, "text_encoder_model_2."),
+                        ]
+
+                elif key in ["vae_decode"] and weights[key] is None:
+                    vae_weights_path = os.path.join(
+                        diffusers_weights_path,
+                        "vae",
+                        "diffusion_pytorch_model.safetensors",
+                    )
+                    weights[key] = save_irpa(vae_weights_path, "vae.")
+
+        vmfbs, weights = self.sd_pipe.check_prepared(
+            mlirs, vmfbs, weights, interactive=False
+        )
+        print(f"\n[LOG] Loading pipeline to device {self.rt_device}.")
+        self.sd_pipe.load_pipeline(
+            vmfbs, weights, self.rt_device, self.compiled_pipeline
+        )
+        print(
+            "\n[LOG] Pipeline successfully prepared for runtime. Generating images..."
+        )
        return

-    def encode_prompts_weight(
-        self,
-        prompt,
-        negative_prompt,
-        do_classifier_free_guidance=True,
-    ):
-        # Encodes the prompt into text encoder hidden states.
-        self.load_submodels(["clip"])
-        self.tokenizer = CLIPTokenizer.from_pretrained(
-            self.base_model_id,
-            subfolder="tokenizer",
-        )
-        clip_inf_start = time.time()
-
-        text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
-            pipe=self,
-            prompt=prompt,
-            uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
-        )
-
-        if do_classifier_free_guidance:
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        pad = (0, 0) * (len(text_embeddings.shape) - 2)
-        pad = pad + (
-            0,
-            self.static_kwargs["unet"]["max_length"] - text_embeddings.shape[1],
-        )
-        text_embeddings = torch.nn.functional.pad(text_embeddings, pad)
-
-        # SHARK: Report clip inference time
-        clip_inf_time = (time.time() - clip_inf_start) * 1000
-        if self.ondemand:
-            self.unload_submodels(["clip"])
-            gc.collect()
-        print(f"\n[LOG] Clip Inference time (ms) = {clip_inf_time:.3f}")
-
-        return text_embeddings.numpy().astype(np.float16)
-
-    def prepare_latents(
-        self,
-        generator,
-        num_inference_steps,
-        image,
-        strength,
-    ):
-        noise = torch.randn(
-            (
-                self.batch_size,
-                4,
-                self.height // 8,
-                self.width // 8,
-            ),
-            generator=generator,
-            dtype=self.dtype,
-        ).to("cpu")
-
-        self.scheduler.set_timesteps(num_inference_steps)
-        if self.is_img2img:
-            init_timestep = min(
-                int(num_inference_steps * strength), num_inference_steps
-            )
-            t_start = max(num_inference_steps - init_timestep, 0)
-            timesteps = self.scheduler.timesteps[t_start:]
-            latents = self.encode_image(image)
-            latents = self.scheduler.add_noise(latents, noise, timesteps[0].repeat(1))
-            return latents, [timesteps]
-        else:
-            self.scheduler.is_scale_input_called = True
-            latents = noise * self.scheduler.init_noise_sigma
-            return latents, self.scheduler.timesteps
-
-    def encode_image(self, input_image):
-        self.load_submodels(["vae_encode"])
-        vae_encode_start = time.time()
-        latents = self.run("vae_encode", input_image)
-        vae_inf_time = (time.time() - vae_encode_start) * 1000
-        if self.ondemand:
-            self.unload_submodels(["vae_encode"])
-        print(f"\n[LOG] VAE Encode Inference time (ms): {vae_inf_time:.3f}")
-
-        return latents
-
-    def produce_img_latents(
-        self,
-        latents,
-        text_embeddings,
-        guidance_scale,
-        total_timesteps,
-        cpu_scheduling,
-        mask=None,
-        masked_image_latents=None,
-        return_all_latents=False,
-    ):
-        # self.status = SD_STATE_IDLE
-        step_time_sum = 0
-        latent_history = [latents]
-        text_embeddings = torch.from_numpy(text_embeddings).to(self.dtype)
-        text_embeddings_numpy = text_embeddings.detach().numpy()
-        guidance_scale = torch.Tensor([guidance_scale]).to(self.dtype)
-        self.load_submodels(["unet"])
-        for i, t in tqdm(enumerate(total_timesteps)):
-            step_start_time = time.time()
-            timestep = torch.tensor([t]).to(self.dtype).detach().numpy()
-            latent_model_input = self.scheduler.scale_model_input(latents, t).to(
-                self.dtype
-            )
-            if mask is not None and masked_image_latents is not None:
-                latent_model_input = torch.cat(
-                    [
-                        torch.from_numpy(np.asarray(latent_model_input)).to(self.dtype),
-                        mask,
-                        masked_image_latents,
-                    ],
-                    dim=1,
-                ).to(self.dtype)
-            if cpu_scheduling:
-                latent_model_input = latent_model_input.detach().numpy()
-
-            # Profiling Unet.
-            # profile_device = start_profiling(file_path="unet.rdc")
-            noise_pred = self.run(
-                "unet",
-                [
-                    latent_model_input,
-                    timestep,
-                    text_embeddings_numpy,
-                    guidance_scale,
-                ],
-            )
-            # end_profiling(profile_device)
-
-            if cpu_scheduling:
-                noise_pred = torch.from_numpy(noise_pred.to_host())
-                latents = self.scheduler.step(noise_pred, t, latents).prev_sample
-            else:
-                latents = self.run("scheduler_step", (noise_pred, t, latents))
-
-            latent_history.append(latents)
-            step_time = (time.time() - step_start_time) * 1000
-            # print(
-            #     f"\n [LOG] step = {i} | timestep = {t} | time = {step_time:.2f}ms"
-            # )
-            step_time_sum += step_time
-
-            # if self.status == SD_STATE_CANCEL:
-            #    break
-
-        if self.ondemand:
-            self.unload_submodels(["unet"])
-            gc.collect()
-
-        avg_step_time = step_time_sum / len(total_timesteps)
-        print(f"\n[LOG] Average step time: {avg_step_time}ms/it")
-
-        if not return_all_latents:
-            return latents
-        all_latents = torch.cat(latent_history, dim=0)
-        return all_latents
-
-    def decode_latents(self, latents, cpu_scheduling=True):
-        latents_numpy = latents.to(self.dtype)
-        if cpu_scheduling:
-            latents_numpy = latents.detach().numpy()
-
-        # profile_device = start_profiling(file_path="vae.rdc")
-        vae_start = time.time()
-        images = self.run("vae_decode", latents_numpy).to_host()
-        vae_inf_time = (time.time() - vae_start) * 1000
-        # end_profiling(profile_device)
-        print(f"\n[LOG] VAE Inference time (ms): {vae_inf_time:.3f}")
-
-        images = torch.from_numpy(images).permute(0, 2, 3, 1).float().numpy()
-        pil_images = self.image_processor.numpy_to_pil(images)
-        return pil_images
-
    def generate_images(
        self,
        prompt,
        negative_prompt,
        image,
-        scheduler,
-        steps,
        strength,
        guidance_scale,
        seed,
        ondemand,
-        repeatable_seeds,
        resample_type,
        control_mode,
        hints,
    ):
-        # TODO: Batched args
-        self.image_processor = VaeImageProcessor(do_convert_rgb=True)
-        self.scheduler = self.schedulers[scheduler]
-        self.ondemand = ondemand
-        if self.is_img2img:
-            image, _ = self.image_processor.preprocess(image, resample_type)
-        else:
-            image = None
-
-        print("\n[LOG] Generating images...")
-        batched_args = [
-            prompt,
-            negative_prompt,
-            image,
-        ]
-        for arg in batched_args:
-            if not isinstance(arg, list):
-                arg = [arg] * self.batch_size
-            if len(arg) < self.batch_size:
-                arg = arg * self.batch_size
-            else:
-                arg = [arg[i] for i in range(self.batch_size)]
-
-        text_embeddings = self.encode_prompts_weight(
+        img = self.sd_pipe.generate_images(
            prompt,
            negative_prompt,
+            1,
+            guidance_scale,
+            seed,
+            return_imgs=True,
        )
-
-        uint32_info = np.iinfo(np.uint32)
-        uint32_min, uint32_max = uint32_info.min, uint32_info.max
-        if seed < uint32_min or seed >= uint32_max:
-            seed = randint(uint32_min, uint32_max)
-
-        generator = torch.manual_seed(seed)
-
-        init_latents, final_timesteps = self.prepare_latents(
-            generator=generator,
-            num_inference_steps=steps,
-            image=image,
-            strength=strength,
-        )
-
-        latents = self.produce_img_latents(
-            latents=init_latents,
-            text_embeddings=text_embeddings,
-            guidance_scale=guidance_scale,
-            total_timesteps=final_timesteps,
-            cpu_scheduling=True,  # until we have schedulers through Turbine
-        )
-
-        # Img latents -> PIL images
-        all_imgs = []
-        self.load_submodels(["vae_decode"])
-        for i in tqdm(range(0, latents.shape[0], self.batch_size)):
-            imgs = self.decode_latents(
-                latents=latents[i : i + self.batch_size],
-                cpu_scheduling=True,
-            )
-            all_imgs.extend(imgs)
-        if self.ondemand:
-            self.unload_submodels(["vae_decode"])
-
-        return all_imgs
+        return img


 def shark_sd_fn_dict_input(
    sd_kwargs: dict,
 ):
-    print("[LOG] Submitting Request...")
+    print("\n[LOG] Submitting Request...")

    for key in sd_kwargs:
        if sd_kwargs[key] in [None, []]:
@@ -437,9 +286,34 @@ def shark_sd_fn_dict_input(
        if key == "seed":
            sd_kwargs[key] = int(sd_kwargs[key])

-    for i in range(1):
-        generated_imgs = yield from shark_sd_fn(**sd_kwargs)
-        yield generated_imgs
+    # TODO: move these checks into the UI code so we don't have gradio warnings in a generalized dict input function.
+    if not sd_kwargs["device"]:
+        gr.Warning("No device specified. Please specify a device.")
+        return None, ""
+    if sd_kwargs["height"] not in [512, 1024]:
+        gr.Warning("Height must be 512 or 1024. This is a temporary limitation.")
+        return None, ""
+    if sd_kwargs["height"] != sd_kwargs["width"]:
+        gr.Warning("Height and width must be the same. This is a temporary limitation.")
+        return None, ""
+    if sd_kwargs["base_model_id"] == "stabilityai/sdxl-turbo":
+        if sd_kwargs["steps"] > 10:
+            gr.Warning("Max steps for sdxl-turbo is 10. 1 to 4 steps are recommended.")
+            return None, ""
+        if sd_kwargs["guidance_scale"] > 3:
+            gr.Warning(
+                "sdxl-turbo CFG scale should be less than 2.0 if using negative prompt, 0 otherwise."
+            )
+            return None, ""
+    if sd_kwargs["target_triple"] == "":
+        if parse_device(sd_kwargs["device"], sd_kwargs["target_triple"])[2] == "":
+            gr.Warning(
+                "Target device architecture could not be inferred. Please specify a target triple, e.g. 'gfx1100' for a Radeon 7900xtx."
+            )
+            return None, ""
+
+    generated_imgs = yield from shark_sd_fn(**sd_kwargs)
+    return generated_imgs


 def shark_sd_fn(
@@ -460,8 +334,9 @@ def shark_sd_fn(
    custom_vae: str,
    precision: str,
    device: str,
+    target_triple: str,
    ondemand: bool,
-    repeatable_seeds: bool,
+    compiled_pipeline: bool,
    resample_type: str,
    controlnets: dict,
    embeddings: dict,
@@ -471,8 +346,6 @@ def shark_sd_fn(
        sd_init_image = [sd_init_image]
    is_img2img = True if sd_init_image[0] is not None else False

-    print("\n[LOG] Performing Stable Diffusion Pipeline setup...")
-
    from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
    import apps.shark_studio.web.utils.globals as global_obj

@@ -481,6 +354,7 @@ def shark_sd_fn(
    control_mode = None
    hints = []
    num_loras = 0
+    import_ir = True
    for i in embeddings:
        num_loras += 1 if embeddings[i] else 0
    if "model" in controlnets:
@@ -512,28 +386,29 @@ def shark_sd_fn(
        "batch_size": batch_size,
        "precision": precision,
        "device": device,
+        "target_triple": target_triple,
        "custom_vae": custom_vae,
        "num_loras": num_loras,
-        "import_ir": cmd_opts.import_mlir,
+        "import_ir": import_ir,
        "is_controlled": is_controlled,
+        "steps": steps,
+        "scheduler": scheduler,
    }
    submit_prep_kwargs = {
        "custom_weights": custom_weights,
        "adapters": adapters,
        "embeddings": embeddings,
        "is_img2img": is_img2img,
+        "compiled_pipeline": compiled_pipeline,
    }
    submit_run_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "image": sd_init_image,
-        "steps": steps,
-        "scheduler": scheduler,
        "strength": strength,
        "guidance_scale": guidance_scale,
        "seed": seed,
        "ondemand": ondemand,
-        "repeatable_seeds": repeatable_seeds,
        "resample_type": resample_type,
        "control_mode": control_mode,
        "hints": hints,
@@ -566,22 +441,35 @@ def shark_sd_fn(
    for current_batch in range(batch_count):
        start_time = time.time()
        out_imgs = global_obj.get_sd_obj().generate_images(**submit_run_kwargs)
-        total_time = time.time() - start_time
-        text_output = f"Total image(s) generation time: {total_time:.4f}sec"
-        print(f"\n[LOG] {text_output}")
+        if not isinstance(out_imgs, list):
+            out_imgs = [out_imgs]
+        # total_time = time.time() - start_time
+        # text_output = f"Total image(s) generation time: {total_time:.4f}sec"
+        # print(f"\n[LOG] {text_output}")
        # if global_obj.get_sd_status() == SD_STATE_CANCEL:
        #     break
        # else:
-        save_output_img(
-            out_imgs[current_batch],
-            seed,
-            sd_kwargs,
-        )
+        for batch in range(batch_size):
+            save_output_img(
+                out_imgs[batch],
+                seed,
+                sd_kwargs,
+            )
        generated_imgs.extend(out_imgs)
+        # TODO: make seed changes over batch counts more configurable.
+        submit_run_kwargs["seed"] = submit_run_kwargs["seed"] + 1
        yield generated_imgs, status_label(
            "Stable Diffusion", current_batch + 1, batch_count, batch_size
        )
-    return generated_imgs, ""
+    return (generated_imgs, "")
+
+
+def unload_sd():
+    """Clear the shared globals cache and run a garbage-collection pass.
+
+    Prints a notice, calls ``global_obj.clear_cache()`` and then
+    ``gc.collect()``. Takes no arguments and returns nothing.
+    """
+    print("Unloading models.")
+    # Imported at call time rather than at module import time.
+    import apps.shark_studio.web.utils.globals as global_obj
+
+    global_obj.clear_cache()
+    gc.collect()


 def cancel_sd():
@@ -596,13 +484,19 @@ def view_json_file(file_path):
    return content


+def safe_name(name):
+    """Return ``name`` with every forward slash, backslash and dot replaced by an underscore."""
+    return name.replace("/", "_").replace("\\", "_").replace(".", "_")
+
+
 if __name__ == "__main__":
    from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
    import apps.shark_studio.web.utils.globals as global_obj

    global_obj._init()

-    sd_json = view_json_file(get_resource_path("../configs/default_sd_config.json"))
+    sd_json = view_json_file(
+        get_resource_path(os.path.join(cmd_opts.config_dir, "default_sd_config.json"))
+    )
    sd_kwargs = json.loads(sd_json)
    for arg in vars(cmd_opts):
        if arg in sd_kwargs:
--- a/apps/shark_studio/api/utils.py
+++ b/apps/shark_studio/api/utils.py
@@ -52,6 +52,13 @@ def get_available_devices():
    set_iree_runtime_flags()

    available_devices = []
+    rocm_devices = get_devices_by_name("rocm")
+    available_devices.extend(rocm_devices)
+    cpu_device = get_devices_by_name("cpu-sync")
+    available_devices.extend(cpu_device)
+    cpu_device = get_devices_by_name("cpu-task")
+    available_devices.extend(cpu_device)
+
    from shark.iree_utils.vulkan_utils import (
        get_all_vulkan_devices,
    )
@@ -64,17 +71,28 @@ def get_available_devices():
        id += 1
    if id != 0:
        print(f"vulkan devices are available.")
+
    available_devices.extend(vulkan_devices)
    metal_devices = get_devices_by_name("metal")
    available_devices.extend(metal_devices)
    cuda_devices = get_devices_by_name("cuda")
    available_devices.extend(cuda_devices)
-    rocm_devices = get_devices_by_name("rocm")
-    available_devices.extend(rocm_devices)
-    cpu_device = get_devices_by_name("cpu-sync")
-    available_devices.extend(cpu_device)
-    cpu_device = get_devices_by_name("cpu-task")
-    available_devices.extend(cpu_device)
+    hip_devices = get_devices_by_name("hip")
+    available_devices.extend(hip_devices)
+
+    for idx, device_str in enumerate(available_devices):
+        if "AMD Radeon(TM) Graphics =>" in device_str:
+            igpu_id_candidates = [
+                x.split("w/")[-1].split("=>")[0]
+                for x in available_devices
+                if "M Graphics" in x
+            ]
+            for igpu_name in igpu_id_candidates:
+                if igpu_name:
+                    available_devices[idx] = device_str.replace(
+                        "AMD Radeon(TM) Graphics", igpu_name
+                    )
+                break
    return available_devices


@@ -127,6 +145,57 @@ def set_iree_runtime_flags():
    set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)


+def parse_device(device_str, target_override=""):
+    """Resolve a device string into ``(target_backend, rt_device, triple)``.
+
+    Args:
+        device_str: device description passed to ``clean_device_info``.
+        target_override: when non-empty, returned as-is as the third element
+            and no architecture inference is attempted.
+
+    Returns:
+        A 3-tuple ``(target_backend, rt_device, triple)``. ``triple`` is the
+        empty string for backends that have no inference rule here, so callers
+        can always index the result and test ``[2] == ""``.
+
+    Raises:
+        AssertionError: propagated from ``get_rocm_target_chip`` when a rocm
+            device string matches none of its known chips.
+    """
+    from shark.iree_utils.compile_utils import (
+        clean_device_info,
+        get_iree_target_triple,
+        iree_target_map,
+    )
+
+    rt_driver, device_id = clean_device_info(device_str)
+    target_backend = iree_target_map(rt_driver)
+    if device_id:
+        rt_device = f"{rt_driver}://{device_id}"
+    else:
+        rt_device = rt_driver
+
+    if target_override:
+        return target_backend, rt_device, target_override
+    match target_backend:
+        case "vulkan-spirv":
+            triple = get_iree_target_triple(device_str)
+            return target_backend, rt_device, triple
+        case "rocm":
+            triple = get_rocm_target_chip(device_str)
+            return target_backend, rt_device, triple
+        case "llvm-cpu":
+            # CPU targets use a fixed runtime device and triple.
+            return "llvm-cpu", "local-task", "x86_64-linux-gnu"
+        case _:
+            # Without this arm any other backend fell out of the match and the
+            # function returned None, so callers indexing [2] raised TypeError.
+            return target_backend, rt_device, ""
+
+
+def get_rocm_target_chip(device_str):
+    """Map a device description to a ROCm ``gfx*`` chip name.
+
+    The lookup is a substring test: the table is scanned in insertion order
+    and the chip for the first key found anywhere in ``device_str`` is
+    returned.
+
+    Raises:
+        AssertionError: if no key of the table occurs in ``device_str``.
+    """
+    # TODO: Use a data file to map device_str to target chip.
+    rocm_chip_map = {
+        "6700": "gfx1031",
+        "6800": "gfx1030",
+        "6900": "gfx1030",
+        "7900": "gfx1100",
+        "MI300X": "gfx942",
+        "MI300A": "gfx940",
+        "MI210": "gfx90a",
+        "MI250": "gfx90a",
+        "MI100": "gfx908",
+        "MI50": "gfx906",
+        "MI60": "gfx906",
+        "780M": "gfx1103",
+    }
+    # First matching key wins; keys are matched as plain substrings.
+    for key in rocm_chip_map:
+        if key in device_str:
+            return rocm_chip_map[key]
+    raise AssertionError(
+        f"Device {device_str} not recognized. Please file an issue at https://github.com/nod-ai/SHARK/issues."
+    )
+
+
 def get_all_devices(driver_name):
    """
    Inputs: driver_name
--- a/apps/shark_studio/modules/ckpt_processing.py
+++ b/apps/shark_studio/modules/ckpt_processing.py
@@ -2,10 +2,16 @@ import os
 import json
 import re
 import requests
+import torch
+import safetensors
+from shark_turbine.aot.params import (
+    ParameterArchiveBuilder,
+)
 from io import BytesIO
 from pathlib import Path
 from tqdm import tqdm
 from omegaconf import OmegaConf
+from diffusers import StableDiffusionPipeline
 from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
 from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
    download_from_original_stable_diffusion_ckpt,
@@ -14,21 +20,21 @@ from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
 )


-def get_path_to_diffusers_checkpoint(custom_weights):
+def get_path_to_diffusers_checkpoint(custom_weights, precision="fp16"):
+    """Return (and create) the diffusers output directory for a checkpoint.
+
+    The directory is ``<parent of custom_weights>/diffusers/<stem>_<precision>``,
+    created with any missing parents, and returned as a POSIX-style string.
+    """
    path = Path(custom_weights)
    diffusers_path = path.parent.absolute()
-    diffusers_directory_name = os.path.join("diffusers", path.stem)
+    diffusers_directory_name = os.path.join("diffusers", path.stem + f"_{precision}")
    complete_path_to_diffusers = diffusers_path / diffusers_directory_name
    complete_path_to_diffusers.mkdir(parents=True, exist_ok=True)
    path_to_diffusers = complete_path_to_diffusers.as_posix()
    return path_to_diffusers


-def preprocessCKPT(custom_weights, is_inpaint=False):
-    path_to_diffusers = get_path_to_diffusers_checkpoint(custom_weights)
+def preprocessCKPT(custom_weights, precision="fp16", is_inpaint=False):
+    path_to_diffusers = get_path_to_diffusers_checkpoint(custom_weights, precision)
    if next(Path(path_to_diffusers).iterdir(), None):
        print("Checkpoint already loaded at : ", path_to_diffusers)
-        return
+        return path_to_diffusers
    else:
        print(
            "Diffusers' checkpoint will be identified here : ",
@@ -50,8 +56,24 @@ def preprocessCKPT(custom_weights, is_inpaint=False):
        from_safetensors=from_safetensors,
        num_in_channels=num_in_channels,
    )
+    if precision == "fp16":
+        pipe.to(dtype=torch.float16)
    pipe.save_pretrained(path_to_diffusers)
+    del pipe
    print("Loading complete")
+    return path_to_diffusers
+
+
+def save_irpa(weights_path, prepend_str):
+    """Repack a .safetensors weight file into an .irpa parameter archive.
+
+    Args:
+        weights_path: path to the .safetensors file to read.
+        prepend_str: prefix added to every tensor key in the archive.
+
+    Returns:
+        Path of the written archive: ``weights_path`` with its file extension
+        replaced by ``.irpa``.
+    """
+    # `import safetensors` alone does not load the `torch` submodule; import it
+    # explicitly so this does not rely on another module having done so.
+    import safetensors.torch
+
+    weights_path = os.fspath(weights_path)
+    weights = safetensors.torch.load_file(weights_path)
+    archive = ParameterArchiveBuilder()
+    for key, tensor in weights.items():
+        archive.add_tensor(prepend_str + key, tensor)
+
+    # Swap only the extension. A plain str.replace would also rewrite
+    # ".safetensors" inside directory names, and would return the input path
+    # unchanged (overwriting the source) when the suffix is absent.
+    irpa_file = os.path.splitext(weights_path)[0] + ".irpa"
+    archive.save(irpa_file)
+    return irpa_file


 def convert_original_vae(vae_checkpoint):
@@ -87,6 +109,7 @@ def process_custom_pipe_weights(custom_weights):
            ), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
            custom_weights_tgt = get_path_to_diffusers_checkpoint(custom_weights)
            custom_weights_params = custom_weights
+
        return custom_weights_params, custom_weights_tgt


@@ -98,7 +121,7 @@ def get_civitai_checkpoint(url: str):
        base_filename = re.findall(
            '"([^"]*)"', response.headers["Content-Disposition"]
        )[0]
-        destination_path = Path.cwd() / (cmd_opts.ckpt_dir or "models") / base_filename
+        destination_path = Path.cwd() / (cmd_opts.model_dir or "models") / base_filename

        # we don't have this model downloaded yet
        if not destination_path.is_file():
--- a/apps/shark_studio/modules/pipeline.py
+++ b/apps/shark_studio/modules/pipeline.py
@@ -41,7 +41,7 @@ class SharkPipelineBase:
        self.device, self.device_id = clean_device_info(device)
        self.import_mlir = import_mlir
        self.iree_module_dict = {}
-        self.tmp_dir = get_resource_path(os.path.join("..", "shark_tmp"))
+        self.tmp_dir = get_resource_path(cmd_opts.tmp_dir)
        if not os.path.exists(self.tmp_dir):
            os.mkdir(self.tmp_dir)
        self.tempfiles = {}
@@ -55,9 +55,7 @@ class SharkPipelineBase:
        # and your model map is populated with any IR - unique model IDs and their static params,
        # call this method to get the artifacts associated with your map.
        self.pipe_id = self.safe_name(pipe_id)
-        self.pipe_vmfb_path = Path(
-            os.path.join(get_checkpoints_path(".."), self.pipe_id)
-        )
+        self.pipe_vmfb_path = Path(os.path.join(get_checkpoints_path(), self.pipe_id))
        self.pipe_vmfb_path.mkdir(parents=False, exist_ok=True)
        if submodel == "None":
            print("\n[LOG] Gathering any pre-compiled artifacts....")
--- a/apps/shark_studio/modules/schedulers.py
+++ b/apps/shark_studio/modules/schedulers.py
@@ -24,47 +24,47 @@ def get_schedulers(model_id):
        model_id,
        subfolder="scheduler",
    )
-    schedulers["DDPM"] = DDPMScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers["KDPM2Discrete"] = KDPM2DiscreteScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers["DDIM"] = DDIMScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers["LCMScheduler"] = LCMScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers["DPMSolverMultistep"] = DPMSolverMultistepScheduler.from_pretrained(
-        model_id, subfolder="scheduler", algorithm_type="dpmsolver"
-    )
-    schedulers["DPMSolverMultistep++"] = DPMSolverMultistepScheduler.from_pretrained(
-        model_id, subfolder="scheduler", algorithm_type="dpmsolver++"
-    )
-    schedulers["DPMSolverMultistepKarras"] = (
-        DPMSolverMultistepScheduler.from_pretrained(
-            model_id,
-            subfolder="scheduler",
-            use_karras_sigmas=True,
-        )
-    )
-    schedulers["DPMSolverMultistepKarras++"] = (
-        DPMSolverMultistepScheduler.from_pretrained(
-            model_id,
-            subfolder="scheduler",
-            algorithm_type="dpmsolver++",
-            use_karras_sigmas=True,
-        )
-    )
+    # schedulers["DDPM"] = DDPMScheduler.from_pretrained(
+    #     model_id,
+    #     subfolder="scheduler",
+    # )
+    # schedulers["KDPM2Discrete"] = KDPM2DiscreteScheduler.from_pretrained(
+    #     model_id,
+    #     subfolder="scheduler",
+    # )
+    # schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
+    #     model_id,
+    #     subfolder="scheduler",
+    # )
+    # schedulers["DDIM"] = DDIMScheduler.from_pretrained(
+    #     model_id,
+    #     subfolder="scheduler",
+    # )
+    # schedulers["LCMScheduler"] = LCMScheduler.from_pretrained(
+    #     model_id,
+    #     subfolder="scheduler",
+    # )
+    # schedulers["DPMSolverMultistep"] = DPMSolverMultistepScheduler.from_pretrained(
+    #     model_id, subfolder="scheduler", algorithm_type="dpmsolver"
+    # )
+    # schedulers["DPMSolverMultistep++"] = DPMSolverMultistepScheduler.from_pretrained(
+    #     model_id, subfolder="scheduler", algorithm_type="dpmsolver++"
+    # )
+    # schedulers["DPMSolverMultistepKarras"] = (
+    #     DPMSolverMultistepScheduler.from_pretrained(
+    #         model_id,
+    #         subfolder="scheduler",
+    #         use_karras_sigmas=True,
+    #     )
+    # )
+    # schedulers["DPMSolverMultistepKarras++"] = (
+    #     DPMSolverMultistepScheduler.from_pretrained(
+    #         model_id,
+    #         subfolder="scheduler",
+    #         algorithm_type="dpmsolver++",
+    #         use_karras_sigmas=True,
+    #     )
+    # )
    schedulers["EulerDiscrete"] = EulerDiscreteScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
@@ -75,24 +75,24 @@ def get_schedulers(model_id):
            subfolder="scheduler",
        )
    )
-    schedulers["DEISMultistep"] = DEISMultistepScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers["DPMSolverSinglestep"] = DPMSolverSinglestepScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
-    schedulers["KDPM2AncestralDiscrete"] = (
-        KDPM2AncestralDiscreteScheduler.from_pretrained(
-            model_id,
-            subfolder="scheduler",
-        )
-    )
-    schedulers["HeunDiscrete"] = HeunDiscreteScheduler.from_pretrained(
-        model_id,
-        subfolder="scheduler",
-    )
+    # schedulers["DEISMultistep"] = DEISMultistepScheduler.from_pretrained(
+    #     model_id,
+    #     subfolder="scheduler",
+    # )
+    # schedulers["DPMSolverSinglestep"] = DPMSolverSinglestepScheduler.from_pretrained(
+    #     model_id,
+    #     subfolder="scheduler",
+    # )
+    # schedulers["KDPM2AncestralDiscrete"] = (
+    #     KDPM2AncestralDiscreteScheduler.from_pretrained(
+    #         model_id,
+    #         subfolder="scheduler",
+    #     )
+    # )
+    # schedulers["HeunDiscrete"] = HeunDiscreteScheduler.from_pretrained(
+    #     model_id,
+    #     subfolder="scheduler",
+    # )
    return schedulers


@@ -101,17 +101,18 @@ def export_scheduler_model(model):


 scheduler_model_map = {
+    "PNDM": export_scheduler_model("PNDMScheduler"),
+    # "DPMSolverSDE": export_scheduler_model("DpmSolverSDEScheduler"),
    "EulerDiscrete": export_scheduler_model("EulerDiscreteScheduler"),
    "EulerAncestralDiscrete": export_scheduler_model("EulerAncestralDiscreteScheduler"),
-    "LCM": export_scheduler_model("LCMScheduler"),
-    "LMSDiscrete": export_scheduler_model("LMSDiscreteScheduler"),
-    "PNDM": export_scheduler_model("PNDMScheduler"),
-    "DDPM": export_scheduler_model("DDPMScheduler"),
-    "DDIM": export_scheduler_model("DDIMScheduler"),
-    "DPMSolverMultistep": export_scheduler_model("DPMSolverMultistepScheduler"),
-    "KDPM2Discrete": export_scheduler_model("KDPM2DiscreteScheduler"),
-    "DEISMultistep": export_scheduler_model("DEISMultistepScheduler"),
-    "DPMSolverSinglestep": export_scheduler_model("DPMSolverSingleStepScheduler"),
-    "KDPM2AncestralDiscrete": export_scheduler_model("KDPM2AncestralDiscreteScheduler"),
-    "HeunDiscrete": export_scheduler_model("HeunDiscreteScheduler"),
+    # "LCM": export_scheduler_model("LCMScheduler"),
+    # "LMSDiscrete": export_scheduler_model("LMSDiscreteScheduler"),
+    # "DDPM": export_scheduler_model("DDPMScheduler"),
+    # "DDIM": export_scheduler_model("DDIMScheduler"),
+    # "DPMSolverMultistep": export_scheduler_model("DPMSolverMultistepScheduler"),
+    # "KDPM2Discrete": export_scheduler_model("KDPM2DiscreteScheduler"),
+    # "DEISMultistep": export_scheduler_model("DEISMultistepScheduler"),
+    # "DPMSolverSinglestep": export_scheduler_model("DPMSolverSingleStepScheduler"),
+    # "KDPM2AncestralDiscrete": export_scheduler_model("KDPM2AncestralDiscreteScheduler"),
+    # "HeunDiscrete": export_scheduler_model("HeunDiscreteScheduler"),
 }
--- a/apps/shark_studio/modules/shared_cmd_opts.py
+++ b/apps/shark_studio/modules/shared_cmd_opts.py
@@ -339,7 +339,7 @@ p.add_argument(
 p.add_argument(
    "--output_dir",
    type=str,
-    default=None,
+    default=os.path.join(os.getcwd(), "generated_imgs"),
    help="Directory path to save the output images and json.",
 )

@@ -613,12 +613,27 @@ p.add_argument(
 )

 p.add_argument(
-    "--ckpt_dir",
+    "--tmp_dir",
    type=str,
-    default="../models",
+    default=os.path.join(os.getcwd(), "shark_tmp"),
+    help="Path to tmp directory",
+)
+
+p.add_argument(
+    "--config_dir",
+    type=str,
+    default=os.path.join(os.getcwd(), "configs"),
+    help="Path to config directory",
+)
+
+p.add_argument(
+    "--model_dir",
+    type=str,
+    default=os.path.join(os.getcwd(), "models"),
    help="Path to directory where all .ckpts are stored in order to populate "
    "them in the web UI.",
 )
+
 # TODO: replace API flag when these can be run together
 p.add_argument(
    "--ui",
--- a/apps/shark_studio/tests/api_test.py
+++ b/apps/shark_studio/tests/api_test.py
@@ -36,6 +36,7 @@ class LLMAPITest(unittest.TestCase):
            device="cpu",
            precision="fp32",
            quantization="None",
+            streaming_llm=True,
        )
        count = 0
        label = "Turkishoure Turkish"
--- a/apps/shark_studio/web/configs/default_sd_config.json
+++ b/apps/shark_studio/web/configs/default_sd_config.json
@@ -1,28 +0,0 @@
-{
-  "prompt": [
-    "a photo taken of the front of a super-car drifting on a road near mountains at high speeds with smoke coming off the tires, front angle, front point of view, trees in the mountains of the background, ((sharp focus))"
-  ],
-  "negative_prompt": [
-    "watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
-  ],
-  "sd_init_image": [null],
-  "height": 512,
-  "width": 512,
-  "steps": 50,
-  "strength": 0.8,
-  "guidance_scale": 7.5,
-  "seed": "-1",
-  "batch_count": 1,
-  "batch_size": 1,
-  "scheduler": "EulerDiscrete",
-  "base_model_id": "stabilityai/stable-diffusion-2-1-base",
-  "custom_weights": null,
-  "custom_vae": null,
-  "precision": "fp16",
-  "device": "AMD Radeon RX 7900 XTX => vulkan://0",
-  "ondemand": false,
-  "repeatable_seeds": false,
-  "resample_type": "Nearest Neighbor",
-  "controlnets": {},
-  "embeddings": {}
-}
--- a/apps/shark_studio/web/index.py
+++ b/apps/shark_studio/web/index.py
@@ -76,8 +76,8 @@ def launch_webui(address):
 def webui():
    from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
    from apps.shark_studio.web.ui.utils import (
-        nodicon_loc,
-        nodlogo_loc,
+        amdicon_loc,
+        amdlogo_loc,
    )

    launch_api = cmd_opts.api
@@ -172,9 +172,9 @@ def webui():
        analytics_enabled=False,
        title="Shark Studio 2.0 Beta",
    ) as studio_web:
-        nod_logo = Image.open(nodlogo_loc)
+        amd_logo = Image.open(amdlogo_loc)
        gr.Image(
-            value=nod_logo,
+            value=amd_logo,
            show_label=False,
            interactive=False,
            elem_id="tab_bar_logo",
@@ -209,7 +209,7 @@ def webui():
        inbrowser=True,
        server_name="0.0.0.0",
        server_port=cmd_opts.server_port,
-        favicon_path=nodicon_loc,
+        favicon_path=amdicon_loc,
    )


--- a/apps/shark_studio/web/ui/chat.py
+++ b/apps/shark_studio/web/ui/chat.py
@@ -9,6 +9,7 @@ from apps.shark_studio.api.llm import (
    llm_model_map,
    LanguageModel,
 )
+from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
 import apps.shark_studio.web.utils.globals as global_obj

 B_SYS, E_SYS = "<s>", "</s>"
@@ -64,6 +65,7 @@ def chat_fn(
            external_weights="safetensors",
            use_system_prompt=prompt_prefix,
            streaming_llm=streaming_llm,
+            hf_auth_token=cmd_opts.hf_auth_token,
        )
        history[-1][-1] = "Getting the model ready... Done"
        yield history, ""
@@ -135,7 +137,8 @@ with gr.Blocks(title="Chat") as chat_element:
            streaming_llm = gr.Checkbox(
                label="Run in streaming mode (requires recompilation)",
                value=True,
-                interactive=True,
+                interactive=False,
+                visible=False,
            )
            prompt_prefix = gr.Checkbox(
                label="Add System Prompt",
--- a/apps/shark_studio/web/ui/css/sd_dark_theme.css
+++ b/apps/shark_studio/web/ui/css/sd_dark_theme.css
@@ -367,7 +367,7 @@ footer {
 #tab_bar_logo .image-container {
    object-fit: scale-down;
    position: absolute !important;
-    top: 14px;
+    top: 10px;
    right: 0px;
    height: 36px;
-}
+}
--- a/apps/shark_studio/web/ui/logos/amd-icon.jpg
+++ b/apps/shark_studio/web/ui/logos/amd-icon.jpg
--- a/apps/shark_studio/web/ui/logos/amd-logo.jpg
+++ b/apps/shark_studio/web/ui/logos/amd-logo.jpg
--- a/apps/shark_studio/web/ui/logos/nod-icon.png
+++ b/apps/shark_studio/web/ui/logos/nod-icon.png
--- a/apps/shark_studio/web/ui/logos/nod-logo.png
+++ b/apps/shark_studio/web/ui/logos/nod-logo.png
--- a/apps/shark_studio/web/ui/outputgallery.py
+++ b/apps/shark_studio/web/ui/outputgallery.py
@@ -10,7 +10,7 @@ from apps.shark_studio.web.utils.file_utils import (
    get_generated_imgs_path,
    get_generated_imgs_todays_subdir,
 )
-from apps.shark_studio.web.ui.utils import nodlogo_loc
+from apps.shark_studio.web.ui.utils import amdlogo_loc
 from apps.shark_studio.web.utils.metadata import displayable_metadata

 # -- Functions for file, directory and image info querying
@@ -60,7 +60,7 @@ def output_subdirs() -> list[str]:
 # --- Define UI layout for Gradio

 with gr.Blocks() as outputgallery_element:
-    nod_logo = Image.open(nodlogo_loc)
+    amd_logo = Image.open(amdlogo_loc)

    with gr.Row(elem_id="outputgallery_gallery"):
        # needed to workaround gradio issue:
@@ -73,7 +73,7 @@ with gr.Blocks() as outputgallery_element:
        with gr.Column(scale=6):
            logo = gr.Image(
                label="Getting subdirectories...",
-                value=nod_logo,
+                value=amd_logo,
                interactive=False,
                visible=True,
                show_label=True,
--- a/apps/shark_studio/web/ui/sd.py
+++ b/apps/shark_studio/web/ui/sd.py
@@ -14,12 +14,12 @@ from apps.shark_studio.web.utils.file_utils import (
    get_checkpoints_path,
    get_checkpoints,
    get_configs_path,
-    write_default_sd_config,
+    write_default_sd_configs,
 )
 from apps.shark_studio.api.sd import (
-    sd_model_map,
    shark_sd_fn_dict_input,
    cancel_sd,
+    unload_sd,
 )
 from apps.shark_studio.api.controlnet import (
    cnet_preview,
@@ -33,7 +33,7 @@ from apps.shark_studio.modules.img_processing import (
 )
 from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
 from apps.shark_studio.web.ui.utils import (
-    nodlogo_loc,
+    amdlogo_loc,
    none_to_str_none,
    str_none_to_none,
 )
@@ -45,11 +45,10 @@ from apps.shark_studio.modules import logger
 import apps.shark_studio.web.utils.globals as global_obj

 sd_default_models = [
-    "CompVis/stable-diffusion-v1-4",
    "runwayml/stable-diffusion-v1-5",
    "stabilityai/stable-diffusion-2-1-base",
    "stabilityai/stable-diffusion-2-1",
-    "stabilityai/stable-diffusion-xl-1.0",
+    "stabilityai/stable-diffusion-xl-base-1.0",
    "stabilityai/sdxl-turbo",
 ]

@@ -119,8 +118,9 @@ def pull_sd_configs(
    custom_vae,
    precision,
    device,
+    target_triple,
    ondemand,
-    repeatable_seeds,
+    compiled_pipeline,
    resample_type,
    controlnets,
    embeddings,
@@ -177,8 +177,9 @@ def load_sd_cfg(sd_json: dict, load_sd_config: str):
        sd_json["custom_vae"],
        sd_json["precision"],
        sd_json["device"],
+        sd_json["target_triple"],
        sd_json["ondemand"],
-        sd_json["repeatable_seeds"],
+        sd_json["compiled_pipeline"],
        sd_json["resample_type"],
        sd_json["controlnets"],
        sd_json["embeddings"],
@@ -255,6 +256,11 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                        choices=global_obj.get_device_list(),
                        allow_custom_value=False,
                    )
+                    target_triple = gr.Textbox(
+                        elem_id="target_triple",
+                        label="Architecture",
+                        value="",
+                    )
                    with gr.Row():
                        ondemand = gr.Checkbox(
                            value=cmd_opts.lowvram,
@@ -277,18 +283,19 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                    elem_id="custom_model",
                    value="stabilityai/stable-diffusion-2-1-base",
                    choices=sd_default_models,
+                    allow_custom_value=True,
                )  # base_model_id
                with gr.Row():
                    height = gr.Slider(
                        384,
-                        768,
+                        1024,
                        value=cmd_opts.height,
                        step=8,
                        label="\U00002195\U0000FE0F Height",
                    )
                    width = gr.Slider(
                        384,
-                        768,
+                        1024,
                        value=cmd_opts.width,
                        step=8,
                        label="\U00002194\U0000FE0F Width",
@@ -581,21 +588,6 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                                    object_fit="fit",
                                    preview=True,
                                )
-                            with gr.Row():
-                                std_output = gr.Textbox(
-                                    value=f"{sd_model_info}\n"
-                                    f"Images will be saved at "
-                                    f"{get_generated_imgs_path()}",
-                                    lines=2,
-                                    elem_id="std_output",
-                                    show_label=True,
-                                    label="Log",
-                                    show_copy_button=True,
-                                )
-                                sd_element.load(
-                                    logger.read_sd_logs, None, std_output, every=1
-                                )
-                                sd_status = gr.Textbox(visible=False)
                            with gr.Row():
                                batch_count = gr.Slider(
                                    1,
@@ -614,17 +606,15 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                                    interactive=True,
                                    visible=True,
                                )
-                                repeatable_seeds = gr.Checkbox(
-                                    cmd_opts.repeatable_seeds,
-                                    label="Use Repeatable Seeds for Batches",
+                                compiled_pipeline = gr.Checkbox(
+                                    False,
+                                    label="Faster txt2img (SDXL only)",
                                )
                            with gr.Row():
                                stable_diffusion = gr.Button("Start")
-                                random_seed = gr.Button("Randomize Seed")
-                                random_seed.click(
-                                    lambda: -1,
-                                    inputs=[],
-                                    outputs=[seed],
+                                unload = gr.Button("Unload Models")
+                                unload.click(
+                                    fn=unload_sd,
                                    queue=False,
                                    show_progress=False,
                                )
@@ -639,7 +629,7 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                                    get_configs_path(),
                                    "default_sd_config.json",
                                )
-                                write_default_sd_config(default_config_file)
+                                write_default_sd_configs(get_configs_path())
                                sd_json = gr.JSON(
                                    elem_classes=["fill"],
                                    value=view_json_file(default_config_file),
@@ -693,8 +683,9 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                                        custom_vae,
                                        precision,
                                        device,
+                                        target_triple,
                                        ondemand,
-                                        repeatable_seeds,
+                                        compiled_pipeline,
                                        resample_type,
                                        cnet_config,
                                        embeddings_config,
@@ -711,6 +702,22 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
                            inputs=[sd_json, sd_config_name],
                            outputs=[sd_config_name],
                        )
+                    with gr.Tab(label="Log", id=103) as sd_tab_log:
+                        with gr.Row():
+                            std_output = gr.Textbox(
+                                value=f"{sd_model_info}\n"
+                                f"Images will be saved at "
+                                f"{get_generated_imgs_path()}",
+                                lines=2,
+                                elem_id="std_output",
+                                show_label=True,
+                                label="Log",
+                                show_copy_button=True,
+                            )
+                            sd_element.load(
+                                logger.read_sd_logs, None, std_output, every=1
+                            )
+                            sd_status = gr.Textbox(visible=False)

    pull_kwargs = dict(
        fn=pull_sd_configs,
@@ -732,8 +739,9 @@ with gr.Blocks(title="Stable Diffusion") as sd_element:
            custom_vae,
            precision,
            device,
+            target_triple,
            ondemand,
-            repeatable_seeds,
+            compiled_pipeline,
            resample_type,
            cnet_config,
            embeddings_config,
--- a/apps/shark_studio/web/ui/utils.py
+++ b/apps/shark_studio/web/ui/utils.py
@@ -10,8 +10,8 @@ def resource_path(relative_path):
    return os.path.join(base_path, relative_path)


-nodlogo_loc = resource_path("logos/nod-logo.png")
-nodicon_loc = resource_path("logos/nod-icon.png")
+amdlogo_loc = resource_path("logos/amd-logo.jpg")
+amdicon_loc = resource_path("logos/amd-icon.jpg")


 class HSLHue(IntEnum):
--- a/apps/shark_studio/web/utils/default_configs.py
+++ b/apps/shark_studio/web/utils/default_configs.py
@@ -0,0 +1,95 @@
+default_sd_config = r"""{
+  "prompt": [
+    "a photo taken of the front of a super-car drifting on a road near mountains at high speeds with smoke coming off the tires, front angle, front point of view, trees in the mountains of the background, ((sharp focus))"
+  ],
+  "negative_prompt": [
+    "watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
+  ],
+  "sd_init_image": [null],
+  "height": 512,
+  "width": 512,
+  "steps": 50,
+  "strength": 0.8,
+  "guidance_scale": 7.5,
+  "seed": "-1",
+  "batch_count": 1,
+  "batch_size": 1,
+  "scheduler": "EulerDiscrete",
+  "base_model_id": "stabilityai/stable-diffusion-2-1-base",
+  "custom_weights": null,
+  "custom_vae": null,
+  "precision": "fp16",
+  "device": "",
+  "target_triple": "",
+  "ondemand": false,
+  "compiled_pipeline": false,
+  "resample_type": "Nearest Neighbor",
+  "controlnets": {},
+  "embeddings": {}
+}"""
+
+sdxl_30steps = r"""{
+  "prompt": [
+    "a cat under the snow with blue eyes, covered by snow, cinematic style, medium shot, professional photo, animal"
+  ],
+  "negative_prompt": [
+    "watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
+  ],
+  "sd_init_image": [null],
+  "height": 1024,
+  "width": 1024,
+  "steps": 30,
+  "strength": 0.8,
+  "guidance_scale": 7.5,
+  "seed": "-1",
+  "batch_count": 1,
+  "batch_size": 1,
+  "scheduler": "EulerDiscrete",
+  "base_model_id": "stabilityai/stable-diffusion-xl-base-1.0",
+  "custom_weights": null,
+  "custom_vae": null,
+  "precision": "fp16",
+  "device": "",
+  "target_triple": "",
+  "ondemand": false,
+  "compiled_pipeline": true,
+  "resample_type": "Nearest Neighbor",
+  "controlnets": {},
+  "embeddings": {}
+}"""
+
+sdxl_turbo = r"""{
+  "prompt": [
+    "A cat wearing a hat that says 'TURBO' on it. The cat is sitting on a skateboard."
+  ],
+  "negative_prompt": [
+    ""
+  ],
+  "sd_init_image": [null],
+  "height": 512,
+  "width": 512,
+  "steps": 2,
+  "strength": 0.8,
+  "guidance_scale": 0,
+  "seed": "-1",
+  "batch_count": 1,
+  "batch_size": 1,
+  "scheduler": "EulerAncestralDiscrete",
+  "base_model_id": "stabilityai/sdxl-turbo",
+  "custom_weights": null,
+  "custom_vae": null,
+  "precision": "fp16",
+  "device": "",
+  "target_triple": "",
+  "ondemand": false,
+  "compiled_pipeline": true,
+  "resample_type": "Nearest Neighbor",
+  "controlnets": {},
+  "embeddings": {}
+}"""
+
+default_sd_configs = {
+    "default_sd_config.json": default_sd_config,
+    "sdxl-30steps.json": sdxl_30steps,
+    "sdxl-turbo.json": sdxl_turbo,
+}
--- a/apps/shark_studio/web/utils/file_utils.py
+++ b/apps/shark_studio/web/utils/file_utils.py
@@ -11,43 +11,18 @@ checkpoints_filetypes = (
    "*.safetensors",
 )

-default_sd_config = r"""{
-  "prompt": [
-    "a photo taken of the front of a super-car drifting on a road near mountains at high speeds with smoke coming off the tires, front angle, front point of view, trees in the mountains of the background, ((sharp focus))"
-  ],
-  "negative_prompt": [
-    "watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
-  ],
-  "sd_init_image": [null],
-  "height": 512,
-  "width": 512,
-  "steps": 50,
-  "strength": 0.8,
-  "guidance_scale": 7.5,
-  "seed": "-1",
-  "batch_count": 1,
-  "batch_size": 1,
-  "scheduler": "EulerDiscrete",
-  "base_model_id": "stabilityai/stable-diffusion-2-1-base",
-  "custom_weights": null,
-  "custom_vae": null,
-  "precision": "fp16",
-  "device": "AMD Radeon RX 7900 XTX => vulkan://0",
-  "ondemand": false,
-  "repeatable_seeds": false,
-  "resample_type": "Nearest Neighbor",
-  "controlnets": {},
-  "embeddings": {}
-}"""
+from apps.shark_studio.web.utils.default_configs import default_sd_configs


-def write_default_sd_config(path):
-    with open(path, "w") as f:
-        f.write(default_sd_config)
+def write_default_sd_configs(path):
+    # Write every bundled default config into `path`, overwriting existing files.
+    for fname, contents in default_sd_configs.items():
+        with open(os.path.join(path, fname), "w") as out:
+            out.write(contents)


 def safe_name(name):
-    return name.replace("/", "_").replace("-", "_")
+    return name.split("/")[-1].replace("-", "_")


 def get_path_stem(path):
@@ -66,33 +41,39 @@ def get_resource_path(path):


 def get_configs_path() -> Path:
-    configs = get_resource_path(os.path.join("..", "configs"))
+    configs = get_resource_path(cmd_opts.config_dir)
    if not os.path.exists(configs):
        os.mkdir(configs)
-    return Path(get_resource_path("../configs"))
+    return Path(configs)


 def get_generated_imgs_path() -> Path:
-    return Path(
-        cmd_opts.output_dir
-        if cmd_opts.output_dir
-        else get_resource_path("../generated_imgs")
-    )
+    outputs = get_resource_path(cmd_opts.output_dir)
+    if not os.path.exists(outputs):
+        os.mkdir(outputs)
+    return Path(outputs)
+
+
+def get_tmp_path() -> Path:
+    tmpdir = get_resource_path(cmd_opts.model_dir)  # NOTE(review): reads model_dir, not tmp_dir, despite the function name - confirm intended
+    if not os.path.exists(tmpdir):
+        os.mkdir(tmpdir)  # create the directory on first use
+    return Path(tmpdir)


 def get_generated_imgs_todays_subdir() -> str:
    return dt.now().strftime("%Y%m%d")


-def create_checkpoint_folders():
+def create_model_folders():
    dir = ["checkpoints", "vae", "lora", "vmfb"]
-    if not os.path.isdir(cmd_opts.ckpt_dir):
+    if not os.path.isdir(cmd_opts.model_dir):
        try:
-            os.makedirs(cmd_opts.ckpt_dir)
+            os.makedirs(cmd_opts.model_dir)
        except OSError:
            sys.exit(
-                f"Invalid --ckpt_dir argument, "
-                f"{cmd_opts.ckpt_dir} folder does not exist, and cannot be created."
+                f"Invalid --model_dir argument, "
+                f"{cmd_opts.model_dir} folder does not exist, and cannot be created."
            )

    for root in dir:
@@ -100,7 +81,7 @@ def create_checkpoint_folders():


 def get_checkpoints_path(model_type=""):
-    return get_resource_path(os.path.join(cmd_opts.ckpt_dir, model_type))
+    return get_resource_path(os.path.join(cmd_opts.model_dir, model_type))


 def get_checkpoints(model_type="checkpoints"):
--- a/apps/shark_studio/web/utils/metadata/png_metadata.py
+++ b/apps/shark_studio/web/utils/metadata/png_metadata.py
@@ -3,9 +3,8 @@ from pathlib import Path
 from apps.shark_studio.web.utils.file_utils import (
    get_checkpoint_pathfile,
 )
-from apps.shark_studio.api.sd import (
-    sd_model_map,
-)
+from apps.shark_studio.api.sd import EMPTY_SD_MAP as sd_model_map
+
 from apps.shark_studio.modules.schedulers import (
    scheduler_model_map,
 )
--- a/apps/shark_studio/web/utils/tmp_configs.py
+++ b/apps/shark_studio/web/utils/tmp_configs.py
@@ -2,7 +2,9 @@ import os
 import shutil
 from time import time

-shark_tmp = os.path.join(os.getcwd(), "shark_tmp/")
+from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
+
+shark_tmp = cmd_opts.tmp_dir  # os.path.join(os.getcwd(), "shark_tmp/")


 def clear_tmp_mlir():
@@ -15,7 +17,7 @@ def clear_tmp_mlir():
        and filename.endswith(".mlir")
    ]
    for filename in mlir_files:
-        os.remove(shark_tmp + filename)
+        os.remove(os.path.join(shark_tmp, filename))
    print(f"Clearing .mlir temporary files took {time() - cleanup_start:.4f} seconds.")


--- a/dataset/annotation_tool.py
+++ b/dataset/annotation_tool.py
@@ -10,7 +10,7 @@ from utils import get_datasets

 shark_root = Path(__file__).parent.parent
 demo_css = shark_root.joinpath("web/demo.css").resolve()
-nodlogo_loc = shark_root.joinpath("web/models/stable_diffusion/logos/nod-logo.png")
+nodlogo_loc = shark_root.joinpath("web/models/stable_diffusion/logos/amd-logo.jpg")


 with gr.Blocks(title="Dataset Annotation Tool", css=demo_css) as shark_web:
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,16 @@
-f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
-f https://openxla.github.io/iree/pip-release-links.html
+-f https://download.pytorch.org/whl/nightly/cpu
+-f https://iree.dev/pip-release-links.html
 --pre

 setuptools
 wheel

-torch==2.3.0.dev20240305
-shark-turbine @ git+https://github.com/nod-ai/SHARK-Turbine.git@ean-sd-fp16#subdirectory=core
-turbine-models @ git+https://github.com/nod-ai/SHARK-Turbine.git@ean-sd-fp16#subdirectory=models
+
+torch==2.3.0
+shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main
+turbine-models @ git+https://github.com/nod-ai/SHARK-Turbine.git@ean-unify-sd#subdirectory=models
+diffusers @ git+https://github.com/nod-ai/diffusers@0.29.0.dev0-shark
+brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b

 # SHARK Runner
 tqdm
@@ -17,8 +20,6 @@ google-cloud-storage

 # Testing
 pytest
-pytest-xdist
-pytest-forked
 Pillow
 parameterized

@@ -26,8 +27,10 @@ parameterized
 #accelerate is now required for diffusers import from ckpt.
 accelerate
 scipy
+transformers==4.37.1
+torchsde # Required for Stable Diffusion SDE schedulers.
 ftfy
-gradio==4.19.2
+gradio==4.29.0
 altair
 omegaconf
 # 0.3.2 doesn't have binaries for arm64
@@ -35,6 +38,7 @@ safetensors==0.3.1
 py-cpuinfo
 pydantic==2.4.1 # pin until pyinstaller-hooks-contrib works with beta versions
 mpmath==1.3.0
+optimum

 # Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
 pefile
--- a/setup_venv.ps1
+++ b/setup_venv.ps1
@@ -88,8 +88,8 @@ else {python -m venv .\shark.venv\}
 .\shark.venv\Scripts\activate
 python -m pip install --upgrade pip
 pip install wheel
-pip install -r requirements.txt
-# remove this when windows DLL issues are fixed from LLVM changes
-pip install --force-reinstall https://github.com/openxla/iree/releases/download/candidate-20240326.843/iree_compiler-20240326.843-cp311-cp311-win_amd64.whl https://github.com/openxla/iree/releases/download/candidate-20240326.843/iree_runtime-20240326.843-cp311-cp311-win_amd64.whl
+pip install --pre -r requirements.txt
+pip install --force-reinstall https://github.com/nod-ai/SRT/releases/download/candidate-20240528.279/iree_compiler-20240528.279-cp311-cp311-win_amd64.whl https://github.com/nod-ai/SRT/releases/download/candidate-20240528.279/iree_runtime-20240528.279-cp311-cp311-win_amd64.whl 
+pip install -e .

 Write-Host "Source your venv with ./shark.venv/Scripts/activate"
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -84,21 +84,7 @@ else
  PYTORCH_URL=https://download.pytorch.org/whl/nightly/cpu/
 fi

-$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f ${PYTORCH_URL}
-
-if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
-  T_VER=$($PYTHON -m pip show torch | grep Version)
-  T_VER_MIN=${T_VER:14:12}
-  TV_VER=$($PYTHON -m pip show torchvision | grep Version)
-  TV_VER_MAJ=${TV_VER:9:6}
-  $PYTHON -m pip uninstall -y torchvision
-  $PYTHON -m pip install torchvision==${TV_VER_MAJ}${T_VER_MIN} --no-deps -f https://download.pytorch.org/whl/nightly/cpu/torchvision/
-  if [ $? -eq 0 ];then
-    echo "Successfully Installed torch + cu118."
-  else
-    echo "Could not install torch + cu118." >&2
-  fi
-fi
+$PYTHON -m pip install --no-warn-conflicts -e . -f ${RUNTIME} -f ${PYTORCH_URL}

 if [[ -z "${NO_BREVITAS}" ]]; then
  $PYTHON -m pip install git+https://github.com/Xilinx/brevitas.git@dev
--- a/shark/iree_utils/_common.py
+++ b/shark/iree_utils/_common.py
@@ -76,6 +76,7 @@ _IREE_DEVICE_MAP = {
    "vulkan": "vulkan",
    "metal": "metal",
    "rocm": "rocm",
+    "hip": "hip",
    "intel-gpu": "level_zero",
 }

@@ -94,6 +95,7 @@ _IREE_TARGET_MAP = {
    "vulkan": "vulkan-spirv",
    "metal": "metal",
    "rocm": "rocm",
+    "hip": "rocm",
    "intel-gpu": "opencl-spirv",
 }

--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -62,13 +62,16 @@ def get_iree_device_args(device, extra_args=[]):
        from shark.iree_utils.gpu_utils import get_iree_rocm_args

        return get_iree_rocm_args(device_num=device_num, extra_args=extra_args)
+    if device == "hip":
+        from shark.iree_utils.gpu_utils import get_iree_rocm_args
+        return get_iree_rocm_args(device_num=device_num, extra_args=extra_args, hip_driver=True)
    return []

 def get_iree_target_triple(device):
    args = get_iree_device_args(device)
    for flag in args:
-        if "triple" in flag.split("-"):
-            triple = flag.split("=")
+        if "triple" in flag:
+            triple = flag.split("=")[-1]
            return triple
    return ""

@@ -89,9 +92,9 @@ def clean_device_info(raw_device):
        if len(device_id) <= 2:
            device_id = int(device_id)

-    if device not in ["rocm", "vulkan"]:
+    if device not in ["hip", "rocm", "vulkan"]:
        device_id = None
-    if device in ["rocm", "vulkan"] and device_id == None:
+    if device in ["hip", "rocm", "vulkan"] and device_id == None:
        device_id = 0
    return device, device_id

--- a/shark/iree_utils/gpu_utils.py
+++ b/shark/iree_utils/gpu_utils.py
@@ -52,7 +52,7 @@ def check_rocm_device_arch_in_args(extra_args):
    return None


-def get_rocm_device_arch(device_num=0, extra_args=[]):
+def get_rocm_device_arch(device_num=0, extra_args=[], hip_driver=False):
    # ROCM Device Arch selection:
    # 1 : User given device arch using `--iree-rocm-target-chip` flag
    # 2 : Device arch from `iree-run-module --dump_devices=rocm` for device on index <device_num>
@@ -68,15 +68,23 @@ def get_rocm_device_arch(device_num=0, extra_args=[]):
    arch_in_device_dump = None

    # get rocm arch from iree dump devices
-    def get_devices_info_from_dump(dump):
+    def get_devices_info_from_dump(dump, driver):
        from os import linesep
-
-        dump_clean = list(
-            filter(
-                lambda s: "--device=rocm" in s or "gpu-arch-name:" in s,
-                dump.split(linesep),
+        
+        if driver == "hip":
+            dump_clean = list(
+                filter(
+                    lambda s: "AMD" in s,
+                    dump.split(linesep),
+                )
+            )
+        else:
+            dump_clean = list(
+                filter(
+                    lambda s: f"--device={driver}" in s or "gpu-arch-name:" in s,
+                    dump.split(linesep),
+                )
            )
-        )
        arch_pairs = [
            (
                dump_clean[i].split("=")[1].strip(),
@@ -87,16 +95,17 @@ def get_rocm_device_arch(device_num=0, extra_args=[]):
        return arch_pairs

    dump_device_info = None
+    driver = "hip" if hip_driver else "rocm"
    try:
        dump_device_info = run_cmd(
-            "iree-run-module --dump_devices=rocm", raise_err=True
+            "iree-run-module --dump_devices=" + driver, raise_err=True
        )
    except Exception as e:
-        print("could not execute `iree-run-module --dump_devices=rocm`")
+        print("could not execute `iree-run-module --dump_devices=" + driver + "`")

    if dump_device_info is not None:
        device_num = 0 if device_num is None else device_num
-        device_arch_pairs = get_devices_info_from_dump(dump_device_info[0])
+        device_arch_pairs = get_devices_info_from_dump(dump_device_info[0], driver)
        if len(device_arch_pairs) > device_num:  # can find arch in the list
            arch_in_device_dump = device_arch_pairs[device_num][1]

@@ -107,24 +116,22 @@ def get_rocm_device_arch(device_num=0, extra_args=[]):
    default_rocm_arch = "gfx1100"
    print(
        "Did not find ROCm architecture from `--iree-rocm-target-chip` flag"
-        "\n or from `iree-run-module --dump_devices=rocm` command."
+        "\n or from `iree-run-module --dump_devices` command."
        f"\nUsing {default_rocm_arch} as ROCm arch for compilation."
    )
    return default_rocm_arch


 # Get the default gpu args given the architecture.
-def get_iree_rocm_args(device_num=0, extra_args=[]):
+def get_iree_rocm_args(device_num=0, extra_args=[], hip_driver=False):
    ireert.flags.FUNCTION_INPUT_VALIDATION = False
-    rocm_flags = ["--iree-rocm-link-bc=true"]
-
+    rocm_flags = []
    if check_rocm_device_arch_in_args(extra_args) is None:
-        rocm_arch = get_rocm_device_arch(device_num, extra_args)
+        rocm_arch = get_rocm_device_arch(device_num, extra_args, hip_driver=hip_driver)
        rocm_flags.append(f"--iree-rocm-target-chip={rocm_arch}")

    return rocm_flags

-
 # Some constants taken from cuda.h
 CUDA_SUCCESS = 0
 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
--- a/shark/iree_utils/vulkan_utils.py
+++ b/shark/iree_utils/vulkan_utils.py
@@ -202,9 +202,6 @@ def get_iree_vulkan_args(device_num=0, extra_args=[]):
        )
    res_vulkan_flag += [vulkan_triple_flag]

-    if vulkan_triple_flag is not None:
-        vulkan_target_env = get_vulkan_target_env_flag(vulkan_triple_flag)
-        res_vulkan_flag.append(vulkan_target_env)
    return res_vulkan_flag


--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -6,6 +6,7 @@ import tempfile
 import os
 import hashlib

+from apps.shark_studio.modules.shared_cmd_opts import cmd_opts

 def create_hash(file_name):
    with open(file_name, "rb") as f:
@@ -120,7 +121,7 @@ class SharkImporter:
        is_dynamic=False,
        tracing_required=False,
        func_name="forward",
-        save_dir="./shark_tmp/",
+        save_dir=cmd_opts.tmp_dir, #"./shark_tmp/",
        mlir_type="linalg",
    ):
        if self.frontend in ["torch", "pytorch"]:
@@ -806,7 +807,7 @@ def save_mlir(
        model_name + "_" + frontend + "_" + mlir_dialect + ".mlir"
    )
    if dir == "":
-        dir = os.path.join(".", "shark_tmp")
+        dir = cmd_opts.tmp_dir  # formerly os.path.join(".", "shark_tmp")
    mlir_path = os.path.join(dir, model_name_mlir)
    print(f"saving {model_name_mlir} to {dir}")
    if not os.path.exists(dir):
Author	SHA1	Message	Date
saienduri	acb9046a34	Update requirements.txt	2024-06-18 11:42:12 -07:00
Ean Garvey	26f80ccbbb	Fixes to UI config defaults, config loading, and warnings. (#2153 )	2024-05-31 18:14:27 -04:00
Ean Garvey	d2c3752dc7	Fix batch count and tweaks to chatbot. (#2151 ) * Fix batch count * Add button to unload models manually. * Add compiled pipeline option * Add brevitas to requirements * Tweaks to chatbot * Change script loading trigger	2024-05-31 18:48:28 +05:30
Ean Garvey	4505c4549f	Force inlined weights on igpu for now, small fixes to chatbot (#2149 ) * Add igpu and custom triple support. * Small fixes to igpu, SDXL-turbo * custom pipe loading * formatting * Remove old nodlogo import.	2024-05-30 11:40:42 -05:00
Gaurav Shukla	793495c9c6	[ui] Add AMD logo in shark studio Signed-Off-by: Gaurav Shukla <gaurav.shukla@amd.com>	2024-05-30 21:43:15 +05:30
Ean Garvey	13e1d8d98a	Add igpu and custom triple support. (#2148 )	2024-05-29 17:39:36 -05:00
Ean Garvey	2074df40ad	Point to nod fork of diffusers. (#2146 )	2024-05-29 00:56:21 -05:00
Ean Garvey	7b30582408	Point to SRT links for windows. (#2145 )	2024-05-29 01:20:30 -04:00
Ean Garvey	151195ab74	Add a few requirements for ensured parity with turbine-models requirements. (#2142 ) * Add scipy to requirements. Adds diffusers req and a note for torchsde.	2024-05-28 15:37:31 -05:00
Ean Garvey	8146f0bd2f	Remove leftover merge conflict line from setup script. (#2141 )	2024-05-28 11:04:45 -07:00
Ean Garvey	68e9281778	(Studio2) Refactors SD pipeline to rely on turbine-models pipeline, fixes to LLM, gitignore (#2129 ) * Shark Studio SDXL support, HIP driver support, simpler device info, small fixes * Fixups to llm API/UI and ignore user config files. * Small fixes for unifying pipelines. * Update requirements.txt for iree-turbine (#2130) * Fix Llama2 on CPU (#2133) * Filesystem cleanup and custom model fixes (#2127) * Fix some formatting issues * Remove IREE pin (fixes exe issue) (#2126) * Update find links for IREE packages (#2136) * Shark Studio SDXL support, HIP driver support, simpler device info, small fixes * Abstract out SD pipelines from Studio Webui (WIP) * Switch from pin to minimum torch version and fix index url * Fix device parsing. * Fix linux setup * Fix custom weights. --------- Co-authored-by: saienduri <77521230+saienduri@users.noreply.github.com> Co-authored-by: gpetters-amd <159576198+gpetters-amd@users.noreply.github.com> Co-authored-by: gpetters94 <gpetters@protonmail.com>	2024-05-28 13:18:31 -04:00
Ean Garvey	fd07cae991	Update find links for IREE packages (#2136 )	2024-05-13 11:43:17 -05:00
gpetters94	6cb86a843e	Remove IREE pin (fixes exe issue) (#2126 ) * Diagnose a build issue * Remove IREE pin * Revert the build on pull request change	2024-04-30 12:27:30 -05:00
gpetters-amd	7db1612a5c	Filesystem cleanup and custom model fixes (#2127 ) * Initial filesystem cleanup * More filesystem cleanup * Fix some formatting issues * Address comments	2024-04-30 11:18:33 -05:00
gpetters-amd	81d6e059ac	Fix Llama2 on CPU (#2133 )	2024-04-29 12:18:16 -05:00
saienduri	e003d0abe8	Update requirements.txt for iree-turbine (#2130 ) * Update requirements.txt to iree-turbine creation * Update requirements.txt * Update requirements.txt * Update requirements.txt	2024-04-29 12:28:14 -04:00
Quinn Dawkins	cf2513e7b1	Update IREE discord link (#2118 ) Discord links for IREE were purged, so update the link on the readme.	2024-04-15 12:54:27 -07:00
Ean Garvey	60d8591e95	Change shark-turbine requirement target branch to main. (#2116 )	2024-04-11 19:31:39 -04:00
gpetters-amd	ff91982168	Remove target env (#2114 )	2024-04-08 16:52:45 -05:00
powderluv	a6a9e524c1	Drop linux nightly for now	2024-04-05 12:04:36 -07:00
powderluv	732df2e263	Updated signtool key	2024-04-05 12:01:42 -07:00
gpetters-amd	1ee16bd256	Fix the nightly build (#2111 )	2024-04-05 19:22:33 +05:30
gpetters-amd	752d775fbd	Fix a typo in the nightly build script (#2110 )	2024-03-30 17:31:51 -07:00
gpetters-amd	4d1a6a204d	Fix builder issue (#2109 )	2024-03-30 16:21:55 -07:00
Ean Garvey	0eff62a468	(Studio 2.0) add Stable Diffusion features (#2037 ) * (WIP): Studio2 app infra and SD API UI/app structure and utility implementation. - Initializers for webui/API launch - Schedulers file for SD scheduling utilities - Additions to API-level utilities - Added embeddings module for LoRA, Lycoris, yada yada - Added image_processing module for resamplers, resize tools, transforms, and any image annotation (PNG metadata) - shared_cmd_opts module -- sorry, this is stable_args.py. It lives on. We still want to have some global control over the app exclusively from the command-line. At least we will be free from shark_args. - Moving around some utility pieces. - Try to make api+webui concurrency possible in index.py - SD UI -- this is just img2imgUI but hopefully a little better. - UI utilities for your nod logos and your gradio temps. Enable UI / bugfixes / tweaks * Studio2/SD: Use more correct LoRA alpha calculation (#2034) * Updates ProcessLoRA to use both embedded LoRA alpha, and lora_strength optional parameter (default 1.0) when applying LoRA weights. * Updates ProcessLoRA to cover more dim cases. * This bring ProcessLoRA into line with PR #2015 against Studio1 * Studio2: Remove duplications from api/utils.py (#2035) * Remove duplicate os import * Remove duplicate parse_seed_input function Migrating to JSON requests in SD UI More UI and app flow improvements, logging, shared device cache Model loading Complete SD pipeline. Tweaks to VAE, pipeline states Pipeline tweaks, add cmd_opts parsing to sd api * Add test for SD * Small cleanup * Shark2/SD/UI: Respect ckpt_dir, share and server_port args (#2070) * Takes whether to generate a gradio live link from the existing --share command line parameter, rather than hardcoding as True. * Takes server port from existing --server_port command line parameter, rather than hardcoding as 11911. 
* Default --ckpt_dir parameter to '../models' * Use --ckpt_dir rather than hardcoding ../models as the base directory for checkpoints, vae, and lora, etc * Add a 'checkpoints' directory below --ckpt_dir to match ComfyUI folder structure. Read custom_weights choices from there, and/or subfolders below there matching the selected base model. * Fix --ckpt_dir possibly not working correctly when an absolute rather than relative path is specified. * Relabel "Custom Weights" to "Custom Weights Checkpoint" in the UI * Add StreamingLLM support to studio2 chat (#2060) * Streaming LLM * Update precision and add gpu support * (studio2) Separate weights generation for quantization support * Adapt prompt changes to studio flow * Remove outdated flag from llm compile flags. * (studio2) use turbine vmfbRunner * tweaks to prompts * Update CPU path and llm api test. * Change device in test to cpu. * Fixes to runner, device names, vmfb mgmt * Use small test without external weights. * HF-Reference LLM mode + Update test result to match latest Turbine. (#2080) * HF-Reference LLM mode. * Fixup test to match current output from Turbine. * lint * Fix test error message + Only initialize HF torch model when used. * Remove redundant format_out change. * Add rest API endpoint from LanguageModel API * Add StreamingLLM support to studio2 chat (#2060) * Streaming LLM * Update precision and add gpu support * (studio2) Separate weights generation for quantization support * Adapt prompt changes to studio flow * Remove outdated flag from llm compile flags. * (studio2) use turbine vmfbRunner * tweaks to prompts * Update CPU path and llm api test. * Change device in test to cpu. * Fixes to runner, device names, vmfb mgmt * Use small test without external weights. * Formatting and init files. * Remove unused import. * Small fixes * Studio2/SD/UI: Improve various parts of the UI for Stable Diffusion (#2074) * Studio2/SD/UI: Improve various parts of the UI of Shark 2 * Update Gradio pin to 4.15.0. 
* Port workarounds for Gradio >4.8.0 main container sizing from Shark 1.0. * Move nod Logo out of the SD tab and onto the top right of the main tab bar. * Set nod logo icon as the favicon (as current Shark 1.0). * Create a tabbed right hand panel within the SD UI sized to the viewport height. * Make Input Image tab 1 in the right hand panel. * Make output images, generation log, and generation buttons, tab 2 in the right hand panel * Make config JSON display, with config load, save and clear, tab 3 in the right hand panel * Make gallery area of the Output tab take up all vertical space the other controls on the tab do not. * Tidy up the controls on the Config tab somewhat. * Studio2/SD/UI: Reorganise inputs on Left Panel of SD tab * Rename previously added Right Panel Output tab to 'Generate'. * Move Batch Count, Batch Size, and Repeatable Seeds, off of Left Panel and onto 'Generate' Tab. * On 'Generate' tab, rename 'Generate Image(s)' button to 'Start', and 'Stop Batch' button to 'Stop'. They are now below the Batch inputs on a Generate tab so don't need the specificity. * Move Device, Low VRAM, and Precision inputs into their own 'Device Settings' Accordion control. (starts closed) * Rename 'Custom Weights Checkpoint' to 'Checkpoint Weights' * Move Checkpoint Weights, VAE Model, Standalone Lora Weights, and Embeddings Options controls, into their own 'Model Weights' Accordion control. (starts closed) * Move Denoising Strength, and Resample Type controls into their own 'Input Image Processing' Accordion. (starts closed) * Move any remaining controls in the 'Advanced Options' Accorion directly onto the left panel, and remove then Accordion. * Enable the copy button for all text boxes on the SD tab. * Add emoji/unicode glphs to all top level controls and Accordions on the SD Left Panel. * Start with the 'Generate' as the initially selected tab in the SD Right Panel, working around Gradio issue #7805 * Tweaks to SD Right Tab Panel vertical height. 
* Studio2/SD/UI: Sizing tweaks for Right Panel, and >1920 width * Set height of right panel using vmin rather than vh, with explicit affordances for fixed areas above and below. * Port >1920 width Gradio >4.8 CSS workaround from Shark 1.0. * Studio2/SD: Fix sd pipeline up to "Windows not supported" (#2082) * Studio2/SD: Fix sd pipeline up to "Windows not supported" A number of fixes to the SD pipeline as run from the UI, up until the point that dynamo complains "Windows not yet supported for torch.compile". * Remove separate install of iree-runtime and iree-compile in setup_venv.ps1, and rely on the versions installed via the Turbine requirements.txt. Fixes #2063 for me. * Replace any "None" strings with python None when pulling the config in the UI. * Add 'hf_auth_token' param to api StableDiffusion class, defaulting to None, and then pass that in to the various Models where it is required and wasn't already being done before. * Fix clip custom_weight_params being passed to export_clip_model as "external_weight_file" rather than "external_weights" * Don't pass non-existing "custom_vae" parameter to the Turbine Vae Model, instead pass custom_vae as the "hf_model_id" if it is set. (this may be wrong in the custom vae cast, but stops the code always breaking). * Studio2/SD/UI: Improve UI config None handling * When populating the UI from a JSON Config set controls to "None" for null/None values. * When generating a JSON Config from the UI set props to null/None for controls set to "None". * Use null rather string 'None' in the default config --------- Co-authored-by: Ean Garvey <87458719+monorimet@users.noreply.github.com> * Studio2/SD/UI: Further sd ui pipeline fixes (#2091) On Windows, this gets us all the way failing in iree compile of the with SD 2.1 base. - Fix merge errors with sd right pane config UI tab. - Remove non-requirement.txt install/build of torch/mlir/iree/SRT in setup_venv.ps1, fixing "torch.compile not supported on Windows" error. 
- Fix gradio deprecation warning for `root=` FileExplorer kwarg. - Comment out `precision` and `max_length` kwargs being passed to unet, as not yet supported on main Turbine branch. Avoids keyword argument error. * Tweak compile-time flags for SD submodels. * Small fixes to sd, pin mpmath * Add pyinstaller spec and imports script. * Fix the .exe (#2101) * Fix _IREE_TARGET_MAP (#2103) (#2108) - Change target passed to iree for vulkan from 'vulkan' to 'vulkan-spriv', as 'vulkan' is not a valid value for --iree-hal-target-backends with the current iree compiler. Co-authored-by: Stefan Kapusniak <121311569+one-lithe-rune@users.noreply.github.com> * Cleanup sd model map. * Update dependencies. * Studio2/SD/UI: Update gradio to 4.19.2 (sd-studio2) (#2097) - Move pin for gradio from 4.15 -> 4.19.2 on the sd-studio2 branch * fix formatting and disable explicit vulkan env settings. --------- Co-authored-by: Stefan Kapusniak <121311569+one-lithe-rune@users.noreply.github.com> Co-authored-by: Stanley Winata <68087699+raikonenfnu@users.noreply.github.com> Co-authored-by: gpetters-amd <159576198+gpetters-amd@users.noreply.github.com> Co-authored-by: gpetters94 <gpetters@protonmail.com>	2024-03-29 18:13:21 -04:00