XFAIL some macos tests

Add distilgpt2 to stablehlo in shark tank (#1481 )
Change instances of input_type='mhlo' to 'auto' (#1482 )
2026-04-20 03:00:34 -04:00 · 2023-06-04 15:27:03 -07:00 · 2023-06-02 16:44:46 -05:00 · 2023-06-02 16:43:47 -05:00 · 2023-06-01 22:26:55 -07:00 · 2023-06-01 22:25:20 -07:00
82 changed files with 7138 additions and 1351 deletions
--- a/.github/workflows/test-models.yml
+++ b/.github/workflows/test-models.yml
@@ -137,7 +137,7 @@ jobs:
        export DYLD_LIBRARY_PATH=/usr/local/lib/
        echo $PATH
        pip list | grep -E "torch|iree"
-        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" --tank_url="gs://shark_tank/nightly/" -k vulkan --update_tank
+        pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" --tank_url="gs://shark_tank/nightly/" -k vulkan

    - name: Validate Vulkan Models (a100)
      if: matrix.suite == 'vulkan' && matrix.os == 'a100'
--- a/apps/language_models/scripts/stablelm.py
+++ b/apps/language_models/scripts/stablelm.py
@@ -0,0 +1,210 @@
+import torch
+import torch_mlir
+from transformers import (
+    AutoTokenizer,
+    StoppingCriteria,
+)
+from io import BytesIO
+from pathlib import Path
+from apps.language_models.utils import (
+    get_torch_mlir_module_bytecode,
+    get_vmfb_from_path,
+)
+
+
+class StopOnTokens(StoppingCriteria):
+    """Stopping criterion that fires when the latest token is a stop id."""
+
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
+    ) -> bool:
+        # Only the last entry of the first row of ``input_ids`` is inspected.
+        latest = input_ids[0][-1]
+        return any(latest == stop for stop in (50278, 50279, 50277, 1, 0))
+
+
+def shouldStop(tokens):
+    """Return True when the last entry of ``tokens[0]`` equals a stop id."""
+    tail = tokens[0][-1]
+    return any(tail == candidate for candidate in (50278, 50279, 50277, 1, 0))
+
+
+MAX_SEQUENCE_LENGTH = 256
+
+
+def user(message, history):
+    """Return an empty string and ``history`` plus a new ``[message, ""]`` turn."""
+    extended_history = history + [[message, ""]]
+    return "", extended_history
+
+
+def compile_stableLM(
+    model,
+    model_inputs,
+    model_name,
+    model_vmfb_name,
+    device="cuda",
+    precision="fp32",
+):
+    """Load a cached StableLM vmfb, or build one from MLIR and save it.
+
+    Args:
+        model: module passed to ``get_torch_mlir_module_bytecode`` when no
+            ``{model_name}.mlir`` file exists yet.
+        model_inputs: example inputs used for the torch-mlir import.
+        model_name: stem of the ``.mlir`` file and of the default vmfb name.
+        model_vmfb_name: explicit vmfb path, or None to use
+            ``{model_name}_{device}.vmfb``.
+        device: device string handed to SHARK.
+        precision: currently unused (see the TODO below).
+
+    Returns:
+        The module returned by ``get_vmfb_from_path`` when that is not None,
+        otherwise a newly compiled ``SharkInference`` instance.
+    """
+    from shark.shark_inference import SharkInference
+
+    # TODO: vmfb and mlir name should include precision and device
+    vmfb_path = (
+        Path(model_name + f"_{device}.vmfb")
+        if model_vmfb_name is None
+        else Path(model_vmfb_name)
+    )
+    shark_module = get_vmfb_from_path(
+        vmfb_path, device, mlir_dialect="tm_tensor"
+    )
+    if shark_module is not None:
+        return shark_module
+
+    mlir_path = Path(model_name + ".mlir")
+    print(
+        f"[DEBUG] mlir path {mlir_path} {'exists' if mlir_path.exists() else 'does not exist'}"
+    )
+    if mlir_path.exists():
+        with open(mlir_path, "rb") as f:
+            bytecode = f.read()
+    else:
+        ts_graph = get_torch_mlir_module_bytecode(model, model_inputs)
+        module = torch_mlir.compile(
+            ts_graph,
+            [*model_inputs],
+            torch_mlir.OutputType.LINALG_ON_TENSORS,
+            use_tracing=False,
+            verbose=False,
+        )
+        bytecode_stream = BytesIO()
+        module.operation.write_bytecode(bytecode_stream)
+        bytecode = bytecode_stream.getvalue()
+        # Persist only freshly generated MLIR; when the file already existed
+        # its bytes were just read back, so rewriting it would be wasted I/O.
+        with open(mlir_path, "wb") as f_:
+            f_.write(bytecode)
+        print("Saved mlir")
+
+    shark_module = SharkInference(
+        mlir_module=bytecode, device=device, mlir_dialect="tm_tensor"
+    )
+    shark_module.compile()
+
+    path = shark_module.save_module(
+        vmfb_path.parent.absolute(), vmfb_path.stem
+    )
+    print("Saved vmfb at ", str(path))
+
+    return shark_module
+
+
+class StableLMModel(torch.nn.Module):
+    """Wrapper whose ``forward`` returns only the wrapped model's logits."""
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(self, input_ids, attention_mask):
+        # Both tensors are forwarded to the wrapped model as keyword arguments.
+        result = self.model(
+            input_ids=input_ids, attention_mask=attention_mask
+        )
+        return result.logits
+
+
+# Default system prompt text for the StableLM Tuned (Alpha) chat model
+system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
+- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
+- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
+- StableLM will refuse to participate in anything that could harm a human.
+"""
+
+
+def get_tokenizer():
+    """Load the StableLM tokenizer and register ``<PAD>`` as its pad token."""
+    tokenizer = AutoTokenizer.from_pretrained(
+        "stabilityai/stablelm-tuned-alpha-3b"
+    )
+    tokenizer.add_special_tokens({"pad_token": "<PAD>"})
+    print("Sucessfully loaded the tokenizer to the memory")
+    return tokenizer
+
+
+# sharkStableLM = compile_stableLM
+# (
+#   None,
+#   tuple([input_ids, attention_mask]),
+#   "stableLM_linalg_f32_seqLen256",
+#   "/home/shark/vivek/stableLM_shark_f32_seqLen256"
+# )
+def generate(
+    new_text,
+    max_new_tokens,
+    sharkStableLM,
+    tokenizer=None,
+):
+    """Extend ``new_text`` by calling ``generate_new_token`` repeatedly.
+
+    Args:
+        new_text: text fed to the model; each decoded piece is appended to it.
+        max_new_tokens: upper bound on the number of generation steps.
+        sharkStableLM: compiled module handed to ``generate_new_token``.
+        tokenizer: tokenizer to use; ``get_tokenizer()`` is called when None.
+
+    Returns:
+        The list of decoded pieces, in generation order.
+    """
+    if tokenizer is None:
+        tokenizer = get_tokenizer()
+    pieces = []
+    for _ in range(max_new_tokens):
+        step = generate_new_token(
+            sharkStableLM, tokenizer, {"new_text": new_text}
+        )
+        if step["stop_generation"]:
+            break
+        piece = step["detok"]
+        print(piece, end="", flush=True)
+        pieces.append(piece)
+        # An empty decode is still recorded above, then ends the loop.
+        if piece == "":
+            break
+        new_text = new_text + piece
+    return pieces
+
+
+def generate_new_token(shark_model, tokenizer, params):
+    """Run one forward pass and pick the top-1 token for the current text.
+
+    Args:
+        shark_model: callable invoked as ``shark_model("forward", [ids, mask])``.
+        tokenizer: tokenizer used to encode the text and decode the new token.
+        params: dict holding the current text under ``"new_text"``.
+
+    Returns:
+        Dict with keys ``"new_token"``, ``"detok"`` and ``"stop_generation"``.
+    """
+    encoded = tokenizer(
+        [params["new_text"]],
+        padding="max_length",
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        return_tensors="pt",
+    )
+    # Sum of the attention mask; used below to index the row of the new token.
+    mask_total = int(torch.sum(encoded.attention_mask))
+    logits = torch.from_numpy(
+        shark_model("forward", [encoded.input_ids, encoded.attention_mask])
+    )
+    top1 = torch.topk(logits, 1)
+    # NOTE(review): shouldStop looks at the last row of top1.indices[0], while
+    # the token returned below comes from row mask_total - 1 -- confirm that
+    # checking a different row is intended.
+    stop_generation = shouldStop(top1.indices)
+    new_token = top1.indices[0][mask_total - 1]
+    detok = tokenizer.decode(
+        new_token,
+        skip_special_tokens=True,
+    )
+    return {
+        "new_token": new_token,
+        "detok": detok,
+        "stop_generation": stop_generation,
+    }
--- a/apps/language_models/scripts/vicuna.py
+++ b/apps/language_models/scripts/vicuna.py
--- a/apps/language_models/scripts/vicuna_web.py
+++ b/apps/language_models/scripts/vicuna_web.py
@@ -0,0 +1,777 @@
+import sys
+import warnings
+import gradio as gr
+import time
+
+warnings.filterwarnings("ignore")
+# Raw strings: the backslashes in these Windows paths are literal. The
+# non-raw form only worked because "\S", "\I", "\p" and "\i" are invalid
+# escape sequences, which newer Python versions warn about.
+# NOTE(review): these are hard-coded paths to a local IREE build.
+sys.path.insert(0, r"D:\S\SB\I\python_packages\iree_compiler")
+sys.path.insert(0, r"D:\S\SB\I\python_packages\iree_runtime")
+import torch
+import torch_mlir
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+from typing import List
+from io import BytesIO
+from pathlib import Path
+from shark.shark_downloader import download_public_file
+from shark.shark_importer import transform_fx as transform_fx_
+import re
+from shark.shark_inference import SharkInference
+from tqdm import tqdm
+from torch_mlir import TensorPlaceholder
+from apps.stable_diffusion.web.ui.utils import available_devices
+
+
+class FirstVicunaLayer(torch.nn.Module):
+    """Calls the wrapped layer without a past cache and flattens its outputs."""
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(self, hidden_states, attention_mask, position_ids):
+        layer_out = self.model(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            use_cache=True,
+        )
+        # First element: next hidden states. Last element: a pair whose two
+        # entries are returned as separate values.
+        cache_pair = layer_out[-1]
+        return (layer_out[0], cache_pair[0], cache_pair[1])
+
+
+class SecondVicunaLayer(torch.nn.Module):
+    """Calls the wrapped layer with a past cache pair and flattens its outputs."""
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        past_key_value0,
+        past_key_value1,
+    ):
+        # The two cache tensors arrive separately and are re-packed as a tuple.
+        layer_out = self.model(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=(past_key_value0, past_key_value1),
+            use_cache=True,
+        )
+        cache_pair = layer_out[-1]
+        return (layer_out[0], cache_pair[0], cache_pair[1])
+
+
+class CompiledFirstVicunaLayer(torch.nn.Module):
+    """Adapter that runs a compiled module behind a layer-style ``forward``."""
+
+    def __init__(self, shark_module):
+        super().__init__()
+        self.model = shark_module
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        past_key_value=None,
+        output_attentions=False,
+        use_cache=True,
+    ):
+        # past_key_value, output_attentions and use_cache are accepted but
+        # not read by this method.
+        detached_args = tuple(
+            t.detach() for t in (hidden_states, attention_mask, position_ids)
+        )
+        raw = self.model("forward", detached_args)
+
+        # Convert the first three results to torch tensors, in order.
+        next_hidden, cache0, cache1 = (torch.tensor(raw[i]) for i in range(3))
+
+        return (next_hidden, (cache0, cache1))
+
+
+class CompiledSecondVicunaLayer(torch.nn.Module):
+    """Adapter that runs a compiled module taking a past cache pair."""
+
+    def __init__(self, shark_module):
+        super().__init__()
+        self.model = shark_module
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        past_key_value,
+        output_attentions=False,
+        use_cache=True,
+    ):
+        # output_attentions and use_cache are accepted but not read here.
+        detached_args = tuple(
+            t.detach()
+            for t in (
+                hidden_states,
+                attention_mask,
+                position_ids,
+                past_key_value[0],
+                past_key_value[1],
+            )
+        )
+        raw = self.model("forward", detached_args)
+
+        # Convert the first three results to torch tensors, in order.
+        next_hidden, cache0, cache1 = (torch.tensor(raw[i]) for i in range(3))
+
+        return (next_hidden, (cache0, cache1))
+
+
+class ShardedVicunaModel(torch.nn.Module):
+    """Swaps the wrapped model's layer list between two sets per call."""
+
+    def __init__(self, model, layers0, layers1):
+        super().__init__()
+        self.model = model
+        assert len(layers0) == len(model.model.layers)
+        config = self.model.model.config
+        config.use_cache = True
+        config.output_attentions = False
+        self.layers0 = layers0
+        self.layers1 = layers1
+
+    def forward(
+        self,
+        input_ids,
+        is_first=True,
+        past_key_values=None,
+        attention_mask=None,
+    ):
+        # Install layers0 for the first call and layers1 for later calls.
+        active_layers = self.layers0 if is_first else self.layers1
+        self.model.model.layers = torch.nn.modules.container.ModuleList(
+            active_layers
+        )
+        if is_first:
+            return self.model.forward(input_ids, attention_mask=attention_mask)
+        return self.model.forward(
+            input_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+        )
+
+
+def write_in_dynamic_inputs0(module, dynamic_input_size):
+    """Rewrite MLIR text so the sample size becomes a dynamic ``?`` dimension.
+
+    Args:
+        module: MLIR module as text.
+        dynamic_input_size: integer size to replace; each ``{size}x`` becomes
+            ``?x`` and the related ``tensor.empty`` / ``arith.cmpi`` lines are
+            patched to refer to ``%dim``.
+
+    Returns:
+        The rewritten module text, with lines joined by newlines.
+    """
+    new_lines = []
+    for line in module.splitlines():
+        line = re.sub(f"{dynamic_input_size}x", "?x", line)
+        if "?x" in line:
+            # Raw strings keep "\(" as a regex escape rather than an invalid
+            # string escape; the pattern text itself is unchanged.
+            line = re.sub(r"tensor.empty\(\)", "tensor.empty(%dim)", line)
+        line = re.sub(f" {dynamic_input_size},", " %dim,", line)
+        if "tensor.empty" in line and "?x?" in line:
+            line = re.sub(
+                r"tensor.empty\(%dim\)", "tensor.empty(%dim, %dim)", line
+            )
+        if "arith.cmpi" in line:
+            line = re.sub(f"c{dynamic_input_size}", "dim", line)
+        new_lines.append(line)
+    return "\n".join(new_lines)
+
+
+def write_in_dynamic_inputs1(module, dynamic_input_size):
+    """Rewrite MLIR text so the sample size is read from ``%arg1`` at runtime.
+
+    Lines containing ``dim_42 =`` are dropped. The line defining
+    ``%c{size}_i64`` is replaced by a ``tensor.dim`` on ``%arg1`` plus an
+    index cast. Remaining ``{size}x`` occurrences become ``?x``, with the
+    related ``tensor.empty`` / ``arith.cmpi`` lines patched to ``%dim_42``.
+
+    Args:
+        module: MLIR module as text.
+        dynamic_input_size: integer size to replace.
+
+    Returns:
+        The rewritten module text, with lines joined by newlines.
+    """
+    new_lines = []
+    for line in module.splitlines():
+        if "dim_42 =" in line:
+            continue
+        if f"%c{dynamic_input_size}_i64 =" in line:
+            new_lines.append(
+                "%dim_42 = tensor.dim %arg1, %c3 : tensor<1x1x1x?xf32>"
+            )
+            new_lines.append(
+                "%dim_42_i64 = arith.index_cast %dim_42 : index to i64"
+            )
+            continue
+        line = re.sub(f"{dynamic_input_size}x", "?x", line)
+        if "?x" in line:
+            # Raw strings keep "\(" as a regex escape rather than an invalid
+            # string escape; the pattern text itself is unchanged.
+            line = re.sub(r"tensor.empty\(\)", "tensor.empty(%dim_42)", line)
+        line = re.sub(f" {dynamic_input_size},", " %dim_42,", line)
+        if "tensor.empty" in line and "?x?" in line:
+            line = re.sub(
+                r"tensor.empty\(%dim_42\)",
+                "tensor.empty(%dim_42, %dim_42)",
+                line,
+            )
+        if "arith.cmpi" in line:
+            line = re.sub(f"c{dynamic_input_size}", "dim_42", line)
+        new_lines.append(line)
+    return "\n".join(new_lines)
+
+
+def compile_vicuna_layer(
+    vicuna_layer,
+    hidden_states,
+    attention_mask,
+    position_ids,
+    past_key_value0=None,
+    past_key_value1=None,
+):
+    """Trace ``vicuna_layer`` with make_fx and return it as a TorchScript module.
+
+    The layer is traced with three inputs when both ``past_key_value0`` and
+    ``past_key_value1`` are None, and with all five inputs otherwise. The
+    traced graph is then post-processed (zero-fill after ``aten.empty``,
+    None outputs removed, single-element tuple unwrapped, overloads
+    stripped) and passed to ``torch.jit.script``.
+
+    Returns:
+        The result of ``torch.jit.script`` on the processed graph module.
+    """
+    # NOTE(review): the three placeholders below are not referenced again in
+    # this function.
+    hidden_states_placeholder = TensorPlaceholder.like(
+        hidden_states, dynamic_axes=[1]
+    )
+    attention_mask_placeholder = TensorPlaceholder.like(
+        attention_mask, dynamic_axes=[2, 3]
+    )
+    position_ids_placeholder = TensorPlaceholder.like(
+        position_ids, dynamic_axes=[1]
+    )
+
+    # Both branches use the same decomposition list; only the traced
+    # argument list differs.
+    if past_key_value0 is None and past_key_value1 is None:
+        fx_g = make_fx(
+            vicuna_layer,
+            decomposition_table=get_decompositions(
+                [
+                    torch.ops.aten.embedding_dense_backward,
+                    torch.ops.aten.native_layer_norm_backward,
+                    torch.ops.aten.slice_backward,
+                    torch.ops.aten.select_backward,
+                    torch.ops.aten.norm.ScalarOpt_dim,
+                    torch.ops.aten.native_group_norm,
+                    torch.ops.aten.upsample_bilinear2d.vec,
+                    torch.ops.aten.split.Tensor,
+                    torch.ops.aten.split_with_sizes,
+                ]
+            ),
+        )(hidden_states, attention_mask, position_ids)
+
+    else:
+        fx_g = make_fx(
+            vicuna_layer,
+            decomposition_table=get_decompositions(
+                [
+                    torch.ops.aten.embedding_dense_backward,
+                    torch.ops.aten.native_layer_norm_backward,
+                    torch.ops.aten.slice_backward,
+                    torch.ops.aten.select_backward,
+                    torch.ops.aten.norm.ScalarOpt_dim,
+                    torch.ops.aten.native_group_norm,
+                    torch.ops.aten.upsample_bilinear2d.vec,
+                    torch.ops.aten.split.Tensor,
+                    torch.ops.aten.split_with_sizes,
+                ]
+            ),
+        )(
+            hidden_states,
+            attention_mask,
+            position_ids,
+            past_key_value0,
+            past_key_value1,
+        )
+
+    def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
+        # Drops None entries from the output node's list/tuple argument and
+        # returns the removed positions in ascending order.
+        removed_indexes = []
+        for node in fx_g.graph.nodes:
+            if node.op == "output":
+                assert (
+                    len(node.args) == 1
+                ), "Output node must have a single argument"
+                node_arg = node.args[0]
+                if isinstance(node_arg, (list, tuple)):
+                    node_arg = list(node_arg)
+                    node_args_len = len(node_arg)
+                    # Walk from the last index down so pop() does not shift
+                    # positions that are still to be visited.
+                    for i in range(node_args_len):
+                        curr_index = node_args_len - (i + 1)
+                        if node_arg[curr_index] is None:
+                            removed_indexes.append(curr_index)
+                            node_arg.pop(curr_index)
+                    node.args = (tuple(node_arg),)
+                    break
+
+        if len(removed_indexes) > 0:
+            fx_g.graph.lint()
+            fx_g.graph.eliminate_dead_code()
+            fx_g.recompile()
+        removed_indexes.sort()
+        return removed_indexes
+
+    def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
+        """
+        Replace tuple with tuple element in functions that return one-element tuples.
+        Returns true if an unwrapping took place, and false otherwise.
+        """
+        unwrapped_tuple = False
+        for node in fx_g.graph.nodes:
+            if node.op == "output":
+                assert (
+                    len(node.args) == 1
+                ), "Output node must have a single argument"
+                node_arg = node.args[0]
+                if isinstance(node_arg, tuple):
+                    if len(node_arg) == 1:
+                        node.args = (node_arg[0],)
+                        unwrapped_tuple = True
+                        break
+
+        if unwrapped_tuple:
+            fx_g.graph.lint()
+            fx_g.recompile()
+        return unwrapped_tuple
+
+    def transform_fx(fx_g):
+        # Inserts an aten.zero_ call right after every matching aten.empty
+        # node and routes the users of the empty node to it.
+        for node in fx_g.graph.nodes:
+            if node.op == "call_function":
+                if node.target in [
+                    torch.ops.aten.empty,
+                ]:
+                    # aten.empty should be filled with zeros.
+                    if node.target in [torch.ops.aten.empty]:
+                        with fx_g.graph.inserting_after(node):
+                            new_node = fx_g.graph.call_function(
+                                torch.ops.aten.zero_,
+                                args=(node,),
+                            )
+                            node.append(new_node)
+                            node.replace_all_uses_with(new_node)
+                            # Presumably re-set because replace_all_uses_with
+                            # also rewrote new_node's own reference to node
+                            # -- TODO confirm.
+                            new_node.args = (node,)
+
+        fx_g.graph.lint()
+
+    transform_fx(fx_g)
+    fx_g.recompile()
+    # NOTE(review): both return values below are assigned but not used later.
+    removed_none_indexes = _remove_nones(fx_g)
+    was_unwrapped = _unwrap_single_tuple_return(fx_g)
+
+    fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
+    fx_g.recompile()
+
+    print("FX_G recompile")
+
+    def strip_overloads(gm):
+        """
+        Modifies the target of graph nodes in :attr:`gm` to strip overloads.
+        Args:
+            gm(fx.GraphModule): The input Fx graph module to be modified
+        """
+        for node in gm.graph.nodes:
+            if isinstance(node.target, torch._ops.OpOverload):
+                node.target = node.target.overloadpacket
+        gm.recompile()
+
+    strip_overloads(fx_g)
+    ts_g = torch.jit.script(fx_g)
+    return ts_g
+
+
+# Model id passed to both from_pretrained calls below. They run at import
+# time, so importing this module loads the model and the tokenizer.
+path = "TheBloke/vicuna-7B-1.1-HF"
+kwargs = {"torch_dtype": torch.float}
+vicuna_model = AutoModelForCausalLM.from_pretrained(path, **kwargs)
+tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False)
+
+
+def compile_to_vmfb(inputs, layers, is_first=True):
+    """Build or load the MLIR and vmfb artifacts for every layer.
+
+    Artifacts are cached in the current directory as ``{idx}_0.*`` (first
+    pass) or ``{idx}_1.*`` (second pass). A layer whose vmfb exists is
+    loaded directly. Otherwise its MLIR is read from disk or generated,
+    then compiled and saved.
+
+    Args:
+        inputs: sample tensors for tracing: hidden states, attention mask and
+            position ids, plus two past-cache tensors when ``is_first`` is
+            False.
+        layers: wrapped layers to compile, one per layer index.
+        is_first: selects the first-pass or second-pass variant.
+
+    Returns:
+        ``(mlirs, modules)``. ``mlirs`` holds the MLIR bytes of the layers
+        whose vmfb was not already cached, in layer order. ``modules`` holds
+        one loaded ``SharkInference`` per layer.
+    """
+    suffix = "0" if is_first else "1"
+    compile_flags = (
+        "--iree-hal-dump-executable-sources-to=ies",
+        "--iree-vm-target-truncate-unsupported-floats",
+        "--iree-codegen-check-ir-before-llvm-conversion=false",
+        "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
+    )
+
+    mlirs, modules = [], []
+    # MLIR keyed by layer index. Layers with a cached vmfb are skipped in the
+    # first loop, so positions in ``mlirs`` do not match layer indices and
+    # ``mlirs[idx]`` would pick the wrong entry (or raise IndexError).
+    mlir_by_idx = {}
+    for idx, layer in tqdm(enumerate(layers), desc="Getting mlirs"):
+        mlir_path = Path(f"{idx}_{suffix}.mlir")
+        vmfb_path = Path(f"{idx}_{suffix}.vmfb")
+        if vmfb_path.exists():
+            continue
+        if mlir_path.exists():
+            with open(mlir_path, "rb") as f_:
+                bytecode = f_.read()
+        else:
+            print(f"Compiling layer {idx} mlir")
+            if is_first:
+                hidden_states_placeholder = TensorPlaceholder.like(
+                    inputs[0], dynamic_axes=[1]
+                )
+                ts_g = compile_vicuna_layer(
+                    layer, inputs[0], inputs[1], inputs[2]
+                )
+                module = torch_mlir.compile(
+                    ts_g,
+                    (
+                        hidden_states_placeholder,
+                        inputs[1],
+                        inputs[2],
+                    ),
+                    torch_mlir.OutputType.LINALG_ON_TENSORS,
+                    use_tracing=False,
+                    verbose=False,
+                )
+                module = write_in_dynamic_inputs0(str(module), 137)
+            else:
+                attention_mask_placeholder = TensorPlaceholder.like(
+                    inputs[1], dynamic_axes=[3]
+                )
+                pkv0_placeholder = TensorPlaceholder.like(
+                    inputs[3], dynamic_axes=[2]
+                )
+                pkv1_placeholder = TensorPlaceholder.like(
+                    inputs[4], dynamic_axes=[2]
+                )
+                ts_g = compile_vicuna_layer(
+                    layer,
+                    inputs[0],
+                    inputs[1],
+                    inputs[2],
+                    inputs[3],
+                    inputs[4],
+                )
+                module = torch_mlir.compile(
+                    ts_g,
+                    (
+                        inputs[0],
+                        attention_mask_placeholder,
+                        inputs[2],
+                        pkv0_placeholder,
+                        pkv1_placeholder,
+                    ),
+                    torch_mlir.OutputType.LINALG_ON_TENSORS,
+                    use_tracing=False,
+                    verbose=False,
+                )
+                module = write_in_dynamic_inputs1(str(module), 138)
+                if idx in [0, 5, 6, 7]:
+                    # Debug dump for selected layers, with every line cut
+                    # to at most 999 characters.
+                    truncated = "\n".join(
+                        line if len(line) < 1000 else line[:999]
+                        for line in module.splitlines()
+                    )
+                    with open(f"{idx}_1_test.mlir", "w+") as f1_:
+                        f1_.write(truncated)
+
+            bytecode = module.encode("UTF-8")
+            with open(mlir_path, "wb") as f_:
+                f_.write(bytecode)
+        mlirs.append(bytecode)
+        mlir_by_idx[idx] = bytecode
+
+    for idx, _ in tqdm(enumerate(layers), desc="compiling modules"):
+        vmfb_path = Path(f"{idx}_{suffix}.vmfb")
+        # First-pass layers all use "cpu"; second-pass layers use "vulkan"
+        # below index 25 and "cpu" from there on.
+        device = "vulkan" if (not is_first and idx < 25) else "cpu"
+        if vmfb_path.exists():
+            module = SharkInference(
+                None, device=device, mlir_dialect="tm_tensor"
+            )
+        else:
+            print(f"Compiling layer {idx} vmfb")
+            module = SharkInference(
+                mlir_by_idx[idx], device=device, mlir_dialect="tm_tensor"
+            )
+            module.save_module(
+                module_name=f"{idx}_{suffix}",
+                # A fresh list per call, as in the original code.
+                extra_args=list(compile_flags),
+            )
+        module.load_module(vmfb_path)
+        modules.append(module)
+
+    return mlirs, modules
+
+
+def get_sharded_model():
+    """Compile both variants of every Vicuna layer and wrap them in a ShardedVicunaModel."""
+    # SAMPLE_INPUT_LEN is used for creating mlir with dynamic inputs, which is
+    # currently an incredibly hacky process; please don't change it.
+    SAMPLE_INPUT_LEN = 137
+    decoder_layers = vicuna_model.model.layers
+
+    # Sample inputs for the first-pass layers.
+    first_pass_inputs = (
+        torch.zeros([1, SAMPLE_INPUT_LEN, 4096]),
+        torch.zeros([1, 1, SAMPLE_INPUT_LEN, SAMPLE_INPUT_LEN]),
+        torch.zeros([1, SAMPLE_INPUT_LEN], dtype=torch.int64),
+    )
+    # Sample inputs for the second-pass layers, including two cache tensors.
+    second_pass_inputs = (
+        torch.zeros([1, 1, 4096]),
+        torch.zeros([1, 1, 1, SAMPLE_INPUT_LEN + 1]),
+        torch.zeros([1, 1], dtype=torch.int64),
+        torch.zeros([1, 32, SAMPLE_INPUT_LEN, 128]),
+        torch.zeros([1, 32, SAMPLE_INPUT_LEN, 128]),
+    )
+
+    _, modules0 = compile_to_vmfb(
+        first_pass_inputs,
+        [FirstVicunaLayer(layer) for layer in decoder_layers],
+        is_first=True,
+    )
+    shark_layers0 = [CompiledFirstVicunaLayer(m) for m in modules0]
+
+    _, modules1 = compile_to_vmfb(
+        second_pass_inputs,
+        [SecondVicunaLayer(layer) for layer in decoder_layers],
+        is_first=False,
+    )
+    shark_layers1 = [CompiledSecondVicunaLayer(m) for m in modules1]
+
+    return ShardedVicunaModel(vicuna_model, shark_layers0, shark_layers1)
+
+
+sharded_model = get_sharded_model()
+
+
+def user(message, history):
+    """Print the incoming turn, then return an empty string and the extended history."""
+    print("msg=", message)
+    print("history=", history)
+    # The new turn is added with an empty reply slot.
+    extended_history = history + [[message, ""]]
+    return "", extended_history
+
+
+def chat(curr_system_message, history):
+    """Generate a reply for the last turn of ``history``, yielding as it grows.
+
+    This is a generator. It yields ``history`` with a partial reply written
+    into ``history[-1][1]`` on every even iteration after the first. Once
+    generation ends it yields ``history`` again with the full reply.
+
+    Args:
+        curr_system_message: text placed before the formatted conversation.
+        history: list of ``[user_text, assistant_text]`` pairs; the last
+            pair's reply slot is overwritten in place.
+
+    Yields:
+        The same ``history`` list after each update.
+    """
+    past_key_values = None
+    messages = curr_system_message + "".join(
+        "<|USER|>" + item[0] + "<|ASSISTANT|>" + item[1] for item in history
+    )
+    print(messages)
+    prompt = messages.strip()
+    input_ids = tokenizer(prompt).input_ids
+    new_sentence = []
+    max_response_len = 1000
+    partial_sentence = []
+    partial_text = ""
+    start_time = time.time()
+    for iteration in range(max_response_len):
+        input_ids = torch.tensor(input_ids).reshape([1, len(input_ids)])
+
+        # The whole prompt goes through the first-pass layers once; later
+        # steps feed only the newest token plus the cached key/values.
+        if iteration == 0:
+            output = sharded_model.forward(input_ids, is_first=True)
+        else:
+            output = sharded_model.forward(
+                input_ids, past_key_values=past_key_values, is_first=False
+            )
+        logits = output["logits"]
+        past_key_values = output["past_key_values"]
+        new_token = int(torch.argmax(logits[:, -1, :], dim=1)[0])
+        # Token id 2 ends generation (presumably the end-of-sequence id --
+        # TODO confirm against the tokenizer).
+        if new_token == 2:
+            break
+        new_sentence.append(new_token)
+        partial_sentence.append(new_token)
+        if iteration > 0 and iteration % 2 == 0:
+            new_text = tokenizer.decode(partial_sentence)
+            partial_sentence = []
+            print(new_text, " ")
+            partial_text += new_text + " "
+            history[-1][1] = partial_text
+            yield history
+
+        input_ids = [new_token]
+    end_time = time.time()
+    print(
+        f"Total time taken to generated response is {end_time-start_time} seconds"
+    )
+
+    new_sentence_str = tokenizer.decode(new_sentence)
+    print(new_sentence_str)
+    history[-1][1] = new_sentence_str
+    # This function is a generator, so a value passed to ``return`` never
+    # reaches the code iterating over it. Yield the final history instead so
+    # the complete reply is delivered.
+    yield history
+
+
+# Default system message text; it becomes the initial value of the hidden
+# "System Message" textbox below, which then rebinds the same name.
+system_msg = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
+# history_eg = [["hi hello how are you", ""]]
+# print(chat(system_msg, history_eg))
+
+# Gradio UI: model and device dropdowns, a chatbot pane, a message box and
+# Submit / Stop / Clear buttons.
+with gr.Blocks(title="Chatbot") as vicuna_chat:
+    with gr.Row():
+        model = gr.Dropdown(
+            label="Select Model",
+            value="TheBloke/vicuna-7B-1.1-HF",
+            choices=[
+                "TheBloke/vicuna-7B-1.1-HF",
+            ],
+        )
+        # Preselect the first device whose name contains "vulkan", falling
+        # back to the first available device.
+        device_value = None
+        for d in available_devices:
+            if "vulkan" in d:
+                device_value = d
+                break
+
+        device = gr.Dropdown(
+            label="Device",
+            value=device_value if device_value else available_devices[0],
+            interactive=False,
+            choices=available_devices,
+        )
+    chatbot = gr.Chatbot().style(height=500)
+    with gr.Row():
+        with gr.Column():
+            msg = gr.Textbox(
+                label="Chat Message Box",
+                placeholder="Chat Message Box",
+                show_label=False,
+            ).style(container=False)
+        with gr.Column():
+            with gr.Row():
+                submit = gr.Button("Submit")
+                stop = gr.Button("Stop")
+                clear = gr.Button("Clear")
+    # From here on ``system_msg`` names the Textbox component, not the string.
+    system_msg = gr.Textbox(
+        system_msg, label="System Message", interactive=False, visible=False
+    )
+
+    # Pressing Enter in the message box and clicking Submit run the same
+    # chain: ``user`` first, then ``chat``.
+    submit_event = msg.submit(
+        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
+    ).then(
+        fn=chat,
+        inputs=[system_msg, chatbot],
+        outputs=[chatbot],
+        queue=True,
+    )
+    submit_click_event = submit.click(
+        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
+    ).then(
+        fn=chat,
+        inputs=[system_msg, chatbot],
+        outputs=[chatbot],
+        queue=True,
+    )
+    # Stop is wired to cancel both event chains above.
+    stop.click(
+        fn=None,
+        inputs=None,
+        outputs=None,
+        cancels=[submit_event, submit_click_event],
+        queue=False,
+    )
+    # Clear sets the chatbot output to None.
+    clear.click(lambda: None, None, [chatbot], queue=False)
+
+import argparse
+
+# Command-line options for the web server. Unknown arguments are tolerated
+# because parse_known_args is used.
+# NOTE(review): ``description=__doc__`` is None unless this module has a
+# docstring -- none is visible in this file.
+p = argparse.ArgumentParser(
+    description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+p.add_argument(
+    "--share",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for generating a public URL",
+)
+p.add_argument(
+    "--server_port",
+    type=int,
+    default=8080,
+    help="flag for setting server port",
+)
+args, unknown = p.parse_known_args()
+
+# Runs at import time: enables queuing and starts the server.
+# NOTE(review): server_name "0.0.0.0" presumably makes the app listen on all
+# network interfaces -- confirm this exposure is intended.
+vicuna_chat.queue()
+vicuna_chat.launch(
+    share=args.share,
+    inbrowser=True,
+    server_name="0.0.0.0",
+    server_port=args.server_port,
+)
--- a/apps/language_models/src/model_wrappers/stablelm_model.py
+++ b/apps/language_models/src/model_wrappers/stablelm_model.py
@@ -0,0 +1,15 @@
+import torch
+
+
+class StableLMModel(torch.nn.Module):
+    """Thin wrapper whose ``forward`` returns only the wrapped model's logits."""
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(self, input_ids, attention_mask):
+        """Call the wrapped model with keyword inputs and return ``.logits``."""
+        return self.model(
+            input_ids=input_ids, attention_mask=attention_mask
+        ).logits
--- a/apps/language_models/src/model_wrappers/vicuna_model.py
+++ b/apps/language_models/src/model_wrappers/vicuna_model.py
@@ -0,0 +1,239 @@
+import torch
+from transformers import AutoModelForCausalLM
+
+
+class FirstVicuna(torch.nn.Module):
+    """Wrapper that runs the model on ``input_ids`` and flattens its outputs.
+
+    ``forward`` returns one flat tuple: the logits followed by elements 0 and
+    1 of every entry in ``past_key_values``, in order.
+    """
+
+    def __init__(self, model_path):
+        super().__init__()
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, torch_dtype=torch.float32
+        )
+
+    def forward(self, input_ids):
+        result = self.model(input_ids=input_ids, use_cache=True)
+        flat = [result.logits]
+        for layer_cache in result.past_key_values:
+            flat.extend((layer_cache[0], layer_cache[1]))
+        return tuple(flat)
+
+
+class SecondVicuna(torch.nn.Module):
+    """Wrapper for the cached decoding step with a fully flattened signature.
+
+    ``forward`` takes the token tensor (``i0``) followed by 64 cache tensors
+    (``i1`` .. ``i64``). Consecutive cache tensors are regrouped into 32
+    pairs and handed to the model as ``past_key_values``; the outputs are
+    flattened again into one tuple (logits first, then elements 0 and 1 of
+    every returned cache entry).
+    """
+
+    def __init__(self, model_path):
+        super().__init__()
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, torch_dtype=torch.float32
+        )
+
+    def forward(
+        self,
+        i0, i1, i2, i3, i4, i5, i6, i7, i8,
+        i9, i10, i11, i12, i13, i14, i15, i16,
+        i17, i18, i19, i20, i21, i22, i23, i24,
+        i25, i26, i27, i28, i29, i30, i31, i32,
+        i33, i34, i35, i36, i37, i38, i39, i40,
+        i41, i42, i43, i44, i45, i46, i47, i48,
+        i49, i50, i51, i52, i53, i54, i55, i56,
+        i57, i58, i59, i60, i61, i62, i63, i64,
+    ):
+        token = i0
+        # Cache tensors in call order; entries 2k and 2k+1 form pair k.
+        flat_cache = (
+            i1, i2, i3, i4, i5, i6, i7, i8,
+            i9, i10, i11, i12, i13, i14, i15, i16,
+            i17, i18, i19, i20, i21, i22, i23, i24,
+            i25, i26, i27, i28, i29, i30, i31, i32,
+            i33, i34, i35, i36, i37, i38, i39, i40,
+            i41, i42, i43, i44, i45, i46, i47, i48,
+            i49, i50, i51, i52, i53, i54, i55, i56,
+            i57, i58, i59, i60, i61, i62, i63, i64,
+        )
+        past_key_values = tuple(
+            (flat_cache[start], flat_cache[start + 1])
+            for start in range(0, len(flat_cache), 2)
+        )
+        result = self.model(
+            input_ids=token, use_cache=True, past_key_values=past_key_values
+        )
+        flat_out = [result.logits]
+        for layer_cache in result.past_key_values:
+            flat_out.extend((layer_cache[0], layer_cache[1]))
+        return tuple(flat_out)
--- a/apps/language_models/src/model_wrappers/vicuna_sharded_model.py
+++ b/apps/language_models/src/model_wrappers/vicuna_sharded_model.py
@@ -0,0 +1,178 @@
+import torch
+
+
+class FirstVicunaLayer(torch.nn.Module):
+    """Wrap one layer for the first (cache-less) pass.
+
+    ``forward`` returns ``(outputs[0], outputs[-1][0], outputs[-1][1])`` of
+    the wrapped layer's result as a flat 3-tuple.
+    """
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(self, hidden_states, attention_mask, position_ids):
+        layer_out = self.model(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            use_cache=True,
+        )
+        # The last output element holds the cache pair produced by the layer.
+        cache_pair = layer_out[-1]
+        return (layer_out[0], cache_pair[0], cache_pair[1])
+
+
+class SecondVicunaLayer(torch.nn.Module):
+    """Wrap one layer for cached passes.
+
+    The two incoming cache tensors are regrouped into a ``past_key_value``
+    pair for the wrapped layer; the result is returned as the flat 3-tuple
+    ``(outputs[0], outputs[-1][0], outputs[-1][1])``.
+    """
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        past_key_value0,
+        past_key_value1,
+    ):
+        incoming_cache = (past_key_value0, past_key_value1)
+        layer_out = self.model(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=incoming_cache,
+            use_cache=True,
+        )
+        # The last output element holds the updated cache pair.
+        cache_pair = layer_out[-1]
+        return (layer_out[0], cache_pair[0], cache_pair[1])
+
+
+class CompiledFirstVicunaLayer(torch.nn.Module):
+    """Adapter that runs a compiled module in place of a layer (no cache in).
+
+    ``past_key_value``, ``output_attentions`` and ``use_cache`` are accepted
+    but not used by ``forward``.
+    """
+
+    def __init__(self, shark_module):
+        super().__init__()
+        self.model = shark_module
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        past_key_value=None,
+        output_attentions=False,
+        use_cache=True,
+    ):
+        # Detach every input before handing it to the compiled module.
+        call_args = tuple(
+            t.detach() for t in (hidden_states, attention_mask, position_ids)
+        )
+        raw = self.model("forward", call_args)
+
+        # Convert the first three results to torch tensors, in order.
+        hidden, cache0, cache1 = (torch.tensor(raw[idx]) for idx in range(3))
+        return (hidden, (cache0, cache1))
+
+
+class CompiledSecondVicunaLayer(torch.nn.Module):
+    """Adapter that runs a compiled module in place of a layer (with cache).
+
+    ``output_attentions`` and ``use_cache`` are accepted but not used by
+    ``forward``.
+    """
+
+    def __init__(self, shark_module):
+        super().__init__()
+        self.model = shark_module
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        past_key_value,
+        output_attentions=False,
+        use_cache=True,
+    ):
+        # Detach every input; the cache pair is passed as two flat arguments.
+        call_args = (
+            hidden_states.detach(),
+            attention_mask.detach(),
+            position_ids.detach(),
+            past_key_value[0].detach(),
+            past_key_value[1].detach(),
+        )
+        raw = self.model("forward", call_args)
+
+        # Convert the first three results to torch tensors, in order.
+        hidden, cache0, cache1 = (torch.tensor(raw[idx]) for idx in range(3))
+        return (hidden, (cache0, cache1))
+
+
+class ShardedVicunaModel(torch.nn.Module):
+    """Run a model with one of two interchangeable sets of layers.
+
+    ``layers0`` is installed for the first pass and ``layers1`` for later
+    passes; the active set is swapped into ``model.model.layers`` on every
+    ``forward`` call.
+    """
+
+    def __init__(self, model, layers0, layers1):
+        super().__init__()
+        self.model = model
+        assert len(layers0) == len(model.model.layers)
+        inner_config = self.model.model.config
+        inner_config.use_cache = True
+        inner_config.output_attentions = False
+        self.layers0 = layers0
+        self.layers1 = layers1
+
+    def forward(
+        self,
+        input_ids,
+        is_first=True,
+        past_key_values=None,
+        attention_mask=None,
+    ):
+        if is_first:
+            active_layers = self.layers0
+            extra_kwargs = {}
+        else:
+            active_layers = self.layers1
+            # Only later passes forward the cache to the wrapped model.
+            extra_kwargs = {"past_key_values": past_key_values}
+
+        self.model.model.layers = torch.nn.modules.container.ModuleList(
+            active_layers
+        )
+        return self.model.forward(
+            input_ids, attention_mask=attention_mask, **extra_kwargs
+        )
--- a/apps/language_models/src/pipelines/SharkLLMBase.py
+++ b/apps/language_models/src/pipelines/SharkLLMBase.py
@@ -0,0 +1,41 @@
+from abc import ABC, abstractmethod
+
+
+class SharkLLMBase(ABC):
+    """Abstract base class for SHARK language-model pipelines.
+
+    Subclasses must implement ``compile``, ``generate``,
+    ``generate_new_token``, ``get_tokenizer`` and ``get_src_model``. All of
+    them are instance methods: they take ``self`` and are expected to read
+    per-instance state such as ``hf_model_path`` or ``device``.
+
+    Attributes set by ``__init__``:
+        model_name: name given by the caller.
+        hf_model_path: model path/identifier given by the caller (or None).
+        max_num_tokens: token budget given by the caller (default 512).
+        shark_model: initialised to None; subclasses assign the compiled model.
+        device: initialised to "cpu"; subclasses may overwrite it.
+        precision: initialised to "fp32"; subclasses may overwrite it.
+    """
+
+    def __init__(
+        self, model_name, hf_model_path=None, max_num_tokens=512
+    ) -> None:
+        self.model_name = model_name
+        self.hf_model_path = hf_model_path
+        self.max_num_tokens = max_num_tokens
+        self.shark_model = None
+        self.device = "cpu"
+        self.precision = "fp32"
+
+    # The hooks below were previously also decorated with @classmethod even
+    # though they take ``self``; they are plain abstract instance methods.
+
+    @abstractmethod
+    def compile(self):
+        """Build or load the compiled model for this pipeline."""
+
+    @abstractmethod
+    def generate(self, prompt):
+        """Generate output for ``prompt``."""
+
+    @abstractmethod
+    def generate_new_token(self, params):
+        """Produce the next token from the pipeline-specific ``params``."""
+
+    @abstractmethod
+    def get_tokenizer(self):
+        """Return the tokenizer used by this pipeline."""
+
+    @abstractmethod
+    def get_src_model(self):
+        """Return the source (uncompiled) model."""
+
+    def load_init_from_config(self):
+        """Optional hook; the base implementation does nothing."""
+        pass
--- a/apps/language_models/src/pipelines/stablelm_pipeline.py
+++ b/apps/language_models/src/pipelines/stablelm_pipeline.py
@@ -0,0 +1,185 @@
+import torch
+import torch_mlir
+from transformers import AutoTokenizer, StoppingCriteria, AutoModelForCausalLM
+from io import BytesIO
+from pathlib import Path
+from apps.language_models.utils import (
+    get_torch_mlir_module_bytecode,
+    get_vmfb_from_path,
+)
+from apps.language_models.src.pipelines.SharkLLMBase import SharkLLMBase
+from apps.language_models.src.model_wrappers.stablelm_model import (
+    StableLMModel,
+)
+
+
+class StopOnTokens(StoppingCriteria):
+    """Stopping criterion keyed on the last token of the first sequence."""
+
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
+    ) -> bool:
+        """Return True when ``input_ids[0][-1]`` equals one of the stop ids."""
+        last_token = input_ids[0][-1]
+        return any(
+            last_token == stop_id for stop_id in (50278, 50279, 50277, 1, 0)
+        )
+
+
+class SharkStableLM(SharkLLMBase):
+    """StableLM pipeline that compiles the model with SHARK and generates text.
+
+    Construction loads the tokenizer and then calls ``compile()``. Inputs are
+    always padded/truncated to ``max_sequence_len`` (256) tokens.
+    """
+
+    def __init__(
+        self,
+        model_name,
+        hf_model_path="stabilityai/stablelm-tuned-alpha-3b",
+        max_num_tokens=512,
+        device="cuda",
+        precision="fp32",
+    ) -> None:
+        super().__init__(model_name, hf_model_path, max_num_tokens)
+        self.max_sequence_len = 256
+        self.device = device
+        self.precision = precision
+        self.tokenizer = self.get_tokenizer()
+        self.shark_model = self.compile()
+
+    def shouldStop(self, tokens):
+        """Return True when ``tokens[0][-1]`` equals one of the stop ids."""
+        stop_ids = [50278, 50279, 50277, 1, 0]
+        for stop_id in stop_ids:
+            if tokens[0][-1] == stop_id:
+                return True
+        return False
+
+    def get_src_model(self):
+        """Load the Hugging Face causal LM from ``hf_model_path`` in float32."""
+        model = AutoModelForCausalLM.from_pretrained(
+            self.hf_model_path, torch_dtype=torch.float32
+        )
+        return model
+
+    def get_model_inputs(self):
+        """Return random ``(input_ids, attention_mask)`` example inputs.
+
+        Both are integer tensors of shape ``(1, max_sequence_len)`` with
+        values in ``[0, 3)``; they are only used as example inputs when the
+        model is lowered in ``compile()``.
+        """
+        input_ids = torch.randint(3, (1, self.max_sequence_len))
+        attention_mask = torch.randint(3, (1, self.max_sequence_len))
+        return input_ids, attention_mask
+
+    def compile(self):
+        """Return a ready-to-run SHARK module for StableLM.
+
+        Resolution order, all relative to the current working directory:
+          1. ask ``get_vmfb_from_path`` for an existing ``.vmfb`` and return
+             it when the helper yields a module;
+          2. otherwise reuse an existing ``.mlir`` file if present;
+          3. otherwise lower the Hugging Face model through torch-mlir and
+             save the resulting bytecode as ``.mlir``.
+        In cases 2 and 3 the bytecode is compiled with SharkInference and the
+        resulting module is saved as ``.vmfb``.
+        """
+        tmp_model_name = (
+            f"stableLM_linalg_{self.precision}_seqLen{self.max_sequence_len}"
+        )
+
+        # device = "cuda"  # "cpu"
+        # TODO: vmfb and mlir name should include precision and device
+        model_vmfb_name = None
+        vmfb_path = (
+            Path(tmp_model_name + f"_{self.device}.vmfb")
+            if model_vmfb_name is None
+            else Path(model_vmfb_name)
+        )
+        shark_module = get_vmfb_from_path(
+            vmfb_path, self.device, mlir_dialect="tm_tensor"
+        )
+        if shark_module is not None:
+            return shark_module
+
+        mlir_path = Path(tmp_model_name + ".mlir")
+        print(
+            f"[DEBUG] mlir path {mlir_path} {'exists' if mlir_path.exists() else 'does not exist'}"
+        )
+        if mlir_path.exists():
+            with open(mlir_path, "rb") as f:
+                bytecode = f.read()
+        else:
+            model = StableLMModel(self.get_src_model())
+            model_inputs = self.get_model_inputs()
+            ts_graph = get_torch_mlir_module_bytecode(model, model_inputs)
+            module = torch_mlir.compile(
+                ts_graph,
+                [*model_inputs],
+                torch_mlir.OutputType.LINALG_ON_TENSORS,
+                use_tracing=False,
+                verbose=False,
+            )
+            bytecode_stream = BytesIO()
+            module.operation.write_bytecode(bytecode_stream)
+            bytecode = bytecode_stream.getvalue()
+            # Persist only freshly generated bytecode; a file that was just
+            # read from disk is not rewritten. The context manager closes
+            # the handle even if the write fails.
+            with open(mlir_path, "wb") as f_:
+                f_.write(bytecode)
+            print("Saved mlir")
+
+        from shark.shark_inference import SharkInference
+
+        shark_module = SharkInference(
+            mlir_module=bytecode, device=self.device, mlir_dialect="tm_tensor"
+        )
+        shark_module.compile()
+
+        path = shark_module.save_module(
+            vmfb_path.parent.absolute(), vmfb_path.stem
+        )
+        print("Saved vmfb at ", str(path))
+
+        return shark_module
+
+    def get_tokenizer(self):
+        """Load the tokenizer for ``hf_model_path`` and register ``<PAD>``."""
+        tok = AutoTokenizer.from_pretrained(self.hf_model_path)
+        tok.add_special_tokens({"pad_token": "<PAD>"})
+        # print("[DEBUG] Sucessfully loaded the tokenizer to the memory")
+        return tok
+
+    def generate(self, prompt):
+        """Generate up to ``max_num_tokens`` pieces of text after ``prompt``.
+
+        Every iteration passes the whole prompt so far to
+        ``generate_new_token`` and appends the decoded piece to the prompt.
+        Generation ends when a stop is signalled, when the decoded piece is
+        empty, or when the token budget is used up.
+
+        Returns:
+            list of decoded strings, one per generated token.
+        """
+        words_list = []
+        for _ in range(self.max_num_tokens):
+            params = {
+                "new_text": prompt,
+            }
+
+            generated_token_op = self.generate_new_token(params)
+
+            detok = generated_token_op["detok"]
+            stop_generation = generated_token_op["stop_generation"]
+
+            if stop_generation:
+                break
+
+            print(detok, end="", flush=True)  # this is for CLI and DEBUG
+            words_list.append(detok)
+            if detok == "":
+                break
+            prompt = prompt + detok
+        return words_list
+
+    def generate_new_token(self, params):
+        """Run one forward pass on ``params["new_text"]`` and pick a token.
+
+        The text is tokenized with padding/truncation to
+        ``max_sequence_len``. The chosen token is the top-1 entry at index
+        ``sum(attention_mask) - 1`` of the first sequence.
+
+        Returns:
+            dict with keys ``"new_token"``, ``"detok"`` (decoded with special
+            tokens skipped) and ``"stop_generation"``.
+        """
+        new_text = params["new_text"]
+        model_inputs = self.tokenizer(
+            [new_text],
+            padding="max_length",
+            max_length=self.max_sequence_len,
+            truncation=True,
+            return_tensors="pt",
+        )
+        sum_attentionmask = torch.sum(model_inputs.attention_mask)
+        output = self.shark_model(
+            "forward", [model_inputs.input_ids, model_inputs.attention_mask]
+        )
+        output = torch.from_numpy(output)
+        next_toks = torch.topk(output, 1)
+        # NOTE(review): the stop check looks at the last position of the
+        # top-1 indices, not at ``new_token`` selected below - confirm that
+        # this is the intended position.
+        stop_generation = False
+        if self.shouldStop(next_toks.indices):
+            stop_generation = True
+        new_token = next_toks.indices[0][int(sum_attentionmask) - 1]
+        detok = self.tokenizer.decode(
+            new_token,
+            skip_special_tokens=True,
+        )
+        ret_dict = {
+            "new_token": new_token,
+            "detok": detok,
+            "stop_generation": stop_generation,
+        }
+        return ret_dict
+
+
+# Default system prompt text for StableLM Tuned (Alpha); it starts with the
+# <|SYSTEM|> marker.
+system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
+- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
+- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
+- StableLM will refuse to participate in anything that could harm a human.
+"""
--- a/apps/language_models/src/pipelines/vicuna_pipeline.py
+++ b/apps/language_models/src/pipelines/vicuna_pipeline.py
@@ -0,0 +1,417 @@
+from apps.language_models.src.model_wrappers.vicuna_model import (
+    FirstVicuna,
+    SecondVicuna,
+)
+from apps.language_models.src.pipelines.SharkLLMBase import SharkLLMBase
+from apps.language_models.utils import get_torch_mlir_module_bytecode
+from io import BytesIO
+from pathlib import Path
+from shark.shark_inference import SharkInference
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+import re
+import torch
+import torch_mlir
+import os
+
+
+class Vicuna(SharkLLMBase):
+    """Vicuna pipeline built from two SHARK modules.
+
+    The "first" module consumes the whole prompt and returns logits plus the
+    cache tensors; the "second" module consumes one token plus the cache and
+    returns the next logits and the updated cache. Both are lowered with a
+    fixed example length (19) and the emitted MLIR text is then patched so
+    that this dimension becomes dynamic.
+
+    NOTE(review): ``compile_first_vicuna`` and ``compile_second_vicuna``
+    both derive their ``.mlir`` / ``.vmfb`` paths from ``self.model_name``
+    alone, so they read and write the same files unless the caller changes
+    ``model_name`` between the two calls - confirm the intended usage.
+    """
+
+    def __init__(
+        self,
+        model_name,
+        hf_model_path="TheBloke/vicuna-7B-1.1-HF",
+        max_num_tokens=512,
+        device="cuda",
+        precision="fp32",
+    ) -> None:
+        super().__init__(model_name, hf_model_path, max_num_tokens)
+        self.max_sequence_length = 256
+        self.device = device
+        self.precision = precision
+        self.tokenizer = self.get_tokenizer()
+        self.shark_model = self.compile()
+
+    def get_tokenizer(self):
+        """Load the (slow) tokenizer for ``hf_model_path``."""
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.hf_model_path, use_fast=False
+        )
+        return tokenizer
+
+    def get_src_model(self):
+        """Load the Hugging Face causal LM from ``hf_model_path`` in float32."""
+        kwargs = {"torch_dtype": torch.float}
+        vicuna_model = AutoModelForCausalLM.from_pretrained(
+            self.hf_model_path, **kwargs
+        )
+        return vicuna_model
+
+    def _load_existing_vmfb(self, vmfb_path):
+        """Return a module loaded from ``vmfb_path``, or None if it is absent."""
+        if not vmfb_path.exists():
+            return None
+        shark_module = SharkInference(
+            None, device=self.device, mlir_dialect="tm_tensor"
+        )
+        shark_module.load_module(vmfb_path)
+        return shark_module
+
+    def _save_and_load_vmfb(self, bytecode, vmfb_path):
+        """Compile ``bytecode`` to a vmfb in the cwd and load it back."""
+        shark_module = SharkInference(
+            mlir_module=bytecode, device=self.device, mlir_dialect="tm_tensor"
+        )
+
+        path = shark_module.save_module(
+            os.getcwd(),
+            self.model_name,
+            extra_args=[
+                "--iree-hal-dump-executable-sources-to=ies",
+                "--iree-vm-target-truncate-unsupported-floats",
+                "--iree-codegen-check-ir-before-llvm-conversion=false",
+                "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
+            ],
+        )
+        print("Saved vmfb at ", str(path))
+        shark_module.load_module(vmfb_path)
+
+        return shark_module
+
+    def _write_mlir(self, module_str):
+        """Encode ``module_str`` as UTF-8, save it as ``<model_name>.mlir``.
+
+        Returns the encoded bytes.
+        """
+        bytecode = module_str.encode("UTF-8")
+        # The context manager closes the handle even if the write fails.
+        with open(f"{self.model_name}.mlir", "wb") as f_:
+            f_.write(bytecode)
+        return bytecode
+
+    def compile_first_vicuna(self):
+        """Load or build the module that processes the whole prompt.
+
+        An existing ``<model_name>.vmfb`` is loaded directly. Otherwise an
+        existing ``<model_name>.mlir`` is reused, or the model is lowered
+        with a ``[1, 19]`` example input and the MLIR text is patched so the
+        sequence dimension is dynamic; the result is compiled, saved and
+        loaded.
+        """
+        vmfb_path = Path(self.model_name + ".vmfb")
+        shark_module = self._load_existing_vmfb(vmfb_path)
+        if shark_module is not None:
+            return shark_module
+
+        mlir_path = Path(self.model_name + ".mlir")
+        print(
+            f"[DEBUG] mlir path { mlir_path} {'exists' if mlir_path.exists() else 'does not exist'}"
+        )
+        if mlir_path.exists():
+            with open(mlir_path, "rb") as f:
+                bytecode = f.read()
+        else:
+            compilation_prompt = "0" * 17
+            compilation_input_ids = self.tokenizer(
+                compilation_prompt
+            ).input_ids
+            compilation_input_ids = torch.tensor(
+                compilation_input_ids
+            ).reshape([1, 19])
+            firstVicunaCompileInput = (compilation_input_ids,)
+            model = FirstVicuna(self.hf_model_path)
+
+            ts_graph = get_torch_mlir_module_bytecode(
+                model, firstVicunaCompileInput
+            )
+
+            # Mark axis 1 of the input as dynamic for torch-mlir.
+            firstVicunaCompileInput = list(firstVicunaCompileInput)
+            firstVicunaCompileInput[0] = torch_mlir.TensorPlaceholder.like(
+                firstVicunaCompileInput[0], dynamic_axes=[1]
+            )
+            firstVicunaCompileInput = tuple(firstVicunaCompileInput)
+            module = torch_mlir.compile(
+                ts_graph,
+                [*firstVicunaCompileInput],
+                torch_mlir.OutputType.LINALG_ON_TENSORS,
+                use_tracing=False,
+                verbose=False,
+            )
+
+            def remove_constant_dim(line):
+                # Replace occurrences of the example length 19 with %dim.
+                # Patterns containing "\(" are raw strings so that the
+                # backslashes are not treated as string escapes.
+                if "19x" in line:
+                    line = re.sub("19x", "?x", line)
+                    line = re.sub(
+                        r"tensor.empty\(\)", "tensor.empty(%dim)", line
+                    )
+                if "tensor.empty" in line and "?x?" in line:
+                    line = re.sub(
+                        r"tensor.empty\(%dim\)",
+                        "tensor.empty(%dim, %dim)",
+                        line,
+                    )
+                if "arith.cmpi" in line:
+                    line = re.sub("c19", "dim", line)
+                if " 19," in line:
+                    line = re.sub(" 19,", " %dim,", line)
+                return line
+
+            module_str = str(module)
+            new_lines = []
+
+            for line in module_str.splitlines():
+                line = remove_constant_dim(line)
+                # Emit the %dim definition right before its first use and
+                # drop the definition from its original position.
+                if "%0 = tensor.empty(%dim) : tensor<?xi64>" in line:
+                    new_lines.append(
+                        "%dim = tensor.dim %arg0, %c1 : tensor<1x?xi64>"
+                    )
+                if "%dim = tensor.dim %arg0, %c1 : tensor<1x?xi64>" in line:
+                    continue
+
+                new_lines.append(line)
+
+            module_str = "\n".join(new_lines)
+            bytecode = self._write_mlir(module_str)
+
+        return self._save_and_load_vmfb(bytecode, vmfb_path)
+
+    def compile_second_vicuna(self):
+        """Load or build the module that processes one token plus the cache.
+
+        An existing ``<model_name>.vmfb`` is loaded directly. Otherwise an
+        existing ``<model_name>.mlir`` is reused, or the model is lowered
+        with a ``[1, 1]`` token and 64 cache tensors of shape
+        ``[1, 32, 19, 128]``; the MLIR text is patched so that the constants
+        19 and 20 are derived from the runtime cache length; the result is
+        compiled, saved and loaded.
+        """
+        vmfb_path = Path(self.model_name + ".vmfb")
+        shark_module = self._load_existing_vmfb(vmfb_path)
+        if shark_module is not None:
+            return shark_module
+
+        mlir_path = Path(self.model_name + ".mlir")
+        print(
+            f"[DEBUG] mlir path { mlir_path} {'exists' if mlir_path.exists() else 'does not exist'}"
+        )
+        if mlir_path.exists():
+            with open(mlir_path, "rb") as f:
+                bytecode = f.read()
+        else:
+            compilation_input_ids = torch.zeros([1, 1], dtype=torch.int64)
+            pkv = tuple(
+                (torch.zeros([1, 32, 19, 128], dtype=torch.float32))
+                for _ in range(64)
+            )
+            secondVicunaCompileInput = (compilation_input_ids,) + pkv
+            model = SecondVicuna(self.hf_model_path)
+            ts_graph = get_torch_mlir_module_bytecode(
+                model, secondVicunaCompileInput
+            )
+            # Mark axis 2 of every cache tensor (all inputs but the first)
+            # as dynamic for torch-mlir.
+            secondVicunaCompileInput = list(secondVicunaCompileInput)
+            for i in range(len(secondVicunaCompileInput)):
+                if i != 0:
+                    secondVicunaCompileInput[
+                        i
+                    ] = torch_mlir.TensorPlaceholder.like(
+                        secondVicunaCompileInput[i], dynamic_axes=[2]
+                    )
+            secondVicunaCompileInput = tuple(secondVicunaCompileInput)
+            module = torch_mlir.compile(
+                ts_graph,
+                [*secondVicunaCompileInput],
+                torch_mlir.OutputType.LINALG_ON_TENSORS,
+                use_tracing=False,
+                verbose=False,
+            )
+
+            def remove_constant_dim(line):
+                # Replace the example length 19 with %dim and 20 with
+                # %dimp1. Patterns containing "\(" are raw strings so that
+                # the backslashes are not treated as string escapes.
+                if "c19_i64" in line:
+                    line = re.sub("c19_i64", "dim_i64", line)
+                if "19x" in line:
+                    line = re.sub("19x", "?x", line)
+                    line = re.sub(
+                        r"tensor.empty\(\)", "tensor.empty(%dim)", line
+                    )
+                if "tensor.empty" in line and "?x?" in line:
+                    line = re.sub(
+                        r"tensor.empty\(%dim\)",
+                        "tensor.empty(%dim, %dim)",
+                        line,
+                    )
+                if "arith.cmpi" in line:
+                    line = re.sub("c19", "dim", line)
+                if " 19," in line:
+                    line = re.sub(" 19,", " %dim,", line)
+                if "20x" in line:
+                    line = re.sub("20x", "?x", line)
+                    line = re.sub(
+                        r"tensor.empty\(\)", "tensor.empty(%dimp1)", line
+                    )
+                if " 20," in line:
+                    line = re.sub(" 20,", " %dimp1,", line)
+                return line
+
+            module_str = str(module)
+            new_lines = []
+
+            for line in module_str.splitlines():
+                # Replace the constant 19 with the runtime size of axis 2
+                # of %arg1.
+                if "%c19_i64 = arith.constant 19 : i64" in line:
+                    new_lines.append("%c2 = arith.constant 2 : index")
+                    new_lines.append(
+                        "%dim_4_int = tensor.dim %arg1, %c2 : tensor<1x32x?x128xf32>"
+                    )
+                    new_lines.append(
+                        "%dim_i64 = arith.index_cast %dim_4_int : index to i64"
+                    )
+                    continue
+                # %c2 is emitted above, so the original definition is dropped.
+                if "%c2 = arith.constant 2 : index" in line:
+                    continue
+                # Replace the constant 20 with (runtime length + 1).
+                if "%c20_i64 = arith.constant 20 : i64" in line:
+                    new_lines.append("%c1_i64 = arith.constant 1 : i64")
+                    new_lines.append(
+                        "%c20_i64 = arith.addi %dim_i64, %c1_i64 : i64"
+                    )
+                    new_lines.append(
+                        "%dimp1 = arith.index_cast %c20_i64 : i64 to index"
+                    )
+                    continue
+                line = remove_constant_dim(line)
+                new_lines.append(line)
+
+            module_str = "\n".join(new_lines)
+            bytecode = self._write_mlir(module_str)
+
+        return self._save_and_load_vmfb(bytecode, vmfb_path)
+
+    def compile(self):
+        """Return None; nothing is compiled at construction time.
+
+        ``generate_new_token`` calls ``compile_first_vicuna`` /
+        ``compile_second_vicuna`` itself on every invocation.
+        """
+        return None
+
+    def generate(self, prompt):
+        """Generate text for ``prompt`` and return the decoded pieces.
+
+        The first token comes from the first module; up to
+        ``max_num_tokens - 2`` further tokens come from the second module.
+        The loop stops when token id 2 is produced. The piece ``"<0x0A>"``
+        is stored as a newline.
+        """
+        # TODO: refactor for cleaner integration
+        import gc
+
+        res = []
+        params = {
+            "prompt": prompt,
+            "is_first": True,
+        }
+
+        generated_token_op = self.generate_new_token(params=params)
+
+        token = generated_token_op["token"]
+        logits = generated_token_op["logits"]
+        pkv = generated_token_op["pkv"]
+        detok = generated_token_op["detok"]
+
+        res.append(detok)
+
+        for _ in range(self.max_num_tokens - 2):
+            params = {
+                "prompt": None,
+                "is_first": False,
+                "logits": logits,
+                "pkv": pkv,
+            }
+
+            generated_token_op = self.generate_new_token(params=params)
+
+            # Release memory after every step (the module is dropped at the
+            # end of generate_new_token).
+            gc.collect()
+            torch.cuda.empty_cache()
+
+            token = generated_token_op["token"]
+            logits = generated_token_op["logits"]
+            pkv = generated_token_op["pkv"]
+            detok = generated_token_op["detok"]
+
+            if token == 2:
+                break
+            if detok == "<0x0A>":
+                res.append("\n")
+            else:
+                res.append(detok)
+
+        return res
+
+    def generate_new_token(self, params):
+        """Run one generation step.
+
+        Args:
+            params: dict with ``"is_first"``. When true, ``"prompt"`` is
+                tokenized and fed to the first module; otherwise
+                ``"logits"`` and ``"pkv"`` from the previous step are fed to
+                the second module.
+
+        Returns:
+            dict with keys ``"token"``, ``"logits"``, ``"pkv"`` and
+            ``"detok"`` (the decoded token).
+        """
+
+        def forward_first(first_vic, prompt, cache_outputs=False):
+            input_ids = self.tokenizer(prompt).input_ids
+            input_id_len = len(input_ids)
+            input_ids = torch.tensor(input_ids)
+            input_ids = input_ids.reshape([1, input_id_len])
+            firstVicunaInput = (input_ids,)
+            assert first_vic is not None
+            output_first_vicuna = first_vic("forward", firstVicunaInput)
+            output_first_vicuna_tensor = torch.tensor(output_first_vicuna[1:])
+            logits_first_vicuna = torch.tensor(output_first_vicuna[0])
+            if cache_outputs:
+                torch.save(
+                    logits_first_vicuna, "logits_first_vicuna_tensor.pt"
+                )
+                torch.save(
+                    output_first_vicuna_tensor, "output_first_vicuna_tensor.pt"
+                )
+            # logits_first_vicuna is already a tensor, so it is indexed
+            # directly instead of being wrapped in torch.tensor() again.
+            token = torch.argmax(logits_first_vicuna[:, -1, :], dim=1)
+            return token, logits_first_vicuna, output_first_vicuna_tensor
+
+        def forward_second(sec_vic, inputs=None, load_inputs=False):
+            if inputs is not None:
+                logits = inputs[0]
+                pkv = inputs[1:]
+            elif load_inputs:
+                pkv = torch.load("output_first_vicuna_tensor.pt")
+                pkv = tuple(torch.tensor(x) for x in pkv)
+                logits = torch.load("logits_first_vicuna_tensor.pt")
+            else:
+                print(
+                    "Either inputs must be given, or load_inputs must be true"
+                )
+                return None
+            # The token fed to the second module is re-derived from the
+            # previous step's logits.
+            token = torch.argmax(torch.tensor(logits)[:, -1, :], dim=1)
+            token = token.to(torch.int64).reshape([1, 1])
+            secondVicunaInput = (token,) + tuple(pkv)
+
+            secondVicunaOutput = sec_vic("forward", secondVicunaInput)
+            new_pkv = secondVicunaOutput[1:]
+            new_logits = secondVicunaOutput[0]
+            new_token = torch.argmax(torch.tensor(new_logits)[:, -1, :], dim=1)
+            return new_token, new_logits, new_pkv
+
+        is_first = params["is_first"]
+
+        if is_first:
+            prompt = params["prompt"]
+            fv = self.compile_first_vicuna()
+            token, logits, pkv = forward_first(
+                fv,  # self.shark_model[0],
+                prompt=prompt,
+                cache_outputs=False,
+            )
+            del fv
+        else:
+            _logits = params["logits"]
+            _pkv = params["pkv"]
+            inputs = (_logits,) + tuple(_pkv)
+            sv = self.compile_second_vicuna()
+            token, logits, pkv = forward_second(
+                sv,  # self.shark_model[1],
+                inputs=inputs,
+                load_inputs=False,
+            )
+            del sv
+
+        detok = self.tokenizer.decode(token)
+        print(
+            f"[DEBUG] is_first: {is_first} |"
+            f" token : {token} | detok : {detok}"
+        )
+        ret_dict = {
+            "token": token,
+            "logits": logits,
+            "pkv": pkv,
+            "detok": detok,
+        }
+        return ret_dict
+
+    def autocomplete(self, prompt):
+        """Placeholder; not implemented yet."""
+        # use First vic alone to complete a story / prompt / sentence.
+        pass
--- a/apps/language_models/utils.py
+++ b/apps/language_models/utils.py
@@ -0,0 +1,140 @@
+import torch
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+from typing import List
+from pathlib import Path
+
+
+def get_torch_mlir_module_bytecode(model, model_inputs):
+    """Trace `model` into an FX graph, clean it up, and TorchScript it.
+
+    NOTE: despite the name, this function returns the result of
+    `torch.jit.script` on the processed FX graph module; no MLIR bytecode
+    is produced inside this function.
+
+    Args:
+        model: callable traced by `make_fx`.
+        model_inputs: iterable of example inputs; unpacked as positional
+            arguments for tracing.
+
+    Returns:
+        The scripted module produced by `torch.jit.script(fx_g)`.
+    """
+    # Trace the model; the listed ATen ops are replaced by their registered
+    # decompositions while tracing.
+    fx_g = make_fx(
+        model,
+        decomposition_table=get_decompositions(
+            [
+                torch.ops.aten.embedding_dense_backward,
+                torch.ops.aten.native_layer_norm_backward,
+                torch.ops.aten.slice_backward,
+                torch.ops.aten.select_backward,
+                torch.ops.aten.norm.ScalarOpt_dim,
+                torch.ops.aten.native_group_norm,
+                torch.ops.aten.upsample_bilinear2d.vec,
+                torch.ops.aten.split.Tensor,
+                torch.ops.aten.split_with_sizes,
+            ]
+        ),
+        # tracing_mode='symbolic',
+    )(*model_inputs)
+    print("Got FX_G")
+
+    def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
+        """Drop `None` entries from the graph's list/tuple output.
+
+        Returns the sorted list of output positions that were removed
+        (empty when nothing was removed).
+        """
+        removed_indexes = []
+        for node in fx_g.graph.nodes:
+            if node.op == "output":
+                assert (
+                    len(node.args) == 1
+                ), "Output node must have a single argument"
+                node_arg = node.args[0]
+                if isinstance(node_arg, (list, tuple)):
+                    node_arg = list(node_arg)
+                    node_args_len = len(node_arg)
+                    # Walk from the last index to the first so that pop()
+                    # never shifts an element that is still to be visited.
+                    for i in range(node_args_len):
+                        curr_index = node_args_len - (i + 1)
+                        if node_arg[curr_index] is None:
+                            removed_indexes.append(curr_index)
+                            node_arg.pop(curr_index)
+                    node.args = (tuple(node_arg),)
+                    break
+
+        # Only re-validate and regenerate code when the graph was changed.
+        if len(removed_indexes) > 0:
+            fx_g.graph.lint()
+            fx_g.graph.eliminate_dead_code()
+            fx_g.recompile()
+        # Indices were collected high-to-low; return them ascending.
+        removed_indexes.sort()
+        return removed_indexes
+
+    def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
+        """
+        Replace tuple with tuple element in functions that return one-element tuples.
+        Returns true if an unwrapping took place, and false otherwise.
+        """
+        unwrapped_tuple = False
+        for node in fx_g.graph.nodes:
+            if node.op == "output":
+                assert (
+                    len(node.args) == 1
+                ), "Output node must have a single argument"
+                node_arg = node.args[0]
+                # Only exact one-element tuples are unwrapped; lists and
+                # longer tuples are left as they are.
+                if isinstance(node_arg, tuple):
+                    if len(node_arg) == 1:
+                        node.args = (node_arg[0],)
+                        unwrapped_tuple = True
+                        break
+
+        if unwrapped_tuple:
+            fx_g.graph.lint()
+            fx_g.recompile()
+        return unwrapped_tuple
+
+    def transform_fx(fx_g):
+        """Insert an in-place `aten.zero_` call after each `aten.empty` node.
+
+        Every user of the matched `empty` node is redirected to the new
+        `zero_` node. The graph is linted afterwards; recompiling is left
+        to the caller.
+        """
+        # NOTE(review): this runs before strip_overloads(); confirm that the
+        # traced node targets really compare equal to the
+        # `torch.ops.aten.empty` packet here rather than to one of its
+        # overloads, otherwise this pass matches nothing.
+        for node in fx_g.graph.nodes:
+            if node.op == "call_function":
+                if node.target in [
+                    torch.ops.aten.empty,
+                ]:
+                    # aten.empty should be filled with zeros.
+                    if node.target in [torch.ops.aten.empty]:
+                        with fx_g.graph.inserting_after(node):
+                            new_node = fx_g.graph.call_function(
+                                torch.ops.aten.zero_,
+                                args=(node,),
+                            )
+                            node.append(new_node)
+                            node.replace_all_uses_with(new_node)
+                            # replace_all_uses_with also rewrites the use
+                            # inside new_node itself, so point its argument
+                            # back at the original `empty` node.
+                            new_node.args = (node,)
+
+        fx_g.graph.lint()
+
+    transform_fx(fx_g)
+    fx_g.recompile()
+    # The values returned by the two helpers are not used further below.
+    removed_none_indexes = _remove_nones(fx_g)
+    was_unwrapped = _unwrap_single_tuple_return(fx_g)
+
+    # NOTE(review): presumably swaps in the default code generator so the
+    # regenerated forward() has a plain signature - confirm the intent.
+    fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
+    fx_g.recompile()
+
+    print("FX_G recompile")
+
+    def strip_overloads(gm):
+        """
+        Modifies the target of graph nodes in :attr:`gm` to strip overloads.
+        Args:
+            gm(fx.GraphModule): The input Fx graph module to be modified
+        """
+        for node in gm.graph.nodes:
+            if isinstance(node.target, torch._ops.OpOverload):
+                node.target = node.target.overloadpacket
+        gm.recompile()
+
+    strip_overloads(fx_g)
+    # Script the cleaned-up graph module; this is the value handed back.
+    ts_g = torch.jit.script(fx_g)
+    print("Got TS_G")
+    return ts_g
+
+
+def get_vmfb_from_path(vmfb_path, device, mlir_dialect):
+    """Load a compiled .vmfb file into a SharkInference module.
+
+    Args:
+        vmfb_path: location of the .vmfb file, as a `Path` or a `str`.
+        device: forwarded to `SharkInference` as `device`.
+        mlir_dialect: forwarded to `SharkInference` as `mlir_dialect`.
+
+    Returns:
+        The `SharkInference` object after `load_module(vmfb_path)`, or
+        None when `vmfb_path` does not exist.
+    """
+    if not isinstance(vmfb_path, Path):
+        vmfb_path = Path(vmfb_path)
+
+    # Imported at call time rather than at module import time.
+    from shark.shark_inference import SharkInference
+
+    if not vmfb_path.exists():
+        return None
+
+    print("Loading vmfb from: ", vmfb_path)
+    # NOTE(review): the first positional argument is None - presumably no
+    # MLIR module is needed when loading a precompiled file; confirm.
+    shark_module = SharkInference(
+        None, device=device, mlir_dialect=mlir_dialect
+    )
+    shark_module.load_module(vmfb_path)
+    print("Successfully loaded vmfb")
+    return shark_module
--- a/apps/stable_diffusion/profiling_with_iree.md
+++ b/apps/stable_diffusion/profiling_with_iree.md
@@ -10,7 +10,7 @@ Vulkan AMD:
 iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb

 #  add --mlir-print-debuginfo --mlir-print-op-on-diagnostic=true for debug
-#  use –iree-input-type=mhlo for tf models
+#  use --iree-input-type=auto or "mhlo_legacy" or "stablehlo" for TF models

 CUDA NVIDIA:
 iree-compile --iree-input-type=none --iree-hal-target-backends=cuda --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
--- a/apps/stable_diffusion/scripts/init.py
+++ b/apps/stable_diffusion/scripts/init.py
@@ -1,5 +1 @@
-from apps.stable_diffusion.scripts.img2img import img2img_inf
-from apps.stable_diffusion.scripts.inpaint import inpaint_inf
-from apps.stable_diffusion.scripts.outpaint import outpaint_inf
-from apps.stable_diffusion.scripts.upscaler import upscaler_inf
 from apps.stable_diffusion.scripts.train_lora_word import lora_train
--- a/apps/stable_diffusion/scripts/img2img.py
+++ b/apps/stable_diffusion/scripts/img2img.py
@@ -7,6 +7,7 @@ from apps.stable_diffusion.src import (
    args,
    Image2ImagePipeline,
    StencilPipeline,
+    resize_stencil,
    get_schedulers,
    set_init_device_flags,
    utils,
@@ -16,273 +17,6 @@ from apps.stable_diffusion.src import (
 from apps.stable_diffusion.src.utils import get_generation_text_info


-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir
-
-
-# For stencil, the input image can be of any size but we need to ensure that
-# it conforms with our model contraints :-
-#   Both width and height should be in the range of [128, 768] and multiple of 8.
-# This utility function performs the transformation on the input image while
-# also maintaining the aspect ratio before sending it to the stencil pipeline.
-def resize_stencil(image: Image.Image):
-    width, height = image.size
-    aspect_ratio = width / height
-    min_size = min(width, height)
-    if min_size < 128:
-        n_size = 128
-        if width == min_size:
-            width = n_size
-            height = n_size / aspect_ratio
-        else:
-            height = n_size
-            width = n_size * aspect_ratio
-    width = int(width)
-    height = int(height)
-    n_width = width // 8
-    n_height = height // 8
-    n_width *= 8
-    n_height *= 8
-
-    min_size = min(width, height)
-    if min_size > 768:
-        n_size = 768
-        if width == min_size:
-            height = n_size
-            width = n_size * aspect_ratio
-        else:
-            width = n_size
-            height = n_size / aspect_ratio
-    width = int(width)
-    height = int(height)
-    n_width = width // 8
-    n_height = height // 8
-    n_width *= 8
-    n_height *= 8
-    new_image = image.resize((n_width, n_height))
-    return new_image, n_width, n_height
-
-
-# Exposed to UI.
-def img2img_inf(
-    prompt: str,
-    negative_prompt: str,
-    init_image,
-    height: int,
-    width: int,
-    steps: int,
-    strength: float,
-    guidance_scale: float,
-    seed: int,
-    batch_count: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    use_stencil: str,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-    lora_weights: str,
-    lora_hf_id: str,
-    ondemand: bool,
-):
-    from apps.stable_diffusion.web.ui.utils import (
-        get_custom_model_pathfile,
-        get_custom_vae_or_lora_weights,
-        Config,
-    )
-    import apps.stable_diffusion.web.utils.global_obj as global_obj
-    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
-        SD_STATE_CANCEL,
-    )
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.seed = seed
-    args.steps = steps
-    args.strength = strength
-    args.scheduler = scheduler
-    args.img_path = "not none"
-    args.ondemand = ondemand
-
-    if init_image is None:
-        return None, "An Initial Image is required"
-    image = init_image.convert("RGB")
-
-    # set ckpt_loc and hf_model_id.
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = get_custom_model_pathfile(custom_model)
-    else:
-        args.hf_model_id = custom_model
-
-    args.use_lora = get_custom_vae_or_lora_weights(
-        lora_weights, lora_hf_id, "lora"
-    )
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    use_stencil = None if use_stencil == "None" else use_stencil
-    args.use_stencil = use_stencil
-    if use_stencil is not None:
-        args.scheduler = "DDIM"
-        args.hf_model_id = "runwayml/stable-diffusion-v1-5"
-        image, width, height = resize_stencil(image)
-    elif args.scheduler != "PNDM":
-        if "Shark" in args.scheduler:
-            print(
-                f"SharkEulerDiscrete scheduler not supported. Switching to PNDM scheduler"
-            )
-            args.scheduler = "PNDM"
-        else:
-            sys.exit(
-                "Img2Img works best with PNDM scheduler. Other schedulers are not supported yet."
-            )
-    cpu_scheduling = not args.scheduler.startswith("Shark")
-    args.precision = precision
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    new_config_obj = Config(
-        "img2img",
-        args.hf_model_id,
-        args.ckpt_loc,
-        precision,
-        batch_size,
-        max_length,
-        height,
-        width,
-        device,
-        use_lora=args.use_lora,
-        use_stencil=use_stencil,
-    )
-    if (
-        not global_obj.get_sd_obj()
-        or global_obj.get_cfg_obj() != new_config_obj
-    ):
-        global_obj.clear_cache()
-        global_obj.set_cfg_obj(new_config_obj)
-        args.batch_count = batch_count
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.height = height
-        args.width = width
-        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-1-base"
-        )
-        global_obj.set_schedulers(get_schedulers(model_id))
-        scheduler_obj = global_obj.get_scheduler(args.scheduler)
-
-        if use_stencil is not None:
-            args.use_tuned = False
-            global_obj.set_sd_obj(
-                StencilPipeline.from_pretrained(
-                    scheduler_obj,
-                    args.import_mlir,
-                    args.hf_model_id,
-                    args.ckpt_loc,
-                    args.custom_vae,
-                    args.precision,
-                    args.max_length,
-                    args.batch_size,
-                    args.height,
-                    args.width,
-                    args.use_base_vae,
-                    args.use_tuned,
-                    low_cpu_mem_usage=args.low_cpu_mem_usage,
-                    use_stencil=use_stencil,
-                    debug=args.import_debug if args.import_mlir else False,
-                    use_lora=args.use_lora,
-                    ondemand=args.ondemand,
-                )
-            )
-        else:
-            global_obj.set_sd_obj(
-                Image2ImagePipeline.from_pretrained(
-                    scheduler_obj,
-                    args.import_mlir,
-                    args.hf_model_id,
-                    args.ckpt_loc,
-                    args.custom_vae,
-                    args.precision,
-                    args.max_length,
-                    args.batch_size,
-                    args.height,
-                    args.width,
-                    args.use_base_vae,
-                    args.use_tuned,
-                    low_cpu_mem_usage=args.low_cpu_mem_usage,
-                    debug=args.import_debug if args.import_mlir else False,
-                    use_lora=args.use_lora,
-                    ondemand=args.ondemand,
-                )
-            )
-
-    global_obj.set_sd_scheduler(args.scheduler)
-
-    start_time = time.time()
-    global_obj.get_sd_obj().log = ""
-    generated_imgs = []
-    seeds = []
-    img_seed = utils.sanitize_seed(seed)
-    extra_info = {"STRENGTH": strength}
-    text_output = ""
-    for current_batch in range(batch_count):
-        if current_batch > 0:
-            img_seed = utils.sanitize_seed(-1)
-        out_imgs = global_obj.get_sd_obj().generate_images(
-            prompt,
-            negative_prompt,
-            image,
-            batch_size,
-            height,
-            width,
-            steps,
-            strength,
-            guidance_scale,
-            img_seed,
-            args.max_length,
-            dtype,
-            args.use_base_vae,
-            cpu_scheduling,
-            use_stencil=use_stencil,
-        )
-        seeds.append(img_seed)
-        total_time = time.time() - start_time
-        text_output = get_generation_text_info(seeds, device)
-        text_output += "\n" + global_obj.get_sd_obj().log
-        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
-
-        if global_obj.get_sd_status() == SD_STATE_CANCEL:
-            break
-        else:
-            save_output_img(out_imgs[0], img_seed, extra_info)
-            generated_imgs.extend(out_imgs)
-            yield generated_imgs, text_output
-
-    return generated_imgs, text_output
-
-
 def main():
    if args.clear_all:
        clear_all()
@@ -300,16 +34,11 @@ def main():
        args.scheduler = "DDIM"
        args.hf_model_id = "runwayml/stable-diffusion-v1-5"
        image, args.width, args.height = resize_stencil(image)
-    elif args.scheduler != "PNDM":
-        if "Shark" in args.scheduler:
-            print(
-                f"SharkEulerDiscrete scheduler not supported. Switching to PNDM scheduler"
-            )
-            args.scheduler = "PNDM"
-        else:
-            sys.exit(
-                "Img2Img works best with PNDM scheduler. Other schedulers are not supported yet."
-            )
+    elif "Shark" in args.scheduler:
+        print(
+            f"Shark schedulers are not supported. Switching to EulerDiscrete scheduler"
+        )
+        args.scheduler = "EulerDiscrete"
    cpu_scheduling = not args.scheduler.startswith("Shark")
    dtype = torch.float32 if args.precision == "fp32" else torch.half
    set_init_device_flags()
--- a/apps/stable_diffusion/scripts/inpaint.py
+++ b/apps/stable_diffusion/scripts/inpaint.py
@@ -14,186 +14,6 @@ from apps.stable_diffusion.src import (
 from apps.stable_diffusion.src.utils import get_generation_text_info


-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir
-
-
-# Exposed to UI.
-def inpaint_inf(
-    prompt: str,
-    negative_prompt: str,
-    image_dict,
-    height: int,
-    width: int,
-    inpaint_full_res: bool,
-    inpaint_full_res_padding: int,
-    steps: int,
-    guidance_scale: float,
-    seed: int,
-    batch_count: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-    lora_weights: str,
-    lora_hf_id: str,
-    ondemand: bool,
-):
-    from apps.stable_diffusion.web.ui.utils import (
-        get_custom_model_pathfile,
-        get_custom_vae_or_lora_weights,
-        Config,
-    )
-    import apps.stable_diffusion.web.utils.global_obj as global_obj
-    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
-        SD_STATE_CANCEL,
-    )
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.steps = steps
-    args.scheduler = scheduler
-    args.img_path = "not none"
-    args.mask_path = "not none"
-    args.ondemand = ondemand
-
-    # set ckpt_loc and hf_model_id.
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = get_custom_model_pathfile(custom_model)
-    else:
-        args.hf_model_id = custom_model
-
-    args.use_lora = get_custom_vae_or_lora_weights(
-        lora_weights, lora_hf_id, "lora"
-    )
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    cpu_scheduling = not scheduler.startswith("Shark")
-    new_config_obj = Config(
-        "inpaint",
-        args.hf_model_id,
-        args.ckpt_loc,
-        precision,
-        batch_size,
-        max_length,
-        height,
-        width,
-        device,
-        use_lora=args.use_lora,
-        use_stencil=None,
-    )
-    if (
-        not global_obj.get_sd_obj()
-        or global_obj.get_cfg_obj() != new_config_obj
-    ):
-        global_obj.clear_cache()
-        global_obj.set_cfg_obj(new_config_obj)
-        args.precision = precision
-        args.batch_count = batch_count
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.height = height
-        args.width = width
-        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-inpainting"
-        )
-        global_obj.set_schedulers(get_schedulers(model_id))
-        scheduler_obj = global_obj.get_scheduler(scheduler)
-        global_obj.set_sd_obj(
-            InpaintPipeline.from_pretrained(
-                scheduler=scheduler_obj,
-                import_mlir=args.import_mlir,
-                model_id=args.hf_model_id,
-                ckpt_loc=args.ckpt_loc,
-                custom_vae=args.custom_vae,
-                precision=args.precision,
-                max_length=args.max_length,
-                batch_size=args.batch_size,
-                height=args.height,
-                width=args.width,
-                use_base_vae=args.use_base_vae,
-                use_tuned=args.use_tuned,
-                low_cpu_mem_usage=args.low_cpu_mem_usage,
-                debug=args.import_debug if args.import_mlir else False,
-                use_lora=args.use_lora,
-                ondemand=args.ondemand,
-            )
-        )
-
-    global_obj.set_sd_scheduler(scheduler)
-
-    start_time = time.time()
-    global_obj.get_sd_obj().log = ""
-    generated_imgs = []
-    seeds = []
-    img_seed = utils.sanitize_seed(seed)
-    image = image_dict["image"]
-    mask_image = image_dict["mask"]
-    text_output = ""
-    for i in range(batch_count):
-        if i > 0:
-            img_seed = utils.sanitize_seed(-1)
-        out_imgs = global_obj.get_sd_obj().generate_images(
-            prompt,
-            negative_prompt,
-            image,
-            mask_image,
-            batch_size,
-            height,
-            width,
-            inpaint_full_res,
-            inpaint_full_res_padding,
-            steps,
-            guidance_scale,
-            img_seed,
-            args.max_length,
-            dtype,
-            args.use_base_vae,
-            cpu_scheduling,
-        )
-        seeds.append(img_seed)
-        total_time = time.time() - start_time
-        text_output = get_generation_text_info(seeds, device)
-        text_output += "\n" + global_obj.get_sd_obj().log
-        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
-
-        if global_obj.get_sd_status() == SD_STATE_CANCEL:
-            break
-        else:
-            save_output_img(out_imgs[0], img_seed)
-            generated_imgs.extend(out_imgs)
-            yield generated_imgs, text_output
-
-    return generated_imgs, text_output
-
-
 def main():
    if args.clear_all:
        clear_all()
--- a/apps/stable_diffusion/scripts/outpaint.py
+++ b/apps/stable_diffusion/scripts/outpaint.py
@@ -11,196 +11,6 @@ from apps.stable_diffusion.src import (
    clear_all,
    save_output_img,
 )
-from apps.stable_diffusion.src.utils import get_generation_text_info
-
-
-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir
-
-
-# Exposed to UI.
-def outpaint_inf(
-    prompt: str,
-    negative_prompt: str,
-    init_image,
-    pixels: int,
-    mask_blur: int,
-    directions: list,
-    noise_q: float,
-    color_variation: float,
-    height: int,
-    width: int,
-    steps: int,
-    guidance_scale: float,
-    seed: int,
-    batch_count: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-    lora_weights: str,
-    lora_hf_id: str,
-    ondemand: bool,
-):
-    from apps.stable_diffusion.web.ui.utils import (
-        get_custom_model_pathfile,
-        get_custom_vae_or_lora_weights,
-        Config,
-    )
-    import apps.stable_diffusion.web.utils.global_obj as global_obj
-    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
-        SD_STATE_CANCEL,
-    )
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.steps = steps
-    args.scheduler = scheduler
-    args.img_path = "not none"
-    args.ondemand = ondemand
-
-    # set ckpt_loc and hf_model_id.
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = get_custom_model_pathfile(custom_model)
-    else:
-        args.hf_model_id = custom_model
-
-    args.use_lora = get_custom_vae_or_lora_weights(
-        lora_weights, lora_hf_id, "lora"
-    )
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    cpu_scheduling = not scheduler.startswith("Shark")
-    new_config_obj = Config(
-        "outpaint",
-        args.hf_model_id,
-        args.ckpt_loc,
-        precision,
-        batch_size,
-        max_length,
-        height,
-        width,
-        device,
-        use_lora=args.use_lora,
-        use_stencil=None,
-    )
-    if (
-        not global_obj.get_sd_obj()
-        or global_obj.get_cfg_obj() != new_config_obj
-    ):
-        global_obj.clear_cache()
-        global_obj.set_cfg_obj(new_config_obj)
-        args.precision = precision
-        args.batch_count = batch_count
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.height = height
-        args.width = width
-        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-inpainting"
-        )
-        global_obj.set_schedulers(get_schedulers(model_id))
-        scheduler_obj = global_obj.get_scheduler(scheduler)
-        global_obj.set_sd_obj(
-            OutpaintPipeline.from_pretrained(
-                scheduler_obj,
-                args.import_mlir,
-                args.hf_model_id,
-                args.ckpt_loc,
-                args.custom_vae,
-                args.precision,
-                args.max_length,
-                args.batch_size,
-                args.height,
-                args.width,
-                args.use_base_vae,
-                args.use_tuned,
-                use_lora=args.use_lora,
-                ondemand=args.ondemand,
-            )
-        )
-
-    global_obj.set_sd_scheduler(scheduler)
-
-    start_time = time.time()
-    global_obj.get_sd_obj().log = ""
-    generated_imgs = []
-    seeds = []
-    img_seed = utils.sanitize_seed(seed)
-
-    left = True if "left" in directions else False
-    right = True if "right" in directions else False
-    top = True if "up" in directions else False
-    bottom = True if "down" in directions else False
-
-    text_output = ""
-    for i in range(batch_count):
-        if i > 0:
-            img_seed = utils.sanitize_seed(-1)
-        out_imgs = global_obj.get_sd_obj().generate_images(
-            prompt,
-            negative_prompt,
-            init_image,
-            pixels,
-            mask_blur,
-            left,
-            right,
-            top,
-            bottom,
-            noise_q,
-            color_variation,
-            batch_size,
-            height,
-            width,
-            steps,
-            guidance_scale,
-            img_seed,
-            args.max_length,
-            dtype,
-            args.use_base_vae,
-            cpu_scheduling,
-        )
-        seeds.append(img_seed)
-        total_time = time.time() - start_time
-        text_output = get_generation_text_info(seeds, device)
-        text_output += "\n" + global_obj.get_sd_obj().log
-        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
-
-        if global_obj.get_sd_status() == SD_STATE_CANCEL:
-            break
-        else:
-            save_output_img(out_imgs[0], img_seed)
-            generated_imgs.extend(out_imgs)
-            yield generated_imgs, text_output
-
-    return generated_imgs, text_output


 def main():
--- a/apps/stable_diffusion/scripts/train_lora_word.py
+++ b/apps/stable_diffusion/scripts/train_lora_word.py
@@ -73,6 +73,7 @@ from apps.stable_diffusion.src import (
    set_init_device_flags,
    clear_all,
 )
+from apps.stable_diffusion.src.utils import update_lora_weight


 # Setup the dataset
@@ -159,6 +160,21 @@ class LoraDataset(Dataset):
        return example


+def torch_device(device):
+    """Convert a device label into a "<type>:<index>" style device string.
+
+    Examples (derived from the logic below):
+        "AMD => vulkan://0" -> "vulkan:0"
+        "metal://0"         -> "vulkan:0"
+        "cpu"               -> "cpu"
+
+    Args:
+        device: either "<display name> => <type>://<index>" or just the
+            part after "=>"; the "://<index>" suffix is optional.
+
+    Returns:
+        "<type>:<index>" when an index is present, otherwise "<type>".
+    """
+    # With a "=>" separator the text after the first one is used (the split
+    # is unbounded, so anything after a second "=>" is ignored); without a
+    # separator the whole string is the device spec.
+    device_tokens = device.split("=>")
+    if len(device_tokens) == 1:
+        device_str = device_tokens[0].strip()
+    else:
+        device_str = device_tokens[1].strip()
+    device_type_tokens = device_str.split("://")
+    # The "metal" device type is reported as "vulkan".
+    if device_type_tokens[0] == "metal":
+        device_type_tokens[0] = "vulkan"
+    if len(device_type_tokens) > 1:
+        return device_type_tokens[0] + ":" + device_type_tokens[1]
+    else:
+        return device_type_tokens[0]
+
+
 ########## Setting up the model ##########
 def lora_train(
    prompt: str,
@@ -177,6 +193,7 @@ def lora_train(
    max_length: int,
    training_images_dir: str,
    lora_save_dir: str,
+    use_lora: str,
 ):
    from apps.stable_diffusion.web.ui.utils import (
        get_custom_model_pathfile,
@@ -222,12 +239,8 @@ def lora_train(
    args.max_length = max_length
    args.height = height
    args.width = width
-    device_str = device.split("=>", 1)[1].strip().split("://")
-    if len(device_str) > 1:
-        device_str = device_str[0] + ":" + device_str[1]
-    else:
-        device_str = device_str[0]
-    args.device = device_str
+    args.device = torch_device(device)
+    args.use_lora = use_lora

    # Load the Stable Diffusion model
    text_encoder = CLIPTextModel.from_pretrained(
@@ -252,29 +265,33 @@ def lora_train(
    unet.to(args.device)
    text_encoder.to(args.device)

-    lora_attn_procs = {}
-    for name in unet.attn_processors.keys():
-        cross_attention_dim = (
-            None
-            if name.endswith("attn1.processor")
-            else unet.config.cross_attention_dim
-        )
-        if name.startswith("mid_block"):
-            hidden_size = unet.config.block_out_channels[-1]
-        elif name.startswith("up_blocks"):
-            block_id = int(name[len("up_blocks.")])
-            hidden_size = list(reversed(unet.config.block_out_channels))[
-                block_id
-            ]
-        elif name.startswith("down_blocks"):
-            block_id = int(name[len("down_blocks.")])
-            hidden_size = unet.config.block_out_channels[block_id]
+    if use_lora != "":
+        update_lora_weight(unet, args.use_lora, "unet")
+    else:
+        lora_attn_procs = {}
+        for name in unet.attn_processors.keys():
+            cross_attention_dim = (
+                None
+                if name.endswith("attn1.processor")
+                else unet.config.cross_attention_dim
+            )
+            if name.startswith("mid_block"):
+                hidden_size = unet.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(unet.config.block_out_channels))[
+                    block_id
+                ]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = unet.config.block_out_channels[block_id]

-        lora_attn_procs[name] = LoRACrossAttnProcessor(
-            hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
-        )
+            lora_attn_procs[name] = LoRACrossAttnProcessor(
+                hidden_size=hidden_size,
+                cross_attention_dim=cross_attention_dim,
+            )

-    unet.set_attn_processor(lora_attn_procs)
+        unet.set_attn_processor(lora_attn_procs)
    lora_layers = AttnProcsLayers(unet.attn_processors)

    class VaeModel(torch.nn.Module):
@@ -671,4 +688,5 @@ if __name__ == "__main__":
        args.max_length,
        args.training_images_dir,
        args.lora_save_dir,
+        args.use_lora,
    )
--- a/apps/stable_diffusion/scripts/tuner.py
+++ b/apps/stable_diffusion/scripts/tuner.py
@@ -0,0 +1,126 @@
+import os
+from pathlib import Path
+from shark_tuner.codegen_tuner import SharkCodegenTuner
+from shark_tuner.iree_utils import (
+    dump_dispatches,
+    create_context,
+    export_module_to_mlir_file,
+)
+from shark_tuner.model_annotation import model_annotation
+from apps.stable_diffusion.src.utils.stable_args import args
+from apps.stable_diffusion.src.utils.utils import set_init_device_flags
+from apps.stable_diffusion.src.utils.sd_annotation import (
+    get_device_args,
+    load_winograd_configs,
+)
+from apps.stable_diffusion.src.models import SharkifyStableDiffusionModel
+
+
+def load_mlir_module():
+    """Return ``(mlir_module, model_name)`` for ``args.annotation_model``.
+
+    Only "unet" and "vae" can be tuned; anything else raises ``ValueError``
+    before the model is constructed.
+    """
+    target = args.annotation_model
+    if target not in ("unet", "vae"):
+        raise ValueError(f"{target} is not supported for tuning.")
+
+    sd_model = SharkifyStableDiffusionModel(
+        args.hf_model_id,
+        args.ckpt_loc,
+        args.custom_vae,
+        args.precision,
+        max_len=args.max_length,
+        batch_size=args.batch_size,
+        height=args.height,
+        width=args.width,
+        use_base_vae=args.use_base_vae,
+        use_tuned=False,
+        low_cpu_mem_usage=args.low_cpu_mem_usage,
+        return_mlir=True,
+    )
+
+    # With return_mlir=True, sd_model.unet() / sd_model.vae() hand back the
+    # MLIR module rather than the compiled artifact.
+    return getattr(sd_model, target)(), sd_model.model_name[target]
+
+
+def main():
+    """Annotate the selected model, dump its dispatches and tune each one.
+
+    Intermediate files live under ``~/tmp``; every ``.mlir`` file found in
+    the dispatch directory is handed to a ``SharkCodegenTuner``.
+    """
+    args.use_tuned = False
+    set_init_device_flags()
+    mlir_module, model_name = load_mlir_module()
+
+    # Get device and device specific arguments
+    device, device_spec_args = get_device_args()
+    device_spec = ""
+    vulkan_target_triple = ""
+    if device_spec_args:
+        # Text after the last "=" of the last device argument.
+        device_spec = device_spec_args[-1].split("=")[-1].strip()
+        if device == "vulkan":
+            # Full triple goes to the tools; its first "-" field names files.
+            vulkan_target_triple = device_spec
+            device_spec = device_spec.split("-")[0]
+
+    # Add winograd annotation for vulkan device
+    tunable = args.annotation_model in ("unet", "vae")
+    use_winograd = device == "vulkan" and tunable
+    winograd_config = load_winograd_configs() if use_winograd else ""
+    with create_context() as ctx:
+        input_module = model_annotation(
+            ctx,
+            input_contents=mlir_module,
+            config_path=winograd_config,
+            search_op="conv",
+            winograd=use_winograd,
+        )
+
+    # Dump model dispatches
+    generates_dir = Path.home() / "tmp"
+    # exist_ok replaces the racy exists()-then-makedirs() pair.
+    os.makedirs(generates_dir, exist_ok=True)
+    dump_mlir = generates_dir / "temp.mlir"
+    dispatch_dir = generates_dir / f"{model_name}_{device_spec}_dispatches"
+    export_module_to_mlir_file(input_module, dump_mlir)
+    dump_dispatches(
+        dump_mlir,
+        device,
+        dispatch_dir,
+        vulkan_target_triple,
+        use_winograd=use_winograd,
+    )
+
+    # Tune each dispatch
+    dtype = "f16" if args.precision == "fp16" else "f32"
+    config_filename = f"{model_name}_{device_spec}_configs.json"
+
+    for f_path in os.listdir(dispatch_dir):
+        if not f_path.endswith(".mlir"):
+            continue
+
+        dispatch_path = os.path.join(dispatch_dir, f_path)
+
+        tuner = SharkCodegenTuner(
+            dispatch_path,
+            device,
+            "random",
+            args.num_iters,
+            args.tuned_config_dir,
+            dtype,
+            args.search_op,
+            batch_size=1,
+            config_filename=config_filename,
+            use_dispatch=True,
+            vulkan_target_triple=vulkan_target_triple,
+        )
+        tuner.tune()
+
+
+if __name__ == "__main__":
+    main()
--- a/apps/stable_diffusion/scripts/upscaler.py
+++ b/apps/stable_diffusion/scripts/upscaler.py
@@ -13,192 +13,6 @@ from apps.stable_diffusion.src import (
 )


-# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
-init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
-init_use_tuned = args.use_tuned
-init_import_mlir = args.import_mlir
-
-
-# Exposed to UI.
-def upscaler_inf(
-    prompt: str,
-    negative_prompt: str,
-    init_image,
-    height: int,
-    width: int,
-    steps: int,
-    noise_level: int,
-    guidance_scale: float,
-    seed: int,
-    batch_count: int,
-    batch_size: int,
-    scheduler: str,
-    custom_model: str,
-    hf_model_id: str,
-    precision: str,
-    device: str,
-    max_length: int,
-    save_metadata_to_json: bool,
-    save_metadata_to_png: bool,
-    lora_weights: str,
-    lora_hf_id: str,
-    ondemand: bool,
-):
-    from apps.stable_diffusion.web.ui.utils import (
-        get_custom_model_pathfile,
-        get_custom_vae_or_lora_weights,
-        Config,
-    )
-    import apps.stable_diffusion.web.utils.global_obj as global_obj
-
-    args.prompts = [prompt]
-    args.negative_prompts = [negative_prompt]
-    args.guidance_scale = guidance_scale
-    args.seed = seed
-    args.steps = steps
-    args.scheduler = scheduler
-    args.ondemand = ondemand
-
-    if init_image is None:
-        return None, "An Initial Image is required"
-    image = init_image.convert("RGB").resize((height, width))
-
-    # set ckpt_loc and hf_model_id.
-    args.ckpt_loc = ""
-    args.hf_model_id = ""
-    if custom_model == "None":
-        if not hf_model_id:
-            return (
-                None,
-                "Please provide either custom model or huggingface model ID, both must not be empty",
-            )
-        args.hf_model_id = hf_model_id
-    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
-        args.ckpt_loc = get_custom_model_pathfile(custom_model)
-    else:
-        args.hf_model_id = custom_model
-
-    args.save_metadata_to_json = save_metadata_to_json
-    args.write_metadata_to_png = save_metadata_to_png
-
-    args.use_lora = get_custom_vae_or_lora_weights(
-        lora_weights, lora_hf_id, "lora"
-    )
-
-    dtype = torch.float32 if precision == "fp32" else torch.half
-    cpu_scheduling = not scheduler.startswith("Shark")
-    args.height = 128
-    args.width = 128
-    new_config_obj = Config(
-        "upscaler",
-        args.hf_model_id,
-        args.ckpt_loc,
-        precision,
-        batch_size,
-        max_length,
-        args.height,
-        args.width,
-        device,
-        use_lora=args.use_lora,
-        use_stencil=None,
-    )
-    if (
-        not global_obj.get_sd_obj()
-        or global_obj.get_cfg_obj() != new_config_obj
-    ):
-        global_obj.clear_cache()
-        global_obj.set_cfg_obj(new_config_obj)
-        args.batch_size = batch_size
-        args.max_length = max_length
-        args.device = device.split("=>", 1)[1].strip()
-        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
-        args.use_tuned = init_use_tuned
-        args.import_mlir = init_import_mlir
-        set_init_device_flags()
-        model_id = (
-            args.hf_model_id
-            if args.hf_model_id
-            else "stabilityai/stable-diffusion-2-1-base"
-        )
-        global_obj.set_schedulers(get_schedulers(model_id))
-        scheduler_obj = global_obj.get_scheduler(scheduler)
-        global_obj.set_sd_obj(
-            UpscalerPipeline.from_pretrained(
-                scheduler_obj,
-                args.import_mlir,
-                args.hf_model_id,
-                args.ckpt_loc,
-                args.custom_vae,
-                args.precision,
-                args.max_length,
-                args.batch_size,
-                args.height,
-                args.width,
-                args.use_base_vae,
-                args.use_tuned,
-                low_cpu_mem_usage=args.low_cpu_mem_usage,
-                use_lora=args.use_lora,
-                ondemand=args.ondemand,
-            )
-        )
-
-    global_obj.set_sd_scheduler(scheduler)
-    global_obj.get_sd_obj().low_res_scheduler = global_obj.get_scheduler(
-        "DDPM"
-    )
-
-    start_time = time.time()
-    global_obj.get_sd_obj().log = ""
-    generated_imgs = []
-    seeds = []
-    img_seed = utils.sanitize_seed(seed)
-    extra_info = {"NOISE LEVEL": noise_level}
-    for current_batch in range(batch_count):
-        if current_batch > 0:
-            img_seed = utils.sanitize_seed(-1)
-        low_res_img = image
-        high_res_img = Image.new("RGB", (height * 4, width * 4))
-
-        for i in range(0, width, 128):
-            for j in range(0, height, 128):
-                box = (j, i, j + 128, i + 128)
-                upscaled_image = global_obj.get_sd_obj().generate_images(
-                    prompt,
-                    negative_prompt,
-                    low_res_img.crop(box),
-                    batch_size,
-                    args.height,
-                    args.width,
-                    steps,
-                    noise_level,
-                    guidance_scale,
-                    img_seed,
-                    args.max_length,
-                    dtype,
-                    args.use_base_vae,
-                    cpu_scheduling,
-                )
-                high_res_img.paste(upscaled_image[0], (j * 4, i * 4))
-
-        save_output_img(high_res_img, img_seed, extra_info)
-        generated_imgs.append(high_res_img)
-        seeds.append(img_seed)
-        global_obj.get_sd_obj().log += "\n"
-        yield generated_imgs, global_obj.get_sd_obj().log
-
-    total_time = time.time() - start_time
-    text_output = f"prompt={args.prompts}"
-    text_output += f"\nnegative prompt={args.negative_prompts}"
-    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
-    text_output += f"\nscheduler={args.scheduler}, device={device}"
-    text_output += f"\nsteps={steps}, noise_level={noise_level}, guidance_scale={guidance_scale}, seed={seeds}"
-    text_output += f"\nsize={height}x{width}, batch_count={batch_count}, batch_size={batch_size}, max_length={args.max_length}"
-    text_output += global_obj.get_sd_obj().log
-    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
-
-    yield generated_imgs, text_output
-
-
 if __name__ == "__main__":
    if args.clear_all:
        clear_all()
--- a/apps/stable_diffusion/shark_sd.spec
+++ b/apps/stable_diffusion/shark_sd.spec
@@ -29,6 +29,9 @@ datas += collect_data_files('gradio_client')
 datas += collect_data_files('iree')
 datas += collect_data_files('google-cloud-storage')
 datas += collect_data_files('shark')
+datas += collect_data_files('tkinter')
+datas += collect_data_files('webview')
+datas += collect_data_files('sentencepiece')
 datas += [
         ( 'src/utils/resources/prompts.json', 'resources' ),
         ( 'src/utils/resources/model_db.json', 'resources' ),
@@ -44,6 +47,7 @@ block_cipher = None

 hiddenimports = ['shark', 'shark.shark_inference', 'apps']
 hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
+hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]

 a = Analysis(
    ['web/index.py'],
--- a/apps/stable_diffusion/shark_sd_cli.spec
+++ b/apps/stable_diffusion/shark_sd_cli.spec
@@ -42,6 +42,7 @@ block_cipher = None

 hiddenimports = ['shark', 'shark.shark_inference', 'apps']
 hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
+hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]

 a = Analysis(
    ['scripts/main.py'],
--- a/apps/stable_diffusion/src/init.py
+++ b/apps/stable_diffusion/src/init.py
@@ -5,6 +5,7 @@ from apps.stable_diffusion.src.utils import (
    get_available_devices,
    clear_all,
    save_output_img,
+    resize_stencil,
 )
 from apps.stable_diffusion.src.pipelines import (
    Text2ImagePipeline,
--- a/apps/stable_diffusion/src/models/model_wrappers.py
+++ b/apps/stable_diffusion/src/models/model_wrappers.py
@@ -1,9 +1,11 @@
 from diffusers import AutoencoderKL, UNet2DConditionModel, ControlNetModel
 from transformers import CLIPTextModel
 from collections import defaultdict
+from pathlib import Path
 import torch
 import safetensors.torch
 import traceback
+import subprocess
 import sys
 import os
 from apps.stable_diffusion.src.utils import (
@@ -11,8 +13,8 @@ from apps.stable_diffusion.src.utils import (
    get_opt_flags,
    base_models,
    args,
-    fetch_vmfb,
    preprocessCKPT,
+    convert_original_vae,
    get_path_to_diffusers_checkpoint,
    fetch_and_update_base_model_id,
    get_path_stem,
@@ -82,6 +84,7 @@ class SharkifyStableDiffusionModel:
        use_stencil: str = None,
        use_lora: str = "",
        use_quantize: str = None,
+        return_mlir: bool = False,
    ):
        self.check_params(max_len, width, height)
        self.max_len = max_len
@@ -91,10 +94,19 @@ class SharkifyStableDiffusionModel:
        self.custom_weights = custom_weights
        self.use_quantize = use_quantize
        if custom_weights != "":
-            assert custom_weights.lower().endswith(
-                (".ckpt", ".safetensors")
-            ), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
-            custom_weights = get_path_to_diffusers_checkpoint(custom_weights)
+            if "civitai" in custom_weights:
+                weights_id = custom_weights.split("/")[-1]
+                # TODO: use model name and identify file type by civitai rest api
+                weights_path = str(Path.cwd()) + "/models/" + weights_id + ".safetensors"
+                if not os.path.isfile(weights_path):
+                    subprocess.run(["wget", custom_weights, "-O", weights_path])
+                custom_weights = get_path_to_diffusers_checkpoint(weights_path)
+                self.custom_weights = weights_path
+            else:
+                assert custom_weights.lower().endswith(
+                    (".ckpt", ".safetensors")
+                ), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
+                custom_weights = get_path_to_diffusers_checkpoint(custom_weights)
        self.model_id = model_id if custom_weights == "" else custom_weights
        # TODO: remove the following line when stable-diffusion-2-1 works
        if self.model_id == "stabilityai/stable-diffusion-2-1":
@@ -147,6 +159,7 @@ class SharkifyStableDiffusionModel:
        self.base_model_id = fetch_and_update_base_model_id(self.model_to_run)
        if self.base_model_id != "" and args.ckpt_loc != "":
            args.hf_model_id = self.base_model_id
+        self.return_mlir = return_mlir

    def get_extended_name_for_all_model(self):
        model_name = {}
@@ -160,6 +173,8 @@ class SharkifyStableDiffusionModel:
                    model_config = model_config + get_path_stem(self.custom_vae)
                if self.base_vae:
                    sub_model = "base_vae"
+            if "stencil_adaptor" == model and self.use_stencil is not None:
+                model_config = model_config + get_path_stem(self.use_stencil)
            model_name[model] = get_extended_name(sub_model + model_config)
            index += 1
        return model_name
@@ -211,17 +226,20 @@ class SharkifyStableDiffusionModel:

        vae_encode = VaeEncodeModel()
        inputs = tuple(self.inputs["vae_encode"])
-        is_f16 = True if self.precision == "fp16" else False
-        shark_vae_encode = compile_through_fx(
+        is_f16 = True if not self.is_upscaler and self.precision == "fp16" else False
+        shark_vae_encode, vae_encode_mlir = compile_through_fx(
            vae_encode,
            inputs,
            is_f16=is_f16,
            use_tuned=self.use_tuned,
-            model_name=self.model_name["vae_encode"],
+            extended_model_name=self.model_name["vae_encode"],
            extra_args=get_opt_flags("vae", precision=self.precision),
            base_model_id=self.base_model_id,
+            model_name="vae_encode",
+            precision=self.precision,
+            return_mlir=self.return_mlir,
        )
-        return shark_vae_encode
+        return shark_vae_encode, vae_encode_mlir

    def get_vae(self):
        class VaeModel(torch.nn.Module):
@@ -261,23 +279,26 @@ class SharkifyStableDiffusionModel:

        vae = VaeModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
        inputs = tuple(self.inputs["vae"])
-        is_f16 = True if self.precision == "fp16" else False
+        is_f16 = True if not self.is_upscaler and self.precision == "fp16" else False
        save_dir = os.path.join(self.sharktank_dir, self.model_name["vae"])
        if self.debug:
            os.makedirs(save_dir, exist_ok=True)
-        shark_vae = compile_through_fx(
+        shark_vae, vae_mlir = compile_through_fx(
            vae,
            inputs,
            is_f16=is_f16,
            use_tuned=self.use_tuned,
-            model_name=self.model_name["vae"],
+            extended_model_name=self.model_name["vae"],
            debug=self.debug,
            generate_vmfb=self.generate_vmfb,
            save_dir=save_dir,
            extra_args=get_opt_flags("vae", precision=self.precision),
            base_model_id=self.base_model_id,
+            model_name="vae",
+            precision=self.precision,
+            return_mlir=self.return_mlir,
        )
-        return shark_vae
+        return shark_vae, vae_mlir

    def get_controlled_unet(self):
        class ControlledUnetModel(torch.nn.Module):
@@ -322,17 +343,20 @@ class SharkifyStableDiffusionModel:

        inputs = tuple(self.inputs["unet"])
        input_mask = [True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True,]
-        shark_controlled_unet = compile_through_fx(
+        shark_controlled_unet, controlled_unet_mlir = compile_through_fx(
            unet,
            inputs,
-            model_name=self.model_name["stencil_unet"],
+            extended_model_name=self.model_name["stencil_unet"],
            is_f16=is_f16,
            f16_input_mask=input_mask,
            use_tuned=self.use_tuned,
            extra_args=get_opt_flags("unet", precision=self.precision),
            base_model_id=self.base_model_id,
+            model_name="stencil_unet",
+            precision=self.precision,
+            return_mlir=self.return_mlir,
        )
-        return shark_controlled_unet
+        return shark_controlled_unet, controlled_unet_mlir

    def get_control_net(self):
        class StencilControlNetModel(torch.nn.Module):
@@ -376,17 +400,20 @@ class SharkifyStableDiffusionModel:

        inputs = tuple(self.inputs["stencil_adaptor"])
        input_mask = [True, True, True, True]
-        shark_cnet = compile_through_fx(
+        shark_cnet, cnet_mlir = compile_through_fx(
            scnet,
            inputs,
-            model_name=self.model_name["stencil_adaptor"],
+            extended_model_name=self.model_name["stencil_adaptor"],
            is_f16=is_f16,
            f16_input_mask=input_mask,
            use_tuned=self.use_tuned,
            extra_args=get_opt_flags("unet", precision=self.precision),
            base_model_id=self.base_model_id,
+            model_name="stencil_adaptor",
+            precision=self.precision,
+            return_mlir=self.return_mlir,
        )
-        return shark_cnet
+        return shark_cnet, cnet_mlir

    def get_unet(self):
        class UnetModel(torch.nn.Module):
@@ -432,10 +459,10 @@ class SharkifyStableDiffusionModel:
                save_dir,
                exist_ok=True,
            )
-        shark_unet = compile_through_fx(
+        shark_unet, unet_mlir = compile_through_fx(
            unet,
            inputs,
-            model_name=self.model_name["unet"],
+            extended_model_name=self.model_name["unet"],
            is_f16=is_f16,
            f16_input_mask=input_mask,
            use_tuned=self.use_tuned,
@@ -444,8 +471,11 @@ class SharkifyStableDiffusionModel:
            save_dir=save_dir,
            extra_args=get_opt_flags("unet", precision=self.precision),
            base_model_id=self.base_model_id,
+            model_name="unet",
+            precision=self.precision,
+            return_mlir=self.return_mlir,
        )
-        return shark_unet
+        return shark_unet, unet_mlir

    def get_unet_upscaler(self):
        class UnetModel(torch.nn.Module):
@@ -473,17 +503,20 @@ class SharkifyStableDiffusionModel:
        is_f16 = True if self.precision == "fp16" else False
        inputs = tuple(self.inputs["unet"])
        input_mask = [True, True, True, False]
-        shark_unet = compile_through_fx(
+        shark_unet, unet_mlir = compile_through_fx(
            unet,
            inputs,
-            model_name=self.model_name["unet"],
+            extended_model_name=self.model_name["unet"],
            is_f16=is_f16,
            f16_input_mask=input_mask,
            use_tuned=self.use_tuned,
            extra_args=get_opt_flags("unet", precision=self.precision),
            base_model_id=self.base_model_id,
+            model_name="unet",
+            precision=self.precision,
+            return_mlir=self.return_mlir,
        )
-        return shark_unet
+        return shark_unet, unet_mlir

    def get_clip(self):
        class CLIPText(torch.nn.Module):
@@ -507,17 +540,20 @@ class SharkifyStableDiffusionModel:
                save_dir,
                exist_ok=True,
            )
-        shark_clip = compile_through_fx(
+        shark_clip, clip_mlir = compile_through_fx(
            clip_model,
            tuple(self.inputs["clip"]),
-            model_name=self.model_name["clip"],
+            extended_model_name=self.model_name["clip"],
            debug=self.debug,
            generate_vmfb=self.generate_vmfb,
            save_dir=save_dir,
            extra_args=get_opt_flags("clip", precision="fp32"),
            base_model_id=self.base_model_id,
+            model_name="clip",
+            precision=self.precision,
+            return_mlir=self.return_mlir,
        )
-        return shark_clip
+        return shark_clip, clip_mlir

    def process_custom_vae(self):
        custom_vae = self.custom_vae.lower()
@@ -536,8 +572,12 @@ class SharkifyStableDiffusionModel:
                vae_checkpoint = safetensors.torch.load_file(self.custom_vae, device="cpu")
            if "state_dict" in vae_checkpoint:
                vae_checkpoint = vae_checkpoint["state_dict"]
-            vae_dict = {k: v for k, v in vae_checkpoint.items() if k[0:4] != "loss" and k not in vae_ignore_keys}
-            return vae_dict
+
+            try:
+                vae_checkpoint = convert_original_vae(vae_checkpoint)
+            finally:
+                vae_dict = {k: v for k, v in vae_checkpoint.items() if k[0:4] != "loss" and k not in vae_ignore_keys}
+                return vae_dict

    def compile_unet_variants(self, model):
        if model == "unet":
@@ -553,54 +593,45 @@ class SharkifyStableDiffusionModel:
            return self.get_controlled_unet()

    def vae_encode(self):
-        # Fetch vmfb for the model if present
-        vmfb = fetch_vmfb("vae_encode", self.model_name["vae_encode"], self.precision)
-        if vmfb:
-            return vmfb
-
        try:
            self.inputs["vae_encode"] = self.get_input_info_for(base_models["vae_encode"])
-            compiled_vae_encode = self.get_vae_encode()
+            compiled_vae_encode, vae_encode_mlir = self.get_vae_encode()

            check_compilation(compiled_vae_encode, "Vae Encode")
+            if self.return_mlir:
+                return vae_encode_mlir
            return compiled_vae_encode
        except Exception as e:
            sys.exit(e)

    def clip(self):
-        vmfb = fetch_vmfb("clip", self.model_name["clip"], self.precision)
-        if vmfb:
-            return vmfb
-
        try:
            self.inputs["clip"] = self.get_input_info_for(base_models["clip"])
-            compiled_clip = self.get_clip()
+            compiled_clip, clip_mlir = self.get_clip()

            check_compilation(compiled_clip, "Clip")
+            if self.return_mlir:
+                return clip_mlir
            return compiled_clip
        except Exception as e:
            sys.exit(e)

    def unet(self):
-        model = "stencil_unet" if self.use_stencil is not None else "unet"
-        vmfb = fetch_vmfb(model, self.model_name[model], self.precision)
-        if vmfb:
-            return vmfb
-
        try:
+            model = "stencil_unet" if self.use_stencil is not None else "unet"
            compiled_unet = None
            unet_inputs = base_models[model]

            if self.base_model_id != "":
                self.inputs["unet"] = self.get_input_info_for(unet_inputs[self.base_model_id])
-                compiled_unet = self.compile_unet_variants(model)
+                compiled_unet, unet_mlir = self.compile_unet_variants(model)
            else:
                for model_id in unet_inputs:
                    self.base_model_id = model_id
                    self.inputs["unet"] = self.get_input_info_for(unet_inputs[model_id])

                    try:
-                        compiled_unet = self.compile_unet_variants(model)
+                        compiled_unet, unet_mlir = self.compile_unet_variants(model)
                    except Exception as e:
                        print(e)
                        print("Retrying with a different base model configuration")
@@ -618,15 +649,13 @@ class SharkifyStableDiffusionModel:
                    break

            check_compilation(compiled_unet, "Unet")
+            if self.return_mlir:
+                return unet_mlir
            return compiled_unet
        except Exception as e:
            sys.exit(e)

    def vae(self):
-        vmfb = fetch_vmfb("vae", self.model_name["vae"], self.precision)
-        if vmfb:
-            return vmfb
-
        try:
            vae_input = base_models["vae"]["vae_upscaler"] if self.is_upscaler else base_models["vae"]["vae"]
            self.inputs["vae"] = self.get_input_info_for(vae_input)
@@ -634,24 +663,24 @@ class SharkifyStableDiffusionModel:
            is_base_vae = self.base_vae
            if self.is_upscaler:
                self.base_vae = True
-            compiled_vae = self.get_vae()
+            compiled_vae, vae_mlir = self.get_vae()
            self.base_vae = is_base_vae

            check_compilation(compiled_vae, "Vae")
+            if self.return_mlir:
+                return vae_mlir
            return compiled_vae
        except Exception as e:
            sys.exit(e)

    def controlnet(self):
-        vmfb = fetch_vmfb("stencil_adaptor", self.model_name["stencil_adaptor"], self.precision)
-        if vmfb:
-            return vmfb
-
        try:
            self.inputs["stencil_adaptor"] = self.get_input_info_for(base_models["stencil_adaptor"])
-            compiled_stencil_adaptor = self.get_control_net()
+            compiled_stencil_adaptor, controlnet_mlir = self.get_control_net()

            check_compilation(compiled_stencil_adaptor, "Stencil")
+            if self.return_mlir:
+                return controlnet_mlir
            return compiled_stencil_adaptor
        except Exception as e:
            sys.exit(e)
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_img2img.py
@@ -155,7 +155,9 @@ class Image2ImagePipeline(StableDiffusionPipeline):
        generator = torch.manual_seed(seed)

        # Get text embeddings with weight emphasis from prompts
-        text_embeddings = self.encode_prompts_weight(prompts, neg_prompts)
+        text_embeddings = self.encode_prompts_weight(
+            prompts, neg_prompts, max_length
+        )

        # guidance scale as a float32 tensor.
        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_inpaint.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_inpaint.py
@@ -407,7 +407,9 @@ class InpaintPipeline(StableDiffusionPipeline):
        )

        # Get text embeddings with weight emphasis from prompts
-        text_embeddings = self.encode_prompts_weight(prompts, neg_prompts)
+        text_embeddings = self.encode_prompts_weight(
+            prompts, neg_prompts, max_length
+        )

        # guidance scale as a float32 tensor.
        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_outpaint.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_outpaint.py
@@ -408,7 +408,9 @@ class OutpaintPipeline(StableDiffusionPipeline):
        )

        # Get text embeddings with weight emphasis from prompts
-        text_embeddings = self.encode_prompts_weight(prompts, neg_prompts)
+        text_embeddings = self.encode_prompts_weight(
+            prompts, neg_prompts, max_length
+        )

        # guidance scale as a float32 tensor.
        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
@@ -537,8 +539,6 @@ class OutpaintPipeline(StableDiffusionPipeline):
                    cpu_scheduling=cpu_scheduling,
                )
                all_imgs.extend(imgs)
-            if self.ondemand:
-                self.unload_vae()

            res_img = all_imgs[0].resize(
                (image_to_process.width, image_to_process.height)
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_stencil.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_stencil.py
@@ -229,7 +229,9 @@ class StencilPipeline(StableDiffusionPipeline):
        generator = torch.manual_seed(seed)

        # Get text embeddings with weight emphasis from prompts
-        text_embeddings = self.encode_prompts_weight(prompts, neg_prompts)
+        text_embeddings = self.encode_prompts_weight(
+            prompts, neg_prompts, max_length
+        )

        # guidance scale as a float32 tensor.
        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_txt2img.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_txt2img.py
@@ -111,7 +111,9 @@ class Text2ImagePipeline(StableDiffusionPipeline):
        )

        # Get text embeddings with weight emphasis from prompts
-        text_embeddings = self.encode_prompts_weight(prompts, neg_prompts)
+        text_embeddings = self.encode_prompts_weight(
+            prompts, neg_prompts, max_length
+        )

        # guidance scale as a float32 tensor.
        guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_upscaler.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_upscaler.py
@@ -20,6 +20,8 @@ from diffusers import (
 )
 from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
 from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
+    SD_STATE_IDLE,
+    SD_STATE_CANCEL,
    StableDiffusionPipeline,
 )
 from apps.stable_diffusion.src.utils import (
@@ -84,6 +86,7 @@ class UpscalerPipeline(StableDiffusionPipeline):
    ):
        super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
        self.low_res_scheduler = low_res_scheduler
+        self.status = SD_STATE_IDLE

    def prepare_extra_step_kwargs(self, generator, eta):
        accepts_eta = "eta" in set(
@@ -164,6 +167,7 @@ class UpscalerPipeline(StableDiffusionPipeline):
        latent_history = [latents]
        text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
        text_embeddings_numpy = text_embeddings.detach().numpy()
+        self.status = SD_STATE_IDLE
        self.load_unet()
        for i, t in tqdm(enumerate(total_timesteps)):
            step_start_time = time.time()
@@ -210,6 +214,9 @@ class UpscalerPipeline(StableDiffusionPipeline):
            #  )
            step_time_sum += step_time

+            if self.status == SD_STATE_CANCEL:
+                break
+
        if self.ondemand:
            self.unload_unet()
        avg_step_time = step_time_sum / len(total_timesteps)
@@ -256,7 +263,9 @@ class UpscalerPipeline(StableDiffusionPipeline):
        generator = torch.manual_seed(seed)

        # Get text embeddings with weight emphasis from prompts
-        text_embeddings = self.encode_prompts_weight(prompts, neg_prompts)
+        text_embeddings = self.encode_prompts_weight(
+            prompts, neg_prompts, max_length
+        )

        # 4. Preprocess image
        image = preprocess(image).to(dtype)
--- a/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
+++ b/apps/stable_diffusion/src/pipelines/pipeline_shark_stable_diffusion_utils.py
@@ -57,7 +57,7 @@ class StableDiffusionPipeline:
        self.vae = None
        self.text_encoder = None
        self.unet = None
-        self.tokenizer = get_tokenizer()
+        self.model_max_length = 77
        self.scheduler = scheduler
        # TODO: Implement using logging python utility.
        self.log = ""
@@ -66,6 +66,13 @@ class StableDiffusionPipeline:
        self.import_mlir = import_mlir
        self.use_lora = use_lora
        self.ondemand = ondemand
+        # TODO: Find a better workaround for fetching base_model_id early enough for CLIPTokenizer.
+        try:
+            self.tokenizer = get_tokenizer()
+        except:
+            self.load_unet()
+            self.unload_unet()
+            self.tokenizer = get_tokenizer()

    def load_clip(self):
        if self.text_encoder is not None:
@@ -80,7 +87,8 @@ class StableDiffusionPipeline:
        else:
            try:
                self.text_encoder = get_clip()
-            except:
+            except Exception as e:
+                print(e)
                print("download pipeline failed, falling back to import_mlir")
                self.text_encoder = self.sd_model.clip()

@@ -97,7 +105,8 @@ class StableDiffusionPipeline:
        else:
            try:
                self.unet = get_unet()
-            except:
+            except Exception as e:
+                print(e)
                print("download pipeline failed, falling back to import_mlir")
                self.unet = self.sd_model.unet()

@@ -114,7 +123,8 @@ class StableDiffusionPipeline:
        else:
            try:
                self.vae = get_vae()
-            except:
+            except Exception as e:
+                print(e)
                print("download pipeline failed, falling back to import_mlir")
                self.vae = self.sd_model.vae()

@@ -146,7 +156,8 @@ class StableDiffusionPipeline:
        clip_inf_start = time.time()
        text_embeddings = self.text_encoder("forward", (text_input,))
        clip_inf_time = (time.time() - clip_inf_start) * 1000
-        # self.unload_clip()
+        if self.ondemand:
+            self.unload_clip()
        self.log += f"\nClip Inference time (ms) = {clip_inf_time:.3f}"

        return text_embeddings
@@ -337,6 +348,7 @@ class StableDiffusionPipeline:
        self,
        prompt,
        negative_prompt,
+        model_max_length,
        do_classifier_free_guidance=True,
        max_embeddings_multiples=1,
        num_images_per_prompt=1,
@@ -349,6 +361,8 @@ class StableDiffusionPipeline:
            negative_prompt (`str` or `List[str]`):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
+            model_max_length (int):
+                SHARK: pass the max length instead of relying on pipe.tokenizer.model_max_length
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not,
                SHARK: must be set to True as we always expect neg embeddings (defaulted to True)
@@ -360,7 +374,8 @@ class StableDiffusionPipeline:
                SHARK: num_images_per_prompt is not used (defaulted to 1)
        """

-        # SHARK: Load the clip and prepare inference time
+        # SHARK: Save model_max_length, load the clip and init inference time
+        self.model_max_length = model_max_length
        self.load_clip()
        clip_inf_start = time.time()

@@ -399,7 +414,8 @@ class StableDiffusionPipeline:

        # SHARK: Report clip inference time
        clip_inf_time = (time.time() - clip_inf_start) * 1000
-        # self.unload_clip()
+        if self.ondemand:
+            self.unload_clip()
        self.log += f"\nClip Inference time (ms) = {clip_inf_time:.3f}"

        return text_embeddings.numpy()
@@ -689,9 +705,7 @@ def get_weighted_text_embeddings(
        skip_weighting (`bool`, *optional*, defaults to `False`):
            Skip the weighting. When the parsing is skipped, it is forced True.
    """
-    max_length = (
-        pipe.tokenizer.model_max_length - 2
-    ) * max_embeddings_multiples + 2
+    max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2
    if isinstance(prompt, str):
        prompt = [prompt]

@@ -733,12 +747,10 @@ def get_weighted_text_embeddings(

    max_embeddings_multiples = min(
        max_embeddings_multiples,
-        (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
+        (max_length - 1) // (pipe.model_max_length - 2) + 1,
    )
    max_embeddings_multiples = max(1, max_embeddings_multiples)
-    max_length = (
-        pipe.tokenizer.model_max_length - 2
-    ) * max_embeddings_multiples + 2
+    max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2

    # pad the length of tokens and weights
    bos = pipe.tokenizer.bos_token_id
@@ -750,7 +762,7 @@ def get_weighted_text_embeddings(
        bos,
        eos,
        no_boseos_middle=no_boseos_middle,
-        chunk_length=pipe.tokenizer.model_max_length,
+        chunk_length=pipe.model_max_length,
    )
    # prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=pipe.device)
    prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device="cpu")
@@ -762,7 +774,7 @@ def get_weighted_text_embeddings(
            bos,
            eos,
            no_boseos_middle=no_boseos_middle,
-            chunk_length=pipe.tokenizer.model_max_length,
+            chunk_length=pipe.model_max_length,
        )
        # uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=pipe.device)
        uncond_tokens = torch.tensor(
@@ -773,7 +785,7 @@ def get_weighted_text_embeddings(
    text_embeddings = get_unweighted_text_embeddings(
        pipe,
        prompt_tokens,
-        pipe.tokenizer.model_max_length,
+        pipe.model_max_length,
        no_boseos_middle=no_boseos_middle,
    )
    # prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=pipe.device)
@@ -784,7 +796,7 @@ def get_weighted_text_embeddings(
        uncond_embeddings = get_unweighted_text_embeddings(
            pipe,
            uncond_tokens,
-            pipe.tokenizer.model_max_length,
+            pipe.model_max_length,
            no_boseos_middle=no_boseos_middle,
        )
        # uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=pipe.device)
--- a/apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py
+++ b/apps/stable_diffusion/src/schedulers/shark_eulerdiscrete.py
@@ -40,6 +40,7 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
    def compile(self):
        SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
        BATCH_SIZE = args.batch_size
+        device = args.device.split(":", 1)[0].strip()

        model_input = {
            "euler": {
@@ -89,19 +90,19 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):

        def _import(self):
            scaling_model = ScalingModel()
-            self.scaling_model = compile_through_fx(
+            self.scaling_model, _ = compile_through_fx(
                model=scaling_model,
                inputs=(example_latent, example_sigma),
-                model_name=f"euler_scale_model_input_{BATCH_SIZE}_{args.height}_{args.width}"
+                extended_model_name=f"euler_scale_model_input_{BATCH_SIZE}_{args.height}_{args.width}_{device}_"
                + args.precision,
                extra_args=iree_flags,
            )

            step_model = SchedulerStepModel()
-            self.step_model = compile_through_fx(
+            self.step_model, _ = compile_through_fx(
                step_model,
                (example_output, example_sigma, example_latent, example_dt),
-                model_name=f"euler_step_{BATCH_SIZE}_{args.height}_{args.width}"
+                extended_model_name=f"euler_step_{BATCH_SIZE}_{args.height}_{args.width}_{device}_"
                + args.precision,
                extra_args=iree_flags,
            )
--- a/apps/stable_diffusion/src/utils/init.py
+++ b/apps/stable_diffusion/src/utils/init.py
@@ -24,14 +24,17 @@ from apps.stable_diffusion.src.utils.utils import (
    get_available_devices,
    get_opt_flags,
    preprocessCKPT,
-    fetch_vmfb,
+    convert_original_vae,
    fetch_and_update_base_model_id,
    get_path_to_diffusers_checkpoint,
    sanitize_seed,
    get_path_stem,
    get_extended_name,
+    get_generated_imgs_path,
+    get_generated_imgs_todays_subdir,
    clear_all,
    save_output_img,
    get_generation_text_info,
    update_lora_weight,
+    resize_stencil,
 )
--- a/apps/stable_diffusion/src/utils/resources/model_db.json
+++ b/apps/stable_diffusion/src/utils/resources/model_db.json
@@ -3,7 +3,7 @@
    "stablediffusion/untuned":"gs://shark_tank/nightly"
  },
  {
-    "stablediffusion/v1_4/unet/fp16/length_64/untuned":"unet_1_64_512_512_fp16_stable-diffusion-2-1-base_vulkan",
+    "stablediffusion/v1_4/unet/fp16/length_64/untuned":"unet_1_64_512_512_fp16_stable-diffusion-v1-4_vulkan",
    "stablediffusion/v1_4/vae/fp16/length_77/untuned":"vae_1_64_512_512_fp16_stable-diffusion-v1-4_vulkan",
    "stablediffusion/v1_4/vae/fp16/length_64/untuned":"vae_1_64_512_512_fp16_stable-diffusion-v1-4_vulkan",
    "stablediffusion/v1_4/clip/fp32/length_64/untuned":"clip_1_64_512_512_fp16_stable-diffusion-v1-4_vulkan",
--- a/apps/stable_diffusion/src/utils/resources/opt_flags.json
+++ b/apps/stable_diffusion/src/utils/resources/opt_flags.json
@@ -45,12 +45,12 @@
    "untuned": {
      "fp16": {
        "default_compilation_flags": [
-          "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-preprocessing-pad-linalg-ops{pad-size=32}))"
+          "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32},iree-linalg-ext-convert-conv2d-to-winograd))"
        ]
      },
      "fp32": {
        "default_compilation_flags": [
-          "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-preprocessing-pad-linalg-ops{pad-size=16}))"
+          "--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=16},iree-linalg-ext-convert-conv2d-to-winograd))"
        ]
      }
    }
--- a/apps/stable_diffusion/src/utils/sd_annotation.py
+++ b/apps/stable_diffusion/src/utils/sd_annotation.py
@@ -70,6 +70,8 @@ def load_winograd_configs():
    config_bucket = "gs://shark_tank/sd_tuned/configs/"
    config_name = f"{args.annotation_model}_winograd_{device}.json"
    full_gs_url = config_bucket + config_name
+    if not os.path.exists(WORKDIR):
+        os.mkdir(WORKDIR)
    winograd_config_dir = os.path.join(WORKDIR, "configs", config_name)
    print("Loading Winograd config file from ", winograd_config_dir)
    download_public_file(full_gs_url, winograd_config_dir, True)
@@ -233,11 +235,14 @@ def sd_model_annotation(mlir_model, model_name, base_model_id=None):
            winograd_model, lowering_config_dir, model_name, use_winograd
        )
    elif args.annotation_model == "vae" and device == "vulkan":
-        use_winograd = True
-        winograd_config_dir = load_winograd_configs()
-        tuned_model = annotate_with_winograd(
-            mlir_model, winograd_config_dir, model_name
-        )
+        if "rdna2" not in args.iree_vulkan_target_triple.split("-")[0]:
+            use_winograd = True
+            winograd_config_dir = load_winograd_configs()
+            tuned_model = annotate_with_winograd(
+                mlir_model, winograd_config_dir, model_name
+            )
+        else:
+            tuned_model = mlir_model
    else:
        use_winograd = False
        lowering_config_dir = load_lower_configs(base_model_id)
--- a/apps/stable_diffusion/src/utils/stable_args.py
+++ b/apps/stable_diffusion/src/utils/stable_args.py
@@ -493,7 +493,13 @@ p.add_argument(
    default="",
    help="Path to directory where all .ckpts are stored in order to populate them in the web UI",
 )
-
+# TODO: replace API flag when these can be run together
+p.add_argument(
+    "--ui",
+    type=str,
+    default="app" if os.name == "nt" else "web",
+    help="one of: [api, app, web]",
+)

 p.add_argument(
    "--share",
@@ -515,6 +521,22 @@ p.add_argument(
    action=argparse.BooleanOptionalAction,
    help="flag for enabling rest API",
 )
+
+p.add_argument(
+    "--output_gallery",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="flag for removing the output gallery tab, and avoid exposing images under --output_dir in the UI",
+)
+
+p.add_argument(
+    "--output_gallery_followlinks",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="flag for whether the output gallery tab in the UI should follow symlinks when listing subdirectorys under --output_dir",
+)
+
+
 ##############################################################################
 ### SD model auto-annotation flags
 ##############################################################################
@@ -539,6 +561,31 @@ p.add_argument(
    action=argparse.BooleanOptionalAction,
    help="Save annotated mlir file",
 )
+##############################################################################
+### SD model auto-tuner flags
+##############################################################################
+
+p.add_argument(
+    "--tuned_config_dir",
+    type=path_expand,
+    default="./",
+    help="Directory to save the tuned config file",
+)
+
+p.add_argument(
+    "--num_iters",
+    type=int,
+    default=400,
+    help="Number of iterations for tuning",
+)
+
+p.add_argument(
+    "--search_op",
+    type=str,
+    default="all",
+    help="Op to be optimized, options are matmul, bmm, conv and all",
+)
+

 args, unknown = p.parse_known_args()
 if args.import_debug:
--- a/apps/stable_diffusion/src/utils/stencils/stencil_utils.py
+++ b/apps/stable_diffusion/src/utils/stencils/stencil_utils.py
@@ -126,14 +126,14 @@ def controlnet_hint_conversion(


 stencil_to_model_id_map = {
-    "canny": "lllyasviel/sd-controlnet-canny",
-    "depth": "lllyasviel/sd-controlnet-depth",
+    "canny": "lllyasviel/control_v11p_sd15_canny",
+    "depth": "lllyasviel/control_v11p_sd15_depth",
    "hed": "lllyasviel/sd-controlnet-hed",
-    "mlsd": "lllyasviel/sd-controlnet-mlsd",
-    "normal": "lllyasviel/sd-controlnet-normal",
-    "openpose": "lllyasviel/sd-controlnet-openpose",
-    "scribble": "lllyasviel/sd-controlnet-scribble",
-    "seg": "lllyasviel/sd-controlnet-seg",
+    "mlsd": "lllyasviel/control_v11p_sd15_mlsd",
+    "normal": "lllyasviel/control_v11p_sd15_normalbae",
+    "openpose": "lllyasviel/control_v11p_sd15_openpose",
+    "scribble": "lllyasviel/control_v11p_sd15_scribble",
+    "seg": "lllyasviel/control_v11p_sd15_seg",
 }


--- a/apps/stable_diffusion/src/utils/utils.py
+++ b/apps/stable_diffusion/src/utils/utils.py
@@ -3,6 +3,7 @@ import gc
 import json
 import re
 from PIL import PngImagePlugin
+from PIL import Image
 from datetime import datetime as dt
 from csv import DictWriter
 from pathlib import Path
@@ -24,7 +25,12 @@ from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
 import sys
 from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
    download_from_original_stable_diffusion_ckpt,
+    create_vae_diffusers_config,
+    convert_ldm_vae_checkpoint,
 )
+import requests
+from io import BytesIO
+from omegaconf import OmegaConf


 def get_extended_name(model_name):
@@ -38,6 +44,15 @@ def get_vmfb_path_name(model_name):
    return vmfb_path


+def _load_vmfb(shark_module, vmfb_path, model, precision):
+    """Load the compiled module at ``vmfb_path`` into ``shark_module``.
+
+    ``model`` and ``precision`` are only used to pick the flags returned by
+    ``get_opt_flags``; they are normalised first so that model variants
+    share the flags of their base model.
+
+    Returns the same ``shark_module`` object that was passed in.
+    """
+    # "base_vae" / "vae_encode" variants are looked up as plain "vae".
+    if "base_vae" in model or "vae_encode" in model:
+        model = "vae"
+    # Stencil variants are looked up as "unet".
+    if "stencil" in model:
+        model = "unet"
+    # Flags for clip are always looked up as fp32.
+    if "clip" in model:
+        precision = "fp32"
+    opt_flags = get_opt_flags(model, precision)
+    shark_module.load_module(vmfb_path, extra_args=opt_flags)
+    return shark_module
+
+
 def _compile_module(shark_module, model_name, extra_args=[]):
    if args.load_vmfb or args.save_vmfb:
        vmfb_path = get_vmfb_path_name(model_name)
@@ -68,7 +83,6 @@ def get_shark_model(tank_url, model_name, extra_args=[]):

    # Set local shark_tank cache directory.
    shark_args.local_tank_cache = args.local_tank_cache
-
    from shark.shark_downloader import download_model

    if "cuda" in args.device:
@@ -89,7 +103,7 @@ def get_shark_model(tank_url, model_name, extra_args=[]):
 def compile_through_fx(
    model,
    inputs,
-    model_name,
+    extended_model_name,
    is_f16=False,
    f16_input_mask=None,
    use_tuned=False,
@@ -98,7 +112,19 @@ def compile_through_fx(
    generate_vmfb=True,
    extra_args=[],
    base_model_id=None,
+    model_name=None,
+    precision=None,
+    return_mlir=False,
 ):
+    if not return_mlir and model_name is not None:
+        vmfb_path = get_vmfb_path_name(extended_model_name)
+        if os.path.isfile(vmfb_path):
+            shark_module = SharkInference(mlir_module=None, device=args.device)
+            return (
+                _load_vmfb(shark_module, vmfb_path, model_name, precision),
+                None,
+            )
+
    from shark.parser import shark_args

    if "cuda" in args.device:
@@ -113,14 +139,16 @@ def compile_through_fx(
        is_f16=is_f16,
        f16_input_mask=f16_input_mask,
        debug=debug,
-        model_name=model_name,
+        model_name=extended_model_name,
        save_dir=save_dir,
    )
    if use_tuned:
-        if "vae" in model_name.split("_")[0]:
+        if "vae" in extended_model_name.split("_")[0]:
            args.annotation_model = "vae"
+        if "unet" in model_name.split("_")[0]:
+            args.annotation_model = "unet"
        mlir_module = sd_model_annotation(
-            mlir_module, model_name, base_model_id
+            mlir_module, extended_model_name, base_model_id
        )

    shark_module = SharkInference(
@@ -128,16 +156,11 @@ def compile_through_fx(
        device=args.device,
        mlir_dialect="tm_tensor",
    )
-
    if generate_vmfb:
-        shark_module = SharkInference(
+        return (
+            _compile_module(shark_module, extended_model_name, extra_args),
            mlir_module,
-            device=args.device,
-            mlir_dialect="tm_tensor",
        )
-        del mlir_module
-        gc.collect()
-        return _compile_module(shark_module, model_name, extra_args)

    del mlir_module
    gc.collect()
@@ -445,7 +468,7 @@ def get_path_stem(path):
 def get_path_to_diffusers_checkpoint(custom_weights):
    path = Path(custom_weights)
    diffusers_path = path.parent.absolute()
-    diffusers_directory_name = path.stem
+    diffusers_directory_name = os.path.join("diffusers", path.stem)
    complete_path_to_diffusers = diffusers_path / diffusers_directory_name
    complete_path_to_diffusers.mkdir(parents=True, exist_ok=True)
    path_to_diffusers = complete_path_to_diffusers.as_posix()
@@ -484,6 +507,22 @@ def preprocessCKPT(custom_weights, is_inpaint=False):
    print("Loading complete")


+def convert_original_vae(vae_checkpoint):
+    """Convert a standalone VAE state dict via ``convert_ldm_vae_checkpoint``.
+
+    The reference ``v1-inference.yaml`` config is downloaded from the CompVis
+    stable-diffusion repository on every call, so network access is needed.
+
+    Args:
+        vae_checkpoint: mapping of parameter name -> value for the VAE
+            (presumably a loaded state dict; verify against callers).
+
+    Returns:
+        The checkpoint returned by ``convert_ldm_vae_checkpoint``.
+
+    Raises:
+        requests.RequestException: if the config download times out or the
+            server answers with an error status.
+    """
+    # The converter is handed keys namespaced under "first_stage_model.".
+    vae_state_dict = {
+        "first_stage_model." + key: value
+        for key, value in vae_checkpoint.items()
+    }
+
+    config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+    # Bound the wait and reject error responses, so that a hung connection
+    # or an HTTP error page is reported as such instead of being fed to the
+    # YAML loader.
+    response = requests.get(config_url, timeout=60)
+    response.raise_for_status()
+    original_config = OmegaConf.load(BytesIO(response.content))
+    vae_config = create_vae_diffusers_config(original_config, image_size=512)
+
+    return convert_ldm_vae_checkpoint(vae_state_dict, vae_config)
+
+
 def processLoRA(model, use_lora, splitting_prefix):
    state_dict = ""
    if ".safetensors" in use_lora:
@@ -593,26 +632,6 @@ def update_lora_weight(model, use_lora, model_name):
        return None


-def load_vmfb(vmfb_path, model, precision):
-    model = "vae" if "base_vae" in model or "vae_encode" in model else model
-    model = "unet" if "stencil" in model else model
-    precision = "fp32" if "clip" in model else precision
-    extra_args = get_opt_flags(model, precision)
-    shark_module = SharkInference(mlir_module=None, device=args.device)
-    shark_module.load_module(vmfb_path, extra_args=extra_args)
-    return shark_module
-
-
-# This utility returns vmfb of sub-model of the SD pipeline, if present.
-def fetch_vmfb(model, extended_model_name, precision="fp32"):
-    vmfb_path = get_vmfb_path_name(extended_model_name)
-    vmfb_present = os.path.isfile(vmfb_path)
-    compiled_model = (
-        load_vmfb(vmfb_path, model, precision) if vmfb_present else None
-    )
-    return compiled_model
-
-
 # `fetch_and_update_base_model_id` is a resource utility function which
 # helps maintaining mapping of the model to run with its base model.
 # If `base_model` is "", then this function tries to fetch the base model
@@ -666,17 +685,28 @@ def clear_all():
    if os.name == "nt":  # Windows
        appdata = os.getenv("LOCALAPPDATA")
        shutil.rmtree(os.path.join(appdata, "AMD/VkCache"), ignore_errors=True)
-        shutil.rmtree(os.path.join(home, "shark_tank"), ignore_errors=True)
+        shutil.rmtree(
+            os.path.join(home, ".local/shark_tank"), ignore_errors=True
+        )
    elif os.name == "unix":
        shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
        shutil.rmtree(os.path.join(home, ".local/shark_tank"))


+def get_generated_imgs_path() -> Path:
+    """Return the ``generated_imgs`` directory path.
+
+    The directory sits under ``args.output_dir`` when that is set, otherwise
+    under the current working directory. Nothing is created on disk here.
+    """
+    base_dir = args.output_dir if args.output_dir else Path.cwd()
+    return Path(base_dir, "generated_imgs")
+
+
+def get_generated_imgs_todays_subdir() -> str:
+    """Return the current local date formatted as ``YYYYMMDD``."""
+    today = dt.now()
+    return today.strftime("%Y%m%d")
+
+
 # save output images and the inputs corresponding to it.
 def save_output_img(output_img, img_seed, extra_info={}):
-    output_path = args.output_dir if args.output_dir else Path.cwd()
    generated_imgs_path = Path(
-        output_path, "generated_imgs", dt.now().strftime("%Y%m%d")
+        get_generated_imgs_path(), get_generated_imgs_todays_subdir()
    )
    generated_imgs_path.mkdir(parents=True, exist_ok=True)
    csv_path = Path(generated_imgs_path, "imgs_details.csv")
@@ -749,3 +779,46 @@ def get_generation_text_info(seeds, device):
    text_output += f"\nsize={args.height}x{args.width}, batch_count={args.batch_count}, batch_size={args.batch_size}, max_length={args.max_length}"

    return text_output
+
+
+def resize_stencil(image: Image.Image):
+    """Resize ``image`` so that it meets the stencil model's size constraints.
+
+    The input image can be of any size. The returned width and height are
+    each a multiple of 8 in the range [128, 768]. The aspect ratio is
+    maintained (up to rounding) whenever both bounds can be met at the same
+    time; for very elongated images they cannot, and the out-of-range side
+    is clamped instead.
+
+    Args:
+        image: the input image; its ``size`` is read as ``(width, height)``
+            and its ``resize`` method is called with the new size.
+
+    Returns:
+        A tuple ``(new_image, n_width, n_height)``.
+    """
+    min_side, max_side, multiple = 128, 768, 8
+    width, height = image.size
+    aspect_ratio = width / height
+
+    # Upscale when the shorter side is below the minimum.
+    if min(width, height) < min_side:
+        if width <= height:
+            width, height = min_side, min_side / aspect_ratio
+        else:
+            width, height = min_side * aspect_ratio, min_side
+
+    # Downscale when the longer side is above the maximum. The longer side
+    # is what has to be tested here: checking the shorter one lets images
+    # such as 500x1000 through with a side that is out of range.
+    if max(width, height) > max_side:
+        if width >= height:
+            width, height = max_side, max_side / aspect_ratio
+        else:
+            width, height = max_side * aspect_ratio, max_side
+
+    def _snap(value):
+        # Round down to a multiple of 8, then clamp into [128, 768]; the
+        # clamp covers float truncation and extreme aspect ratios.
+        snapped = int(value) // multiple * multiple
+        return max(min_side, min(max_side, snapped))
+
+    n_width, n_height = _snap(width), _snap(height)
+    new_image = image.resize((n_width, n_height))
+    return new_image, n_width, n_height
--- a/apps/stable_diffusion/web/index.py
+++ b/apps/stable_diffusion/web/index.py
@@ -1,7 +1,9 @@
+from multiprocessing import Process, freeze_support
 import os
 import sys
-import transformers
+import transformers  # ensures inclusion in pyinstaller exe generation
 from apps.stable_diffusion.src import args, clear_all
+import apps.stable_diffusion.web.utils.global_obj as global_obj

 if sys.platform == "darwin":
    os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
@@ -9,31 +11,60 @@ if sys.platform == "darwin":
 if args.clear_all:
    clear_all()

+
+def launch_app(address):
+    """Show ``address`` in a pywebview window sized to the screen.
+
+    Args:
+        address: URL passed to ``webview.create_window``.
+    """
+    from tkinter import Tk
+    import webview
+
+    # Tk is used only to query the display size. Destroy the root once the
+    # values are read so no stray Tk window or interpreter is left behind
+    # while the webview is running.
+    window = Tk()
+    try:
+        width = window.winfo_screenwidth()
+        height = window.winfo_screenheight()
+    finally:
+        window.destroy()
+
+    webview.create_window(
+        "SHARK AI Studio", url=address, width=width, height=height
+    )
+    webview.start(private_mode=False)
+
+
 if __name__ == "__main__":
-    if args.api:
-        from apps.stable_diffusion.web.ui import txt2img_inf
+    # required to do multiprocessing in a pyinstaller freeze
+    freeze_support()
+    if args.api or "api" in args.ui.split(","):
+        from apps.stable_diffusion.web.ui import (
+            txt2img_api,
+            img2img_api,
+            upscaler_api,
+            inpaint_api,
+        )
        from fastapi import FastAPI, APIRouter
        import uvicorn

+        # init global sd pipeline and config
+        global_obj._init()
+
        app = FastAPI()
-        app.add_api_route("/sdapi/txt2img", txt2img_inf, methods=["post"])
+        app.add_api_route("/sdapi/v1/txt2img", txt2img_api, methods=["post"])
+        app.add_api_route("/sdapi/v1/img2img", img2img_api, methods=["post"])
+        app.add_api_route("/sdapi/v1/inpaint", inpaint_api, methods=["post"])
+        #  app.add_api_route(
+        #      "/sdapi/v1/outpaint", outpaint_api, methods=["post"]
+        #  )
+        app.add_api_route("/sdapi/v1/upscaler", upscaler_api, methods=["post"])
        app.include_router(APIRouter())
-        uvicorn.run(app, host="0.0.0.0", port=args.server_port)
+        uvicorn.run(app, host="127.0.0.1", port=args.server_port)
        sys.exit(0)

    import gradio as gr
-    import apps.stable_diffusion.web.utils.global_obj as global_obj
    from apps.stable_diffusion.web.utils.gradio_configs import (
        clear_gradio_tmp_imgs_folder,
    )
-    from apps.stable_diffusion.web.ui.utils import get_custom_model_path
+    from apps.stable_diffusion.web.ui.utils import create_custom_models_folders

    # Clear all gradio tmp images from the last session
    clear_gradio_tmp_imgs_folder()
-    # Create the custom model folder if it doesn't already exist
-    dir = ["models", "vae", "lora"]
-    for root in dir:
-        get_custom_model_path(root).mkdir(parents=True, exist_ok=True)
+    # Create custom models folders if they don't exist
+    create_custom_models_folders()

    def resource_path(relative_path):
        """Get absolute path to resource, works for dev and for PyInstaller"""
@@ -46,36 +77,69 @@ if __name__ == "__main__":

    from apps.stable_diffusion.web.ui import (
        txt2img_web,
+        txt2img_custom_model,
+        txt2img_hf_model_id,
        txt2img_gallery,
+        txt2img_png_info_img,
+        txt2img_status,
        txt2img_sendto_img2img,
        txt2img_sendto_inpaint,
        txt2img_sendto_outpaint,
        txt2img_sendto_upscaler,
        img2img_web,
+        img2img_custom_model,
+        img2img_hf_model_id,
        img2img_gallery,
        img2img_init_image,
+        img2img_status,
        img2img_sendto_inpaint,
        img2img_sendto_outpaint,
        img2img_sendto_upscaler,
        inpaint_web,
+        inpaint_custom_model,
+        inpaint_hf_model_id,
        inpaint_gallery,
        inpaint_init_image,
+        inpaint_status,
        inpaint_sendto_img2img,
        inpaint_sendto_outpaint,
        inpaint_sendto_upscaler,
        outpaint_web,
+        outpaint_custom_model,
+        outpaint_hf_model_id,
        outpaint_gallery,
        outpaint_init_image,
+        outpaint_status,
        outpaint_sendto_img2img,
        outpaint_sendto_inpaint,
        outpaint_sendto_upscaler,
        upscaler_web,
+        upscaler_custom_model,
+        upscaler_hf_model_id,
        upscaler_gallery,
        upscaler_init_image,
+        upscaler_status,
        upscaler_sendto_img2img,
        upscaler_sendto_inpaint,
        upscaler_sendto_outpaint,
        lora_train_web,
+        model_web,
+        hf_models,
+        modelmanager_sendto_txt2img,
+        modelmanager_sendto_img2img,
+        modelmanager_sendto_inpaint,
+        modelmanager_sendto_outpaint,
+        modelmanager_sendto_upscaler,
+        stablelm_chat,
+        outputgallery_web,
+        outputgallery_tab_select,
+        outputgallery_watch,
+        outputgallery_filename,
+        outputgallery_sendto_txt2img,
+        outputgallery_sendto_img2img,
+        outputgallery_sendto_inpaint,
+        outputgallery_sendto_outpaint,
+        outputgallery_sendto_upscaler,
    )

    # init global sd pipeline and config
@@ -91,6 +155,27 @@ if __name__ == "__main__":
            outputs,
        )

+    def register_modelmanager_button(button, selectedid, inputs, outputs):
+        """Wire ``button`` to forward its input value and switch tabs.
+
+        On click, the three values sent to ``outputs`` are the literal
+        string "None", the single input value unchanged, and a tabs update
+        selecting ``selectedid``.
+        """
+
+        def _on_click(x):
+            tab_switch = gr.Tabs.update(selected=selectedid)
+            return "None", x, tab_switch
+
+        button.click(_on_click, inputs, outputs)
+
+    def register_outputgallery_button(button, selectedid, inputs, outputs):
+        """Wire ``button`` to forward its input value and switch tabs.
+
+        On click, the two values sent to ``outputs`` are the single input
+        value unchanged and a tabs update selecting ``selectedid``.
+        """
+
+        def _on_click(x):
+            tab_switch = gr.Tabs.update(selected=selectedid)
+            return x, tab_switch
+
+        button.click(_on_click, inputs, outputs)
+
    with gr.Blocks(
        css=dark_theme, analytics_enabled=False, title="Stable Diffusion"
    ) as sd_web:
@@ -105,11 +190,29 @@ if __name__ == "__main__":
                outpaint_web.render()
            with gr.TabItem(label="Upscaler", id=4):
                upscaler_web.render()
-
-        with gr.Tabs(visible=False) as experimental_tabs:
-            with gr.TabItem(label="LoRA Training", id=5):
+            with gr.TabItem(label="Model Manager", id=5):
+                model_web.render()
+            with gr.TabItem(label="Chat Bot(Experimental)", id=6):
+                stablelm_chat.render()
+            with gr.TabItem(label="LoRA Training(Experimental)", id=7):
                lora_train_web.render()
+            if args.output_gallery:
+                with gr.TabItem(label="Output Gallery", id=8) as og_tab:
+                    outputgallery_web.render()

+                # extra output gallery configuration
+                outputgallery_tab_select(og_tab.select)
+                outputgallery_watch(
+                    [
+                        txt2img_status,
+                        img2img_status,
+                        inpaint_status,
+                        outpaint_status,
+                        upscaler_status,
+                    ]
+                )
+
+        # send to buttons
        register_button_click(
            txt2img_sendto_img2img,
            1,
@@ -206,10 +309,77 @@ if __name__ == "__main__":
            [upscaler_gallery],
            [outpaint_init_image, tabs],
        )
+        if args.output_gallery:
+            register_outputgallery_button(
+                outputgallery_sendto_txt2img,
+                0,
+                [outputgallery_filename],
+                [txt2img_png_info_img, tabs],
+            )
+            register_outputgallery_button(
+                outputgallery_sendto_img2img,
+                1,
+                [outputgallery_filename],
+                [img2img_init_image, tabs],
+            )
+            register_outputgallery_button(
+                outputgallery_sendto_inpaint,
+                2,
+                [outputgallery_filename],
+                [inpaint_init_image, tabs],
+            )
+            register_outputgallery_button(
+                outputgallery_sendto_outpaint,
+                3,
+                [outputgallery_filename],
+                [outpaint_init_image, tabs],
+            )
+            register_outputgallery_button(
+                outputgallery_sendto_upscaler,
+                4,
+                [outputgallery_filename],
+                [upscaler_init_image, tabs],
+            )
+        register_modelmanager_button(
+            modelmanager_sendto_txt2img,
+            0,
+            [hf_models],
+            [txt2img_custom_model, txt2img_hf_model_id, tabs],
+        )
+        register_modelmanager_button(
+            modelmanager_sendto_img2img,
+            1,
+            [hf_models],
+            [img2img_custom_model, img2img_hf_model_id, tabs],
+        )
+        register_modelmanager_button(
+            modelmanager_sendto_inpaint,
+            2,
+            [hf_models],
+            [inpaint_custom_model, inpaint_hf_model_id, tabs],
+        )
+        register_modelmanager_button(
+            modelmanager_sendto_outpaint,
+            3,
+            [hf_models],
+            [outpaint_custom_model, outpaint_hf_model_id, tabs],
+        )
+        register_modelmanager_button(
+            modelmanager_sendto_upscaler,
+            4,
+            [hf_models],
+            [upscaler_custom_model, upscaler_hf_model_id, tabs],
+        )
+
    sd_web.queue()
+    if args.ui == "app":
+        t = Process(
+            target=launch_app, args=[f"http://localhost:{args.server_port}"]
+        )
+        t.start()
    sd_web.launch(
        share=args.share,
-        inbrowser=True,
+        inbrowser=args.ui == "web",
        server_name="0.0.0.0",
        server_port=args.server_port,
    )
--- a/apps/stable_diffusion/web/ui/init.py
+++ b/apps/stable_diffusion/web/ui/init.py
@@ -1,42 +1,88 @@
 from apps.stable_diffusion.web.ui.txt2img_ui import (
    txt2img_inf,
+    txt2img_api,
    txt2img_web,
+    txt2img_custom_model,
+    txt2img_hf_model_id,
    txt2img_gallery,
+    txt2img_png_info_img,
+    txt2img_status,
    txt2img_sendto_img2img,
    txt2img_sendto_inpaint,
    txt2img_sendto_outpaint,
    txt2img_sendto_upscaler,
 )
 from apps.stable_diffusion.web.ui.img2img_ui import (
+    img2img_inf,
+    img2img_api,
    img2img_web,
+    img2img_custom_model,
+    img2img_hf_model_id,
    img2img_gallery,
    img2img_init_image,
+    img2img_status,
    img2img_sendto_inpaint,
    img2img_sendto_outpaint,
    img2img_sendto_upscaler,
 )
 from apps.stable_diffusion.web.ui.inpaint_ui import (
+    inpaint_inf,
+    inpaint_api,
    inpaint_web,
+    inpaint_custom_model,
+    inpaint_hf_model_id,
    inpaint_gallery,
    inpaint_init_image,
+    inpaint_status,
    inpaint_sendto_img2img,
    inpaint_sendto_outpaint,
    inpaint_sendto_upscaler,
 )
 from apps.stable_diffusion.web.ui.outpaint_ui import (
+    outpaint_inf,
+    outpaint_api,
    outpaint_web,
+    outpaint_custom_model,
+    outpaint_hf_model_id,
    outpaint_gallery,
    outpaint_init_image,
+    outpaint_status,
    outpaint_sendto_img2img,
    outpaint_sendto_inpaint,
    outpaint_sendto_upscaler,
 )
 from apps.stable_diffusion.web.ui.upscaler_ui import (
+    upscaler_inf,
+    upscaler_api,
    upscaler_web,
+    upscaler_custom_model,
+    upscaler_hf_model_id,
    upscaler_gallery,
    upscaler_init_image,
+    upscaler_status,
    upscaler_sendto_img2img,
    upscaler_sendto_inpaint,
    upscaler_sendto_outpaint,
 )
+from apps.stable_diffusion.web.ui.model_manager import (
+    model_web,
+    hf_models,
+    modelmanager_sendto_txt2img,
+    modelmanager_sendto_img2img,
+    modelmanager_sendto_inpaint,
+    modelmanager_sendto_outpaint,
+    modelmanager_sendto_upscaler,
+)
 from apps.stable_diffusion.web.ui.lora_train_ui import lora_train_web
+from apps.stable_diffusion.web.ui.stablelm_ui import stablelm_chat
+from apps.stable_diffusion.web.ui.outputgallery_ui import (
+    outputgallery_web,
+    outputgallery_tab_select,
+    outputgallery_watch,
+    outputgallery_filename,
+    outputgallery_sendto_txt2img,
+    outputgallery_sendto_img2img,
+    outputgallery_sendto_inpaint,
+    outputgallery_sendto_outpaint,
+    outputgallery_sendto_upscaler,
+)
--- a/apps/stable_diffusion/web/ui/css/sd_dark_theme.css
+++ b/apps/stable_diffusion/web/ui/css/sd_dark_theme.css
@@ -101,6 +101,9 @@ Procedure to upgrade the dark theme:
 }

 /* SHARK theme */
+body {
+    background-color: var(--background-fill-primary);
+}

 /* display in full width for desktop devices */
@media (min-width: 1536px)
@@ -166,14 +169,44 @@ footer {
    border-radius: 0 !important;
 }

+/* Gallery: Remove the default square ratio thumbnail and limit images height to the container */
+#gallery .thumbnail-item.thumbnail-lg {
+    aspect-ratio: unset;
+    max-height: calc(55vh - (2 * var(--spacing-lg)));
+}
+@media (min-width: 1921px) {
+    /* Force a 768px_height + 4px_margin_height + navbar_height for the gallery */
+    #gallery .grid-wrap, #gallery .preview{
+        min-height: calc(768px + 4px + var(--size-14));
+        max-height: calc(768px + 4px + var(--size-14));
+    }
+    /* Limit height to 768px_height + 2px_margin_height for the thumbnails */
+    #gallery .thumbnail-item.thumbnail-lg {
+        max-height: 770px !important;
+    }
+}
+/* Don't upscale when viewing in solo image mode */
+#gallery .preview img {
+    object-fit: scale-down;
+}
+/* Navbar images in cover mode*/
+#gallery .preview .thumbnail-item img {
+    object-fit: cover;
+}
+
+/* Limit the stable diffusion text output height */
+#std_output textarea {
+    max-height: 215px;
+}
+
 /* Prevent progress bar to block gallery navigation while building images (Gradio V3.19.0) */
 #gallery .wrap.default {
    pointer-events: none;
 }

 /* Import Png info box */
-#txt2img_prompt_image .fixed-height {
-    height: var(--size-32);
+#txt2img_prompt_image {
+    height: var(--size-32) !important;
 }

 /* Hide "remove buttons" from ui dropdowns */
@@ -197,3 +230,44 @@ footer {
 #top_logo .download {
    display: none;
 }
+
+/* output gallery tab */
+.output_parameters_dataframe tbody td {
+    font-size: small;
+    /* terminate the last declaration so appending another one stays valid */
+    line-height: var(--line-xs);
+}
+
+#output_refresh_button {
+    max-width: 30px;
+    align-self: end;
+    padding-bottom: 8px;
+}
+
+.outputgallery_sendto {
+    min-width: 7em !important;
+}
+
+/* output gallery should take up most of the viewport height regardless of image size/number */
+#outputgallery_gallery .fixed-height {
+    min-height: 89vh !important;
+}
+
+/* don't stretch non-square images to be square, breaking their aspect ratio */
+#outputgallery_gallery .thumbnail-item.thumbnail-lg > img {
+    object-fit: contain !important;
+}
+
+/* centered logo for when there are no images */
+#top_logo.logo_centered {
+    height: 100%;
+    width: 100%;
+}
+
+#top_logo.logo_centered img{
+    object-fit: scale-down;
+    position: absolute;
+    width: 80%;
+    top: 50%;
+    left: 50%;
+    transform: translate(-50%, -50%);
+}
--- a/apps/stable_diffusion/web/ui/img2img_ui.py
+++ b/apps/stable_diffusion/web/ui/img2img_ui.py
@@ -1,18 +1,350 @@
-from pathlib import Path
 import os
+import torch
+import time
 import gradio as gr
+import PIL
 from PIL import Image
-from apps.stable_diffusion.scripts import img2img_inf
-from apps.stable_diffusion.src import args
+import base64
+from io import BytesIO
+from fastapi.exceptions import HTTPException
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
    nodlogo_loc,
    get_custom_model_path,
    get_custom_model_files,
-    scheduler_list,
+    scheduler_list_cpu_only,
    predefined_models,
    cancel_sd,
 )
+from apps.stable_diffusion.src import (
+    args,
+    Image2ImagePipeline,
+    StencilPipeline,
+    resize_stencil,
+    get_schedulers,
+    set_init_device_flags,
+    utils,
+    save_output_img,
+)
+from apps.stable_diffusion.src.utils import (
+    get_generated_imgs_path,
+    get_generation_text_info,
+)
+from apps.stable_diffusion.web.utils.common_label_calc import status_label
+import numpy as np
+
+
+# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
+init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
+init_use_tuned = args.use_tuned
+init_import_mlir = args.import_mlir
+
+
+# Exposed to UI.
+def img2img_inf(
+    prompt: str,
+    negative_prompt: str,
+    image_dict,
+    height: int,
+    width: int,
+    steps: int,
+    strength: float,
+    guidance_scale: float,
+    seed: int,
+    batch_count: int,
+    batch_size: int,
+    scheduler: str,
+    custom_model: str,
+    hf_model_id: str,
+    custom_vae: str,
+    precision: str,
+    device: str,
+    max_length: int,
+    use_stencil: str,
+    save_metadata_to_json: bool,
+    save_metadata_to_png: bool,
+    lora_weights: str,
+    lora_hf_id: str,
+    ondemand: bool,
+):
+    """Generate images from an initial image and a prompt (generator).
+
+    Copies the request parameters onto the shared `args` object, rebuilds
+    the cached pipeline when none exists or the configuration differs from
+    the cached one, then runs `batch_count` generation passes.
+
+    `image_dict` may be a PIL image, or a mapping with "image" and "mask"
+    entries; the "mask" entry is used when `use_stencil` is "scribble".
+
+    Yields:
+        After each completed batch, a 3-tuple of (all images generated so
+        far, the status text, the batch progress label).
+
+    NOTE(review): this function contains `yield`, so it is a generator.
+    The tuples passed to `return` below are not delivered to a caller that
+    simply iterates it (they end up on StopIteration.value), and the two
+    early returns carry two items while each yield carries three. Confirm
+    callers handle this as intended.
+    """
+    from apps.stable_diffusion.web.ui.utils import (
+        get_custom_model_pathfile,
+        get_custom_vae_or_lora_weights,
+        Config,
+    )
+    import apps.stable_diffusion.web.utils.global_obj as global_obj
+    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
+        SD_STATE_CANCEL,
+    )
+
+    # Mirror the request onto the shared `args` object.
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.guidance_scale = guidance_scale
+    args.seed = seed
+    args.steps = steps
+    args.strength = strength
+    args.scheduler = scheduler
+    # Placeholder value; presumably only needs to be non-empty — TODO confirm.
+    args.img_path = "not none"
+    args.ondemand = ondemand
+
+    # Choose the source image: the drawn mask for "scribble", the image
+    # itself when a PIL image was passed, otherwise the dict's "image" entry.
+    if image_dict is None:
+        return None, "An Initial Image is required"
+    if use_stencil == "scribble":
+        image = image_dict["mask"].convert("RGB")
+    elif isinstance(image_dict, PIL.Image.Image):
+        image = image_dict.convert("RGB")
+    else:
+        image = image_dict["image"].convert("RGB")
+
+    # set ckpt_loc and hf_model_id.
+    args.ckpt_loc = ""
+    args.hf_model_id = ""
+    args.custom_vae = ""
+    if custom_model == "None":
+        # No dropdown selection: fall back to the free-text model field.
+        if not hf_model_id:
+            return (
+                None,
+                "Please provide either custom model or huggingface model ID, both must not be empty",
+            )
+        # A value containing "civitai" is stored as a checkpoint location
+        # rather than as a HuggingFace model id.
+        if "civitai" in hf_model_id:
+            args.ckpt_loc = hf_model_id
+        else:
+            args.hf_model_id = hf_model_id
+    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
+        args.ckpt_loc = get_custom_model_pathfile(custom_model)
+    else:
+        args.hf_model_id = custom_model
+    if custom_vae != "None":
+        args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
+
+    args.use_lora = get_custom_vae_or_lora_weights(
+        lora_weights, lora_hf_id, "lora"
+    )
+
+    args.save_metadata_to_json = save_metadata_to_json
+    args.write_metadata_to_png = save_metadata_to_png
+
+    # The UI passes the string "None" for "no stencil"; normalise to None.
+    use_stencil = None if use_stencil == "None" else use_stencil
+    args.use_stencil = use_stencil
+    if use_stencil is not None:
+        # Stencil mode overrides the scheduler and model id, and takes the
+        # image and its dimensions from resize_stencil.
+        args.scheduler = "DDIM"
+        args.hf_model_id = "runwayml/stable-diffusion-v1-5"
+        image, width, height = resize_stencil(image)
+    elif "Shark" in args.scheduler:
+        print(
+            f"Shark schedulers are not supported. Switching to EulerDiscrete scheduler"
+        )
+        args.scheduler = "EulerDiscrete"
+    # NOTE(review): after the branch above the scheduler is either "DDIM" or
+    # a name without "Shark", so this evaluates to True on every path.
+    cpu_scheduling = not args.scheduler.startswith("Shark")
+    args.precision = precision
+    dtype = torch.float32 if precision == "fp32" else torch.half
+    new_config_obj = Config(
+        "img2img",
+        args.hf_model_id,
+        args.ckpt_loc,
+        args.custom_vae,
+        precision,
+        batch_size,
+        max_length,
+        height,
+        width,
+        device,
+        use_lora=args.use_lora,
+        use_stencil=use_stencil,
+        ondemand=ondemand,
+    )
+    # Rebuild the pipeline only when there is no cached one or the
+    # configuration differs from the cached configuration.
+    if (
+        not global_obj.get_sd_obj()
+        or global_obj.get_cfg_obj() != new_config_obj
+    ):
+        global_obj.clear_cache()
+        global_obj.set_cfg_obj(new_config_obj)
+        args.batch_count = batch_count
+        args.batch_size = batch_size
+        args.max_length = max_length
+        args.height = height
+        args.width = width
+        # Keep only the text after "=>" in the device string.
+        args.device = device.split("=>", 1)[1].strip()
+        # Restore the values captured at import time before
+        # set_init_device_flags() runs.
+        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
+        args.use_tuned = init_use_tuned
+        args.import_mlir = init_import_mlir
+        set_init_device_flags()
+        # Schedulers are built for the HF model id, or for a default model
+        # when only a checkpoint location was given.
+        model_id = (
+            args.hf_model_id
+            if args.hf_model_id
+            else "stabilityai/stable-diffusion-2-1-base"
+        )
+        global_obj.set_schedulers(get_schedulers(model_id))
+        scheduler_obj = global_obj.get_scheduler(args.scheduler)
+
+        if use_stencil is not None:
+            # Tuned models are disabled for the stencil pipeline.
+            args.use_tuned = False
+            global_obj.set_sd_obj(
+                StencilPipeline.from_pretrained(
+                    scheduler_obj,
+                    args.import_mlir,
+                    args.hf_model_id,
+                    args.ckpt_loc,
+                    args.custom_vae,
+                    args.precision,
+                    args.max_length,
+                    args.batch_size,
+                    args.height,
+                    args.width,
+                    args.use_base_vae,
+                    args.use_tuned,
+                    low_cpu_mem_usage=args.low_cpu_mem_usage,
+                    use_stencil=use_stencil,
+                    debug=args.import_debug if args.import_mlir else False,
+                    use_lora=args.use_lora,
+                    ondemand=args.ondemand,
+                )
+            )
+        else:
+            global_obj.set_sd_obj(
+                Image2ImagePipeline.from_pretrained(
+                    scheduler_obj,
+                    args.import_mlir,
+                    args.hf_model_id,
+                    args.ckpt_loc,
+                    args.custom_vae,
+                    args.precision,
+                    args.max_length,
+                    args.batch_size,
+                    args.height,
+                    args.width,
+                    args.use_base_vae,
+                    args.use_tuned,
+                    low_cpu_mem_usage=args.low_cpu_mem_usage,
+                    debug=args.import_debug if args.import_mlir else False,
+                    use_lora=args.use_lora,
+                    ondemand=args.ondemand,
+                )
+            )
+
+    # The scheduler is applied on every call, including when the cached
+    # pipeline is reused.
+    global_obj.set_sd_scheduler(args.scheduler)
+
+    start_time = time.time()
+    global_obj.get_sd_obj().log = ""
+    generated_imgs = []
+    seeds = []
+    img_seed = utils.sanitize_seed(seed)
+    extra_info = {"STRENGTH": strength}
+    text_output = ""
+    for current_batch in range(batch_count):
+        # Only the first batch uses the requested seed; later batches ask
+        # sanitize_seed for a new one (presumably random for -1 — verify).
+        if current_batch > 0:
+            img_seed = utils.sanitize_seed(-1)
+        out_imgs = global_obj.get_sd_obj().generate_images(
+            prompt,
+            negative_prompt,
+            image,
+            batch_size,
+            height,
+            width,
+            steps,
+            strength,
+            guidance_scale,
+            img_seed,
+            args.max_length,
+            dtype,
+            args.use_base_vae,
+            cpu_scheduling,
+            use_stencil=use_stencil,
+        )
+        seeds.append(img_seed)
+        # Elapsed time is measured from before the first batch, so it is
+        # cumulative across batches.
+        total_time = time.time() - start_time
+        text_output = get_generation_text_info(seeds, device)
+        text_output += "\n" + global_obj.get_sd_obj().log
+        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
+
+        # On cancellation the current batch is neither saved nor yielded.
+        if global_obj.get_sd_status() == SD_STATE_CANCEL:
+            break
+        else:
+            # Only the first image of the batch is passed to save_output_img;
+            # all images of the batch are added to the yielded list.
+            save_output_img(
+                out_imgs[0],
+                img_seed,
+                extra_info,
+            )
+            generated_imgs.extend(out_imgs)
+            yield generated_imgs, text_output, status_label(
+                "Image-to-Image", current_batch + 1, batch_count, batch_size
+            )
+
+    return generated_imgs, text_output, ""
+
+
+def decode_base64_to_image(encoding):
+    """Decode a base64 string, optionally a ``data:image/`` URI, to an image.
+
+    Args:
+        encoding: base64 text, either bare or prefixed with a
+            ``data:image/<type>;base64,`` header.
+
+    Returns:
+        The image opened from the decoded bytes.
+
+    Raises:
+        HTTPException: status 500 with detail "Invalid encoded image" when
+            the data-URI header is malformed or the payload cannot be
+            decoded and opened as an image.
+    """
+    try:
+        # Header parsing lives inside the try so that a malformed data URI
+        # (missing ";" or ",") is reported like any other invalid payload
+        # instead of escaping as a bare IndexError.
+        if encoding.startswith("data:image/"):
+            encoding = encoding.split(";", 1)[1].split(",", 1)[1]
+        image = Image.open(BytesIO(base64.b64decode(encoding)))
+        return image
+    except Exception as err:
+        print(err)
+        raise HTTPException(
+            status_code=500, detail="Invalid encoded image"
+        ) from err
+
+
+def encode_pil_to_base64(images):
+    """Serialize each image and return the results as base64 byte strings.
+
+    The file format comes from `args.output_img_format` (case-insensitive):
+    "png" saves as PNG, "jpg"/"jpeg" as JPEG. Any other value raises an
+    HTTPException with status 500 once an image is processed.
+    """
+    save_formats = {"png": "PNG", "jpg": "JPEG", "jpeg": "JPEG"}
+    encoded_imgs = []
+    for image in images:
+        with BytesIO() as buffer:
+            save_format = save_formats.get(args.output_img_format.lower())
+            if save_format is None:
+                raise HTTPException(
+                    status_code=500, detail="Invalid image format"
+                )
+            image.save(buffer, format=save_format)
+            encoded_imgs.append(base64.b64encode(buffer.getvalue()))
+    return encoded_imgs
+
+
+# Img2Img Rest API.
+def img2img_api(
+    InputData: dict,
+):
+    """Run a single img2img generation for the REST API.
+
+    Reads "prompt", "negative_prompt", "seed", "init_images", "height",
+    "width", "steps", "denoising_strength" and "cfg_scale" from
+    `InputData`; "hf_model_id" and "use_stencil" are optional. Only the
+    first entry of "init_images" is used.
+
+    Returns:
+        A dict with "images" (base64-encoded images), "parameters" (always
+        an empty dict) and "info" (the text reported by the generation).
+    """
+    print(
+        f'Prompt: {InputData["prompt"]}, Negative Prompt: {InputData["negative_prompt"]}, Seed: {InputData["seed"]}'
+    )
+    init_image = decode_base64_to_image(InputData["init_images"][0])
+    generation = img2img_inf(
+        InputData["prompt"],
+        InputData["negative_prompt"],
+        init_image,
+        InputData["height"],
+        InputData["width"],
+        InputData["steps"],
+        InputData["denoising_strength"],
+        InputData["cfg_scale"],
+        InputData["seed"],
+        batch_count=1,
+        batch_size=1,
+        scheduler="EulerDiscrete",
+        custom_model="None",
+        hf_model_id=InputData.get(
+            "hf_model_id", "stabilityai/stable-diffusion-2-1-base"
+        ),
+        custom_vae="None",
+        precision="fp16",
+        device=available_devices[0],
+        max_length=64,
+        use_stencil=InputData.get("use_stencil", "None"),
+        save_metadata_to_json=False,
+        save_metadata_to_png=False,
+        lora_weights="None",
+        lora_hf_id="",
+        ondemand=False,
+    )
+    # img2img_inf is a generator, so its result cannot be indexed directly.
+    # With batch_count=1 it yields at most once; when it finishes without
+    # yielding (validation failure or cancellation) the result is carried
+    # by StopIteration.value instead.
+    try:
+        res = next(generation)
+    except StopIteration as stop:
+        res = stop.value or ([], "")
+    return {
+        # res[0] is None when validation failed; encode nothing then.
+        "images": encode_pil_to_base64(res[0] or []),
+        "parameters": {},
+        "info": res[1],
+    }


 with gr.Blocks(title="Image-to-Image") as img2img_web:
@@ -30,23 +362,31 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
        with gr.Row():
            with gr.Column(scale=1, min_width=600):
                with gr.Row():
-                    custom_model = gr.Dropdown(
+                    img2img_custom_model = gr.Dropdown(
                        label=f"Models (Custom Model path: {get_custom_model_path()})",
                        elem_id="custom_model",
                        value=os.path.basename(args.ckpt_loc)
                        if args.ckpt_loc
-                        else "None",
+                        else "stabilityai/stable-diffusion-2-1-base",
                        choices=["None"]
                        + get_custom_model_files()
                        + predefined_models,
                    )
-                    hf_model_id = gr.Textbox(
+                    img2img_hf_model_id = gr.Textbox(
                        elem_id="hf_model_id",
-                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3",
+                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3, https://civitai.com/api/download/models/15236",
                        value="",
-                        label="HuggingFace Model ID",
+                        label="HuggingFace Model ID or Civitai model download URL",
                        lines=3,
                    )
+                    custom_vae = gr.Dropdown(
+                        label=f"Custom Vae Models (Path: {get_custom_model_path('vae')})",
+                        elem_id="custom_model",
+                        value=os.path.basename(args.custom_vae)
+                        if args.custom_vae
+                        else "None",
+                        choices=["None"] + get_custom_model_files("vae"),
+                    )

                with gr.Group(elem_id="prompt_box_outer"):
                    prompt = gr.Textbox(
@@ -63,7 +403,10 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                    )

                img2img_init_image = gr.Image(
-                    label="Input Image", type="pil"
+                    label="Input Image",
+                    source="upload",
+                    tool="sketch",
+                    type="pil",
                ).style(height=300)

                with gr.Accordion(label="Stencil Options", open=False):
@@ -74,6 +417,57 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                            value="None",
                            choices=["None", "canny", "openpose", "scribble"],
                        )
+
+                    def show_canvas(choice):
+                        """Toggle the drawing-canvas controls.
+
+                        Returns visibility updates for two sliders and one
+                        button; they are visible only when `choice` is
+                        "scribble".
+                        """
+                        is_scribble = choice == "scribble"
+                        return (
+                            gr.Slider.update(visible=is_scribble),
+                            gr.Slider.update(visible=is_scribble),
+                            gr.Button.update(visible=is_scribble),
+                        )
+
+                    def create_canvas(w, h):
+                        """Return an (h, w, 3) uint8 array filled with 255."""
+                        return np.full((h, w, 3), 255, dtype=np.uint8)
+
+                    with gr.Row():
+                        canvas_width = gr.Slider(
+                            label="Canvas Width",
+                            minimum=256,
+                            maximum=1024,
+                            value=512,
+                            step=1,
+                            visible=False,
+                        )
+                        canvas_height = gr.Slider(
+                            label="Canvas Height",
+                            minimum=256,
+                            maximum=1024,
+                            value=512,
+                            step=1,
+                            visible=False,
+                        )
+                    create_button = gr.Button(
+                        label="Start",
+                        value="Open drawing canvas!",
+                        visible=False,
+                    )
+                    create_button.click(
+                        fn=create_canvas,
+                        inputs=[canvas_width, canvas_height],
+                        outputs=[img2img_init_image],
+                    )
+                    use_stencil.change(
+                        fn=show_canvas,
+                        inputs=use_stencil,
+                        outputs=[canvas_width, canvas_height, create_button],
+                    )
+
                with gr.Accordion(label="LoRA Options", open=False):
                    with gr.Row():
                        lora_weights = gr.Dropdown(
@@ -94,8 +488,8 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                        scheduler = gr.Dropdown(
                            elem_id="scheduler",
                            label="Scheduler",
-                            value="PNDM",
-                            choices=scheduler_list,
+                            value="EulerDiscrete",
+                            choices=scheduler_list_cpu_only,
                        )
                        with gr.Group():
                            save_metadata_to_png = gr.Checkbox(
@@ -205,19 +599,14 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                        label="Generated images",
                        show_label=False,
                        elem_id="gallery",
-                    ).style(grid=[2])
+                    ).style(columns=[2], object_fit="contain")
                    std_output = gr.Textbox(
-                        value="Nothing to show.",
+                        value=f"Images will be saved at {get_generated_imgs_path()}",
                        lines=1,
+                        elem_id="std_output",
                        show_label=False,
                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
+                    img2img_status = gr.Textbox(visible=False)
                with gr.Row():
                    img2img_sendto_inpaint = gr.Button(value="SendTo Inpaint")
                    img2img_sendto_outpaint = gr.Button(
@@ -242,8 +631,9 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                batch_count,
                batch_size,
                scheduler,
-                custom_model,
-                hf_model_id,
+                img2img_custom_model,
+                img2img_hf_model_id,
+                custom_vae,
                precision,
                device,
                max_length,
@@ -254,13 +644,21 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                lora_hf_id,
                ondemand,
            ],
-            outputs=[img2img_gallery, std_output],
+            outputs=[img2img_gallery, std_output, img2img_status],
            show_progress=args.progress_bar,
        )

-        prompt_submit = prompt.submit(**kwargs)
-        neg_prompt_submit = negative_prompt.submit(**kwargs)
-        generate_click = stable_diffusion.click(**kwargs)
+        status_kwargs = dict(
+            fn=lambda bc, bs: status_label("Image-to-Image", 0, bc, bs),
+            inputs=[batch_count, batch_size],
+            outputs=img2img_status,
+        )
+
+        prompt_submit = prompt.submit(**status_kwargs).then(**kwargs)
+        neg_prompt_submit = negative_prompt.submit(**status_kwargs).then(
+            **kwargs
+        )
+        generate_click = stable_diffusion.click(**status_kwargs).then(**kwargs)
        stop_batch.click(
            fn=cancel_sd,
            cancels=[prompt_submit, neg_prompt_submit, generate_click],
--- a/apps/stable_diffusion/web/ui/inpaint_ui.py
+++ b/apps/stable_diffusion/web/ui/inpaint_ui.py
@@ -1,18 +1,299 @@
-from pathlib import Path
 import os
+import torch
+import time
+import sys
 import gradio as gr
 from PIL import Image
-from apps.stable_diffusion.scripts import inpaint_inf
-from apps.stable_diffusion.src import args
+import base64
+from io import BytesIO
+from fastapi.exceptions import HTTPException
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
    nodlogo_loc,
    get_custom_model_path,
    get_custom_model_files,
-    scheduler_list,
+    scheduler_list_cpu_only,
    predefined_paint_models,
    cancel_sd,
 )
+from apps.stable_diffusion.src import (
+    args,
+    InpaintPipeline,
+    get_schedulers,
+    set_init_device_flags,
+    utils,
+    clear_all,
+    save_output_img,
+)
+from apps.stable_diffusion.src.utils import (
+    get_generated_imgs_path,
+    get_generation_text_info,
+)
+from apps.stable_diffusion.web.utils.common_label_calc import status_label
+
+
+# Snapshot the startup values of iree_vulkan_target_triple, use_tuned and
+# import_mlir. inpaint_inf writes these back onto `args` every time it
+# rebuilds the pipeline, right before calling set_init_device_flags().
+init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
+init_use_tuned = args.use_tuned
+init_import_mlir = args.import_mlir
+
+
+# Exposed to UI.
+def inpaint_inf(
+    prompt: str,
+    negative_prompt: str,
+    image_dict,
+    height: int,
+    width: int,
+    inpaint_full_res: bool,
+    inpaint_full_res_padding: int,
+    steps: int,
+    guidance_scale: float,
+    seed: int,
+    batch_count: int,
+    batch_size: int,
+    scheduler: str,
+    custom_model: str,
+    hf_model_id: str,
+    custom_vae: str,
+    precision: str,
+    device: str,
+    max_length: int,
+    save_metadata_to_json: bool,
+    save_metadata_to_png: bool,
+    lora_weights: str,
+    lora_hf_id: str,
+    ondemand: bool,
+):
+    """Run inpainting for the web UI, yielding results after every batch.
+
+    The UI selections are copied onto the global ``args``. The cached
+    pipeline is rebuilt when none exists yet or when the new ``Config``
+    differs from the cached one. Then ``batch_count`` batches are generated.
+
+    This function is a generator. After each completed batch it yields
+    ``(generated_imgs, text_output, status)`` where ``generated_imgs``
+    accumulates every image produced so far. If neither a custom model nor
+    a model ID is given, it yields ``(None, <message>, "")`` once and stops.
+    If the pipeline status is ``SD_STATE_CANCEL`` after a batch, it stops
+    without yielding that batch.
+
+    Args:
+        image_dict: mapping with an "image" entry (the picture) and a
+            "mask" entry (the region to repaint).
+        custom_model: "None", a name containing ".ckpt"/".safetensors"
+            (resolved to a checkpoint path), or any other value, which is
+            used as the model ID.
+        hf_model_id: only consulted when ``custom_model`` is "None"; a value
+            containing "civitai" is used as the checkpoint location.
+        custom_vae: "None" or the name of a VAE file to resolve.
+        seed: seed for the first batch (passed through
+            ``utils.sanitize_seed``); later batches use ``sanitize_seed(-1)``.
+        device: string containing "=>"; the part after it is stored in
+            ``args.device``.
+
+    The remaining parameters are forwarded to ``args``, ``Config`` and
+    ``generate_images`` unchanged.
+    """
+    from apps.stable_diffusion.web.ui.utils import (
+        get_custom_model_pathfile,
+        get_custom_vae_or_lora_weights,
+        Config,
+    )
+    import apps.stable_diffusion.web.utils.global_obj as global_obj
+    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
+        SD_STATE_CANCEL,
+    )
+
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.guidance_scale = guidance_scale
+    args.steps = steps
+    args.scheduler = scheduler
+    args.img_path = "not none"
+    args.mask_path = "not none"
+    args.ondemand = ondemand
+
+    # set ckpt_loc and hf_model_id.
+    args.ckpt_loc = ""
+    args.hf_model_id = ""
+    args.custom_vae = ""
+    if custom_model == "None":
+        if not hf_model_id:
+            # A value passed to `return` inside a generator never reaches
+            # the consumer, so the message has to be yielded (one value per
+            # output: gallery, text, status) before stopping.
+            yield (
+                None,
+                "Please provide either custom model or huggingface model ID, both must not be empty",
+                "",
+            )
+            return
+        if "civitai" in hf_model_id:
+            args.ckpt_loc = hf_model_id
+        else:
+            args.hf_model_id = hf_model_id
+    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
+        args.ckpt_loc = get_custom_model_pathfile(custom_model)
+    else:
+        args.hf_model_id = custom_model
+    if custom_vae != "None":
+        args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
+
+    args.use_lora = get_custom_vae_or_lora_weights(
+        lora_weights, lora_hf_id, "lora"
+    )
+
+    args.save_metadata_to_json = save_metadata_to_json
+    args.write_metadata_to_png = save_metadata_to_png
+
+    dtype = torch.float32 if precision == "fp32" else torch.half
+    cpu_scheduling = not scheduler.startswith("Shark")
+    new_config_obj = Config(
+        "inpaint",
+        args.hf_model_id,
+        args.ckpt_loc,
+        args.custom_vae,
+        precision,
+        batch_size,
+        max_length,
+        height,
+        width,
+        device,
+        use_lora=args.use_lora,
+        use_stencil=None,
+        ondemand=ondemand,
+    )
+    # Rebuild the pipeline only when there is none yet or the configuration
+    # differs from the cached one.
+    if (
+        not global_obj.get_sd_obj()
+        or global_obj.get_cfg_obj() != new_config_obj
+    ):
+        global_obj.clear_cache()
+        global_obj.set_cfg_obj(new_config_obj)
+        args.precision = precision
+        args.batch_count = batch_count
+        args.batch_size = batch_size
+        args.max_length = max_length
+        args.height = height
+        args.width = width
+        args.device = device.split("=>", 1)[1].strip()
+        # Restore the startup values before the device flags are set again.
+        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
+        args.use_tuned = init_use_tuned
+        args.import_mlir = init_import_mlir
+        set_init_device_flags()
+        model_id = (
+            args.hf_model_id
+            if args.hf_model_id
+            else "stabilityai/stable-diffusion-2-inpainting"
+        )
+        global_obj.set_schedulers(get_schedulers(model_id))
+        scheduler_obj = global_obj.get_scheduler(scheduler)
+        global_obj.set_sd_obj(
+            InpaintPipeline.from_pretrained(
+                scheduler=scheduler_obj,
+                import_mlir=args.import_mlir,
+                model_id=args.hf_model_id,
+                ckpt_loc=args.ckpt_loc,
+                custom_vae=args.custom_vae,
+                precision=args.precision,
+                max_length=args.max_length,
+                batch_size=args.batch_size,
+                height=args.height,
+                width=args.width,
+                use_base_vae=args.use_base_vae,
+                use_tuned=args.use_tuned,
+                low_cpu_mem_usage=args.low_cpu_mem_usage,
+                debug=args.import_debug if args.import_mlir else False,
+                use_lora=args.use_lora,
+                ondemand=args.ondemand,
+            )
+        )
+
+    global_obj.set_sd_scheduler(scheduler)
+
+    start_time = time.time()
+    global_obj.get_sd_obj().log = ""
+    generated_imgs = []
+    seeds = []
+    img_seed = utils.sanitize_seed(seed)
+    image = image_dict["image"]
+    mask_image = image_dict["mask"]
+    text_output = ""
+    for i in range(batch_count):
+        # Only the first batch uses the requested seed.
+        if i > 0:
+            img_seed = utils.sanitize_seed(-1)
+        out_imgs = global_obj.get_sd_obj().generate_images(
+            prompt,
+            negative_prompt,
+            image,
+            mask_image,
+            batch_size,
+            height,
+            width,
+            inpaint_full_res,
+            inpaint_full_res_padding,
+            steps,
+            guidance_scale,
+            img_seed,
+            args.max_length,
+            dtype,
+            args.use_base_vae,
+            cpu_scheduling,
+        )
+        seeds.append(img_seed)
+        total_time = time.time() - start_time
+        text_output = get_generation_text_info(seeds, device)
+        text_output += "\n" + global_obj.get_sd_obj().log
+        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
+
+        if global_obj.get_sd_status() == SD_STATE_CANCEL:
+            break
+
+        # Only the first image of the batch is written out; all of them are
+        # added to the results handed to the UI.
+        save_output_img(out_imgs[0], img_seed)
+        generated_imgs.extend(out_imgs)
+        yield generated_imgs, text_output, status_label(
+            "Inpaint", i + 1, batch_count, batch_size
+        )
+
+    return generated_imgs, text_output, ""
+
+
+def decode_base64_to_image(encoding):
+    """Turn a base64 string (optionally a "data:image/..." URI) into a PIL image.
+
+    Raises HTTPException (status 500) when the payload cannot be decoded or
+    opened as an image; the underlying error is printed first.
+    """
+    payload = encoding
+    if payload.startswith("data:image/"):
+        # Drop everything up to the first ";" and then up to the next ",",
+        # leaving only the base64 data.
+        after_semicolon = payload.split(";", 1)[1]
+        payload = after_semicolon.split(",", 1)[1]
+    try:
+        raw_bytes = base64.b64decode(payload)
+        return Image.open(BytesIO(raw_bytes))
+    except Exception as err:
+        print(err)
+        raise HTTPException(status_code=500, detail="Invalid encoded image")
+
+
+def encode_pil_to_base64(images):
+    """Return each image serialized in args.output_img_format and base64-encoded.
+
+    "png" is saved as PNG, "jpg"/"jpeg" as JPEG (case-insensitive). Any other
+    configured format raises HTTPException (status 500) on the first image.
+    """
+    pil_formats = {"png": "PNG", "jpg": "JPEG", "jpeg": "JPEG"}
+    encoded = []
+    for img in images:
+        pil_format = pil_formats.get(args.output_img_format.lower())
+        if pil_format is None:
+            raise HTTPException(status_code=500, detail="Invalid image format")
+        with BytesIO() as buffer:
+            img.save(buffer, format=pil_format)
+            encoded.append(base64.b64encode(buffer.getvalue()))
+    return encoded
+
+
+# Inpaint Rest API.
+def inpaint_api(
+    InputData: dict,
+):
+    """Serve one inpainting request for the REST API.
+
+    Reads "prompt", "negative_prompt", "seed", "image", "mask", "height",
+    "width", "is_full_res", "full_res_padding", "steps" and "cfg_scale" from
+    ``InputData`` ("hf_model_id" is optional) and runs a single batch of one
+    image with fixed scheduler, precision and device settings.
+
+    Returns a dict with "images" (base64-encoded results), "parameters"
+    (always empty) and "info" (the generation text).
+
+    Raises HTTPException if an input image cannot be decoded or if the
+    generation ends without producing any image.
+    """
+    print(
+        f'Prompt: {InputData["prompt"]}, Negative Prompt: {InputData["negative_prompt"]}, Seed: {InputData["seed"]}'
+    )
+    init_image = decode_base64_to_image(InputData["image"])
+    mask = decode_base64_to_image(InputData["mask"])
+    # NOTE(review): the fallback model below is not the inpainting checkpoint
+    # that inpaint_inf falls back to when building schedulers
+    # ("stabilityai/stable-diffusion-2-inpainting") - confirm this is intended.
+    results = inpaint_inf(
+        InputData["prompt"],
+        InputData["negative_prompt"],
+        {"image": init_image, "mask": mask},
+        InputData["height"],
+        InputData["width"],
+        InputData["is_full_res"],
+        InputData["full_res_padding"],
+        InputData["steps"],
+        InputData["cfg_scale"],
+        InputData["seed"],
+        batch_count=1,
+        batch_size=1,
+        scheduler="EulerDiscrete",
+        custom_model="None",
+        hf_model_id=InputData.get(
+            "hf_model_id", "stabilityai/stable-diffusion-2-1-base"
+        ),
+        custom_vae="None",
+        precision="fp16",
+        device=available_devices[0],
+        max_length=64,
+        save_metadata_to_json=False,
+        save_metadata_to_png=False,
+        lora_weights="None",
+        lora_hf_id="",
+        ondemand=False,
+    )
+    # inpaint_inf is a generator and cannot be indexed directly; with
+    # batch_count=1 the first yielded item is the complete result.
+    res = next(results, None)
+    if res is None or res[0] is None:
+        detail = res[1] if res is not None else "Inpainting produced no images"
+        raise HTTPException(status_code=500, detail=detail)
+    return {
+        "images": encode_pil_to_base64(res[0]),
+        "parameters": {},
+        "info": res[1],
+    }


 with gr.Blocks(title="Inpainting") as inpaint_web:
@@ -30,23 +311,33 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
        with gr.Row():
            with gr.Column(scale=1, min_width=600):
                with gr.Row():
-                    custom_model = gr.Dropdown(
+                    inpaint_custom_model = gr.Dropdown(
                        label=f"Models (Custom Model path: {get_custom_model_path()})",
                        elem_id="custom_model",
                        value=os.path.basename(args.ckpt_loc)
                        if args.ckpt_loc
-                        else "None",
+                        else "stabilityai/stable-diffusion-2-inpainting",
                        choices=["None"]
-                        + get_custom_model_files()
+                        + get_custom_model_files(
+                            custom_checkpoint_type="inpainting"
+                        )
                        + predefined_paint_models,
                    )
-                    hf_model_id = gr.Textbox(
+                    inpaint_hf_model_id = gr.Textbox(
                        elem_id="hf_model_id",
-                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: ghunkins/stable-diffusion-liberty-inpainting",
+                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: ghunkins/stable-diffusion-liberty-inpainting, https://civitai.com/api/download/models/3433",
                        value="",
-                        label="HuggingFace Model ID",
+                        label="HuggingFace Model ID or Civitai model download URL",
                        lines=3,
                    )
+                    custom_vae = gr.Dropdown(
+                        label=f"Custom Vae Models (Path: {get_custom_model_path('vae')})",
+                        elem_id="custom_model",
+                        value=os.path.basename(args.custom_vae)
+                        if args.custom_vae
+                        else "None",
+                        choices=["None"] + get_custom_model_files("vae"),
+                    )

                with gr.Group(elem_id="prompt_box_outer"):
                    prompt = gr.Textbox(
@@ -89,8 +380,8 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
                        scheduler = gr.Dropdown(
                            elem_id="scheduler",
                            label="Scheduler",
-                            value="PNDM",
-                            choices=scheduler_list,
+                            value="EulerDiscrete",
+                            choices=scheduler_list_cpu_only,
                        )
                        with gr.Group():
                            save_metadata_to_png = gr.Checkbox(
@@ -207,19 +498,15 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
                        label="Generated images",
                        show_label=False,
                        elem_id="gallery",
-                    ).style(grid=[2])
+                    ).style(columns=[2], object_fit="contain")
                    std_output = gr.Textbox(
-                        value="Nothing to show.",
+                        value=f"Images will be saved at {get_generated_imgs_path()}",
                        lines=1,
+                        elem_id="std_output",
                        show_label=False,
                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
+                    inpaint_status = gr.Textbox(visible=False)
+
                with gr.Row():
                    inpaint_sendto_img2img = gr.Button(value="SendTo Img2Img")
                    inpaint_sendto_outpaint = gr.Button(
@@ -245,8 +532,9 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
                batch_count,
                batch_size,
                scheduler,
-                custom_model,
-                hf_model_id,
+                inpaint_custom_model,
+                inpaint_hf_model_id,
+                custom_vae,
                precision,
                device,
                max_length,
@@ -256,13 +544,20 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
                lora_hf_id,
                ondemand,
            ],
-            outputs=[inpaint_gallery, std_output],
+            outputs=[inpaint_gallery, std_output, inpaint_status],
            show_progress=args.progress_bar,
        )
+        status_kwargs = dict(
+            fn=lambda bc, bs: status_label("Inpaint", 0, bc, bs),
+            inputs=[batch_count, batch_size],
+            outputs=inpaint_status,
+        )

-        prompt_submit = prompt.submit(**kwargs)
-        neg_prompt_submit = negative_prompt.submit(**kwargs)
-        generate_click = stable_diffusion.click(**kwargs)
+        prompt_submit = prompt.submit(**status_kwargs).then(**kwargs)
+        neg_prompt_submit = negative_prompt.submit(**status_kwargs).then(
+            **kwargs
+        )
+        generate_click = stable_diffusion.click(**status_kwargs).then(**kwargs)
        stop_batch.click(
            fn=cancel_sd,
            cancels=[prompt_submit, neg_prompt_submit, generate_click],
--- a/apps/stable_diffusion/web/ui/lora_train_ui.py
+++ b/apps/stable_diffusion/web/ui/lora_train_ui.py
@@ -9,7 +9,8 @@ from apps.stable_diffusion.web.ui.utils import (
    nodlogo_loc,
    get_custom_model_path,
    get_custom_model_files,
-    scheduler_list_txt2img,
+    get_custom_vae_or_lora_weights,
+    scheduler_list,
    predefined_models,
 )

@@ -48,6 +49,20 @@ with gr.Blocks(title="Lora Training") as lora_train_web:
                                lines=3,
                            )

+                with gr.Row():
+                    lora_weights = gr.Dropdown(
+                        label=f"Standlone LoRA weights to initialize weights (Path: {get_custom_model_path('lora')})",
+                        elem_id="lora_weights",
+                        value="None",
+                        choices=["None"] + get_custom_model_files("lora"),
+                    )
+                    lora_hf_id = gr.Textbox(
+                        elem_id="lora_hf_id",
+                        placeholder="Select 'None' in the Standlone LoRA weights dropdown on the left if you want to use a standalone HuggingFace model ID for LoRA here e.g: sayakpaul/sd-model-finetuned-lora-t4",
+                        value="",
+                        label="HuggingFace Model ID to initialize weights",
+                        lines=3,
+                    )
                with gr.Group(elem_id="image_dir_box_outer"):
                    training_images_dir = gr.Textbox(
                        label="ImageDirectory",
@@ -68,7 +83,7 @@ with gr.Blocks(title="Lora Training") as lora_train_web:
                            elem_id="scheduler",
                            label="Scheduler",
                            value=args.scheduler,
-                            choices=scheduler_list_txt2img,
+                            choices=scheduler_list,
                        )
                    with gr.Row():
                        height = gr.Slider(
@@ -195,6 +210,9 @@ with gr.Blocks(title="Lora Training") as lora_train_web:
                max_length,
                training_images_dir,
                output_loc,
+                get_custom_vae_or_lora_weights(
+                    lora_weights, lora_hf_id, "lora"
+                ),
            ],
            outputs=[std_output],
            show_progress=args.progress_bar,
--- a/apps/stable_diffusion/web/ui/model_manager.py
+++ b/apps/stable_diffusion/web/ui/model_manager.py
@@ -0,0 +1,157 @@
+import os
+import gradio as gr
+import requests
+from io import BytesIO
+from PIL import Image
+
+
+def get_hf_list(num_of_models=20):
+    """Query the Hugging Face models API for "stable-diffusion" models.
+
+    Requests at most ``num_of_models`` entries sorted by downloads
+    (direction "-1") with full metadata, and returns the decoded JSON
+    response.
+    """
+    path = "https://huggingface.co/api/models"
+    params = {
+        "search": "stable-diffusion",
+        "sort": "downloads",
+        "direction": "-1",
+        # Pass the number itself; the previous `{num_of_models}` was a
+        # one-element set literal, not a value.
+        "limit": num_of_models,
+        "full": "true",
+    }
+    response = requests.get(path, params=params)
+    return response.json()
+
+
+def get_civit_list(num_of_models=50):
+    """Fetch Civitai checkpoint models and return their first versions.
+
+    A model is kept only if it is not flagged nsfw, the first file of its
+    first version has metadata format "SafeTensor", and the first image of
+    that version has nsfw equal to "None". Each returned version dict is
+    annotated in place with the parent model's name ("modelName") and its
+    "rating", "favoriteCount" and "downloadCount" stats.
+    """
+    url = f"https://civitai.com/api/v1/models?limit={num_of_models}&types=Checkpoint"
+    response = requests.get(url, headers={"Content-Type": "application/json"})
+    payload = response.json()
+    # The model list is the value stored under the first key of the response.
+    all_models = list(payload.values())[0]
+    version_idx = 0  # Currently just using the first version.
+
+    not_flagged = [entry for entry in all_models if not entry["nsfw"]]
+    safetensor_only = [
+        entry
+        for entry in not_flagged
+        if entry["modelVersions"][version_idx]["files"][0]["metadata"]["format"]
+        == "SafeTensor"
+    ]
+
+    picked_versions = []
+    for entry in safetensor_only:
+        # The modelVersion would only keep the version name.
+        version = entry["modelVersions"][version_idx]
+        if version["images"][0]["nsfw"] == "None":
+            version["modelName"] = entry["name"]
+            version["rating"] = entry["stats"]["rating"]
+            version["favoriteCount"] = entry["stats"]["favoriteCount"]
+            version["downloadCount"] = entry["stats"]["downloadCount"]
+            picked_versions.append(version)
+    return picked_versions
+
+
+def get_image_from_model(model_json):
+    """Download the first image of ``model_json`` whose nsfw tag is "None".
+
+    Returns the image bytes wrapped in a BytesIO, or None when no image in
+    ``model_json["images"]`` has that tag.
+    """
+    for img_info in model_json["images"]:
+        if img_info["nsfw"] == "None":
+            # Fetch the image that passed the check, not always images[0].
+            response = requests.get(img_info["url"])
+            return BytesIO(response.content)
+    return None
+
+
+# Model manager tab: pick a source and a count, press "Get Models", and the
+# matching list (Hugging Face dropdown or Civitai gallery) is made visible.
+with gr.Blocks() as model_web:
+    with gr.Row():
+        model_source = gr.Radio(
+            value=None,
+            choices=["Hugging Face", "Civitai"],
+            type="value",
+            label="Model Source",
+        )
+        model_numebr = gr.Slider(
+            1,
+            100,
+            value=10,
+            step=1,
+            label="Number of models",
+            interactive=True,
+        )
+        # TODO: add more filters
+    get_model_btn = gr.Button(value="Get Models")
+
+    # Both result widgets start hidden; get_model_list reveals one of them.
+    hf_models = gr.Dropdown(
+        label="Hugging Face Model List",
+        choices=None,
+        value=None,
+        visible=False,
+    )
+    # TODO: select and SendTo
+    civit_models = gr.Gallery(
+        label="Civitai Model Gallery",
+        value=None,
+        interactive=True,
+        visible=False,
+    )
+
+    # SendTo buttons, hidden until a Hugging Face list has been fetched.
+    with gr.Row(visible=False) as sendto_btns:
+        modelmanager_sendto_txt2img = gr.Button(value="SendTo Txt2Img")
+        modelmanager_sendto_img2img = gr.Button(value="SendTo Img2Img")
+        modelmanager_sendto_inpaint = gr.Button(value="SendTo Inpaint")
+        modelmanager_sendto_outpaint = gr.Button(value="SendTo Outpaint")
+        modelmanager_sendto_upscaler = gr.Button(value="SendTo Upscaler")
+
+    def get_model_list(model_source, model_numebr):
+        """Build the updates for (hf_models, civit_models, sendto_btns).
+
+        "Hugging Face" fills and shows the dropdown and the SendTo row;
+        "Civitai" fills and shows the gallery with (image, download URL)
+        pairs and keeps the SendTo row hidden; any other source hides all
+        three.
+        """
+        if model_source == "Hugging Face":
+            hf_model_list = get_hf_list(model_numebr)
+            models = []
+            for model in hf_model_list:
+                # TODO: add model info
+                models.append(f'{model["modelId"]}')
+            return (
+                gr.Dropdown.update(choices=models, visible=True),
+                gr.Gallery.update(value=None, visible=False),
+                gr.Row.update(visible=True),
+            )
+        elif model_source == "Civitai":
+            civit_model_list = get_civit_list(model_numebr)
+            models = []
+            for model in civit_model_list:
+                image = get_image_from_model(model)
+                # Models without a usable preview image are left out.
+                if image is None:
+                    continue
+                # TODO: add model info
+                models.append(
+                    (Image.open(image), f'{model["files"][0]["downloadUrl"]}')
+                )
+            return (
+                gr.Dropdown.update(value=None, choices=None, visible=False),
+                gr.Gallery.update(value=models, visible=True),
+                gr.Row.update(visible=False),
+            )
+        else:
+            return (
+                gr.Dropdown.update(value=None, choices=None, visible=False),
+                gr.Gallery.update(value=None, visible=False),
+                gr.Row.update(visible=False),
+            )
+
+    # The three returned updates map onto the outputs in the same order.
+    get_model_btn.click(
+        fn=get_model_list,
+        inputs=[model_source, model_numebr],
+        outputs=[
+            hf_models,
+            civit_models,
+            sendto_btns,
+        ],
+    )
--- a/apps/stable_diffusion/web/ui/outpaint_ui.py
+++ b/apps/stable_diffusion/web/ui/outpaint_ui.py
@@ -1,18 +1,308 @@
-from pathlib import Path
 import os
+import torch
+import time
 import gradio as gr
 from PIL import Image
-from apps.stable_diffusion.scripts import outpaint_inf
-from apps.stable_diffusion.src import args
+import base64
+from io import BytesIO
+from fastapi.exceptions import HTTPException
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
    nodlogo_loc,
    get_custom_model_path,
    get_custom_model_files,
-    scheduler_list,
+    scheduler_list_cpu_only,
    predefined_paint_models,
    cancel_sd,
 )
+from apps.stable_diffusion.src import (
+    args,
+    OutpaintPipeline,
+    get_schedulers,
+    set_init_device_flags,
+    utils,
+    save_output_img,
+)
+from apps.stable_diffusion.src.utils import (
+    get_generated_imgs_path,
+    get_generation_text_info,
+)
+from apps.stable_diffusion.web.utils.common_label_calc import status_label
+
+
+# Snapshot the startup values of iree_vulkan_target_triple, use_tuned and
+# import_mlir. outpaint_inf writes these back onto `args` every time it
+# rebuilds the pipeline, right before calling set_init_device_flags().
+init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
+init_use_tuned = args.use_tuned
+init_import_mlir = args.import_mlir
+
+
+# Exposed to UI.
+def outpaint_inf(
+    prompt: str,
+    negative_prompt: str,
+    init_image,
+    pixels: int,
+    mask_blur: int,
+    directions: list,
+    noise_q: float,
+    color_variation: float,
+    height: int,
+    width: int,
+    steps: int,
+    guidance_scale: float,
+    seed: int,
+    batch_count: int,
+    batch_size: int,
+    scheduler: str,
+    custom_model: str,
+    hf_model_id: str,
+    custom_vae: str,
+    precision: str,
+    device: str,
+    max_length: int,
+    save_metadata_to_json: bool,
+    save_metadata_to_png: bool,
+    lora_weights: str,
+    lora_hf_id: str,
+    ondemand: bool,
+):
+    """Run outpainting for the web UI, yielding results after every batch.
+
+    The UI selections are copied onto the global ``args``. The cached
+    pipeline is rebuilt when none exists yet or when the new ``Config``
+    differs from the cached one. Then ``batch_count`` batches are generated.
+
+    This function is a generator. After each completed batch it yields
+    ``(generated_imgs, text_output, status)`` where ``generated_imgs``
+    accumulates every image produced so far. If neither a custom model nor
+    a model ID is given, it yields ``(None, <message>, "")`` once and stops.
+    If the pipeline status is ``SD_STATE_CANCEL`` after a batch, it stops
+    without yielding that batch.
+
+    Args:
+        init_image: the picture to extend.
+        directions: collection that may contain "left", "right", "up" and
+            "down"; each one present enables extension on that side.
+        custom_model: "None", a name containing ".ckpt"/".safetensors"
+            (resolved to a checkpoint path), or any other value, which is
+            used as the model ID.
+        hf_model_id: only consulted when ``custom_model`` is "None"; a value
+            containing "civitai" is used as the checkpoint location.
+        custom_vae: "None" or the name of a VAE file to resolve.
+        seed: seed for the first batch (passed through
+            ``utils.sanitize_seed``); later batches use ``sanitize_seed(-1)``.
+        device: string containing "=>"; the part after it is stored in
+            ``args.device``.
+
+    The remaining parameters are forwarded to ``args``, ``Config`` and
+    ``generate_images`` unchanged.
+    """
+    from apps.stable_diffusion.web.ui.utils import (
+        get_custom_model_pathfile,
+        get_custom_vae_or_lora_weights,
+        Config,
+    )
+    import apps.stable_diffusion.web.utils.global_obj as global_obj
+    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
+        SD_STATE_CANCEL,
+    )
+
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.guidance_scale = guidance_scale
+    args.steps = steps
+    args.scheduler = scheduler
+    args.img_path = "not none"
+    args.ondemand = ondemand
+
+    # set ckpt_loc and hf_model_id.
+    args.ckpt_loc = ""
+    args.hf_model_id = ""
+    args.custom_vae = ""
+    if custom_model == "None":
+        if not hf_model_id:
+            # A value passed to `return` inside a generator never reaches
+            # the consumer, so the message has to be yielded (one value per
+            # output: gallery, text, status) before stopping.
+            yield (
+                None,
+                "Please provide either custom model or huggingface model ID, both must not be empty",
+                "",
+            )
+            return
+        if "civitai" in hf_model_id:
+            args.ckpt_loc = hf_model_id
+        else:
+            args.hf_model_id = hf_model_id
+    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
+        args.ckpt_loc = get_custom_model_pathfile(custom_model)
+    else:
+        args.hf_model_id = custom_model
+    if custom_vae != "None":
+        args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
+
+    args.use_lora = get_custom_vae_or_lora_weights(
+        lora_weights, lora_hf_id, "lora"
+    )
+
+    args.save_metadata_to_json = save_metadata_to_json
+    args.write_metadata_to_png = save_metadata_to_png
+
+    dtype = torch.float32 if precision == "fp32" else torch.half
+    cpu_scheduling = not scheduler.startswith("Shark")
+    new_config_obj = Config(
+        "outpaint",
+        args.hf_model_id,
+        args.ckpt_loc,
+        args.custom_vae,
+        precision,
+        batch_size,
+        max_length,
+        height,
+        width,
+        device,
+        use_lora=args.use_lora,
+        use_stencil=None,
+        ondemand=ondemand,
+    )
+    # Rebuild the pipeline only when there is none yet or the configuration
+    # differs from the cached one.
+    if (
+        not global_obj.get_sd_obj()
+        or global_obj.get_cfg_obj() != new_config_obj
+    ):
+        global_obj.clear_cache()
+        global_obj.set_cfg_obj(new_config_obj)
+        args.precision = precision
+        args.batch_count = batch_count
+        args.batch_size = batch_size
+        args.max_length = max_length
+        args.height = height
+        args.width = width
+        args.device = device.split("=>", 1)[1].strip()
+        # Restore the startup values before the device flags are set again.
+        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
+        args.use_tuned = init_use_tuned
+        args.import_mlir = init_import_mlir
+        set_init_device_flags()
+        model_id = (
+            args.hf_model_id
+            if args.hf_model_id
+            else "stabilityai/stable-diffusion-2-inpainting"
+        )
+        global_obj.set_schedulers(get_schedulers(model_id))
+        scheduler_obj = global_obj.get_scheduler(scheduler)
+        global_obj.set_sd_obj(
+            OutpaintPipeline.from_pretrained(
+                scheduler_obj,
+                args.import_mlir,
+                args.hf_model_id,
+                args.ckpt_loc,
+                args.custom_vae,
+                args.precision,
+                args.max_length,
+                args.batch_size,
+                args.height,
+                args.width,
+                args.use_base_vae,
+                args.use_tuned,
+                use_lora=args.use_lora,
+                ondemand=args.ondemand,
+            )
+        )
+
+    global_obj.set_sd_scheduler(scheduler)
+
+    start_time = time.time()
+    global_obj.get_sd_obj().log = ""
+    generated_imgs = []
+    seeds = []
+    img_seed = utils.sanitize_seed(seed)
+
+    # The UI names the vertical directions "up"/"down".
+    left = "left" in directions
+    right = "right" in directions
+    top = "up" in directions
+    bottom = "down" in directions
+
+    text_output = ""
+    for i in range(batch_count):
+        # Only the first batch uses the requested seed.
+        if i > 0:
+            img_seed = utils.sanitize_seed(-1)
+        out_imgs = global_obj.get_sd_obj().generate_images(
+            prompt,
+            negative_prompt,
+            init_image,
+            pixels,
+            mask_blur,
+            left,
+            right,
+            top,
+            bottom,
+            noise_q,
+            color_variation,
+            batch_size,
+            height,
+            width,
+            steps,
+            guidance_scale,
+            img_seed,
+            args.max_length,
+            dtype,
+            args.use_base_vae,
+            cpu_scheduling,
+        )
+        seeds.append(img_seed)
+        total_time = time.time() - start_time
+        text_output = get_generation_text_info(seeds, device)
+        text_output += "\n" + global_obj.get_sd_obj().log
+        text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
+
+        if global_obj.get_sd_status() == SD_STATE_CANCEL:
+            break
+
+        # Only the first image of the batch is written out; all of them are
+        # added to the results handed to the UI.
+        save_output_img(out_imgs[0], img_seed)
+        generated_imgs.extend(out_imgs)
+        yield generated_imgs, text_output, status_label(
+            "Outpaint", i + 1, batch_count, batch_size
+        )
+
+    return generated_imgs, text_output, ""
+
+
+def decode_base64_to_image(encoding):
+    """Turn a base64 string (optionally a "data:image/..." URI) into a PIL image.
+
+    Raises HTTPException (status 500) when the payload cannot be decoded or
+    opened as an image; the underlying error is printed first.
+    """
+    payload = encoding
+    if payload.startswith("data:image/"):
+        # Drop everything up to the first ";" and then up to the next ",",
+        # leaving only the base64 data.
+        after_semicolon = payload.split(";", 1)[1]
+        payload = after_semicolon.split(",", 1)[1]
+    try:
+        raw_bytes = base64.b64decode(payload)
+        return Image.open(BytesIO(raw_bytes))
+    except Exception as err:
+        print(err)
+        raise HTTPException(status_code=500, detail="Invalid encoded image")
+
+
+def encode_pil_to_base64(images):
+    """Return each image serialized in args.output_img_format and base64-encoded.
+
+    "png" is saved as PNG, "jpg"/"jpeg" as JPEG (case-insensitive). Any other
+    configured format raises HTTPException (status 500) on the first image.
+    """
+    pil_formats = {"png": "PNG", "jpg": "JPEG", "jpeg": "JPEG"}
+    encoded = []
+    for img in images:
+        pil_format = pil_formats.get(args.output_img_format.lower())
+        if pil_format is None:
+            raise HTTPException(status_code=500, detail="Invalid image format")
+        with BytesIO() as buffer:
+            img.save(buffer, format=pil_format)
+            encoded.append(base64.b64encode(buffer.getvalue()))
+    return encoded
+
+
+# Outpaint Rest API.
+def outpaint_api(
+    InputData: dict,
+):
+    """Serve one outpainting request for the REST API.
+
+    Reads "prompt", "negative_prompt", "seed", "init_images" (first entry
+    only), "pixels", "mask_blur", "directions", "noise_q",
+    "color_variation", "height", "width", "steps" and "cfg_scale" from
+    ``InputData`` ("hf_model_id" is optional) and runs a single batch of one
+    image with fixed scheduler, precision and device settings.
+
+    Returns a dict with "images" (base64-encoded results), "parameters"
+    (always empty) and "info" (the generation text).
+
+    Raises HTTPException if the input image cannot be decoded or if the
+    generation ends without producing any image.
+    """
+    print(
+        f'Prompt: {InputData["prompt"]}, Negative Prompt: {InputData["negative_prompt"]}, Seed: {InputData["seed"]}'
+    )
+    init_image = decode_base64_to_image(InputData["init_images"][0])
+    results = outpaint_inf(
+        InputData["prompt"],
+        InputData["negative_prompt"],
+        init_image,
+        InputData["pixels"],
+        InputData["mask_blur"],
+        InputData["directions"],
+        InputData["noise_q"],
+        InputData["color_variation"],
+        InputData["height"],
+        InputData["width"],
+        InputData["steps"],
+        InputData["cfg_scale"],
+        InputData["seed"],
+        batch_count=1,
+        batch_size=1,
+        scheduler="EulerDiscrete",
+        custom_model="None",
+        hf_model_id=InputData.get(
+            "hf_model_id", "stabilityai/stable-diffusion-2-1-base"
+        ),
+        custom_vae="None",
+        precision="fp16",
+        device=available_devices[0],
+        max_length=64,
+        save_metadata_to_json=False,
+        save_metadata_to_png=False,
+        lora_weights="None",
+        lora_hf_id="",
+        ondemand=False,
+    )
+    # outpaint_inf is a generator and cannot be indexed directly; with
+    # batch_count=1 the first yielded item is the complete result.
+    res = next(results, None)
+    if res is None or res[0] is None:
+        detail = res[1] if res is not None else "Outpainting produced no images"
+        raise HTTPException(status_code=500, detail=detail)
+    return {
+        "images": encode_pil_to_base64(res[0]),
+        "parameters": {},
+        "info": res[1],
+    }


 with gr.Blocks(title="Outpainting") as outpaint_web:
@@ -30,23 +320,33 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
        with gr.Row():
            with gr.Column(scale=1, min_width=600):
                with gr.Row():
-                    custom_model = gr.Dropdown(
+                    outpaint_custom_model = gr.Dropdown(
                        label=f"Models (Custom Model path: {get_custom_model_path()})",
                        elem_id="custom_model",
                        value=os.path.basename(args.ckpt_loc)
                        if args.ckpt_loc
-                        else "None",
+                        else "stabilityai/stable-diffusion-2-inpainting",
                        choices=["None"]
-                        + get_custom_model_files()
+                        + get_custom_model_files(
+                            custom_checkpoint_type="inpainting"
+                        )
                        + predefined_paint_models,
                    )
-                    hf_model_id = gr.Textbox(
+                    outpaint_hf_model_id = gr.Textbox(
                        elem_id="hf_model_id",
-                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: ghunkins/stable-diffusion-liberty-inpainting",
+                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: ghunkins/stable-diffusion-liberty-inpainting, https://civitai.com/api/download/models/3433",
                        value="",
-                        label="HuggingFace Model ID",
+                        label="HuggingFace Model ID or Civitai model download URL",
                        lines=3,
                    )
+                    custom_vae = gr.Dropdown(
+                        label=f"Custom Vae Models (Path: {get_custom_model_path('vae')})",
+                        elem_id="custom_model",
+                        value=os.path.basename(args.custom_vae)
+                        if args.custom_vae
+                        else "None",
+                        choices=["None"] + get_custom_model_files("vae"),
+                    )

                with gr.Group(elem_id="prompt_box_outer"):
                    prompt = gr.Textbox(
@@ -86,8 +386,8 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
                        scheduler = gr.Dropdown(
                            elem_id="scheduler",
                            label="Scheduler",
-                            value="PNDM",
-                            choices=scheduler_list,
+                            value="EulerDiscrete",
+                            choices=scheduler_list_cpu_only,
                        )
                        with gr.Group():
                            save_metadata_to_png = gr.Checkbox(
@@ -226,19 +526,14 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
                        label="Generated images",
                        show_label=False,
                        elem_id="gallery",
-                    ).style(grid=[2])
+                    ).style(columns=[2], object_fit="contain")
                    std_output = gr.Textbox(
-                        value="Nothing to show.",
+                        value=f"Images will be saved at {get_generated_imgs_path()}",
                        lines=1,
+                        elem_id="std_output",
                        show_label=False,
                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
+                    outpaint_status = gr.Textbox(visible=False)
                with gr.Row():
                    outpaint_sendto_img2img = gr.Button(value="SendTo Img2Img")
                    outpaint_sendto_inpaint = gr.Button(value="SendTo Inpaint")
@@ -265,8 +560,9 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
                batch_count,
                batch_size,
                scheduler,
-                custom_model,
-                hf_model_id,
+                outpaint_custom_model,
+                outpaint_hf_model_id,
+                custom_vae,
                precision,
                device,
                max_length,
@@ -276,13 +572,20 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
                lora_hf_id,
                ondemand,
            ],
-            outputs=[outpaint_gallery, std_output],
+            outputs=[outpaint_gallery, std_output, outpaint_status],
            show_progress=args.progress_bar,
        )
+        status_kwargs = dict(
+            fn=lambda bc, bs: status_label("Outpaint", 0, bc, bs),
+            inputs=[batch_count, batch_size],
+            outputs=outpaint_status,
+        )

-        prompt_submit = prompt.submit(**kwargs)
-        neg_prompt_submit = negative_prompt.submit(**kwargs)
-        generate_click = stable_diffusion.click(**kwargs)
+        prompt_submit = prompt.submit(**status_kwargs).then(**kwargs)
+        neg_prompt_submit = negative_prompt.submit(**status_kwargs).then(
+            **kwargs
+        )
+        generate_click = stable_diffusion.click(**status_kwargs).then(**kwargs)
        stop_batch.click(
            fn=cancel_sd,
            cancels=[prompt_submit, neg_prompt_submit, generate_click],
--- a/apps/stable_diffusion/web/ui/outputgallery_ui.py
+++ b/apps/stable_diffusion/web/ui/outputgallery_ui.py
@@ -0,0 +1,450 @@
+import glob
+import gradio as gr
+import os
+from PIL import Image
+
+from apps.stable_diffusion.src import args
+from apps.stable_diffusion.src.utils import (
+    get_generated_imgs_path,
+    get_generated_imgs_todays_subdir,
+)
+from apps.stable_diffusion.web.ui.utils import nodlogo_loc
+from apps.stable_diffusion.web.utils.png_metadata import (
+    parse_generation_parameters,
+)
+from apps.stable_diffusion.web.utils.exif_metadata import parse_exif
+
+# -- Functions for file, directory and image info querying
+
+output_dir = get_generated_imgs_path()
+
+
+def outputgallery_filenames(subdir) -> list[str]:
+    new_dir_path = os.path.join(output_dir, subdir)
+    if os.path.exists(new_dir_path):
+        filenames = [
+            glob.glob(new_dir_path + "/" + ext)
+            for ext in ("*.png", "*.jpg", "*.jpeg")
+        ]
+        # flatten the per-extension lists, most recently modified file first
+        return sorted(sum(filenames, []), key=os.path.getmtime, reverse=True)
+    else:
+        return []
+
+
+def parameters_for_display(image_filename) -> tuple[str, list[list[str]]]:
+    """Return ("params" | "exif" | None, rows) describing the image's metadata."""
+    pil_image = Image.open(image_filename)
+    # we have PNG generation parameters
+    if "parameters" in pil_image.info:
+        params = parse_generation_parameters(pil_image.info["parameters"])
+
+        # make showing the sizes more compact by using only one line each
+        # (merge only when both halves are present, so pop() cannot raise KeyError)
+        if params.keys() >= {"Size-1", "Size-2"}:
+            params["Size"] = f"{params.pop('Size-1')}x{params.pop('Size-2')}"
+
+        if params.keys() >= {"Hires resize-1", "Hires resize-2"}:
+            hires_x = params.pop("Hires resize-1")
+            hires_y = params.pop("Hires resize-2")
+            if hires_x == 0 and hires_y == 0:
+                params["Hires resize"] = "None"
+            else:
+                params["Hires resize"] = f"{hires_x}x{hires_y}"
+
+        return "params", list(map(list, params.items()))
+
+    # we have EXIF data, but no generation parameters we know how to read
+    elif pil_image.getexif():
+        return "exif", list(map(list, parse_exif(pil_image).items()))
+
+    # couldn't find anything
+    else:
+        return None, None
+
+
+def output_subdirs() -> list[str]:
+    # Gets a list of subdirectories of output_dir and below, as relative paths.
+    relative_paths = [
+        os.path.relpath(entry[0], output_dir)
+        for entry in os.walk(
+            output_dir, followlinks=args.output_gallery_followlinks
+        )
+    ]
+
+    # It is less confusing to always include the subdir that will take any images generated
+    # today, even if it doesn't exist yet
+    if get_generated_imgs_todays_subdir() not in relative_paths:
+        relative_paths.append(get_generated_imgs_todays_subdir())
+
+    # sort subdirectories so that the numerically named ones (presumably the date named ones created
+    # in this or previous sessions) come first, in descending order. Other subdirs are listed after.
+    generated_paths = sorted(
+        [path for path in relative_paths if path.isnumeric()], reverse=True
+    )
+    result_paths = generated_paths + sorted(
+        [
+            path
+            for path in relative_paths
+            if (not path.isnumeric()) and path != "."
+        ]
+    )
+
+    return result_paths
+
+
+# --- Define UI layout for Gradio
+
+with gr.Blocks() as outputgallery_web:
+    nod_logo = Image.open(nodlogo_loc)
+
+    with gr.Row(elem_id="outputgallery_gallery"):
+        # needed to workaround gradio issue: https://github.com/gradio-app/gradio/issues/2907
+        dev_null = gr.Textbox("", visible=False)
+
+        gallery_files = gr.State(value=[])
+        subdirectory_paths = gr.State(value=[])
+
+        with gr.Column(scale=6):
+            logo = gr.Image(
+                label="Getting subdirectories...",
+                value=nod_logo,
+                interactive=False,
+                visible=True,
+                show_label=True,
+                elem_id="top_logo",
+                elem_classes="logo_centered",
+            )
+
+            gallery = gr.Gallery(
+                label="",
+                value=gallery_files.value,
+                visible=False,
+                show_label=True,
+            ).style(grid=4)
+
+        with gr.Column(scale=4):
+            with gr.Box():
+                with gr.Row():
+                    with gr.Column(scale=16, min_width=160):
+                        subdirectories = gr.Dropdown(
+                            label=f"Subdirectories of {output_dir}",
+                            type="value",
+                            choices=subdirectory_paths.value,
+                            value="",
+                            interactive=True,
+                        ).style(container=False)
+                    with gr.Column(
+                        scale=1, min_width=32, elem_id="output_refresh_button"
+                    ):
+                        refresh = gr.Button(
+                            variant="secondary",
+                            value="\u21BB",  # unicode clockwise arrow circle
+                        ).style(size="sm")
+
+            image_columns = gr.Slider(
+                label="Columns shown", value=4, minimum=1, maximum=16, step=1
+            )
+            outputgallery_filename = gr.Textbox(
+                label="Filename", value="None", interactive=False
+            ).style(show_copy_button=True)
+
+            with gr.Accordion(
+                label="Parameter Information", open=False
+            ) as parameters_accordian:
+                image_parameters = gr.DataFrame(
+                    headers=["Parameter", "Value"],
+                    col_count=2,
+                    wrap=True,
+                    elem_classes="output_parameters_dataframe",
+                    value=[["Status", "No image selected"]],
+                )
+
+            with gr.Accordion(label="Send To", open=True):
+                with gr.Row():
+                    outputgallery_sendto_txt2img = gr.Button(
+                        value="Txt2Img",
+                        interactive=False,
+                        elem_classes="outputgallery_sendto",
+                    ).style(size="sm")
+
+                    outputgallery_sendto_img2img = gr.Button(
+                        value="Img2Img",
+                        interactive=False,
+                        elem_classes="outputgallery_sendto",
+                    ).style(size="sm")
+
+                    outputgallery_sendto_inpaint = gr.Button(
+                        value="Inpaint",
+                        interactive=False,
+                        elem_classes="outputgallery_sendto",
+                    ).style(size="sm")
+
+                    outputgallery_sendto_outpaint = gr.Button(
+                        value="Outpaint",
+                        interactive=False,
+                        elem_classes="outputgallery_sendto",
+                    ).style(size="sm")
+
+                    outputgallery_sendto_upscaler = gr.Button(
+                        value="Upscaler",
+                        interactive=False,
+                        elem_classes="outputgallery_sendto",
+                    ).style(size="sm")
+
+    # --- Event handlers
+
+    def on_clear_gallery():
+        return [
+            gr.Gallery.update(
+                value=[],
+                visible=False,
+            ),
+            gr.Image.update(
+                visible=True,
+            ),
+        ]
+
+    def on_select_subdir(subdir) -> list:
+        # subdir is the current value of the subdirectories dropdown
+        new_images = outputgallery_filenames(subdir)
+        new_label = (
+            f"{len(new_images)} images in {os.path.join(output_dir, subdir)}"
+        )
+        return [
+            new_images,
+            gr.Gallery.update(
+                value=new_images,
+                label=new_label,
+                visible=len(new_images) > 0,
+            ),
+            gr.Image.update(
+                label=new_label,
+                visible=len(new_images) == 0,
+            ),
+        ]
+
+    def on_refresh(current_subdir: str) -> list:
+        # get an up to date subdirectory list
+        refreshed_subdirs = output_subdirs()
+        # get the images using either the current subdirectory or the most recent valid one
+        new_subdir = (
+            current_subdir
+            if current_subdir in refreshed_subdirs
+            else refreshed_subdirs[0]
+        )
+        new_images = outputgallery_filenames(new_subdir)
+        new_label = f"{len(new_images)} images in {os.path.join(output_dir, new_subdir)}"
+
+        return [
+            gr.Dropdown.update(
+                choices=refreshed_subdirs,
+                value=new_subdir,
+            ),
+            refreshed_subdirs,
+            new_images,
+            gr.Gallery.update(
+                value=new_images, label=new_label, visible=len(new_images) > 0
+            ),
+            gr.Image.update(
+                label=new_label,
+                visible=len(new_images) == 0,
+            ),
+        ]
+
+    def on_new_image(subdir, subdir_paths, status) -> list:
+        # prevent error triggered when an image generates before the tab has even been selected
+        subdir_paths = (
+            subdir_paths
+            if len(subdir_paths) > 0
+            else [get_generated_imgs_todays_subdir()]
+        )
+
+        # only update if the current subdir is the most recent one as new images only go there
+        if subdir_paths[0] == subdir:
+            new_images = outputgallery_filenames(subdir)
+            new_label = f"{len(new_images)} images in {os.path.join(output_dir, subdir)} - {status}"
+
+            return [
+                new_images,
+                gr.Gallery.update(
+                    value=new_images,
+                    label=new_label,
+                    visible=len(new_images) > 0,
+                ),
+                gr.Image.update(
+                    label=new_label,
+                    visible=len(new_images) == 0,
+                ),
+            ]
+        else:
+            # otherwise change nothing, (only untyped gradio gr.update() does this)
+            return [gr.update(), gr.update(), gr.update()]
+
+    def on_select_image(images: list[str], evt: gr.SelectData) -> list:
+        # evt.index is an index into the full list of filenames for the current subdirectory
+        filename = images[evt.index]
+
+        # this gets the parameters in the form our dataframe is expecting (list of lists)
+        params_type, params = parameters_for_display(filename)
+
+        if params_type == "params":
+            new_parameters = params
+        elif params_type == "exif":
+            new_parameters = [
+                ["Status", "No PNG parameters found, showing EXIF metadata"]
+            ] + params
+        else:
+            new_parameters = [["Status", "No parameters found"]]
+
+        return [filename, new_parameters]
+
+    def on_outputgallery_filename_change(filename: str) -> list:
+        exists = filename != "None" and os.path.exists(filename)
+        return [
+            # disable or enable each of the sendto buttons based on whether an image is selected;
+            # one update per output wired to outputgallery_filename.change() (five buttons)
+            gr.Button.update(interactive=exists),
+            gr.Button.update(interactive=exists),
+            gr.Button.update(interactive=exists),
+            gr.Button.update(interactive=exists),
+            gr.Button.update(interactive=exists),
+        ]
+
+    # The first time our tab is selected we need to do an initial refresh to populate
+    # the subdirectory select box and the images from the most recent subdirectory.
+    #
+    # We do it at this point rather than setting this up in the controls' definitions
+    # as when you refresh the browser you always get what was *initially* set, which
+    # won't include any new subdirectories or images that might have been created since
+    # the application was started. Doing it this way means a browser refresh/reload
+    # always gets the most up to date data.
+    def on_select_tab(subdir_paths):
+        if len(subdir_paths) == 0:
+            return on_refresh("")
+        else:
+            return (
+                # Change nothing, (only untyped gr.update() does this)
+                gr.update(),
+                gr.update(),
+                gr.update(),
+                gr.update(),
+                gr.update(),
+            )
+
+    # Unfortunately as of gradio 3.22.0 gr.update against Galleries doesn't support
+    # things set with .style, nor the elem_classes kwarg so we have to directly set
+    # things up via JavaScript if we want the client to take notice of any of our
+    # changes to the number of columns after it decides to put them back to the
+    # original number when we change something
+    def js_set_columns_in_browser(timeout_length):
+        return f"""
+            (new_cols) => {{
+                setTimeout(() => {{
+                    required_style = "auto ".repeat(new_cols).trim();
+                    gallery = document.querySelector('#outputgallery_gallery .grid-container');
+                    if (gallery) {{
+                        gallery.style.gridTemplateColumns = required_style
+                    }}
+                }}, {timeout_length});
+                return [];      // prevents console error from gradio
+            }}
+        """
+
+    # --- Wire handlers up to the actions
+
+    # - Many actions reset the number of columns shown in the gallery on the browser end,
+    #   so we have to set them back to what we think they should be after the initial
+    #   action.
+    # - None of the actions on this tab trigger inference, and we want the user to be able
+    #   to do them whilst other tabs have ongoing inference running. Waiting in the queue
+    #   behind inference jobs would mean the UI can't fully respond until the inference tasks
+    #   complete, hence queue=False on all of these.
+    set_gallery_columns_immediate = dict(
+        fn=None,
+        inputs=[image_columns],
+        # gradio blanks the UI on Chrome on Linux on gallery select if I don't put an output here
+        outputs=[dev_null],
+        _js=js_set_columns_in_browser(0),
+        queue=False,
+    )
+
+    # setting columns after selecting a gallery item needs a real timeout length for the
+    # number of columns to actually be applied. Not really sure why, maybe something has
+    # to finish animating?
+    set_gallery_columns_delayed = dict(
+        set_gallery_columns_immediate, _js=js_set_columns_in_browser(250)
+    )
+
+    # clearing images when we need to completely change what's in the gallery avoids current
+    # images being shown replacing piecemeal and prevents weirdness and errors if the user
+    # selects an image during the replacement phase.
+    clear_gallery = dict(
+        fn=on_clear_gallery,
+        inputs=None,
+        outputs=[gallery, logo],
+        queue=False,
+    )
+
+    image_columns.change(**set_gallery_columns_immediate)
+
+    subdirectories.select(**clear_gallery).then(
+        on_select_subdir,
+        [subdirectories],
+        [gallery_files, gallery, logo],
+        queue=False,
+    ).then(**set_gallery_columns_immediate)
+
+    refresh.click(**clear_gallery).then(
+        on_refresh,
+        [subdirectories],
+        [subdirectories, subdirectory_paths, gallery_files, gallery, logo],
+        queue=False,
+    ).then(**set_gallery_columns_immediate)
+
+    gallery.select(
+        on_select_image,
+        [gallery_files],
+        [outputgallery_filename, image_parameters],
+        queue=False,
+    ).then(**set_gallery_columns_delayed)
+
+    outputgallery_filename.change(
+        on_outputgallery_filename_change,
+        [outputgallery_filename],
+        [
+            outputgallery_sendto_txt2img,
+            outputgallery_sendto_img2img,
+            outputgallery_sendto_inpaint,
+            outputgallery_sendto_outpaint,
+            outputgallery_sendto_upscaler,
+        ],
+        queue=False,
+    )
+
+    # We should have been given the .select function for our tab, so set it up
+    def outputgallery_tab_select(select):
+        select(
+            fn=on_select_tab,
+            inputs=[subdirectory_paths],
+            outputs=[
+                subdirectories,
+                subdirectory_paths,
+                gallery_files,
+                gallery,
+                logo,
+            ],
+            queue=False,
+        ).then(**set_gallery_columns_immediate)
+
+    # We should have been passed a list of components on other tabs that update
+    # when a new image has generated on that tab, so set things up so the user
+    # will see that new image if they are looking at today's subdirectory
+    def outputgallery_watch(components: list[gr.Textbox]):
+        for component in components:
+            component.change(
+                on_new_image,
+                inputs=[subdirectories, subdirectory_paths, component],
+                outputs=[gallery_files, gallery, logo],
+                queue=False,
+            ).then(**set_gallery_columns_immediate)
--- a/apps/stable_diffusion/web/ui/stablelm_ui.py
+++ b/apps/stable_diffusion/web/ui/stablelm_ui.py
@@ -0,0 +1,188 @@
+import gradio as gr
+import torch
+import os
+from pathlib import Path
+from transformers import (
+    AutoModelForCausalLM,
+)
+from apps.stable_diffusion.web.ui.utils import available_devices
+
+start_message = """<|SYSTEM|># StableLM Tuned (Alpha version)
+- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
+- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
+- StableLM will refuse to participate in anything that could harm a human.
+"""
+
+
+def user(message, history):
+    # Append the user's message to the conversation history
+    return "", history + [[message, ""]]
+
+
+sharkModel = 0
+sharded_model = 0
+
+
+start_message_vicuna = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
+past_key_values = None
+
+
+def chat(curr_system_message, history, model):
+    print(f"In chat for {model}")
+    global sharded_model
+    global past_key_values
+    if "vicuna" in model:
+        from apps.language_models.scripts.vicuna import (
+            tokenizer,
+            get_sharded_model,
+        )
+
+        SAMPLE_INPUT_LEN = 137
+        curr_system_message = start_message_vicuna
+        if sharded_model == 0:
+            sharded_model = get_sharded_model()
+        messages = curr_system_message + "".join(
+            [
+                "".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
+                for item in history
+            ]
+        )
+        prompt = messages.strip()
+        print("prompt = ", prompt)
+        input_ids = tokenizer(prompt).input_ids
+        new_sentence = ""
+        for _ in range(200):
+            original_input_ids = input_ids
+            input_id_len = len(input_ids)
+            pad_len = SAMPLE_INPUT_LEN - input_id_len
+            attention_mask = torch.ones([1, input_id_len], dtype=torch.int64)
+            input_ids = torch.tensor(input_ids)
+            input_ids = input_ids.reshape([1, input_id_len])
+            attention_mask = torch.nn.functional.pad(
+                torch.tensor(attention_mask),
+                (0, pad_len),
+                mode="constant",
+                value=0,
+            )
+
+            if _ == 0:
+                output = sharded_model.forward(input_ids, is_first=True)
+            else:
+                output = sharded_model.forward(
+                    input_ids, past_key_values=past_key_values, is_first=False
+                )
+            logits = output["logits"]
+            past_key_values = output["past_key_values"]
+            new_word = tokenizer.decode(torch.argmax(logits[:, -1, :], dim=1))
+            if new_word == "</s>":
+                break
+            new_sentence += " " + new_word
+            history[-1][1] = new_sentence
+            yield history
+            next_token = torch.argmax(logits[:, input_id_len - 1, :], dim=1)
+            original_input_ids.append(next_token)
+            input_ids = [next_token]
+        print(new_sentence)
+        return history
+
+    # else Model is StableLM
+    global sharkModel
+    from apps.language_models.src.pipelines.stablelm_pipeline import (
+        SharkStableLM,
+    )
+    # build the pipeline once and keep it in the module-level sharkModel for later calls
+    if sharkModel == 0:
+        # max_new_tokens=512
+        sharkModel = SharkStableLM(
+            "StableLM"
+        )  # pass elements from UI as required
+
+    # Construct the input message string for the model by concatenating the current system message and conversation history
+    if len(curr_system_message.split()) > 160:
+        print("clearing context")
+        curr_system_message = start_message
+    messages = curr_system_message + "".join(
+        [
+            "".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
+            for item in history
+        ]
+    )
+
+    generate_kwargs = dict(prompt=messages)
+
+    words_list = sharkModel.generate(**generate_kwargs)
+
+    partial_text = ""
+    for new_text in words_list:
+        # print(new_text)
+        partial_text += new_text
+        history[-1][1] = partial_text
+        # Yield an empty string to cleanup the message textbox and the updated conversation history
+        yield history
+    return words_list
+
+
+with gr.Blocks(title="Chatbot") as stablelm_chat:
+    with gr.Row():
+        model = gr.Dropdown(
+            label="Select Model",
+            value="TheBloke/vicuna-7B-1.1-HF",
+            choices=[
+                "stabilityai/stablelm-tuned-alpha-3b",
+                "TheBloke/vicuna-7B-1.1-HF",
+            ],
+        )
+        device_value = None
+        for d in available_devices:
+            if "vulkan" in d:
+                device_value = d
+                break
+
+        device = gr.Dropdown(
+            label="Device",
+            value=device_value if device_value else available_devices[0],
+            interactive=False,
+            choices=available_devices,
+        )
+    chatbot = gr.Chatbot().style(height=500)
+    with gr.Row():
+        with gr.Column():
+            msg = gr.Textbox(
+                label="Chat Message Box",
+                placeholder="Chat Message Box",
+                show_label=False,
+            ).style(container=False)
+        with gr.Column():
+            with gr.Row():
+                submit = gr.Button("Submit")
+                stop = gr.Button("Stop")
+                clear = gr.Button("Clear")
+    system_msg = gr.Textbox(
+        start_message, label="System Message", interactive=False, visible=False
+    )
+
+    submit_event = msg.submit(
+        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
+    ).then(
+        fn=chat,
+        inputs=[system_msg, chatbot, model],
+        outputs=[chatbot],
+        queue=True,
+    )
+    submit_click_event = submit.click(
+        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
+    ).then(
+        fn=chat,
+        inputs=[system_msg, chatbot, model],
+        outputs=[chatbot],
+        queue=True,
+    )
+    stop.click(
+        fn=None,
+        inputs=None,
+        outputs=None,
+        cancels=[submit_event, submit_click_event],
+        queue=False,
+    )
+    clear.click(lambda: None, None, [chatbot], queue=False)
--- a/apps/stable_diffusion/web/ui/txt2img_ui.py
+++ b/apps/stable_diffusion/web/ui/txt2img_ui.py
@@ -1,18 +1,23 @@
-from pathlib import Path
 import os
 import torch
 import time
+import sys
 import gradio as gr
 from PIL import Image
+import base64
+from io import BytesIO
+from fastapi.exceptions import HTTPException
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
    nodlogo_loc,
    get_custom_model_path,
    get_custom_model_files,
-    scheduler_list_txt2img,
+    scheduler_list,
    predefined_models,
    cancel_sd,
 )
+from apps.stable_diffusion.web.utils.png_metadata import import_png_metadata
+from apps.stable_diffusion.web.utils.common_label_calc import status_label
 from apps.stable_diffusion.src import (
    args,
    Text2ImagePipeline,
@@ -22,7 +27,10 @@ from apps.stable_diffusion.src import (
    save_output_img,
    prompt_examples,
 )
-from apps.stable_diffusion.src.utils import get_generation_text_info
+from apps.stable_diffusion.src.utils import (
+    get_generated_imgs_path,
+    get_generation_text_info,
+)

 # set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
 init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
@@ -43,6 +51,7 @@ def txt2img_inf(
    scheduler: str,
    custom_model: str,
    hf_model_id: str,
+    custom_vae: str,
    precision: str,
    device: str,
    max_length: int,
@@ -72,17 +81,23 @@ def txt2img_inf(
    # set ckpt_loc and hf_model_id.
    args.ckpt_loc = ""
    args.hf_model_id = ""
+    args.custom_vae = ""
    if custom_model == "None":
        if not hf_model_id:
            return (
                None,
                "Please provide either custom model or huggingface model ID, both must not be empty",
            )
-        args.hf_model_id = hf_model_id
+        if "civitai" in hf_model_id:
+            args.ckpt_loc = hf_model_id
+        else:
+            args.hf_model_id = hf_model_id
    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
        args.ckpt_loc = get_custom_model_pathfile(custom_model)
    else:
        args.hf_model_id = custom_model
+    if custom_vae != "None":
+        args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")

    args.save_metadata_to_json = save_metadata_to_json
    args.write_metadata_to_png = save_metadata_to_png
@@ -97,6 +112,7 @@ def txt2img_inf(
        "txt2img",
        args.hf_model_id,
        args.ckpt_loc,
+        args.custom_vae,
        precision,
        batch_size,
        max_length,
@@ -105,6 +121,7 @@ def txt2img_inf(
        device,
        use_lora=args.use_lora,
        use_stencil=None,
+        ondemand=ondemand,
    )
    if (
        not global_obj.get_sd_obj()
@@ -188,9 +205,68 @@ def txt2img_inf(
        else:
            save_output_img(out_imgs[0], img_seed)
            generated_imgs.extend(out_imgs)
-            yield generated_imgs, text_output
+            yield generated_imgs, text_output, status_label(
+                "Text-to-Image", i + 1, batch_count, batch_size
+            )

-    return generated_imgs, text_output
+    return generated_imgs, text_output, ""
+
+
+def encode_pil_to_base64(images):
+    encoded_imgs = []
+    for image in images:
+        with BytesIO() as output_bytes:
+            if args.output_img_format.lower() == "png":
+                image.save(output_bytes, format="PNG")
+
+            elif args.output_img_format.lower() in ("jpg", "jpeg"):
+                image.save(output_bytes, format="JPEG")
+            else:
+                raise HTTPException(
+                    status_code=500, detail="Invalid image format"
+                )
+            bytes_data = output_bytes.getvalue()
+            encoded_imgs.append(base64.b64encode(bytes_data))
+    return encoded_imgs
+
+
+# Text2Img Rest API.
+def txt2img_api(
+    InputData: dict,
+):
+    print(
+        f'Prompt: {InputData["prompt"]}, Negative Prompt: {InputData["negative_prompt"]}, Seed: {InputData["seed"]}'
+    )
+    res = txt2img_inf(
+        InputData["prompt"],
+        InputData["negative_prompt"],
+        InputData["height"],
+        InputData["width"],
+        InputData["steps"],
+        InputData["cfg_scale"],
+        InputData["seed"],
+        batch_count=1,
+        batch_size=1,
+        scheduler="EulerDiscrete",
+        custom_model="None",
+        hf_model_id=InputData["hf_model_id"]
+        if "hf_model_id" in InputData.keys()
+        else "stabilityai/stable-diffusion-2-1-base",
+        custom_vae="None",
+        precision="fp16",
+        device=available_devices[0],
+        max_length=64,
+        save_metadata_to_json=False,
+        save_metadata_to_png=False,
+        lora_weights="None",
+        lora_hf_id="",
+        ondemand=False,
+    )
+    return {
+        "images": encode_pil_to_base64(res[0]),
+        "parameters": {},
+        "info": res[1],
+    }


 with gr.Blocks(title="Text-to-Image") as txt2img_web:
@@ -210,25 +286,34 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                with gr.Row():
                    with gr.Column(scale=10):
                        with gr.Row():
-                            custom_model = gr.Dropdown(
+                            txt2img_custom_model = gr.Dropdown(
                                label=f"Models (Custom Model path: {get_custom_model_path()})",
                                elem_id="custom_model",
                                value=os.path.basename(args.ckpt_loc)
                                if args.ckpt_loc
-                                else "None",
+                                else "stabilityai/stable-diffusion-2-1-base",
                                choices=["None"]
                                + get_custom_model_files()
                                + predefined_models,
                            )
-                            hf_model_id = gr.Textbox(
+                            txt2img_hf_model_id = gr.Textbox(
                                elem_id="hf_model_id",
-                                placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3",
+                                placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3, https://civitai.com/api/download/models/15236",
                                value="",
-                                label="HuggingFace Model ID",
+                                label="HuggingFace Model ID or Civitai model download URL",
                                lines=3,
                            )
+                            custom_vae = gr.Dropdown(
+                                label=f"Custom Vae Models (Path: {get_custom_model_path('vae')})",
+                                elem_id="custom_model",
+                                value=os.path.basename(args.custom_vae)
+                                if args.custom_vae
+                                else "None",
+                                choices=["None"]
+                                + get_custom_model_files("vae"),
+                            )
                    with gr.Column(scale=1, min_width=170):
-                        png_info_img = gr.Image(
+                        txt2img_png_info_img = gr.Image(
                            label="Import PNG info",
                            elem_id="txt2img_prompt_image",
                            type="pil",
@@ -270,7 +355,7 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                            elem_id="scheduler",
                            label="Scheduler",
                            value=args.scheduler,
-                            choices=scheduler_list_txt2img,
+                            choices=scheduler_list,
                        )
                        with gr.Group():
                            save_metadata_to_png = gr.Checkbox(
@@ -388,19 +473,14 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                        label="Generated images",
                        show_label=False,
                        elem_id="gallery",
-                    ).style(grid=[2])
+                    ).style(columns=[2], object_fit="contain")
                    std_output = gr.Textbox(
-                        value="Nothing to show.",
+                        value=f"Images will be saved at {get_generated_imgs_path()}",
                        lines=1,
+                        elem_id="std_output",
                        show_label=False,
                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
+                    txt2img_status = gr.Textbox(visible=False)
                with gr.Row():
                    txt2img_sendto_img2img = gr.Button(value="SendTo Img2Img")
                    txt2img_sendto_inpaint = gr.Button(value="SendTo Inpaint")
@@ -424,8 +504,9 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                batch_count,
                batch_size,
                scheduler,
-                custom_model,
-                hf_model_id,
+                txt2img_custom_model,
+                txt2img_hf_model_id,
+                custom_vae,
                precision,
                device,
                max_length,
@@ -435,29 +516,30 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                lora_hf_id,
                ondemand,
            ],
-            outputs=[txt2img_gallery, std_output],
+            outputs=[txt2img_gallery, std_output, txt2img_status],
            show_progress=args.progress_bar,
        )

-        prompt_submit = prompt.submit(**kwargs)
-        neg_prompt_submit = negative_prompt.submit(**kwargs)
-        generate_click = stable_diffusion.click(**kwargs)
+        status_kwargs = dict(
+            fn=lambda bc, bs: status_label("Text-to-Image", 0, bc, bs),
+            inputs=[batch_count, batch_size],
+            outputs=txt2img_status,
+        )
+
+        prompt_submit = prompt.submit(**status_kwargs).then(**kwargs)
+        neg_prompt_submit = negative_prompt.submit(**status_kwargs).then(
+            **kwargs
+        )
+        generate_click = stable_diffusion.click(**status_kwargs).then(**kwargs)
        stop_batch.click(
            fn=cancel_sd,
            cancels=[prompt_submit, neg_prompt_submit, generate_click],
        )

-        from apps.stable_diffusion.web.utils.png_metadata import (
-            import_png_metadata,
-        )
-
-        png_info_img.change(
+        txt2img_png_info_img.change(
            fn=import_png_metadata,
            inputs=[
-                png_info_img,
-            ],
-            outputs=[
-                png_info_img,
+                txt2img_png_info_img,
                prompt,
                negative_prompt,
                steps,
@@ -466,7 +548,20 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                seed,
                width,
                height,
-                custom_model,
-                hf_model_id,
+                txt2img_custom_model,
+                txt2img_hf_model_id,
+            ],
+            outputs=[
+                txt2img_png_info_img,
+                prompt,
+                negative_prompt,
+                steps,
+                scheduler,
+                guidance_scale,
+                seed,
+                width,
+                height,
+                txt2img_custom_model,
+                txt2img_hf_model_id,
            ],
        )
--- a/apps/stable_diffusion/web/ui/upscaler_ui.py
+++ b/apps/stable_diffusion/web/ui/upscaler_ui.py
@@ -1,17 +1,309 @@
-from pathlib import Path
 import os
+import torch
+import time
 import gradio as gr
 from PIL import Image
-from apps.stable_diffusion.scripts import upscaler_inf
-from apps.stable_diffusion.src import args
+import base64
+from io import BytesIO
+from fastapi.exceptions import HTTPException
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
    nodlogo_loc,
    get_custom_model_path,
    get_custom_model_files,
-    scheduler_list,
+    scheduler_list_cpu_only,
    predefined_upscaler_models,
+    cancel_sd,
 )
+from apps.stable_diffusion.web.utils.common_label_calc import status_label
+from apps.stable_diffusion.src import (
+    args,
+    UpscalerPipeline,
+    get_schedulers,
+    set_init_device_flags,
+    utils,
+    save_output_img,
+)
+from apps.stable_diffusion.src.utils import get_generated_imgs_path
+
+# Snapshot the startup values of iree_vulkan_target_triple, use_tuned and import_mlir (re-applied when a new pipeline is built).
+init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
+init_use_tuned = args.use_tuned
+init_import_mlir = args.import_mlir
+
+
+# Exposed to UI.
+def upscaler_inf(
+    prompt: str,
+    negative_prompt: str,
+    init_image,
+    height: int,
+    width: int,
+    steps: int,
+    noise_level: int,
+    guidance_scale: float,
+    seed: int,
+    batch_count: int,
+    batch_size: int,
+    scheduler: str,
+    custom_model: str,
+    hf_model_id: str,
+    custom_vae: str,
+    precision: str,
+    device: str,
+    max_length: int,
+    save_metadata_to_json: bool,
+    save_metadata_to_png: bool,
+    lora_weights: str,
+    lora_hf_id: str,
+    ondemand: bool,
+):
+    from apps.stable_diffusion.web.ui.utils import (
+        get_custom_model_pathfile,
+        get_custom_vae_or_lora_weights,
+        Config,
+    )
+    import apps.stable_diffusion.web.utils.global_obj as global_obj
+    from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
+        SD_STATE_CANCEL,
+    )
+
+    args.prompts = [prompt]
+    args.negative_prompts = [negative_prompt]
+    args.guidance_scale = guidance_scale
+    args.seed = seed
+    args.steps = steps
+    args.scheduler = scheduler
+    args.ondemand = ondemand
+
+    if init_image is None:
+        return None, "An Initial Image is required"
+    image = init_image.convert("RGB").resize((height, width))
+
+    # set ckpt_loc, hf_model_id and custom_vae.
+    args.ckpt_loc = ""
+    args.hf_model_id = ""
+    args.custom_vae = ""
+    if custom_model == "None":
+        if not hf_model_id:
+            return (
+                None,
+                "Please provide either custom model or huggingface model ID, both must not be empty",
+            )
+        if "civitai" in hf_model_id:
+            args.ckpt_loc = hf_model_id
+        else:
+            args.hf_model_id = hf_model_id
+    elif ".ckpt" in custom_model or ".safetensors" in custom_model:
+        args.ckpt_loc = get_custom_model_pathfile(custom_model)
+    else:
+        args.hf_model_id = custom_model
+    if custom_vae != "None":
+        args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
+
+    args.save_metadata_to_json = save_metadata_to_json
+    args.write_metadata_to_png = save_metadata_to_png
+
+    args.use_lora = get_custom_vae_or_lora_weights(
+        lora_weights, lora_hf_id, "lora"
+    )
+
+    dtype = torch.float32 if precision == "fp32" else torch.half
+    cpu_scheduling = not scheduler.startswith("Shark")
+    args.height = 128
+    args.width = 128
+    new_config_obj = Config(
+        "upscaler",
+        args.hf_model_id,
+        args.ckpt_loc,
+        args.custom_vae,
+        precision,
+        batch_size,
+        max_length,
+        args.height,
+        args.width,
+        device,
+        use_lora=args.use_lora,
+        use_stencil=None,
+        ondemand=ondemand,
+    )
+    if (
+        not global_obj.get_sd_obj()
+        or global_obj.get_cfg_obj() != new_config_obj
+    ):
+        global_obj.clear_cache()
+        global_obj.set_cfg_obj(new_config_obj)
+        args.batch_size = batch_size
+        args.max_length = max_length
+        args.device = device.split("=>", 1)[1].strip()
+        args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
+        args.use_tuned = init_use_tuned
+        args.import_mlir = init_import_mlir
+        set_init_device_flags()
+        model_id = (
+            args.hf_model_id
+            if args.hf_model_id
+            else "stabilityai/stable-diffusion-2-1-base"
+        )
+        global_obj.set_schedulers(get_schedulers(model_id))
+        scheduler_obj = global_obj.get_scheduler(scheduler)
+        global_obj.set_sd_obj(
+            UpscalerPipeline.from_pretrained(
+                scheduler_obj,
+                args.import_mlir,
+                args.hf_model_id,
+                args.ckpt_loc,
+                args.custom_vae,
+                args.precision,
+                args.max_length,
+                args.batch_size,
+                args.height,
+                args.width,
+                args.use_base_vae,
+                args.use_tuned,
+                low_cpu_mem_usage=args.low_cpu_mem_usage,
+                use_lora=args.use_lora,
+                ondemand=args.ondemand,
+            )
+        )
+
+    global_obj.set_sd_scheduler(scheduler)
+    global_obj.get_sd_obj().low_res_scheduler = global_obj.get_scheduler(
+        "DDPM"
+    )
+
+    start_time = time.time()
+    global_obj.get_sd_obj().log = ""
+    generated_imgs = []
+    seeds = []
+    img_seed = utils.sanitize_seed(seed)
+    extra_info = {"NOISE LEVEL": noise_level}
+    for current_batch in range(batch_count):
+        if current_batch > 0:
+            img_seed = utils.sanitize_seed(-1)
+        low_res_img = image
+        high_res_img = Image.new("RGB", (height * 4, width * 4))
+
+        for i in range(0, width, 128):
+            for j in range(0, height, 128):
+                box = (j, i, j + 128, i + 128)
+                upscaled_image = global_obj.get_sd_obj().generate_images(
+                    prompt,
+                    negative_prompt,
+                    low_res_img.crop(box),
+                    batch_size,
+                    args.height,
+                    args.width,
+                    steps,
+                    noise_level,
+                    guidance_scale,
+                    img_seed,
+                    args.max_length,
+                    dtype,
+                    args.use_base_vae,
+                    cpu_scheduling,
+                )
+                if global_obj.get_sd_status() == SD_STATE_CANCEL:
+                    break
+                else:
+                    high_res_img.paste(upscaled_image[0], (j * 4, i * 4))
+
+            if global_obj.get_sd_status() == SD_STATE_CANCEL:
+                break
+
+        if global_obj.get_sd_status() == SD_STATE_CANCEL:
+            break
+        else:
+            save_output_img(high_res_img, img_seed, extra_info)
+            generated_imgs.append(high_res_img)
+            seeds.append(img_seed)
+            global_obj.get_sd_obj().log += "\n"
+            yield generated_imgs, global_obj.get_sd_obj().log, status_label(
+                "Upscaler", current_batch + 1, batch_count, batch_size
+            )
+
+    total_time = time.time() - start_time
+    text_output = f"prompt={args.prompts}"
+    text_output += f"\nnegative prompt={args.negative_prompts}"
+    text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
+    text_output += f"\nscheduler={args.scheduler}, device={device}"
+    text_output += f"\nsteps={steps}, noise_level={noise_level}, guidance_scale={guidance_scale}, seed={seeds}"
+    text_output += f"\nsize={height}x{width}, batch_count={batch_count}, batch_size={batch_size}, max_length={args.max_length}"
+    text_output += global_obj.get_sd_obj().log
+    text_output += f"\nTotal image generation time: {total_time:.4f}sec"
+
+    yield generated_imgs, text_output, ""
+
+
+def decode_base64_to_image(encoding):
+    if encoding.startswith("data:image/"):
+        encoding = encoding.split(";", 1)[1].split(",", 1)[1]
+    try:
+        image = Image.open(BytesIO(base64.b64decode(encoding)))
+        return image
+    except Exception as err:
+        print(err)
+        raise HTTPException(status_code=500, detail="Invalid encoded image")
+
+
+def encode_pil_to_base64(images):
+    encoded_imgs = []
+    for image in images:
+        with BytesIO() as output_bytes:
+            if args.output_img_format.lower() == "png":
+                image.save(output_bytes, format="PNG")
+
+            elif args.output_img_format.lower() in ("jpg", "jpeg"):
+                image.save(output_bytes, format="JPEG")
+            else:
+                raise HTTPException(
+                    status_code=500, detail="Invalid image format"
+                )
+            bytes_data = output_bytes.getvalue()
+            encoded_imgs.append(base64.b64encode(bytes_data))
+    return encoded_imgs
+
+
+# Upscaler Rest API.
+def upscaler_api(
+    InputData: dict,
+):
+    print(
+        f'Prompt: {InputData["prompt"]}, Negative Prompt: {InputData["negative_prompt"]}, Seed: {InputData["seed"]}'
+    )
+    init_image = decode_base64_to_image(InputData["init_images"][0])
+    res = upscaler_inf(
+        InputData["prompt"],
+        InputData["negative_prompt"],
+        init_image,
+        InputData["height"],
+        InputData["width"],
+        InputData["steps"],
+        InputData["noise_level"],
+        InputData["cfg_scale"],
+        InputData["seed"],
+        batch_count=1,
+        batch_size=1,
+        scheduler="EulerDiscrete",
+        custom_model="None",
+        hf_model_id=InputData["hf_model_id"]
+        if "hf_model_id" in InputData.keys()
+        else "stabilityai/stable-diffusion-2-1-base",
+        custom_vae="None",
+        precision="fp16",
+        device=available_devices[0],
+        max_length=64,
+        save_metadata_to_json=False,
+        save_metadata_to_png=False,
+        lora_weights="None",
+        lora_hf_id="",
+        ondemand=False,
+    )
+    return {
+        "images": encode_pil_to_base64(res[0]),
+        "parameters": {},
+        "info": res[1],
+    }


 with gr.Blocks(title="Upscaler") as upscaler_web:
@@ -29,23 +321,33 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
        with gr.Row():
            with gr.Column(scale=1, min_width=600):
                with gr.Row():
-                    custom_model = gr.Dropdown(
+                    upscaler_custom_model = gr.Dropdown(
                        label=f"Models (Custom Model path: {get_custom_model_path()})",
                        elem_id="custom_model",
                        value=os.path.basename(args.ckpt_loc)
                        if args.ckpt_loc
-                        else "None",
+                        else "stabilityai/stable-diffusion-x4-upscaler",
                        choices=["None"]
-                        + get_custom_model_files()
+                        + get_custom_model_files(
+                            custom_checkpoint_type="upscaler"
+                        )
                        + predefined_upscaler_models,
                    )
-                    hf_model_id = gr.Textbox(
+                    upscaler_hf_model_id = gr.Textbox(
                        elem_id="hf_model_id",
-                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3",
+                        placeholder="Select 'None' in the Models dropdown on the left and enter model ID here e.g: SG161222/Realistic_Vision_V1.3, https://civitai.com/api/download/models/15236",
                        value="",
-                        label="HuggingFace Model ID",
+                        label="HuggingFace Model ID or Civitai model download URL",
                        lines=3,
                    )
+                    custom_vae = gr.Dropdown(
+                        label=f"Custom Vae Models (Path: {get_custom_model_path('vae')})",
+                        elem_id="custom_model",
+                        value=os.path.basename(args.custom_vae)
+                        if args.custom_vae
+                        else "None",
+                        choices=["None"] + get_custom_model_files("vae"),
+                    )

                with gr.Group(elem_id="prompt_box_outer"):
                    prompt = gr.Textbox(
@@ -86,7 +388,7 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
                            elem_id="scheduler",
                            label="Scheduler",
                            value="DDIM",
-                            choices=scheduler_list,
+                            choices=scheduler_list_cpu_only,
                        )
                        with gr.Group():
                            save_metadata_to_png = gr.Checkbox(
@@ -204,19 +506,15 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
                        label="Generated images",
                        show_label=False,
                        elem_id="gallery",
-                    ).style(grid=[2])
+                    ).style(columns=[2], object_fit="contain")
                    std_output = gr.Textbox(
-                        value="Nothing to show.",
+                        value=f"Images will be saved at {get_generated_imgs_path()}",
                        lines=1,
+                        elem_id="std_output",
                        show_label=False,
                    )
-                output_dir = args.output_dir if args.output_dir else Path.cwd()
-                output_dir = Path(output_dir, "generated_imgs")
-                output_loc = gr.Textbox(
-                    label="Saving Images at",
-                    value=output_dir,
-                    interactive=False,
-                )
+                    upscaler_status = gr.Textbox(visible=False)
+
                with gr.Row():
                    upscaler_sendto_img2img = gr.Button(value="SendTo Img2Img")
                    upscaler_sendto_inpaint = gr.Button(value="SendTo Inpaint")
@@ -239,8 +537,9 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
                batch_count,
                batch_size,
                scheduler,
-                custom_model,
-                hf_model_id,
+                upscaler_custom_model,
+                upscaler_hf_model_id,
+                custom_vae,
                precision,
                device,
                max_length,
@@ -250,13 +549,21 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
                lora_hf_id,
                ondemand,
            ],
-            outputs=[upscaler_gallery, std_output],
+            outputs=[upscaler_gallery, std_output, upscaler_status],
            show_progress=args.progress_bar,
        )
-
-        prompt_submit = prompt.submit(**kwargs)
-        neg_prompt_submit = negative_prompt.submit(**kwargs)
-        generate_click = stable_diffusion.click(**kwargs)
-        stop_batch.click(
-            fn=None, cancels=[prompt_submit, neg_prompt_submit, generate_click]
+        status_kwargs = dict(
+            fn=lambda bc, bs: status_label("Upscaler", 0, bc, bs),
+            inputs=[batch_count, batch_size],
+            outputs=upscaler_status,
+        )
+
+        prompt_submit = prompt.submit(**status_kwargs).then(**kwargs)
+        neg_prompt_submit = negative_prompt.submit(**status_kwargs).then(
+            **kwargs
+        )
+        generate_click = stable_diffusion.click(**status_kwargs).then(**kwargs)
+        stop_batch.click(
+            fn=cancel_sd,
+            cancels=[prompt_submit, neg_prompt_submit, generate_click],
        )
--- a/apps/stable_diffusion/web/ui/utils.py
+++ b/apps/stable_diffusion/web/ui/utils.py
@@ -16,6 +16,7 @@ class Config:
    mode: str
    model_id: str
    ckpt_loc: str
+    custom_vae: str
    precision: str
    batch_size: int
    max_length: int
@@ -24,6 +25,7 @@ class Config:
    device: str
    use_lora: str
    use_stencil: str
+    ondemand: str


 custom_model_filetypes = (
@@ -31,13 +33,7 @@ custom_model_filetypes = (
    "*.safetensors",
 )  # the tuple of file types

-scheduler_list = [
-    "DDIM",
-    "PNDM",
-    "DPMSolverMultistep",
-    "EulerAncestralDiscrete",
-]
-scheduler_list_txt2img = [
+scheduler_list_cpu_only = [
    "DDIM",
    "PNDM",
    "LMSDiscrete",
@@ -45,6 +41,8 @@ scheduler_list_txt2img = [
    "DPMSolverMultistep",
    "EulerDiscrete",
    "EulerAncestralDiscrete",
+]
+scheduler_list = scheduler_list_cpu_only + [
    "SharkEulerDiscrete",
 ]

@@ -74,30 +72,36 @@ def resource_path(relative_path):
    return os.path.join(base_path, relative_path)


+def create_custom_models_folders():
+    dir = ["vae", "lora"]
+    if not args.ckpt_dir:
+        dir.insert(0, "models")
+    else:
+        if not os.path.isdir(args.ckpt_dir):
+            sys.exit(
+                f"Invalid --ckpt_dir argument, {args.ckpt_dir} folder does not exists."
+            )
+    for root in dir:
+        get_custom_model_path(root).mkdir(parents=True, exist_ok=True)
+
+
 def get_custom_model_path(model="models"):
-    # If `--ckpt_dir` is provided it'd override the heirarchical folder
    # structure in WebUI :-
-    #       model
+    #       models or args.ckpt_dir
    #         |___lora
    #         |___vae
+    sub_folder = "" if model == "models" else model
    if args.ckpt_dir:
-        return Path(args.ckpt_dir)
-    match model:
-        case "models":
-            return Path(Path.cwd(), "models")
-        case "vae":
-            return Path(Path.cwd(), "models/vae")
-        case "lora":
-            return Path(Path.cwd(), "models/lora")
-        case _:
-            return ""
+        return Path(Path(args.ckpt_dir), sub_folder)
+    else:
+        return Path(Path.cwd(), "models/" + sub_folder)


 def get_custom_model_pathfile(custom_model_name, model="models"):
    return os.path.join(get_custom_model_path(model), custom_model_name)


-def get_custom_model_files(model="models"):
+def get_custom_model_files(model="models", custom_checkpoint_type=""):
    ckpt_files = []
    file_types = custom_model_filetypes
    if model == "lora":
@@ -109,6 +113,28 @@ def get_custom_model_files(model="models"):
                os.path.join(get_custom_model_path(model), extn)
            )
        ]
+        match custom_checkpoint_type:
+            case "inpainting":
+                files = [
+                    val
+                    for val in files
+                    if val.endswith("inpainting" + extn.removeprefix("*"))
+                ]
+            case "upscaler":
+                files = [
+                    val
+                    for val in files
+                    if val.endswith("upscaler" + extn.removeprefix("*"))
+                ]
+            case _:
+                files = [
+                    val
+                    for val in files
+                    if not (
+                        val.endswith("inpainting" + extn.removeprefix("*"))
+                        or val.endswith("upscaler" + extn.removeprefix("*"))
+                    )
+                ]
        ckpt_files.extend(files)
    return sorted(ckpt_files, key=str.casefold)

--- a/apps/stable_diffusion/web/utils/common_label_calc.py
+++ b/apps/stable_diffusion/web/utils/common_label_calc.py
@@ -0,0 +1,9 @@
+# functions for generating labels used in common by tabs across the UI
+
+
+def status_label(tab_name, batch_index=0, batch_count=1, batch_size=1):
+    if batch_index < batch_count:
+        bs = f"x{batch_size}" if batch_size > 1 else ""
+        return f"{tab_name} generating {batch_index+1}/{batch_count}{bs}"
+    else:
+        return f"{tab_name} complete"
--- a/apps/stable_diffusion/web/utils/exif_metadata.py
+++ b/apps/stable_diffusion/web/utils/exif_metadata.py
@@ -0,0 +1,48 @@
+from PIL import Image
+from PIL.ExifTags import Base as EXIFKeys, TAGS, IFD, GPSTAGS
+
+
+def parse_exif(pil_image: Image) -> dict:
+    img_exif = pil_image.getexif()
+
+    # See this stackoverflow answer for where most of this comes from: https://stackoverflow.com/a/75357594
+    # I did try to use the exif library but it broke just as much as my initial attempt at this (albeit
+    # I was probably using it wrong) so I reverted to using PIL with more filtering and saved a
+    # dependency
+    exif_tags = {
+        TAGS.get(key, key): str(val)
+        for (key, val) in img_exif.items()
+        if key in TAGS
+        and key not in (EXIFKeys.ExifOffset, EXIFKeys.GPSInfo)
+        and val
+        and (not isinstance(val, bytes))
+        and (not str(val).isspace())
+    }
+
+    def try_get_ifd(ifd_id):
+        try:
+            return img_exif.get_ifd(ifd_id).items()
+        except KeyError:
+            return {}
+
+    ifd_tags = {
+        TAGS.get(key, key): str(val)
+        for ifd_id in IFD
+        for (key, val) in try_get_ifd(ifd_id)
+        if ifd_id != IFD.GPSInfo
+        and key in TAGS
+        and val
+        and (not isinstance(val, bytes))
+        and (not str(val).isspace())
+    }
+
+    gps_tags = {
+        GPSTAGS.get(key, key): str(val)
+        for (key, val) in try_get_ifd(IFD.GPSInfo)
+        if key in GPSTAGS
+        and val
+        and (not isinstance(val, bytes))
+        and (not str(val).isspace())
+    }
+
+    return {**exif_tags, **ifd_tags, **gps_tags}
--- a/apps/stable_diffusion/web/utils/global_obj.py
+++ b/apps/stable_diffusion/web/utils/global_obj.py
@@ -43,18 +43,22 @@ def set_schedulers(value):


 def get_sd_obj():
+    global _sd_obj
    return _sd_obj


 def get_sd_status():
+    global _sd_obj
    return _sd_obj.status


 def get_cfg_obj():
+    global _config_obj
    return _config_obj


 def get_scheduler(key):
+    global _schedulers
    return _schedulers[key]


--- a/apps/stable_diffusion/web/utils/png_metadata.py
+++ b/apps/stable_diffusion/web/utils/png_metadata.py
@@ -1,21 +1,8 @@
 import re
 from pathlib import Path
-from apps.stable_diffusion.web.ui.txt2img_ui import (
-    png_info_img,
-    prompt,
-    negative_prompt,
-    steps,
-    scheduler,
-    guidance_scale,
-    seed,
-    width,
-    height,
-    custom_model,
-    hf_model_id,
-)
 from apps.stable_diffusion.web.ui.utils import (
    get_custom_model_pathfile,
-    scheduler_list_txt2img,
+    scheduler_list,
    predefined_models,
 )

@@ -75,7 +62,19 @@ def parse_generation_parameters(x: str):
    return res


-def import_png_metadata(pil_data):
+def import_png_metadata(
+    pil_data,
+    prompt,
+    negative_prompt,
+    steps,
+    sampler,
+    cfg_scale,
+    seed,
+    width,
+    height,
+    custom_model,
+    hf_model_id,
+):
    try:
        png_info = pil_data.info["parameters"]
        metadata = parse_generation_parameters(png_info)
@@ -110,39 +109,44 @@ def import_png_metadata(pil_data):
                    % metadata["Model"]
                )

-        outputs = {
-            png_info_img: None,
-            negative_prompt: metadata["Negative prompt"],
-            steps: int(metadata["Steps"]),
-            guidance_scale: float(metadata["CFG scale"]),
-            seed: int(metadata["Seed"]),
-            width: float(metadata["Size-1"]),
-            height: float(metadata["Size-2"]),
-        }
+        negative_prompt = metadata["Negative prompt"]
+        steps = int(metadata["Steps"])
+        cfg_scale = float(metadata["CFG scale"])
+        seed = int(metadata["Seed"])
+        width = float(metadata["Size-1"])
+        height = float(metadata["Size-2"])
        if "Model" in metadata and png_custom_model:
-            outputs[custom_model] = png_custom_model
-            outputs[hf_model_id] = ""
+            custom_model = png_custom_model
+            hf_model_id = ""
        if "Model" in metadata and png_hf_model_id:
-            outputs[custom_model] = "None"
-            outputs[hf_model_id] = png_hf_model_id
+            custom_model = "None"
+            hf_model_id = png_hf_model_id
        if "Prompt" in metadata:
-            outputs[prompt] = metadata["Prompt"]
+            prompt = metadata["Prompt"]
        if "Sampler" in metadata:
-            if metadata["Sampler"] in scheduler_list_txt2img:
-                outputs[scheduler] = metadata["Sampler"]
+            if metadata["Sampler"] in scheduler_list:
+                sampler = metadata["Sampler"]
            else:
                print(
                    "Import PNG info: Unable to find a scheduler for %s"
                    % metadata["Sampler"]
                )

-        return outputs
-
    except Exception as ex:
        if pil_data and pil_data.info.get("parameters"):
            print("import_png_metadata failed with %s" % ex)
        pass

-    return {
-        png_info_img: None,
-    }
+    return (
+        None,
+        prompt,
+        negative_prompt,
+        steps,
+        sampler,
+        cfg_scale,
+        seed,
+        width,
+        height,
+        custom_model,
+        hf_model_id,
+    )
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -40,7 +40,7 @@ cmake --build build/
 *Prepare the model*
 ```bash
 wget https://storage.googleapis.com/shark_tank/latest/resnet50_tf/resnet50_tf.mlir
-iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvmcpu-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
+iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvmcpu-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
 ```
 *Prepare the input*

@@ -65,18 +65,18 @@ A tool for benchmarking other models is built and can be invoked with a command
 see `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation on the function input. For example, stable diffusion unet can be tested with the following commands:
 ```bash
 wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
-iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
+iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
 ./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
 ```
 VAE and Autoencoder are also available
 ```bash
 # VAE
 wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
-iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
+iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
 ./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x4x64x64xf32

 # CLIP Autoencoder
 wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
-iree-compile --iree-input-type=mhlo --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
+iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
 ./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x77xi32 --function_input=1x77xi32
 ```
--- a/cpp/vision_inference/CMakeLists.txt
+++ b/cpp/vision_inference/CMakeLists.txt
@@ -21,7 +21,7 @@ endif()
 # Compile mnist.mlir to mnist.vmfb.
 set(_COMPILE_TOOL_EXECUTABLE $<TARGET_FILE:iree-compile>)
 set(_COMPILE_ARGS)
-list(APPEND _COMPILE_ARGS "--iree-input-type=mhlo")
+list(APPEND _COMPILE_ARGS "--iree-input-type=auto")
 list(APPEND _COMPILE_ARGS "--iree-hal-target-backends=llvm-cpu")
 list(APPEND _COMPILE_ARGS "${IREE_SOURCE_DIR}/samples/models/mnist.mlir")
 list(APPEND _COMPILE_ARGS "-o")
--- a/docs/shark_sd_blender.md
+++ b/docs/shark_sd_blender.md
@@ -0,0 +1,75 @@
+# Overview
+
+This document is intended to provide a starting point for using SHARK stable diffusion with Blender. 
+
+We currently make use of the [AI-Render Plugin](https://github.com/benrugg/AI-Render) to integrate with Blender.
+
+## Setup SHARK and prerequisites:
+
+ * Download the latest SHARK SD webui .exe from [here](https://github.com/nod-ai/SHARK/releases) or follow instructions on the [README](https://github.com/nod-ai/SHARK#readme)
+ * Once you have the .exe where you would like SHARK to install, run the .exe from terminal/PowerShell with the `--api` flag:
+```
+## Run the .exe in API mode:
+.\shark_sd_<date>_<ver>.exe --api
+
+## For example:
+.\shark_sd_20230411_671.exe --api --server_port=8082
+
+## From the base directory of a source clone of SHARK:
+./setup_venv.ps1
+python apps\stable_diffusion\web\index.py --api
+
+```
+
+Your local SD server should start and look something like this:
+![image](https://user-images.githubusercontent.com/87458719/231369758-e2c3c45a-eccc-4fe5-a788-4a3bf1ace1d1.png)
+
+ * Note: When running in api mode with `--api`, the .exe will not function as a webUI. Thus, the address in the terminal output will only be useful for API requests.
+
+### Install AI Render
+
+- Get AI Render on [Blender Market](https://blendermarket.com/products/ai-render) or [Gumroad](https://airender.gumroad.com/l/ai-render)
+- Open Blender, then go to Edit > Preferences > Add-ons > Install and then find the zip file
+- We will be using the Automatic1111 SD backend for the AI-Render plugin. Follow the instructions [here](https://github.com/benrugg/AI-Render/wiki/Local-Installation) to set up the local SD backend.
+
+Your AI-Render preferences should be configured as shown; the highlighted part should match your terminal output:
+![image](https://user-images.githubusercontent.com/87458719/231390322-59a54a09-520a-4a08-b658-6e37bd63e932.png)
+
+
+The [AI-Render README](https://github.com/benrugg/AI-Render/blob/main/README.md) has more details on installation and usage, as well as video tutorials.
+
+## Using AI-Render + SHARK in your Blender project
+
+- In the Render Properties tab, in the AI-Render dropdown, enable AI-Render.
+
+![image](https://user-images.githubusercontent.com/87458719/231392843-9bd51744-3ce2-464e-843a-0c4d4c96df0c.png)
+
+- Select an image size (it's usually better to upscale later than go high on the img2img resolution here.)
+
+![image](https://user-images.githubusercontent.com/87458719/231394288-0c4ab8c5-dc30-4dbe-8bc1-7520ded5efe8.png)
+
+- From here, you can enter a prompt and configure img2img Stable Diffusion parameters, and AI-Render will run SHARK SD img2img on the rendered scene.
+- AI-Render has useful presets for aesthetic styles, so you should be able to keep your subject prompt simple and focus on creating a decent Blender scene to start from.
+
+![image](https://user-images.githubusercontent.com/87458719/231440729-2fe69586-41cb-4274-9ce7-f6c08def600b.png)
+
+## Examples:
+Scene (Input image):
+
+![blender-sample-2](https://user-images.githubusercontent.com/87458719/231450408-0e680086-3e52-4962-a5c1-c703a94d1583.png)
+
+Prompt:
+"A bowl of tangerines in front of rocks, masterpiece, oil on canvas, by Georgia O'Keefe, trending on artstation, landscape painting by Caspar David Friedrich"
+
+Negative Prompt (default):
+"ugly, bad art, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, disfigured, deformed, body out of frame, blurry, bad anatomy, blurred, watermark, grainy, tiling, signature, cut off, draft"
+
+Example output:
+
+![blender-sample-2_out](https://user-images.githubusercontent.com/87458719/231451145-a0b56897-a7d0-4add-bbed-7e8af21a65df.png)
+
+
+
+
+
+
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,13 +19,16 @@ transformers
 diffusers @ git+https://github.com/huggingface/diffusers@e47459c80f6f6a5a1c19d32c3fd74edf94f47aa2
 scipy
 ftfy
-gradio
+gradio==3.22.0
 altair
 omegaconf
 safetensors
 opencv-python
 scikit-image
 pytorch_lightning # for runwayml models
+tk
+pywebview
+sentencepiece

 # Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
 pefile
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -2,9 +2,10 @@
 # Sets up a venv suitable for running samples.
 # e.g:
 # ./setup_venv.sh  #setup a default $PYTHON3 shark.venv
-# Environment Variables by the script.
+# Environment variables used by the script.
 # PYTHON=$PYTHON3.10 ./setup_venv.sh  #pass a version of $PYTHON to use
 # VENV_DIR=myshark.venv #create a venv called myshark.venv
+# SKIP_VENV=1 #Don't create and activate a Python venv. Use the current environment. 
 # USE_IREE=1 #use stock IREE instead of Nod.ai's SHARK build
 # IMPORTER=1 #Install importer deps
 # BENCHMARK=1 #Install benchmark deps
@@ -26,15 +27,17 @@ PYTHON_VERSION_X_Y=`${PYTHON} -c 'import sys; version=sys.version_info[:2]; prin
 echo "Python: $PYTHON"
 echo "Python version: $PYTHON_VERSION_X_Y"

-if [[ -z "${CONDA_PREFIX}" ]]; then
-  # Not a conda env. So create a new VENV dir
-  VENV_DIR=${VENV_DIR:-shark.venv}
-  echo "Using pip venv.. Setting up venv dir: $VENV_DIR"
-  $PYTHON -m venv "$VENV_DIR" || die "Could not create venv."
-  source "$VENV_DIR/bin/activate" || die "Could not activate venv"
-  PYTHON="$(which python3)"
-else
-  echo "Found conda env $CONDA_DEFAULT_ENV. Running pip install inside the conda env"
+if [[ "$SKIP_VENV" != "1" ]]; then
+  if [[ -z "${CONDA_PREFIX}" ]]; then
+    # Not a conda env. So create a new VENV dir
+    VENV_DIR=${VENV_DIR:-shark.venv}
+    echo "Using pip venv.. Setting up venv dir: $VENV_DIR"
+    $PYTHON -m venv "$VENV_DIR" || die "Could not create venv."
+    source "$VENV_DIR/bin/activate" || die "Could not activate venv"
+    PYTHON="$(which python3)"
+  else
+    echo "Found conda env $CONDA_DEFAULT_ENV. Running pip install inside the conda env"
+  fi
 fi

 Red=`tput setaf 1`
@@ -147,8 +150,7 @@ if [[ ! -z "${ONNX}" ]]; then
  fi
 fi

-if [[ -z "${CONDA_PREFIX}" ]]; then
+if [[ -z "${CONDA_PREFIX}" && "$SKIP_VENV" != "1" ]]; then
  echo "${Green}Before running examples activate venv with:"
  echo "  ${Green}source $VENV_DIR/bin/activate"
 fi
-
--- a/shark/examples/shark_inference/minilm_jax.py
+++ b/shark/examples/shark_inference/minilm_jax.py
@@ -0,0 +1,73 @@
+from transformers import AutoTokenizer, FlaxAutoModel
+import torch
+import jax
+from typing import Union, Dict, List, Any
+import numpy as np
+from shark.shark_inference import SharkInference
+import io
+
+NumpyTree = Union[np.ndarray, Dict[str, np.ndarray], List[np.ndarray]]
+
+
+def convert_torch_tensor_tree_to_numpy(
+    tree: Union[torch.Tensor, Dict[str, torch.Tensor], List[torch.Tensor]]
+) -> NumpyTree:  # maps each tensor leaf to a NumPy array
+    return jax.tree_util.tree_map(
+        lambda torch_tensor: torch_tensor.cpu().detach().numpy(), tree
+    )
+
+
+def convert_int64_to_int32(tree: NumpyTree) -> NumpyTree:
+    return jax.tree_util.tree_map(
+        lambda tensor: np.array(tensor, dtype=np.int32)
+        if tensor.dtype == np.int64
+        else tensor,
+        tree,
+    )
+
+
+def get_sample_input():
+    tokenizer = AutoTokenizer.from_pretrained(
+        "microsoft/MiniLM-L12-H384-uncased"
+    )
+    inputs_torch = tokenizer("Hello, World!", return_tensors="pt")
+    return convert_int64_to_int32(
+        convert_torch_tensor_tree_to_numpy(inputs_torch.data)
+    )
+
+
+def get_jax_model():
+    return FlaxAutoModel.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
+
+
+def export_jax_to_mlir(jax_model: Any, sample_input: NumpyTree):
+    model_mlir = jax.jit(jax_model).lower(**sample_input).compiler_ir()
+    byte_stream = io.BytesIO()
+    model_mlir.operation.write_bytecode(file=byte_stream)
+    return byte_stream.getvalue()
+
+
+def assert_array_list_allclose(x, y, *args, **kwargs):
+    assert len(x) == len(y)
+    for a, b in zip(x, y):
+        np.testing.assert_allclose(
+            np.asarray(a), np.asarray(b), *args, **kwargs
+        )
+
+
+sample_input = get_sample_input()
+jax_model = get_jax_model()
+mlir = export_jax_to_mlir(jax_model, sample_input)
+
+# Compile and load module.
+shark_inference = SharkInference(mlir_module=mlir, mlir_dialect="mhlo")
+shark_inference.compile()
+
+# Run main function.
+result = shark_inference("main", jax.tree_util.tree_flatten(sample_input)[0])
+
+# Run JAX model.
+reference_result = jax.tree_util.tree_flatten(jax_model(**sample_input))[0]
+
+# Verify result.
+assert_array_list_allclose(result, reference_result, atol=1e-5)
--- a/shark/examples/shark_inference/minilm_jax_requirements.txt
+++ b/shark/examples/shark_inference/minilm_jax_requirements.txt
@@ -0,0 +1,6 @@
+flax
+jax[cpu]
+nodai-SHARK
+orbax
+transformers
+torch
--- a/shark/iree_utils/_common.py
+++ b/shark/iree_utils/_common.py
@@ -45,10 +45,15 @@ def run_cmd(cmd, debug=False):

 def iree_device_map(device):
    uri_parts = device.split("://", 2)
+    iree_driver = (
+        _IREE_DEVICE_MAP[uri_parts[0]]
+        if uri_parts[0] in _IREE_DEVICE_MAP
+        else uri_parts[0]
+    )
    if len(uri_parts) == 1:
-        return _IREE_DEVICE_MAP[uri_parts[0]]
+        return iree_driver
    else:
-        return f"{_IREE_DEVICE_MAP[uri_parts[0]]}://{uri_parts[1]}"
+        return f"{iree_driver}://{uri_parts[1]}"


 def get_supported_device_list():
@@ -68,7 +73,7 @@ _IREE_DEVICE_MAP = {
 def iree_target_map(device):
    if "://" in device:
        device = device.split("://")[0]
-    return _IREE_TARGET_MAP[device]
+    return _IREE_TARGET_MAP[device] if device in _IREE_TARGET_MAP else device


 _IREE_TARGET_MAP = {
@@ -110,10 +115,8 @@ def check_device_drivers(device):
            subprocess.check_output("rocminfo")
        except Exception:
            return True
-    # Unknown device.
-    else:
-        return True

+    # Unknown device. We assume drivers are installed.
    return False


--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -23,6 +23,7 @@ import re

 # Get the iree-compile arguments given device.
 def get_iree_device_args(device, extra_args=[]):
+    print("Configuring for device:" + device)
    device_uri = device.split("://")
    if len(device_uri) > 1:
        if device_uri[0] not in ["vulkan"]:
@@ -30,6 +31,9 @@ def get_iree_device_args(device, extra_args=[]):
                f"Specific device selection only supported for vulkan now."
                f"Proceeding with {device} as device."
            )
+        device_num = device_uri[1]
+    else:
+        device_num = 0

    if device_uri[0] == "cpu":
        from shark.iree_utils.cpu_utils import get_iree_cpu_args
@@ -42,7 +46,9 @@ def get_iree_device_args(device, extra_args=[]):
    if device_uri[0] in ["metal", "vulkan"]:
        from shark.iree_utils.vulkan_utils import get_iree_vulkan_args

-        return get_iree_vulkan_args(extra_args=extra_args)
+        return get_iree_vulkan_args(
+            device_num=device_num, extra_args=extra_args
+        )
    if device_uri[0] == "rocm":
        from shark.iree_utils.gpu_utils import get_iree_rocm_args

@@ -54,7 +60,7 @@ def get_iree_device_args(device, extra_args=[]):
 def get_iree_frontend_args(frontend):
    if frontend in ["torch", "pytorch", "linalg", "tm_tensor"]:
        return ["--iree-llvmcpu-target-cpu-features=host"]
-    elif frontend in ["tensorflow", "tf", "mhlo"]:
+    elif frontend in ["tensorflow", "tf", "mhlo", "stablehlo"]:
        return [
            "--iree-llvmcpu-target-cpu-features=host",
            "--iree-mhlo-demote-i64-to-i32=false",
@@ -259,8 +265,8 @@ def compile_module_to_flatbuffer(
    args += extra_args

    if frontend in ["tensorflow", "tf"]:
-        input_type = "mhlo"
-    elif frontend in ["mhlo", "tosa"]:
+        input_type = "auto"
+    elif frontend in ["stablehlo", "tosa"]:
        input_type = frontend
    elif frontend in ["tflite", "tflite-tosa"]:
        input_type = "tosa"
@@ -307,7 +313,7 @@ def get_iree_module(flatbuffer_blob, device, device_idx=None):
    )
    ctx = ireert.SystemContext(config=config)
    ctx.add_vm_module(vm_module)
-    ModuleCompiled = ctx.modules.module
+    ModuleCompiled = getattr(ctx.modules, vm_module.name)
    return ModuleCompiled, config


@@ -361,7 +367,7 @@ def export_iree_module_to_vmfb(
 def export_module_to_mlir_file(module, frontend, directory: str):
    # TODO: write proper documentation.
    mlir_str = module
-    if frontend in ["tensorflow", "tf", "mhlo", "tflite"]:
+    if frontend in ["tensorflow", "tf", "mhlo", "stablehlo", "tflite"]:
        mlir_str = module.decode("utf-8")
    elif frontend in ["pytorch", "torch"]:
        mlir_str = module.operation.get_asm()
--- a/shark/iree_utils/vulkan_target_env_utils.py
+++ b/shark/iree_utils/vulkan_target_env_utils.py
@@ -117,7 +117,8 @@ def get_extensions(triple):

    if get_vendor(triple) == "NVIDIA" or arch == "rdna3":
        ext.append("VK_NV_cooperative_matrix")
-
+    if get_vendor(triple) in ("NVIDIA", "AMD", "Intel"):  # membership test
+        ext.append("VK_KHR_shader_integer_dot_product")
    return make_ext_list(ext_list=ext)


@@ -133,7 +134,7 @@ def get_vendor(triple):
        return "Apple"
    if arch in ["arc", "UHD"]:
        return "Intel"
-    if arch in ["turing", "ampere"]:
+    if arch in ["turing", "ampere", "pascal"]:
        return "NVIDIA"
    if arch == "ardeno":
        return "Qualcomm"
@@ -151,7 +152,7 @@ def get_device_type(triple):
        return "Unknown"
    if arch == "cpu":
        return "CPU"
-    if arch in ["turing", "ampere", "arc"]:
+    if arch in ["turing", "ampere", "arc", "pascal"]:
        return "DiscreteGPU"
    if arch in ["rdna1", "rdna2", "rdna3", "rgcn3", "rgcn5"]:
        if product == "ivega10":
@@ -228,6 +229,7 @@ def get_vulkan_target_capabilities(triple):
        cap["shaderInt8"] = True
        cap["shaderInt16"] = True
        cap["shaderInt64"] = True
+        cap["shaderIntegerDotProduct"] = True
        cap["storageBuffer16BitAccess"] = True
        cap["storagePushConstant16"] = True
        cap["uniformAndStorageBuffer16BitAccess"] = True
@@ -236,12 +238,12 @@ def get_vulkan_target_capabilities(triple):
        cap["uniformAndStorageBuffer8BitAccess"] = True
        cap["variablePointers"] = True
        cap["variablePointersStorageBuffer"] = True
-
        if arch == "rdna3":
            # TODO: Get scope value
            cap["coopmatCases"] = [
                "mSize = 16, nSize = 16, kSize = 16, aType = f16, bType = f16, cType = f16, resultType = f16, scope = #vk.scope<Subgroup>"
            ]
+
        if product == "rx5700xt":
            cap["storagePushConstant16"] = False
            cap["storagePushConstant8"] = False
@@ -274,7 +276,7 @@ def get_vulkan_target_capabilities(triple):
        cap["shaderInt8"] = True
        cap["shaderInt16"] = True
        cap["shaderInt64"] = True
-
+        cap["shaderIntegerDotProduct"] = True
        cap["storagePushConstant16"] = False
        cap["uniformAndStorageBuffer16BitAccess"] = True
        cap["storageBuffer8BitAccess"] = True
@@ -305,6 +307,7 @@ def get_vulkan_target_capabilities(triple):
        cap["shaderInt8"] = True
        cap["shaderInt16"] = True
        cap["shaderInt64"] = True
+        cap["shaderIntegerDotProduct"] = False
        cap["storageBuffer16BitAccess"] = True
        cap["storagePushConstant16"] = True
        cap["uniformAndStorageBuffer16BitAccess"] = True
@@ -367,6 +370,7 @@ def get_vulkan_target_capabilities(triple):
        cap["shaderInt8"] = True
        cap["shaderInt16"] = True
        cap["shaderInt64"] = False
+        cap["shaderIntegerDotProduct"] = True
        cap["storageBuffer16BitAccess"] = True
        cap["storagePushConstant16"] = True
        cap["uniformAndStorageBuffer16BitAccess"] = True
@@ -389,6 +393,40 @@ def get_vulkan_target_capabilities(triple):
                "ShuffleRelative",
            ]

+    elif arch in ["pascal"]:
+        cap["maxComputeSharedMemorySize"] = 49152
+        cap["maxComputeWorkGroupInvocations"] = 1536
+        cap["maxComputeWorkGroupSize"] = [1536, 1024, 64]
+
+        cap["subgroupSize"] = 32
+        cap["minSubgroupSize"] = 32
+        cap["maxSubgroupSize"] = 32
+        cap["subgroupFeatures"] = [
+            "Basic",
+            "Vote",
+            "Arithmetic",
+            "Ballot",
+            "Shuffle",
+            "ShuffleRelative",
+            "Clustered",
+            "Quad",
+        ]
+
+        cap["shaderFloat16"] = False
+        cap["shaderFloat64"] = True
+        cap["shaderInt8"] = True
+        cap["shaderInt16"] = True
+        cap["shaderInt64"] = True
+        cap["shaderIntegerDotProduct"] = True
+        cap["storageBuffer16BitAccess"] = True
+        cap["storagePushConstant16"] = True
+        cap["uniformAndStorageBuffer16BitAccess"] = True
+        cap["storageBuffer8BitAccess"] = True
+        cap["storagePushConstant8"] = True
+        cap["uniformAndStorageBuffer8BitAccess"] = True
+        cap["variablePointers"] = True
+        cap["variablePointersStorageBuffer"] = True
+
    elif arch in ["ampere", "turing"]:
        cap["maxComputeSharedMemorySize"] = 49152
        cap["maxComputeWorkGroupInvocations"] = 1024
@@ -413,6 +451,7 @@ def get_vulkan_target_capabilities(triple):
        cap["shaderInt8"] = True
        cap["shaderInt16"] = True
        cap["shaderInt64"] = True
+        cap["shaderIntegerDotProduct"] = True
        cap["storageBuffer16BitAccess"] = True
        cap["storagePushConstant16"] = True
        cap["uniformAndStorageBuffer16BitAccess"] = True
--- a/shark/iree_utils/vulkan_utils.py
+++ b/shark/iree_utils/vulkan_utils.py
@@ -21,7 +21,7 @@ from sys import platform
 from shark.iree_utils.vulkan_target_env_utils import get_vulkan_target_env_flag


-def get_vulkan_device_name():
+def get_vulkan_device_name(device_num=0):
    vulkaninfo_dump, _ = run_cmd("vulkaninfo")
    vulkaninfo_dump = vulkaninfo_dump.split(linesep)
    vulkaninfo_list = [s.strip() for s in vulkaninfo_dump if "deviceName" in s]
@@ -31,8 +31,8 @@ def get_vulkan_device_name():
        print("Following devices found:")
        for i, dname in enumerate(vulkaninfo_list):
            print(f"{i}. {dname}")
-        print(f"Choosing first one: {vulkaninfo_list[0]}")
-    return vulkaninfo_list[0]
+        print(f"Choosing device: {vulkaninfo_list[int(device_num)]}")
+    return vulkaninfo_list[int(device_num)]  # may be a str from the URI


 def get_os_name():
@@ -107,6 +107,8 @@ def get_vulkan_target_triple(device_name):
    # Windows: AMD Radeon RX 7900 XTX
    elif all(x in device_name for x in ("RX", "7900")):
        triple = f"rdna3-7900-{system_os}"
+    elif all(x in device_name for x in ("AMD", "PRO", "W7900")):
+        triple = f"rdna3-w7900-{system_os}"
    elif any(x in device_name for x in ("AMD", "Radeon")):
        triple = f"rdna2-unknown-{system_os}"
    # Intel Targets
@@ -117,14 +119,14 @@ def get_vulkan_target_triple(device_name):
    return triple


-def get_vulkan_triple_flag(device_name="", extra_args=[]):
+def get_vulkan_triple_flag(device_name="", device_num=0, extra_args=[]):
    for flag in extra_args:
        if "-iree-vulkan-target-triple=" in flag:
            print(f"Using target triple {flag.split('=')[1]}")
            return None

    if device_name == "" or device_name == [] or device_name is None:
-        vulkan_device = get_vulkan_device_name()
+        vulkan_device = get_vulkan_device_name(device_num=device_num)
    else:
        vulkan_device = device_name
    triple = get_vulkan_target_triple(vulkan_device)
@@ -142,7 +144,7 @@ def get_vulkan_triple_flag(device_name="", extra_args=[]):
    return None


-def get_iree_vulkan_args(extra_args=[]):
+def get_iree_vulkan_args(device_num=0, extra_args=[]):
    # res_vulkan_flag = ["--iree-flow-demote-i64-to-i32"]

    res_vulkan_flag = []
@@ -154,7 +156,9 @@ def get_iree_vulkan_args(extra_args=[]):
            break

    if vulkan_triple_flag is None:
-        vulkan_triple_flag = get_vulkan_triple_flag(extra_args=extra_args)
+        vulkan_triple_flag = get_vulkan_triple_flag(
+            device_num=device_num, extra_args=extra_args
+        )

    if vulkan_triple_flag is not None:
        vulkan_target_env = get_vulkan_target_env_flag(vulkan_triple_flag)
--- a/shark/model_annotation.py
+++ b/shark/model_annotation.py
@@ -30,8 +30,8 @@ import os
 import sys
 from typing import Dict, List

+import iree.compiler._mlir_libs
 from iree.compiler import ir
-from iree.compiler.transforms import ireec as ireec_trans


 def model_annotation(
@@ -311,11 +311,18 @@ def add_attributes(op: ir.Operation, config: List[Dict]):
            split_k = config["split_k"]
    elif "SPIRV" in config["pipeline"]:
        pipeline = config["pipeline"]
-        tile_sizes = [
-            config["work_group_tile_sizes"],
-            config["parallel_tile_sizes"],
-            config["reduction_tile_sizes"],
-        ]
+        if pipeline == "SPIRVMatmulPromoteVectorize":
+            tile_sizes = [
+                config["work_group_tile_sizes"]
+                + [config["reduction_tile_sizes"][-1]],
+            ]
+        else:
+            tile_sizes = [
+                config["work_group_tile_sizes"],
+                config["parallel_tile_sizes"],
+                config["reduction_tile_sizes"],
+            ]
+
        workgroup_size = config["work_group_sizes"]
        if "vector_tile_sizes" in config.keys():
            tile_sizes += [config["vector_tile_sizes"]]
@@ -409,7 +416,6 @@ def shape_list_to_string(input):

 def create_context() -> ir.Context:
    context = ir.Context()
-    ireec_trans.register_all_dialects(context)
    context.allow_unregistered_dialects = True
    return context

--- a/shark/shark_downloader.py
+++ b/shark/shark_downloader.py
@@ -61,6 +61,8 @@ def download_public_file(
                continue

        destination_filename = os.path.join(destination_folder_name, blob_name)
+        if os.path.isdir(destination_filename):
+            continue
        with open(destination_filename, "wb") as f:
            with tqdm.wrapattr(f, "write", total=blob.size) as file_obj:
                storage_client.download_blob_to_file(blob, file_obj)
@@ -196,13 +198,13 @@ def download_model(
    tank_url=None,
    frontend=None,
    tuned=None,
-    import_args={"batch_size": "1"},
+    import_args={"batch_size": 1},
 ):
    model_name = model_name.replace("/", "_")
    dyn_str = "_dynamic" if dynamic else ""
    os.makedirs(WORKDIR, exist_ok=True)
    shark_args.shark_prefix = get_sharktank_prefix()
-    if import_args["batch_size"] != 1:
+    if import_args["batch_size"] and import_args["batch_size"] != 1:
        model_dir_name = (
            model_name
            + "_"
@@ -210,6 +212,9 @@ def download_model(
            + "_BS"
            + str(import_args["batch_size"])
        )
+    elif any(model in model_name for model in ["clip", "unet", "vae"]):
+        # TODO(Ean Garvey): rework extended naming such that device is only included in model_name after .vmfb compilation.
+        model_dir_name = model_name
    else:
        model_dir_name = model_name + "_" + frontend
    model_dir = os.path.join(WORKDIR, model_dir_name)
@@ -270,6 +275,9 @@ def download_model(
    tuned_str = "" if tuned is None else "_" + tuned
    suffix = f"{dyn_str}_{frontend}{tuned_str}.mlir"
    filename = os.path.join(model_dir, model_name + suffix)
+    print(
+        f"Verifying that model artifacts were downloaded successfully to {filename}..."
+    )
    if not os.path.exists(filename):
        from tank.generate_sharktank import gen_shark_files

--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -81,7 +81,7 @@ class SharkImporter:

    # NOTE: The default function for torch is "forward" and tf-lite is "main".

-    def _torch_mlir(self, is_dynamic, tracing_required):
+    def _torch_mlir(self, is_dynamic, tracing_required, mlir_type):
        from shark.torch_mlir_utils import get_torch_mlir_module

        return get_torch_mlir_module(
@@ -90,6 +90,7 @@ class SharkImporter:
            is_dynamic,
            tracing_required,
            self.return_str,
+            mlir_type,
        )

    def _tf_mlir(self, func_name, save_dir="."):
@@ -120,6 +121,7 @@ class SharkImporter:
        tracing_required=False,
        func_name="forward",
        save_dir="./shark_tmp/",
+        mlir_type="linalg",
    ):
        if self.frontend in ["torch", "pytorch"]:
            if self.inputs == None:
@@ -127,7 +129,10 @@ class SharkImporter:
                    "Please pass in the inputs, the inputs are required to determine the shape of the mlir_module"
                )
                sys.exit(1)
-            return self._torch_mlir(is_dynamic, tracing_required), func_name
+            return (
+                self._torch_mlir(is_dynamic, tracing_required, mlir_type),
+                func_name,
+            )
        if self.frontend in ["tf", "tensorflow"]:
            return self._tf_mlir(func_name, save_dir), func_name
        if self.frontend in ["tflite", "tf-lite"]:
@@ -143,14 +148,23 @@ class SharkImporter:

    # Saves `function_name.npy`, `inputs.npz`, `golden_out.npz` and `model_name.mlir` in the directory `dir`.
    def save_data(
-        self, dir, model_name, mlir_data, func_name, inputs, outputs
+        self,
+        dir,
+        model_name,
+        mlir_data,
+        func_name,
+        inputs,
+        outputs,
+        mlir_type="linalg",
    ):
        import numpy as np

        inputs_name = "inputs.npz"
        outputs_name = "golden_out.npz"
        func_file_name = "function_name"
-        model_name_mlir = model_name + "_" + self.frontend + ".mlir"
+        model_name_mlir = (
+            model_name + "_" + self.frontend + "_" + mlir_type + ".mlir"
+        )
        print(f"saving {model_name_mlir} to {dir}")
        try:
            inputs = [x.cpu().detach() for x in inputs]
@@ -186,19 +200,23 @@ class SharkImporter:
        dir=tempfile.gettempdir(),
        model_name="model",
        golden_values=None,
+        mlir_type="linalg",
    ):
        if self.inputs == None:
            print(
                f"There is no input provided: {self.inputs}, please provide inputs or simply run import_mlir."
            )
            sys.exit(1)
-        model_name_mlir = model_name + "_" + self.frontend + ".mlir"
+        model_name_mlir = (
+            model_name + "_" + self.frontend + "_" + mlir_type + ".mlir"
+        )
        artifact_path = os.path.join(dir, model_name_mlir)
        imported_mlir = self.import_mlir(
            is_dynamic,
            tracing_required,
            func_name,
            save_dir=artifact_path,
+            mlir_type=mlir_type,
        )
        # TODO: Make sure that any generic function name is accepted. Currently takes in the default function names.
        # TODO: Check for multiple outputs.
@@ -224,6 +242,7 @@ class SharkImporter:
                imported_mlir[1],
                self.inputs,
                golden_out,
+                mlir_type,
            )
            return (
                imported_mlir,
@@ -301,6 +320,9 @@ def transform_fx(fx_g):
        "device": torch.device(type="cpu"),
        "pin_memory": False,
    }
+    kwargs_dict1 = {
+        "dtype": torch.float16,
+    }
    for node in fx_g.graph.nodes:
        if node.op == "call_function":
            if node.target in [
@@ -308,7 +330,16 @@ def transform_fx(fx_g):
                torch.ops.aten.empty,
                torch.ops.aten.zeros,
            ]:
-                node.kwargs = kwargs_dict
+                if node.kwargs.get("dtype") == torch.float32:
+                    node.kwargs = kwargs_dict
+
+            # Vicuna: downcast aten._to_copy ops that request fp32 to fp16.
+            if node.target in [
+                torch.ops.aten._to_copy,
+            ]:
+                if node.kwargs.get("dtype") == torch.float32:
+                    node.kwargs = kwargs_dict1
+
            # Inputs and outputs of aten.var.mean should be upcasted to fp32.
            if node.target in [torch.ops.aten.var_mean]:
                with fx_g.graph.inserting_before(node):
@@ -318,6 +349,7 @@ def transform_fx(fx_g):
                        kwargs={},
                    )
                    node.args = (new_node, node.args[1])
+
            if node.name.startswith("getitem"):
                with fx_g.graph.inserting_before(node):
                    if node.args[0].target in [torch.ops.aten.var_mean]:
@@ -330,6 +362,19 @@ def transform_fx(fx_g):
                        node.replace_all_uses_with(new_node)
                        new_node.args = (node,)
                        new_node.kwargs = {"dtype": torch.float16}
+
+            # Change the default dtype of aten.full op. (Vicuna)
+            if node.target in [torch.ops.aten.full]:
+                new_node = fx_g.graph.call_function(
+                    torch.ops.aten._to_copy,
+                    args=(node,),
+                    kwargs={"dtype": torch.float16},
+                )
+                node.append(new_node)
+                node.replace_all_uses_with(new_node)
+                new_node.args = (node,)
+                new_node.kwargs = {"dtype": torch.float16}
+
            # aten.empty should be filled with zeros.
            if node.target in [torch.ops.aten.empty]:
                with fx_g.graph.inserting_after(node):
@@ -392,6 +437,9 @@ def import_with_fx(
    return_str=False,
    save_dir=tempfile.gettempdir(),
    model_name="model",
+    mlir_type="linalg",
+    is_dynamic=False,
+    tracing_required=False,
 ):
    import torch
    from torch.fx.experimental.proxy_tensor import make_fx
@@ -458,7 +506,12 @@ def import_with_fx(

    if debug:  # and not is_f16:
        (mlir_module, func_name), _, _ = mlir_importer.import_debug(
-            dir=save_dir, model_name=model_name, golden_values=golden_values
+            dir=save_dir,
+            model_name=model_name,
+            golden_values=golden_values,
+            mlir_type=mlir_type,
+            is_dynamic=is_dynamic,
+            tracing_required=tracing_required,
        )
        return mlir_module, func_name

--- a/shark/shark_runner.py
+++ b/shark/shark_runner.py
@@ -25,7 +25,14 @@ import sys


 # supported dialects by the shark-runtime.
-supported_dialects = {"linalg", "mhlo", "tosa", "tf-lite", "tm_tensor"}
+supported_dialects = {
+    "linalg",
+    "mhlo",
+    "stablehlo",
+    "tosa",
+    "tf-lite",
+    "tm_tensor",
+}


 class SharkRunner:
--- a/shark/shark_trainer.py
+++ b/shark/shark_trainer.py
@@ -59,6 +59,7 @@ class SharkTrainer:
            "torch",
            "tensorflow",
            "tf",
+            "stablehlo",
            "mhlo",
            "linalg",
            "tosa",
@@ -84,7 +85,7 @@ class SharkTrainer:
                "tm_tensor",
                extra_args=extra_args,
            )
-        elif self.frontend in ["tensorflow", "tf", "mhlo"]:
+        elif self.frontend in ["tensorflow", "tf", "mhlo", "stablehlo"]:
            self.shark_runner = SharkRunner(
                self.model,
                self.input,
--- a/shark/torch_mlir_utils.py
+++ b/shark/torch_mlir_utils.py
@@ -19,6 +19,12 @@ import tempfile
 from shark.parser import shark_args
 import io

+mlir_type_mapping_dict = {
+    "linalg": torch_mlir.OutputType.LINALG_ON_TENSORS,
+    "stablehlo": torch_mlir.OutputType.STABLEHLO,
+    "tosa": torch_mlir.OutputType.TOSA,
+}
+

 def get_module_name_for_asm_dump(module):
    """Gets a name suitable for an assembly dump.
@@ -57,6 +63,7 @@ def get_torch_mlir_module(
    dynamic: bool,
    jit_trace: bool,
    return_str: bool = False,
+    mlir_type: str = "linalg",
 ):
    """Get the MLIR's linalg-on-tensors module from the torchscipt module."""
    ignore_traced_shapes = False
@@ -70,10 +77,11 @@ def get_torch_mlir_module(
    mlir_module = torch_mlir.compile(
        module,
        input,
-        output_type=torch_mlir.OutputType.LINALG_ON_TENSORS,
+        output_type=mlir_type_mapping_dict[mlir_type],
        use_tracing=jit_trace,
        ignore_traced_shapes=ignore_traced_shapes,
    )
+
    if return_str:
        return mlir_module.operation.get_asm()
    bytecode_stream = io.BytesIO()
--- a/tank/all_models.csv
+++ b/tank/all_models.csv
@@ -8,23 +8,23 @@ distilbert-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
 facebook/convnext-tiny-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,True,True,False,"https://github.com/nod-ai/SHARK/issues/311 & https://github.com/nod-ai/SHARK/issues/342","macos"
 funnel-transformer/small,mhlo,tf,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/201",""
 google/electra-small-discriminator,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"Fails during iree-compile",""
+google/mobilebert-uncased,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"Fails during iree-compile","macos"
 google/vit-base-patch16-224,mhlo,tf,1e-2,1e-3,tf_vit,nhcw-nhwc,False,False,False,"",""
 microsoft/MiniLM-L12-H384-uncased,mhlo,tf,1e-2,1e-3,tf_hf,None,True,False,False,"Fails during iree-compile.",""
 microsoft/layoutlm-base-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
 microsoft/mpnet-base,mhlo,tf,1e-2,1e-2,default,None,True,True,True,"",""
 albert-base-v2,linalg,torch,1e-2,1e-3,default,None,True,True,True,"issue with aten.tanh in torch-mlir",""
 alexnet,linalg,torch,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/879",""
-bert-base-cased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
-bert-base-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
-bert-base-uncased_fp16,linalg,torch,1e-1,1e-1,default,None,True,False,True,"",""
-bert-large-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
+bert-base-cased,linalg,torch,1e-2,1e-3,default,None,False,True,False,"",""
+bert-base-uncased,linalg,torch,1e-2,1e-3,default,None,False,True,False,"",""
+bert-base-uncased_fp16,linalg,torch,1e-1,1e-1,default,None,True,True,True,"",""
+bert-large-uncased,linalg,torch,1e-2,1e-3,default,None,False,True,False,"",""
 bert-large-uncased,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
 facebook/deit-small-distilled-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"Fails during iree-compile.",""
 google/vit-base-patch16-224,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/311",""
 microsoft/beit-base-patch16-224-pt22k-ft22k,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/390","macos"
-microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"",""
-google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default,None,False,False,False,"https://github.com/nod-ai/SHARK/issues/344",""
+microsoft/MiniLM-L12-H384-uncased,linalg,torch,1e-2,1e-3,default,None,False,True,False,"",""
+google/mobilebert-uncased,linalg,torch,1e-2,1e-3,default,None,False,True,False,"https://github.com/nod-ai/SHARK/issues/344","macos"
 mobilenet_v3_small,linalg,torch,1e-1,1e-2,default,nhcw-nhwc,False,True,False,"https://github.com/nod-ai/SHARK/issues/388","macos"
 nvidia/mit-b0,linalg,torch,1e-2,1e-3,default,None,True,True,False,"https://github.com/nod-ai/SHARK/issues/343","macos"
 resnet101,linalg,torch,1e-2,1e-3,default,nhcw-nhwc/img2col,False,False,False,"","macos"
@@ -35,12 +35,13 @@ squeezenet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"","mac
 wide_resnet50_2,linalg,torch,1e-2,1e-3,default,nhcw-nhwc/img2col,False,False,False,"","macos"
 efficientnet-v2-s,mhlo,tf,1e-02,1e-3,default,nhcw-nhwc,False,False,False,"","macos"
 mnasnet1_0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,True,True,True,"","macos"
-efficientnet_b0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,True,True,False,"https://github.com/nod-ai/SHARK/issues/1243",""
-efficientnet_b7,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"Fails on MacOS builder, VK device lost","macos"
+efficientnet_b0,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,True,True,False,"https://github.com/nod-ai/SHARK/issues/1243","macos"
+efficientnet_b7,linalg,torch,1e-2,1e-3,default,nhcw-nhwc,False,True,False,"Fails on MacOS builder, VK device lost","macos"
 efficientnet_b0,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"",""
 efficientnet_b7,mhlo,tf,1e-2,1e-3,default,nhcw-nhwc,False,False,False,"Fails on MacOS builder, VK device lost","macos"
-gpt2,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"",""
-t5-base,linalg,torch,1e-2,1e-3,default,None,True,True,True,"Inputs for seq2seq models in torch currently unsupported.",""
-t5-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
-t5-large,linalg,torch,1e-2,1e-3,default,None,True,True,True,"Inputs for seq2seq models in torch currently unsupported",""
-t5-large,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"",""
+gpt2,mhlo,tf,1e-2,1e-3,default,None,True,False,False,"","macos"
+t5-base,linalg,torch,1e-2,1e-3,default,None,True,True,True,"Inputs for seq2seq models in torch currently unsupported.","macos"
+t5-base,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"","macos"
+t5-large,linalg,torch,1e-2,1e-3,default,None,True,True,True,"Inputs for seq2seq models in torch currently unsupported","macos"
+t5-large,mhlo,tf,1e-2,1e-3,default,None,False,False,False,"","macos"
+stabilityai/stable-diffusion-2-1-base,linalg,torch,1e-3,1e-3,default,None,True,False,False,"","macos"
--- a/tank/examples/MiniLM_tf/huggingface_MiniLM_run.py
+++ b/tank/examples/MiniLM_tf/huggingface_MiniLM_run.py
@@ -75,7 +75,7 @@ if __name__ == "__main__":
        compiler_module,
        target_backends=[backend],
        extra_args=args,
-        input_type="mhlo",
+        input_type="auto",
    )
    # flatbuffer_blob = compile_str(compiler_module, target_backends=["dylib-llvm-aot"])

--- a/tank/examples/bert_fine_tuning/bert_fine_tune_tf.py
+++ b/tank/examples/bert_fine_tuning/bert_fine_tune_tf.py
@@ -153,7 +153,7 @@ if __name__ == "__main__":
        compiler_module,
        target_backends=[backend],
        extra_args=args,
-        input_type="mhlo",
+        input_type="auto",
    )

    # Save module as MLIR file in a directory
--- a/tank/examples/bert_tf/bert_large_run.py
+++ b/tank/examples/bert_tf/bert_large_run.py
@@ -96,7 +96,7 @@ if __name__ == "__main__":
        compiler_module,
        target_backends=[backend],
        extra_args=args,
-        input_type="mhlo",
+        input_type="auto",
    )
    # flatbuffer_blob = compile_str(compiler_module, target_backends=["dylib-llvm-aot"])

--- a/tank/examples/bert_tf/bert_small_run.py
+++ b/tank/examples/bert_tf/bert_small_run.py
@@ -91,7 +91,7 @@ if __name__ == "__main__":
        compiler_module,
        target_backends=[backend],
        extra_args=args,
-        input_type="mhlo",
+        input_type="auto",
    )
    # flatbuffer_blob = compile_str(compiler_module, target_backends=["dylib-llvm-aot"])

--- a/tank/examples/opt/opt_torch_test.py
+++ b/tank/examples/opt/opt_torch_test.py
@@ -6,7 +6,7 @@ from hacked_hf_opt import OPTModel
 from shark.iree_utils._common import check_device_drivers, device_driver_info
 from shark.shark_inference import SharkInference
 from tank.model_utils import compare_tensors
-from transformers import GPT2Tokenizer
+from transformers import AutoTokenizer

 OPT_MODEL = "facebook/opt-350m"
 OPT_MODEL_66B = "facebook/opt-66b"
@@ -20,7 +20,7 @@ class OPTModuleTester:
        self.benchmark = benchmark

    def create_and_check_module(self, dynamic, device, model_name):
-        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        # config = OPTConfig()
        # opt_model = OPTModel(config)
        opt_model = OPTModel.from_pretrained(model_name)
--- a/tank/generate_sharktank.py
+++ b/tank/generate_sharktank.py
@@ -37,10 +37,12 @@ def save_torch_model(torch_model_list, local_tank_cache, import_args):
    from tank.model_utils import (
        get_hf_model,
        get_hf_seq2seq_model,
+        get_hf_causallm_model,
        get_vision_model,
        get_hf_img_cls_model,
        get_fp16_model,
    )
+    from shark.shark_importer import import_with_fx

    with open(torch_model_list) as csvfile:
        torch_reader = csv.reader(csvfile, delimiter=",")
@@ -50,6 +52,8 @@ def save_torch_model(torch_model_list, local_tank_cache, import_args):
            tracing_required = row[1]
            model_type = row[2]
            is_dynamic = row[3]
+            mlir_type = row[4]
+            is_decompose = row[5]

            tracing_required = False if tracing_required == "False" else True
            is_dynamic = False if is_dynamic == "False" else True
@@ -91,6 +95,10 @@ def save_torch_model(torch_model_list, local_tank_cache, import_args):
                model, input, _ = get_hf_seq2seq_model(
                    torch_model_name, import_args
                )
+            elif model_type == "hf_causallm":
+                model, input, _ = get_hf_causallm_model(
+                    torch_model_name, import_args
+                )
            elif model_type == "hf_img_cls":
                model, input, _ = get_hf_img_cls_model(
                    torch_model_name, import_args
@@ -111,25 +119,45 @@ def save_torch_model(torch_model_list, local_tank_cache, import_args):
                )
            os.makedirs(torch_model_dir, exist_ok=True)

-            mlir_importer = SharkImporter(
-                model,
-                (input,),
-                frontend="torch",
-            )
-            mlir_importer.import_debug(
-                is_dynamic=False,
-                tracing_required=tracing_required,
-                dir=torch_model_dir,
-                model_name=torch_model_name,
-            )
-            # Generate torch dynamic models.
-            if is_dynamic:
+            if is_decompose:
+                # Add decomposition to some torch ops
+                # TODO add op whitelist/blacklist
+                import_with_fx(
+                    model,
+                    (input,),
+                    is_f16=False,
+                    f16_input_mask=None,
+                    debug=True,
+                    training=False,
+                    return_str=False,
+                    save_dir=torch_model_dir,
+                    model_name=torch_model_name,
+                    mlir_type=mlir_type,
+                    is_dynamic=False,
+                    tracing_required=tracing_required,
+                )
+            else:
+                mlir_importer = SharkImporter(
+                    model,
+                    (input,),
+                    frontend="torch",
+                )
                mlir_importer.import_debug(
-                    is_dynamic=True,
+                    is_dynamic=False,
                    tracing_required=tracing_required,
                    dir=torch_model_dir,
-                    model_name=torch_model_name + "_dynamic",
+                    model_name=torch_model_name,
+                    mlir_type=mlir_type,
                )
+                # Generate torch dynamic models.
+                if is_dynamic:
+                    mlir_importer.import_debug(
+                        is_dynamic=True,
+                        tracing_required=tracing_required,
+                        dir=torch_model_dir,
+                        model_name=torch_model_name + "_dynamic",
+                        mlir_type=mlir_type,
+                    )


 def save_tf_model(tf_model_list, local_tank_cache, import_args):
--- a/tank/model_utils.py
+++ b/tank/model_utils.py
@@ -176,6 +176,43 @@ def get_hf_seq2seq_model(name, import_args):
    return m, test_input, actual_out


+##################### Hugging Face CausalLM Models ###################################
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+
+def prepare_sentence_tokens(hf_model: str, sentence: str):
+    tokenizer = AutoTokenizer.from_pretrained(hf_model)
+    return torch.tensor([tokenizer.encode(sentence)])
+
+
+class HFCausalLM(torch.nn.Module):
+    def __init__(self, model_name: str):
+        super().__init__()
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,  # The pretrained model name.
+            # The number of output labels--2 for binary classification.
+            num_labels=2,
+            # Whether the model returns attentions weights.
+            output_attentions=False,
+            # Whether the model returns all hidden-states.
+            output_hidden_states=False,
+            torchscript=True,
+        )
+        self.model.eval()
+
+    def forward(self, tokens):
+        return self.model.forward(tokens)[0]
+
+
+def get_hf_causallm_model(name, import_args):
+    m = HFCausalLM(name)
+    test_input = prepare_sentence_tokens(
+        name, "this project is very interesting"
+    )
+    actual_out = m.forward(*test_input)
+    return m, test_input, actual_out
+
+
 ################################################################################

 ##################### Torch Vision Models    ###################################
--- a/tank/torch_model_list.csv
+++ b/tank/torch_model_list.csv
@@ -1,23 +1,27 @@
-model_name, use_tracing, model_type, dynamic, param_count, tags, notes
-efficientnet_b0,True,vision,False,5.3M,"image-classification;cnn;conv2d;depthwise-conv","Smallest EfficientNet variant with 224x224 input"
-efficientnet_b7,True,vision,False,66M,"image-classification;cnn;conv2d;depthwise-conv","Largest EfficientNet variant with 600x600 input"
-microsoft/MiniLM-L12-H384-uncased,True,hf,True,66M,"nlp;bert-variant;transformer-encoder","Large version has 12 layers; 384 hidden size; Smaller than BERTbase (66M params vs 109M params)"
-bert-base-uncased,True,hf,True,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
-bert-base-cased,True,hf,True,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
-google/mobilebert-uncased,True,hf,True,25M,"nlp,bert-variant,transformer-encoder,mobile","24 layers, 512 hidden size, 128 embedding"
-alexnet,False,vision,True,61M,"cnn,parallel-layers","The CNN that revolutionized computer vision (move away from hand-crafted features to neural networks),10 years old now and probably no longer used in prod."
-resnet18,False,vision,True,11M,"cnn,image-classification,residuals,resnet-variant","1 7x7 conv2d and the rest are 3x3 conv2d"
-resnet50,False,vision,True,23M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
-resnet101,False,vision,True,29M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
-squeezenet1_0,False,vision,True,1.25M,"cnn,image-classification,mobile,parallel-layers","Parallel conv2d (1x1 conv to compress -> (3x3 expand | 1x1 expand) -> concat)"
-wide_resnet50_2,False,vision,True,69M,"cnn,image-classification,residuals,resnet-variant","Resnet variant where model depth is decreased and width is increased."
-mobilenet_v3_small,False,vision,True,2.5M,"image-classification,cnn,mobile",N/A
-google/vit-base-patch16-224,True,hf_img_cls,False,86M,"image-classification,vision-transformer,transformer-encoder",N/A
-microsoft/resnet-50,True,hf_img_cls,False,23M,"image-classification,cnn,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
-facebook/deit-small-distilled-patch16-224,True,hf_img_cls,False,22M,"image-classification,vision-transformer,cnn",N/A
-microsoft/beit-base-patch16-224-pt22k-ft22k,True,hf_img_cls,False,86M,"image-classification,transformer-encoder,bert-variant,vision-transformer",N/A
-nvidia/mit-b0,True,hf_img_cls,False,3.7M,"image-classification,transformer-encoder",SegFormer
-mnasnet1_0,False,vision,True,-,"cnn, torchvision, mobile, architecture-search","Outperforms other mobile CNNs on Accuracy vs. Latency"
-resnet50_fp16,False,vision,True,23M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
-bert-base-uncased_fp16,True,fp16,False,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
-bert-large-uncased,True,hf,True,330M,"nlp;bert-variant;transformer-encoder","24 layers, 1024 hidden units, 16 attention heads"
+model_name, use_tracing, model_type, dynamic, mlir_type, decompose, param_count, tags, notes
+efficientnet_b0,True,vision,False,linalg,False,5.3M,"image-classification;cnn;conv2d;depthwise-conv","Smallest EfficientNet variant with 224x224 input"
+efficientnet_b7,True,vision,False,linalg,False,66M,"image-classification;cnn;conv2d;depthwise-conv","Largest EfficientNet variant with 600x600 input"
+microsoft/MiniLM-L12-H384-uncased,True,hf,True,linalg,False,66M,"nlp;bert-variant;transformer-encoder","Large version has 12 layers; 384 hidden size; Smaller than BERTbase (66M params vs 109M params)"
+bert-base-uncased,True,hf,True,linalg,False,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
+bert-base-cased,True,hf,True,linalg,False,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
+google/mobilebert-uncased,True,hf,True,linalg,False,25M,"nlp,bert-variant,transformer-encoder,mobile","24 layers, 512 hidden size, 128 embedding"
+alexnet,False,vision,True,linalg,False,61M,"cnn,parallel-layers","The CNN that revolutionized computer vision (move away from hand-crafted features to neural networks),10 years old now and probably no longer used in prod."
+resnet18,False,vision,True,linalg,False,11M,"cnn,image-classification,residuals,resnet-variant","1 7x7 conv2d and the rest are 3x3 conv2d"
+resnet50,False,vision,True,linalg,False,23M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
+resnet101,False,vision,True,linalg,False,29M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
+squeezenet1_0,False,vision,True,linalg,False,1.25M,"cnn,image-classification,mobile,parallel-layers","Parallel conv2d (1x1 conv to compress -> (3x3 expand | 1x1 expand) -> concat)"
+wide_resnet50_2,False,vision,True,linalg,False,69M,"cnn,image-classification,residuals,resnet-variant","Resnet variant where model depth is decreased and width is increased."
+mobilenet_v3_small,False,vision,True,linalg,False,2.5M,"image-classification,cnn,mobile",N/A
+google/vit-base-patch16-224,True,hf_img_cls,False,linalg,False,86M,"image-classification,vision-transformer,transformer-encoder",N/A
+microsoft/resnet-50,True,hf_img_cls,False,linalg,False,23M,"image-classification,cnn,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
+facebook/deit-small-distilled-patch16-224,True,hf_img_cls,False,linalg,False,22M,"image-classification,vision-transformer,cnn",N/A
+microsoft/beit-base-patch16-224-pt22k-ft22k,True,hf_img_cls,False,linalg,False,86M,"image-classification,transformer-encoder,bert-variant,vision-transformer",N/A
+nvidia/mit-b0,True,hf_img_cls,False,linalg,False,3.7M,"image-classification,transformer-encoder",SegFormer
+mnasnet1_0,False,vision,True,linalg,False,-,"cnn, torchvision, mobile, architecture-search","Outperforms other mobile CNNs on Accuracy vs. Latency"
+resnet50_fp16,False,vision,True,linalg,False,23M,"cnn,image-classification,residuals,resnet-variant","Bottlenecks with only conv2d (1x1 conv -> 3x3 conv -> 1x1 conv blocks)"
+bert-base-uncased_fp16,True,fp16,False,linalg,False,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
+bert-large-uncased,True,hf,True,linalg,False,330M,"nlp;bert-variant;transformer-encoder","24 layers, 1024 hidden units, 16 attention heads"
+bert-base-uncased,True,hf,False,stablehlo,False,109M,"nlp;bert-variant;transformer-encoder","12 layers; 768 hidden; 12 attention heads"
+gpt2,True,hf_causallm,False,stablehlo,True,125M,"nlp;transformer-encoder","-"
+facebook/opt-125m,True,hf,False,stablehlo,True,125M,"nlp;transformer-encoder","-"
+distilgpt2,True,hf,False,stablehlo,True,88M,"nlp;transformer-encoder","-"
--- a/tank_version.json
+++ b/tank_version.json
@@ -1,3 +1,3 @@
 {
-	"version": "2023-03-31_02d52bb"
+	"version": "nightly"
 }
Author	SHA1	Message	Date
Anush Elangovan	7ef1bea953	XFAIL some macos tests	2023-06-04 15:27:03 -07:00
Chi_Liu	ad89bb1413	Add distilgpt2 to stablehlo in shark tank (#1481 )	2023-06-02 16:44:46 -05:00
Ean Garvey	218ed78c40	Change instances of input_type='mhlo' to 'auto' (#1482 )	2023-06-02 16:43:47 -05:00
Stefan Kapusniak	6046f36ab6	UI/Web: Fix upscaler stop button (mostly) (#1479 ) * UI/Web: Fix upscaler stop button * Hook the cancel_sd function up to the Stop button. * Adds checks for SD_STATE_CANCEL in the upscaler ui inference function. * Set and check for SD_STATE_IDLE, SD_STATE_CANCEL in the upscaler pipeline. * UI/Web: lint fixes for upscaler stop button fix --------- Co-authored-by: powderluv <powderluv@users.noreply.github.com>	2023-06-01 22:26:55 -07:00
Foxlum	5915bf7de3	Add to and tweak vulkan configuration environments. (#1475 ) * Update vulkan_target_env_utils.py * Update vulkan_target_env_utils.py Adjust target environment capabilities. * Update vulkan_target_env_utils.py black linted?	2023-06-01 22:25:20 -07:00
Phaneesh Barwaria	f0a4e59758	LLM Pipeline Wrapper (#1477 ) * [LLM] Add LLM pipeline Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com> * add base pipeline and stableLM * StableLM on UI - full block * add SLM default model name * add vicuna with pipeline * add one token gen api for vic * Fix stableLM bugs * debug vic memory * lint fix --------- Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com> Co-authored-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-05-31 10:17:20 -07:00
Stefan Kapusniak	1ddef26af5	Web/UI: Add an Output Gallery tab for SD (#1470 ) * WebUI: Adds an Output Gallery tab Adds an new Output Gallery tab to the ui/webui with these features: * Subdirectory select dropdown listing subdirectories at any depth below the <output_dir>/generated_imgs directory, * Large, full height, gallery area displaying the images in the selected subdirectory. Shows nod logo when no images are in the selected subdirectory. * Slider that changes the number of columns of images that the gallery displays from between 1 to 16 columns (defaults to 4). * Expandable parameter info panel showing any generation parameters saved in the file of the selected image for PNGs, alternatively the image's EXIF data for JPEGs * Send to buttons for txt2img, img2img, inpaint, outpaint and upscaler. * Auto update of gallery and gallery label (to show generation status), when a new image is generated by any of the stable diffusion tabs, and is outputted to the currently selected subdirectory. * Command line option for enabling and disabling the output gallery (defaults to enabled) * Command line option for following symlinks when getting entries for the subdirectory list (defaults to off, as Python os.walk doesn't check for circular references if following symlinks) * Reformat with black Reformat changes with black and then adjust some places where black's formatting then needed some rephrasing of the code to make things clearer. * Add back transformers and sd_cancel imports Adds back the transformers import in index.py needed for .exe generation. Add comment so it doesn't get mistakenly removed next time. Adds back sd_cancel import in upscaler.py that is currently unused but should be being used for the 'Stop' button.	2023-05-30 13:47:48 -07:00
Chi_Liu	ba8eddb12f	Add GPT3/OPT to Stablehlo in shark tank (#1468 ) Co-authored-by: AmosLewis <Amos_Lewsi@foxmail.com> Co-authored-by: Ean Garvey <87458719+monorimet@users.noreply.github.com>	2023-05-29 21:58:39 -07:00
yzhang93	47b346d428	Modify the lowering config format for SPIRVMatmulPromoteVectorize pipeline (#1471 )	2023-05-29 21:53:48 -07:00
Ean Garvey	1b4f4f5f4d	Fix download path for SD1.4 Unet. (#1469 )	2023-05-26 11:59:51 -07:00
Elias Joseph	73cd7e8320	added full vicuna to vicuna.py	2023-05-26 22:06:40 +05:30
Ean Garvey	19c0ae3702	Cleanup SD pipeline utils (#1466 )	2023-05-25 12:50:11 -05:00
Ean Garvey	54e57f7771	Revive SD downloads from shark_tank. (#1465 )	2023-05-25 12:03:21 -05:00
PhaneeshB	6d64b8e273	vic and slm common generation base	2023-05-25 20:29:41 +05:30
PhaneeshB	a8ea0326f5	correct SLM saved vmfb naming	2023-05-25 20:29:41 +05:30
PhaneeshB	58e9194553	add Lists import	2023-05-25 20:29:41 +05:30
PhaneeshB	eb360e255d	remove unused imports	2023-05-25 20:29:41 +05:30
PhaneeshB	a6f88d7f72	refactor mlir compile	2023-05-25 20:29:41 +05:30
Prashant Kumar	8e571d165f	Enable cpu f16 dtype tracing for the vicuna model. (#1461 )	2023-05-24 09:37:57 -07:00
Ean Garvey	3cddd01b10	Update OPT tokenizer and xfail a few more large tests on macos CI (#1459 ) * Update opt_torch_test.py * Update all_models.csv	2023-05-23 14:36:57 -07:00
Chi_Liu	64c2b2d96b	Add gpt2 to stablehlo support in shark tank (#1447 ) - Add torch decomposition support when generating shark tank - Add gpt2 stablehlo	2023-05-22 10:45:51 -07:00
Phaneesh Barwaria	f5ce121988	SLM on Sharkstudio (#1454 ) * localize import, fix file reading, device cpu * extract out model args	2023-05-19 11:21:08 -07:00
Ean Garvey	991f144598	Add iree hidden imports to SD spec (#1456 ) * Add iree hidden imports to SD spec * Update shark_sd_cli.spec	2023-05-19 11:19:16 -07:00
PhaneeshB	09bea17e59	fix #2 SLM in SharkStudio	2023-05-18 00:56:22 +05:30
Daniel Garvey	aefcf80b48	swap to cpu and remove hardcoded paths (#1448 ) Co-authored-by: powderluv <powderluv@users.noreply.github.com>	2023-05-17 10:53:34 -07:00
PhaneeshB	512235892e	fix SLM for SharkStudio	2023-05-17 22:34:30 +05:30
PhaneeshB	6602a2f5ba	add continuous output for CLI	2023-05-17 18:33:46 +05:30
Boian Petkantchin	20114deea0	In MiniLM JAX example verify MLIR result against JAX	2023-05-16 09:54:07 -07:00
Boian Petkantchin	9acf519078	Add option to skip venv creation in setup script	2023-05-16 09:54:07 -07:00
Boian Petkantchin	bdf37b5311	If device/backend is unknown pass it to IREE verbatim	2023-05-16 09:54:07 -07:00
powderluv	8ee2ac89f8	Rename sharded_vicuna_fp32_web.py to vicuna_web.py	2023-05-16 09:41:35 -07:00
powderluv	60cb48be2e	Rename sharded_vicuna_fp32.py to vicuna.py	2023-05-16 09:40:51 -07:00
powderluv	86a215b063	Delete sharded_vicunia.py	2023-05-16 09:37:39 -07:00
powderluv	d6e3a9a236	Delete standalone_vicuna.py	2023-05-16 09:37:26 -07:00
Chi_Liu	a0097a1ead	Add mlir_type for torch_model_list.csv (#1428 ) - Enable stablehlo/tosa mlir output for torch model - Add BERT stablehlo support	2023-05-15 10:23:54 -07:00
Ean Garvey	a9bae00606	Fix vulkan device selection at compile time and adapt to IREE python changes. (#1407 ) * Add support for vulkan device selection at compile time. * Don't convert device ID to int and fix .exe imports	2023-05-12 23:31:50 -07:00
Daniel Garvey	4731c1a835	prevent loading tokenizer on import (#1432 ) also adds sentencepiece dep for exe moved vicuna imports to after an if statement in general we should avoid importing files that load whole models as global variables	2023-05-12 19:11:45 -07:00
Ean Garvey	4c07e47e8c	Specify a few models for expected failure on CUDA CI. (#1430 )	2023-05-12 17:03:37 -05:00
Gaurav Shukla	e0cc2871bb	[SD] Yield 2 tokens at a time in vicuna Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-05-11 23:49:01 +05:30
Gaurav Shukla	649f39408b	[SD] Fix vicuna response Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-05-11 18:06:21 +05:30
Gaurav Shukla	c142297d73	[SD] Fix gradio to 3.22.0 version Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-05-11 18:05:55 +05:30
Gaurav Shukla	9e07360b00	[SD] Standalone vicuna with web Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-05-11 17:23:44 +05:30
Gaurav Shukla	7b74c86e42	[SD] Fix SAMPLE_INPUT_LEN import issue Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-05-11 15:41:43 +05:30
Eliasj42	fa833f8366	fixed spacing issue with chat-bot (#1417 ) Co-authored-by: Elias Joseph <elias@nod-labs.com>	2023-05-10 16:07:50 -07:00
Gaurav Shukla	fcb059aa38	[SD] Integrate vicuna in the web (#1410 )	2023-05-10 11:30:22 -07:00
PhaneeshB	517c670f82	vicuna chat cli	2023-05-10 22:55:06 +05:30
Eliasj42	59df14f18b	added vicuna demo (#1408 ) Co-authored-by: Elias Joseph <elias@nod-labs.com>	2023-05-09 21:18:20 -07:00
Ean Garvey	6c95ac0f37	Revert dialect registration in model annotator (#1406 ) Matches https://github.com/nod-ai/SHARK-Runtime/pull/58	2023-05-09 11:50:19 -07:00
Daniel Garvey	7a4a51ae73	vulkan vic f16 (#1404 ) Co-authored-by: dan <dan@nod-labs.com>	2023-05-08 16:46:53 -07:00
powderluv	d816cc015e	Revert "added standalone vicuna script (#1399 )" (#1402 ) This reverts commit `0e4a8ca240`.	2023-05-05 16:08:05 -07:00
Eliasj42	54ce3d48ca	added standalone vicuna script (#1401 ) Co-authored-by: Elias Joseph <elias@nod-labs.com>	2023-05-05 18:05:52 -05:00
Eliasj42	0e4a8ca240	added standalone vicuna script (#1399 ) Co-authored-by: Elias Joseph <elias@nod-labs.com>	2023-05-05 15:46:05 -07:00
Daniel Garvey	6ca1298675	maximizes window size for webview launch (#1394 )	2023-05-04 20:43:06 -07:00
jinchen62	bbef7a6464	Redesign model manager webui (#1391 )	2023-05-04 20:41:29 -07:00
Ean Garvey	cdf2d61d53	Remove imports from iree.compiler.transforms from model annotator. (#1392 )	2023-05-04 20:40:19 -07:00
Ean Garvey	6c14847d1f	xfail some large tests on macOS builder and switch to hash updates. (#1341 ) * Update test-models.yml * Disable large tests on macOS builder	2023-05-04 19:47:03 -05:00
Gaurav Shukla	68ecdd2a73	[SD] Add LoRA as experimental tab Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-05-04 22:30:25 +05:30
Gaurav Shukla	3f4d444d18	[SD] Fix stable LM chatbot Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-05-04 22:30:25 +05:30
m68k-fr	e473d0375b	[Web] Models folders cleanup (#1365 )	2023-05-03 16:13:20 -05:00
Ean Garvey	e38d96850f	Fix input image loading in img2img rest API (#1388 )	2023-05-03 15:51:00 -05:00
Gaurav Shukla	fed63dfd4b	[SD] Add stableLM chatbot (#1383 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com> Co-authored-by: powderluv <powderluv@users.noreply.github.com>	2023-05-03 15:37:20 -05:00
Boian Petkantchin	eba4d06405	In MiniLM JAX example do not hardcode device (#1385 ) * In MiniLM JAX example do not hardcode device * In MiniLM JAX example don't use bytecode MLIR --------- Co-authored-by: Boian Petkantchin <boian@nod-labs.com>	2023-05-03 10:34:42 -07:00
Boian Petkantchin	4cfba153d2	Add example JAX MiniLM inference (#1380 ) * Do not hardcode the name of the VM module in get_iree_module * Add example JAX MiniLM inference --------- Co-authored-by: Boian Petkantchin <boian@nod-labs.com>	2023-05-02 15:03:54 -07:00
jinchen62	307c05f38d	Convert original vae to diffusers (#1382 )	2023-05-02 01:27:28 -07:00
jinchen62	696df349cb	Fix curl issue (#1369 )	2023-04-28 09:31:14 -07:00
jinchen62	cb54cb1348	Add model manager tab for SD webui (#1368 )	2023-04-28 02:43:40 -07:00
Daniel Garvey	9bdb86637d	add tkinter launch for webui (#1364 )	2023-04-27 19:17:55 -05:00
jinchen62	fb6f26517f	Fix webui note (#1367 )	2023-04-27 16:14:43 -07:00
Chi_Liu	aa8ada9da9	Add support for torch to stablehlo and tosa in shark_importer (#1360 )	2023-04-27 08:09:45 -07:00
powderluv	1db906a373	Revert "Add model manager tab for webui (#1359 )" (#1362 ) This reverts commit `9d1d1617d8`.	2023-04-26 22:25:26 -07:00
jinchen62	9d1d1617d8	Add model manager tab for webui (#1359 )	2023-04-26 13:38:18 -07:00
jinchen62	7112789cb8	Add support of using civitai model download url (#1357 )	2023-04-25 23:39:52 -07:00
jinchen62	d6b8be2849	Add drawing canvas for img2img stencil scribble (#1355 )	2023-04-25 14:41:01 -07:00
powderluv	822171277c	Revert "[SD] Add FastChat as part of SD WebUI (#1349 )" (#1350 ) This reverts commit `a5ae9d9f02`.	2023-04-24 15:22:25 -07:00
Abhishek Varma	a5ae9d9f02	[SD] Add FastChat as part of SD WebUI (#1349 ) -- This commit includes FastChat as part of SD WebUI. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>	2023-04-24 11:12:58 -07:00
powderluv	09e3f63d5b	Fix pascal (#1346 ) * Add fp32 for upscaler VAE * Plumb Pascal vulkan support	2023-04-23 20:28:25 -07:00
powderluv	d60a5a9396	Add fp32 for upscaler VAE (#1345 )	2023-04-23 15:27:55 -07:00
m68k-fr	90df0ee365	[Web] Gallery set to a 768px reference for high-end desktop users (#1344 )	2023-04-23 11:48:06 -07:00
nirvedhmeshram	133c1bcadd	add device to scheduler model names (#1338 )	2023-04-22 20:13:56 -05:00
powderluv	caadbe14e9	Revert VAE to use im2col (#1339 )	2023-04-22 15:23:41 -07:00
Ean Garvey	5f5823ccd9	Fix inference object imports for SD apps. (#1334 )	2023-04-21 13:40:48 -05:00
Vivek Khandelwal	d2f7e03b7e	Add StableLM model (#1331 )	2023-04-21 09:51:02 -07:00
Gaurav Shukla	0b01bbe479	[SD] Add txt2img/upscaler/inpaint/outpaint Rest API (#1325 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-04-21 09:06:06 -07:00
yzhang93	25c5fc44ae	Modify tuner.py to take vulkan target triple flag (#1328 )	2023-04-20 14:31:32 -07:00
Daniel Garvey	7330729c92	enable sd pytest (#1322 )	2023-04-19 22:11:30 -05:00
Ean Garvey	ce16cd5431	Create local shark_tank if needed for tuning configs. (#1321 ) Now that --clear_all successfully deletes local shark_tank cache, we need to make sure it exists before trying to use it.	2023-04-19 11:44:21 -05:00
Ean Garvey	598dc5f79d	Don't dump image data on img2img api call. (#1320 )	2023-04-19 21:24:46 +05:30
Abhishek Varma	1f8e332cbe	[SD] Fix img2img API bug for custom_vae argument (#1319 ) -- https://github.com/nod-ai/SHARK/pull/1314 misses to add `custom_vae` parameter to img2img_if's invocation within img2img_api. -- This commit adds a fix to the same. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> Co-authored-by: Abhishek Varma <abhishek@nod-labs.com>	2023-04-19 10:39:52 -05:00
Abhishek Varma	17b9632659	[SD] Adapted SHARK's v1 img2img API for SdPaint + updated Stencil model ID (#1318 )	2023-04-19 06:29:36 -07:00
jinchen62	bda92a54ab	Fix custom vae path (#1317 )	2023-04-18 20:50:43 -07:00
jinchen62	747ed383b1	Add custom vae dropdown in webui (#1314 )	2023-04-18 17:24:02 -07:00
Ean Garvey	1afe07c296	Disable winograd on VAE with rdna2 and fix unet tuning. (#1313 ) * Disable winograd on VAE with rdna2 and fix unet tuning. * Fix batch size 1 downloads and clear_all on windows.	2023-04-18 15:55:10 -05:00
jinchen62	b70919b38d	Fix memory leak with ondemand (#1312 ) support ondemand for outpainting and multi batch_count	2023-04-18 13:03:16 -05:00
m68k-fr	4e513d647f	Update list of scheduler available for inferences (#1298 )	2023-04-17 22:37:00 -05:00
jinchen62	94cd2a0fed	Fix outpainting config (#1310 )	2023-04-17 10:48:52 -07:00
Kyle Herndon	606029c01c	Fix LoRA device format bug and allow LoRA to resume from a previous training	2023-04-17 13:19:46 +05:30
powderluv	1aa85222e9	Add AMD W7900 target triple (#1304 ) This maps to RDNA3	2023-04-16 00:14:21 -07:00
m68k-fr	1b3f468c04	[Web] Style Fixes for Gradio V3.25.0 (#1300 )	2023-04-13 18:40:42 -05:00
m68k-fr	35de7e27fa	[Web] remove txt2img ui dependencies from png import metadata (#1275 )	2023-04-12 07:32:47 -10:00
yzhang93	467f900759	Add auto-tuner to SD apps (#1291 )	2023-04-12 09:21:17 -07:00
Ean Garvey	0bd9d582c7	Add documentation for using SHARK with AI-Render (#1296 )	2023-04-12 03:09:34 -10:00
jinchen62	428cfe8dae	Fix low vram mode issues (#1295 ) - add ondemand back to img2img - workaround memory leak for batch count	2023-04-11 17:59:09 -07:00
Ean Garvey	f17915bedc	Fix batch size appending to model name. (#1294 ) * Update shark_downloader.py * Update shark_downloader.py	2023-04-11 15:34:25 -05:00
Gaurav Shukla	1b49b5149a	[SD] Add Img2Img rest API Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-04-11 23:06:58 +05:30
jinchen62	3002793301	Unload clip on demand and workaround memory leak (#1283 )	2023-04-10 16:59:03 -07:00
Phaneesh Barwaria	d25ef5529f	Add fix for vae fp32 Upscaler (#1284 ) - fixes size mismatch error for upscaler vae	2023-04-07 14:36:40 -05:00
Ean Garvey	308856a947	Touch unet if base cfg needed for SD pipeline init (#1281 )	2023-04-05 03:02:29 -05:00
m68k-fr	151b4e142f	[SD] Fix encoder error for model_max_length not being 77 (#1278 ) Co-authored-by: powderluv <powderluv@users.noreply.github.com>	2023-04-04 22:39:29 -07:00