Compare commits

...

254 Commits

Author SHA1 Message Date
Ean Garvey
fd3fdeef07 Update vulkan_utils.py 2024-01-09 12:39:11 -06:00
Ean Garvey
80906155f8 Revert to 20231212 2024-01-09 12:31:38 -06:00
Ean Garvey
1c094d96eb Update vulkan_utils.py 2024-01-09 12:17:37 -06:00
Ean Garvey
94805aa3c6 Update setup_venv.sh 2024-01-08 23:01:51 -06:00
Ean Garvey
cfb494edff Update setup_venv.ps1 2024-01-08 23:01:22 -06:00
Ean Garvey
10294bef33 Update setup.py 2024-01-08 23:01:02 -06:00
Ean Garvey
8051b33005 Update pyproject.toml 2024-01-08 23:00:42 -06:00
Ean Garvey
10af1f25c7 Update setup.py 2024-01-08 22:38:05 -06:00
Ean Garvey
4305153d21 Update pyproject.toml 2024-01-08 22:37:49 -06:00
Ean Garvey
ab5639ff66 Update pyproject.toml 2024-01-08 21:13:09 -06:00
Ean Garvey
4cf0383768 Update setup.py 2024-01-08 21:12:43 -06:00
Ean Garvey
01845d593d Remove linux builds from 1.0 nightly workflow
It seems that the VMs used for these workflows are no longer available. Removing linux builds since publishing .exe is sufficient for the one-shot nightly workflows we trigger for SHARK-1.0
2024-01-08 18:28:09 -06:00
Ean Garvey
5b15ceee35 Move IREE pins for linux. 2024-01-08 18:23:00 -06:00
Ean Garvey
04b21295ee Move IREE pins for windows. 2024-01-08 18:22:09 -06:00
Stefan Kapusniak
dda7e8a163 (Shark-1) UI: Fix 'keyword argument' error in txt2img (#2058)
* Removes the incorrect valid_base_models keyword argument left in text2img_inf
2024-01-08 16:47:15 -06:00
Stefan Kapusniak
7fdd1952ae (Shark 1.0) UI/SD UX improvements for SDXL (#2057)
* SDXL Tab
  * Filter VAEs in dropdown in the same manner as models
  * Set default VAE selection to 'madebyollin/sdxl-vae-fp16-fix'
  * Set default image size to 768x768 to match current Vulkan constraints
* SharkifySDModel Base Unet Model Determination
  * Always use the model_to_run as the base model for unet, if it is in
base_model.json, instead of potentially trying to compile for other base
models.
  * Allow SharkSDPipelines to define a 'favor_base_models' @classmethod,
answering a list of sane base model names for the pipeline. Exclude base
models not in that list from compilation attempts when trying to determine
a base unet model.
  *  Add a 'favor_base_models' method for both Normal and SDXL Txt2Img
Pipelines. Define the method as answering 'None' in the base class.
2024-01-06 11:59:18 -06:00
Ean Garvey
0a6f6fad86 (1.0) Fix non-square controlnet dimensions. (#2056)
* Fix non-square controlnet dims

* Hide batch size slider in txt2img
2024-01-04 21:33:55 -06:00
Ean Garvey
6853a33728 Pin iree versions and fix quant matmul flags. (#2055)
Restricts quantized matmul reassociation flags to cpu compiles of llama2 and pins IREE versions for shark 1.0
2024-01-04 14:22:54 -06:00
Stefan Kapusniak
3887d83f5d (Shark 1.0) UI: Upgrade to gradio 4.12.0 and fix breakage (#2051)
* (Shark 1.0) UI: Upgrade to gradio 4.12.0 and fix breakage

* Upgrade Shark 1.0 to gradio 4.12.0
* Add javascript workaround for gradio currently ignoring @media rules in custom CSS. This fixes UI not showing at full width on desktop (>1536px width).

* (Shark 1.0) UI: Re-enable gallery download buttons

* Re-enable gallery download buttons, as this is now working again in Gradio 4.12
2024-01-03 19:00:08 -06:00
Stefan Kapusniak
8d9b5b3afa SD/UI: Merge Lora Selection Boxes, Add LoRA Strength (#2052)
* Merges LoRA selection in the UI into a single selection, rather than
one for LoRAs under ./models and another for Hugging Face Id
* Add LoRA strength to UI and pipeline parameters.
* Add a `--lora_strength` command line argument.
* Bake LoRA strength into .vmfb naming when a LoRA is specified.
* Use LoRA embedded alpha values and (up tensor dimension *
LoRA strength) for final alpha when applying LoRA weights rather
than a hardcoded value of 0.75
* Adds additional cases to the LoRA weight application that are
present for weight application in the Kohya scripts.
* Include lora strength when reading and writing png metadata.
* Allow lora_strength to be set above 1.0 in the UI, so similar effects
to the prior (overdriven alpha) implementation can be obtained.
2024-01-03 18:59:47 -06:00
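For readers unfamiliar with how a strength multiplier enters a LoRA merge, here is a minimal, generic sketch in the spirit of the commit above. The function and variable names are illustrative, and SHARK's actual final-alpha computation (up tensor dimension * LoRA strength, per the notes above) may differ from the standard alpha/rank scaling shown here.
```
import torch

def merge_lora_layer(
    base_weight: torch.Tensor,   # (d_out, d_in) base model layer weight
    lora_up: torch.Tensor,       # (d_out, rank) LoRA "up" matrix
    lora_down: torch.Tensor,     # (rank, d_in) LoRA "down" matrix
    alpha: float,                # alpha embedded in the LoRA .safetensors file
    lora_strength: float = 1.0,  # user-facing --lora_strength multiplier
) -> torch.Tensor:
    # Generic LoRA merge: delta = (alpha / rank) * up @ down, scaled by strength.
    # SHARK's final-alpha formula described above may differ from this.
    rank = lora_down.shape[0]
    scale = (alpha / rank) * lora_strength
    return base_weight + scale * (lora_up @ lora_down)
```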
xzuyn
16c03e4b44 fix for TypeError: Image2ImagePipeline.generate_images() missing 1 required positional argument: 'images' (#2053) 2024-01-03 18:54:54 -06:00
Stefan Kapusniak
17dab8334d Setup a separate .shark1.venv for Shark-1.0 (#2043)
* Updates setup_venv.ps1 to create and use ./shark1.venv/
* Updates setup_venv.sh to default to creating ./shark1.venv/
2023-12-16 23:18:29 -06:00
Stefan Kapusniak
f692a012e1 UI: Fixes for Gradio 4.7.1/4.8.0 update (#2024)
* Upgrade Gradio pin from 4.7.1 to 4.8.0.
* Make Nod AI logos visible again.
* Remove image toolbars from png import boxes.
* Set Input Images on img2img, outpaint and upscaler tabs to be upload
only.
* Change Image control to an ImageEditor control for masking on the
inpaint tab. Remove previous height restriction as this hides the
editing controls.
* Move Input Image/Masked Image on img2img, inpaint, outpaint and
upscaler tabs to be the first control on their tabs.
* Remove download buttons from all galleries as they download some
html rather than the image (gradio issue #6595)
* Remove add new row and column from Output Gallery parameters
dataframe.
* Add partial workaround for not being able to select text in the Output
Gallery parameters dataframe (gradio issue #6086)
* Fix uglified formatting of subdirectory selection dropdown, refresh
button, and open folder buttons on the Output Gallery tab.
* Force Output Gallery to use the full width of the Gallery control
for the preview overlay when an image is selected, rather than
an overlay the width of the selected image.
* Fix sendto buttons.
* Reset Inpaint ImageEditor control with the Mask Layer after generation
is complete, as it gets lost if the image was sent to the tab from
another tab rather than being uploaded. Also rework queuing and
progress rendering along this codepath. This doesn't solve the
underlying problem of the Mask Layer being removed, but does get inpaint
fully working with the Gradio update.
2023-12-14 14:56:37 -06:00
Vivek Khandelwal
3cc643b2de Add support for StableLM-3B model (#2019)
* Add support for StableLM-3B model

* Add support for Quantized StableLM-3B model

* Update stablelm_pipeline.py
2023-12-12 22:39:50 +05:30
Phaneesh Barwaria
bf70e80d20 vulkan device id fix (#2028) 2023-12-08 19:00:26 -06:00
Ean Garvey
7159698496 (Studio) Fix controlnet switching. (#2026)
* Fix controlnet switching.

* Fix txt2img + control adapters
2023-12-07 00:52:36 -06:00
gpetters94
7e12d1782a Fix stencil pipeline to use input image (#2027) 2023-12-07 00:25:18 -06:00
Ean Garvey
bb5f133e1c Many UI fixes and controlnet improvements (#2025)
* multi-controlnet UI and perf fixes

* Controlnet fixes
2023-12-06 20:10:06 -06:00
Richard Pastirčák
3af0c6c658 #1843 - Add Export Default settings button (#2016)
* #1843 - Add Export Default settings button

* #1843 reformatting unit tests

---------

Co-authored-by: Richard Pastirčák <richard.pastircak@student.tuke.sk>
2023-12-06 14:58:17 -06:00
Ean Garvey
3322b7264f (vicuna.py) Move enable_tracy_tracing outside of BenchmarkRunInfo (#2011) 2023-12-06 14:57:32 -06:00
Ean Garvey
eeb7bdd143 Fix nodlogo (#2023) 2023-12-06 14:57:16 -06:00
Ean Garvey
2d6f48821d Fix SharkEulerDiscrete (#2022) 2023-12-06 12:25:06 -06:00
Gaurav Shukla
c74b55f24e [ui] Add UI for sharding
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-12-06 17:25:49 +05:30
Elias Joseph
1a723645fb finalized fixes for sharded llama2 2023-12-06 15:35:29 +05:30
Eliasj42
dfdd3b1f78 improved sharded performance and fixed issue with lmhead on rocm (#2008)
* improved sharded performance and fixed issue with lmhead on rocm

* mmap shards + disable sharing of device arrays across devices

* fix device_idx for non-layer vmfbs

* fix time calc for sharded

---------

Co-authored-by: Elias Joseph <elias@nod-labs.com>
Co-authored-by: PhaneeshB <b.phaneesh@gmail.com>
2023-12-05 11:53:44 -08:00
Ean Garvey
6384780d16 Fixes to llama2 cpu compilation and studio UI, schedulers (#2013)
* Fix some issues with defaults

Fixes to llama2 cpu compilation (turns off data tiling for old argmax
mode)

---------

Co-authored-by: Max Dawkins <max.dawkins@gmail.com>
2023-12-05 11:19:19 -05:00
gpetters94
db0c53ae59 Fix zoedepth (#2010) 2023-12-05 04:31:50 -05:00
Ean Garvey
ce9ce3a7c8 (SD) Fix schedulers and multi-controlnet. (#2006)
* (SD) Fixes schedulers if receiving noise preds as numpy arrays

* Fix schedulers and stencil name

* Multicontrolnet fixes
2023-12-05 03:29:18 -06:00
Ean Garvey
d72da3801f (Studio) Update gradio and multicontrolnet UI. (#2001)
* (Studio) Update gradio and multicontrolnet UI.

* Fixes for outputgallery, exe build

* Fix image return types.

* Update Gradio to 4.7.1

* Fix send buttons and hiresfix

* Various bugfixes and SDXL additions.

* More UI fixes and txt2img_sdxl presets.

* Enable SDXL-Turbo and custom models, custom VAE for SDXL

* img2img ui tweaks
2023-12-04 12:37:51 -06:00
Eliasj42
9c50edc664 fixed functionality of sharded vicuna/llama2 (#1982)
Co-authored-by: Elias Joseph <elias@nod-labs.com>
2023-12-04 09:11:52 -08:00
Abhishek Varma
a1b7110550 [SDXL] Add SDXL pipeline to SHARK (#1941)
* [SDXL] Add SDXL pipeline to SHARK

-- This commit adds SDXL pipeline to SHARK.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

* (SDXL) Fix --ondemand and vae scale factor use, and fix VAE flags.

---------

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Ean Garvey <87458719+monorimet@users.noreply.github.com>
2023-12-02 03:15:15 -06:00
gpetters94
ff15fd74f6 Add multicontrolnet (#1958) 2023-12-01 13:51:20 -06:00
gpetters94
552b2c3ee3 Add controlmode (#1957) 2023-12-01 13:04:47 -06:00
Ean Garvey
795fc33001 Update default compilation flags for data tiling. (#2000)
* Update default CPU compilation flags.

c5a6cdc8dd

52eb7e9b82

tweak CPU iree-compile flags to match upstream changes.

* Add an option for data tiling on SD models.
2023-11-30 17:05:37 -06:00
gpetters94
2910841fe6 Fix an importer issue on Linux (#1986) 2023-11-30 10:50:33 -06:00
Vivek Khandelwal
396a054856 Fix Sharded Falcon-180b 2023-11-30 21:51:57 +05:30
Vivek Khandelwal
5c66948d4f Fix unsharded Falcon pipeline 2023-11-30 21:51:57 +05:30
Ean Garvey
ed3dda94c0 Cleanup xfails in pytest suite. (#1995) 2023-11-29 23:16:15 -06:00
Quinn Dawkins
d31d28b082 [SD] Add flag to collapse reduction dims pre dispatch formation (#1999) 2023-11-30 00:09:17 -05:00
Evan Ruttenberg
78c607e1d3 Fix typo in default_rocm_arch (#1998) 2023-11-29 20:40:56 -05:00
Vivek Khandelwal
666e601dd9 Remove sharding support for non-180B falcon variants 2023-11-27 13:45:13 +05:30
Vivek Khandelwal
ca58908e5b Add Falcon-GPTQ Support for 2-way sharding 2023-11-27 13:45:13 +05:30
Jakub Kuderski
1f5b39f56e [vicuna.py] Add option to enable tracing (#1993)
This makes the program wait for the tracy profiler to connect before exiting
and flush profiling data after each token.

I don't know how to select the tracy iree-runtime variant
programmatically -- instead, print an error and exit.
2023-11-24 12:25:03 -08:00
Jakub Kuderski
2da31c4109 [vicuna.py] Rework benchmark statistics calculation (#1992)
- Move statistics out of the main loop
- Add 'end-to-end' numbers
- Switch the main display unit from s to ms
- Start measuring time at 0

The new print format looks like this:
```
Number of iterations: 5
Num tokens: 1 (prompt), 512 (generated), 513 (total)
Prefill: avg. 0.01 ms (stdev 0.00), avg. 97.99 tokens/s
Decode: avg. 4840.44 ms (stdev 28.80), avg. 97.99 tokens/s
Decode end-2-end: avg. 85.78 tokens/s (w/o prompt), avg. 95.98 (w/ prompt)
```
2023-11-23 12:04:03 -05:00
Ean Garvey
da50a16242 Create specified dir if needed during save_mlir and fix vulkan device fetching without URI/ID (#1989) 2023-11-23 01:01:41 -06:00
Stefan Kapusniak
ce38d49f05 Add .mlir to startup shark_tmp cleanup (#1991)
* Add .mlir to the files that are deleted from `./shark_tmp` when studio
is started.
* refactor/rename existing gradio temp file cleanup on startup to be
consistent with a general `./shark_tmp` cleanup
2023-11-22 14:34:28 -06:00
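A minimal sketch of what such a startup cleanup pass over `./shark_tmp` could look like; the directory name comes from the commit above, while the second glob pattern and the function name are assumptions, not SHARK's actual code.
```
from pathlib import Path

def clean_shark_tmp(tmp_dir: str = "./shark_tmp") -> None:
    # Delete leftover .mlir files and (assumed pattern) gradio temp files on startup.
    for pattern in ("*.mlir", "tmp*"):
        for leftover in Path(tmp_dir).glob(pattern):
            leftover.unlink(missing_ok=True)
```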
PhaneeshB
2f780f0d38 quick fix rocm None device 2023-11-22 21:17:25 +05:30
Ean Garvey
d051c3a4a7 Use clean_device_info() by default and don't write .mlir to /tmp/ (#1984)
* Move clean_device_info to compile_utils

* Update compile_utils.py

* Fix .mlir writes for some user-level permissions

* Fix cases where full URI is given

* Fix conditionals.

* Fix device path handling in vulkan utils.
2023-11-20 13:10:31 -06:00
Ean Garvey
1b11c82c9d Small UI tweaks for chatbot, fix torchvision requirements (#1988)
- add torchvision to setup_venv.ps1 -- we need this for the torchvision::nms that is now a dependency of controlnet features.
- Don't have bad flashy orange updates when using the chatbot
- Don't limit the height of the chatbot -- there's mixed opinions and solutions around this one. I think the default (400) is just way too small and LLMs generate plenty enough to justify matching the output.
2023-11-21 00:09:10 +05:30
gpetters94
80a33d427f Save intermediate values of controlnet (#1981) 2023-11-17 19:05:41 -05:00
Stefan Kapusniak
4125a26294 API/Docs: Fix incorrect cors arguments listing (#1983)
* Replace `api_cors_origin` in the api/koboldcpp doc, with the correct
 `api_accept_origin`
2023-11-17 12:29:01 -06:00
Ean Garvey
905d0103ff Revert "Re-enable SD tunings without matmuls. (#1976)" (#1979)
This reverts commit 70817bb50a.
2023-11-17 23:44:33 +05:30
Stefan Kapusniak
192b3b2c61 UI: Output gallery cleanups (#1959)
* Workaround gradio bug that causes the parameters frame to always show
scrollbars.
* Remove the original funky method of setting the number of image
columns in the gallery using _fn= javascript events. The version
of gradio we now have pinned allows doing this by setting the property
on the gallery directly and also doesn't keep resetting the columns on
other events being fired.
2023-11-15 22:20:42 -06:00
Stefan Kapusniak
8f9adc4a2a UI: Display top tag frequencies for selected LoRA (#1972)
* Adds a function to webui utils to read metadata from
.safetensors LoRA files and do limited parsing of the format written
out by the Kohya SS scripts (https://github.com/kohya-ss/sd-scripts)
to get tag frequency and trained model information.
* Adds a new common_ui_events.py file for gradio event handlers
needed for multiple UI tabs, and adds an event handler for binding to
the change event of the LoRA selection boxes, that outputs HTML
to display the LoRA tag frequency and model information.
* Adds an HTML gradio control to each of the SD tabs to show the
LoRA model name, and most frequently trained tags.
* Bind the change event of the LoRA selection box on each tab
to our new event handler, with the output set to the relevant HTML
control.
2023-11-15 22:19:54 -06:00
Ean Garvey
70817bb50a Re-enable SD tunings without matmuls. (#1976) 2023-11-15 20:42:53 -06:00
jinchen62
dd37c26d36 Update brevitas quant api (#1975) 2023-11-15 10:04:07 -08:00
PhaneeshB
a708879c6c fix iree version mismatch 2023-11-15 01:24:42 +05:30
Ean Garvey
bb1b49eb6f Add --no-index to setup_venv.sh runtime pip install. 2023-11-14 21:44:20 +05:30
Ean Garvey
f6d41affd9 (SHARK Studio) Add Turbine-based llm chatbot. (#1933)
* Dan shark studio (#1970)

* Fix issue in Falcon-GPTQ

* initial webui and llama2

---------

Co-authored-by: Vivek Khandelwal <vivekkhandelwal1424@gmail.com>

* Fix formatting.

---------

Co-authored-by: Daniel Garvey <34486624+dan-garvey@users.noreply.github.com>
Co-authored-by: Vivek Khandelwal <vivekkhandelwal1424@gmail.com>
2023-11-14 09:56:28 -06:00
Stefan Kapusniak
c2163488d8 SD/UI Restrict hires fix/img2img resamplers/schedulers (#1955)
* Restrict resamplers for img2img and high res fix to the ones that
PIL.Image actually supports, since it uses that to do the resampling.
Removed: Antialias, Affine, Cubic. Added: Hamming.
* Set list of available schedulers to CPU only when high res fix
is selected in the web ui. Set list to all schedulers when high res fix
is deselected.
* Put hi res fix in its own Accordion in the txt2img UI instead of
grouping it with Advanced Options.
2023-11-13 16:08:24 -06:00
PhaneeshB
54bff4611d fix cli rocm device selection 2023-11-13 23:35:55 +05:30
PhaneeshB
11510d5111 add intra rocm vmfb differentiator 2023-11-13 23:35:55 +05:30
PhaneeshB
32cab73a29 add iree-rocm-target-chip only if added by user 2023-11-13 23:35:55 +05:30
PhaneeshB
392bade0bf enable non default rocm device selection for webui 2023-11-13 23:35:55 +05:30
Stefan Kapusniak
91df5f0613 API/Docs: Fix an image link in koboldcpp doc (#1954)
* Fix the image link for the koboldcpp style button pointing to the
dialog image rather than the button image.
2023-11-13 11:14:29 -06:00
dependabot[bot]
df20cf9c8a Bump langchain in /apps/language_models/langchain (#1968)
Bumps [langchain](https://github.com/langchain-ai/langchain) from 0.0.325 to 0.0.329.
- [Release notes](https://github.com/langchain-ai/langchain/releases)
- [Commits](https://github.com/langchain-ai/langchain/compare/v0.0.325...v0.0.329)

---
updated-dependencies:
- dependency-name: langchain
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-11-12 19:46:00 -08:00
Ean Garvey
c4a908c3ea Pin pydantic to 2.4.1 in requirements (#1967)
pyinstaller-hooks-contrib doesn't recognize beta versions of pydantic as greater than 2.0.0, so with a beta version of pydantic installed it looks for a `compile` attribute that is only available in versions older than 2.0.0.
2023-11-10 21:34:52 -06:00
Stefan Kapusniak
6285430d8a UI: Fix webui launch on non-Windows (#1963)
* Moves the imports of winreg and Tk into the functions that use them,
with winreg behind a guard clause. This should hopefully mean that if
you're not on Windows or not using `ui=app` we won't trip over either
of these due to them not being there.
2023-11-10 16:38:32 -06:00
PhaneeshB
51afe19e20 fix rocm arch selection 2023-11-10 13:22:51 +05:30
Ean Garvey
31005bcf73 Don't require vulkan installation to query devices. (#1953) 2023-11-09 14:46:44 -06:00
dependabot[bot]
f41ad87ef6 Bump langchain in /apps/language_models/langchain (#1926)
Bumps [langchain](https://github.com/langchain-ai/langchain) from 0.0.202 to 0.0.325.
- [Release notes](https://github.com/langchain-ai/langchain/releases)
- [Commits](https://github.com/langchain-ai/langchain/compare/v0.0.202...v0.0.325)

---
updated-dependencies:
- dependency-name: langchain
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-11-09 11:03:47 -06:00
dependabot[bot]
d811524a00 Bump pypdf from 3.12.2 to 3.17.0 in /apps/language_models/langchain (#1929)
Bumps [pypdf](https://github.com/py-pdf/pypdf) from 3.12.2 to 3.17.0.
- [Release notes](https://github.com/py-pdf/pypdf/releases)
- [Changelog](https://github.com/py-pdf/pypdf/blob/main/CHANGELOG.md)
- [Commits](https://github.com/py-pdf/pypdf/compare/3.12.2...3.17.0)

---
updated-dependencies:
- dependency-name: pypdf
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-11-09 11:02:43 -06:00
Sungsoon Cho
51e1bd1c5d (OPT) Fix typo in the message; s/reponse/response (#1920) 2023-11-09 11:00:48 -06:00
Phaneesh Barwaria
db89b1bdc1 Fix MacOS web execution flow (#1899)
* fix metal device path for chatbot

* single device remove indexing

* lint fix
2023-11-09 10:59:29 -06:00
Huang Qi
2754e2e257 Fix wrong parameter index passed to 'compile_module_to_flatbuffer' (#1921)
compile_str is always False in compile_module_to_flatbuffer since there
is a parameter 'model_name' before 'debug'.

This issue is related to https://github.com/nod-ai/SHARK/pull/1863.

With that fixed, we can use an mlir model buffer in RAM to run inference.
2023-11-09 10:58:05 -06:00
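The bug class described above is easy to reproduce in isolation. The sketch below uses hypothetical function names and signatures (not SHARK's actual compile_module_to_flatbuffer signature) to show how inserting a parameter before an existing one silently shifts positionally passed arguments, and why passing flags by keyword avoids it.
```
# Hypothetical signatures for illustration only.
def compile_v1(module, device, frontend, debug=False):
    return debug

def compile_v2(module, device, frontend, model_name="", debug=False):
    # A new 'model_name' parameter now sits where 'debug' used to be.
    return debug

print(compile_v1("m.mlir", "cpu", "torch", True))        # True
print(compile_v2("m.mlir", "cpu", "torch", True))        # False: True landed in model_name
print(compile_v2("m.mlir", "cpu", "torch", debug=True))  # True: keywords survive reordering
```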
PhaneeshB
ab0e870c43 fix vicuna cli vulkan 2023-11-09 22:27:13 +05:30
Stefan Kapusniak
fb30e8c226 UI: Fix some webui launch corner cases (#1952)
* On Windows, insist on the presence of webview2 as the embeddable
browser for `ui=app`. If we can't find it, effectively switch back to
`ui=web`. This should prevent pywebview from trying to use MSHTML while
saying it's deprecated; apparently we are too much for poor old IE11.
* Add webview2 runtime droppings to .gitignore.
* If we can't bind to args.server_port get another suitable port from
the OS and advise the user that we did this in the UI.
* Make `ui=web` mode use 'SHARK AI Studio' as its title. This makes it
consistent with `ui=app`.
* Replace the generic gradio favicon with a nod swirl one instead.
2023-11-09 10:53:28 -06:00
Ean Garvey
a07d542400 (Studio) Disable SD tunings and sub-model downloads (#1944)
* sets --no-use_tuned and --import_mlir as defaults in SHARK Studio.
2023-11-07 15:55:30 -06:00
Stefan Kapusniak
ad55cb696f SD/API: Add missing A1111 APIs to Shark to support koboldcpp image generation (#1924)
* SD/API: Add missing a1111 API features for Koboldcpp

* Refactors SD api functions into their own file
* Adds the following apis implemented by a1111 as needed by koboldcpp:
   - adds /sdapi/v1/sd-models (lists available models)
   - adds /sdapi/v1/options (only the bare minimum needed)
* Adds optional CORS support, use the '--api_accept_origin' command line
argument to activate and configure.
* Extends existing APIs to include optional sampler/scheduler selection
* Extends /sdapi/v1/txt2img to recognise the method used by koboldcpp
to select the model.
* Where possible take values not provided to the API in the request from
the existing relevant command line parameters rather than hardcoding
them.
* return a 400 response when a request doesn't have required properties.
* changed default schedulers and models for some apis to ones that
actually seem to work.
* Update api_test.py to include the new APIs.
* Update api_test.py to include a '--verbose' command line option.

* SD/API: Take more API values from args

* Take LoRA from '--use_lora' command line arg if specified
* Take device from '--device' command line arg if specified (substring
match, so a short name such as 'vulkan://0' should work)

* SD/API: add more endpoints and pydantic typing

* Mount the whole of /sdapi from index.py as a FastAPI application,
rather than each endpoint individually
* Add the following additional API endpoints:
  * /sdapi/v1/samplers
  * /sdapi/v1/cmd-flags
* Make scheduler/sampler selection checking and fallback much more
robust.
* Support aliasing some A1111 scheduler/sampler names to the diffusers
ones we are using.
* Expand response /sdapi/v1/options to add a few more things.
* Split non-api functions and variables into their own utils.py file.
* Support 'n_iter' request property and the return of multiple images
from generation endpoints. Equivalent of '--batch_count', batch_size
is still hardcoded at 1
* Include (some) hires_fix request properties in txt2img endpoint
* Rework endpoints using pydantic model classes for better request
validation and so we get much improved swagger api docs at
/sdapi/docs and redoc at /sdapi/redoc

* SD/API Delete commented out code from index.py

* Delete some code that is no longer needed by the SD API in index.py
(and one line sdapi_v1.py) that I'd previously only commented out.

* SD/UI: Add shark_sd_koboldcpp.md document

* Add documentation on how to set up Koboldcpp with SHARK
* Link this and the existing blender set up document from the main
README.md

* SD/API Improve stencil options in img2img endpoint

In /sdapi/v1/img2img:
  * Add zoedepth to the controlnet use_stencil options
  * Require and use second image as stencil mask for controlnet scribble
2023-11-06 15:20:19 -06:00
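As a rough usage sketch, an A1111-style client call against these endpoints might look like the following; the host/port, payload fields, and response shape are assumptions based on the A1111 conventions the commit says it follows, not verified against SHARK's implementation.
```
import requests

BASE = "http://127.0.0.1:8080"  # assumed local SHARK instance

payload = {
    "prompt": "a photo of a corgi in a field",
    "negative_prompt": "blurry",
    "steps": 20,
    "n_iter": 2,                      # equivalent of --batch_count; batch_size stays 1
    "sampler_name": "EulerDiscrete",  # aliased scheduler/sampler names are accepted
}

resp = requests.post(f"{BASE}/sdapi/v1/txt2img", json=payload, timeout=600)
resp.raise_for_status()  # a 400 is returned when required properties are missing
images = resp.json().get("images", [])  # A1111-style base64-encoded images
print(f"received {len(images)} image(s)")
```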
Jakub Kuderski
488a172292 [vicuna.py] Allow to pass extra arguments to iree-compile (#1935)
Add a new flag `-Xiree_compile` to forward extra compiler arguments to
`iree-compile`. This flag can be set multiple times to pass more than
one extra argument.
2023-11-06 12:12:34 -05:00
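A repeatable flag like this is straightforward to model with argparse; the sketch below is illustrative only (vicuna.py's actual parser setup and destination name may differ), with the two forwarded compiler flags taken from examples elsewhere in this log.
```
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-Xiree_compile",
    action="append",          # each occurrence appends another argument
    default=[],
    dest="extra_compile_args",
    help="Extra argument forwarded verbatim to iree-compile (repeatable).",
)

args = parser.parse_args(
    ["-Xiree_compile=--mlir-timing", "-Xiree_compile=--mlir-pretty-debuginfo"]
)
print(args.extra_compile_args)  # ['--mlir-timing', '--mlir-pretty-debuginfo']
```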
Stanley Winata
500c4f2306 [compile utils] Fix ROCM to not expect config.id as a default. (#1939) 2023-11-06 08:44:53 -08:00
Vivek Khandelwal
92b694db4d Add support for Falcon-40b-GPTQ 2023-11-06 19:49:19 +05:30
Vivek Khandelwal
322874f7f9 Fix issue in Falcon-GPTQ 2023-11-03 11:48:36 +05:30
Ean Garvey
5001db3415 Add 7800xt to target triples explicitly. (#1928) 2023-11-01 17:11:45 -05:00
Vivek Khandelwal
71846344a2 Add sharded Falcon-GPTQ support
This commit adds the support for sharded Falcon-7b-GPTQ and
Falcon-180B-GPTQ. This commit also adds the support for 4-way
sharding of the Falcon model for the device ROCM.

Signed-Off By: Vivek Khandelwal <vivek@nod-labs.com>
2023-11-01 12:11:44 +05:30
gpetters94
72e27c96fc Add ZoeDepth (#1834)
* Add ZoeDepth

* Add einops to Studio imports.

* Specify ref for forked torch.hub repos.

* Unpin timm.

---------

Co-authored-by: Ean Garvey <87458719+monorimet@users.noreply.github.com>
Co-authored-by: Ean Garvey <garveyej@gmail.com>
2023-10-30 11:57:45 -05:00
PhaneeshB
7963abb8ec remove caching for rocm args 2023-10-29 07:07:57 +05:30
Ean Garvey
98244232dd Add smoothquant OPT to examples. (#1922) 2023-10-27 12:32:12 -05:00
PhaneeshB
679a452139 fix calls and remove unused imports for check_device_drivers 2023-10-27 10:30:40 +05:30
PhaneeshB
72c0a8abc8 remove dependency on external commands for driver installation check 2023-10-27 10:30:40 +05:30
Vivek Khandelwal
ea920f2955 Add sharded Falcon support 2023-10-26 21:53:25 +05:30
Phaneesh Barwaria
486202377a update dependency on rocm/hip info command (#1900)
* add support for rocm flags

* add rocm target flag to chat args

* rm rocm libs dependency message
2023-10-26 15:18:25 +05:30
Sungsoon Cho
0c38c33d0a Add opt_causallm_samples.py. (#1916) 2023-10-25 11:52:51 -05:00
Ean Garvey
841773fa32 Updates to opt_causallm example (#1905)
* Updates to opt_causallm example

* Fixup opt_perf_comparison.py

* Use same filenames across opt examples.
2023-10-24 10:54:39 -07:00
Stefan Kapusniak
0361db46f9 SD: Fix unet untuned opt_flags (#1912)
* correct my sloppy copy/paste for the untuned unet default compilation
flags that introduced an extra 'detach' into what should have been
'iree-global-opt-convert-1x1-filter-conv2d-to-matmul'
2023-10-24 12:47:33 -05:00
xzuyn
a012433ffd Save hiresfix info if used (#1914) 2023-10-24 12:45:10 -05:00
xzuyn
5061193da3 Move Generate, Randomize Seed, & Stop Batch to same positions as txt2img (#1915) 2023-10-24 12:44:39 -05:00
xzuyn
bff48924be LLaMa 2 Chat template fix (#1913) 2023-10-23 18:51:15 -05:00
Stefan Kapusniak
825b36cbdd Fix MLIR Textual PassPipeline Error (#1910) 2023-10-22 07:39:52 -07:00
Stefan Kapusniak
134441957d SD - Fix civitai download on Windows +improvements (#1907) 2023-10-21 11:17:41 -07:00
Stefan Kapusniak
7cd14fdc47 SD/UI: Use a single model selection box on UI tabs (#1906)
* Allow entry of a huggingface model id or civitai download url to be
done in the main model selection dropdown on SD tabs
* Remove separate textbox for entering huggingface model id or civitai
download url on SD Tabs
* Remove 'None' option from the model selection dropdown (no longer
needed) on SD tabs
* Update png metadata drop zone on txt2img tab to work with a single
argument for model selection
* Update UI generate functions on SD tabs to work with single argument
model selection
* Update API code for changes to the UI generate functions
* Move info about the custom model path to the logging textarea on SD
tabs
2023-10-21 10:06:05 -07:00
Ean Garvey
e6cb5cef57 Add --additional_runtime_args option and use in OPT example. (#1855)
* Add --additional_runtime_args option and use in OPT example.

Fix the func name. (#1838)

Co-authored-by: Sungsoon Cho <sungsoon.cho@gmail.com>
2023-10-19 13:29:39 -05:00
Huang Qi
66abee8e5b SharkInference: Fix various examples and README.md (#1903)
Following https://github.com/nod-ai/SHARK/pull/708, remove the parameter 'func_name'
from SharkInference.
2023-10-19 09:28:36 -05:00
Ean Garvey
4797bb89f5 Stringify path for ireec.compile_file (#1901)
* Stringify path for ireec.compile_file

* Update test-models.yml
2023-10-18 14:59:23 -05:00
Vivek Khandelwal
205e57683a Modify Falcon-180b-GPTQ sharded pipeline 2023-10-17 20:26:01 +05:30
Vivek Khandelwal
2866d665ee Fix Sharded Falcon-180b-GPTQ Pipeline 2023-10-17 20:26:01 +05:30
Stefan Kapusniak
71d25ec5d8 SD: Fix repeatable seeds when initial seed is random (#1893) 2023-10-14 22:50:42 -07:00
Vivek Khandelwal
202ffff67b Add support for sharded Falcon model 2023-10-13 22:05:10 +05:30
Ean Garvey
0b77059628 Add matmul reassociation flags (#1891) 2023-10-12 20:12:37 -05:00
Stefan Kapusniak
a208302bb9 Fix repeatable seeds consistency over batch counts (#1889)
* Set the input seed for the random number generator when
generating repeatable seeds to exclude any negative numbers
in the parsed seed input. This makes seeds generated for
different batch counts consistent where they have the same
input for the initial seed or set of seeds.
2023-10-12 17:15:19 -05:00
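One way the described behaviour could be implemented, as a sketch only (SHARK's actual seed parsing and expansion logic may differ): seed the generator solely from the explicit, non-negative seeds, so the expansion for a larger batch count simply extends the expansion for a smaller one.
```
import random

def expand_seeds(parsed_seeds, count):
    # Keep only explicit (non-negative) seeds; -1 conventionally means "random".
    explicit = [s for s in parsed_seeds if s >= 0]
    rng = random.Random(str(explicit)) if explicit else random.Random()
    seeds = explicit[:count]
    while len(seeds) < count:
        seeds.append(rng.randint(0, 2**32 - 1))
    return seeds

print(expand_seeds([1234, -1], 4))  # the result for count=2 is a prefix of this
```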
Vivek Khandelwal
b83d32fafe Fix Falcon GPTQ Pipeline 2023-10-11 20:09:32 +05:30
Vivek Khandelwal
0a618e1863 Add support for Falcon GPTQ 2023-10-11 10:47:48 +05:30
Phaneesh Barwaria
a731eb6ed4 Macos fixes (#1883)
* fix venv setup for MacOS

* allow stream fuse binding on mac

* clean iree metal args
2023-10-09 23:36:12 -07:00
Ean Garvey
2004d16945 Revert "[SDXL] Add SDXL pipeline to SHARK (#1731)" (#1882)
This reverts commit 9f0a421764.
2023-10-09 18:01:44 -07:00
Gaurav Shukla
6e409bfb77 fix else if syntax error
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-10-10 06:23:56 +05:30
Gaurav Shukla
77727d149c [warning] Fix dropdown warning
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-10-10 05:18:43 +05:30
Ean Garvey
66f6e79d68 Split CPU/GPU definitions conditionally outside of torch contexts. (#1879) 2023-10-09 16:46:41 -07:00
Ean Garvey
3b825579a7 (LLaMa-2) Point to int4 + f32 acc .mlir for cpu (#1878)
- fixes some issues with non-system prompt invocation

Co-authored-by: Gaurav Shukla <gauravshukla789@gmail.com>
2023-10-09 14:37:35 -05:00
Abhishek Varma
9f0a421764 [SDXL] Add SDXL pipeline to SHARK (#1731)
-- This commit adds SDXL pipeline to SHARK.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-10-09 13:01:37 -05:00
Gaurav Shukla
c28682110c [chatbot] Flag to add system prompt
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-10-09 22:17:39 +05:30
Ean Garvey
caf6cc5d8f Switch most compile flows to use ireec.compile_file. (#1863)
* Switch most compile flows to use ireec.compile_file.

* re-add input type to compile_str path.

* Check if mlir_module exists before checking if it's a path or pyobject.

* Fix some save_dir cases
2023-10-06 23:04:43 -05:00
Ean Garvey
8614a18474 Remove tf dependencies from importer path. (#1874)
* Remove tf dependencies from import path.

* Fix formatting.
2023-10-06 12:27:12 -07:00
Jakub Kuderski
86c1c0c215 Add aggregate statistics to microbenchmark (#1871)
Print averaged results at the end of all iterations. Increase the
default number of iterations to 5.

Example:
```
Number of iterations: 5
Prefill: avg. 0.03 s, stddev 0.00
Decode: avg. 43.34 tokens/s, stdev 0.13
```

Also remove the -2 in the number of generated tokens -- I did not find
any evidence we need it.
2023-10-06 10:03:07 -07:00
Daniel Garvey
8bb364bcb8 enforce fp32 accumulates for cpu (#1873) 2023-10-06 11:34:49 -05:00
Daniel Garvey
7abddd01ec argmax inside model + brevitas pin (#1872) 2023-10-05 20:15:21 -07:00
Abhishek Varma
2a451fa0c7 [Llama2] Add a standalone utility for dynamic and combining IRs
-- This script adds a standalone utility for converting Llama IRs
   to dynamic shapes and combining them.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-10-05 20:01:06 +05:30
Jakub Kuderski
9c4610b9da Add microbenchmark mode to vicuna CLI (#1864)
Add flags to enable a non-interactive mode for microbenchmarking llama
models. In this mode, the system and user prompts are specified with CLI
flags, and the number of generated tokens and iterations is fixed.

Also move the stats below the response and trim any response whitespace.
2023-10-05 00:12:08 -04:00
powderluv
a38cc9d216 Update vulkan_utils.py for Radeon 780m igpu (#1866) 2023-10-04 20:33:07 -07:00
Jakub Kuderski
1c382449ec [vulkan] Print note about module load times. NFC. (#1862)
Print a note ahead of a potentially long inactivity to set the right expectations.

Separately, we should add progress to the UI and make this loading faster.
2023-10-03 17:27:27 -04:00
Gaurav Shukla
7cc9b3f8e8 [llama cli] Fix llama cli
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-10-03 20:39:53 +05:30
Gaurav Shukla
e54517e967 [UI] Disable config generator, lora train and model manager (#1858)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-10-02 22:34:40 -07:00
Ean Garvey
326327a799 Collect pipeline submodules for diffusers ckpt preprocessing. (#1859) 2023-10-03 00:29:28 -04:00
Ean Garvey
785b65c7b0 Add flag for specifying device-local caching allocator heap key. (#1856) 2023-10-03 00:28:39 -04:00
Sungsoon Cho
0d16c81687 Remove unused import. (#1857) 2023-10-02 11:36:08 -05:00
Vivek Khandelwal
8dd7850c69 Add Falcon-GPTQ support 2023-10-02 16:39:57 +05:30
Gaurav Shukla
e930ba85b4 [os] Remove os dependency from vmfb naming (#1854)
Also fixes a small ui issue for chatbot.

Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-09-29 12:38:17 -05:00
Gaurav Shukla
cd732e7a38 [chatbot] split execution time to prefill and decode
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-09-29 13:18:03 +05:30
Gaurav Shukla
8e0f8b3227 [ui] Update chatbot UI
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-09-29 13:18:03 +05:30
Gaurav Shukla
b8210ef796 [chatbot] Re-instantiate the chatbot object if device id changes
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-09-29 13:18:03 +05:30
PhaneeshB
94594542a9 remove use of vulkaninfo 2023-09-28 21:57:00 +05:30
Gaurav Shukla
82f833e87d [vulkan] Update vmfb naming
Update vmfb naming for vulkan devices in order to resolve naming
conflicts in the presence of multiple vulkan devices.

Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-09-28 14:52:11 +05:30
Vivek Khandelwal
c9d6870105 Modify falcon pipeline for 180b support 2023-09-28 12:39:35 +05:30
Jakub Kuderski
4fec03a6cc [vulkan] Switch from coop matrix NV to KHR (#1848) 2023-09-27 21:43:37 -04:00
harsh-nod
9a27f51378 Deprecate inference directory
This patch removes the inference directory that was no longer being used.
2023-09-27 14:29:00 -07:00
Abhishek Varma
ad1a0f35ff Fix misdirection while saving vmfb
-- Currently SHARK suggests that a vmfb has been saved, while
    that is not the case and no vmfb is generated.
    This is misleading for larger IRs/vmfbs.
-- This commit therefore fixes that misdirection.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-09-27 16:25:29 +05:30
Nelson Sharpe
6773278ec2 Fix checkpoint_path unexpected argument (#1832) 2023-09-24 14:17:52 -07:00
Abhishek Varma
9a0efffcca [Llama2] Fix wrong Vulkan device ID + Add Vulkan compile flags
-- This commit fixes the wrong Vulkan device being selected during
   runtime.
-- It also adds a couple of IREE compilation flags to target a specific
   Vulkan device.
-- It also changes the Vulkan device listing to be more in tune with
   lowering control flow.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-09-22 22:24:18 +05:30
gpetters94
61c6f153d9 Switch to keras-nightly to fix a Linux issue (#1835) 2023-09-21 12:33:45 -04:00
Phaneesh Barwaria
effd42e8f5 pin gradio to v3.44.3 2023-09-21 17:33:43 +05:30
Sungsoon Cho
b5fbb1a8a0 Rename the func arg save_json to avoid name collision. (#1837)
* Rename the func arg save_json to avoid name collision.

* black formatted.
2023-09-19 17:29:27 -05:00
Quinn Dawkins
ded74d09cd [vicuna.py] Keep past key values on device (#1836)
The past key values are only used within the models themselves and can
be kept on device. For vulkan int4, this gives 44 tok/s (for the first
prompt) and settles at around 26 tok/s on 7900xtx.
2023-09-19 18:17:41 -04:00
Boian Petkantchin
79267931c1 Add argument --additional_compile_args (#1119)
This allows passing more arguments to the IREE compiler.
Example:
python my-app.py --additional_compile_args="--mlir-pretty-debuginfo --mlir-timing"

Co-authored-by: Boian Petkantchin <boian@nod-labs.com>
2023-09-19 11:26:03 -05:00
zjgarvey
9eceba69b7 local_tank_cache included into clear_all (#1833) 2023-09-18 00:27:23 -05:00
Ean Garvey
ca609afb6a Update README.md (#1830) 2023-09-14 10:33:57 -05:00
Gaurav Shukla
11bdce9790 [flags] Fix vulkan runtime flags as vma is dropped from iree (#1831) 2023-09-14 08:58:59 -05:00
Ean Garvey
684943a4a6 (SD) Fix tokenizers imports in pyinstaller builds. (#1828)
* Fix tokenizers metadata.

* (SD) Disable VAE lowering configs (rdna3) and add versioned tunings.

* Update sd_annotation.py

* (SD) Add cv2 to spec.

* Update stencil pipeline with the new img2img arg.
2023-09-12 12:23:48 -05:00
PhaneeshB
b817bb8455 add roles for llama2 2023-09-12 10:59:28 +05:30
Ean Garvey
780f520f02 Fix vk.target_env extensions and remove redundant SD imports. (#1826)
* Remove redundant IREE runtime imports.

* Fix vulkan target env extensions.
2023-09-11 13:42:52 -05:00
Dom
c61b6f8d65 Code refactoring (#1817)
* use join

* fix bug

* further code optimizations

---------

Co-authored-by: Daniel Garvey <34486624+dan-garvey@users.noreply.github.com>
2023-09-11 11:30:56 -05:00
Abhishek Varma
c854208d49 [Llama2] Prefetch llama2 tokenizer configs (#1824)
-- This commit prefetches llama2 tokenizer configs from shark_tank.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-09-08 11:29:54 -07:00
Gaurav Shukla
c5dcfc1f13 [vicuna] Exit when mlir is not present in shark tank (#1825)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-09-08 10:30:29 -07:00
Abhishek Varma
bde63ee8ae Add logging feature in WebUI (#1821) 2023-09-08 05:48:05 -07:00
Vivek Khandelwal
9681d494eb Update decomp list and shark trainer for DLRM 2023-09-06 21:24:50 +05:30
Gaurav Shukla
ede6bf83e2 [vicuna] Disabling the IR generation path
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-09-06 20:13:17 +05:30
Ean Garvey
2c2693fb7d Fix torchvision versioning in Linux importer setup. (#1809) 2023-09-05 12:57:03 -05:00
Vivek Khandelwal
1d31b2b2c6 Fix StableHLO Compilation flag 2023-09-05 21:32:33 +05:30
Gaurav Shukla
d2f64eefa3 [chatbot] Remove few outdated models from list (#1814) 2023-09-04 09:26:32 -07:00
Abhishek Varma
87ae14b6ff [SD] Add sdpfa decomposition + update IREE flag
-- This commit adds Scaled Dot Product Flash Attention's decomposition
   in shark_importer.
-- It also updates `iree-flow-enable-data-tiling` to `iree-opt-data-tiling`.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-09-04 18:03:53 +05:30
Phaneesh Barwaria
1ccafa1fc1 fix llama2-70b rewrite tensor dim 2023-09-01 17:27:06 +05:30
jinchen62
4c3d8a0a7f Enable downloading vmfb/mlir for webui (#1807) 2023-08-31 11:05:47 -07:00
jinchen62
3601dc7c3b Fix llama2 13b combined ir (#1803) 2023-08-28 11:34:44 -07:00
Daniel Garvey
671881cf87 Llama2 70b (#1783)
* llama2 70b IR gen

* fix IR sec llama2 + debug

* llama270b

---------

Co-authored-by: PhaneeshB <b.phaneesh@gmail.com>
2023-08-25 23:04:28 -07:00
Gaurav Shukla
4e9be6be59 [chatbot] Add debug as class attribute (#1799)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-08-25 21:46:29 -07:00
Ean Garvey
9c8cbaf498 Add support for ROCM (Windows) in Studio + compile utils (#1770)
* WIP: MSVC ROCM support for SHARK Studio

* Make get_iree_rocm_args platform-agnostic.

* Update stable_args.py

* Update rocm arg handling in SD utils

* Guard quantization imports.

Co-authored-by: jam https://github.com/jammm
2023-08-25 20:56:05 -07:00
Ean Garvey
9e348a114e Revert changes process_skipfiles.py (#1798)
Keeps a small typo fix but reverts the rest of the changes to this file from 450c231171
2023-08-25 15:31:49 -07:00
jinchen62
51f90a4d56 Update conversion passes for brevitas quant op (#1795) 2023-08-25 17:28:07 -05:00
Abhishek Varma
310d5d0a49 Fix llama2 13b crashing + add spec file for CLI execution of Llama (#1797)
* [Llama2] Add a fix for Llama2 13B downloading/crashing

-- This commit fixes downloading/crashing of llama2 13B on the wrong
   .mlir file.
-- Also adds support for downloading vmfb from shark_tank in CLI.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

* [llama2] Add a spec file to run Llama/Vicuna CLI exe

-- This commit adds a spec file to run Llama/Vicuna CLI exe.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

---------

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-08-25 09:36:09 -05:00
Ean Garvey
9697981004 Pipe through a debug option to iree compile utils. (#1796)
* Update compile_utils.py

* Pipe through a flag to toggle debug options in compile utils.

* Update SharkLLMBase.py
2023-08-25 07:11:11 -07:00
Ean Garvey
450c231171 Add tokenizers to requirements.txt (#1790)
* Add tokenizers to requirements and pin version

* Update process_skipfiles.py
2023-08-24 19:44:04 -05:00
Ean Garvey
07f6f4a2f7 Add a short README for the OPT examples and small tweaks. (#1793)
* Small changes to OPT example.

* Update opt README.

* Add a few modes to batch script.

* Update README.md
2023-08-24 17:26:11 -07:00
jinchen62
610813c72f Add iree flag to strip assertions (#1791) 2023-08-24 10:51:19 -07:00
Ean Garvey
8e3860c9e6 Remove flags that are default in upstream IREE (#1785)
* Remove index bits flags now set by default

* Update shark_studio_imports.py
2023-08-24 11:57:54 -05:00
xzuyn
e37d6720eb Add Hires Fix (#1787)
* improper test hiresfix

* add sliders & use `clear_cache`

* add resample choices & fix step adjustment

* add step adjustment to img2img

* add resample options to img2img

* simplify hiresfix
- import `img2img_inf` from `img2img_ui.py` instead of just copying it into `txt2img_ui.py`

* set `hri` to None after using

* add more resample types, and don't show output until hiresfix is done

* cleaner implementation

* ran black

* ran black again with jupyter dependencies
2023-08-24 09:01:41 -07:00
Vivek Khandelwal
16160d9a7d Fix combine mlir script 2023-08-24 19:10:49 +05:30
Sungsoon Cho
79075a1a07 Opt perf (#1786)
* Define command line args, model-name, max-seq-len, platform, etc.

* Add usage example.

* Add opt_perf_comparision_batch.py.

* Use shlex instead.
2023-08-24 08:33:12 -05:00
Abhishek Varma
db990826d3 Add Llama2 13B int4 fp16 support (#1784)
Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-08-23 10:00:32 -07:00
gpetters94
7ee3e4ba5d Add stencil_unet_512 support (#1778)
This should fix any remaining issues with stencils and long prompts.
2023-08-22 12:23:46 -04:00
Vivek Khandelwal
05889a8fe1 Add LLaMa2-int4-fp16 support (#1782) 2023-08-22 07:45:50 -07:00
jinchen62
b87efe7686 Fix venv setup for brevitas (#1779) 2023-08-21 11:58:51 -07:00
gpetters94
82b462de3a Fix stencils for long prompts (#1777) 2023-08-19 00:26:51 -07:00
Daniel Garvey
d8f0f7bade replace public with private (#1776)
unload footguns
2023-08-18 14:22:46 -07:00
gpetters94
79bd0b84a1 Fix an issue with diffusers>0.19.3 (#1775) 2023-08-18 14:06:06 -04:00
jinchen62
8738571d1e Adapt the change of brevitas custom op name (#1772) 2023-08-17 14:24:43 -07:00
Gaurav Shukla
a4c354ce54 [version] Pin diffusers==0.19.3
Once the latest works with LORA train, unpin it.

Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-08-17 21:27:10 +05:30
Gaurav Shukla
cc53efa89f [cli] Fix chatbot cli
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-08-17 21:27:10 +05:30
Gaurav Shukla
9ae8bc921e [chatbot] Fix chatbot cli and webview warning
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-08-17 21:27:10 +05:30
Gaurav Shukla
32eb78f0f9 [chatbot] Fix switching parameters in chatbot
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-08-17 19:14:17 +05:30
Ean Garvey
cb509343d9 Fix pytest benchmarks and shark_tank generation. (#1632)
- fix setup_venv.sh for benchmarks/imports etc.
- fix torch benchmarks in SharkBenchmarkRunner
- generate SD artifacts using build_tools/stable_diffusion_testing.py and --import_mlir
- decouple SD gen from tank/generate_sharktank for now
2023-08-16 17:48:47 -05:00
powderluv
6da391c9b1 update signtool to use /fd certHash 2023-08-15 15:11:40 -07:00
Ean Garvey
9dee7ae652 fix tkinter window (#1766) 2023-08-15 13:23:09 -07:00
Ean Garvey
343dfd901c Update SHARK-Runtime links to SRT (#1765)
* Update nightly.yml

* Update setup_venv.ps1

* Update CMakeLists.txt

* Update shark_iree_profiling.md

* Update setup_venv.sh

* Update README.md

* Update .gitmodules

* Update CMakeLists.txt

* Update README.md

* fix signtool flags

* Update nightly.yml

* Update benchmark_utils.py

* uncomment tkinter launch
2023-08-15 12:40:44 -07:00
Ean Garvey
57260b9c37 (Studio) Add hf-hub to pyinstaller metadata (#1761) 2023-08-14 23:01:50 -05:00
Ean Garvey
18e7d2d061 Enable vae tunings for rdna3. (#1764) 2023-08-14 21:00:14 -07:00
Stanley Winata
51a1009796 Add Forward method to SHARKRunner and fix examples. (#1756) 2023-08-14 19:20:37 -07:00
Daniel Garvey
045c3c3852 enable iree-opt-const-expr-hoisting in vicuna (#1742)
Co-authored-by: powderluv <powderluv@users.noreply.github.com>
2023-08-14 18:43:42 -07:00
Ean Garvey
0139dd58d9 Specify max allocation size in IREE compile args. (#1760) 2023-08-14 15:43:09 -05:00
Ean Garvey
c96571855a prevents recompiles for cuda benchmarks + update benchmark_module path (#1759)
* xfail resnet50_fp16

* Fix cuda benchmarks and prevent recompilation.
2023-08-14 15:30:32 -05:00
PhaneeshB
4f61d69d86 add support for passing iree flags for LLMs 2023-08-15 00:22:56 +05:30
Phaneesh Barwaria
531d447768 set default allocator for metal device creation (#1755) 2023-08-14 06:17:52 -07:00
Vivek Khandelwal
16f46f8de9 Update langchain_requirements.txt 2023-08-14 14:32:19 +05:30
Vivek Khandelwal
c4723f469f Update langchain_requirements.txt 2023-08-14 14:32:19 +05:30
Vivek Khandelwal
d804f45a61 Update langchain_requirements.txt 2023-08-14 14:32:19 +05:30
Vivek Khandelwal
d22177f936 Update requirements.txt 2023-08-14 14:32:19 +05:30
George Petterson
75e68f02f4 Remove CUDNN 2023-08-14 14:32:19 +05:30
Gaurav Shukla
4dc9c59611 [chatbot] Add tokens generated per second (#1753) 2023-08-13 11:25:41 -07:00
Gaurav Shukla
18801dcabc [chat] Update chatbot ui
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-08-13 18:39:22 +05:30
Gaurav Shukla
3c577f7168 [vicuna] fix shard config generator script (#1747)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-08-10 11:26:03 -07:00
Stefan Kapusniak
f5e4fa6ffe UI/Web - Revert tab order (#1724)
* Revert ui tab order

* Reverts the tab order, so that SD, LLM, and Experimental are grouped
together again as far as is possible.
* Labelled "Generate Sharding Config" as experimental as pressing the
'Get Model Config' errors for me.

* Fix formatting in index.py
2023-08-10 11:25:36 -07:00
powderluv
48de445325 Enable caching and disable vma (#1746)
* Enable caching allocator by default

Going to toggle VMA off too, as this is required for performance. Will have to monitor in-the-wild reports.

* Disable VMA

Disable VMA
2023-08-10 10:49:44 -07:00
Gaurav Shukla
8e90f1b81a [vicuna] add default config in case of sharded vicuna
Signed-Off-by: Gaurav Shukla<gaurav@nod-labs.com>
2023-08-10 21:28:08 +05:30
Vivek Khandelwal
e8c1203be2 Fix vicuna script (#1745) 2023-08-10 06:11:14 -07:00
Vivek Khandelwal
e4d7abb519 Final patch for fixing Langchain token streaming issue (#1744) 2023-08-09 10:09:41 -07:00
powderluv
96185c9dc1 pin safetensors to 0.3.1 (#1740) 2023-08-08 19:24:44 -07:00
powderluv
bc22a81925 re-enable constant folding (#1739)
Tested and works well. (modulo unrelated driver issue)
2023-08-08 17:17:38 -07:00
Eliasj42
5203679f1f Bandaid fix 2 (#1728)
* download all mlirs

* fixed install method

* download all mlirs (#1727)

Co-authored-by: Elias Joseph <elias@nod-labs.com>

* added tags

* fix name check for file existence

* Remove SD from all_models.csv (#1706)

Removes SD from pytests as it has its own test suite.

* gpt_langchain.py fixes for pydantic (#1722)

* removed dead code

---------

Co-authored-by: Elias Joseph <elias@nod-labs.com>
Co-authored-by: PhaneeshB <b.phaneesh@gmail.com>
Co-authored-by: Ean Garvey <87458719+monorimet@users.noreply.github.com>
Co-authored-by: Stefan Kapusniak <121311569+one-lithe-rune@users.noreply.github.com>
2023-08-08 12:14:57 -05:00
Vivek Khandelwal
bf073f8f37 [Langchain] Expand pipelines to fix token streaming issue 2023-08-08 10:27:23 +05:30
Stella Laurenzo
cec6eda6b4 Optimize device enumeration overhead and log details on long operations. (#1734)
* Optimize device enumeration overhead and log details on long operations.

* Various fixes to add `@functools.cache` to what should be one-time, expensive device enumeration and setup activities. Cuts several seconds off of initialization on my machine.
* Add detailed tracing to actual invocations if they exceed a certain timeout or have an exception.
* Add detailed tracing to loading status.
* By default, detailed logging is only printed if an operation takes an excessive amount of time. All logging/timing can be printed by setting the variable `$env:SHARK_DETAIL_TRACE = "1"`

* Remove cache from unhashable functions
2023-08-07 17:20:53 -07:00
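For context, `functools.cache` memoizes a function's results per process, so an expensive enumeration call only pays its cost once; this is a generic sketch, not the SHARK code, and as the commit notes it only works when the function's arguments are hashable.
```
import functools
import time

@functools.cache  # later calls with the same (hashable) args return instantly
def enumerate_devices() -> tuple[str, ...]:
    time.sleep(2)  # stand-in for a slow driver/runtime query
    return ("cpu", "vulkan://0")

enumerate_devices()  # pays the ~2s cost once per process
enumerate_devices()  # served from the cache
```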
Stella Laurenzo
9e37e03741 Clearly differentiate phases of loading modules to better understand if things are taking a long time. (#1733) 2023-08-07 14:03:12 -07:00
Stefan Kapusniak
9b8c4401b5 gpt_langchain.py fixes for pydantic (#1722) 2023-08-07 00:55:38 -07:00
Ean Garvey
a9f95a218b Remove SD from all_models.csv (#1706)
Removes SD from pytests as it has its own test suite.
2023-08-05 15:55:52 -05:00
PhaneeshB
872bd72d0b fix name check for file existence 2023-08-05 21:33:53 +05:30
Eliasj42
fd1c4db5d0 download all mlirs (#1727)
Co-authored-by: Elias Joseph <elias@nod-labs.com>
2023-08-04 18:22:06 -05:00
Daniel Garvey
759664bb48 add py files to pyinstaller for shark (#1723) 2023-08-04 14:10:43 -07:00
Daniel Garvey
14fd0cdd87 add missing subprocess import (#1721) 2023-08-04 15:15:22 -05:00
Daniel Garvey
a57eccc997 fix lint (#1720) 2023-08-04 14:54:33 -05:00
Daniel Garvey
a686d7d89f temporarily disable langchain stuff in webui (#1719)
it's breaking the exe
2023-08-04 12:48:06 -07:00
Eliasj42
ed484b8253 added functionality for int8 vicuna and 4 shards (#1712)
combined vicuna_4_shards.py and vicuna.py to reduce code duplication

Co-authored-by: Elias Joseph <elias@nod-labs.com>
2023-08-04 14:05:05 -05:00
gpetters94
7fe57ebaaf Add vector database and add support on the web UI (#1699) 2023-08-04 13:47:19 -04:00
Nithin Meganathan
c287fd2be8 Add GPU ID's in model_confg.json by default for manual annotation (#1718) 2023-08-04 12:46:27 -05:00
Gaurav Shukla
51ec1a1360 [vicuna] Integrate sharded vicuna in web (#1717)
Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-08-04 11:46:53 -05:00
Gaurav Shukla
bd30044c0b [Shard] Add sharding generation in shark studio
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-08-04 21:51:14 +05:30
Ean Garvey
c9de2729b2 Add flag for toggling constant folding. (#1714) 2023-08-04 04:55:52 -07:00
Vivek Khandelwal
a5b13fcc2f [Langchain] Patch for fixing streaming of tokens (#1709) 2023-08-03 10:06:49 -07:00
Stefan Kapusniak
6bb329c4af Unsharded Vicuna: Fix Memory Error compiling mlir for lmsys/vicuna-7b-v1.3 fp16 with 64 GiB (#1702) 2023-08-01 06:07:56 -07:00
166 changed files with 17304 additions and 6034 deletions


@@ -51,11 +51,11 @@ jobs:
run: |
./setup_venv.ps1
$env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html
python process_skipfiles.py
pyinstaller .\apps\stable_diffusion\shark_sd.spec
mv ./dist/nodai_shark_studio.exe ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
signtool sign /f c:\g\shark_02152023.cer /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
signtool sign /f c:\g\shark_02152023.cer /fd certHash /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
- name: Upload Release Assets
id: upload-release-assets
@@ -74,80 +74,3 @@ jobs:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
release_id: ${{ steps.create_release.outputs.id }}
linux-build:
runs-on: a100
strategy:
fail-fast: false
matrix:
python-version: ["3.11"]
backend: [IREE, SHARK]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Setup pip cache
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Install dependencies
run: |
echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
python -m pip install --upgrade pip
python -m pip install flake8 pytest toml
if [ -f requirements.txt ]; then pip install -r requirements.txt -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude shark.venv,lit.cfg.py
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py
- name: Build and validate the IREE package
if: ${{ matrix.backend == 'IREE' }}
continue-on-error: true
run: |
cd $GITHUB_WORKSPACE
USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
source iree.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://openxla.github.io/iree/pip-release-links.html
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
/bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" -k "not metal" |
tail -n 1 |
tee -a pytest_results.txt
if !(grep -Fxq " failed" pytest_results.txt)
then
export SHA=$(git log -1 --format='%h')
gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/${DATE}_$SHA
gsutil -m cp -r gs://shark_tank/${DATE}_$SHA/* gs://shark_tank/nightly/
fi
rm -rf ./wheelhouse/nodai*
- name: Build and validate the SHARK Runtime package
if: ${{ matrix.backend == 'SHARK' }}
run: |
cd $GITHUB_WORKSPACE
./setup_venv.sh
source shark.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
pytest --ci --ci_sha=${SHORT_SHA} -k "not metal" |
tail -n 1 |
tee -a pytest_results.txt


@@ -112,7 +112,7 @@ jobs:
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --forked --benchmark=native --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k cpu
pytest --benchmark=native --update_tank -k cpu
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv
python build_tools/vicuna_testing.py
@@ -121,9 +121,9 @@ jobs:
if: matrix.suite == 'cuda'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --forked --benchmark=native --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k cuda
pytest --benchmark=native --update_tank -k cuda
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
# Disabled due to black image bug
@@ -137,16 +137,17 @@ jobs:
source shark.venv/bin/activate
echo $PATH
pip list | grep -E "torch|iree"
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" --tank_url="gs://shark_tank/nightly/" -k metal
# disabled due to a low-visibility memory issue with pytest on macos.
# pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" --tank_url="gs://shark_tank/nightly/" -k metal
- name: Validate Vulkan Models (a100)
if: matrix.suite == 'vulkan' && matrix.os == 'a100'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --forked --benchmark="native" --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k vulkan
python build_tools/stable_diffusion_testing.py --device=vulkan
pytest --update_tank -k vulkan
python build_tools/stable_diffusion_testing.py --device=vulkan --no-exit_on_fail
- name: Validate Vulkan Models (Windows)
if: matrix.suite == 'vulkan' && matrix.os == '7950x'

.gitignore vendored

@@ -182,7 +182,7 @@ generated_imgs/
# Custom model related artefacts
variants.json
models/
/models/
# models folder
apps/stable_diffusion/web/models/
@@ -193,3 +193,12 @@ stencil_annotator/
# For DocuChat
apps/language_models/langchain/user_path/
db_dir_UserData
# Embedded browser cache and other
apps/stable_diffusion/web/EBWebView/
# Llama2 tokenizer configs
llama2_tokenizer_configs/
# Webview2 runtime artefacts
EBWebView/

.gitmodules vendored

@@ -1,4 +1,4 @@
[submodule "inference/thirdparty/shark-runtime"]
path = inference/thirdparty/shark-runtime
url =https://github.com/nod-ai/SHARK-Runtime.git
url =https://github.com/nod-ai/SRT.git
branch = shark-06032022


@@ -10,7 +10,7 @@ High Performance Machine Learning Distribution
<summary>Prerequisites - Drivers </summary>
#### Install your Windows hardware drivers
* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-23-2-1).
* [AMD RDNA Users] Download the latest driver (23.2.1 is the oldest supported) [here](https://www.amd.com/en/support).
* [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work.
* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
@@ -170,7 +170,7 @@ python -m pip install --upgrade pip
This step pip installs SHARK and related packages on Linux (Python 3.8, 3.10, and 3.11) and on macOS / Windows (Python 3.11)
```shell
pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
```
### Run shark tank model tests.
@@ -254,7 +254,6 @@ if you want to instead incorporate this into a python script, you can pass the `
```
shark_module = SharkInference(
mlir_model,
func_name,
device=args.device,
mlir_dialect="tm_tensor",
dispatch_benchmarks="all",
@@ -297,7 +296,7 @@ torch_mlir, func_name = mlir_importer.import_mlir(tracing_required=True)
# SharkInference accepts mlir in linalg, mhlo, and tosa dialect.
from shark.shark_inference import SharkInference
shark_module = SharkInference(torch_mlir, func_name, device="cpu", mlir_dialect="linalg")
shark_module = SharkInference(torch_mlir, device="cpu", mlir_dialect="linalg")
shark_module.compile()
result = shark_module.forward((input))
@@ -320,12 +319,17 @@ mhlo_ir = r"""builtin.module {
arg0 = np.ones((1, 4)).astype(np.float32)
arg1 = np.ones((4, 1)).astype(np.float32)
shark_module = SharkInference(mhlo_ir, func_name="forward", device="cpu", mlir_dialect="mhlo")
shark_module = SharkInference(mhlo_ir, device="cpu", mlir_dialect="mhlo")
shark_module.compile()
result = shark_module.forward((arg0, arg1))
```
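As a further illustration, here is a minimal sketch of the dispatch-benchmarking construction shown earlier in this diff, using the updated constructor without `func_name`; `mlir_model` and `inputs` are placeholders rather than names taken from the README:
```
from shark.shark_inference import SharkInference

# mlir_model: an MLIR module obtained as in the snippets above (placeholder here).
shark_module = SharkInference(
    mlir_model,
    device="cpu",                 # or args.device when driven from the command line
    mlir_dialect="tm_tensor",
    dispatch_benchmarks="all",
)
shark_module.compile()
result = shark_module.forward(inputs)  # inputs: a tuple of numpy arrays
```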
</details>
## Examples Using the REST API
* [Setting up SHARK for use with Blender](./docs/shark_sd_blender.md)
* [Setting up SHARK for use with Koboldcpp](./docs/shark_sd_koboldcpp.md)
## Supported and Validated Models
SHARK is maintained to support the latest innovations in ML Models:


@@ -1,4 +1,3 @@
"""Load question answering chains."""
from __future__ import annotations
from typing import (
Any,
@@ -11,23 +10,34 @@ from typing import (
Union,
Protocol,
)
import inspect
import json
import warnings
from pathlib import Path
import yaml
from abc import ABC, abstractmethod
import langchain
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.base import BaseCallbackManager
from langchain.chains.question_answering import stuff_prompt
from langchain.prompts.base import BasePromptTemplate
from langchain.docstore.document import Document
from abc import ABC, abstractmethod
from langchain.chains.base import Chain
from langchain.callbacks.manager import (
CallbackManager,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.load.serializable import Serializable
from langchain.schema import RUN_KEY, BaseMemory, RunInfo
from langchain.input import get_colored_text
from langchain.load.dump import dumpd
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import LLMResult, PromptValue
from pydantic import Extra, Field, root_validator
from pydantic import Extra, Field, root_validator, validator
def _get_verbosity() -> bool:
return langchain.verbose
def format_document(doc: Document, prompt: BasePromptTemplate) -> str:
@@ -48,6 +58,413 @@ def format_document(doc: Document, prompt: BasePromptTemplate) -> str:
return prompt.format(**document_info)
class Chain(Serializable, ABC):
"""Base interface that all chains should implement."""
memory: Optional[BaseMemory] = None
callbacks: Callbacks = Field(default=None, exclude=True)
callback_manager: Optional[BaseCallbackManager] = Field(
default=None, exclude=True
)
verbose: bool = Field(
default_factory=_get_verbosity
) # Whether to print the response text
tags: Optional[List[str]] = None
class Config:
"""Configuration for this pydantic object."""
arbitrary_types_allowed = True
@property
def _chain_type(self) -> str:
raise NotImplementedError("Saving not supported for this chain type.")
@root_validator()
def raise_deprecation(cls, values: Dict) -> Dict:
"""Raise deprecation warning if callback_manager is used."""
if values.get("callback_manager") is not None:
warnings.warn(
"callback_manager is deprecated. Please use callbacks instead.",
DeprecationWarning,
)
values["callbacks"] = values.pop("callback_manager", None)
return values
@validator("verbose", pre=True, always=True)
def set_verbose(cls, verbose: Optional[bool]) -> bool:
"""If verbose is None, set it.
This allows users to pass in None as verbose to access the global setting.
"""
if verbose is None:
return _get_verbosity()
else:
return verbose
@property
@abstractmethod
def input_keys(self) -> List[str]:
"""Input keys this chain expects."""
@property
@abstractmethod
def output_keys(self) -> List[str]:
"""Output keys this chain expects."""
def _validate_inputs(self, inputs: Dict[str, Any]) -> None:
"""Check that all inputs are present."""
missing_keys = set(self.input_keys).difference(inputs)
if missing_keys:
raise ValueError(f"Missing some input keys: {missing_keys}")
def _validate_outputs(self, outputs: Dict[str, Any]) -> None:
missing_keys = set(self.output_keys).difference(outputs)
if missing_keys:
raise ValueError(f"Missing some output keys: {missing_keys}")
@abstractmethod
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Run the logic of this chain and return the output."""
def __call__(
self,
inputs: Union[Dict[str, Any], Any],
return_only_outputs: bool = False,
callbacks: Callbacks = None,
*,
tags: Optional[List[str]] = None,
include_run_info: bool = False,
) -> Dict[str, Any]:
"""Run the logic of this chain and add to output if desired.
Args:
inputs: Dictionary of inputs, or single input if chain expects
only one param.
return_only_outputs: boolean for whether to return only outputs in the
response. If True, only new keys generated by this chain will be
returned. If False, both input keys and new keys generated by this
chain will be returned. Defaults to False.
callbacks: Callbacks to use for this chain run. If not provided, will
use the callbacks provided to the chain.
include_run_info: Whether to include run info in the response. Defaults
to False.
"""
input_docs = inputs["input_documents"]
missing_keys = set(self.input_keys).difference(inputs)
if missing_keys:
raise ValueError(f"Missing some input keys: {missing_keys}")
callback_manager = CallbackManager.configure(
callbacks, self.callbacks, self.verbose, tags, self.tags
)
run_manager = callback_manager.on_chain_start(
dumpd(self),
inputs,
)
if "is_first" in inputs.keys() and not inputs["is_first"]:
run_manager_ = run_manager
input_list = [inputs]
stop = None
prompts = []
for inputs in input_list:
selected_inputs = {
k: inputs[k] for k in self.prompt.input_variables
}
prompt = self.prompt.format_prompt(**selected_inputs)
_colored_text = get_colored_text(prompt.to_string(), "green")
_text = "Prompt after formatting:\n" + _colored_text
if run_manager_:
run_manager_.on_text(_text, end="\n", verbose=self.verbose)
if "stop" in inputs and inputs["stop"] != stop:
raise ValueError(
"If `stop` is present in any inputs, should be present in all."
)
prompts.append(prompt)
prompt_strings = [p.to_string() for p in prompts]
prompts = prompt_strings
callbacks = run_manager_.get_child() if run_manager_ else None
tags = None
"""Run the LLM on the given prompt and input."""
# If string is passed in directly no errors will be raised but outputs will
# not make sense.
if not isinstance(prompts, list):
raise ValueError(
"Argument 'prompts' is expected to be of type List[str], received"
f" argument of type {type(prompts)}."
)
params = self.llm.dict()
params["stop"] = stop
options = {"stop": stop}
disregard_cache = self.llm.cache is not None and not self.llm.cache
callback_manager = CallbackManager.configure(
callbacks,
self.llm.callbacks,
self.llm.verbose,
tags,
self.llm.tags,
)
if langchain.llm_cache is None or disregard_cache:
# This happens when langchain.cache is None, but self.cache is True
if self.llm.cache is not None and self.cache:
raise ValueError(
"Asked to cache, but no cache found at `langchain.cache`."
)
run_manager_ = callback_manager.on_llm_start(
dumpd(self),
prompts,
invocation_params=params,
options=options,
)
generations = []
for prompt in prompts:
inputs_ = prompt
num_workers = None
batch_size = None
if num_workers is None:
if self.llm.pipeline._num_workers is None:
num_workers = 0
else:
num_workers = self.llm.pipeline._num_workers
if batch_size is None:
if self.llm.pipeline._batch_size is None:
batch_size = 1
else:
batch_size = self.llm.pipeline._batch_size
preprocess_params = {}
generate_kwargs = {}
preprocess_params.update(generate_kwargs)
forward_params = generate_kwargs
postprocess_params = {}
# Fuse __init__ params and __call__ params without modifying the __init__ ones.
preprocess_params = {
**self.llm.pipeline._preprocess_params,
**preprocess_params,
}
forward_params = {
**self.llm.pipeline._forward_params,
**forward_params,
}
postprocess_params = {
**self.llm.pipeline._postprocess_params,
**postprocess_params,
}
self.llm.pipeline.call_count += 1
if (
self.llm.pipeline.call_count > 10
and self.llm.pipeline.framework == "pt"
and self.llm.pipeline.device.type == "cuda"
):
warnings.warn(
"You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a"
" dataset",
UserWarning,
)
model_inputs = self.llm.pipeline.preprocess(
inputs_, **preprocess_params
)
model_outputs = self.llm.pipeline.forward(
model_inputs, **forward_params
)
model_outputs["process"] = False
return model_outputs
output = LLMResult(generations=generations)
run_manager_.on_llm_end(output)
if run_manager_:
output.run = RunInfo(run_id=run_manager_.run_id)
response = output
outputs = [
# Get the text of the top generated string.
{self.output_key: generation[0].text}
for generation in response.generations
][0]
run_manager.on_chain_end(outputs)
final_outputs: Dict[str, Any] = self.prep_outputs(
inputs, outputs, return_only_outputs
)
if include_run_info:
final_outputs[RUN_KEY] = RunInfo(run_id=run_manager.run_id)
return final_outputs
else:
_run_manager = (
run_manager or CallbackManagerForChainRun.get_noop_manager()
)
docs = inputs[self.input_key]
# Other keys are assumed to be needed for LLM prediction
other_keys = {
k: v for k, v in inputs.items() if k != self.input_key
}
doc_strings = [
format_document(doc, self.document_prompt) for doc in docs
]
# Join the documents together to put them in the prompt.
inputs = {
k: v
for k, v in other_keys.items()
if k in self.llm_chain.prompt.input_variables
}
inputs[self.document_variable_name] = self.document_separator.join(
doc_strings
)
inputs["is_first"] = False
inputs["input_documents"] = input_docs
# Call predict on the LLM.
output = self.llm_chain(inputs, callbacks=_run_manager.get_child())
if "process" in output.keys() and not output["process"]:
return output
output = output[self.llm_chain.output_key]
extra_return_dict = {}
extra_return_dict[self.output_key] = output
outputs = extra_return_dict
run_manager.on_chain_end(outputs)
final_outputs: Dict[str, Any] = self.prep_outputs(
inputs, outputs, return_only_outputs
)
if include_run_info:
final_outputs[RUN_KEY] = RunInfo(run_id=run_manager.run_id)
return final_outputs
def prep_outputs(
self,
inputs: Dict[str, str],
outputs: Dict[str, str],
return_only_outputs: bool = False,
) -> Dict[str, str]:
"""Validate and prep outputs."""
self._validate_outputs(outputs)
if self.memory is not None:
self.memory.save_context(inputs, outputs)
if return_only_outputs:
return outputs
else:
return {**inputs, **outputs}
def prep_inputs(
self, inputs: Union[Dict[str, Any], Any]
) -> Dict[str, str]:
"""Validate and prep inputs."""
if not isinstance(inputs, dict):
_input_keys = set(self.input_keys)
if self.memory is not None:
# If there are multiple input keys, but some get set by memory so that
# only one is not set, we can still figure out which key it is.
_input_keys = _input_keys.difference(
self.memory.memory_variables
)
if len(_input_keys) != 1:
raise ValueError(
f"A single string input was passed in, but this chain expects "
f"multiple inputs ({_input_keys}). When a chain expects "
f"multiple inputs, please call it by passing in a dictionary, "
"eg `chain({'foo': 1, 'bar': 2})`"
)
inputs = {list(_input_keys)[0]: inputs}
if self.memory is not None:
external_context = self.memory.load_memory_variables(inputs)
inputs = dict(inputs, **external_context)
self._validate_inputs(inputs)
return inputs
def apply(
self, input_list: List[Dict[str, Any]], callbacks: Callbacks = None
) -> List[Dict[str, str]]:
"""Call the chain on all inputs in the list."""
return [self(inputs, callbacks=callbacks) for inputs in input_list]
def run(
self,
*args: Any,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
**kwargs: Any,
) -> str:
"""Run the chain as text in, text out or multiple variables, text out."""
if len(self.output_keys) != 1:
raise ValueError(
f"`run` not supported when there is not exactly "
f"one output key. Got {self.output_keys}."
)
if args and not kwargs:
if len(args) != 1:
raise ValueError(
"`run` supports only one positional argument."
)
return self(args[0], callbacks=callbacks, tags=tags)[
self.output_keys[0]
]
if kwargs and not args:
return self(kwargs, callbacks=callbacks, tags=tags)[
self.output_keys[0]
]
if not kwargs and not args:
raise ValueError(
"`run` supported with either positional arguments or keyword arguments,"
" but none were provided."
)
raise ValueError(
f"`run` supported with either positional arguments or keyword arguments"
f" but not both. Got args: {args} and kwargs: {kwargs}."
)
def dict(self, **kwargs: Any) -> Dict:
"""Return dictionary representation of chain."""
if self.memory is not None:
raise ValueError("Saving of memory is not yet supported.")
_dict = super().dict()
_dict["_type"] = self._chain_type
return _dict
def save(self, file_path: Union[Path, str]) -> None:
"""Save the chain.
Args:
file_path: Path to file to save the chain to.
Example:
.. code-block:: python
chain.save(file_path="path/chain.yaml")
"""
# Convert file to Path object.
if isinstance(file_path, str):
save_path = Path(file_path)
else:
save_path = file_path
directory_path = save_path.parent
directory_path.mkdir(parents=True, exist_ok=True)
# Fetch dictionary to save
chain_dict = self.dict()
if save_path.suffix == ".json":
with open(file_path, "w") as f:
json.dump(chain_dict, f, indent=4)
elif save_path.suffix == ".yaml":
with open(file_path, "w") as f:
yaml.dump(chain_dict, f, default_flow_style=False)
else:
raise ValueError(f"{save_path} must be json or yaml")
class BaseCombineDocumentsChain(Chain, ABC):
"""Base interface for chains combining documents."""
@@ -79,12 +496,6 @@ class BaseCombineDocumentsChain(Chain, ABC):
"""
return None
@abstractmethod
def combine_docs(
self, docs: List[Document], **kwargs: Any
) -> Tuple[str, dict]:
"""Combine documents into a single string."""
def _call(
self,
inputs: Dict[str, List[Document]],
@@ -96,13 +507,49 @@ class BaseCombineDocumentsChain(Chain, ABC):
docs = inputs[self.input_key]
# Other keys are assumed to be needed for LLM prediction
other_keys = {k: v for k, v in inputs.items() if k != self.input_key}
output, extra_return_dict = self.combine_docs(
docs, callbacks=_run_manager.get_child(), **other_keys
doc_strings = [
format_document(doc, self.document_prompt) for doc in docs
]
# Join the documents together to put them in the prompt.
inputs = {
k: v
for k, v in other_keys.items()
if k in self.llm_chain.prompt.input_variables
}
inputs[self.document_variable_name] = self.document_separator.join(
doc_strings
)
# Call predict on the LLM.
output, extra_return_dict = (
self.llm_chain(inputs, callbacks=_run_manager.get_child())[
self.llm_chain.output_key
],
{},
)
extra_return_dict[self.output_key] = output
return extra_return_dict
from pydantic import BaseModel
class Generation(Serializable):
"""Output of a single generation."""
text: str
"""Generated text output."""
generation_info: Optional[Dict[str, Any]] = None
"""Raw generation info response from the provider"""
"""May include things like reason for finishing (e.g. in OpenAI)"""
# TODO: add log probs
VALID_TASKS = ("text2text-generation", "text-generation", "summarization")
class LLMChain(Chain):
"""Chain to run queries against LLMs.
@@ -153,21 +600,13 @@ class LLMChain(Chain):
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, str]:
response = self.generate([inputs], run_manager=run_manager)
return self.create_outputs(response)[0]
def generate(
self,
input_list: List[Dict[str, Any]],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> LLMResult:
"""Generate LLM result from inputs."""
prompts, stop = self.prep_prompts(input_list, run_manager=run_manager)
return self.llm.generate_prompt(
prompts, stop = self.prep_prompts([inputs], run_manager=run_manager)
response = self.llm.generate_prompt(
prompts,
stop,
callbacks=run_manager.get_child() if run_manager else None,
)
return self.create_outputs(response)[0]
def prep_prompts(
self,
@@ -223,23 +662,6 @@ class LLMChain(Chain):
for generation in response.generations
]
def predict(self, callbacks: Callbacks = None, **kwargs: Any) -> str:
"""Format prompt with kwargs and pass to LLM.
Args:
callbacks: Callbacks to pass to LLMChain
**kwargs: Keys to pass to prompt template.
Returns:
Completion from LLM.
Example:
.. code-block:: python
completion = llm.predict(adjective="funny")
"""
return self(kwargs, callbacks=callbacks)[self.output_key]
def predict_and_parse(
self, callbacks: Callbacks = None, **kwargs: Any
) -> Union[str, List[str], Dict[str, Any]]:
@@ -350,14 +772,6 @@ class StuffDocumentsChain(BaseCombineDocumentsChain):
prompt = self.llm_chain.prompt.format(**inputs)
return self.llm_chain.llm.get_num_tokens(prompt)
def combine_docs(
self, docs: List[Document], callbacks: Callbacks = None, **kwargs: Any
) -> Tuple[str, dict]:
"""Stuff all documents into one prompt and pass to LLM."""
inputs = self._get_inputs(docs, **kwargs)
# Call predict on the LLM.
return self.llm_chain.predict(callbacks=callbacks, **inputs), {}
@property
def _chain_type(self) -> str:
return "stuff_documents_chain"


@@ -1129,7 +1129,7 @@ class Langchain:
max_time=max_time,
num_return_sequences=num_return_sequences,
)
for r in run_qa_db(
out = run_qa_db(
query=instruction,
iinput=iinput,
context=context,
@@ -1170,689 +1170,8 @@ class Langchain:
auto_reduce_chunks=auto_reduce_chunks,
max_chunks=max_chunks,
device=self.device,
):
(
outr,
extra,
) = r # doesn't accumulate, new answer every yield, so only save that full answer
yield dict(response=outr, sources=extra)
if save_dir:
extra_dict = gen_hyper_langchain.copy()
extra_dict.update(
prompt_type=prompt_type,
inference_server=inference_server,
langchain_mode=langchain_mode,
langchain_action=langchain_action,
document_choice=document_choice,
num_prompt_tokens=num_prompt_tokens,
instruction=instruction,
iinput=iinput,
context=context,
)
save_generate_output(
prompt=prompt,
output=outr,
base_model=base_model,
save_dir=save_dir,
where_from="run_qa_db",
extra_dict=extra_dict,
)
if verbose:
print(
"Post-Generate Langchain: %s decoded_output: %s"
% (str(datetime.now()), len(outr) if outr else -1),
flush=True,
)
if outr or base_model in non_hf_types:
# if got no response (e.g. not showing sources and got no sources,
# so nothing to give to LLM), then slip through and ask LLM
# Or if llama/gptj, then just return since they had no response and can't go down below code path
# clear before return, since .then() never done if from API
clear_torch_cache()
return
if inference_server.startswith(
"openai"
) or inference_server.startswith("http"):
if inference_server.startswith("openai"):
import openai
where_from = "openai_client"
openai.api_key = os.getenv("OPENAI_API_KEY")
stop_sequences = list(
set(prompter.terminate_response + [prompter.PreResponse])
)
stop_sequences = [x for x in stop_sequences if x]
# OpenAI will complain if asked for too many new tokens; it takes the value as a minimum in some sense, wrongly so.
max_new_tokens_openai = min(
max_new_tokens, model_max_length - num_prompt_tokens
)
gen_server_kwargs = dict(
temperature=temperature if do_sample else 0,
max_tokens=max_new_tokens_openai,
top_p=top_p if do_sample else 1,
frequency_penalty=0,
n=num_return_sequences,
presence_penalty=1.07
- repetition_penalty
+ 0.6, # so good default
)
if inference_server == "openai":
response = openai.Completion.create(
model=base_model,
prompt=prompt,
**gen_server_kwargs,
stop=stop_sequences,
stream=stream_output,
)
if not stream_output:
text = response["choices"][0]["text"]
yield dict(
response=prompter.get_response(
prompt + text,
prompt=prompt,
sanitize_bot_response=sanitize_bot_response,
),
sources="",
)
else:
collected_events = []
text = ""
for event in response:
collected_events.append(
event
) # save the event response
event_text = event["choices"][0][
"text"
] # extract the text
text += event_text # append the text
yield dict(
response=prompter.get_response(
prompt + text,
prompt=prompt,
sanitize_bot_response=sanitize_bot_response,
),
sources="",
)
elif inference_server == "openai_chat":
response = openai.ChatCompletion.create(
model=base_model,
messages=[
{
"role": "system",
"content": "You are a helpful assistant.",
},
{
"role": "user",
"content": prompt,
},
],
stream=stream_output,
**gen_server_kwargs,
)
if not stream_output:
text = response["choices"][0]["message"]["content"]
yield dict(
response=prompter.get_response(
prompt + text,
prompt=prompt,
sanitize_bot_response=sanitize_bot_response,
),
sources="",
)
else:
text = ""
for chunk in response:
delta = chunk["choices"][0]["delta"]
if "content" in delta:
text += delta["content"]
yield dict(
response=prompter.get_response(
prompt + text,
prompt=prompt,
sanitize_bot_response=sanitize_bot_response,
),
sources="",
)
else:
raise RuntimeError(
"No such OpenAI mode: %s" % inference_server
)
elif inference_server.startswith("http"):
inference_server, headers = get_hf_server(inference_server)
from gradio_utils.grclient import GradioClient
from text_generation import Client as HFClient
if isinstance(model, GradioClient):
gr_client = model
hf_client = None
elif isinstance(model, HFClient):
gr_client = None
hf_client = model
else:
(
inference_server,
gr_client,
hf_client,
) = self.get_client_from_inference_server(
inference_server, base_model=base_model
)
# quick sanity check to avoid long timeouts, just see if can reach server
requests.get(
inference_server,
timeout=int(os.getenv("REQUEST_TIMEOUT_FAST", "10")),
)
if gr_client is not None:
# Note: h2oGPT gradio server could handle input token size issues for prompt,
# but best to handle here so send less data to server
chat_client = False
where_from = "gr_client"
client_langchain_mode = "Disabled"
client_langchain_action = LangChainAction.QUERY.value
gen_server_kwargs = dict(
temperature=temperature,
top_p=top_p,
top_k=top_k,
num_beams=num_beams,
max_new_tokens=max_new_tokens,
min_new_tokens=min_new_tokens,
early_stopping=early_stopping,
max_time=max_time,
repetition_penalty=repetition_penalty,
num_return_sequences=num_return_sequences,
do_sample=do_sample,
chat=chat_client,
)
# account for gradio into gradio that handles prompting, avoid duplicating prompter prompt injection
if prompt_type in [
None,
"",
PromptType.plain.name,
PromptType.plain.value,
str(PromptType.plain.value),
]:
# if our prompt is plain, assume either correct or gradio server knows different prompt type,
# so pass empty prompt_Type
gr_prompt_type = ""
gr_prompt_dict = ""
gr_prompt = prompt # already prepared prompt
gr_context = ""
gr_iinput = ""
else:
# if already have prompt_type that is not plain, None, or '', then already applied some prompting
# But assume server can handle prompting, and need to avoid double-up.
# Also assume server can do better job of using stopping.py to stop early, so avoid local prompting, let server handle
# So avoid "prompt" and let gradio server reconstruct from prompt_type we passed
# Note it's ok that prompter.get_response() has prompt+text, prompt=prompt passed,
# because that just means extra processing and removal of the prompt; the fact that it
# has no human-bot prompting doesn't matter since those tokens won't appear
gr_context = context
gr_prompt = instruction
gr_iinput = iinput
gr_prompt_type = prompt_type
gr_prompt_dict = prompt_dict
client_kwargs = dict(
instruction=gr_prompt
if chat_client
else "", # only for chat=True
iinput=gr_iinput, # only for chat=True
context=gr_context,
# streaming output is supported, loops over and outputs each generation in streaming mode
# but leave stream_output=False for simple input/output mode
stream_output=stream_output,
**gen_server_kwargs,
prompt_type=gr_prompt_type,
prompt_dict=gr_prompt_dict,
instruction_nochat=gr_prompt
if not chat_client
else "",
iinput_nochat=gr_iinput, # only for chat=False
langchain_mode=client_langchain_mode,
langchain_action=client_langchain_action,
top_k_docs=top_k_docs,
chunk=chunk,
chunk_size=chunk_size,
document_choice=[DocumentChoices.All_Relevant.name],
)
api_name = "/submit_nochat_api" # NOTE: like submit_nochat but stable API for string dict passing
if not stream_output:
res = gr_client.predict(
str(dict(client_kwargs)), api_name=api_name
)
res_dict = ast.literal_eval(res)
text = res_dict["response"]
sources = res_dict["sources"]
yield dict(
response=prompter.get_response(
prompt + text,
prompt=prompt,
sanitize_bot_response=sanitize_bot_response,
),
sources=sources,
)
else:
job = gr_client.submit(
str(dict(client_kwargs)), api_name=api_name
)
text = ""
sources = ""
res_dict = dict(response=text, sources=sources)
while not job.done():
outputs_list = job.communicator.job.outputs
if outputs_list:
res = job.communicator.job.outputs[-1]
res_dict = ast.literal_eval(res)
text = res_dict["response"]
sources = res_dict["sources"]
if gr_prompt_type == "plain":
# then gradio server passes back full prompt + text
prompt_and_text = text
else:
prompt_and_text = prompt + text
yield dict(
response=prompter.get_response(
prompt_and_text,
prompt=prompt,
sanitize_bot_response=sanitize_bot_response,
),
sources=sources,
)
time.sleep(0.01)
# ensure get last output to avoid race
res_all = job.outputs()
if len(res_all) > 0:
res = res_all[-1]
res_dict = ast.literal_eval(res)
text = res_dict["response"]
sources = res_dict["sources"]
else:
# go with old text if last call didn't work
e = job.future._exception
if e is not None:
stre = str(e)
strex = "".join(
traceback.format_tb(e.__traceback__)
)
else:
stre = ""
strex = ""
print(
"Bad final response: %s %s %s %s %s: %s %s"
% (
base_model,
inference_server,
res_all,
prompt,
text,
stre,
strex,
),
flush=True,
)
if gr_prompt_type == "plain":
# then gradio server passes back full prompt + text
prompt_and_text = text
else:
prompt_and_text = prompt + text
yield dict(
response=prompter.get_response(
prompt_and_text,
prompt=prompt,
sanitize_bot_response=sanitize_bot_response,
),
sources=sources,
)
elif hf_client:
# HF inference server needs control over input tokens
where_from = "hf_client"
# prompt must include all human-bot like tokens, already added by prompt
# https://github.com/huggingface/text-generation-inference/tree/main/clients/python#types
stop_sequences = list(
set(
prompter.terminate_response
+ [prompter.PreResponse]
)
)
stop_sequences = [x for x in stop_sequences if x]
gen_server_kwargs = dict(
do_sample=do_sample,
max_new_tokens=max_new_tokens,
# best_of=None,
repetition_penalty=repetition_penalty,
return_full_text=True,
seed=SEED,
stop_sequences=stop_sequences,
temperature=temperature,
top_k=top_k,
top_p=top_p,
# truncate=False, # behaves oddly
# typical_p=top_p,
# watermark=False,
# decoder_input_details=False,
)
# work-around for timeout at constructor time, will be issue if multi-threading,
# so just do something reasonable or max_time if larger
# lower bound because client is re-used if multi-threading
hf_client.timeout = max(300, max_time)
if not stream_output:
text = hf_client.generate(
prompt, **gen_server_kwargs
).generated_text
yield dict(
response=prompter.get_response(
text,
prompt=prompt,
sanitize_bot_response=sanitize_bot_response,
),
sources="",
)
else:
text = ""
for response in hf_client.generate_stream(
prompt, **gen_server_kwargs
):
if not response.token.special:
# stop_sequences
text_chunk = response.token.text
text += text_chunk
yield dict(
response=prompter.get_response(
prompt + text,
prompt=prompt,
sanitize_bot_response=sanitize_bot_response,
),
sources="",
)
else:
raise RuntimeError(
"Failed to get client: %s" % inference_server
)
else:
raise RuntimeError(
"No such inference_server %s" % inference_server
)
if save_dir and text:
# save prompt + new text
extra_dict = gen_server_kwargs.copy()
extra_dict.update(
dict(
inference_server=inference_server,
num_prompt_tokens=num_prompt_tokens,
)
)
save_generate_output(
prompt=prompt,
output=text,
base_model=base_model,
save_dir=save_dir,
where_from=where_from,
extra_dict=extra_dict,
)
return
else:
assert not inference_server, (
"inferene_server=%s not supported" % inference_server
)
if isinstance(tokenizer, str):
# pipeline
if tokenizer == "summarization":
key = "summary_text"
else:
raise RuntimeError("No such task type %s" % tokenizer)
# NOTE: uses max_length only
yield dict(
response=model(prompt, max_length=max_new_tokens)[0][key],
sources="",
)
if "mbart-" in base_model.lower():
assert src_lang is not None
tokenizer.src_lang = self.languages_covered()[src_lang]
stopping_criteria = get_stopping(
prompt_type,
prompt_dict,
tokenizer,
self.device,
model_max_length=tokenizer.model_max_length,
)
print(prompt)
# exit(0)
inputs = tokenizer(prompt, return_tensors="pt")
if debug and len(inputs["input_ids"]) > 0:
print("input_ids length", len(inputs["input_ids"][0]), flush=True)
input_ids = inputs["input_ids"].to(self.device)
# CRITICAL LIMIT else will fail
max_max_tokens = tokenizer.model_max_length
max_input_tokens = max_max_tokens - min_new_tokens
# NOTE: Don't limit up front due to max_new_tokens, let go up to max or reach max_max_tokens in stopping.py
input_ids = input_ids[:, -max_input_tokens:]
# required for falcon if multiple threads or asyncio accesses to model during generation
if use_cache is None:
use_cache = False if "falcon" in base_model else True
gen_config_kwargs = dict(
temperature=float(temperature),
top_p=float(top_p),
top_k=top_k,
num_beams=num_beams,
do_sample=do_sample,
repetition_penalty=float(repetition_penalty),
num_return_sequences=num_return_sequences,
renormalize_logits=True,
remove_invalid_values=True,
use_cache=use_cache,
)
token_ids = [
"eos_token_id",
"pad_token_id",
"bos_token_id",
"cls_token_id",
"sep_token_id",
]
for token_id in token_ids:
if (
hasattr(tokenizer, token_id)
and getattr(tokenizer, token_id) is not None
):
gen_config_kwargs.update(
{token_id: getattr(tokenizer, token_id)}
)
generation_config = GenerationConfig(**gen_config_kwargs)
gen_kwargs = dict(
input_ids=input_ids,
generation_config=generation_config,
return_dict_in_generate=True,
output_scores=True,
max_new_tokens=max_new_tokens, # prompt + new
min_new_tokens=min_new_tokens, # prompt + new
early_stopping=early_stopping, # False, True, "never"
max_time=max_time,
stopping_criteria=stopping_criteria,
)
if "gpt2" in base_model.lower():
gen_kwargs.update(
dict(
bos_token_id=tokenizer.bos_token_id,
pad_token_id=tokenizer.eos_token_id,
)
)
elif "mbart-" in base_model.lower():
assert tgt_lang is not None
tgt_lang = self.languages_covered()[tgt_lang]
gen_kwargs.update(
dict(forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang])
)
else:
token_ids = ["eos_token_id", "bos_token_id", "pad_token_id"]
for token_id in token_ids:
if (
hasattr(tokenizer, token_id)
and getattr(tokenizer, token_id) is not None
):
gen_kwargs.update({token_id: getattr(tokenizer, token_id)})
decoder_kwargs = dict(
skip_special_tokens=True, clean_up_tokenization_spaces=True
)
decoder = functools.partial(tokenizer.decode, **decoder_kwargs)
decoder_raw_kwargs = dict(
skip_special_tokens=False, clean_up_tokenization_spaces=True
)
decoder_raw = functools.partial(tokenizer.decode, **decoder_raw_kwargs)
with torch.no_grad():
have_lora_weights = lora_weights not in [no_lora_str, "", None]
context_class_cast = (
NullContext
if self.device == "cpu"
or have_lora_weights
or self.device == "mps"
else torch.autocast
)
with context_class_cast(self.device):
# protection for gradio not keeping track of closed users,
# else hit bitsandbytes lack of thread safety:
# https://github.com/h2oai/h2ogpt/issues/104
# but only makes sense if concurrency_count == 1
context_class = NullContext # if concurrency_count > 1 else filelock.FileLock
if verbose:
print("Pre-Generate: %s" % str(datetime.now()), flush=True)
decoded_output = None
with context_class("generate.lock"):
if verbose:
print("Generate: %s" % str(datetime.now()), flush=True)
# decoded tokenized prompt can deviate from prompt due to special characters
inputs_decoded = decoder(input_ids[0])
inputs_decoded_raw = decoder_raw(input_ids[0])
if inputs_decoded == prompt:
# normal
pass
elif inputs_decoded.lstrip() == prompt.lstrip():
# sometimes extra space in front, make prompt same for prompt removal
prompt = inputs_decoded
elif inputs_decoded_raw == prompt:
# some models specify special tokens that are part of normal prompt, so can't skip them
inputs_decoded = prompt = inputs_decoded_raw
decoder = decoder_raw
decoder_kwargs = decoder_raw_kwargs
elif inputs_decoded_raw.replace("<unk> ", "").replace(
"<unk>", ""
).replace("\n", " ").replace(" ", "") == prompt.replace(
"\n", " "
).replace(
" ", ""
):
inputs_decoded = prompt = inputs_decoded_raw
decoder = decoder_raw
decoder_kwargs = decoder_raw_kwargs
else:
if verbose:
print(
"WARNING: Special characters in prompt",
flush=True,
)
if stream_output:
skip_prompt = False
streamer = H2OTextIteratorStreamer(
tokenizer,
skip_prompt=skip_prompt,
block=False,
**decoder_kwargs,
)
gen_kwargs.update(dict(streamer=streamer))
target = wrapped_partial(
self.generate_with_exceptions,
model.generate,
prompt=prompt,
inputs_decoded=inputs_decoded,
raise_generate_gpu_exceptions=raise_generate_gpu_exceptions,
**gen_kwargs,
)
bucket = queue.Queue()
thread = EThread(
target=target, streamer=streamer, bucket=bucket
)
thread.start()
outputs = ""
try:
for new_text in streamer:
if bucket.qsize() > 0 or thread.exc:
thread.join()
outputs += new_text
yield dict(
response=prompter.get_response(
outputs,
prompt=inputs_decoded,
sanitize_bot_response=sanitize_bot_response,
),
sources="",
)
except BaseException:
# if any exception, raise that exception if was from thread, first
if thread.exc:
raise thread.exc
raise
finally:
# clear before return, since .then() never done if from API
clear_torch_cache()
# in case no exception and didn't join with thread yet, then join
if not thread.exc:
thread.join()
# in case raise StopIteration or broke queue loop in streamer, but still have exception
if thread.exc:
raise thread.exc
decoded_output = outputs
else:
try:
outputs = model.generate(**gen_kwargs)
finally:
clear_torch_cache() # has to be here for API submit_nochat_api since.then() not called
outputs = [decoder(s) for s in outputs.sequences]
yield dict(
response=prompter.get_response(
outputs,
prompt=inputs_decoded,
sanitize_bot_response=sanitize_bot_response,
),
sources="",
)
if outputs and len(outputs) >= 1:
decoded_output = prompt + outputs[0]
if save_dir and decoded_output:
extra_dict = gen_config_kwargs.copy()
extra_dict.update(
dict(num_prompt_tokens=num_prompt_tokens)
)
save_generate_output(
prompt=prompt,
output=decoded_output,
base_model=base_model,
save_dir=save_dir,
where_from="evaluate_%s" % str(stream_output),
extra_dict=gen_config_kwargs,
)
if verbose:
print(
"Post-Generate: %s decoded_output: %s"
% (
str(datetime.now()),
len(decoded_output) if decoded_output else -1,
),
flush=True,
)
return outputs[0]
return out
inputs_list_names = list(inspect.signature(evaluate).parameters)
global inputs_kwargs_list


@@ -436,7 +436,7 @@ class GradioInference(LLM):
chat_client: bool = False
return_full_text: bool = True
stream: bool = False
stream_output: bool = Field(False, alias="stream")
sanitize_bot_response: bool = False
prompter: Any = None
@@ -481,7 +481,7 @@ class GradioInference(LLM):
# so server should get prompt_type or '', not plain
# This is good, so gradio server can also handle stopping.py conditions
# this is different than TGI server that uses prompter to inject prompt_type prompting
stream_output = self.stream
stream_output = self.stream_output
gr_client = self.client
client_langchain_mode = "Disabled"
client_langchain_action = LangChainAction.QUERY.value
@@ -596,7 +596,7 @@ class H2OHuggingFaceTextGenInference(HuggingFaceTextGenInference):
inference_server_url: str = ""
timeout: int = 300
headers: dict = None
stream: bool = False
stream_output: bool = Field(False, alias="stream")
sanitize_bot_response: bool = False
prompter: Any = None
tokenizer: Any = None
@@ -663,7 +663,7 @@ class H2OHuggingFaceTextGenInference(HuggingFaceTextGenInference):
# lower bound because client is re-used if multi-threading
self.client.timeout = max(300, self.timeout)
if not self.stream:
if not self.stream_output:
res = self.client.generate(
prompt,
**gen_server_kwargs,
@@ -852,7 +852,7 @@ def get_llm(
top_p=top_p,
# typical_p=top_p,
callbacks=callbacks if stream_output else None,
stream=stream_output,
stream_output=stream_output,
prompter=prompter,
tokenizer=tokenizer,
client=hf_client,
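# A minimal, self-contained sketch (an assumption for illustration, not code from this
# repository, and assuming pydantic v1 semantics) of the Field alias pattern introduced
# above: the attribute is named stream_output, while callers may still pass stream=...
# through the alias.
from pydantic import BaseModel, Field


class _StreamFlagSketch(BaseModel):
    stream_output: bool = Field(False, alias="stream")

    class Config:
        allow_population_by_field_name = True


assert _StreamFlagSketch(stream=True).stream_output is True
assert _StreamFlagSketch(stream_output=True).stream_output is True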
@@ -2510,8 +2510,7 @@ def _run_qa_db(
formatted_doc_chunks = "\n\n".join(
[get_url(x) + "\n\n" + x.page_content for x in docs]
)
yield formatted_doc_chunks, ""
return
return formatted_doc_chunks, ""
if not docs and langchain_action in [
LangChainAction.SUMMARIZE_MAP.value,
LangChainAction.SUMMARIZE_ALL.value,
@@ -2523,8 +2522,7 @@ def _run_qa_db(
else "No documents to summarize."
)
extra = ""
yield ret, extra
return
return ret, extra
if not docs and langchain_mode not in [
LangChainMode.DISABLED.value,
LangChainMode.CHAT_LLM.value,
@@ -2536,8 +2534,7 @@ def _run_qa_db(
else "No documents to query."
)
extra = ""
yield ret, extra
return
return ret, extra
if chain is None and model_name not in non_hf_types:
# here if no docs at all and not HF type
@@ -2557,22 +2554,7 @@ def _run_qa_db(
)
with context_class_cast(args.device):
answer = chain()
if not use_context:
ret = answer["output_text"]
extra = ""
yield ret, extra
elif answer is not None:
ret, extra = get_sources_answer(
query,
answer,
scores,
show_rank,
answer_with_sources,
verbose=verbose,
)
yield ret, extra
return
return answer
def get_similarity_chain(

View File

@@ -3,13 +3,11 @@ from apps.stable_diffusion.src.utils.utils import _compile_module
from io import BytesIO
import torch_mlir
from transformers import TextGenerationPipeline
from transformers.pipelines.text_generation import ReturnType
from stopping import get_stopping
from prompter import Prompter, PromptType
from transformers import TextGenerationPipeline
from transformers.pipelines.text_generation import ReturnType
from transformers.generation import (
GenerationConfig,
LogitsProcessorList,
@@ -22,23 +20,17 @@ import gc
from pathlib import Path
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_public_file
from shark.shark_importer import import_with_fx
from shark.shark_importer import import_with_fx, save_mlir
from apps.stable_diffusion.src import args
# Brevitas
from typing import List, Tuple
from brevitas_examples.llm.llm_quant.quantize import quantize_model
from brevitas_examples.common.generative.quantize import quantize_model
from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
def brevitasmatmul_rhs_group_quant〡shape(
lhs: List[int],
rhs: List[int],
rhs_scale: List[int],
rhs_zero_point: List[int],
rhs_bit_width: int,
rhs_group_size: int,
) -> List[int]:
# fmt: off
def quantmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rhs_scale: List[int], rhs_zero_point: List[int], rhs_bit_width: int, rhs_group_size: int) -> List[int]:
if len(lhs) == 3 and len(rhs) == 2:
return [lhs[0], lhs[1], rhs[0]]
elif len(lhs) == 2 and len(rhs) == 2:
@@ -47,30 +39,21 @@ def brevitasmatmul_rhs_group_quant〡shape(
raise ValueError("Input shapes not supported.")
def brevitasmatmul_rhs_group_quant〡dtype(
lhs_rank_dtype: Tuple[int, int],
rhs_rank_dtype: Tuple[int, int],
rhs_scale_rank_dtype: Tuple[int, int],
rhs_zero_point_rank_dtype: Tuple[int, int],
rhs_bit_width: int,
rhs_group_size: int,
) -> int:
def quantmatmul_rhs_group_quant〡dtype(lhs_rank_dtype: Tuple[int, int], rhs_rank_dtype: Tuple[int, int], rhs_scale_rank_dtype: Tuple[int, int], rhs_zero_point_rank_dtype: Tuple[int, int], rhs_bit_width: int, rhs_group_size: int) -> int:
# output dtype is the dtype of the lhs float input
lhs_rank, lhs_dtype = lhs_rank_dtype
return lhs_dtype
def brevitasmatmul_rhs_group_quant〡has_value_semantics(
lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size
) -> None:
def quantmatmul_rhs_group_quant〡has_value_semantics(lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size) -> None:
return
brevitas_matmul_rhs_group_quant_library = [
brevitasmatmul_rhs_group_quant〡shape,
brevitasmatmul_rhs_group_quant〡dtype,
brevitasmatmul_rhs_group_quant〡has_value_semantics,
]
quantmatmul_rhs_group_quant〡shape,
quantmatmul_rhs_group_quant〡dtype,
quantmatmul_rhs_group_quant〡has_value_semantics]
# fmt: on
global_device = "cuda"
global_precision = "fp16"
@@ -118,7 +101,7 @@ class H2OGPTModel(torch.nn.Module):
dtype=torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=128,
@@ -246,7 +229,7 @@ class H2OGPTSHARKModel(torch.nn.Module):
ts_graph,
[*h2ogptCompileInput],
output_type=torch_mlir.OutputType.TORCH,
backend_legal_ops=["brevitas.matmul_rhs_group_quant"],
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
@@ -254,7 +237,7 @@ class H2OGPTSHARKModel(torch.nn.Module):
print(f"[DEBUG] converting torch to linalg")
run_pipeline_with_repro_report(
module,
"builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
"builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
)
else:
@@ -273,6 +256,11 @@ class H2OGPTSHARKModel(torch.nn.Module):
bytecode = bytecode_stream.getvalue()
del module
bytecode = save_mlir(
bytecode,
model_name=f"h2ogpt_{precision}",
frontend="torch",
)
return bytecode
def forward(self, input_ids, attention_mask):
@@ -285,7 +273,215 @@ class H2OGPTSHARKModel(torch.nn.Module):
return result
h2ogpt_model = H2OGPTSHARKModel()
def decode_tokens(tokenizer, res_tokens):
for i in range(len(res_tokens)):
if type(res_tokens[i]) != int:
res_tokens[i] = int(res_tokens[i][0])
res_str = tokenizer.decode(res_tokens, skip_special_tokens=True)
return res_str
def generate_token(h2ogpt_shark_model, model, tokenizer, **generate_kwargs):
del generate_kwargs["max_time"]
generate_kwargs["input_ids"] = generate_kwargs["input_ids"].to(
device=tensor_device
)
generate_kwargs["attention_mask"] = generate_kwargs["attention_mask"].to(
device=tensor_device
)
truncated_input_ids = []
stopping_criteria = generate_kwargs["stopping_criteria"]
generation_config_ = GenerationConfig.from_model_config(model.config)
generation_config = copy.deepcopy(generation_config_)
model_kwargs = generation_config.update(**generate_kwargs)
logits_processor = LogitsProcessorList()
stopping_criteria = (
stopping_criteria
if stopping_criteria is not None
else StoppingCriteriaList()
)
eos_token_id = generation_config.eos_token_id
generation_config.pad_token_id = eos_token_id
(
inputs_tensor,
model_input_name,
model_kwargs,
) = model._prepare_model_inputs(
None, generation_config.bos_token_id, model_kwargs
)
model_kwargs["output_attentions"] = generation_config.output_attentions
model_kwargs[
"output_hidden_states"
] = generation_config.output_hidden_states
model_kwargs["use_cache"] = generation_config.use_cache
input_ids = (
inputs_tensor
if model_input_name == "input_ids"
else model_kwargs.pop("input_ids")
)
input_ids_seq_length = input_ids.shape[-1]
generation_config.max_length = (
generation_config.max_new_tokens + input_ids_seq_length
)
logits_processor = model._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids_seq_length,
encoder_input_ids=inputs_tensor,
prefix_allowed_tokens_fn=None,
logits_processor=logits_processor,
)
stopping_criteria = model._get_stopping_criteria(
generation_config=generation_config,
stopping_criteria=stopping_criteria,
)
logits_warper = model._get_logits_warper(generation_config)
(
input_ids,
model_kwargs,
) = model._expand_inputs_for_generation(
input_ids=input_ids,
expand_size=generation_config.num_return_sequences, # 1
is_encoder_decoder=model.config.is_encoder_decoder, # False
**model_kwargs,
)
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
eos_token_id_tensor = (
torch.tensor(eos_token_id).to(device=tensor_device)
if eos_token_id is not None
else None
)
pad_token_id = generation_config.pad_token_id
eos_token_id = eos_token_id
output_scores = generation_config.output_scores # False
return_dict_in_generate = (
generation_config.return_dict_in_generate # False
)
# init attention / hidden states / scores tuples
scores = () if (return_dict_in_generate and output_scores) else None
# keep track of which sequences are already finished
unfinished_sequences = torch.ones(
input_ids.shape[0],
dtype=torch.long,
device=input_ids.device,
)
timesRan = 0
import time
start = time.time()
print("\n")
res_tokens = []
while True:
model_inputs = model.prepare_inputs_for_generation(
input_ids, **model_kwargs
)
outputs = h2ogpt_shark_model.forward(
model_inputs["input_ids"], model_inputs["attention_mask"]
)
if args.precision == "fp16":
outputs = outputs.to(dtype=torch.float32)
next_token_logits = outputs
# pre-process distribution
next_token_scores = logits_processor(input_ids, next_token_logits)
next_token_scores = logits_warper(input_ids, next_token_scores)
# sample
probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
# finished sentences should have their next token be a padding token
if eos_token_id is not None:
if pad_token_id is None:
raise ValueError(
"If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
)
next_token = next_token * unfinished_sequences + pad_token_id * (
1 - unfinished_sequences
)
input_ids = torch.cat([input_ids, next_token[:, None]], dim=-1)
model_kwargs["past_key_values"] = None
if "attention_mask" in model_kwargs:
attention_mask = model_kwargs["attention_mask"]
model_kwargs["attention_mask"] = torch.cat(
[
attention_mask,
attention_mask.new_ones((attention_mask.shape[0], 1)),
],
dim=-1,
)
truncated_input_ids.append(input_ids[:, 0])
input_ids = input_ids[:, 1:]
model_kwargs["attention_mask"] = model_kwargs["attention_mask"][:, 1:]
new_word = tokenizer.decode(
next_token.cpu().numpy(),
add_special_tokens=False,
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)
res_tokens.append(next_token)
if new_word == "<0x0A>":
print("\n", end="", flush=True)
else:
print(f"{new_word}", end=" ", flush=True)
part_str = decode_tokens(tokenizer, res_tokens)
yield part_str
# if eos_token was found in one sentence, set sentence to finished
if eos_token_id_tensor is not None:
unfinished_sequences = unfinished_sequences.mul(
next_token.tile(eos_token_id_tensor.shape[0], 1)
.ne(eos_token_id_tensor.unsqueeze(1))
.prod(dim=0)
)
# stop when each sentence is finished
if unfinished_sequences.max() == 0 or stopping_criteria(
input_ids, scores
):
break
timesRan = timesRan + 1
end = time.time()
print(
"\n\nTime taken is {:.2f} seconds/token\n".format(
(end - start) / timesRan
)
)
torch.cuda.empty_cache()
gc.collect()
res_str = decode_tokens(tokenizer, res_tokens)
yield res_str
def pad_or_truncate_inputs(
@@ -498,233 +694,6 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
)
return records
def generate_new_token(self):
model_inputs = self.model.prepare_inputs_for_generation(
self.input_ids, **self.model_kwargs
)
outputs = h2ogpt_model.forward(
model_inputs["input_ids"], model_inputs["attention_mask"]
)
if args.precision == "fp16":
outputs = outputs.to(dtype=torch.float32)
next_token_logits = outputs
# pre-process distribution
next_token_scores = self.logits_processor(
self.input_ids, next_token_logits
)
next_token_scores = self.logits_warper(
self.input_ids, next_token_scores
)
# sample
probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
# finished sentences should have their next token be a padding token
if self.eos_token_id is not None:
if self.pad_token_id is None:
raise ValueError(
"If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
)
next_token = (
next_token * self.unfinished_sequences
+ self.pad_token_id * (1 - self.unfinished_sequences)
)
self.input_ids = torch.cat(
[self.input_ids, next_token[:, None]], dim=-1
)
self.model_kwargs["past_key_values"] = None
if "attention_mask" in self.model_kwargs:
attention_mask = self.model_kwargs["attention_mask"]
self.model_kwargs["attention_mask"] = torch.cat(
[
attention_mask,
attention_mask.new_ones((attention_mask.shape[0], 1)),
],
dim=-1,
)
self.truncated_input_ids.append(self.input_ids[:, 0])
self.input_ids = self.input_ids[:, 1:]
self.model_kwargs["attention_mask"] = self.model_kwargs[
"attention_mask"
][:, 1:]
return next_token
def generate_token(self, **generate_kwargs):
del generate_kwargs["max_time"]
self.truncated_input_ids = []
generation_config_ = GenerationConfig.from_model_config(
self.model.config
)
generation_config = copy.deepcopy(generation_config_)
self.model_kwargs = generation_config.update(**generate_kwargs)
logits_processor = LogitsProcessorList()
self.stopping_criteria = (
self.stopping_criteria
if self.stopping_criteria is not None
else StoppingCriteriaList()
)
eos_token_id = generation_config.eos_token_id
generation_config.pad_token_id = eos_token_id
(
inputs_tensor,
model_input_name,
self.model_kwargs,
) = self.model._prepare_model_inputs(
None, generation_config.bos_token_id, self.model_kwargs
)
batch_size = inputs_tensor.shape[0]
self.model_kwargs[
"output_attentions"
] = generation_config.output_attentions
self.model_kwargs[
"output_hidden_states"
] = generation_config.output_hidden_states
self.model_kwargs["use_cache"] = generation_config.use_cache
self.input_ids = (
inputs_tensor
if model_input_name == "input_ids"
else self.model_kwargs.pop("input_ids")
)
input_ids_seq_length = self.input_ids.shape[-1]
generation_config.max_length = (
generation_config.max_new_tokens + input_ids_seq_length
)
self.logits_processor = self.model._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids_seq_length,
encoder_input_ids=inputs_tensor,
prefix_allowed_tokens_fn=None,
logits_processor=logits_processor,
)
self.stopping_criteria = self.model._get_stopping_criteria(
generation_config=generation_config,
stopping_criteria=self.stopping_criteria,
)
self.logits_warper = self.model._get_logits_warper(generation_config)
(
self.input_ids,
self.model_kwargs,
) = self.model._expand_inputs_for_generation(
input_ids=self.input_ids,
expand_size=generation_config.num_return_sequences, # 1
is_encoder_decoder=self.model.config.is_encoder_decoder, # False
**self.model_kwargs,
)
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
self.eos_token_id_tensor = (
torch.tensor(eos_token_id).to(device=tensor_device)
if eos_token_id is not None
else None
)
self.pad_token_id = generation_config.pad_token_id
self.eos_token_id = eos_token_id
output_scores = generation_config.output_scores # False
output_attentions = generation_config.output_attentions # False
output_hidden_states = generation_config.output_hidden_states # False
return_dict_in_generate = (
generation_config.return_dict_in_generate # False
)
# init attention / hidden states / scores tuples
self.scores = (
() if (return_dict_in_generate and output_scores) else None
)
decoder_attentions = (
() if (return_dict_in_generate and output_attentions) else None
)
cross_attentions = (
() if (return_dict_in_generate and output_attentions) else None
)
decoder_hidden_states = (
() if (return_dict_in_generate and output_hidden_states) else None
)
# keep track of which sequences are already finished
self.unfinished_sequences = torch.ones(
self.input_ids.shape[0],
dtype=torch.long,
device=self.input_ids.device,
)
timesRan = 0
import time
start = time.time()
print("\n")
while True:
next_token = self.generate_new_token()
new_word = self.tokenizer.decode(
next_token.cpu().numpy(),
add_special_tokens=False,
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)
print(f"{new_word}", end="", flush=True)
# if eos_token was found in one sentence, set sentence to finished
if self.eos_token_id_tensor is not None:
self.unfinished_sequences = self.unfinished_sequences.mul(
next_token.tile(self.eos_token_id_tensor.shape[0], 1)
.ne(self.eos_token_id_tensor.unsqueeze(1))
.prod(dim=0)
)
# stop when each sentence is finished
if (
self.unfinished_sequences.max() == 0
or self.stopping_criteria(self.input_ids, self.scores)
):
break
timesRan = timesRan + 1
end = time.time()
print(
"\n\nTime taken is {:.2f} seconds/token\n".format(
(end - start) / timesRan
)
)
self.input_ids = torch.cat(
[
torch.tensor(self.truncated_input_ids)
.to(device=tensor_device)
.unsqueeze(dim=0),
self.input_ids,
],
dim=-1,
)
torch.cuda.empty_cache()
gc.collect()
return self.input_ids
def _forward(self, model_inputs, **generate_kwargs):
if self.can_stop:
stopping_criteria = get_stopping(
@@ -784,19 +753,13 @@ class H2OTextGenerationPipeline(TextGenerationPipeline):
input_ids, attention_mask = pad_or_truncate_inputs(
input_ids, attention_mask, max_padding_length=max_padding_length
)
self.stopping_criteria = generate_kwargs["stopping_criteria"]
generated_sequence = self.generate_token(
input_ids=input_ids,
attention_mask=attention_mask,
**generate_kwargs,
)
out_b = generated_sequence.shape[0]
generated_sequence = generated_sequence.reshape(
in_b, out_b // in_b, *generated_sequence.shape[1:]
)
return {
"generated_sequence": generated_sequence,
return_dict = {
"generated_sequence": generated_sequence,
"model": self.model,
"tokenizer": self.tokenizer,
"input_ids": input_ids,
"prompt_text": prompt_text,
"attention_mask": attention_mask,
"attention_mask": attention_mask,
}
return_dict = {**return_dict, **generate_kwargs}
return return_dict


@@ -65,8 +65,8 @@ tiktoken==0.4.0
openai==0.27.8
# optional for chat with PDF
langchain==0.0.202
pypdf==3.12.2
langchain==0.0.329
pypdf==3.17.0
# avoid textract, requires old six
#textract==1.6.5


@@ -1,5 +1,4 @@
import os
import fire
from gpt_langchain import (
path_to_docs,
@@ -202,7 +201,3 @@ def make_db_main(
if verbose:
print("DONE", flush=True)
return db, collection_name
if __name__ == "__main__":
fire.Fire(make_db_main)


@@ -0,0 +1,442 @@
from pathlib import Path
import argparse
from argparse import RawTextHelpFormatter
import re, gc
"""
This script can be used as a standalone utility to convert IRs to dynamic + combine them.
Following are the various ways this script can be used :-
a. To convert a single Linalg IR to dynamic IR:
--dynamic --first_ir_path=<PATH TO FIRST IR>
b. To convert two Linalg IRs to dynamic IR:
--dynamic --first_ir_path=<PATH TO SECOND IR> --first_ir_path=<PATH TO SECOND IR>
c. To combine two Linalg IRs into one:
--combine --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>
d. To convert both IRs into dynamic as well as combine the IRs:
--dynamic --combine --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>
NOTE: For dynamic you'll also need to provide the following set of flags:-
i. For First Llama : --dynamic_input_size (DEFAULT: 19)
ii. For Second Llama: --model_name (DEFAULT: llama2_7b)
--precision (DEFAULT: 'int4')
You may use --save_dynamic to also save the dynamic IR in option d above.
Otherwise, for options a. and b., the dynamic IR(s) are saved by default.
"""
def combine_mlir_scripts(
first_vicuna_mlir,
second_vicuna_mlir,
output_name,
return_ir=True,
):
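# Merge two Linalg modules into one: rename each module's `forward` to
# first_vicuna_forward / second_vicuna_forward, hoist de-duplicated
# arith.constant ops into shared ml_program globals, and suffix #map aliases
# so the two sets of affine maps do not clash.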
print(f"[DEBUG] combining first and second mlir")
print(f"[DEBUG] output_name = {output_name}")
maps1 = []
maps2 = []
constants = set()
f1 = []
f2 = []
print(f"[DEBUG] processing first vicuna mlir")
first_vicuna_mlir = first_vicuna_mlir.splitlines()
while first_vicuna_mlir:
line = first_vicuna_mlir.pop(0)
if re.search("#map\d*\s*=", line):
maps1.append(line)
elif re.search("arith.constant", line):
constants.add(line)
elif not re.search("module", line):
line = re.sub("forward", "first_vicuna_forward", line)
f1.append(line)
f1 = f1[:-1]
del first_vicuna_mlir
gc.collect()
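# Suffix every #map alias from the first module with "_0" (the second module
# gets "_1" below) so the merged module has unique map names.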
for i, map_line in enumerate(maps1):
map_var = map_line.split(" ")[0]
map_line = re.sub(f"{map_var}(?!\d)", map_var + "_0", map_line)
maps1[i] = map_line
f1 = [
re.sub(f"{map_var}(?!\d)", map_var + "_0", func_line)
for func_line in f1
]
print(f"[DEBUG] processing second vicuna mlir")
second_vicuna_mlir = second_vicuna_mlir.splitlines()
while second_vicuna_mlir:
line = second_vicuna_mlir.pop(0)
if re.search("#map\d*\s*=", line):
maps2.append(line)
elif "global_seed" in line:
continue
elif re.search("arith.constant", line):
constants.add(line)
elif not re.search("module", line):
line = re.sub("forward", "second_vicuna_forward", line)
f2.append(line)
f2 = f2[:-1]
del second_vicuna_mlir
gc.collect()
for i, map_line in enumerate(maps2):
map_var = map_line.split(" ")[0]
map_line = re.sub(f"{map_var}(?!\d)", map_var + "_1", map_line)
maps2[i] = map_line
f2 = [
re.sub(f"{map_var}(?!\d)", map_var + "_1", func_line)
for func_line in f2
]
module_start = 'module attributes {torch.debug_module_name = "_lambda"} {'
module_end = "}"
global_vars = []
vnames = []
global_var_loading1 = []
global_var_loading2 = []
print(f"[DEBUG] processing constants")
counter = 0
constants = list(constants)
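# Turn each de-duplicated `%name = arith.constant ...` into a module-level
# ml_program.global plus a global_load_const at the top of both entry functions.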
while constants:
constant = constants.pop(0)
vname, vbody = constant.split("=")
vname = re.sub("%", "", vname)
vname = vname.strip()
vbody = re.sub("arith.constant", "", vbody)
vbody = vbody.strip()
if len(vbody.split(":")) < 2:
print(constant)
vdtype = vbody.split(":")[-1].strip()
fixed_vdtype = vdtype
if "c1_i64" in vname:
print(constant)
counter += 1
if counter == 2:
counter = 0
print("detected duplicate")
continue
vnames.append(vname)
if "true" not in vname:
global_vars.append(
f"ml_program.global private @{vname}({vbody}) : {fixed_vdtype}"
)
global_var_loading1.append(
f"\t\t%{vname} = ml_program.global_load_const @{vname} : {fixed_vdtype}"
)
global_var_loading2.append(
f"\t\t%{vname} = ml_program.global_load_const @{vname} : {fixed_vdtype}"
)
else:
global_vars.append(
f"ml_program.global private @{vname}({vbody}) : i1"
)
global_var_loading1.append(
f"\t\t%{vname} = ml_program.global_load_const @{vname} : i1"
)
global_var_loading2.append(
f"\t\t%{vname} = ml_program.global_load_const @{vname} : i1"
)
new_f1, new_f2 = [], []
print(f"[DEBUG] processing f1")
for line in f1:
if "func.func" in line:
new_f1.append(line)
for global_var in global_var_loading1:
new_f1.append(global_var)
else:
new_f1.append(line)
print(f"[DEBUG] processing f2")
for line in f2:
if "func.func" in line:
new_f2.append(line)
for global_var in global_var_loading2:
if (
"c20_i64 = arith.addi %dim_i64, %c1_i64 : i64"
in global_var
):
print(global_var)
new_f2.append(global_var)
else:
new_f2.append(line)
f1 = new_f1
f2 = new_f2
del new_f1
del new_f2
gc.collect()
print(
[
"c20_i64 = arith.addi %dim_i64, %c1_i64 : i64" in x
for x in [maps1, maps2, global_vars, f1, f2]
]
)
# doing it this way rather than assembling the whole string
# to prevent OOM with 64GiB RAM when encoding the file.
print(f"[DEBUG] Saving mlir to {output_name}")
with open(output_name, "w+") as f_:
f_.writelines(line + "\n" for line in maps1)
f_.writelines(line + "\n" for line in maps2)
f_.writelines(line + "\n" for line in [module_start])
f_.writelines(line + "\n" for line in global_vars)
f_.writelines(line + "\n" for line in f1)
f_.writelines(line + "\n" for line in f2)
f_.writelines(line + "\n" for line in [module_end])
del maps1
del maps2
del module_start
del global_vars
del f1
del f2
del module_end
gc.collect()
if return_ir:
print(f"[DEBUG] Reading combined mlir back in")
with open(output_name, "rb") as f:
return f.read()
def write_in_dynamic_inputs0(module, dynamic_input_size):
print("[DEBUG] writing dynamic inputs to first vicuna")
# Current solution for ensuring mlir files support dynamic inputs
# TODO: find a more elegant way to implement this
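# Replaces the hard-coded prompt length with "?" and threads a %dim value
# (read from the runtime shape of %arg0) through tensor.empty and arith ops.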
new_lines = []
module = module.splitlines()
while module:
line = module.pop(0)
line = re.sub(f"{dynamic_input_size}x", "?x", line)
if "?x" in line:
line = re.sub("tensor.empty\(\)", "tensor.empty(%dim)", line)
line = re.sub(f" {dynamic_input_size},", " %dim,", line)
if "tensor.empty" in line and "?x?" in line:
line = re.sub(
"tensor.empty\(%dim\)", "tensor.empty(%dim, %dim)", line
)
if "arith.cmpi" in line:
line = re.sub(f"c{dynamic_input_size}", "dim", line)
if "%0 = tensor.empty(%dim) : tensor<?xi64>" in line:
new_lines.append("%dim = tensor.dim %arg0, %c1 : tensor<1x?xi64>")
if "%dim = tensor.dim %arg0, %c1 : tensor<1x?xi64>" in line:
continue
new_lines.append(line)
return "\n".join(new_lines)
def write_in_dynamic_inputs1(module, model_name, precision):
print("[DEBUG] writing dynamic inputs to second vicuna")
def remove_constant_dim(line):
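# 19 is the static sequence length baked into the decode-step IR and 20 (= 19 + 1)
# the length after appending the new token; both are rewritten to SSA dims.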
if "c19_i64" in line:
line = re.sub("c19_i64", "dim_i64", line)
if "19x" in line:
line = re.sub("19x", "?x", line)
line = re.sub("tensor.empty\(\)", "tensor.empty(%dim)", line)
if "tensor.empty" in line and "?x?" in line:
line = re.sub(
"tensor.empty\(%dim\)",
"tensor.empty(%dim, %dim)",
line,
)
if "arith.cmpi" in line:
line = re.sub("c19", "dim", line)
if " 19," in line:
line = re.sub(" 19,", " %dim,", line)
if "x20x" in line or "<20x" in line:
line = re.sub("20x", "?x", line)
line = re.sub("tensor.empty\(\)", "tensor.empty(%dimp1)", line)
if " 20," in line:
line = re.sub(" 20,", " %dimp1,", line)
return line
module = module.splitlines()
new_lines = []
# Using a while loop and the pop method to avoid creating a copy of module
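# Past-KV tensor shape depends on the variant: llama2_13b has 40 KV heads,
# llama2_70b has 8 (grouped-query attention), everything else 32; head_dim is 128.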
if "llama2_13b" in model_name:
pkv_tensor_shape = "tensor<1x40x?x128x"
elif "llama2_70b" in model_name:
pkv_tensor_shape = "tensor<1x8x?x128x"
else:
pkv_tensor_shape = "tensor<1x32x?x128x"
if precision in ["fp16", "int4", "int8"]:
pkv_tensor_shape += "f16>"
else:
pkv_tensor_shape += "f32>"
while module:
line = module.pop(0)
if "%c19_i64 = arith.constant 19 : i64" in line:
new_lines.append("%c2 = arith.constant 2 : index")
new_lines.append(
f"%dim_4_int = tensor.dim %arg1, %c2 : {pkv_tensor_shape}"
)
new_lines.append(
"%dim_i64 = arith.index_cast %dim_4_int : index to i64"
)
continue
if "%c2 = arith.constant 2 : index" in line:
continue
if "%c20_i64 = arith.constant 20 : i64" in line:
new_lines.append("%c1_i64 = arith.constant 1 : i64")
new_lines.append("%c20_i64 = arith.addi %dim_i64, %c1_i64 : i64")
new_lines.append(
"%dimp1 = arith.index_cast %c20_i64 : i64 to index"
)
continue
line = remove_constant_dim(line)
new_lines.append(line)
return "\n".join(new_lines)
def save_dynamic_ir(ir_to_save, output_file):
if not ir_to_save:
return
# We only get string output from the dynamic conversion utility.
from contextlib import redirect_stdout
with open(output_file, "w") as f:
with redirect_stdout(f):
print(ir_to_save)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="llama ir utility",
description="\tThis script can be used as a standalone utility to convert IRs to dynamic + combine them.\n"
+ "\tFollowing are the various ways this script can be used :-\n"
+ "\t\ta. To convert a single Linalg IR to dynamic IR:\n"
+ "\t\t\t--dynamic --first_ir_path=<PATH TO FIRST IR>\n"
+ "\t\tb. To convert two Linalg IRs to dynamic IR:\n"
+ "\t\t\t--dynamic --first_ir_path=<PATH TO SECOND IR> --first_ir_path=<PATH TO SECOND IR>\n"
+ "\t\tc. To combine two Linalg IRs into one:\n"
+ "\t\t\t--combine --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>\n"
+ "\t\td. To convert both IRs into dynamic as well as combine the IRs:\n"
+ "\t\t\t--dynamic --combine --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>\n\n"
+ "\tNOTE: For dynamic you'll also need to provide the following set of flags:-\n"
+ "\t\t i. For First Llama : --dynamic_input_size (DEFAULT: 19)\n"
+ "\t\tii. For Second Llama: --model_name (DEFAULT: llama2_7b)\n"
+ "\t\t\t--precision (DEFAULT: 'int4')\n"
+ "\t You may use --save_dynamic to also save the dynamic IR in option d above.\n"
+ "\t Else for option a. and b. the dynamic IR(s) will get saved by default.\n",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument(
"--precision",
"-p",
default="int4",
choices=["fp32", "fp16", "int8", "int4"],
help="Precision of the concerned IR",
)
parser.add_argument(
"--model_name",
type=str,
default="llama2_7b",
choices=["vicuna", "llama2_7b", "llama2_13b", "llama2_70b"],
help="Specify which model to run.",
)
parser.add_argument(
"--first_ir_path",
default=None,
help="path to first llama mlir file",
)
parser.add_argument(
"--second_ir_path",
default=None,
help="path to second llama mlir file",
)
parser.add_argument(
"--dynamic_input_size",
type=int,
default=19,
help="Specify the static input size to replace with dynamic dim.",
)
parser.add_argument(
"--dynamic",
default=False,
action=argparse.BooleanOptionalAction,
help="Converts the IR(s) to dynamic",
)
parser.add_argument(
"--save_dynamic",
default=False,
action=argparse.BooleanOptionalAction,
help="Save the individual IR(s) after converting to dynamic",
)
parser.add_argument(
"--combine",
default=False,
action=argparse.BooleanOptionalAction,
help="Converts the IR(s) to dynamic",
)
args, unknown = parser.parse_known_args()
dynamic = args.dynamic
combine = args.combine
assert (
dynamic or combine
), "neither `dynamic` nor `combine` flag is turned on"
first_ir_path = args.first_ir_path
second_ir_path = args.second_ir_path
assert first_ir_path or second_ir_path, "no input ir has been provided"
if combine:
assert (
first_ir_path and second_ir_path
), "you will need to provide both IRs to combine"
precision = args.precision
model_name = args.model_name
dynamic_input_size = args.dynamic_input_size
save_dynamic = args.save_dynamic
print(f"Dynamic conversion utility is turned {'ON' if dynamic else 'OFF'}")
print(f"Combining IR utility is turned {'ON' if combine else 'OFF'}")
if dynamic and not combine:
save_dynamic = True
first_ir = None
first_dynamic_ir_name = None
second_ir = None
second_dynamic_ir_name = None
if first_ir_path:
first_dynamic_ir_name = f"{Path(first_ir_path).stem}_dynamic"
with open(first_ir_path, "r") as f:
first_ir = f.read()
if second_ir_path:
second_dynamic_ir_name = f"{Path(second_ir_path).stem}_dynamic"
with open(second_ir_path, "r") as f:
second_ir = f.read()
if dynamic:
first_ir = (
write_in_dynamic_inputs0(first_ir, dynamic_input_size)
if first_ir
else None
)
second_ir = (
write_in_dynamic_inputs1(second_ir, model_name, precision)
if second_ir
else None
)
if save_dynamic:
save_dynamic_ir(first_ir, f"{first_dynamic_ir_name}.mlir")
save_dynamic_ir(second_ir, f"{second_dynamic_ir_name}.mlir")
if combine:
combine_mlir_scripts(
first_ir,
second_ir,
f"{model_name}_{precision}.mlir",
return_ir=False,
)


@@ -46,6 +46,7 @@ def compile_stableLM(
model_vmfb_name,
device="cuda",
precision="fp32",
debug=False,
):
from shark.shark_inference import SharkInference
@@ -92,7 +93,7 @@ def compile_stableLM(
shark_module.compile()
path = shark_module.save_module(
vmfb_path.parent.absolute(), vmfb_path.stem
vmfb_path.parent.absolute(), vmfb_path.stem, debug=debug
)
print("Saved vmfb at ", str(path))

File diff suppressed because it is too large


@@ -0,0 +1,94 @@
# -*- mode: python ; coding: utf-8 -*-
from PyInstaller.utils.hooks import collect_data_files
from PyInstaller.utils.hooks import collect_submodules
from PyInstaller.utils.hooks import copy_metadata
import sys ; sys.setrecursionlimit(sys.getrecursionlimit() * 5)
datas = []
datas += collect_data_files('torch')
datas += copy_metadata('torch')
datas += copy_metadata('tqdm')
datas += copy_metadata('regex')
datas += copy_metadata('requests')
datas += copy_metadata('packaging')
datas += copy_metadata('filelock')
datas += copy_metadata('numpy')
datas += copy_metadata('tokenizers')
datas += copy_metadata('importlib_metadata')
datas += copy_metadata('torch-mlir')
datas += copy_metadata('omegaconf')
datas += copy_metadata('safetensors')
datas += copy_metadata('huggingface-hub')
datas += copy_metadata('sentencepiece')
datas += copy_metadata("pyyaml")
datas += collect_data_files("tokenizers")
datas += collect_data_files("tiktoken")
datas += collect_data_files("accelerate")
datas += collect_data_files('diffusers')
datas += collect_data_files('transformers')
datas += collect_data_files('opencv-python')
datas += collect_data_files('pytorch_lightning')
datas += collect_data_files('skimage')
datas += collect_data_files('gradio')
datas += collect_data_files('gradio_client')
datas += collect_data_files('iree')
datas += collect_data_files('google-cloud-storage')
datas += collect_data_files('py-cpuinfo')
datas += collect_data_files("shark", include_py_files=True)
datas += collect_data_files("timm", include_py_files=True)
datas += collect_data_files("tqdm")
datas += collect_data_files("tkinter")
datas += collect_data_files("webview")
datas += collect_data_files("sentencepiece")
datas += collect_data_files("jsonschema")
datas += collect_data_files("jsonschema_specifications")
datas += collect_data_files("cpuinfo")
datas += collect_data_files("langchain")
binaries = []
block_cipher = None
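# Modules imported dynamically at runtime, which PyInstaller's static analysis
# would otherwise miss.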
hiddenimports = ['shark', 'shark.shark_inference', 'apps']
hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]
a = Analysis(
['scripts/vicuna.py'],
pathex=['.'],
binaries=binaries,
datas=datas,
hiddenimports=hiddenimports,
hookspath=[],
hooksconfig={},
runtime_hooks=[],
excludes=[],
win_no_prefer_redirects=False,
win_private_assemblies=False,
cipher=block_cipher,
noarchive=False,
)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
exe = EXE(
pyz,
a.scripts,
a.binaries,
a.zipfiles,
a.datas,
[],
name='shark_llama_cli',
debug=False,
bootloader_ignore_signals=False,
strip=False,
upx=True,
upx_exclude=[],
runtime_tmpdir=None,
console=True,
disable_windowed_traceback=False,
argv_emulation=False,
target_arch=None,
codesign_identity=None,
entitlements_file=None,
)


@@ -0,0 +1,675 @@
import torch
from typing import Optional, Tuple
class WordEmbeddingsLayer(torch.nn.Module):
def __init__(self, word_embedding_layer):
super().__init__()
self.model = word_embedding_layer
def forward(self, input_ids):
output = self.model.forward(input=input_ids)
return output
class CompiledWordEmbeddingsLayer(torch.nn.Module):
def __init__(self, compiled_word_embedding_layer):
super().__init__()
self.model = compiled_word_embedding_layer
def forward(self, input_ids):
input_ids = input_ids.detach().numpy()
new_input_ids = self.model("forward", input_ids)
new_input_ids = new_input_ids.reshape(
[1, new_input_ids.shape[0], new_input_ids.shape[1]]
)
return torch.tensor(new_input_ids)
class LNFEmbeddingLayer(torch.nn.Module):
def __init__(self, ln_f):
super().__init__()
self.model = ln_f
def forward(self, hidden_states):
output = self.model.forward(input=hidden_states)
return output
class CompiledLNFEmbeddingLayer(torch.nn.Module):
def __init__(self, ln_f):
super().__init__()
self.model = ln_f
def forward(self, hidden_states):
hidden_states = hidden_states.detach().numpy()
new_hidden_states = self.model("forward", (hidden_states,))
return torch.tensor(new_hidden_states)
class LMHeadEmbeddingLayer(torch.nn.Module):
def __init__(self, embedding_layer):
super().__init__()
self.model = embedding_layer
def forward(self, hidden_states):
output = self.model.forward(input=hidden_states)
return output
class CompiledLMHeadEmbeddingLayer(torch.nn.Module):
def __init__(self, lm_head):
super().__init__()
self.model = lm_head
def forward(self, hidden_states):
hidden_states = hidden_states.detach().numpy()
new_hidden_states = self.model("forward", (hidden_states,))
return torch.tensor(new_hidden_states)
class FourWayShardingDecoderLayer(torch.nn.Module):
def __init__(self, decoder_layer_model, falcon_variant):
super().__init__()
self.model = decoder_layer_model
self.falcon_variant = falcon_variant
def forward(self, hidden_states, attention_mask):
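# Run this shard's decoder layers sequentially, threading hidden_states through
# and collecting each layer's (key, value) cache pair (20 layers on this shard).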
new_pkvs = []
for layer in self.model:
outputs = layer(
hidden_states=hidden_states,
alibi=None,
attention_mask=attention_mask,
use_cache=True,
)
hidden_states = outputs[0]
new_pkvs.append(
(
outputs[-1][0],
outputs[-1][1],
)
)
(
(new_pkv00, new_pkv01),
(new_pkv10, new_pkv11),
(new_pkv20, new_pkv21),
(new_pkv30, new_pkv31),
(new_pkv40, new_pkv41),
(new_pkv50, new_pkv51),
(new_pkv60, new_pkv61),
(new_pkv70, new_pkv71),
(new_pkv80, new_pkv81),
(new_pkv90, new_pkv91),
(new_pkv100, new_pkv101),
(new_pkv110, new_pkv111),
(new_pkv120, new_pkv121),
(new_pkv130, new_pkv131),
(new_pkv140, new_pkv141),
(new_pkv150, new_pkv151),
(new_pkv160, new_pkv161),
(new_pkv170, new_pkv171),
(new_pkv180, new_pkv181),
(new_pkv190, new_pkv191),
) = new_pkvs
result = (
hidden_states,
new_pkv00,
new_pkv01,
new_pkv10,
new_pkv11,
new_pkv20,
new_pkv21,
new_pkv30,
new_pkv31,
new_pkv40,
new_pkv41,
new_pkv50,
new_pkv51,
new_pkv60,
new_pkv61,
new_pkv70,
new_pkv71,
new_pkv80,
new_pkv81,
new_pkv90,
new_pkv91,
new_pkv100,
new_pkv101,
new_pkv110,
new_pkv111,
new_pkv120,
new_pkv121,
new_pkv130,
new_pkv131,
new_pkv140,
new_pkv141,
new_pkv150,
new_pkv151,
new_pkv160,
new_pkv161,
new_pkv170,
new_pkv171,
new_pkv180,
new_pkv181,
new_pkv190,
new_pkv191,
)
return result
class CompiledFourWayShardingDecoderLayer(torch.nn.Module):
def __init__(
self, layer_id, device_idx, falcon_variant, device, precision, model
):
super().__init__()
self.layer_id = layer_id
self.device_index = device_idx
self.falcon_variant = falcon_variant
self.device = device
self.precision = precision
self.model = model
def forward(
self,
hidden_states: torch.Tensor,
alibi: Optional[torch.Tensor],
attention_mask: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
use_cache: bool = False,
output_attentions: bool = False,
):
import gc
torch.cuda.empty_cache()
gc.collect()
if self.model is None:
raise ValueError("Layer vmfb not found")
hidden_states = hidden_states.to(torch.float32).detach().numpy()
attention_mask = attention_mask.to(torch.float32).detach().numpy()
if alibi is not None or layer_past is not None:
raise ValueError("Past Key Values and alibi should be None")
else:
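# The compiled vmfb returns a flat tuple: hidden_states followed by 40 tensors,
# re-nested below into 20 (key, value) pairs.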
output = self.model(
"forward",
(
hidden_states,
attention_mask,
),
)
result = (
torch.tensor(output[0]),
(
torch.tensor(output[1]),
torch.tensor(output[2]),
),
(
torch.tensor(output[3]),
torch.tensor(output[4]),
),
(
torch.tensor(output[5]),
torch.tensor(output[6]),
),
(
torch.tensor(output[7]),
torch.tensor(output[8]),
),
(
torch.tensor(output[9]),
torch.tensor(output[10]),
),
(
torch.tensor(output[11]),
torch.tensor(output[12]),
),
(
torch.tensor(output[13]),
torch.tensor(output[14]),
),
(
torch.tensor(output[15]),
torch.tensor(output[16]),
),
(
torch.tensor(output[17]),
torch.tensor(output[18]),
),
(
torch.tensor(output[19]),
torch.tensor(output[20]),
),
(
torch.tensor(output[21]),
torch.tensor(output[22]),
),
(
torch.tensor(output[23]),
torch.tensor(output[24]),
),
(
torch.tensor(output[25]),
torch.tensor(output[26]),
),
(
torch.tensor(output[27]),
torch.tensor(output[28]),
),
(
torch.tensor(output[29]),
torch.tensor(output[30]),
),
(
torch.tensor(output[31]),
torch.tensor(output[32]),
),
(
torch.tensor(output[33]),
torch.tensor(output[34]),
),
(
torch.tensor(output[35]),
torch.tensor(output[36]),
),
(
torch.tensor(output[37]),
torch.tensor(output[38]),
),
(
torch.tensor(output[39]),
torch.tensor(output[40]),
),
)
return result
class TwoWayShardingDecoderLayer(torch.nn.Module):
def __init__(self, decoder_layer_model, falcon_variant):
super().__init__()
self.model = decoder_layer_model
self.falcon_variant = falcon_variant
def forward(self, hidden_states, attention_mask):
new_pkvs = []
for layer in self.model:
outputs = layer(
hidden_states=hidden_states,
alibi=None,
attention_mask=attention_mask,
use_cache=True,
)
hidden_states = outputs[0]
new_pkvs.append(
(
outputs[-1][0],
outputs[-1][1],
)
)
(
(new_pkv00, new_pkv01),
(new_pkv10, new_pkv11),
(new_pkv20, new_pkv21),
(new_pkv30, new_pkv31),
(new_pkv40, new_pkv41),
(new_pkv50, new_pkv51),
(new_pkv60, new_pkv61),
(new_pkv70, new_pkv71),
(new_pkv80, new_pkv81),
(new_pkv90, new_pkv91),
(new_pkv100, new_pkv101),
(new_pkv110, new_pkv111),
(new_pkv120, new_pkv121),
(new_pkv130, new_pkv131),
(new_pkv140, new_pkv141),
(new_pkv150, new_pkv151),
(new_pkv160, new_pkv161),
(new_pkv170, new_pkv171),
(new_pkv180, new_pkv181),
(new_pkv190, new_pkv191),
(new_pkv200, new_pkv201),
(new_pkv210, new_pkv211),
(new_pkv220, new_pkv221),
(new_pkv230, new_pkv231),
(new_pkv240, new_pkv241),
(new_pkv250, new_pkv251),
(new_pkv260, new_pkv261),
(new_pkv270, new_pkv271),
(new_pkv280, new_pkv281),
(new_pkv290, new_pkv291),
(new_pkv300, new_pkv301),
(new_pkv310, new_pkv311),
(new_pkv320, new_pkv321),
(new_pkv330, new_pkv331),
(new_pkv340, new_pkv341),
(new_pkv350, new_pkv351),
(new_pkv360, new_pkv361),
(new_pkv370, new_pkv371),
(new_pkv380, new_pkv381),
(new_pkv390, new_pkv391),
) = new_pkvs
result = (
hidden_states,
new_pkv00,
new_pkv01,
new_pkv10,
new_pkv11,
new_pkv20,
new_pkv21,
new_pkv30,
new_pkv31,
new_pkv40,
new_pkv41,
new_pkv50,
new_pkv51,
new_pkv60,
new_pkv61,
new_pkv70,
new_pkv71,
new_pkv80,
new_pkv81,
new_pkv90,
new_pkv91,
new_pkv100,
new_pkv101,
new_pkv110,
new_pkv111,
new_pkv120,
new_pkv121,
new_pkv130,
new_pkv131,
new_pkv140,
new_pkv141,
new_pkv150,
new_pkv151,
new_pkv160,
new_pkv161,
new_pkv170,
new_pkv171,
new_pkv180,
new_pkv181,
new_pkv190,
new_pkv191,
new_pkv200,
new_pkv201,
new_pkv210,
new_pkv211,
new_pkv220,
new_pkv221,
new_pkv230,
new_pkv231,
new_pkv240,
new_pkv241,
new_pkv250,
new_pkv251,
new_pkv260,
new_pkv261,
new_pkv270,
new_pkv271,
new_pkv280,
new_pkv281,
new_pkv290,
new_pkv291,
new_pkv300,
new_pkv301,
new_pkv310,
new_pkv311,
new_pkv320,
new_pkv321,
new_pkv330,
new_pkv331,
new_pkv340,
new_pkv341,
new_pkv350,
new_pkv351,
new_pkv360,
new_pkv361,
new_pkv370,
new_pkv371,
new_pkv380,
new_pkv381,
new_pkv390,
new_pkv391,
)
return result
class CompiledTwoWayShardingDecoderLayer(torch.nn.Module):
def __init__(
self, layer_id, device_idx, falcon_variant, device, precision, model
):
super().__init__()
self.layer_id = layer_id
self.device_index = device_idx
self.falcon_variant = falcon_variant
self.device = device
self.precision = precision
self.model = model
def forward(
self,
hidden_states: torch.Tensor,
alibi: Optional[torch.Tensor],
attention_mask: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
use_cache: bool = False,
output_attentions: bool = False,
):
import gc
torch.cuda.empty_cache()
gc.collect()
if self.model is None:
raise ValueError("Layer vmfb not found")
hidden_states = hidden_states.to(torch.float32).detach().numpy()
attention_mask = attention_mask.to(torch.float32).detach().numpy()
if alibi is not None or layer_past is not None:
raise ValueError("Past Key Values and alibi should be None")
else:
output = self.model(
"forward",
(
hidden_states,
attention_mask,
),
)
result = (
torch.tensor(output[0]),
(
torch.tensor(output[1]),
torch.tensor(output[2]),
),
(
torch.tensor(output[3]),
torch.tensor(output[4]),
),
(
torch.tensor(output[5]),
torch.tensor(output[6]),
),
(
torch.tensor(output[7]),
torch.tensor(output[8]),
),
(
torch.tensor(output[9]),
torch.tensor(output[10]),
),
(
torch.tensor(output[11]),
torch.tensor(output[12]),
),
(
torch.tensor(output[13]),
torch.tensor(output[14]),
),
(
torch.tensor(output[15]),
torch.tensor(output[16]),
),
(
torch.tensor(output[17]),
torch.tensor(output[18]),
),
(
torch.tensor(output[19]),
torch.tensor(output[20]),
),
(
torch.tensor(output[21]),
torch.tensor(output[22]),
),
(
torch.tensor(output[23]),
torch.tensor(output[24]),
),
(
torch.tensor(output[25]),
torch.tensor(output[26]),
),
(
torch.tensor(output[27]),
torch.tensor(output[28]),
),
(
torch.tensor(output[29]),
torch.tensor(output[30]),
),
(
torch.tensor(output[31]),
torch.tensor(output[32]),
),
(
torch.tensor(output[33]),
torch.tensor(output[34]),
),
(
torch.tensor(output[35]),
torch.tensor(output[36]),
),
(
torch.tensor(output[37]),
torch.tensor(output[38]),
),
(
torch.tensor(output[39]),
torch.tensor(output[40]),
),
(
torch.tensor(output[41]),
torch.tensor(output[42]),
),
(
torch.tensor(output[43]),
torch.tensor(output[44]),
),
(
torch.tensor(output[45]),
torch.tensor(output[46]),
),
(
torch.tensor(output[47]),
torch.tensor(output[48]),
),
(
torch.tensor(output[49]),
torch.tensor(output[50]),
),
(
torch.tensor(output[51]),
torch.tensor(output[52]),
),
(
torch.tensor(output[53]),
torch.tensor(output[54]),
),
(
torch.tensor(output[55]),
torch.tensor(output[56]),
),
(
torch.tensor(output[57]),
torch.tensor(output[58]),
),
(
torch.tensor(output[59]),
torch.tensor(output[60]),
),
(
torch.tensor(output[61]),
torch.tensor(output[62]),
),
(
torch.tensor(output[63]),
torch.tensor(output[64]),
),
(
torch.tensor(output[65]),
torch.tensor(output[66]),
),
(
torch.tensor(output[67]),
torch.tensor(output[68]),
),
(
torch.tensor(output[69]),
torch.tensor(output[70]),
),
(
torch.tensor(output[71]),
torch.tensor(output[72]),
),
(
torch.tensor(output[73]),
torch.tensor(output[74]),
),
(
torch.tensor(output[75]),
torch.tensor(output[76]),
),
(
torch.tensor(output[77]),
torch.tensor(output[78]),
),
(
torch.tensor(output[79]),
torch.tensor(output[80]),
),
)
return result
class ShardedFalconModel:
def __init__(self, model, layers, word_embeddings, ln_f, lm_head):
super().__init__()
self.model = model
self.model.transformer.h = torch.nn.modules.container.ModuleList(
layers
)
self.model.transformer.word_embeddings = word_embeddings
self.model.transformer.ln_f = ln_f
self.model.lm_head = lm_head
def forward(
self,
input_ids,
attention_mask=None,
):
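# Return only the logits for the last position; the next token is picked from these.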
return self.model.forward(
input_ids=input_ids,
attention_mask=attention_mask,
).logits[:, -1, :]


@@ -5,7 +5,7 @@ from typing import List, Any
from transformers import StoppingCriteria
from brevitas_examples.llm.llm_quant.quantize import quantize_model
from brevitas_examples.common.generative.quantize import quantize_model
from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
@@ -37,7 +37,7 @@ class VisionModel(torch.nn.Module):
dtype=torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
@@ -52,7 +52,7 @@ class VisionModel(torch.nn.Module):
dtype=torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
@@ -93,7 +93,7 @@ class FirstLlamaModel(torch.nn.Module):
dtype=torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
@@ -157,7 +157,7 @@ class SecondLlamaModel(torch.nn.Module):
dtype=torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,


@@ -0,0 +1,876 @@
import argparse
import json
import re
from io import BytesIO
from pathlib import Path
from tqdm import tqdm
from typing import List, Optional, Tuple, Union
import numpy as np
import iree.runtime
import itertools
import subprocess
import torch
import torch_mlir
from torch_mlir import TensorPlaceholder
from torch_mlir.compiler_utils import run_pipeline_with_repro_report
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
LlamaPreTrainedModel,
)
from transformers.modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
SequenceClassifierOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from apps.language_models.src.pipelines.SharkLLMBase import SharkLLMBase
from apps.language_models.src.model_wrappers.vicuna_sharded_model import (
FirstVicunaLayer,
SecondVicunaLayer,
CompiledVicunaLayer,
ShardedVicunaModel,
LMHead,
LMHeadCompiled,
VicunaEmbedding,
VicunaEmbeddingCompiled,
VicunaNorm,
VicunaNormCompiled,
)
from apps.language_models.src.model_wrappers.vicuna_model import (
FirstVicuna,
SecondVicuna7B,
)
from apps.language_models.utils import (
get_vmfb_from_path,
)
from shark.shark_downloader import download_public_file
from shark.shark_importer import get_f16_inputs
from shark.shark_inference import SharkInference
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.llama.modeling_llama import (
LlamaDecoderLayer,
LlamaRMSNorm,
_make_causal_mask,
_expand_mask,
)
from torch import nn
from time import time
class LlamaModel(LlamaPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
Args:
config: LlamaConfig
"""
def __init__(self, config: LlamaConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(
config.vocab_size, config.hidden_size, self.padding_idx
)
self.layers = nn.ModuleList(
[
LlamaDecoderLayer(config)
for _ in range(config.num_hidden_layers)
]
)
self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
# Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
def _prepare_decoder_attention_mask(
self,
attention_mask,
input_shape,
inputs_embeds,
past_key_values_length,
):
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = None
if input_shape[-1] > 1:
combined_attention_mask = _make_causal_mask(
input_shape,
inputs_embeds.dtype,
device=inputs_embeds.device,
past_key_values_length=past_key_values_length,
)
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
expanded_attn_mask = _expand_mask(
attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
).to(inputs_embeds.device)
combined_attention_mask = (
expanded_attn_mask
if combined_attention_mask is None
else expanded_attn_mask + combined_attention_mask
)
return combined_attention_mask
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
t1 = time()
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
use_cache = (
use_cache if use_cache is not None else self.config.use_cache
)
return_dict = (
return_dict
if return_dict is not None
else self.config.use_return_dict
)
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
)
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError(
"You have to specify either decoder_input_ids or decoder_inputs_embeds"
)
seq_length_with_past = seq_length
past_key_values_length = 0
if past_key_values is not None:
past_key_values_length = past_key_values[0][0].shape[2]
seq_length_with_past = (
seq_length_with_past + past_key_values_length
)
if position_ids is None:
device = (
input_ids.device
if input_ids is not None
else inputs_embeds.device
)
position_ids = torch.arange(
past_key_values_length,
seq_length + past_key_values_length,
dtype=torch.long,
device=device,
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
else:
position_ids = position_ids.view(-1, seq_length).long()
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
# embed positions
if attention_mask is None:
attention_mask = torch.ones(
(batch_size, seq_length_with_past),
dtype=torch.bool,
device=inputs_embeds.device,
)
attention_mask = self._prepare_decoder_attention_mask(
attention_mask,
(batch_size, seq_length),
inputs_embeds,
past_key_values_length,
)
hidden_states = inputs_embeds
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = () if use_cache else None
for idx, decoder_layer in enumerate(self.compressedlayers):
if output_hidden_states:
all_hidden_states += (hidden_states,)
past_key_value = (
past_key_values[8 * idx : 8 * (idx + 1)]
if past_key_values is not None
else None
)
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
# None for past_key_value
return module(*inputs, output_attentions, None)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(decoder_layer),
hidden_states,
attention_mask,
position_ids,
None,
)
else:
layer_outputs = decoder_layer.forward(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[1:],)
if output_attentions:
all_self_attns += (layer_outputs[1],)
try:
hidden_states = np.asarray(hidden_states, hidden_states.dtype)
except:
pass  # hidden_states may already be a torch tensor; keep it as-is
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
next_cache = tuple(itertools.chain.from_iterable(next_cache))
print(f"Token generated in {time() - t1} seconds")
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_cache,
all_hidden_states,
all_self_attns,
]
if v is not None
)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
class EightLayerLayerSV(torch.nn.Module):
def __init__(self, layers):
super().__init__()
assert len(layers) == 8
self.layers = layers
def forward(
self,
hidden_states,
attention_mask,
position_ids,
pkv00,
pkv01,
pkv10,
pkv11,
pkv20,
pkv21,
pkv30,
pkv31,
pkv40,
pkv41,
pkv50,
pkv51,
pkv60,
pkv61,
pkv70,
pkv71,
):
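# Re-group the 16 flat past-KV tensors into 8 (key, value) pairs, one per wrapped layer.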
pkvs = [
(pkv00, pkv01),
(pkv10, pkv11),
(pkv20, pkv21),
(pkv30, pkv31),
(pkv40, pkv41),
(pkv50, pkv51),
(pkv60, pkv61),
(pkv70, pkv71),
]
new_pkvs = []
for layer, pkv in zip(self.layers, pkvs):
outputs = layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=(
pkv[0],
pkv[1],
),
use_cache=True,
)
hidden_states = outputs[0]
new_pkvs.append(
(
outputs[-1][0],
outputs[-1][1],
)
)
(
(new_pkv00, new_pkv01),
(new_pkv10, new_pkv11),
(new_pkv20, new_pkv21),
(new_pkv30, new_pkv31),
(new_pkv40, new_pkv41),
(new_pkv50, new_pkv51),
(new_pkv60, new_pkv61),
(new_pkv70, new_pkv71),
) = new_pkvs
return (
hidden_states,
new_pkv00,
new_pkv01,
new_pkv10,
new_pkv11,
new_pkv20,
new_pkv21,
new_pkv30,
new_pkv31,
new_pkv40,
new_pkv41,
new_pkv50,
new_pkv51,
new_pkv60,
new_pkv61,
new_pkv70,
new_pkv71,
)
class EightLayerLayerFV(torch.nn.Module):
def __init__(self, layers):
super().__init__()
assert len(layers) == 8
self.layers = layers
def forward(self, hidden_states, attention_mask, position_ids):
new_pkvs = []
for layer in self.layers:
outputs = layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=None,
use_cache=True,
)
hidden_states = outputs[0]
new_pkvs.append(
(
outputs[-1][0],
outputs[-1][1],
)
)
(
(new_pkv00, new_pkv01),
(new_pkv10, new_pkv11),
(new_pkv20, new_pkv21),
(new_pkv30, new_pkv31),
(new_pkv40, new_pkv41),
(new_pkv50, new_pkv51),
(new_pkv60, new_pkv61),
(new_pkv70, new_pkv71),
) = new_pkvs
return (
hidden_states,
new_pkv00,
new_pkv01,
new_pkv10,
new_pkv11,
new_pkv20,
new_pkv21,
new_pkv30,
new_pkv31,
new_pkv40,
new_pkv41,
new_pkv50,
new_pkv51,
new_pkv60,
new_pkv61,
new_pkv70,
new_pkv71,
)
class CompiledEightLayerLayerSV(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(
self,
hidden_states,
attention_mask,
position_ids,
past_key_value,
output_attentions=False,
use_cache=True,
):
hidden_states = hidden_states.detach()
attention_mask = attention_mask.detach()
position_ids = position_ids.detach()
(
(pkv00, pkv01),
(pkv10, pkv11),
(pkv20, pkv21),
(pkv30, pkv31),
(pkv40, pkv41),
(pkv50, pkv51),
(pkv60, pkv61),
(pkv70, pkv71),
) = past_key_value
pkv00 = pkv00.detach()
pkv01 = pkv01.detach()
pkv10 = pkv10.detach()
pkv11 = pkv11.detach()
pkv20 = pkv20.detach()
pkv21 = pkv21.detach()
pkv30 = pkv30.detach()
pkv31 = pkv31.detach()
pkv40 = pkv40.detach()
pkv41 = pkv41.detach()
pkv50 = pkv50.detach()
pkv51 = pkv51.detach()
pkv60 = pkv60.detach()
pkv61 = pkv61.detach()
pkv70 = pkv70.detach()
pkv71 = pkv71.detach()
output = self.model(
"forward",
(
hidden_states,
attention_mask,
position_ids,
pkv00,
pkv01,
pkv10,
pkv11,
pkv20,
pkv21,
pkv30,
pkv31,
pkv40,
pkv41,
pkv50,
pkv51,
pkv60,
pkv61,
pkv70,
pkv71,
),
send_to_host=False,
)
return (
output[0],
(output[1][0], output[1][1]),
(output[2][0], output[2][1]),
(output[3][0], output[3][1]),
(output[4][0], output[4][1]),
(output[5][0], output[5][1]),
(output[6][0], output[6][1]),
(output[7][0], output[7][1]),
(output[8][0], output[8][1]),
)
def forward_compressed(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
)
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError(
"You have to specify either decoder_input_ids or decoder_inputs_embeds"
)
seq_length_with_past = seq_length
past_key_values_length = 0
if past_key_values is not None:
past_key_values_length = past_key_values[0][0].shape[2]
seq_length_with_past = seq_length_with_past + past_key_values_length
if position_ids is None:
device = (
input_ids.device if input_ids is not None else inputs_embeds.device
)
position_ids = torch.arange(
past_key_values_length,
seq_length + past_key_values_length,
dtype=torch.long,
device=device,
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
else:
position_ids = position_ids.view(-1, seq_length).long()
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
# embed positions
if attention_mask is None:
attention_mask = torch.ones(
(batch_size, seq_length_with_past),
dtype=torch.bool,
device=inputs_embeds.device,
)
attention_mask = self._prepare_decoder_attention_mask(
attention_mask,
(batch_size, seq_length),
inputs_embeds,
past_key_values_length,
)
hidden_states = inputs_embeds
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = () if use_cache else None
for idx, decoder_layer in enumerate(self.compressedlayers):
if output_hidden_states:
all_hidden_states += (hidden_states,)
past_key_value = (
past_key_values[8 * idx : 8 * (idx + 1)]
if past_key_values is not None
else None
)
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
# None for past_key_value
return module(*inputs, output_attentions, None)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(decoder_layer),
hidden_states,
attention_mask,
position_ids,
None,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (
layer_outputs[2 if output_attentions else 1],
)
if output_attentions:
all_self_attns += (layer_outputs[1],)
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_cache,
all_hidden_states,
all_self_attns,
]
if v is not None
)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
class CompiledEightLayerLayer(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(
self,
hidden_states,
attention_mask,
position_ids,
past_key_value=None,
output_attentions=False,
use_cache=True,
):
t2 = time()
if past_key_value is None:
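# Prefill path: no KV cache yet, so run the first_vicuna_forward entry point.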
try:
hidden_states = np.asarray(hidden_states, hidden_states.dtype)
except:
pass
attention_mask = attention_mask.detach()
position_ids = position_ids.detach()
t1 = time()
output = self.model(
"first_vicuna_forward",
(hidden_states, attention_mask, position_ids),
send_to_host=False,
)
output2 = (
output[0],
(
output[1],
output[2],
),
(
output[3],
output[4],
),
(
output[5],
output[6],
),
(
output[7],
output[8],
),
(
output[9],
output[10],
),
(
output[11],
output[12],
),
(
output[13],
output[14],
),
(
output[15],
output[16],
),
)
return output2
else:
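# Decode path: unpack the 8 (key, value) pairs and call second_vicuna_forward
# with the cached state.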
(
(pkv00, pkv01),
(pkv10, pkv11),
(pkv20, pkv21),
(pkv30, pkv31),
(pkv40, pkv41),
(pkv50, pkv51),
(pkv60, pkv61),
(pkv70, pkv71),
) = past_key_value
try:
hidden_states = hidden_states.detach()
attention_mask = attention_mask.detach()
position_ids = position_ids.detach()
pkv00 = pkv00.detach()
pkv01 = pkv01.detach()
pkv10 = pkv10.detach()
pkv11 = pkv11.detach()
pkv20 = pkv20.detach()
pkv21 = pkv21.detach()
pkv30 = pkv30.detach()
pkv31 = pkv31.detach()
pkv40 = pkv40.detach()
pkv41 = pkv41.detach()
pkv50 = pkv50.detach()
pkv51 = pkv51.detach()
pkv60 = pkv60.detach()
pkv61 = pkv61.detach()
pkv70 = pkv70.detach()
pkv71 = pkv71.detach()
except:
pass  # inputs may already be device arrays without .detach(); keep them as-is
t1 = time()
if type(hidden_states) == iree.runtime.array_interop.DeviceArray:
hidden_states = np.array(hidden_states, hidden_states.dtype)
hidden_states = torch.tensor(hidden_states)
hidden_states = hidden_states.detach()
output = self.model(
"second_vicuna_forward",
(
hidden_states,
attention_mask,
position_ids,
pkv00,
pkv01,
pkv10,
pkv11,
pkv20,
pkv21,
pkv30,
pkv31,
pkv40,
pkv41,
pkv50,
pkv51,
pkv60,
pkv61,
pkv70,
pkv71,
),
send_to_host=False,
)
print(f"{time() - t1}")
del pkv00
del pkv01
del pkv10
del pkv11
del pkv20
del pkv21
del pkv30
del pkv31
del pkv40
del pkv41
del pkv50
del pkv51
del pkv60
del pkv61
del pkv70
del pkv71
output2 = (
output[0],
(
output[1],
output[2],
),
(
output[3],
output[4],
),
(
output[5],
output[6],
),
(
output[7],
output[8],
),
(
output[9],
output[10],
),
(
output[11],
output[12],
),
(
output[13],
output[14],
),
(
output[15],
output[16],
),
)
return output2

View File

@@ -1,15 +1,13 @@
import torch
from transformers import AutoModelForCausalLM
from brevitas_examples.llm.llm_quant.quantize import quantize_model
from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
class FirstVicuna(torch.nn.Module):
def __init__(
self,
model_path,
precision="fp32",
accumulates="fp32",
weight_group_size=128,
model_name="vicuna",
hf_auth_token: str = None,
@@ -18,18 +16,29 @@ class FirstVicuna(torch.nn.Module):
kwargs = {"torch_dtype": torch.float32}
if "llama2" in model_name:
kwargs["use_auth_token"] = hf_auth_token
self.accumulates = (
torch.float32 if accumulates == "fp32" else torch.float16
)
self.model = AutoModelForCausalLM.from_pretrained(
model_path, low_cpu_mem_usage=True, **kwargs
)
print(f"[DEBUG] model_path : {model_path}")
if precision in ["int4", "int8"]:
from brevitas_examples.common.generative.quantize import (
quantize_model,
)
from brevitas_examples.llm.llm_quant.run_utils import (
get_model_impl,
)
print("First Vicuna applying weight quantization..")
weight_bit_width = 4 if precision == "int4" else 8
quantize_model(
get_model_impl(self.model).layers,
dtype=torch.float32,
dtype=self.accumulates,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
@@ -40,7 +49,9 @@ class FirstVicuna(torch.nn.Module):
def forward(self, input_ids):
op = self.model(input_ids=input_ids, use_cache=True)
return_vals = []
return_vals.append(op.logits)
token = torch.argmax(op.logits[:, -1, :], dim=1)
return_vals.append(token)
temp_past_key_values = op.past_key_values
for item in temp_past_key_values:
return_vals.append(item[0])
@@ -48,11 +59,12 @@ class FirstVicuna(torch.nn.Module):
return tuple(return_vals)
class SecondVicuna(torch.nn.Module):
class SecondVicuna7B(torch.nn.Module):
def __init__(
self,
model_path,
precision="fp32",
accumulates="fp32",
weight_group_size=128,
model_name="vicuna",
hf_auth_token: str = None,
@@ -64,15 +76,26 @@ class SecondVicuna(torch.nn.Module):
self.model = AutoModelForCausalLM.from_pretrained(
model_path, low_cpu_mem_usage=True, **kwargs
)
self.accumulates = (
torch.float32 if accumulates == "fp32" else torch.float16
)
print(f"[DEBUG] model_path : {model_path}")
if precision in ["int4", "int8"]:
from brevitas_examples.common.generative.quantize import (
quantize_model,
)
from brevitas_examples.llm.llm_quant.run_utils import (
get_model_impl,
)
print("Second Vicuna applying weight quantization..")
weight_bit_width = 4 if precision == "int4" else 8
quantize_model(
get_model_impl(self.model).layers,
dtype=torch.float32,
dtype=self.accumulates,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
@@ -148,8 +171,6 @@ class SecondVicuna(torch.nn.Module):
i63,
i64,
):
# input_ids = input_tuple[0]
# input_tuple = torch.unbind(pkv, dim=0)
token = i0
past_key_values = (
(i1, i2),
@@ -282,6 +303,846 @@ class SecondVicuna(torch.nn.Module):
input_ids=token, use_cache=True, past_key_values=past_key_values
)
return_vals = []
token = torch.argmax(op.logits[:, -1, :], dim=1)
return_vals.append(token)
temp_past_key_values = op.past_key_values
for item in temp_past_key_values:
return_vals.append(item[0])
return_vals.append(item[1])
return tuple(return_vals)
class SecondVicuna13B(torch.nn.Module):
def __init__(
self,
model_path,
precision="int8",
accumulates="fp32",
weight_group_size=128,
model_name="vicuna",
hf_auth_token: str = None,
):
super().__init__()
kwargs = {"torch_dtype": torch.float32}
if "llama2" in model_name:
kwargs["use_auth_token"] = hf_auth_token
self.model = AutoModelForCausalLM.from_pretrained(
model_path, low_cpu_mem_usage=True, **kwargs
)
self.accumulates = (
torch.float32 if accumulates == "fp32" else torch.float16
)
if precision in ["int4", "int8"]:
from brevitas_examples.common.generative.quantize import (
quantize_model,
)
from brevitas_examples.llm.llm_quant.run_utils import (
get_model_impl,
)
print("Second Vicuna applying weight quantization..")
weight_bit_width = 4 if precision == "int4" else 8
quantize_model(
get_model_impl(self.model).layers,
dtype=self.accumulates,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
quantize_weight_zero_point=False,
)
print("Weight quantization applied.")
def forward(
self,
i0,
i1,
i2,
i3,
i4,
i5,
i6,
i7,
i8,
i9,
i10,
i11,
i12,
i13,
i14,
i15,
i16,
i17,
i18,
i19,
i20,
i21,
i22,
i23,
i24,
i25,
i26,
i27,
i28,
i29,
i30,
i31,
i32,
i33,
i34,
i35,
i36,
i37,
i38,
i39,
i40,
i41,
i42,
i43,
i44,
i45,
i46,
i47,
i48,
i49,
i50,
i51,
i52,
i53,
i54,
i55,
i56,
i57,
i58,
i59,
i60,
i61,
i62,
i63,
i64,
i65,
i66,
i67,
i68,
i69,
i70,
i71,
i72,
i73,
i74,
i75,
i76,
i77,
i78,
i79,
i80,
):
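# llama2-13b has 40 decoder layers, so the cache arrives flattened as 80 tensors
# (40 key/value pairs) alongside the single new token i0.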
token = i0
past_key_values = (
(i1, i2),
(
i3,
i4,
),
(
i5,
i6,
),
(
i7,
i8,
),
(
i9,
i10,
),
(
i11,
i12,
),
(
i13,
i14,
),
(
i15,
i16,
),
(
i17,
i18,
),
(
i19,
i20,
),
(
i21,
i22,
),
(
i23,
i24,
),
(
i25,
i26,
),
(
i27,
i28,
),
(
i29,
i30,
),
(
i31,
i32,
),
(
i33,
i34,
),
(
i35,
i36,
),
(
i37,
i38,
),
(
i39,
i40,
),
(
i41,
i42,
),
(
i43,
i44,
),
(
i45,
i46,
),
(
i47,
i48,
),
(
i49,
i50,
),
(
i51,
i52,
),
(
i53,
i54,
),
(
i55,
i56,
),
(
i57,
i58,
),
(
i59,
i60,
),
(
i61,
i62,
),
(
i63,
i64,
),
(
i65,
i66,
),
(
i67,
i68,
),
(
i69,
i70,
),
(
i71,
i72,
),
(
i73,
i74,
),
(
i75,
i76,
),
(
i77,
i78,
),
(
i79,
i80,
),
)
op = self.model(
input_ids=token, use_cache=True, past_key_values=past_key_values
)
return_vals = []
return_vals.append(op.logits)
temp_past_key_values = op.past_key_values
for item in temp_past_key_values:
return_vals.append(item[0])
return_vals.append(item[1])
return tuple(return_vals)
class SecondVicuna70B(torch.nn.Module):
def __init__(
self,
model_path,
precision="fp32",
accumulates="fp32",
weight_group_size=128,
model_name="vicuna",
hf_auth_token: str = None,
):
super().__init__()
kwargs = {"torch_dtype": torch.float32}
if "llama2" in model_name:
kwargs["use_auth_token"] = hf_auth_token
self.model = AutoModelForCausalLM.from_pretrained(
model_path, low_cpu_mem_usage=True, **kwargs
)
self.accumulates = (
torch.float32 if accumulates == "fp32" else torch.float16
)
print(f"[DEBUG] model_path : {model_path}")
if precision in ["int4", "int8"]:
from brevitas_examples.common.generative.quantize import (
quantize_model,
)
from brevitas_examples.llm.llm_quant.run_utils import (
get_model_impl,
)
print("Second Vicuna applying weight quantization..")
weight_bit_width = 4 if precision == "int4" else 8
quantize_model(
get_model_impl(self.model).layers,
dtype=self.accumulates,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
quantize_weight_zero_point=False,
)
print("Weight quantization applied.")
def forward(
self,
i0,
i1,
i2,
i3,
i4,
i5,
i6,
i7,
i8,
i9,
i10,
i11,
i12,
i13,
i14,
i15,
i16,
i17,
i18,
i19,
i20,
i21,
i22,
i23,
i24,
i25,
i26,
i27,
i28,
i29,
i30,
i31,
i32,
i33,
i34,
i35,
i36,
i37,
i38,
i39,
i40,
i41,
i42,
i43,
i44,
i45,
i46,
i47,
i48,
i49,
i50,
i51,
i52,
i53,
i54,
i55,
i56,
i57,
i58,
i59,
i60,
i61,
i62,
i63,
i64,
i65,
i66,
i67,
i68,
i69,
i70,
i71,
i72,
i73,
i74,
i75,
i76,
i77,
i78,
i79,
i80,
i81,
i82,
i83,
i84,
i85,
i86,
i87,
i88,
i89,
i90,
i91,
i92,
i93,
i94,
i95,
i96,
i97,
i98,
i99,
i100,
i101,
i102,
i103,
i104,
i105,
i106,
i107,
i108,
i109,
i110,
i111,
i112,
i113,
i114,
i115,
i116,
i117,
i118,
i119,
i120,
i121,
i122,
i123,
i124,
i125,
i126,
i127,
i128,
i129,
i130,
i131,
i132,
i133,
i134,
i135,
i136,
i137,
i138,
i139,
i140,
i141,
i142,
i143,
i144,
i145,
i146,
i147,
i148,
i149,
i150,
i151,
i152,
i153,
i154,
i155,
i156,
i157,
i158,
i159,
i160,
):
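# llama2-70b has 80 decoder layers, so the flattened cache is 160 tensors
# (80 key/value pairs).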
token = i0
past_key_values = (
(i1, i2),
(
i3,
i4,
),
(
i5,
i6,
),
(
i7,
i8,
),
(
i9,
i10,
),
(
i11,
i12,
),
(
i13,
i14,
),
(
i15,
i16,
),
(
i17,
i18,
),
(
i19,
i20,
),
(
i21,
i22,
),
(
i23,
i24,
),
(
i25,
i26,
),
(
i27,
i28,
),
(
i29,
i30,
),
(
i31,
i32,
),
(
i33,
i34,
),
(
i35,
i36,
),
(
i37,
i38,
),
(
i39,
i40,
),
(
i41,
i42,
),
(
i43,
i44,
),
(
i45,
i46,
),
(
i47,
i48,
),
(
i49,
i50,
),
(
i51,
i52,
),
(
i53,
i54,
),
(
i55,
i56,
),
(
i57,
i58,
),
(
i59,
i60,
),
(
i61,
i62,
),
(
i63,
i64,
),
(
i65,
i66,
),
(
i67,
i68,
),
(
i69,
i70,
),
(
i71,
i72,
),
(
i73,
i74,
),
(
i75,
i76,
),
(
i77,
i78,
),
(
i79,
i80,
),
(
i81,
i82,
),
(
i83,
i84,
),
(
i85,
i86,
),
(
i87,
i88,
),
(
i89,
i90,
),
(
i91,
i92,
),
(
i93,
i94,
),
(
i95,
i96,
),
(
i97,
i98,
),
(
i99,
i100,
),
(
i101,
i102,
),
(
i103,
i104,
),
(
i105,
i106,
),
(
i107,
i108,
),
(
i109,
i110,
),
(
i111,
i112,
),
(
i113,
i114,
),
(
i115,
i116,
),
(
i117,
i118,
),
(
i119,
i120,
),
(
i121,
i122,
),
(
i123,
i124,
),
(
i125,
i126,
),
(
i127,
i128,
),
(
i129,
i130,
),
(
i131,
i132,
),
(
i133,
i134,
),
(
i135,
i136,
),
(
i137,
i138,
),
(
i139,
i140,
),
(
i141,
i142,
),
(
i143,
i144,
),
(
i145,
i146,
),
(
i147,
i148,
),
(
i149,
i150,
),
(
i151,
i152,
),
(
i153,
i154,
),
(
i155,
i156,
),
(
i157,
i158,
),
(
i159,
i160,
),
)
op = self.model(
input_ids=token, use_cache=True, past_key_values=past_key_values
)
return_vals = []
return_vals.append(op.logits)
temp_past_key_values = op.past_key_values
for item in temp_past_key_values:
@@ -298,15 +1159,17 @@ class CombinedModel(torch.nn.Module):
):
super().__init__()
self.first_vicuna = FirstVicuna(first_vicuna_model_path)
self.second_vicuna = SecondVicuna(second_vicuna_model_path)
# NOT using this path for 13B currently, hence using `SecondVicuna7B`.
self.second_vicuna = SecondVicuna7B(second_vicuna_model_path)
def forward(self, input_ids):
first_output = self.first_vicuna(input_ids=input_ids, use_cache=True)
logits = first_output[0]
pkv = first_output[1:]
token = torch.argmax(torch.tensor(logits)[:, -1, :], dim=1)
token = token.to(torch.int64).reshape([1, 1])
secondVicunaInput = (token,) + tuple(pkv)
second_output = self.second_vicuna(secondVicunaInput)
first_output = self.first_vicuna(input_ids=input_ids)
# generate second vicuna
compilation_input_ids = torch.zeros([1, 1], dtype=torch.int64)
pkv = tuple(
(torch.zeros([1, 32, 19, 128], dtype=torch.float32))
for _ in range(64)
)
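# Descriptive note: these 64 zero tensors are placeholder caches used only to
# trace/compile the second-pass graph -- 32 decoder layers x (key, value) for
# the 7B model, each shaped [1, 32, 19, 128], presumably
# [batch, num_heads, seq_len, head_dim] matching the 19-token compilation
# prompt used elsewhere in this repo.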
secondVicunaCompileInput = (compilation_input_ids,) + pkv
second_output = self.second_vicuna(*secondVicunaCompileInput)
return second_output

File diff suppressed because it is too large.


@@ -1,4 +1,5 @@
import torch
import time
class FirstVicunaLayer(torch.nn.Module):
@@ -66,7 +67,6 @@ class ShardedVicunaModel(torch.nn.Module):
def __init__(self, model, layers, lmhead, embedding, norm):
super().__init__()
self.model = model
assert len(layers) == len(model.model.layers)
self.model.model.config.use_cache = True
self.model.model.config.output_attentions = False
self.layers = layers
@@ -110,9 +110,11 @@ class LMHeadCompiled(torch.nn.Module):
self.model = shark_module
def forward(self, hidden_states):
hidden_states = hidden_states.detach()
hidden_states_sample = hidden_states.detach()
output = self.model("forward", (hidden_states,))
output = torch.tensor(output)
return output
@@ -132,9 +134,13 @@ class VicunaNormCompiled(torch.nn.Module):
self.model = shark_module
def forward(self, hidden_states):
hidden_states.detach()
output = self.model("forward", (hidden_states,))
try:
hidden_states.detach()
except:
pass
output = self.model("forward", (hidden_states,), send_to_host=True)
output = torch.tensor(output)
return output
@@ -155,15 +161,18 @@ class VicunaEmbeddingCompiled(torch.nn.Module):
def forward(self, input_ids):
input_ids.detach()
output = self.model("forward", (input_ids,))
output = self.model("forward", (input_ids,), send_to_host=True)
output = torch.tensor(output)
return output
class CompiledVicunaLayer(torch.nn.Module):
def __init__(self, shark_module):
def __init__(self, shark_module, idx, breakpoints):
super().__init__()
self.model = shark_module
self.idx = idx
self.breakpoints = breakpoints
def forward(
self,
@@ -174,10 +183,11 @@ class CompiledVicunaLayer(torch.nn.Module):
output_attentions=False,
use_cache=True,
):
if self.breakpoints is None:
is_breakpoint = False
else:
is_breakpoint = self.idx + 1 in self.breakpoints
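# Descriptive note: when this layer index is marked as a breakpoint, outputs
# are copied back to the host (`send_to_host=True`) and wrapped in torch
# tensors so they can be read; otherwise they stay as on-device buffers and
# are handed directly to the next compiled layer, avoiding host round trips.
# (The inspection/debugging purpose is inferred from the naming.)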
if past_key_value is None:
hidden_states = hidden_states.detach()
attention_mask = attention_mask.detach()
position_ids = position_ids.detach()
output = self.model(
"first_vicuna_forward",
(
@@ -185,11 +195,17 @@ class CompiledVicunaLayer(torch.nn.Module):
attention_mask,
position_ids,
),
send_to_host=is_breakpoint,
)
output0 = torch.tensor(output[0])
output1 = torch.tensor(output[1])
output2 = torch.tensor(output[2])
if is_breakpoint:
output0 = torch.tensor(output[0])
output1 = torch.tensor(output[1])
output2 = torch.tensor(output[2])
else:
output0 = output[0]
output1 = output[1]
output2 = output[2]
return (
output0,
@@ -199,11 +215,8 @@ class CompiledVicunaLayer(torch.nn.Module):
),
)
else:
hidden_states = hidden_states.detach()
attention_mask = attention_mask.detach()
position_ids = position_ids.detach()
pkv0 = past_key_value[0].detach()
pkv1 = past_key_value[1].detach()
pkv0 = past_key_value[0]
pkv1 = past_key_value[1]
output = self.model(
"second_vicuna_forward",
(
@@ -213,11 +226,17 @@ class CompiledVicunaLayer(torch.nn.Module):
pkv0,
pkv1,
),
send_to_host=is_breakpoint,
)
output0 = torch.tensor(output[0])
output1 = torch.tensor(output[1])
output2 = torch.tensor(output[2])
if is_breakpoint:
output0 = torch.tensor(output[0])
output1 = torch.tensor(output[1])
output2 = torch.tensor(output[2])
else:
output0 = output[0]
output1 = output[1]
output2 = output[2]
return (
output0,


@@ -3,7 +3,10 @@ from abc import ABC, abstractmethod
class SharkLLMBase(ABC):
def __init__(
self, model_name, hf_model_path=None, max_num_tokens=512
self,
model_name,
hf_model_path=None,
max_num_tokens=512,
) -> None:
self.model_name = model_name
self.hf_model_path = hf_model_path


@@ -1,4 +1,17 @@
from apps.language_models.src.model_wrappers.falcon_model import FalconModel
from apps.language_models.src.model_wrappers.falcon_sharded_model import (
WordEmbeddingsLayer,
CompiledWordEmbeddingsLayer,
LNFEmbeddingLayer,
CompiledLNFEmbeddingLayer,
LMHeadEmbeddingLayer,
CompiledLMHeadEmbeddingLayer,
FourWayShardingDecoderLayer,
TwoWayShardingDecoderLayer,
CompiledFourWayShardingDecoderLayer,
CompiledTwoWayShardingDecoderLayer,
ShardedFalconModel,
)
from apps.language_models.src.pipelines.SharkLLMBase import SharkLLMBase
from apps.language_models.utils import (
get_vmfb_from_path,
@@ -7,30 +20,39 @@ from io import BytesIO
from pathlib import Path
from contextlib import redirect_stdout
from shark.shark_downloader import download_public_file
from shark.shark_importer import import_with_fx
from shark.shark_importer import import_with_fx, save_mlir
from shark.shark_inference import SharkInference
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig
from transformers.generation import (
GenerationConfig,
LogitsProcessorList,
StoppingCriteriaList,
)
import copy
import time
import re
import torch
import torch_mlir
import os
import argparse
import gc
parser = argparse.ArgumentParser(
prog="falcon runner",
description="runs a falcon model",
)
parser.add_argument("--falcon_variant_to_use", default="7b", help="7b, 40b")
parser.add_argument(
"--precision", "-p", default="fp16", help="fp32, fp16, int8, int4"
"--falcon_variant_to_use", default="7b", help="7b, 40b, 180b"
)
parser.add_argument(
"--compressed",
default=False,
action=argparse.BooleanOptionalAction,
help="Compress the sharded layers",
)
parser.add_argument(
"--precision", "-p", default="fp16", choices=["fp32", "fp16", "int4"]
)
parser.add_argument("--device", "-d", default="cuda", help="vulkan, cpu, cuda")
parser.add_argument(
@@ -49,7 +71,7 @@ parser.add_argument(
)
parser.add_argument(
"--load_mlir_from_shark_tank",
default=False,
default=True,
action=argparse.BooleanOptionalAction,
help="download precompiled mlir from shark tank",
)
@@ -59,32 +81,74 @@ parser.add_argument(
action=argparse.BooleanOptionalAction,
help="Run model in cli mode",
)
parser.add_argument(
"--hf_auth_token",
type=str,
default=None,
help="Specify your own Hugging Face authentication token for the falcon-180B model.",
)
parser.add_argument(
"-s",
"--sharded",
default=False,
action=argparse.BooleanOptionalAction,
help="Run model as sharded",
)
parser.add_argument(
"--num_shards",
type=int,
default=4,
choices=[2, 4],
help="Number of shards.",
)
class Falcon(SharkLLMBase):
class ShardedFalcon(SharkLLMBase):
def __init__(
self,
model_name,
hf_model_path,
hf_model_path="tiiuae/falcon-7b-instruct",
hf_auth_token: str = None,
max_num_tokens=150,
device="cuda",
precision="fp32",
falcon_mlir_path=None,
falcon_vmfb_path=None,
debug=False,
) -> None:
super().__init__(model_name, hf_model_path, max_num_tokens)
print("hf_model_path: ", self.hf_model_path)
if (
"180b" in self.model_name
and precision != "int4"
and hf_auth_token == None
):
raise ValueError(
""" HF auth token required for falcon-180b. Pass it using the
--hf_auth_token flag. You can request access to the model
here: https://huggingface.co/tiiuae/falcon-180B-chat."""
)
if args.sharded and "180b" not in self.model_name:
raise ValueError("Sharding supported only for Falcon-180B")
self.hf_auth_token = hf_auth_token
self.max_padding_length = 100
self.device = device
self.precision = precision
self.falcon_vmfb_path = falcon_vmfb_path
self.falcon_mlir_path = falcon_mlir_path
self.debug = debug
self.tokenizer = self.get_tokenizer()
self.shark_model = self.compile()
self.src_model = self.get_src_model()
self.shark_model = self.compile()
def get_tokenizer(self):
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path, trust_remote_code=True
self.hf_model_path,
trust_remote_code=True,
token=self.hf_auth_token,
)
tokenizer.padding_side = "left"
tokenizer.pad_token_id = 11
@@ -92,13 +156,535 @@ class Falcon(SharkLLMBase):
def get_src_model(self):
print("Loading src model: ", self.model_name)
kwargs = {"torch_dtype": torch.float, "trust_remote_code": True}
kwargs = {
"torch_dtype": torch.float32,
"trust_remote_code": True,
"token": self.hf_auth_token,
}
if self.precision == "int4":
quantization_config = GPTQConfig(bits=4, disable_exllama=True)
kwargs["quantization_config"] = quantization_config
kwargs["device_map"] = "cpu"
falcon_model = AutoModelForCausalLM.from_pretrained(
self.hf_model_path, **kwargs
)
return falcon_model
def compile_falcon(self):
def compile_layer(
self, layer, falconCompileInput, layer_id, device_idx=None
):
self.falcon_mlir_path = Path(
f"falcon_{args.falcon_variant_to_use}_layer_{layer_id}_{self.precision}.mlir"
)
self.falcon_vmfb_path = Path(
f"falcon_{args.falcon_variant_to_use}_layer_{layer_id}_{self.precision}_{self.device}.vmfb"
)
if args.use_precompiled_model:
if not self.falcon_vmfb_path.exists():
# Downloading VMFB from shark_tank
print(f"[DEBUG] Trying to download vmfb from shark_tank")
download_public_file(
f"gs://shark_tank/falcon/sharded/falcon_{args.falcon_variant_to_use}/vmfb/"
+ str(self.falcon_vmfb_path),
self.falcon_vmfb_path.absolute(),
single_file=True,
)
vmfb = get_vmfb_from_path(
self.falcon_vmfb_path,
self.device,
"linalg",
device_id=device_idx,
)
if vmfb is not None:
return vmfb, device_idx
print(f"[DEBUG] vmfb not found at {self.falcon_vmfb_path.absolute()}")
if self.falcon_mlir_path.exists():
print(f"[DEBUG] mlir found at {self.falcon_mlir_path.absolute()}")
with open(self.falcon_mlir_path, "rb") as f:
bytecode = f.read()
else:
mlir_generated = False
print(
f"[DEBUG] mlir not found at {self.falcon_mlir_path.absolute()}"
)
if args.load_mlir_from_shark_tank:
# Downloading MLIR from shark_tank
print(f"[DEBUG] Trying to download mlir from shark_tank")
download_public_file(
f"gs://shark_tank/falcon/sharded/falcon_{args.falcon_variant_to_use}/mlir/"
+ str(self.falcon_mlir_path),
self.falcon_mlir_path.absolute(),
single_file=True,
)
if self.falcon_mlir_path.exists():
print(
f"[DEBUG] mlir found at {self.falcon_mlir_path.absolute()}"
)
with open(self.falcon_mlir_path, "rb") as f:
bytecode = f.read()
mlir_generated = True
if not mlir_generated:
print(f"[DEBUG] generating MLIR locally")
if layer_id == "word_embeddings":
f16_input_mask = [False]
elif layer_id in ["ln_f", "lm_head"]:
f16_input_mask = [True]
elif "_" in layer_id or type(layer_id) == int:
f16_input_mask = [True, True]
else:
raise ValueError("Unsupported layer: ", layer_id)
print(f"[DEBUG] generating torchscript graph")
ts_graph = import_with_fx(
layer,
falconCompileInput,
is_f16=True,
f16_input_mask=f16_input_mask,
mlir_type="torchscript",
is_gptq=True,
)
del layer
print(f"[DEBUG] generating torch mlir")
module = torch_mlir.compile(
ts_graph,
falconCompileInput,
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
del ts_graph
print(f"[DEBUG] converting to bytecode")
bytecode_stream = BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
del module
f_ = open(self.falcon_mlir_path, "wb")
f_.write(bytecode)
print("Saved falcon mlir at ", str(self.falcon_mlir_path))
f_.close()
del bytecode
shark_module = SharkInference(
mlir_module=self.falcon_mlir_path,
device=self.device,
mlir_dialect="linalg",
device_idx=device_idx,
)
path = shark_module.save_module(
self.falcon_vmfb_path.parent.absolute(),
self.falcon_vmfb_path.stem,
extra_args=[
"--iree-vm-target-truncate-unsupported-floats",
"--iree-codegen-check-ir-before-llvm-conversion=false",
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
]
+ [
"--iree-llvmcpu-use-fast-min-max-ops",
]
if self.precision == "int4"
else [],
debug=self.debug,
)
print("Saved falcon vmfb at ", str(path))
shark_module.load_module(path)
return shark_module, device_idx
def compile(self):
sample_input_ids = torch.zeros([100], dtype=torch.int64)
sample_attention_mask = torch.zeros(
[1, 1, 100, 100], dtype=torch.float32
)
num_group_layers = int(
20 * (4 / args.num_shards)
) # 4 is the number of default shards
sample_hidden_states = torch.zeros(
[1, 100, 14848], dtype=torch.float32
)
# Determine number of available devices
num_devices = 1
if self.device == "rocm":
import iree.runtime as ireert
haldriver = ireert.get_driver(self.device)
num_devices = len(haldriver.query_available_devices())
if num_devices < 2:
raise ValueError(
"Cannot run Falcon-180B on a single ROCM device."
)
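# Descriptive note: on ROCm the sharded pieces are assigned round-robin --
# lm_head, word_embeddings and ln_f get device (k % num_devices) % num_shards
# for k = 0, 1, 2, and each grouped decoder block i below goes to device
# (i % num_devices); on other backends device_idx stays None.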
lm_head = LMHeadEmbeddingLayer(self.src_model.lm_head)
print("Compiling Layer lm_head")
shark_lm_head, _ = self.compile_layer(
lm_head,
[sample_hidden_states],
"lm_head",
device_idx=(0 % num_devices) % args.num_shards
if self.device == "rocm"
else None,
)
shark_lm_head = CompiledLMHeadEmbeddingLayer(shark_lm_head)
word_embedding = WordEmbeddingsLayer(
self.src_model.transformer.word_embeddings
)
print("Compiling Layer word_embeddings")
shark_word_embedding, _ = self.compile_layer(
word_embedding,
[sample_input_ids],
"word_embeddings",
device_idx=(1 % num_devices) % args.num_shards
if self.device == "rocm"
else None,
)
shark_word_embedding = CompiledWordEmbeddingsLayer(
shark_word_embedding
)
ln_f = LNFEmbeddingLayer(self.src_model.transformer.ln_f)
print("Compiling Layer ln_f")
shark_ln_f, _ = self.compile_layer(
ln_f,
[sample_hidden_states],
"ln_f",
device_idx=(2 % num_devices) % args.num_shards
if self.device == "rocm"
else None,
)
shark_ln_f = CompiledLNFEmbeddingLayer(shark_ln_f)
shark_layers = []
for i in range(
int(len(self.src_model.transformer.h) / num_group_layers)
):
device_idx = i % num_devices if self.device == "rocm" else None
layer_id = i
layer_id = (
str(i * num_group_layers)
+ "_"
+ str((i + 1) * num_group_layers)
)
pytorch_class = FourWayShardingDecoderLayer
compiled_class = CompiledFourWayShardingDecoderLayer
if args.num_shards == 2:
pytorch_class = TwoWayShardingDecoderLayer
compiled_class = CompiledTwoWayShardingDecoderLayer
print("Compiling Layer {}".format(layer_id))
layer_i = self.src_model.transformer.h[
i * num_group_layers : (i + 1) * num_group_layers
]
pytorch_layer_i = pytorch_class(
layer_i, args.falcon_variant_to_use
)
shark_module, device_idx = self.compile_layer(
pytorch_layer_i,
[sample_hidden_states, sample_attention_mask],
layer_id,
device_idx=device_idx,
)
shark_layer_i = compiled_class(
layer_id,
device_idx,
args.falcon_variant_to_use,
self.device,
self.precision,
shark_module,
)
shark_layers.append(shark_layer_i)
sharded_model = ShardedFalconModel(
self.src_model,
shark_layers,
shark_word_embedding,
shark_ln_f,
shark_lm_head,
)
return sharded_model
def generate(self, prompt):
model_inputs = self.tokenizer(
prompt,
padding="max_length",
max_length=self.max_padding_length,
add_special_tokens=False,
return_tensors="pt",
)
model_inputs["prompt_text"] = prompt
input_ids = model_inputs["input_ids"]
attention_mask = model_inputs.get("attention_mask", None)
# Allow empty prompts
if input_ids.shape[1] == 0:
input_ids = None
attention_mask = None
generate_kwargs = {
"max_length": self.max_num_tokens,
"do_sample": True,
"top_k": 10,
"num_return_sequences": 1,
"eos_token_id": 11,
}
generate_kwargs["input_ids"] = input_ids
generate_kwargs["attention_mask"] = attention_mask
generation_config_ = GenerationConfig.from_model_config(
self.src_model.config
)
generation_config = copy.deepcopy(generation_config_)
model_kwargs = generation_config.update(**generate_kwargs)
logits_processor = LogitsProcessorList()
stopping_criteria = StoppingCriteriaList()
eos_token_id = generation_config.eos_token_id
generation_config.pad_token_id = eos_token_id
(
inputs_tensor,
model_input_name,
model_kwargs,
) = self.src_model._prepare_model_inputs(
None, generation_config.bos_token_id, model_kwargs
)
model_kwargs["output_attentions"] = generation_config.output_attentions
model_kwargs[
"output_hidden_states"
] = generation_config.output_hidden_states
model_kwargs["use_cache"] = generation_config.use_cache
input_ids = (
inputs_tensor
if model_input_name == "input_ids"
else model_kwargs.pop("input_ids")
)
self.logits_processor = self.src_model._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids.shape[-1],
encoder_input_ids=inputs_tensor,
prefix_allowed_tokens_fn=None,
logits_processor=logits_processor,
)
self.stopping_criteria = self.src_model._get_stopping_criteria(
generation_config=generation_config,
stopping_criteria=stopping_criteria,
)
self.logits_warper = self.src_model._get_logits_warper(
generation_config
)
(
self.input_ids,
self.model_kwargs,
) = self.src_model._expand_inputs_for_generation(
input_ids=input_ids,
expand_size=generation_config.num_return_sequences, # 1
is_encoder_decoder=self.src_model.config.is_encoder_decoder, # False
**model_kwargs,
)
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
self.eos_token_id_tensor = (
torch.tensor(eos_token_id) if eos_token_id is not None else None
)
self.pad_token_id = generation_config.pad_token_id
self.eos_token_id = eos_token_id
output_scores = generation_config.output_scores # False
return_dict_in_generate = (
generation_config.return_dict_in_generate # False
)
# init attention / hidden states / scores tuples
self.scores = (
() if (return_dict_in_generate and output_scores) else None
)
# keep track of which sequences are already finished
self.unfinished_sequences = torch.ones(
input_ids.shape[0], dtype=torch.long, device=input_ids.device
)
all_text = prompt
start = time.time()
count = 0
for i in range(self.max_num_tokens - 1):
count = count + 1
next_token = self.generate_new_token()
new_word = self.tokenizer.decode(
next_token.cpu().numpy(),
add_special_tokens=False,
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)
all_text = all_text + new_word
print(f"{new_word}", end="", flush=True)
print(f"{all_text}", end="", flush=True)
# if eos_token was found in one sentence, set sentence to finished
if self.eos_token_id_tensor is not None:
self.unfinished_sequences = self.unfinished_sequences.mul(
next_token.tile(self.eos_token_id_tensor.shape[0], 1)
.ne(self.eos_token_id_tensor.unsqueeze(1))
.prod(dim=0)
)
# stop when each sentence is finished
if (
self.unfinished_sequences.max() == 0
or self.stopping_criteria(input_ids, self.scores)
):
break
end = time.time()
print(
"\n\nTime taken is {:.2f} seconds/token\n".format(
(end - start) / count
)
)
torch.cuda.empty_cache()
gc.collect()
return all_text
def generate_new_token(self):
model_inputs = self.src_model.prepare_inputs_for_generation(
self.input_ids, **self.model_kwargs
)
outputs = self.shark_model.forward(
input_ids=model_inputs["input_ids"],
attention_mask=model_inputs["attention_mask"],
)
if self.precision in ["fp16", "int4"]:
outputs = outputs.to(dtype=torch.float32)
next_token_logits = outputs
# pre-process distribution
next_token_scores = self.logits_processor(
self.input_ids, next_token_logits
)
next_token_scores = self.logits_warper(
self.input_ids, next_token_scores
)
# sample
probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
# finished sentences should have their next token be a padding token
if self.eos_token_id is not None:
if self.pad_token_id is None:
raise ValueError(
"If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
)
next_token = (
next_token * self.unfinished_sequences
+ self.pad_token_id * (1 - self.unfinished_sequences)
)
self.input_ids = torch.cat(
[self.input_ids, next_token[:, None]], dim=-1
)
self.model_kwargs["past_key_values"] = None
if "attention_mask" in self.model_kwargs:
attention_mask = self.model_kwargs["attention_mask"]
self.model_kwargs["attention_mask"] = torch.cat(
[
attention_mask,
attention_mask.new_ones((attention_mask.shape[0], 1)),
],
dim=-1,
)
self.input_ids = self.input_ids[:, 1:]
self.model_kwargs["attention_mask"] = self.model_kwargs[
"attention_mask"
][:, 1:]
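# Descriptive note: past_key_values is discarded every step (set to None
# above), so the whole window is re-fed each iteration; dropping the oldest
# position from input_ids and attention_mask keeps the sequence at the fixed
# length the compiled module was built for.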
return next_token
class UnshardedFalcon(SharkLLMBase):
def __init__(
self,
model_name,
hf_model_path="tiiuae/falcon-7b-instruct",
hf_auth_token: str = "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",
max_num_tokens=150,
device="cuda",
precision="fp32",
falcon_mlir_path=None,
falcon_vmfb_path=None,
debug=False,
) -> None:
super().__init__(model_name, hf_model_path, max_num_tokens)
print("hf_model_path: ", self.hf_model_path)
if "180b" in self.model_name and hf_auth_token == None:
raise ValueError(
""" HF auth token required for falcon-180b. Pass it using the
--hf_auth_token flag. You can request access to the model
here: https://huggingface.co/tiiuae/falcon-180B-chat."""
)
self.hf_auth_token = hf_auth_token
self.max_padding_length = 100
self.device = device
self.precision = precision
self.falcon_vmfb_path = falcon_vmfb_path
self.falcon_mlir_path = falcon_mlir_path
self.debug = debug
self.tokenizer = self.get_tokenizer()
self.src_model = self.get_src_model()
self.shark_model = self.compile()
def get_tokenizer(self):
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
trust_remote_code=True,
token=self.hf_auth_token,
)
tokenizer.padding_side = "left"
tokenizer.pad_token_id = 11
return tokenizer
def get_src_model(self):
print("Loading src model: ", self.model_name)
kwargs = {
"torch_dtype": torch.float32,
"trust_remote_code": True,
"token": self.hf_auth_token,
}
if self.precision == "int4":
quantization_config = GPTQConfig(bits=4, disable_exllama=True)
kwargs["quantization_config"] = quantization_config
kwargs["device_map"] = "cpu"
falcon_model = AutoModelForCausalLM.from_pretrained(
self.hf_model_path, **kwargs
)
return falcon_model
def compile(self):
if args.use_precompiled_model:
if not self.falcon_vmfb_path.exists():
# Downloading VMFB from shark_tank
@@ -120,37 +706,37 @@ class Falcon(SharkLLMBase):
if vmfb is not None:
return vmfb
print(
f"[DEBUG] vmfb not found at {self.falcon_vmfb_path.absolute()}. Trying to work with"
f"[DEBUG] mlir path { self.falcon_mlir_path} {'exists' if self.falcon_mlir_path.exists() else 'does not exist'}"
)
print(f"[DEBUG] vmfb not found at {self.falcon_vmfb_path.absolute()}")
if self.falcon_mlir_path.exists():
print(f"[DEBUG] mlir found at {self.falcon_mlir_path.absolute()}")
with open(self.falcon_mlir_path, "rb") as f:
bytecode = f.read()
else:
mlir_generated = False
# Downloading MLIR from shark_tank
download_public_file(
"gs://shark_tank/falcon/"
+ "falcon_"
+ args.falcon_variant_to_use
+ "_"
+ self.precision
+ ".mlir",
self.falcon_mlir_path.absolute(),
single_file=True,
print(
f"[DEBUG] mlir not found at {self.falcon_mlir_path.absolute()}"
)
if self.falcon_mlir_path.exists():
with open(self.falcon_mlir_path, "rb") as f:
bytecode = f.read()
mlir_generated = True
else:
raise ValueError(
f"MLIR not found at {self.falcon_mlir_path.absolute()}"
" after downloading! Please check path and try again"
if args.load_mlir_from_shark_tank:
# Downloading MLIR from shark_tank
print(f"[DEBUG] Trying to download mlir from shark_tank")
download_public_file(
"gs://shark_tank/falcon/"
+ "falcon_"
+ args.falcon_variant_to_use
+ "_"
+ self.precision
+ ".mlir",
self.falcon_mlir_path.absolute(),
single_file=True,
)
if self.falcon_mlir_path.exists():
print(
f"[DEBUG] mlir found at {self.falcon_mlir_path.absolute()}"
)
mlir_generated = True
if not mlir_generated:
print(f"[DEBUG] generating MLIR locally")
compilation_input_ids = torch.randint(
low=1, high=10000, size=(1, 100)
)
@@ -167,9 +753,10 @@ class Falcon(SharkLLMBase):
ts_graph = import_with_fx(
model,
falconCompileInput,
is_f16=self.precision == "fp16",
is_f16=self.precision in ["fp16", "int4"],
f16_input_mask=[False, False],
mlir_type="torchscript",
is_gptq=self.precision == "int4",
)
del model
print(f"[DEBUG] generating torch mlir")
@@ -189,35 +776,37 @@ class Falcon(SharkLLMBase):
bytecode = bytecode_stream.getvalue()
del module
print(f"[DEBUG] writing mlir to file")
with open(f"{self.model_name}.mlir", "wb") as f_:
with redirect_stdout(f_):
print(module.operation.get_asm())
f_ = open(self.falcon_mlir_path, "wb")
f_.write(bytecode)
print("Saved falcon mlir at ", str(self.falcon_mlir_path))
f_.close()
del bytecode
shark_module = SharkInference(
mlir_module=bytecode, device=self.device, mlir_dialect="linalg"
mlir_module=self.falcon_mlir_path,
device=self.device,
mlir_dialect="linalg",
)
path = shark_module.save_module(
self.falcon_vmfb_path.parent.absolute(),
self.falcon_vmfb_path.stem,
extra_args=[
"--iree-hal-dump-executable-sources-to=ies",
"--iree-vm-target-truncate-unsupported-floats",
"--iree-codegen-check-ir-before-llvm-conversion=false",
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-spirv-index-bits=64",
],
]
+ [
"--iree-llvmcpu-use-fast-min-max-ops",
]
if self.precision == "int4"
else [],
debug=self.debug,
)
print("Saved falcon vmfb at ", str(path))
shark_module.load_module(path)
return shark_module
def compile(self):
falcon_shark_model = self.compile_falcon()
return falcon_shark_model
def generate(self, prompt):
model_inputs = self.tokenizer(
prompt,
@@ -345,7 +934,11 @@ class Falcon(SharkLLMBase):
all_text = prompt
start = time.time()
count = 0
for i in range(self.max_num_tokens - 1):
count = count + 1
next_token = self.generate_new_token()
new_word = self.tokenizer.decode(
next_token.cpu().numpy(),
@@ -372,6 +965,13 @@ class Falcon(SharkLLMBase):
):
break
end = time.time()
print(
"\n\nTime taken is {:.2f} seconds/token\n".format(
(end - start) / count
)
)
torch.cuda.empty_cache()
gc.collect()
@@ -387,7 +987,7 @@ class Falcon(SharkLLMBase):
(model_inputs["input_ids"], model_inputs["attention_mask"]),
)
)
if self.precision == "fp16":
if self.precision in ["fp16", "int4"]:
outputs = outputs.to(dtype=torch.float32)
next_token_logits = outputs
@@ -466,18 +1066,39 @@ if __name__ == "__main__":
else Path(args.falcon_vmfb_path)
)
falcon = Falcon(
"falcon_" + args.falcon_variant_to_use,
hf_model_path="tiiuae/falcon-"
+ args.falcon_variant_to_use
+ "-instruct",
device=args.device,
precision=args.precision,
falcon_mlir_path=falcon_mlir_path,
falcon_vmfb_path=falcon_vmfb_path,
)
if args.precision == "int4":
if args.falcon_variant_to_use == "180b":
hf_model_path_value = "TheBloke/Falcon-180B-Chat-GPTQ"
else:
hf_model_path_value = (
"TheBloke/falcon-"
+ args.falcon_variant_to_use
+ "-instruct-GPTQ"
)
else:
if args.falcon_variant_to_use == "180b":
hf_model_path_value = "tiiuae/falcon-180B-chat"
else:
hf_model_path_value = (
"tiiuae/falcon-" + args.falcon_variant_to_use + "-instruct"
)
import gc
if not args.sharded:
falcon = UnshardedFalcon(
model_name="falcon_" + args.falcon_variant_to_use,
hf_model_path=hf_model_path_value,
device=args.device,
precision=args.precision,
falcon_mlir_path=falcon_mlir_path,
falcon_vmfb_path=falcon_vmfb_path,
)
else:
falcon = ShardedFalcon(
model_name="falcon_" + args.falcon_variant_to_use,
hf_model_path=hf_model_path_value,
device=args.device,
precision=args.precision,
)
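# Illustrative usage (sketch; the script path is assumed to mirror the vicuna
# runner under apps/language_models/scripts/ and is not confirmed by this diff):
#
#   # unsharded falcon-7b-instruct, int4 GPTQ weights, on CPU
#   python apps/language_models/scripts/falcon.py \
#       --falcon_variant_to_use 7b -p int4 -d cpu
#
#   # falcon-180B-chat split across 4 ROCm devices
#   python apps/language_models/scripts/falcon.py \
#       --falcon_variant_to_use 180b -p fp16 -d rocm \
#       --sharded --num_shards 4 --hf_auth_token <token>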
default_prompt_text = "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:"
continue_execution = True
@@ -497,7 +1118,11 @@ if __name__ == "__main__":
prompt = input("Please enter the prompt text: ")
print("\nPrompt Text: ", prompt)
res_str = falcon.generate(prompt)
prompt_template = f"""A helpful assistant who helps the user with any questions asked.
User: {prompt}
Assistant:"""
res_str = falcon.generate(prompt_template)
torch.cuda.empty_cache()
gc.collect()
print(


@@ -126,17 +126,18 @@ def is_url(input_url):
import os
import tempfile
from shark.shark_inference import SharkInference
from shark.shark_importer import import_with_fx
from shark.shark_importer import import_with_fx, save_mlir
import torch
import torch_mlir
from torch_mlir.compiler_utils import run_pipeline_with_repro_report
from typing import List, Tuple
from io import BytesIO
from brevitas_examples.llm.llm_quant.quantize import quantize_model
from brevitas_examples.common.generative.quantize import quantize_model
from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
def brevitasmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rhs_scale: List[int], rhs_zero_point: List[int], rhs_bit_width: int, rhs_group_size: int) -> List[int]:
# fmt: off
def quantmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rhs_scale: List[int], rhs_zero_point: List[int], rhs_bit_width: int, rhs_group_size: int) -> List[int]:
if len(lhs) == 3 and len(rhs) == 2:
return [lhs[0], lhs[1], rhs[0]]
elif len(lhs) == 2 and len(rhs) == 2:
@@ -145,20 +146,21 @@ def brevitasmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rh
raise ValueError("Input shapes not supported.")
def brevitasmatmul_rhs_group_quant〡dtype(lhs_rank_dtype: Tuple[int, int], rhs_rank_dtype: Tuple[int, int], rhs_scale_rank_dtype: Tuple[int, int], rhs_zero_point_rank_dtype: Tuple[int, int], rhs_bit_width: int, rhs_group_size: int) -> int:
def quantmatmul_rhs_group_quant〡dtype(lhs_rank_dtype: Tuple[int, int], rhs_rank_dtype: Tuple[int, int], rhs_scale_rank_dtype: Tuple[int, int], rhs_zero_point_rank_dtype: Tuple[int, int], rhs_bit_width: int, rhs_group_size: int) -> int:
# output dtype is the dtype of the lhs float input
lhs_rank, lhs_dtype = lhs_rank_dtype
return lhs_dtype
def brevitasmatmul_rhs_group_quant〡has_value_semantics(lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size) -> None:
def quantmatmul_rhs_group_quant〡has_value_semantics(lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size) -> None:
return
brevitas_matmul_rhs_group_quant_library = [
brevitasmatmul_rhs_group_quant〡shape,
brevitasmatmul_rhs_group_quant〡dtype,
brevitasmatmul_rhs_group_quant〡has_value_semantics]
quantmatmul_rhs_group_quant〡shape,
quantmatmul_rhs_group_quant〡dtype,
quantmatmul_rhs_group_quant〡has_value_semantics]
# fmt: on
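# Descriptive note: the three functions above form the torch-mlir "extra
# library" for the custom quant.matmul_rhs_group_quant op -- shape inference
# (batched or plain matmul), result dtype (follows the float lhs), and value
# semantics -- so the op can be treated as backend-legal during lowering (see
# backend_legal_ops / extra_library in compile_int_precision below).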
def load_vmfb(extended_model_name, device, mlir_dialect, extra_args=[]):
@@ -176,7 +178,7 @@ def load_vmfb(extended_model_name, device, mlir_dialect, extra_args=[]):
def compile_module(
shark_module, extended_model_name, generate_vmfb, extra_args=[]
shark_module, extended_model_name, generate_vmfb, extra_args=[], debug=False,
):
if generate_vmfb:
vmfb_path = os.path.join(os.getcwd(), extended_model_name + ".vmfb")
@@ -188,7 +190,7 @@ def compile_module(
"No vmfb found. Compiling and saving to {}".format(vmfb_path)
)
path = shark_module.save_module(
os.getcwd(), extended_model_name, extra_args
os.getcwd(), extended_model_name, extra_args, debug=debug
)
shark_module.load_module(path, extra_args=extra_args)
else:
@@ -197,7 +199,7 @@ def compile_module(
def compile_int_precision(
model, inputs, precision, device, generate_vmfb, extended_model_name
model, inputs, precision, device, generate_vmfb, extended_model_name, debug=False
):
torchscript_module = import_with_fx(
model,
@@ -209,7 +211,7 @@ def compile_int_precision(
torchscript_module,
inputs,
output_type="torch",
backend_legal_ops=["brevitas.matmul_rhs_group_quant"],
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
@@ -217,7 +219,7 @@ def compile_int_precision(
print(f"[DEBUG] converting torch to linalg")
run_pipeline_with_repro_report(
mlir_module,
"builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
"builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
)
from contextlib import redirect_stdout
@@ -233,6 +235,12 @@ def compile_int_precision(
mlir_module = BytesIO(mlir_module)
bytecode = mlir_module.read()
print(f"Elided IR written for {extended_model_name}")
bytecode = save_mlir(
bytecode,
model_name=extended_model_name,
frontend="torch",
dir=os.getcwd(),
)
return bytecode
shark_module = SharkInference(
mlir_module=bytecode, device=device, mlir_dialect="tm_tensor"
@@ -249,6 +257,7 @@ def compile_int_precision(
extended_model_name=extended_model_name,
generate_vmfb=generate_vmfb,
extra_args=extra_args,
debug=debug,
),
bytecode,
)
@@ -292,6 +301,7 @@ def shark_compile_through_fx_int(
device,
generate_or_load_vmfb,
extended_model_name,
debug,
)
extra_args = [
"--iree-hal-dump-executable-sources-to=ies",


@@ -4,13 +4,49 @@ from transformers import AutoTokenizer, StoppingCriteria, AutoModelForCausalLM
from io import BytesIO
from pathlib import Path
from apps.language_models.utils import (
get_torch_mlir_module_bytecode,
get_vmfb_from_path,
)
from apps.language_models.src.pipelines.SharkLLMBase import SharkLLMBase
from apps.language_models.src.model_wrappers.stablelm_model import (
StableLMModel,
)
import argparse
parser = argparse.ArgumentParser(
prog="stablelm runner",
description="runs a StableLM model",
)
parser.add_argument(
"--precision", "-p", default="fp16", choices=["fp32", "fp16", "int4"]
)
parser.add_argument("--device", "-d", default="cuda", help="vulkan, cpu, cuda")
parser.add_argument(
"--stablelm_vmfb_path", default=None, help="path to StableLM's vmfb"
)
parser.add_argument(
"--stablelm_mlir_path",
default=None,
help="path to StableLM's mlir file",
)
parser.add_argument(
"--use_precompiled_model",
default=True,
action=argparse.BooleanOptionalAction,
help="use the precompiled vmfb",
)
parser.add_argument(
"--load_mlir_from_shark_tank",
default=True,
action=argparse.BooleanOptionalAction,
help="download precompiled mlir from shark tank",
)
parser.add_argument(
"--hf_auth_token",
type=str,
default=None,
help="Specify your own Hugging Face authentication token for the stablelm-3B model.",
)
class StopOnTokens(StoppingCriteria):
@@ -29,14 +65,24 @@ class SharkStableLM(SharkLLMBase):
self,
model_name,
hf_model_path="stabilityai/stablelm-tuned-alpha-3b",
max_num_tokens=512,
max_num_tokens=256,
device="cuda",
precision="fp32",
debug="False",
) -> None:
super().__init__(model_name, hf_model_path, max_num_tokens)
self.max_sequence_len = 256
self.device = device
if precision != "int4" and args.hf_auth_token == None:
raise ValueError(
""" HF auth token required for StableLM-3B. Pass it using the
--hf_auth_token flag. You can request access to the model on its
Hugging Face model page."""
)
self.hf_auth_token = args.hf_auth_token
self.precision = precision
self.debug = debug
self.tokenizer = self.get_tokenizer()
self.shark_model = self.compile()
@@ -48,9 +94,23 @@ class SharkStableLM(SharkLLMBase):
return False
def get_src_model(self):
kwargs = {}
if self.precision == "int4":
self.hf_model_path = "TheBloke/stablelm-zephyr-3b-GPTQ"
from transformers import GPTQConfig
quantization_config = GPTQConfig(bits=4, disable_exllama=True)
kwargs["quantization_config"] = quantization_config
kwargs["device_map"] = "cpu"
print("[DEBUG] Loading Model")
model = AutoModelForCausalLM.from_pretrained(
self.hf_model_path, torch_dtype=torch.float32
self.hf_model_path,
trust_remote_code=True,
torch_dtype=torch.float32,
use_auth_token=self.hf_auth_token,
**kwargs,
)
print("[DEBUG] Model loaded successfully")
return model
def get_model_inputs(self):
@@ -59,9 +119,7 @@ class SharkStableLM(SharkLLMBase):
return input_ids, attention_mask
def compile(self):
tmp_model_name = (
f"stableLM_linalg_{self.precision}_seqLen{self.max_sequence_len}"
)
tmp_model_name = f"{self.model_name}_linalg_{self.precision}_seqLen{self.max_sequence_len}"
# device = "cuda" # "cpu"
# TODO: vmfb and mlir name should include precision and device
@@ -81,13 +139,19 @@ class SharkStableLM(SharkLLMBase):
print(
f"[DEBUG] mlir path {mlir_path} {'exists' if mlir_path.exists() else 'does not exist'}"
)
if mlir_path.exists():
with open(mlir_path, "rb") as f:
bytecode = f.read()
else:
if not mlir_path.exists():
model = StableLMModel(self.get_src_model())
model_inputs = self.get_model_inputs()
ts_graph = get_torch_mlir_module_bytecode(model, model_inputs)
from shark.shark_importer import import_with_fx
ts_graph = import_with_fx(
model,
model_inputs,
is_f16=True if self.precision in ["fp16"] else False,
precision=self.precision,
f16_input_mask=[False, False],
mlir_type="torchscript",
)
module = torch_mlir.compile(
ts_graph,
[*model_inputs],
@@ -98,34 +162,43 @@ class SharkStableLM(SharkLLMBase):
bytecode_stream = BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(tmp_model_name + ".mlir", "wb")
f_.write(bytecode)
print("Saved mlir")
f_.close()
f_ = open(mlir_path, "wb")
f_.write(bytecode)
print("Saved mlir at: ", mlir_path)
f_.close()
del bytecode
from shark.shark_inference import SharkInference
shark_module = SharkInference(
mlir_module=bytecode, device=self.device, mlir_dialect="tm_tensor"
mlir_module=mlir_path, device=self.device, mlir_dialect="tm_tensor"
)
shark_module.compile()
path = shark_module.save_module(
vmfb_path.parent.absolute(), vmfb_path.stem
vmfb_path.parent.absolute(), vmfb_path.stem, debug=self.debug
)
print("Saved vmfb at ", str(path))
return shark_module
def get_tokenizer(self):
tok = AutoTokenizer.from_pretrained(self.hf_model_path)
tok = AutoTokenizer.from_pretrained(
self.hf_model_path,
use_auth_token=self.hf_auth_token,
)
tok.add_special_tokens({"pad_token": "<PAD>"})
# print("[DEBUG] Successfully loaded the tokenizer into memory")
return tok
def generate(self, prompt):
words_list = []
import time
start = time.time()
count = 0
for i in range(self.max_num_tokens):
count = count + 1
params = {
"new_text": prompt,
}
@@ -143,6 +216,12 @@ class SharkStableLM(SharkLLMBase):
if detok == "":
break
prompt = prompt + detok
end = time.time()
print(
"\n\nAverage generation speed is {:.2f} tokens/second\n".format(
count / (end - start)
)
)
return words_list
def generate_new_token(self, params):
@@ -176,10 +255,46 @@ class SharkStableLM(SharkLLMBase):
return ret_dict
# Initialize a StopOnTokens object
system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""
if __name__ == "__main__":
args = parser.parse_args()
stable_lm = SharkStableLM(
model_name="stablelm_zephyr_3b",
hf_model_path="stabilityai/stablelm-zephyr-3b",
device=args.device,
precision=args.precision,
)
default_prompt_text = "The weather is always wonderful"
continue_execution = True
print("\n-----\nScript executing for the following config: \n")
print("StableLM Model: ", stable_lm.hf_model_path)
print("Precision: ", args.precision)
print("Device: ", args.device)
while continue_execution:
use_default_prompt = input(
"\nDo you wish to use the default prompt text? Y/N ?: "
)
if use_default_prompt in ["Y", "y"]:
prompt = default_prompt_text
else:
prompt = input("Please enter the prompt text: ")
print("\nPrompt Text: ", prompt)
res_str = stable_lm.generate(prompt)
torch.cuda.empty_cache()
import gc
gc.collect()
print(
"\n\n-----\nHere's the complete formatted result: \n\n",
prompt + "".join(res_str),
)
continue_execution = input(
"\nDo you wish to run script one more time? Y/N ?: "
)
continue_execution = (
True if continue_execution in ["Y", "y"] else False
)


@@ -8,7 +8,7 @@ from shark.shark_downloader import download_public_file
# expects a Path / str as arg
# returns None if path not found or SharkInference module
def get_vmfb_from_path(vmfb_path, device, mlir_dialect):
def get_vmfb_from_path(vmfb_path, device, mlir_dialect, device_id=None):
if not isinstance(vmfb_path, Path):
vmfb_path = Path(vmfb_path)
@@ -20,7 +20,7 @@ def get_vmfb_from_path(vmfb_path, device, mlir_dialect):
print("Loading vmfb from: ", vmfb_path)
print("Device from get_vmfb_from_path - ", device)
shark_module = SharkInference(
None, device=device, mlir_dialect=mlir_dialect
None, device=device, mlir_dialect=mlir_dialect, device_idx=device_id
)
shark_module.load_module(vmfb_path)
print("Successfully loaded vmfb")
@@ -28,7 +28,13 @@ def get_vmfb_from_path(vmfb_path, device, mlir_dialect):
def get_vmfb_from_config(
shark_container, model, precision, device, vmfb_path, padding=None
shark_container,
model,
precision,
device,
vmfb_path,
padding=None,
device_id=None,
):
vmfb_url = (
f"gs://shark_tank/{shark_container}/{model}_{precision}_{device}"
@@ -37,4 +43,6 @@ def get_vmfb_from_config(
vmfb_url = vmfb_url + f"_{padding}"
vmfb_url = vmfb_url + ".vmfb"
download_public_file(vmfb_url, vmfb_path.absolute(), single_file=True)
return get_vmfb_from_path(vmfb_path, device, "tm_tensor")
return get_vmfb_from_path(
vmfb_path, device, "tm_tensor", device_id=device_id
)
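# Descriptive note: the URL assembled above follows the pattern
#   gs://shark_tank/<container>/<model>_<precision>_<device>[_<padding>].vmfb
# e.g. a hypothetical gs://shark_tank/llama/llama_fp16_vulkan_100.vmfb.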


@@ -0,0 +1,91 @@
from turbine_models.custom_models import stateless_llama
from shark.iree_utils.compile_utils import get_iree_compiled_module
from apps.shark_studio.api.utils import get_resource_path
import iree.runtime as ireert
import gc
import torch
llm_model_map = {
"llama2_7b": {
"initializer": stateless_llama.export_transformer_model,
"hf_model_name": "meta-llama/Llama-2-7b-chat-hf",
"stop_token": 2,
"max_tokens": 4096,
}
}
class LanguageModel:
def __init__(
self, model_name, hf_auth_token=None, device=None, precision="fp32"
):
print(llm_model_map[model_name])
self.hf_model_name = llm_model_map[model_name]["hf_model_name"]
self.torch_ir, self.tokenizer = llm_model_map[model_name][
"initializer"
](self.hf_model_name, hf_auth_token, compile_to="torch")
self.tempfile_name = get_resource_path("llm.torch.tempfile")
with open(self.tempfile_name, "w+") as f:
f.write(self.torch_ir)
del self.torch_ir
gc.collect()
self.device = device
self.precision = precision
self.max_tokens = llm_model_map[model_name]["max_tokens"]
self.iree_module_dict = None
self.compile()
def compile(self) -> None:
# this comes with keys: "vmfb", "config", and "temp_file_to_unlink".
self.iree_module_dict = get_iree_compiled_module(
self.tempfile_name, device=self.device, frontend="torch"
)
# TODO: delete the temp file
def chat(self, prompt):
history = []
for iter in range(self.max_tokens):
input_tensor = self.tokenizer(
prompt, return_tensors="pt"
).input_ids
device_inputs = [
ireert.asdevicearray(
self.iree_module_dict["config"], input_tensor
)
]
if iter == 0:
token = torch.tensor(
self.iree_module_dict["vmfb"]["run_initialize"](
*device_inputs
).to_host()[0][0]
)
else:
token = torch.tensor(
self.iree_module_dict["vmfb"]["run_forward"](
*device_inputs
).to_host()[0][0]
)
history.append(token)
yield self.tokenizer.decode(history)
if token == llm_model_map["llama2_7b"]["stop_token"]:
break
for i in range(len(history)):
if type(history[i]) != int:
history[i] = int(history[i])
result_output = self.tokenizer.decode(history)
yield result_output
if __name__ == "__main__":
lm = LanguageModel(
"llama2_7b",
hf_auth_token="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",
device="cpu-task",
)
print("model loaded")
for i in lm.chat("Hello, I am a robot."):
print(i)


@@ -0,0 +1,14 @@
import os
import sys
def get_available_devices():
return ["cpu-task"]
def get_resource_path(relative_path):
"""Get absolute path to resource, works for dev and for PyInstaller"""
base_path = getattr(
sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
)
return os.path.join(base_path, relative_path)


@@ -0,0 +1,428 @@
from multiprocessing import Process, freeze_support
import os
import sys
import logging
from ui.chat import chat_element
if sys.platform == "darwin":
os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
# import before IREE to avoid MLIR library issues
import torch_mlir
# import PIL, transformers, sentencepiece  # ensures inclusion in the PyInstaller exe generation
# from apps.stable_diffusion.src import args, clear_all
# import apps.stable_diffusion.web.utils.global_obj as global_obj
def launch_app(address):
from tkinter import Tk
import webview
window = Tk()
# get screen width and height of display and make it more reasonably
# sized as we aren't making it full-screen or maximized
width = int(window.winfo_screenwidth() * 0.81)
height = int(window.winfo_screenheight() * 0.91)
webview.create_window(
"SHARK AI Studio",
url=address,
width=width,
height=height,
text_select=True,
)
webview.start(private_mode=False, storage_path=os.getcwd())
if __name__ == "__main__":
# if args.debug:
logging.basicConfig(level=logging.DEBUG)
# required to do multiprocessing in a pyinstaller freeze
freeze_support()
# if args.api or "api" in args.ui.split(","):
# from apps.stable_diffusion.web.ui import (
# txt2img_api,
# img2img_api,
# upscaler_api,
# inpaint_api,
# outpaint_api,
# llm_chat_api,
# )
#
# from fastapi import FastAPI, APIRouter
# import uvicorn
#
# # init global sd pipeline and config
# global_obj._init()
#
# app = FastAPI()
# app.add_api_route("/sdapi/v1/txt2img", txt2img_api, methods=["post"])
# app.add_api_route("/sdapi/v1/img2img", img2img_api, methods=["post"])
# app.add_api_route("/sdapi/v1/inpaint", inpaint_api, methods=["post"])
# app.add_api_route("/sdapi/v1/outpaint", outpaint_api, methods=["post"])
# app.add_api_route("/sdapi/v1/upscaler", upscaler_api, methods=["post"])
#
# # chat APIs needed for compatibility with multiple extensions using OpenAI API
# app.add_api_route(
# "/v1/chat/completions", llm_chat_api, methods=["post"]
# )
# app.add_api_route("/v1/completions", llm_chat_api, methods=["post"])
# app.add_api_route("/chat/completions", llm_chat_api, methods=["post"])
# app.add_api_route("/completions", llm_chat_api, methods=["post"])
# app.add_api_route(
# "/v1/engines/codegen/completions", llm_chat_api, methods=["post"]
# )
# app.include_router(APIRouter())
# uvicorn.run(app, host="0.0.0.0", port=args.server_port)
# sys.exit(0)
#
# Setup to use shark_tmp for gradio's temporary image files and clear any
# existing temporary images there if they exist. Then we can import gradio.
# It has to be in this order or gradio ignores what we've set up.
# from apps.stable_diffusion.web.utils.gradio_configs import (
# config_gradio_tmp_imgs_folder,
# )
# config_gradio_tmp_imgs_folder()
import gradio as gr
# Create custom models folders if they don't exist
# from apps.stable_diffusion.web.ui.utils import create_custom_models_folders
# create_custom_models_folders()
def resource_path(relative_path):
"""Get absolute path to resource, works for dev and for PyInstaller"""
base_path = getattr(
sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
)
return os.path.join(base_path, relative_path)
dark_theme = resource_path("ui/css/sd_dark_theme.css")
# from apps.stable_diffusion.web.ui import (
# txt2img_web,
# txt2img_custom_model,
# txt2img_gallery,
# txt2img_png_info_img,
# txt2img_status,
# txt2img_sendto_img2img,
# txt2img_sendto_inpaint,
# txt2img_sendto_outpaint,
# txt2img_sendto_upscaler,
## h2ogpt_upload,
## h2ogpt_web,
# img2img_web,
# img2img_custom_model,
# img2img_gallery,
# img2img_init_image,
# img2img_status,
# img2img_sendto_inpaint,
# img2img_sendto_outpaint,
# img2img_sendto_upscaler,
# inpaint_web,
# inpaint_custom_model,
# inpaint_gallery,
# inpaint_init_image,
# inpaint_status,
# inpaint_sendto_img2img,
# inpaint_sendto_outpaint,
# inpaint_sendto_upscaler,
# outpaint_web,
# outpaint_custom_model,
# outpaint_gallery,
# outpaint_init_image,
# outpaint_status,
# outpaint_sendto_img2img,
# outpaint_sendto_inpaint,
# outpaint_sendto_upscaler,
# upscaler_web,
# upscaler_custom_model,
# upscaler_gallery,
# upscaler_init_image,
# upscaler_status,
# upscaler_sendto_img2img,
# upscaler_sendto_inpaint,
# upscaler_sendto_outpaint,
## lora_train_web,
## model_web,
## model_config_web,
# hf_models,
# modelmanager_sendto_txt2img,
# modelmanager_sendto_img2img,
# modelmanager_sendto_inpaint,
# modelmanager_sendto_outpaint,
# modelmanager_sendto_upscaler,
# stablelm_chat,
# minigpt4_web,
# outputgallery_web,
# outputgallery_tab_select,
# outputgallery_watch,
# outputgallery_filename,
# outputgallery_sendto_txt2img,
# outputgallery_sendto_img2img,
# outputgallery_sendto_inpaint,
# outputgallery_sendto_outpaint,
# outputgallery_sendto_upscaler,
# )
# init global sd pipeline and config
# global_obj._init()
def register_button_click(button, selectedid, inputs, outputs):
button.click(
lambda x: (
x[0]["name"] if len(x) != 0 else None,
gr.Tabs.update(selected=selectedid),
),
inputs,
outputs,
)
def register_modelmanager_button(button, selectedid, inputs, outputs):
button.click(
lambda x: (
"None",
x,
gr.Tabs.update(selected=selectedid),
),
inputs,
outputs,
)
def register_outputgallery_button(button, selectedid, inputs, outputs):
button.click(
lambda x: (
x,
gr.Tabs.update(selected=selectedid),
),
inputs,
outputs,
)
with gr.Blocks(
css=dark_theme, analytics_enabled=False, title="Stable Diffusion"
) as sd_web:
with gr.Tabs() as tabs:
# NOTE: If adding, removing, or re-ordering tabs, make sure that they
# have a unique id that doesn't clash with any of the other tabs,
# and that the order in the code here is the order they should
# appear in the ui, as the id value doesn't determine the order.
# Where possible, avoid changing the id of any tab that is the
# destination of one of the 'send to' buttons. If you do have to change
# that id, make sure you update the relevant register_button_click calls
# further down with the new id.
# with gr.TabItem(label="Text-to-Image", id=0):
# txt2img_web.render()
# with gr.TabItem(label="Image-to-Image", id=1):
# img2img_web.render()
# with gr.TabItem(label="Inpainting", id=2):
# inpaint_web.render()
# with gr.TabItem(label="Outpainting", id=3):
# outpaint_web.render()
# with gr.TabItem(label="Upscaler", id=4):
# upscaler_web.render()
# if args.output_gallery:
# with gr.TabItem(label="Output Gallery", id=5) as og_tab:
# outputgallery_web.render()
# # extra output gallery configuration
# outputgallery_tab_select(og_tab.select)
# outputgallery_watch(
# [
# txt2img_status,
# img2img_status,
# inpaint_status,
# outpaint_status,
# upscaler_status,
# ]
# )
## with gr.TabItem(label="Model Manager", id=6):
## model_web.render()
## with gr.TabItem(label="LoRA Training (Experimental)", id=7):
## lora_train_web.render()
with gr.TabItem(label="Chat Bot", id=0):
chat_element.render()
## with gr.TabItem(
## label="Generate Sharding Config (Experimental)", id=9
## ):
## model_config_web.render()
# with gr.TabItem(label="MultiModal (Experimental)", id=10):
# minigpt4_web.render()
# with gr.TabItem(label="DocuChat Upload", id=11):
# h2ogpt_upload.render()
# with gr.TabItem(label="DocuChat(Experimental)", id=12):
# h2ogpt_web.render()
# send to buttons
# register_button_click(
# txt2img_sendto_img2img,
# 1,
# [txt2img_gallery],
# [img2img_init_image, tabs],
# )
# register_button_click(
# txt2img_sendto_inpaint,
# 2,
# [txt2img_gallery],
# [inpaint_init_image, tabs],
# )
# register_button_click(
# txt2img_sendto_outpaint,
# 3,
# [txt2img_gallery],
# [outpaint_init_image, tabs],
# )
# register_button_click(
# txt2img_sendto_upscaler,
# 4,
# [txt2img_gallery],
# [upscaler_init_image, tabs],
# )
# register_button_click(
# img2img_sendto_inpaint,
# 2,
# [img2img_gallery],
# [inpaint_init_image, tabs],
# )
# register_button_click(
# img2img_sendto_outpaint,
# 3,
# [img2img_gallery],
# [outpaint_init_image, tabs],
# )
# register_button_click(
# img2img_sendto_upscaler,
# 4,
# [img2img_gallery],
# [upscaler_init_image, tabs],
# )
# register_button_click(
# inpaint_sendto_img2img,
# 1,
# [inpaint_gallery],
# [img2img_init_image, tabs],
# )
# register_button_click(
# inpaint_sendto_outpaint,
# 3,
# [inpaint_gallery],
# [outpaint_init_image, tabs],
# )
# register_button_click(
# inpaint_sendto_upscaler,
# 4,
# [inpaint_gallery],
# [upscaler_init_image, tabs],
# )
# register_button_click(
# outpaint_sendto_img2img,
# 1,
# [outpaint_gallery],
# [img2img_init_image, tabs],
# )
# register_button_click(
# outpaint_sendto_inpaint,
# 2,
# [outpaint_gallery],
# [inpaint_init_image, tabs],
# )
# register_button_click(
# outpaint_sendto_upscaler,
# 4,
# [outpaint_gallery],
# [upscaler_init_image, tabs],
# )
# register_button_click(
# upscaler_sendto_img2img,
# 1,
# [upscaler_gallery],
# [img2img_init_image, tabs],
# )
# register_button_click(
# upscaler_sendto_inpaint,
# 2,
# [upscaler_gallery],
# [inpaint_init_image, tabs],
# )
# register_button_click(
# upscaler_sendto_outpaint,
# 3,
# [upscaler_gallery],
# [outpaint_init_image, tabs],
# )
# if args.output_gallery:
# register_outputgallery_button(
# outputgallery_sendto_txt2img,
# 0,
# [outputgallery_filename],
# [txt2img_png_info_img, tabs],
# )
# register_outputgallery_button(
# outputgallery_sendto_img2img,
# 1,
# [outputgallery_filename],
# [img2img_init_image, tabs],
# )
# register_outputgallery_button(
# outputgallery_sendto_inpaint,
# 2,
# [outputgallery_filename],
# [inpaint_init_image, tabs],
# )
# register_outputgallery_button(
# outputgallery_sendto_outpaint,
# 3,
# [outputgallery_filename],
# [outpaint_init_image, tabs],
# )
# register_outputgallery_button(
# outputgallery_sendto_upscaler,
# 4,
# [outputgallery_filename],
# [upscaler_init_image, tabs],
# )
# register_modelmanager_button(
# modelmanager_sendto_txt2img,
# 0,
# [hf_models],
# [txt2img_custom_model, tabs],
# )
# register_modelmanager_button(
# modelmanager_sendto_img2img,
# 1,
# [hf_models],
# [img2img_custom_model, tabs],
# )
# register_modelmanager_button(
# modelmanager_sendto_inpaint,
# 2,
# [hf_models],
# [inpaint_custom_model, tabs],
# )
# register_modelmanager_button(
# modelmanager_sendto_outpaint,
# 3,
# [hf_models],
# [outpaint_custom_model, tabs],
# )
# register_modelmanager_button(
# modelmanager_sendto_upscaler,
# 4,
# [hf_models],
# [upscaler_custom_model, tabs],
# )
sd_web.queue()
# if args.ui == "app":
# t = Process(
# target=launch_app, args=[f"http://localhost:{args.server_port}"]
# )
# t.start()
sd_web.launch(
share=True,
inbrowser=True,
server_name="0.0.0.0",
server_port=11911, # args.server_port,
)


@@ -0,0 +1,517 @@
import gradio as gr
import os
from pathlib import Path
from datetime import datetime as dt
import json
import sys
from apps.shark_studio.api.utils import (
get_available_devices,
)
from apps.shark_studio.api.llm import (
llm_model_map,
LanguageModel,
)
def user(message, history):
# Append the user's message to the conversation history
return "", history + [[message, ""]]
language_model = None
# NOTE: Each `model_name` should have its own start message
start_message = {
"llama2_7b": (
"You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or "
"illegal content. Please ensure that your responses are socially "
"unbiased and positive in nature. If a question does not make any "
"sense, or is not factually coherent, explain why instead of "
"answering something not correct. If you don't know the answer "
"to a question, please don't share false information."
),
"llama2_13b": (
"You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or "
"illegal content. Please ensure that your responses are socially "
"unbiased and positive in nature. If a question does not make any "
"sense, or is not factually coherent, explain why instead of "
"answering something not correct. If you don't know the answer "
"to a question, please don't share false information."
),
"llama2_70b": (
"You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or "
"illegal content. Please ensure that your responses are socially "
"unbiased and positive in nature. If a question does not make any "
"sense, or is not factually coherent, explain why instead of "
"answering something not correct. If you don't know the answer "
"to a question, please don't share false information."
),
"vicuna": (
"A chat between a curious user and an artificial intelligence "
"assistant. The assistant gives helpful, detailed, and "
"polite answers to the user's questions.\n"
),
}
def create_prompt(model_name, history, prompt_prefix):
return ""
system_message = ""
if prompt_prefix:
system_message = start_message[model_name]
if "llama2" in model_name:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
conversation = "".join(
[f"{B_INST} {item[0]} {E_INST} {item[1]} " for item in history[1:]]
)
if prompt_prefix:
msg = f"{B_INST} {B_SYS}{system_message}{E_SYS}{history[0][0]} {E_INST} {history[0][1]} {conversation}"
else:
msg = f"{B_INST} {history[0][0]} {E_INST} {history[0][1]} {conversation}"
elif model_name in ["vicuna"]:
conversation = "".join(
[
"".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
for item in history
]
)
msg = system_message + conversation
msg = msg.strip()
else:
conversation = "".join(
["".join([item[0], item[1]]) for item in history]
)
msg = system_message + conversation
msg = msg.strip()
return msg
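
For reference, a standalone sketch that mirrors the llama2 `[INST]`/`<<SYS>>` branch of `create_prompt()` above; the system message and chat history below are illustrative stand-ins.

```python
# Mirrors the llama2 branch of create_prompt(); illustrative values only.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

system_message = "You are a helpful assistant."  # stands in for start_message[...]
history = [["Hi there", "Hello! How can I help?"], ["Name a colour", ""]]

conversation = "".join(
    f"{B_INST} {user} {E_INST} {assistant} " for user, assistant in history[1:]
)
prompt = (
    f"{B_INST} {B_SYS}{system_message}{E_SYS}"
    f"{history[0][0]} {E_INST} {history[0][1]} {conversation}"
)
print(prompt)
# [INST] <<SYS>>
# You are a helpful assistant.
# <</SYS>>
#
# Hi there [/INST] Hello! How can I help? [INST] Name a colour [/INST]
```
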
def get_default_config():
return False
import torch
from transformers import AutoTokenizer
hf_model_path = "TheBloke/vicuna-7B-1.1-HF"
tokenizer = AutoTokenizer.from_pretrained(hf_model_path, use_fast=False)
compilation_prompt = "".join(["0" for _ in range(17)])
compilation_input_ids = tokenizer(
compilation_prompt,
return_tensors="pt",
).input_ids
compilation_input_ids = torch.tensor(compilation_input_ids).reshape(
[1, 19]
)
firstVicunaCompileInput = (compilation_input_ids,)
from apps.language_models.src.model_wrappers.vicuna_model import (
CombinedModel,
)
from shark.shark_generate_model_config import GenerateConfigFile
model = CombinedModel()
c = GenerateConfigFile(model, 1, ["gpu_id"], firstVicunaCompileInput)
c.split_into_layers()
# model_vmfb_key = ""
def chat_fn(
prompt_prefix,
history,
model,
device,
precision,
download_vmfb,
config_file,
cli=False,
progress=gr.Progress(),
):
global language_model
if language_model is None:
language_model = LanguageModel(
model, device=device, precision=precision
)
language_model.chat(prompt_prefix)
return "", ""
global past_key_values
global model_vmfb_key
device_id = None
model_name, model_path = list(map(str.strip, model.split("=>")))
if "cuda" in device:
device = "cuda"
elif "sync" in device:
device = "cpu-sync"
elif "task" in device:
device = "cpu-task"
elif "vulkan" in device:
device_id = int(device.split("://")[1])
device = "vulkan"
elif "rocm" in device:
device = "rocm"
else:
print("unrecognized device")
from apps.language_models.scripts.vicuna import ShardedVicuna
from apps.language_models.scripts.vicuna import UnshardedVicuna
from apps.stable_diffusion.src import args
new_model_vmfb_key = f"{model_name}#{model_path}#{device}#{device_id}#{precision}#{download_vmfb}"
if vicuna_model is None or new_model_vmfb_key != model_vmfb_key:
model_vmfb_key = new_model_vmfb_key
max_toks = 128 if model_name == "codegen" else 512
# get iree flags that need to be overridden, from commandline args
_extra_args = []
# vulkan target triple
vulkan_target_triple = args.iree_vulkan_target_triple
from shark.iree_utils.vulkan_utils import (
get_all_vulkan_devices,
get_vulkan_target_triple,
)
if device == "vulkan":
vulkaninfo_list = get_all_vulkan_devices()
if vulkan_target_triple == "":
# We already have the device_id extracted via WebUI, so we directly use
# that to find the target triple.
vulkan_target_triple = get_vulkan_target_triple(
vulkaninfo_list[device_id]
)
_extra_args.append(
f"-iree-vulkan-target-triple={vulkan_target_triple}"
)
if "rdna" in vulkan_target_triple:
flags_to_add = [
"--iree-spirv-index-bits=64",
]
_extra_args = _extra_args + flags_to_add
if device_id is None:
id = 0
for device in vulkaninfo_list:
target_triple = get_vulkan_target_triple(
vulkaninfo_list[id]
)
if target_triple == vulkan_target_triple:
device_id = id
break
id += 1
assert (
device_id
), f"no vulkan hardware for target-triple '{vulkan_target_triple}' exists"
print(f"Will use vulkan target triple : {vulkan_target_triple}")
elif "rocm" in device:
# add iree rocm flags
_extra_args.append(
f"--iree-rocm-target-chip={args.iree_rocm_target_chip}"
)
print(f"extra args = {_extra_args}")
if model_name == "vicuna4":
vicuna_model = ShardedVicuna(
model_name,
hf_model_path=model_path,
device=device,
precision=precision,
max_num_tokens=max_toks,
compressed=True,
extra_args_cmd=_extra_args,
)
else:
# if config_file is None:
vicuna_model = UnshardedVicuna(
model_name,
hf_model_path=model_path,
hf_auth_token=args.hf_auth_token,
device=device,
vulkan_target_triple=vulkan_target_triple,
precision=precision,
max_num_tokens=max_toks,
download_vmfb=download_vmfb,
load_mlir_from_shark_tank=True,
extra_args_cmd=_extra_args,
device_id=device_id,
)
if vicuna_model is None:
sys.exit("Unable to instantiate the model object, exiting.")
prompt = create_prompt(model_name, history, prompt_prefix)
partial_text = ""
token_count = 0
total_time_ms = 0.001 # In order to avoid divide by zero error
prefill_time = 0
is_first = True
for text, msg, exec_time in progress.tqdm(
vicuna_model.generate(prompt, cli=cli),
desc="generating response",
):
if msg is None:
if is_first:
prefill_time = exec_time
is_first = False
else:
total_time_ms += exec_time
token_count += 1
partial_text += text + " "
history[-1][1] = partial_text
yield history, f"Prefill: {prefill_time:.2f}"
elif "formatted" in msg:
history[-1][1] = text
tokens_per_sec = (token_count / total_time_ms) * 1000
yield history, f"Prefill: {prefill_time:.2f} seconds\n Decode: {tokens_per_sec:.2f} tokens/sec"
else:
sys.exit(
"unexpected message from the vicuna generate call, exiting."
)
return history, ""
def llm_chat_api(InputData: dict):
return None
print(f"Input keys : {InputData.keys()}")
# print(f"model : {InputData['model']}")
is_chat_completion_api = (
"messages" in InputData.keys()
) # else it is the legacy `completion` api
# For Debugging input data from API
# if is_chat_completion_api:
# print(f"message -> role : {InputData['messages'][0]['role']}")
# print(f"message -> content : {InputData['messages'][0]['content']}")
# else:
# print(f"prompt : {InputData['prompt']}")
# print(f"max_tokens : {InputData['max_tokens']}") # Default to 128 for now
global vicuna_model
model_name = (
InputData["model"] if "model" in InputData.keys() else "codegen"
)
model_path = llm_model_map[model_name]
device = "cpu-task"
precision = "fp16"
max_toks = (
None
if "max_tokens" not in InputData.keys()
else InputData["max_tokens"]
)
if max_toks is None:
max_toks = 128 if model_name == "codegen" else 512
# make it work for codegen first
from apps.language_models.scripts.vicuna import (
UnshardedVicuna,
)
device_id = None
if vicuna_model == 0:
if "cuda" in device:
device = "cuda"
elif "sync" in device:
device = "cpu-sync"
elif "task" in device:
device = "cpu-task"
elif "vulkan" in device:
device_id = int(device.split("://")[1])
device = "vulkan"
else:
print("unrecognized device")
vicuna_model = UnshardedVicuna(
model_name,
hf_model_path=model_path,
device=device,
precision=precision,
max_num_tokens=max_toks,
download_vmfb=True,
load_mlir_from_shark_tank=True,
device_id=device_id,
)
# TODO: add role dict for different models
if is_chat_completion_api:
# TODO: add functionality for multiple messages
prompt = create_prompt(
model_name, [(InputData["messages"][0]["content"], "")]
)
else:
prompt = InputData["prompt"]
print("prompt = ", prompt)
res = vicuna_model.generate(prompt)
res_op = None
for op in res:
res_op = op
if is_chat_completion_api:
choices = [
{
"index": 0,
"message": {
"role": "assistant",
"content": res_op, # since we are yeilding the result
},
"finish_reason": "stop", # or length
}
]
else:
choices = [
{
"text": res_op,
"index": 0,
"logprobs": None,
"finish_reason": "stop", # or length
}
]
end_time = dt.now().strftime("%Y%m%d%H%M%S%f")
return {
"id": end_time,
"object": "chat.completion"
if is_chat_completion_api
else "text_completion",
"created": int(end_time),
"choices": choices,
}
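
For orientation, a hedged sketch of the request and response shapes the legacy body above targets; the field names follow the OpenAI-style completion schema, the concrete values are illustrative, and the early `return None` above means this path is currently not exercised.

```python
# Illustrative payloads only; not a guaranteed contract of the API.
chat_request = {
    "model": "codegen",
    "messages": [{"role": "user", "content": "Write a haiku about sharks."}],
    "max_tokens": 128,
}

chat_response = {
    "id": "20240109123000000000",   # dt.now().strftime("%Y%m%d%H%M%S%f")
    "object": "chat.completion",     # or "text_completion" for the legacy API
    "created": 20240109123000000000,
    "choices": [
        {
            "index": 0,
            "message": {"role": "assistant", "content": "<generated text>"},
            "finish_reason": "stop",
        }
    ],
}
```
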
def view_json_file(file_obj):
content = ""
with open(file_obj.name, "r") as fopen:
content = fopen.read()
return content
with gr.Blocks(title="Chat") as chat_element:
with gr.Row():
model_choices = list(llm_model_map.keys())
model = gr.Dropdown(
label="Select Model",
value=model_choices[0],
choices=model_choices,
allow_custom_value=True,
)
supported_devices = get_available_devices()
enabled = True
if len(supported_devices) == 0:
supported_devices = ["cpu-task"]
supported_devices = [x for x in supported_devices if "sync" not in x]
device = gr.Dropdown(
label="Device",
value=supported_devices[0],
choices=supported_devices,
interactive=enabled,
allow_custom_value=True,
)
precision = gr.Radio(
label="Precision",
value="int4",
choices=[
# "int4",
# "int8",
# "fp16",
"fp32",
],
visible=False,
)
tokens_time = gr.Textbox(label="Tokens generated per second")
with gr.Column():
download_vmfb = gr.Checkbox(
label="Download vmfb from Shark tank if available",
value=True,
interactive=True,
)
prompt_prefix = gr.Checkbox(
label="Add System Prompt",
value=False,
interactive=True,
)
chatbot = gr.Chatbot(height=500)
with gr.Row():
with gr.Column():
msg = gr.Textbox(
label="Chat Message Box",
placeholder="Chat Message Box",
show_label=False,
interactive=enabled,
container=False,
)
with gr.Column():
with gr.Row():
submit = gr.Button("Submit", interactive=enabled)
stop = gr.Button("Stop", interactive=enabled)
clear = gr.Button("Clear", interactive=enabled)
with gr.Row(visible=False):
with gr.Group():
config_file = gr.File(
label="Upload sharding configuration", visible=False
)
json_view_button = gr.Button(label="View as JSON", visible=False)
json_view = gr.JSON(interactive=True, visible=False)
json_view_button.click(
fn=view_json_file, inputs=[config_file], outputs=[json_view]
)
submit_event = msg.submit(
fn=user,
inputs=[msg, chatbot],
outputs=[msg, chatbot],
show_progress=False,
queue=False,
).then(
fn=chat_fn,
inputs=[
prompt_prefix,
chatbot,
model,
device,
precision,
download_vmfb,
config_file,
],
outputs=[chatbot, tokens_time],
show_progress=False,
queue=True,
)
submit_click_event = submit.click(
fn=user,
inputs=[msg, chatbot],
outputs=[msg, chatbot],
show_progress=False,
queue=False,
).then(
fn=chat_fn,
inputs=[
prompt_prefix,
chatbot,
model,
device,
precision,
download_vmfb,
config_file,
],
outputs=[chatbot, tokens_time],
show_progress=False,
queue=True,
)
stop.click(
fn=None,
inputs=None,
outputs=None,
cancels=[submit_event, submit_click_event],
queue=False,
)
clear.click(lambda: None, None, [chatbot], queue=False)

View File

@@ -7,16 +7,16 @@ Compile Commands FP32/FP16:
```shell
Vulkan AMD:
iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux /path/to/input/mlir -o /path/to/output/vmfb
# add --mlir-print-debuginfo --mlir-print-op-on-diagnostic=true for debug
# use iree-input-type=auto or "mhlo_legacy" or "stablehlo" for TF models
CUDA NVIDIA:
iree-compile --iree-input-type=none --iree-hal-target-backends=cuda --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
iree-compile --iree-input-type=none --iree-hal-target-backends=cuda /path/to/input/mlir -o /path/to/output/vmfb
CPU:
iree-compile --iree-input-type=none --iree-hal-target-backends=llvm-cpu --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 /path/to/input/mlir -o /path/to/output/vmfb
iree-compile --iree-input-type=none --iree-hal-target-backends=llvm-cpu /path/to/input/mlir -o /path/to/output/vmfb
```

View File

@@ -105,6 +105,7 @@ def main():
cpu_scheduling,
args.max_embeddings_multiples,
use_stencil=use_stencil,
control_mode=args.control_mode,
)
total_time = time.time() - start_time
text_output = f"prompt={args.prompts}"

View File

@@ -34,7 +34,7 @@ from PIL import Image
from tqdm.auto import tqdm
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from diffusers.loaders import AttnProcsLayers
from diffusers.models.cross_attention import LoRACrossAttnProcessor
from diffusers.models.attention_processor import LoRAXFormersAttnProcessor
import torch_mlir
from torch_mlir.dynamo import make_simple_dynamo_backend
@@ -287,7 +287,7 @@ def lora_train(
block_id = int(name[len("down_blocks.")])
hidden_size = unet.config.block_out_channels[block_id]
lora_attn_procs[name] = LoRACrossAttnProcessor(
lora_attn_procs[name] = LoRAXFormersAttnProcessor(
hidden_size=hidden_size,
cross_attention_dim=cross_attention_dim,
)
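
For context, the loop this hunk edits follows the standard diffusers recipe of building a per-layer map of LoRA attention processors keyed by the UNet's `attn_processors` names. A hedged, self-contained sketch of that recipe (the checkpoint id is illustrative, and the class names assume a diffusers release contemporary with this change):

```python
from diffusers import UNet2DConditionModel
from diffusers.models.attention_processor import LoRAXFormersAttnProcessor

# Illustrative checkpoint; any UNet with the standard SD block layout works.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet"
)

lora_attn_procs = {}
for name in unet.attn_processors.keys():
    # self-attention layers ("attn1") have no cross-attention dimension
    cross_attention_dim = (
        None
        if name.endswith("attn1.processor")
        else unet.config.cross_attention_dim
    )
    if name.startswith("mid_block"):
        hidden_size = unet.config.block_out_channels[-1]
    elif name.startswith("up_blocks"):
        block_id = int(name[len("up_blocks.")])
        hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
    elif name.startswith("down_blocks"):
        block_id = int(name[len("down_blocks.")])
        hidden_size = unet.config.block_out_channels[block_id]
    lora_attn_procs[name] = LoRAXFormersAttnProcessor(
        hidden_size=hidden_size,
        cross_attention_dim=cross_attention_dim,
    )

unet.set_attn_processor(lora_attn_procs)
```
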

View File

@@ -0,0 +1,96 @@
import torch
import time
from apps.stable_diffusion.src import (
args,
Text2ImageSDXLPipeline,
get_schedulers,
set_init_device_flags,
utils,
clear_all,
save_output_img,
)
def main():
if args.clear_all:
clear_all()
# TODO: prompt_embeds and text_embeds from base_model.json require fixing
args.precision = "fp16"
args.height = 1024
args.width = 1024
args.max_length = 77
args.scheduler = "DDIM"
print(
"Using default supported configuration for SDXL :-\nprecision=fp16, width*height= 1024*1024, max_length=77 and scheduler=DDIM"
)
dtype = torch.float32 if args.precision == "fp32" else torch.half
cpu_scheduling = not args.scheduler.startswith("Shark")
set_init_device_flags()
schedulers = get_schedulers(args.hf_model_id)
scheduler_obj = schedulers[args.scheduler]
seed = args.seed
txt2img_obj = Text2ImageSDXLPipeline.from_pretrained(
scheduler=scheduler_obj,
import_mlir=args.import_mlir,
model_id=args.hf_model_id,
ckpt_loc=args.ckpt_loc,
precision=args.precision,
max_length=args.max_length,
batch_size=args.batch_size,
height=args.height,
width=args.width,
use_base_vae=args.use_base_vae,
use_tuned=args.use_tuned,
custom_vae=args.custom_vae,
low_cpu_mem_usage=args.low_cpu_mem_usage,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
use_quantize=args.use_quantize,
ondemand=args.ondemand,
)
seeds = utils.batch_seeds(seed, args.batch_count, args.repeatable_seeds)
for current_batch in range(args.batch_count):
start_time = time.time()
generated_imgs = txt2img_obj.generate_images(
args.prompts,
args.negative_prompts,
args.batch_size,
args.height,
args.width,
args.steps,
args.guidance_scale,
seeds[current_batch],
args.max_length,
dtype,
args.use_base_vae,
cpu_scheduling,
args.max_embeddings_multiples,
)
total_time = time.time() - start_time
text_output = f"prompt={args.prompts}"
text_output += f"\nnegative prompt={args.negative_prompts}"
text_output += (
f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
)
text_output += f"\nscheduler={args.scheduler}, device={args.device}"
text_output += (
f"\nsteps={args.steps}, guidance_scale={args.guidance_scale},"
)
text_output += (
f"seed={seeds[current_batch]}, size={args.height}x{args.width}"
)
text_output += (
f", batch size={args.batch_size}, max_length={args.max_length}"
)
# TODO: when using --batch_count=x, txt2img_obj.log re-displays info from the start on every iteration
text_output += txt2img_obj.log
text_output += f"\nTotal image generation time: {total_time:.4f}sec"
save_output_img(generated_imgs[0], seed)
print(text_output)
if __name__ == "__main__":
main()

View File

@@ -19,6 +19,9 @@ a = Analysis(
win_private_assemblies=False,
cipher=block_cipher,
noarchive=False,
module_collection_mode={
'gradio': 'py', # Collect gradio package as source .py files
},
)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

View File

@@ -15,8 +15,8 @@ pathex = [
# datafiles for pyinstaller
datas = []
datas += collect_data_files("torch")
datas += copy_metadata("torch")
datas += copy_metadata("tokenizers")
datas += copy_metadata("tqdm")
datas += copy_metadata("regex")
datas += copy_metadata("requests")
@@ -30,32 +30,38 @@ datas += copy_metadata("safetensors")
datas += copy_metadata("Pillow")
datas += copy_metadata("sentencepiece")
datas += copy_metadata("pyyaml")
datas += copy_metadata("huggingface-hub")
datas += copy_metadata("gradio")
datas += collect_data_files("torch")
datas += collect_data_files("tokenizers")
datas += collect_data_files("tiktoken")
datas += collect_data_files("accelerate")
datas += collect_data_files("diffusers")
datas += collect_data_files("transformers")
datas += collect_data_files("pytorch_lightning")
datas += collect_data_files("opencv_python")
datas += collect_data_files("skimage")
datas += collect_data_files("gradio")
datas += collect_data_files("gradio_client")
datas += collect_data_files("iree")
datas += collect_data_files("google_cloud_storage")
datas += collect_data_files("shark")
datas += collect_data_files("shark", include_py_files=True)
datas += collect_data_files("timm", include_py_files=True)
datas += collect_data_files("tqdm")
datas += collect_data_files("tkinter")
datas += collect_data_files("webview")
datas += collect_data_files("sentencepiece")
datas += collect_data_files("jsonschema")
datas += collect_data_files("jsonschema_specifications")
datas += collect_data_files("cpuinfo")
datas += collect_data_files("langchain")
datas += collect_data_files("cv2")
datas += collect_data_files("einops")
datas += [
("src/utils/resources/prompts.json", "resources"),
("src/utils/resources/model_db.json", "resources"),
("src/utils/resources/opt_flags.json", "resources"),
("src/utils/resources/base_model.json", "resources"),
("web/ui/css/*", "ui/css"),
("web/ui/js/*", "ui/js"),
("web/ui/logos/*", "logos"),
(
"../language_models/src/pipelines/minigpt4_utils/configs/*",
@@ -71,7 +77,15 @@ datas += [
# hidden imports for pyinstaller
hiddenimports = ["shark", "shark.shark_inference", "apps"]
hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
hiddenimports += [x for x in collect_submodules("gradio") if "tests" not in x]
hiddenimports += [
x for x in collect_submodules("transformers") if "tests" not in x
x for x in collect_submodules("diffusers") if "tests" not in x
]
blacklist = ["tests", "convert"]
hiddenimports += [
x
for x in collect_submodules("transformers")
if not any(kw in x for kw in blacklist)
]
hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]
hiddenimports += ["iree._runtime"]

View File

@@ -9,6 +9,7 @@ from apps.stable_diffusion.src.utils import (
)
from apps.stable_diffusion.src.pipelines import (
Text2ImagePipeline,
Text2ImageSDXLPipeline,
Image2ImagePipeline,
InpaintPipeline,
OutpaintPipeline,

View File

@@ -1,5 +1,5 @@
from diffusers import AutoencoderKL, UNet2DConditionModel, ControlNetModel
from transformers import CLIPTextModel
from transformers import CLIPTextModel, CLIPTextModelWithProjection
from collections import defaultdict
from pathlib import Path
import torch
@@ -8,6 +8,7 @@ import traceback
import subprocess
import sys
import os
import requests
from apps.stable_diffusion.src.utils import (
compile_through_fx,
get_opt_flags,
@@ -16,12 +17,15 @@ from apps.stable_diffusion.src.utils import (
preprocessCKPT,
convert_original_vae,
get_path_to_diffusers_checkpoint,
get_civitai_checkpoint,
fetch_and_update_base_model_id,
get_path_stem,
get_extended_name,
get_stencil_model_id,
update_lora_weight,
)
from shark.shark_downloader import download_public_file
from shark.shark_inference import SharkInference
# These shapes are parameter dependent.
@@ -53,6 +57,10 @@ def replace_shape_str(shape, max_len, width, height, batch_size):
new_shape.append(math.ceil(height / div_val))
elif "width" in shape[i]:
new_shape.append(math.ceil(width / div_val))
elif "+" in shape[i]:
# Currently this case is only hit for SDXL. If any other case
# requires this operator, change this accordingly.
new_shape.append(height + width)
else:
new_shape.append(shape[i])
return new_shape
@@ -65,6 +73,70 @@ def check_compilation(model, model_name):
)
def shark_compile_after_ir(
module_name,
device,
vmfb_path,
precision,
ir_path=None,
):
if ir_path:
print(f"[DEBUG] mlir found at {ir_path.absolute()}")
module = SharkInference(
mlir_module=ir_path,
device=device,
mlir_dialect="tm_tensor",
)
print(f"Will get extra flag for {module_name} and precision = {precision}")
path = module.save_module(
vmfb_path.parent.absolute(),
vmfb_path.stem,
extra_args=get_opt_flags(module_name, precision=precision),
)
print(f"Saved {module_name} vmfb at {path}")
module.load_module(path)
return module
def process_vmfb_ir_sdxl(extended_model_name, model_name, device, precision):
name_split = extended_model_name.split("_")
if "vae" in model_name:
name_split[5] = "fp32"
extended_model_name_for_vmfb = "_".join(name_split)
extended_model_name_for_mlir = "_".join(name_split[:-1])
vmfb_path = Path(extended_model_name_for_vmfb + ".vmfb")
if "vulkan" in device:
_device = args.iree_vulkan_target_triple
_device = _device.replace("-", "_")
vmfb_path = Path(extended_model_name_for_vmfb + f"_vulkan.vmfb")
if vmfb_path.exists():
shark_module = SharkInference(
None,
device=device,
mlir_dialect="tm_tensor",
)
print(f"loading existing vmfb from: {vmfb_path}")
shark_module.load_module(vmfb_path, extra_args=[])
return shark_module, None
mlir_path = Path(extended_model_name_for_mlir + ".mlir")
if not mlir_path.exists():
print(f"Looking into gs://shark_tank/SDXL/mlir/{mlir_path.name}")
download_public_file(
f"gs://shark_tank/SDXL/mlir/{mlir_path.name}",
mlir_path.absolute(),
single_file=True,
)
if mlir_path.exists():
return (
shark_compile_after_ir(
model_name, device, vmfb_path, precision, mlir_path
),
None,
)
return None, None
class SharkifyStableDiffusionModel:
def __init__(
self,
@@ -84,31 +156,33 @@ class SharkifyStableDiffusionModel:
generate_vmfb: bool = True,
is_inpaint: bool = False,
is_upscaler: bool = False,
use_stencil: str = None,
is_sdxl: bool = False,
stencils: list[str] = [],
use_lora: str = "",
lora_strength: float = 0.75,
use_quantize: str = None,
return_mlir: bool = False,
favored_base_models=None,
):
self.check_params(max_len, width, height)
self.max_len = max_len
self.is_sdxl = is_sdxl
self.height = height // 8
self.width = width // 8
self.batch_size = batch_size
self.custom_weights = custom_weights
self.custom_weights = custom_weights.strip()
self.use_quantize = use_quantize
if custom_weights != "":
if "civitai" in custom_weights:
weights_id = custom_weights.split("/")[-1]
# TODO: use model name and identify file type by civitai rest api
weights_path = (
str(Path.cwd()) + "/models/" + weights_id + ".safetensors"
)
if not os.path.isfile(weights_path):
subprocess.run(
["wget", custom_weights, "-O", weights_path]
)
if custom_weights.startswith("https://civitai.com/api/"):
# download the checkpoint from civitai if we don't already have it
weights_path = get_civitai_checkpoint(custom_weights)
# act as if we were given the local file as custom_weights originally
custom_weights = get_path_to_diffusers_checkpoint(weights_path)
self.custom_weights = weights_path
# needed to ensure webui sets the correct model name metadata
args.ckpt_loc = weights_path
else:
assert custom_weights.lower().endswith(
(".ckpt", ".safetensors")
@@ -116,10 +190,9 @@ class SharkifyStableDiffusionModel:
custom_weights = get_path_to_diffusers_checkpoint(
custom_weights
)
self.model_id = model_id if custom_weights == "" else custom_weights
# TODO: remove the following line when stable-diffusion-2-1 works
if self.model_id == "stabilityai/stable-diffusion-2-1":
self.model_id = "stabilityai/stable-diffusion-2-1-base"
self.favored_base_models = favored_base_models
self.custom_vae = custom_vae
self.precision = precision
self.base_vae = use_base_vae
@@ -135,6 +208,7 @@ class SharkifyStableDiffusionModel:
+ "_"
+ precision
)
self.model_namedata = self.model_name
print(f"use_tuned? sharkify: {use_tuned}")
self.use_tuned = use_tuned
if use_tuned:
@@ -143,12 +217,17 @@ class SharkifyStableDiffusionModel:
self.low_cpu_mem_usage = low_cpu_mem_usage
self.is_inpaint = is_inpaint
self.is_upscaler = is_upscaler
self.use_stencil = get_stencil_model_id(use_stencil)
self.stencils = [get_stencil_model_id(x) for x in stencils]
if use_lora != "":
self.model_name = self.model_name + "_" + get_path_stem(use_lora)
self.model_name = (
self.model_name
+ "_"
+ get_path_stem(use_lora)
+ f"@{int(lora_strength*100)}"
)
self.use_lora = use_lora
self.lora_strength = lora_strength
print(self.model_name)
self.model_name = self.get_extended_name_for_all_model()
self.debug = debug
self.sharktank_dir = sharktank_dir
@@ -170,17 +249,22 @@ class SharkifyStableDiffusionModel:
args.hf_model_id = self.base_model_id
self.return_mlir = return_mlir
def get_extended_name_for_all_model(self):
def get_extended_name_for_all_model(self, model_list=None):
model_name = {}
sub_model_list = [
"clip",
"clip2",
"unet",
"unet512",
"stencil_unet",
"stencil_unet_512",
"vae",
"vae_encode",
"stencil_adaptor",
"stencil_adapter",
"stencil_adapter_512",
]
if model_list is not None:
sub_model_list = model_list
index = 0
for model in sub_model_list:
sub_model = model
@@ -192,10 +276,24 @@ class SharkifyStableDiffusionModel:
)
if self.base_vae:
sub_model = "base_vae"
if "stencil_adaptor" == model and self.use_stencil is not None:
model_config = model_config + get_path_stem(self.use_stencil)
model_name[model] = get_extended_name(sub_model + model_config)
index += 1
if "stencil_adapter" in model:
stencil_names = []
for i, stencil in enumerate(self.stencils):
if stencil is not None:
cnet_config = (
self.model_namedata
+ "_sd15_"
+ stencil.split("_")[-1]
)
stencil_names.append(
get_extended_name(sub_model + cnet_config)
)
model_name[model] = stencil_names
else:
model_name[model] = get_extended_name(sub_model + model_config)
index += 1
return model_name
def check_params(self, max_len, width, height):
@@ -339,13 +437,113 @@ class SharkifyStableDiffusionModel:
)
return shark_vae, vae_mlir
def get_controlled_unet(self):
def get_vae_sdxl(self):
# TODO: Remove this after convergence with shark_tank. This should just be part of
# opt_params.py.
shark_module_or_none = process_vmfb_ir_sdxl(
self.model_name["vae"], "vae", args.device, self.precision
)
if shark_module_or_none[0]:
return shark_module_or_none
class VaeModel(torch.nn.Module):
def __init__(
self,
model_id=self.model_id,
base_vae=self.base_vae,
custom_vae=self.custom_vae,
low_cpu_mem_usage=False,
):
super().__init__()
self.vae = None
if custom_vae == "":
print(f"Loading default vae, with target {model_id}")
self.vae = AutoencoderKL.from_pretrained(
model_id,
subfolder="vae",
low_cpu_mem_usage=low_cpu_mem_usage,
)
elif not isinstance(custom_vae, dict):
precision = "fp16" if "fp16" in custom_vae else None
print(f"Loading custom vae, with target {custom_vae}")
if os.path.exists(custom_vae):
self.vae = AutoencoderKL.from_pretrained(
custom_vae,
low_cpu_mem_usage=low_cpu_mem_usage,
)
else:
custom_vae = "/".join(
[
custom_vae.split("/")[-2].split("\\")[-1],
custom_vae.split("/")[-1],
]
)
print("Using hub to get custom vae")
try:
self.vae = AutoencoderKL.from_pretrained(
custom_vae,
low_cpu_mem_usage=low_cpu_mem_usage,
variant=precision,
)
except:
self.vae = AutoencoderKL.from_pretrained(
custom_vae,
low_cpu_mem_usage=low_cpu_mem_usage,
)
else:
print(f"Loading custom vae, with state {custom_vae}")
self.vae = AutoencoderKL.from_pretrained(
model_id,
subfolder="vae",
low_cpu_mem_usage=low_cpu_mem_usage,
)
self.vae.load_state_dict(custom_vae)
self.base_vae = base_vae
def forward(self, latents):
image = self.vae.decode(latents / 0.13025, return_dict=False)[
0
]
return image
vae = VaeModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
inputs = tuple(self.inputs["vae"])
# Make sure the VAE is in float32 mode, as it overflows in float16 as per SDXL
# pipeline.
if not self.custom_vae:
is_f16 = False
elif "16" in self.custom_vae:
is_f16 = True
else:
is_f16 = False
save_dir = os.path.join(self.sharktank_dir, self.model_name["vae"])
if self.debug:
os.makedirs(save_dir, exist_ok=True)
shark_vae, vae_mlir = compile_through_fx(
vae,
inputs,
is_f16=is_f16,
use_tuned=self.use_tuned,
extended_model_name=self.model_name["vae"],
debug=self.debug,
generate_vmfb=self.generate_vmfb,
save_dir=save_dir,
extra_args=get_opt_flags("vae", precision=self.precision),
base_model_id=self.base_model_id,
model_name="vae",
precision=self.precision,
return_mlir=self.return_mlir,
)
return shark_vae, vae_mlir
def get_controlled_unet(self, use_large=False):
class ControlledUnetModel(torch.nn.Module):
def __init__(
self,
model_id=self.model_id,
low_cpu_mem_usage=False,
use_lora=self.use_lora,
lora_strength=self.lora_strength,
):
super().__init__()
self.unet = UNet2DConditionModel.from_pretrained(
@@ -354,8 +552,10 @@ class SharkifyStableDiffusionModel:
low_cpu_mem_usage=low_cpu_mem_usage,
)
if use_lora != "":
update_lora_weight(self.unet, use_lora, "unet")
self.in_channels = self.unet.in_channels
update_lora_weight(
self.unet, use_lora, "unet", lora_strength
)
self.in_channels = self.unet.config.in_channels
self.train(False)
def forward(
@@ -377,25 +577,54 @@ class SharkifyStableDiffusionModel:
control11,
control12,
control13,
scale1,
scale2,
scale3,
scale4,
scale5,
scale6,
scale7,
scale8,
scale9,
scale10,
scale11,
scale12,
scale13,
):
# TODO: Average pooling
db_res_samples = [
control1,
control2,
control3,
control4,
control5,
control6,
control7,
control8,
control9,
control10,
control11,
control12,
]
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
db_res_samples = tuple(
[
control1,
control2,
control3,
control4,
control5,
control6,
control7,
control8,
control9,
control10,
control11,
control12,
control1 * scale1,
control2 * scale2,
control3 * scale3,
control4 * scale4,
control5 * scale5,
control6 * scale6,
control7 * scale7,
control8 * scale8,
control9 * scale9,
control10 * scale10,
control11 * scale11,
control12 * scale12,
]
)
mb_res_samples = control13
mb_res_samples = control13 * scale13
latents = torch.cat([latent] * 2)
unet_out = self.unet.forward(
latents,
@@ -415,6 +644,16 @@ class SharkifyStableDiffusionModel:
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["unet"])
model_name = "stencil_unet"
if use_large:
pad = (0, 0) * (len(inputs[2].shape) - 2)
pad = pad + (0, 512 - inputs[2].shape[1])
inputs = (
inputs[:2]
+ (torch.nn.functional.pad(inputs[2], pad),)
+ inputs[3:]
)
model_name = "stencil_unet_512"
input_mask = [
True,
True,
@@ -433,33 +672,48 @@ class SharkifyStableDiffusionModel:
True,
True,
True,
True,
True,
True,
True,
True,
True,
True,
True,
True,
True,
True,
True,
True,
]
shark_controlled_unet, controlled_unet_mlir = compile_through_fx(
unet,
inputs,
extended_model_name=self.model_name["stencil_unet"],
extended_model_name=self.model_name[model_name],
is_f16=is_f16,
f16_input_mask=input_mask,
use_tuned=self.use_tuned,
extra_args=get_opt_flags("unet", precision=self.precision),
base_model_id=self.base_model_id,
model_name="stencil_unet",
model_name=model_name,
precision=self.precision,
return_mlir=self.return_mlir,
)
return shark_controlled_unet, controlled_unet_mlir
def get_control_net(self):
def get_control_net(self, stencil_id, use_large=False):
stencil_id = get_stencil_model_id(stencil_id)
adapter_id, base_model_safe_id, ext_model_name = (None, None, None)
print(f"Importing ControlNet adapter from {stencil_id}")
class StencilControlNetModel(torch.nn.Module):
def __init__(
self, model_id=self.use_stencil, low_cpu_mem_usage=False
):
def __init__(self, model_id=stencil_id, low_cpu_mem_usage=False):
super().__init__()
self.cnet = ControlNetModel.from_pretrained(
model_id,
low_cpu_mem_usage=low_cpu_mem_usage,
)
self.in_channels = self.cnet.in_channels
self.in_channels = self.cnet.config.in_channels
self.train(False)
def forward(
@@ -468,6 +722,19 @@ class SharkifyStableDiffusionModel:
timestep,
text_embedding,
stencil_image_input,
acc1,
acc2,
acc3,
acc4,
acc5,
acc6,
acc7,
acc8,
acc9,
acc10,
acc11,
acc12,
acc13,
):
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
# TODO: guidance NOT NEEDED; change in `get_input_info` later
@@ -489,6 +756,20 @@ class SharkifyStableDiffusionModel:
)
return tuple(
list(down_block_res_samples) + [mid_block_res_sample]
) + (
acc1 + down_block_res_samples[0],
acc2 + down_block_res_samples[1],
acc3 + down_block_res_samples[2],
acc4 + down_block_res_samples[3],
acc5 + down_block_res_samples[4],
acc6 + down_block_res_samples[5],
acc7 + down_block_res_samples[6],
acc8 + down_block_res_samples[7],
acc9 + down_block_res_samples[8],
acc10 + down_block_res_samples[9],
acc11 + down_block_res_samples[10],
acc12 + down_block_res_samples[11],
acc13 + mid_block_res_sample,
)
scnet = StencilControlNetModel(
@@ -496,18 +777,47 @@ class SharkifyStableDiffusionModel:
)
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["stencil_adaptor"])
input_mask = [True, True, True, True]
inputs = tuple(self.inputs["stencil_adapter"])
model_name = "stencil_adapter_512" if use_large else "stencil_adapter"
stencil_names = self.get_extended_name_for_all_model([model_name])
ext_model_name = stencil_names[model_name]
if isinstance(ext_model_name, list):
desired_name = None
print(ext_model_name)
for i in ext_model_name:
if stencil_id.split("_")[-1] in i:
desired_name = i
else:
continue
if desired_name:
ext_model_name = desired_name
else:
raise Exception(
f"Could not find extended configuration for {stencil_id}"
)
if use_large:
pad = (0, 0) * (len(inputs[2].shape) - 2)
pad = pad + (0, 512 - inputs[2].shape[1])
inputs = (
inputs[0],
inputs[1],
torch.nn.functional.pad(inputs[2], pad),
*inputs[3:],
)
save_dir = os.path.join(self.sharktank_dir, ext_model_name)
input_mask = [True, True, True, True] + ([True] * 13)
shark_cnet, cnet_mlir = compile_through_fx(
scnet,
inputs,
extended_model_name=self.model_name["stencil_adaptor"],
extended_model_name=ext_model_name,
is_f16=is_f16,
f16_input_mask=input_mask,
use_tuned=self.use_tuned,
extra_args=get_opt_flags("unet", precision=self.precision),
base_model_id=self.base_model_id,
model_name="stencil_adaptor",
model_name=model_name,
precision=self.precision,
return_mlir=self.return_mlir,
)
@@ -520,6 +830,7 @@ class SharkifyStableDiffusionModel:
model_id=self.model_id,
low_cpu_mem_usage=False,
use_lora=self.use_lora,
lora_strength=self.lora_strength,
):
super().__init__()
self.unet = UNet2DConditionModel.from_pretrained(
@@ -528,7 +839,9 @@ class SharkifyStableDiffusionModel:
low_cpu_mem_usage=low_cpu_mem_usage,
)
if use_lora != "":
update_lora_weight(self.unet, use_lora, "unet")
update_lora_weight(
self.unet, use_lora, "unet", lora_strength
)
self.in_channels = self.unet.config.in_channels
self.train(False)
if (
@@ -658,6 +971,101 @@ class SharkifyStableDiffusionModel:
)
return shark_unet, unet_mlir
def get_unet_sdxl(self):
# TODO: Remove this after convergence with shark_tank. This should just be part of
# opt_params.py.
shark_module_or_none = process_vmfb_ir_sdxl(
self.model_name["unet"], "unet", args.device, self.precision
)
if shark_module_or_none[0]:
return shark_module_or_none
class UnetModel(torch.nn.Module):
def __init__(
self,
model_id=self.model_id,
low_cpu_mem_usage=False,
):
super().__init__()
try:
self.unet = UNet2DConditionModel.from_pretrained(
model_id,
subfolder="unet",
low_cpu_mem_usage=low_cpu_mem_usage,
variant="fp16",
)
except:
self.unet = UNet2DConditionModel.from_pretrained(
model_id,
subfolder="unet",
low_cpu_mem_usage=low_cpu_mem_usage,
)
if (
args.attention_slicing is not None
and args.attention_slicing != "none"
):
if args.attention_slicing.isdigit():
self.unet.set_attention_slice(
int(args.attention_slicing)
)
else:
self.unet.set_attention_slice(args.attention_slicing)
def forward(
self,
latent,
timestep,
prompt_embeds,
text_embeds,
time_ids,
guidance_scale,
):
added_cond_kwargs = {
"text_embeds": text_embeds,
"time_ids": time_ids,
}
noise_pred = self.unet.forward(
latent,
timestep,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=None,
added_cond_kwargs=added_cond_kwargs,
return_dict=False,
)[0]
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (
noise_pred_text - noise_pred_uncond
)
return noise_pred
unet = UnetModel(low_cpu_mem_usage=self.low_cpu_mem_usage)
is_f16 = True if self.precision == "fp16" else False
inputs = tuple(self.inputs["unet"])
save_dir = os.path.join(self.sharktank_dir, self.model_name["unet"])
input_mask = [True, True, True, True, True, True]
if self.debug:
os.makedirs(
save_dir,
exist_ok=True,
)
shark_unet, unet_mlir = compile_through_fx(
unet,
inputs,
extended_model_name=self.model_name["unet"],
is_f16=is_f16,
f16_input_mask=input_mask,
use_tuned=self.use_tuned,
debug=self.debug,
generate_vmfb=self.generate_vmfb,
save_dir=save_dir,
extra_args=get_opt_flags("unet", precision=self.precision),
base_model_id=self.base_model_id,
model_name="unet",
precision=self.precision,
return_mlir=self.return_mlir,
)
return shark_unet, unet_mlir
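
As an aside on `UnetModel.forward` above, a minimal sketch of the classifier-free guidance blend it applies after the UNet call; the tensor shapes are illustrative.

```python
import torch

# The batch carries [unconditional, text-conditioned] halves; the guided
# prediction pushes away from the unconditional output by guidance_scale.
noise_pred = torch.randn(2, 4, 128, 128)   # stand-in for the raw UNet output
guidance_scale = 7.5

noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (
    noise_pred_text - noise_pred_uncond
)
print(noise_pred.shape)  # torch.Size([1, 4, 128, 128])
```
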
def get_clip(self):
class CLIPText(torch.nn.Module):
def __init__(
@@ -665,6 +1073,7 @@ class SharkifyStableDiffusionModel:
model_id=self.model_id,
low_cpu_mem_usage=False,
use_lora=self.use_lora,
lora_strength=self.lora_strength,
):
super().__init__()
self.text_encoder = CLIPTextModel.from_pretrained(
@@ -674,15 +1083,21 @@ class SharkifyStableDiffusionModel:
)
if use_lora != "":
update_lora_weight(
self.text_encoder, use_lora, "text_encoder"
self.text_encoder,
use_lora,
"text_encoder",
lora_strength,
)
def forward(self, input):
return self.text_encoder(input)[0]
clip_model = CLIPText(low_cpu_mem_usage=self.low_cpu_mem_usage)
save_dir = os.path.join(self.sharktank_dir, self.model_name["clip"])
save_dir = ""
if self.debug:
save_dir = os.path.join(
self.sharktank_dir, self.model_name["clip"]
)
os.makedirs(
save_dir,
exist_ok=True,
@@ -702,6 +1117,78 @@ class SharkifyStableDiffusionModel:
)
return shark_clip, clip_mlir
def get_clip_sdxl(self, clip_index=1):
if clip_index == 1:
extended_model_name = self.model_name["clip"]
model_name = "clip"
else:
extended_model_name = self.model_name["clip2"]
model_name = "clip2"
# TODO: Remove this after convergence with shark_tank. This should just be part of
# opt_params.py.
shark_module_or_none = process_vmfb_ir_sdxl(
extended_model_name, f"clip", args.device, self.precision
)
if shark_module_or_none[0]:
return shark_module_or_none
class CLIPText(torch.nn.Module):
def __init__(
self,
model_id=self.model_id,
low_cpu_mem_usage=False,
clip_index=1,
):
super().__init__()
if clip_index == 1:
self.text_encoder = CLIPTextModel.from_pretrained(
model_id,
subfolder="text_encoder",
low_cpu_mem_usage=low_cpu_mem_usage,
)
else:
self.text_encoder = (
CLIPTextModelWithProjection.from_pretrained(
model_id,
subfolder="text_encoder_2",
low_cpu_mem_usage=low_cpu_mem_usage,
)
)
def forward(self, input):
prompt_embeds = self.text_encoder(
input,
output_hidden_states=True,
)
# We are only ALWAYS interested in the pooled output of the final text encoder
pooled_prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.hidden_states[-2]
return prompt_embeds, pooled_prompt_embeds
clip_model = CLIPText(
low_cpu_mem_usage=self.low_cpu_mem_usage, clip_index=clip_index
)
save_dir = os.path.join(self.sharktank_dir, extended_model_name)
if self.debug:
os.makedirs(
save_dir,
exist_ok=True,
)
shark_clip, clip_mlir = compile_through_fx(
clip_model,
tuple(self.inputs["clip"]),
extended_model_name=extended_model_name,
debug=self.debug,
generate_vmfb=self.generate_vmfb,
save_dir=save_dir,
extra_args=get_opt_flags("clip", precision="fp32"),
base_model_id=self.base_model_id,
model_name="clip",
precision=self.precision,
return_mlir=self.return_mlir,
)
return shark_clip, clip_mlir
def process_custom_vae(self):
custom_vae = self.custom_vae.lower()
if not custom_vae.endswith((".ckpt", ".safetensors")):
@@ -734,7 +1221,9 @@ class SharkifyStableDiffusionModel:
}
return vae_dict
def compile_unet_variants(self, model, use_large=False):
def compile_unet_variants(self, model, use_large=False, base_model=""):
if self.is_sdxl:
return self.get_unet_sdxl()
if model == "unet":
if self.is_upscaler:
return self.get_unet_upscaler(use_large=use_large)
@@ -748,7 +1237,7 @@ class SharkifyStableDiffusionModel:
else:
return self.get_unet(use_large=use_large)
else:
return self.get_controlled_unet()
return self.get_controlled_unet(use_large=use_large)
def vae_encode(self):
try:
@@ -776,21 +1265,53 @@ class SharkifyStableDiffusionModel:
except Exception as e:
sys.exit(e)
def sdxl_clip(self):
try:
self.inputs["clip"] = self.get_input_info_for(
base_models["sdxl_clip"]
)
compiled_clip, clip_mlir = self.get_clip_sdxl(clip_index=1)
compiled_clip2, clip_mlir2 = self.get_clip_sdxl(clip_index=2)
check_compilation(compiled_clip, "Clip")
check_compilation(compiled_clip, "Clip2")
if self.return_mlir:
return clip_mlir, clip_mlir2
return compiled_clip, compiled_clip2
except Exception as e:
sys.exit(e)
def unet(self, use_large=False):
try:
model = "stencil_unet" if self.use_stencil is not None else "unet"
stencil_count = 0
for stencil in self.stencils:
stencil_count += 1
model = "stencil_unet" if stencil_count > 0 else "unet"
compiled_unet = None
unet_inputs = base_models[model]
# if the model to run *is* a base model, then we should treat it as such
if self.model_to_run in unet_inputs:
self.base_model_id = self.model_to_run
if self.base_model_id != "":
self.inputs["unet"] = self.get_input_info_for(
unet_inputs[self.base_model_id]
)
compiled_unet, unet_mlir = self.compile_unet_variants(
model, use_large=use_large
model, use_large=use_large, base_model=self.base_model_id
)
else:
for model_id in unet_inputs:
# restrict base models to check if we were given a specific list of valid ones
allowed_base_model_ids = unet_inputs
if self.favored_base_models != None:
allowed_base_model_ids = self.favored_base_models
print(f"self.favored_base_models: {self.favored_base_models}")
print(f"allowed_base_model_ids: {allowed_base_model_ids}")
# try compiling with each base model until we find one that works (or not)
for model_id in allowed_base_model_ids:
self.base_model_id = model_id
self.inputs["unet"] = self.get_input_info_for(
unet_inputs[model_id]
@@ -798,12 +1319,12 @@ class SharkifyStableDiffusionModel:
try:
compiled_unet, unet_mlir = self.compile_unet_variants(
model, use_large=use_large
model, use_large=use_large, base_model=model_id
)
except Exception as e:
print(e)
print(
"Retrying with a different base model configuration"
f"Retrying with a different base model configuration, as {model_id} did not work"
)
continue
@@ -837,7 +1358,10 @@ class SharkifyStableDiffusionModel:
is_base_vae = self.base_vae
if self.is_upscaler:
self.base_vae = True
compiled_vae, vae_mlir = self.get_vae()
if self.is_sdxl:
compiled_vae, vae_mlir = self.get_vae_sdxl()
else:
compiled_vae, vae_mlir = self.get_vae()
self.base_vae = is_base_vae
check_compilation(compiled_vae, "Vae")
@@ -847,16 +1371,18 @@ class SharkifyStableDiffusionModel:
except Exception as e:
sys.exit(e)
def controlnet(self):
def controlnet(self, stencil_id, use_large=False):
try:
self.inputs["stencil_adaptor"] = self.get_input_info_for(
base_models["stencil_adaptor"]
self.inputs["stencil_adapter"] = self.get_input_info_for(
base_models["stencil_adapter"]
)
compiled_stencil_adapter, controlnet_mlir = self.get_control_net(
stencil_id, use_large=use_large
)
compiled_stencil_adaptor, controlnet_mlir = self.get_control_net()
check_compilation(compiled_stencil_adaptor, "Stencil")
check_compilation(compiled_stencil_adapter, "Stencil")
if self.return_mlir:
return controlnet_mlir
return compiled_stencil_adaptor
return compiled_stencil_adapter
except Exception as e:
sys.exit(e)
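
To make the multi-ControlNet accumulator above concrete: each ControlNet forward adds its down/mid-block residuals onto the accumulator tensors it receives, so after iterating over all stencils the UNet sees the summed residuals. A hedged sketch with uniform shapes (the real residuals differ per level):

```python
import torch

def fake_controlnet(seed, n_levels=13):
    # Stand-in for one ControlNet call returning per-level residuals.
    g = torch.Generator().manual_seed(seed)
    return [torch.randn(2, 320, 96, 96, generator=g) for _ in range(n_levels)]

acc = [torch.zeros(2, 320, 96, 96) for _ in range(13)]  # initial accumulator
for seed in (0, 1):                                      # one pass per stencil
    residuals = fake_controlnet(seed)
    # mirrors `acc_i + down_block_res_samples[i]` in the forward() above
    acc = [a + r for a, r in zip(acc, residuals)]

# `acc` now holds the summed residuals that get scaled and handed to the UNet.
print(len(acc), acc[0].shape)  # 13 torch.Size([2, 320, 96, 96])
```
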

View File

@@ -123,8 +123,11 @@ def get_clip():
return get_shark_model(bucket, model_name, iree_flags)
def get_tokenizer():
def get_tokenizer(subfolder="tokenizer", hf_model_id=None):
if hf_model_id is not None:
args.hf_model_id = hf_model_id
tokenizer = CLIPTokenizer.from_pretrained(
args.hf_model_id, subfolder="tokenizer"
args.hf_model_id, subfolder=subfolder
)
return tokenizer
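
A hedged usage sketch of what the extended helper above enables for SDXL, spelled out directly against transformers; the repo id and the "tokenizer_2" subfolder follow the standard SDXL layout and are assumptions rather than values taken from this change.

```python
from transformers import CLIPTokenizer

# Equivalent of get_tokenizer(subfolder="tokenizer_2", hf_model_id=...) above.
tokenizer_2 = CLIPTokenizer.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="tokenizer_2"
)
print(tokenizer_2.model_max_length)  # 77
```
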

View File

@@ -1,6 +1,9 @@
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_txt2img import (
Text2ImagePipeline,
)
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_txt2img_sdxl import (
Text2ImageSDXLPipeline,
)
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_img2img import (
Image2ImagePipeline,
)

View File

@@ -29,6 +29,10 @@ from apps.stable_diffusion.src.models import (
SharkifyStableDiffusionModel,
get_vae_encode,
)
from apps.stable_diffusion.src.utils import (
resamplers,
resampler_list,
)
class Image2ImagePipeline(StableDiffusionPipeline):
@@ -52,9 +56,12 @@ class Image2ImagePipeline(StableDiffusionPipeline):
sd_model: SharkifyStableDiffusionModel,
import_mlir: bool,
use_lora: str,
lora_strength: float,
ondemand: bool,
):
super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
super().__init__(
scheduler, sd_model, import_mlir, use_lora, lora_strength, ondemand
)
self.vae_encode = None
def load_vae_encode(self):
@@ -84,13 +91,21 @@ class Image2ImagePipeline(StableDiffusionPipeline):
num_inference_steps,
strength,
dtype,
resample_type,
):
# Pre process image -> get image encoded -> process latents
# TODO: process with variable HxW combos
# Pre process image
image = image.resize((width, height))
# Pre-process image
resample_type = (
resamplers[resample_type]
if resample_type in resampler_list
# Fallback to Lanczos
else Image.Resampling.LANCZOS
)
image = image.resize((width, height), resample=resample_type)
image_arr = np.stack([np.array(i) for i in (image,)], axis=0)
image_arr = image_arr / 255.0
image_arr = torch.from_numpy(image_arr).permute(0, 3, 1, 2).to(dtype)
@@ -146,7 +161,11 @@ class Image2ImagePipeline(StableDiffusionPipeline):
use_base_vae,
cpu_scheduling,
max_embeddings_multiples,
use_stencil,
stencils,
images,
resample_type,
control_mode,
preprocessed_hints=[],
):
# prompts and negative prompts must be a list.
if isinstance(prompts, str):
@@ -186,6 +205,7 @@ class Image2ImagePipeline(StableDiffusionPipeline):
num_inference_steps=num_inference_steps,
strength=strength,
dtype=dtype,
resample_type=resample_type,
)
# Get Image latents
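
A small sketch of the resample-type fallback used above; the mapping below is an illustrative stand-in, since the app supplies its own `resamplers` dict and `resampler_list`.

```python
from PIL import Image

# Illustrative stand-ins for the app's resamplers / resampler_list.
resamplers = {
    "Lanczos": Image.Resampling.LANCZOS,
    "Nearest Neighbor": Image.Resampling.NEAREST,
    "Bilinear": Image.Resampling.BILINEAR,
}
resampler_list = list(resamplers.keys())

requested = "Bicubic"  # not in the list, so we fall back to Lanczos
resample_type = (
    resamplers[requested]
    if requested in resampler_list
    else Image.Resampling.LANCZOS
)
img = Image.new("RGB", (640, 480)).resize((512, 512), resample=resample_type)
print(resample_type.name, img.size)  # LANCZOS (512, 512)
```
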

View File

@@ -51,9 +51,12 @@ class InpaintPipeline(StableDiffusionPipeline):
sd_model: SharkifyStableDiffusionModel,
import_mlir: bool,
use_lora: str,
lora_strength: float,
ondemand: bool,
):
super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
super().__init__(
scheduler, sd_model, import_mlir, use_lora, lora_strength, ondemand
)
self.vae_encode = None
def load_vae_encode(self):

View File

@@ -52,9 +52,12 @@ class OutpaintPipeline(StableDiffusionPipeline):
sd_model: SharkifyStableDiffusionModel,
import_mlir: bool,
use_lora: str,
lora_strength: float,
ondemand: bool,
):
super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
super().__init__(
scheduler, sd_model, import_mlir, use_lora, lora_strength, ondemand
)
self.vae_encode = None
def load_vae_encode(self):

View File

@@ -25,12 +25,22 @@ from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
StableDiffusionPipeline,
)
from apps.stable_diffusion.src.utils import controlnet_hint_conversion
from apps.stable_diffusion.src.utils import (
controlnet_hint_conversion,
controlnet_hint_reshaping,
)
from apps.stable_diffusion.src.utils import (
start_profiling,
end_profiling,
)
from apps.stable_diffusion.src.models import SharkifyStableDiffusionModel
from apps.stable_diffusion.src.utils import (
resamplers,
resampler_list,
)
from apps.stable_diffusion.src.models import (
SharkifyStableDiffusionModel,
get_vae_encode,
)
class StencilPipeline(StableDiffusionPipeline):
@@ -54,19 +64,69 @@ class StencilPipeline(StableDiffusionPipeline):
sd_model: SharkifyStableDiffusionModel,
import_mlir: bool,
use_lora: str,
lora_strength: float,
ondemand: bool,
controlnet_names: list[str],
):
super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
self.controlnet = None
super().__init__(
scheduler, sd_model, import_mlir, use_lora, lora_strength, ondemand
)
self.controlnet = [None] * len(controlnet_names)
self.controlnet_512 = [None] * len(controlnet_names)
self.controlnet_id = [str] * len(controlnet_names)
self.controlnet_512_id = [str] * len(controlnet_names)
self.controlnet_names = controlnet_names
self.vae_encode = None
def load_controlnet(self):
if self.controlnet is not None:
def load_vae_encode(self):
if self.vae_encode is not None:
return
self.controlnet = self.sd_model.controlnet()
def unload_controlnet(self):
del self.controlnet
self.controlnet = None
if self.import_mlir or self.use_lora:
self.vae_encode = self.sd_model.vae_encode()
else:
try:
self.vae_encode = get_vae_encode()
except:
print("download pipeline failed, falling back to import_mlir")
self.vae_encode = self.sd_model.vae_encode()
def unload_vae_encode(self):
del self.vae_encode
self.vae_encode = None
def load_controlnet(self, index, model_name):
if model_name is None:
return
if (
self.controlnet[index] is not None
and self.controlnet_id[index] is not None
and self.controlnet_id[index] == model_name
):
return
self.controlnet_id[index] = model_name
self.controlnet[index] = self.sd_model.controlnet(model_name)
def unload_controlnet(self, index):
del self.controlnet[index]
self.controlnet_id[index] = None
self.controlnet[index] = None
def load_controlnet_512(self, index, model_name):
if (
self.controlnet_512[index] is not None
and self.controlnet_512_id[index] == model_name
):
return
self.controlnet_512_id[index] = model_name
self.controlnet_512[index] = self.sd_model.controlnet(
model_name, use_large=True
)
def unload_controlnet_512(self, index):
del self.controlnet_512[index]
self.controlnet_512_id[index] = None
self.controlnet_512[index] = None
def prepare_latents(
self,
@@ -93,6 +153,58 @@ class StencilPipeline(StableDiffusionPipeline):
latents = latents * self.scheduler.init_noise_sigma
return latents
def prepare_image_latents(
self,
image,
batch_size,
height,
width,
generator,
num_inference_steps,
strength,
dtype,
resample_type,
):
# Pre process image -> get image encoded -> process latents
# TODO: process with variable HxW combos
# Pre-process image
resample_type = (
resamplers[resample_type]
if resample_type in resampler_list
# Fallback to Lanczos
else Image.Resampling.LANCZOS
)
image = image.resize((width, height), resample=resample_type)
image_arr = np.stack([np.array(i) for i in (image,)], axis=0)
image_arr = image_arr / 255.0
image_arr = torch.from_numpy(image_arr).permute(0, 3, 1, 2).to(dtype)
image_arr = 2 * (image_arr - 0.5)
# set scheduler steps
self.scheduler.set_timesteps(num_inference_steps)
init_timestep = min(
int(num_inference_steps * strength), num_inference_steps
)
t_start = max(num_inference_steps - init_timestep, 0)
# timesteps reduced as per strength
timesteps = self.scheduler.timesteps[t_start:]
# new number of steps to be used as per strength will be
# num_inference_steps = num_inference_steps - t_start
# image encode
latents = self.encode_image((image_arr,))
latents = torch.from_numpy(latents).to(dtype)
# add noise to data
noise = torch.randn(latents.shape, generator=generator, dtype=dtype)
latents = self.scheduler.add_noise(
latents, noise, timesteps[0].repeat(1)
)
return latents, timesteps
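
A quick numeric sketch of the strength-to-timestep truncation in `prepare_image_latents` above; the step count and strength are illustrative.

```python
# Lower strength -> fewer denoising steps -> more of the init image survives.
num_inference_steps = 50
strength = 0.6

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
print(init_timestep, t_start)  # 30 20 -> scheduler.timesteps[20:] keeps 30 steps
```
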
def produce_stencil_latents(
self,
latents,
@@ -101,8 +213,9 @@ class StencilPipeline(StableDiffusionPipeline):
total_timesteps,
dtype,
cpu_scheduling,
controlnet_hint=None,
stencil_hints=[None],
controlnet_conditioning_scale: float = 1.0,
control_mode="Balanced", # Prompt, Balanced, or Controlnet
mask=None,
masked_image_latents=None,
return_all_latents=False,
@@ -111,8 +224,24 @@ class StencilPipeline(StableDiffusionPipeline):
latent_history = [latents]
text_embeddings = torch.from_numpy(text_embeddings).to(dtype)
text_embeddings_numpy = text_embeddings.detach().numpy()
self.load_unet()
self.load_controlnet()
assert control_mode in ["Prompt", "Balanced", "Controlnet"]
if text_embeddings.shape[1] <= self.model_max_length:
self.load_unet()
else:
self.load_unet_512()
for i, name in enumerate(self.controlnet_names):
use_names = []
if name is not None:
use_names.append(name)
else:
continue
if text_embeddings.shape[1] <= self.model_max_length:
self.load_controlnet(i, name)
else:
self.load_controlnet_512(i, name)
self.controlnet_names = use_names
for i, t in tqdm(enumerate(total_timesteps)):
step_start_time = time.time()
timestep = torch.tensor([t]).to(dtype)
@@ -135,43 +264,167 @@ class StencilPipeline(StableDiffusionPipeline):
).to(dtype)
else:
latent_model_input_1 = latent_model_input
control = self.controlnet(
"forward",
(
latent_model_input_1,
timestep,
text_embeddings,
controlnet_hint,
),
send_to_host=False,
# Multicontrolnet
height = latent_model_input_1.shape[2]
width = latent_model_input_1.shape[3]
dtype = latent_model_input_1.dtype
control_acc = (
[torch.zeros((2, 320, height, width), dtype=dtype)] * 3
+ [
torch.zeros(
(2, 320, int(height / 2), int(width / 2)), dtype=dtype
)
]
+ [
torch.zeros(
(2, 640, int(height / 2), int(width / 2)), dtype=dtype
)
]
* 2
+ [
torch.zeros(
(2, 640, int(height / 4), int(width / 4)), dtype=dtype
)
]
+ [
torch.zeros(
(2, 1280, int(height / 4), int(width / 4)), dtype=dtype
)
]
* 2
+ [
torch.zeros(
(2, 1280, int(height / 8), int(width / 8)), dtype=dtype
)
]
* 4
)
for i, controlnet_hint in enumerate(stencil_hints):
if controlnet_hint is None:
pass
if text_embeddings.shape[1] <= self.model_max_length:
control = self.controlnet[i](
"forward",
(
latent_model_input_1,
timestep,
text_embeddings,
controlnet_hint,
*control_acc,
),
send_to_host=False,
)
else:
control = self.controlnet_512[i](
"forward",
(
latent_model_input_1,
timestep,
text_embeddings,
controlnet_hint,
*control_acc,
),
send_to_host=False,
)
control_acc = control[13:]
control = control[:13]
timestep = timestep.detach().numpy()
# Profiling Unet.
profile_device = start_profiling(file_path="unet.rdc")
# TODO: Pass `control` as it is to Unet. Same as TODO mentioned in model_wrappers.py.
noise_pred = self.unet(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
control[0],
control[1],
control[2],
control[3],
control[4],
control[5],
control[6],
control[7],
control[8],
control[9],
control[10],
control[11],
control[12],
),
send_to_host=False,
)
dtype = latents.dtype
if control_mode == "Balanced":
control_scale = [
torch.tensor(1.0, dtype=dtype) for _ in range(len(control))
]
elif control_mode == "Prompt":
control_scale = [
torch.tensor(0.825**x, dtype=dtype)
for x in range(len(control))
]
elif control_mode == "Controlnet":
control_scale = [
torch.tensor(float(guidance_scale), dtype=dtype)
for _ in range(len(control))
]
if text_embeddings.shape[1] <= self.model_max_length:
noise_pred = self.unet(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
control[0],
control[1],
control[2],
control[3],
control[4],
control[5],
control[6],
control[7],
control[8],
control[9],
control[10],
control[11],
control[12],
control_scale[0],
control_scale[1],
control_scale[2],
control_scale[3],
control_scale[4],
control_scale[5],
control_scale[6],
control_scale[7],
control_scale[8],
control_scale[9],
control_scale[10],
control_scale[11],
control_scale[12],
),
send_to_host=False,
)
else:
noise_pred = self.unet_512(
"forward",
(
latent_model_input,
timestep,
text_embeddings_numpy,
guidance_scale,
control[0],
control[1],
control[2],
control[3],
control[4],
control[5],
control[6],
control[7],
control[8],
control[9],
control[10],
control[11],
control[12],
control_scale[0],
control_scale[1],
control_scale[2],
control_scale[3],
control_scale[4],
control_scale[5],
control_scale[6],
control_scale[7],
control_scale[8],
control_scale[9],
control_scale[10],
control_scale[11],
control_scale[12],
),
send_to_host=False,
)
end_profiling(profile_device)
if cpu_scheduling:
@@ -191,7 +444,10 @@ class StencilPipeline(StableDiffusionPipeline):
if self.ondemand:
self.unload_unet()
self.unload_controlnet()
self.unload_unet_512()
for i in range(len(self.controlnet_names)):
self.unload_controlnet(i)
self.unload_controlnet_512(i)
avg_step_time = step_time_sum / len(total_timesteps)
self.log += f"\nAverage step time: {avg_step_time}ms/it"
@@ -200,6 +456,17 @@ class StencilPipeline(StableDiffusionPipeline):
all_latents = torch.cat(latent_history, dim=0)
return all_latents
def encode_image(self, input_image):
self.load_vae_encode()
vae_encode_start = time.time()
latents = self.vae_encode("forward", input_image)
vae_inf_time = (time.time() - vae_encode_start) * 1000
if self.ondemand:
self.unload_vae_encode()
self.log += f"\nVAE Encode Inference time (ms): {vae_inf_time:.3f}"
return latents
def generate_images(
self,
prompts,
@@ -217,13 +484,48 @@ class StencilPipeline(StableDiffusionPipeline):
use_base_vae,
cpu_scheduling,
max_embeddings_multiples,
use_stencil,
stencils,
stencil_images,
resample_type,
control_mode,
preprocessed_hints,
):
# Control Embedding check & conversion
# TODO: 1. Change `num_images_per_prompt`.
controlnet_hint = controlnet_hint_conversion(
image, use_stencil, height, width, dtype, num_images_per_prompt=1
)
# controlnet_hint = controlnet_hint_conversion(
# image, use_stencil, height, width, dtype, num_images_per_prompt=1
# )
stencil_hints = []
self.sd_model.stencils = stencils
for i, hint in enumerate(preprocessed_hints):
if hint is not None:
hint = controlnet_hint_reshaping(
hint,
height,
width,
dtype,
num_images_per_prompt=1,
)
stencil_hints.append(hint)
for i, stencil in enumerate(stencils):
if stencil is None:
continue
if len(stencil_hints) > i:
if stencil_hints[i] is not None:
print(f"Using preprocessed controlnet hint for {stencil}")
continue
image = stencil_images[i]
stencil_hints.append(
controlnet_hint_conversion(
image,
stencil,
height,
width,
dtype,
num_images_per_prompt=1,
)
)
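A hedged sketch of the hint-selection precedence implemented above: a preprocessed hint, when present, is used as-is; otherwise the raw stencil image is converted with the matching preprocessor. `select_hints` and `convert` are illustrative stand-ins for the loop and controlnet_hint_conversion.

def select_hints(stencils, stencil_images, preprocessed_hints, convert):
    # convert is a stand-in for controlnet_hint_conversion(image, stencil, ...)
    hints = []
    for i, stencil in enumerate(stencils):
        if stencil is None:
            continue
        if i < len(preprocessed_hints) and preprocessed_hints[i] is not None:
            hints.append(preprocessed_hints[i])  # already an edge/pose/depth map
        else:
            hints.append(convert(stencil_images[i], stencil))
    return hints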
# prompts and negative prompts must be a list.
if isinstance(prompts, str):
prompts = [prompts]
@@ -251,17 +553,30 @@ class StencilPipeline(StableDiffusionPipeline):
# guidance scale as a float32 tensor.
guidance_scale = torch.tensor(guidance_scale).to(torch.float32)
# Prepare initial latent.
init_latents = self.prepare_latents(
batch_size=batch_size,
height=height,
width=width,
generator=generator,
num_inference_steps=num_inference_steps,
dtype=dtype,
)
final_timesteps = self.scheduler.timesteps
if image is not None:
# Prepare input image latent
init_latents, final_timesteps = self.prepare_image_latents(
image=image,
batch_size=batch_size,
height=height,
width=width,
generator=generator,
num_inference_steps=num_inference_steps,
strength=strength,
dtype=dtype,
resample_type=resample_type,
)
else:
# Prepare initial latent.
init_latents = self.prepare_latents(
batch_size=batch_size,
height=height,
width=width,
generator=generator,
num_inference_steps=num_inference_steps,
dtype=dtype,
)
final_timesteps = self.scheduler.timesteps
# Get Image latents
latents = self.produce_stencil_latents(
@@ -271,7 +586,8 @@ class StencilPipeline(StableDiffusionPipeline):
total_timesteps=final_timesteps,
dtype=dtype,
cpu_scheduling=cpu_scheduling,
controlnet_hint=controlnet_hint,
control_mode=control_mode,
stencil_hints=stencil_hints,
)
# Img latents -> PIL images

View File

@@ -18,7 +18,10 @@ from diffusers import (
KDPM2AncestralDiscreteScheduler,
HeunDiscreteScheduler,
)
from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.schedulers import (
SharkEulerDiscreteScheduler,
SharkEulerAncestralDiscreteScheduler,
)
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
StableDiffusionPipeline,
)
@@ -46,9 +49,19 @@ class Text2ImagePipeline(StableDiffusionPipeline):
sd_model: SharkifyStableDiffusionModel,
import_mlir: bool,
use_lora: str,
lora_strength: float,
ondemand: bool,
):
super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
super().__init__(
scheduler, sd_model, import_mlir, use_lora, lora_strength, ondemand
)
@classmethod
def favored_base_models(cls, model_id):
return [
"stabilityai/stable-diffusion-2-1",
"CompVis/stable-diffusion-v1-4",
]
def prepare_latents(
self,

View File

@@ -0,0 +1,236 @@
import torch
import numpy as np
from random import randint
from typing import Union
from diffusers import (
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
KDPM2DiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
DEISMultistepScheduler,
DDPMScheduler,
DPMSolverSinglestepScheduler,
KDPM2AncestralDiscreteScheduler,
HeunDiscreteScheduler,
)
from apps.stable_diffusion.src.schedulers import (
SharkEulerDiscreteScheduler,
SharkEulerAncestralDiscreteScheduler,
)
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
StableDiffusionPipeline,
)
from apps.stable_diffusion.src.models import SharkifyStableDiffusionModel
from transformers.utils import logging
logger = logging.get_logger(__name__)
class Text2ImageSDXLPipeline(StableDiffusionPipeline):
def __init__(
self,
scheduler: Union[
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
KDPM2DiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
SharkEulerDiscreteScheduler,
SharkEulerAncestralDiscreteScheduler,
DEISMultistepScheduler,
DDPMScheduler,
DPMSolverSinglestepScheduler,
KDPM2AncestralDiscreteScheduler,
HeunDiscreteScheduler,
],
sd_model: SharkifyStableDiffusionModel,
import_mlir: bool,
use_lora: str,
lora_strength: float,
ondemand: bool,
is_fp32_vae: bool,
):
super().__init__(
scheduler, sd_model, import_mlir, use_lora, lora_strength, ondemand
)
self.is_fp32_vae = is_fp32_vae
@classmethod
def favored_base_models(cls, model_id):
if "turbo" in model_id:
return [
"stabilityai/sdxl-turbo",
"stabilityai/stable-diffusion-xl-base-1.0",
]
else:
return [
"stabilityai/stable-diffusion-xl-base-1.0",
"stabilityai/sdxl-turbo",
]
def prepare_latents(
self,
batch_size,
height,
width,
generator,
num_inference_steps,
dtype,
):
latents = torch.randn(
(
batch_size,
4,
height // 8,
width // 8,
),
generator=generator,
dtype=torch.float32,
).to(dtype)
self.scheduler.set_timesteps(num_inference_steps)
self.scheduler.is_scale_input_called = True
latents = latents * self.scheduler.init_noise_sigma
return latents
def _get_add_time_ids(
self, original_size, crops_coords_top_left, target_size, dtype
):
add_time_ids = list(
original_size + crops_coords_top_left + target_size
)
# self.unet.config.addition_time_embed_dim IS 256.
# self.text_encoder_2.config.projection_dim IS 1280.
passed_add_embed_dim = 256 * len(add_time_ids) + 1280
expected_add_embed_dim = 2816
# self.unet.add_embedding.linear_1.in_features IS 2816.
if expected_add_embed_dim != passed_add_embed_dim:
raise ValueError(
f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
)
add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
return add_time_ids
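A quick worked check of the embedding-width assertion above, using the stock SDXL values quoted in the comments (addition_time_embed_dim 256, projection_dim 1280); the sizes below are illustrative.

# Worked check of the 2816 assertion, with the values from the comments above.
original_size = (1024, 1024)          # illustrative output size
crops_coords_top_left = (0, 0)
target_size = (1024, 1024)
add_time_ids = list(original_size + crops_coords_top_left + target_size)  # 6 scalars
passed_add_embed_dim = 256 * len(add_time_ids) + 1280   # 256 * 6 + 1280 = 2816
assert passed_add_embed_dim == 2816  # matches unet.add_embedding.linear_1.in_features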
def generate_images(
self,
prompts,
neg_prompts,
batch_size,
height,
width,
num_inference_steps,
guidance_scale,
seed,
max_length,
dtype,
use_base_vae,
cpu_scheduling,
max_embeddings_multiples,
):
# prompts and negative prompts must be a list.
if isinstance(prompts, str):
prompts = [prompts]
if isinstance(neg_prompts, str):
neg_prompts = [neg_prompts]
prompts = prompts * batch_size
neg_prompts = neg_prompts * batch_size
# seed generator to create the initial latent noise. Also handle out of range seeds.
# TODO: Wouldn't it be preferable to just report an error instead of modifying the seed on the fly?
uint32_info = np.iinfo(np.uint32)
uint32_min, uint32_max = uint32_info.min, uint32_info.max
if seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
generator = torch.manual_seed(seed)
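A hedged sketch of the stricter alternative the TODO above alludes to: reject out-of-range seeds instead of silently replacing them. `validate_seed` is illustrative, not part of the pipeline.

import numpy as np

def validate_seed(seed: int) -> int:
    # Raise instead of silently drawing a new random seed.
    info = np.iinfo(np.uint32)
    if not (info.min <= seed < info.max):
        raise ValueError(
            f"seed {seed} is outside [{info.min}, {info.max}); pass a uint32 seed"
        )
    return seed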
# Get initial latents.
init_latents = self.prepare_latents(
batch_size=batch_size,
height=height,
width=width,
generator=generator,
num_inference_steps=num_inference_steps,
dtype=dtype,
)
# Get text embeddings.
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = self.encode_prompt_sdxl(
prompt=prompts,
num_images_per_prompt=1,
do_classifier_free_guidance=True,
negative_prompt=neg_prompts,
)
# Prepare timesteps.
self.scheduler.set_timesteps(num_inference_steps)
timesteps = self.scheduler.timesteps
# Prepare added time ids & embeddings.
original_size = (height, width)
target_size = (height, width)
crops_coords_top_left = (0, 0)
add_text_embeds = pooled_prompt_embeds
add_time_ids = self._get_add_time_ids(
original_size,
crops_coords_top_left,
target_size,
dtype=prompt_embeds.dtype,
)
prompt_embeds = torch.cat(
[negative_prompt_embeds, prompt_embeds], dim=0
)
add_text_embeds = torch.cat(
[negative_pooled_prompt_embeds, add_text_embeds], dim=0
)
add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)
add_text_embeds = add_text_embeds.to(dtype)
add_time_ids = add_time_ids.repeat(batch_size * 1, 1)
# guidance scale as a float32 tensor.
guidance_scale = torch.tensor(guidance_scale).to(dtype)
prompt_embeds = prompt_embeds.to(dtype)
add_time_ids = add_time_ids.to(dtype)
# Get Image latents.
latents = self.produce_img_latents_sdxl(
init_latents,
timesteps,
add_text_embeds,
add_time_ids,
prompt_embeds,
cpu_scheduling,
guidance_scale,
dtype,
)
# Img latents -> PIL images.
all_imgs = []
self.load_vae()
for i in range(0, latents.shape[0], batch_size):
imgs = self.decode_latents_sdxl(
latents[i : i + batch_size], is_fp32_vae=self.is_fp32_vae
)
all_imgs.extend(imgs)
if self.ondemand:
self.unload_vae()
return all_imgs

View File

@@ -94,9 +94,12 @@ class UpscalerPipeline(StableDiffusionPipeline):
sd_model: SharkifyStableDiffusionModel,
import_mlir: bool,
use_lora: str,
lora_strength: float,
ondemand: bool,
):
super().__init__(scheduler, sd_model, import_mlir, use_lora, ondemand)
super().__init__(
scheduler, sd_model, import_mlir, use_lora, lora_strength, ondemand
)
self.low_res_scheduler = low_res_scheduler
self.status = SD_STATE_IDLE

View File

@@ -20,7 +20,10 @@ from diffusers import (
HeunDiscreteScheduler,
)
from shark.shark_inference import SharkInference
from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
from apps.stable_diffusion.src.schedulers import (
SharkEulerDiscreteScheduler,
SharkEulerAncestralDiscreteScheduler,
)
from apps.stable_diffusion.src.models import (
SharkifyStableDiffusionModel,
get_vae,
@@ -33,6 +36,8 @@ from apps.stable_diffusion.src.utils import (
end_profiling,
)
import sys
import gc
from typing import List, Optional
SD_STATE_IDLE = "idle"
SD_STATE_CANCEL = "cancel"
@@ -50,6 +55,7 @@ class StableDiffusionPipeline:
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
SharkEulerDiscreteScheduler,
SharkEulerAncestralDiscreteScheduler,
DEISMultistepScheduler,
DDPMScheduler,
DPMSolverSinglestepScheduler,
@@ -59,21 +65,26 @@ class StableDiffusionPipeline:
sd_model: SharkifyStableDiffusionModel,
import_mlir: bool,
use_lora: str,
lora_strength: float,
ondemand: bool,
is_f32_vae: bool = False,
):
self.vae = None
self.text_encoder = None
self.text_encoder_2 = None
self.unet = None
self.unet_512 = None
self.model_max_length = 77
self.scheduler = scheduler
# TODO: Implement using logging python utility.
self.log = ""
self.status = SD_STATE_IDLE
self.sd_model = sd_model
self.scheduler = scheduler
self.import_mlir = import_mlir
self.use_lora = use_lora
self.lora_strength = lora_strength
self.ondemand = ondemand
self.is_f32_vae = is_f32_vae
# TODO: Find a better workaround for fetching base_model_id early
# enough for CLIPTokenizer.
try:
@@ -83,6 +94,10 @@ class StableDiffusionPipeline:
self.unload_unet()
self.tokenizer = get_tokenizer()
@classmethod
def favored_base_models(cls, model_id):
# all base models can be candidate base models for unet compilation
return None
def load_clip(self):
if self.text_encoder is not None:
return
@@ -106,6 +121,34 @@ class StableDiffusionPipeline:
del self.text_encoder
self.text_encoder = None
def load_clip_sdxl(self):
if self.text_encoder and self.text_encoder_2:
return
if self.import_mlir or self.use_lora:
if not self.import_mlir:
print(
"Warning: LoRA provided but import_mlir not specified. "
"Importing MLIR anyways."
)
self.text_encoder, self.text_encoder_2 = self.sd_model.sdxl_clip()
else:
try:
# TODO: Fix this for SDXL
self.text_encoder = get_clip()
except Exception as e:
print(e)
print("download pipeline failed, falling back to import_mlir")
(
self.text_encoder,
self.text_encoder_2,
) = self.sd_model.sdxl_clip()
def unload_clip_sdxl(self):
del self.text_encoder, self.text_encoder_2
self.text_encoder = None
self.text_encoder_2 = None
def load_unet(self):
if self.unet is not None:
return
@@ -159,6 +202,182 @@ class StableDiffusionPipeline:
def unload_vae(self):
del self.vae
self.vae = None
gc.collect()
def encode_prompt_sdxl(
self,
prompt: str,
num_images_per_prompt: int = 1,
do_classifier_free_guidance: bool = True,
negative_prompt: Optional[str] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
hf_model_id: Optional[
str
] = "stabilityai/stable-diffusion-xl-base-1.0",
):
if prompt is not None and isinstance(prompt, str):
batch_size = 1
elif prompt is not None and isinstance(prompt, list):
batch_size = len(prompt)
else:
batch_size = prompt_embeds.shape[0]
# Define tokenizers and text encoders
self.tokenizer_2 = get_tokenizer("tokenizer_2", hf_model_id)
self.load_clip_sdxl()
tokenizers = (
[self.tokenizer, self.tokenizer_2]
if self.tokenizer is not None
else [self.tokenizer_2]
)
text_encoders = (
[self.text_encoder, self.text_encoder_2]
if self.text_encoder is not None
else [self.text_encoder_2]
)
# textual inversion: process multi-vector tokens if necessary
prompt_embeds_list = []
prompts = [prompt, prompt]
for prompt, tokenizer, text_encoder in zip(
prompts, tokenizers, text_encoders
):
text_inputs = tokenizer(
prompt,
padding="max_length",
max_length=tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_input_ids = text_inputs.input_ids
untruncated_ids = tokenizer(
prompt, padding="longest", return_tensors="pt"
).input_ids
if untruncated_ids.shape[-1] >= text_input_ids.shape[
-1
] and not torch.equal(text_input_ids, untruncated_ids):
removed_text = tokenizer.batch_decode(
untruncated_ids[:, tokenizer.model_max_length - 1 : -1]
)
print(
"The following part of your input was truncated because CLIP can only handle sequences up to"
f" {tokenizer.model_max_length} tokens: {removed_text}"
)
text_encoder_output = text_encoder("forward", (text_input_ids,))
prompt_embeds = torch.from_numpy(text_encoder_output[0])
pooled_prompt_embeds = torch.from_numpy(text_encoder_output[1])
prompt_embeds_list.append(prompt_embeds)
prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
# get unconditional embeddings for classifier free guidance
zero_out_negative_prompt = (
negative_prompt is None
and self.config.force_zeros_for_empty_prompt
)
if (
do_classifier_free_guidance
and negative_prompt_embeds is None
and zero_out_negative_prompt
):
negative_prompt_embeds = torch.zeros_like(prompt_embeds)
negative_pooled_prompt_embeds = torch.zeros_like(
pooled_prompt_embeds
)
elif do_classifier_free_guidance and negative_prompt_embeds is None:
negative_prompt = negative_prompt or ""
negative_prompt_2 = negative_prompt
uncond_tokens: List[str]
if prompt is not None and type(prompt) is not type(
negative_prompt
):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
)
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt, negative_prompt_2]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
" the batch size of `prompt`."
)
else:
uncond_tokens = [negative_prompt, negative_prompt_2]
negative_prompt_embeds_list = []
for negative_prompt, tokenizer, text_encoder in zip(
uncond_tokens, tokenizers, text_encoders
):
max_length = prompt_embeds.shape[1]
uncond_input = tokenizer(
negative_prompt,
padding="max_length",
max_length=max_length,
truncation=True,
return_tensors="pt",
)
text_encoder_output = text_encoder(
"forward", (uncond_input.input_ids,)
)
negative_prompt_embeds = torch.from_numpy(
text_encoder_output[0]
)
negative_pooled_prompt_embeds = torch.from_numpy(
text_encoder_output[1]
)
negative_prompt_embeds_list.append(negative_prompt_embeds)
negative_prompt_embeds = torch.concat(
negative_prompt_embeds_list, dim=-1
)
if self.ondemand:
self.unload_clip_sdxl()
gc.collect()
# TODO: Look into dtype for text_encoder_2!
prompt_embeds = prompt_embeds.to(dtype=torch.float16)
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(
bs_embed * num_images_per_prompt, seq_len, -1
)
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
negative_prompt_embeds = negative_prompt_embeds.to(dtype=torch.float32)
negative_prompt_embeds = negative_prompt_embeds.repeat(
1, num_images_per_prompt, 1
)
negative_prompt_embeds = negative_prompt_embeds.view(
batch_size * num_images_per_prompt, seq_len, -1
)
pooled_prompt_embeds = pooled_prompt_embeds.repeat(
1, num_images_per_prompt
).view(bs_embed * num_images_per_prompt, -1)
negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(
1, num_images_per_prompt
).view(bs_embed * num_images_per_prompt, -1)
return (
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
)
def encode_prompts(self, prompts, neg_prompts, max_length):
# Tokenize text and get embeddings
@@ -186,6 +405,7 @@ class StableDiffusionPipeline:
clip_inf_time = (time.time() - clip_inf_start) * 1000
if self.ondemand:
self.unload_clip()
gc.collect()
self.log += f"\nClip Inference time (ms) = {clip_inf_time:.3f}"
return text_embeddings
@@ -298,6 +518,8 @@ class StableDiffusionPipeline:
if self.ondemand:
self.unload_unet()
self.unload_unet_512()
gc.collect()
avg_step_time = step_time_sum / len(total_timesteps)
self.log += f"\nAverage step time: {avg_step_time}ms/it"
@@ -306,6 +528,96 @@ class StableDiffusionPipeline:
all_latents = torch.cat(latent_history, dim=0)
return all_latents
def produce_img_latents_sdxl(
self,
latents,
total_timesteps,
add_text_embeds,
add_time_ids,
prompt_embeds,
cpu_scheduling,
guidance_scale,
dtype,
mask=None,
masked_image_latents=None,
return_all_latents=False,
):
# return None
self.status = SD_STATE_IDLE
step_time_sum = 0
extra_step_kwargs = {"generator": None}
self.load_unet()
for i, t in tqdm(enumerate(total_timesteps)):
step_start_time = time.time()
timestep = torch.tensor([t]).to(dtype).detach().numpy()
# expand the latents if we are doing classifier free guidance
if isinstance(latents, np.ndarray):
latents = torch.tensor(latents)
latent_model_input = torch.cat([latents] * 2)
latent_model_input = self.scheduler.scale_model_input(
latent_model_input, t
)
if mask is not None and masked_image_latents is not None:
latent_model_input = torch.cat(
[
torch.from_numpy(np.asarray(latent_model_input)),
mask,
masked_image_latents,
],
dim=1,
).to(dtype)
noise_pred = self.unet(
"forward",
(
latent_model_input,
timestep,
prompt_embeds,
add_text_embeds,
add_time_ids,
guidance_scale,
),
send_to_host=True,
)
if not isinstance(latents, torch.Tensor):
latents = torch.from_numpy(latents).to("cpu")
noise_pred = torch.from_numpy(noise_pred).to("cpu")
latents = self.scheduler.step(
noise_pred, t, latents, **extra_step_kwargs, return_dict=False
)[0]
latents = latents.detach().numpy()
noise_pred = noise_pred.detach().numpy()
step_time = (time.time() - step_start_time) * 1000
step_time_sum += step_time
if self.status == SD_STATE_CANCEL:
break
if self.ondemand:
self.unload_unet()
gc.collect()
avg_step_time = step_time_sum / len(total_timesteps)
self.log += f"\nAverage step time: {avg_step_time}ms/it"
return latents
def decode_latents_sdxl(self, latents, is_fp32_vae):
# latents are in unet dtype here so switch if we want to use fp32
if is_fp32_vae:
print("Casting latents to float32 for VAE")
latents = latents.to(torch.float32)
images = self.vae("forward", (latents,))
images = (torch.from_numpy(images) / 2 + 0.5).clamp(0, 1)
images = images.cpu().permute(0, 2, 3, 1).float().numpy()
images = (images * 255).round().astype("uint8")
pil_images = [Image.fromarray(image[:, :, :3]) for image in images]
return pil_images
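A quick check of the value mapping in decode_latents_sdxl above: the VAE output is roughly in [-1, 1], which the shift/clamp/scale maps onto [0, 255].

import torch

x = torch.tensor([-1.0, 0.0, 1.0])            # decoder output extremes
y = ((x / 2 + 0.5).clamp(0, 1) * 255).round()
print(y.to(torch.uint8))                       # tensor([  0, 128, 255], dtype=torch.uint8)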
@classmethod
def from_pretrained(
cls,
@@ -338,8 +650,10 @@ class StableDiffusionPipeline:
ondemand: bool,
low_cpu_mem_usage: bool = False,
debug: bool = False,
use_stencil: str = None,
stencils: list[str] = [],
# stencil_images: list[Image] = []
use_lora: str = "",
lora_strength: float = 0.75,
ddpm_scheduler: DDPMScheduler = None,
use_quantize=None,
):
@@ -355,7 +669,11 @@ class StableDiffusionPipeline:
"OutpaintPipeline",
]
is_upscaler = cls.__name__ in ["UpscalerPipeline"]
is_sdxl = cls.__name__ in ["Text2ImageSDXLPipeline"]
print(f"model_id", model_id)
print(f"ckpt_loc", ckpt_loc)
print(f"favored_base_models:", cls.favored_base_models(model_id))
sd_model = SharkifyStableDiffusionModel(
model_id,
ckpt_loc,
@@ -371,9 +689,14 @@ class StableDiffusionPipeline:
debug=debug,
is_inpaint=is_inpaint,
is_upscaler=is_upscaler,
use_stencil=use_stencil,
is_sdxl=is_sdxl,
stencils=stencils,
use_lora=use_lora,
lora_strength=lora_strength,
use_quantize=use_quantize,
favored_base_models=cls.favored_base_models(
model_id if model_id != "" else ckpt_loc
),
)
if cls.__name__ in ["UpscalerPipeline"]:
@@ -383,10 +706,35 @@ class StableDiffusionPipeline:
sd_model,
import_mlir,
use_lora,
lora_strength,
ondemand,
)
return cls(scheduler, sd_model, import_mlir, use_lora, ondemand)
if cls.__name__ == "StencilPipeline":
return cls(
scheduler,
sd_model,
import_mlir,
use_lora,
lora_strength,
ondemand,
stencils,
)
if cls.__name__ == "Text2ImageSDXLPipeline":
is_fp32_vae = "16" not in custom_vae
return cls(
scheduler,
sd_model,
import_mlir,
use_lora,
lora_strength,
ondemand,
is_fp32_vae,
)
return cls(
scheduler, sd_model, import_mlir, use_lora, lora_strength, ondemand
)
# #####################################################
# Implements text embeddings with weights from prompts
@@ -498,9 +846,10 @@ class StableDiffusionPipeline:
clip_inf_time = (time.time() - clip_inf_start) * 1000
if self.ondemand:
self.unload_clip()
gc.collect()
self.log += f"\nClip Inference time (ms) = {clip_inf_time:.3f}"
return text_embeddings.numpy()
return text_embeddings.numpy().astype(np.float16)
from typing import List, Optional, Union

View File

@@ -1,4 +1,7 @@
from apps.stable_diffusion.src.schedulers.sd_schedulers import get_schedulers
from apps.stable_diffusion.src.schedulers.shark_eulerdiscrete import (
SharkEulerDiscreteScheduler,
)
from apps.stable_diffusion.src.schedulers.shark_eulerancestraldiscrete import (
SharkEulerAncestralDiscreteScheduler,
)
from apps.stable_diffusion.src.schedulers.sd_schedulers import get_schedulers

View File

@@ -1,4 +1,5 @@
from diffusers import (
LCMScheduler,
LMSDiscreteScheduler,
PNDMScheduler,
DDPMScheduler,
@@ -15,9 +16,21 @@ from diffusers import (
from apps.stable_diffusion.src.schedulers.shark_eulerdiscrete import (
SharkEulerDiscreteScheduler,
)
from apps.stable_diffusion.src.schedulers.shark_eulerancestraldiscrete import (
SharkEulerAncestralDiscreteScheduler,
)
def get_schedulers(model_id):
# TODO: Robust scheduler setup on pipeline creation -- if we don't
# set batch_size here, the SHARK schedulers will compile with
# batch size = 1 regardless of whether the model outputs latents
# of a larger batch size, e.g. SDXL. Matching on whether the base
# model ID contains "xl" is obviously not very robust; see the
# sketch after get_schedulers below for one alternative.
batch_size = 2 if "xl" in model_id.lower() else 1
schedulers = dict()
schedulers["PNDM"] = PNDMScheduler.from_pretrained(
model_id,
@@ -39,6 +52,10 @@ def get_schedulers(model_id):
model_id,
subfolder="scheduler",
)
schedulers["LCMScheduler"] = LCMScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers[
"DPMSolverMultistep"
] = DPMSolverMultistepScheduler.from_pretrained(
@@ -84,6 +101,12 @@ def get_schedulers(model_id):
model_id,
subfolder="scheduler",
)
schedulers[
"SharkEulerAncestralDiscrete"
] = SharkEulerAncestralDiscreteScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers[
"DPMSolverSinglestep"
] = DPMSolverSinglestepScheduler.from_pretrained(
@@ -100,5 +123,6 @@ def get_schedulers(model_id):
model_id,
subfolder="scheduler",
)
schedulers["SharkEulerDiscrete"].compile()
schedulers["SharkEulerDiscrete"].compile(batch_size)
schedulers["SharkEulerAncestralDiscrete"].compile(batch_size)
return schedulers
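A hedged sketch of the batch-size selection mentioned in the TODO at the top of get_schedulers: prefer an explicit flag from the caller over matching "xl" in the model id. `scheduler_batch_size` and its `is_sdxl` parameter are illustrative only, not part of the current API.

from typing import Optional

def scheduler_batch_size(model_id: str, is_sdxl: Optional[bool] = None) -> int:
    # Prefer an explicit flag from the pipeline when available; fall back to
    # the "xl"-substring heuristic used above only when it is not.
    if is_sdxl is not None:
        return 2 if is_sdxl else 1
    return 2 if "xl" in model_id.lower() else 1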

View File

@@ -0,0 +1,251 @@
import sys
import numpy as np
from typing import List, Optional, Tuple, Union
from diffusers import (
EulerAncestralDiscreteScheduler,
)
from diffusers.utils.torch_utils import randn_tensor
from diffusers.configuration_utils import register_to_config
from apps.stable_diffusion.src.utils import (
compile_through_fx,
get_shark_model,
args,
)
import torch
class SharkEulerAncestralDiscreteScheduler(EulerAncestralDiscreteScheduler):
@register_to_config
def __init__(
self,
num_train_timesteps: int = 1000,
beta_start: float = 0.0001,
beta_end: float = 0.02,
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
prediction_type: str = "epsilon",
timestep_spacing: str = "linspace",
steps_offset: int = 0,
):
super().__init__(
num_train_timesteps,
beta_start,
beta_end,
beta_schedule,
trained_betas,
prediction_type,
timestep_spacing,
steps_offset,
)
# TODO: make it dynamic so we don't have to worry about batch size
self.batch_size = None
self.init_input_shape = None
def compile(self, batch_size=1):
SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
device = args.device.split(":", 1)[0].strip()
self.batch_size = batch_size
model_input = {
"eulera": {
"output": torch.randn(
batch_size, 4, args.height // 8, args.width // 8
),
"latent": torch.randn(
batch_size, 4, args.height // 8, args.width // 8
),
"sigma": torch.tensor(1).to(torch.float32),
"sigma_from": torch.tensor(1).to(torch.float32),
"sigma_to": torch.tensor(1).to(torch.float32),
"noise": torch.randn(
batch_size, 4, args.height // 8, args.width // 8
),
},
}
example_latent = model_input["eulera"]["latent"]
example_output = model_input["eulera"]["output"]
example_noise = model_input["eulera"]["noise"]
if args.precision == "fp16":
example_latent = example_latent.half()
example_output = example_output.half()
example_noise = example_noise.half()
example_sigma = model_input["eulera"]["sigma"]
example_sigma_from = model_input["eulera"]["sigma_from"]
example_sigma_to = model_input["eulera"]["sigma_to"]
class ScalingModel(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, latent, sigma):
return latent / ((sigma**2 + 1) ** 0.5)
class SchedulerStepEpsilonModel(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(
self, noise_pred, latent, sigma, sigma_from, sigma_to, noise
):
sigma_up = (
sigma_to**2
* (sigma_from**2 - sigma_to**2)
/ sigma_from**2
) ** 0.5
sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
dt = sigma_down - sigma
pred_original_sample = latent - sigma * noise_pred
derivative = (latent - pred_original_sample) / sigma
prev_sample = latent + derivative * dt
return prev_sample + noise * sigma_up
class SchedulerStepVPredictionModel(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(
self, noise_pred, sigma, sigma_from, sigma_to, latent, noise
):
sigma_up = (
sigma_to**2
* (sigma_from**2 - sigma_to**2)
/ sigma_from**2
) ** 0.5
sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
dt = sigma_down - sigma
pred_original_sample = noise_pred * (
-sigma / (sigma**2 + 1) ** 0.5
) + (latent / (sigma**2 + 1))
derivative = (latent - pred_original_sample) / sigma
prev_sample = latent + derivative * dt
return prev_sample + noise * sigma_up
iree_flags = []
if len(args.iree_vulkan_target_triple) > 0:
iree_flags.append(
f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
)
def _import(self):
scaling_model = ScalingModel()
self.scaling_model, _ = compile_through_fx(
model=scaling_model,
inputs=(example_latent, example_sigma),
extended_model_name=f"euler_a_scale_model_input_{self.batch_size}_{args.height}_{args.width}_{device}_"
+ args.precision,
extra_args=iree_flags,
)
pred_type_model_dict = {
"epsilon": SchedulerStepEpsilonModel(),
"v_prediction": SchedulerStepVPredictionModel(),
}
step_model = pred_type_model_dict[self.config.prediction_type]
self.step_model, _ = compile_through_fx(
step_model,
(
example_output,
example_latent,
example_sigma,
example_sigma_from,
example_sigma_to,
example_noise,
),
extended_model_name=f"euler_a_step_{self.config.prediction_type}_{self.batch_size}_{args.height}_{args.width}_{device}_"
+ args.precision,
extra_args=iree_flags,
)
if args.import_mlir:
_import(self)
else:
try:
self.scaling_model = get_shark_model(
SCHEDULER_BUCKET,
"euler_a_scale_model_input_" + args.precision,
iree_flags,
)
self.step_model = get_shark_model(
SCHEDULER_BUCKET,
"euler_a_step_"
+ self.config.prediction_type
+ args.precision,
iree_flags,
)
except Exception:
print(
"Failed to download model; falling back to import_mlir."
)
args.import_mlir = True
_import(self)
def scale_model_input(self, sample, timestep):
if self.step_index is None:
self._init_step_index(timestep)
sigma = self.sigmas[self.step_index]
return self.scaling_model(
"forward",
(
sample,
sigma,
),
send_to_host=False,
)
def step(
self,
noise_pred,
timestep,
latent,
generator: Optional[torch.Generator] = None,
return_dict: Optional[bool] = False,
):
step_inputs = []
if self.step_index is None:
self._init_step_index(timestep)
sigma = self.sigmas[self.step_index]
sigma_from = self.sigmas[self.step_index]
sigma_to = self.sigmas[self.step_index + 1]
noise = randn_tensor(
torch.Size(noise_pred.shape),
dtype=torch.float16,
device="cpu",
generator=generator,
)
step_inputs = [
noise_pred,
latent,
sigma,
sigma_from,
sigma_to,
noise,
]
# TODO: deal with dynamic inputs in turbine flow.
# update step index since we're done with the variable and will return with compiled module output.
self._step_index += 1
if noise_pred.shape[0] < self.batch_size:
for i in [0, 1, 5]:
try:
step_inputs[i] = torch.tensor(step_inputs[i])
except Exception:
step_inputs[i] = torch.tensor(step_inputs[i].to_host())
step_inputs[i] = torch.cat(
(step_inputs[i], step_inputs[i]), axis=0
)
return self.step_model(
"forward",
tuple(step_inputs),
send_to_host=True,
)
return self.step_model(
"forward",
tuple(step_inputs),
send_to_host=False,
)
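A hedged numpy reference for the epsilon step compiled above (SchedulerStepEpsilonModel): the same sigma_up/sigma_down arithmetic on plain arrays, handy for spot-checking the compiled module against a CPU result. `euler_ancestral_step_reference` is an illustrative helper, not part of the scheduler.

import numpy as np

def euler_ancestral_step_reference(noise_pred, latent, sigma, sigma_from,
                                   sigma_to, noise):
    # Same arithmetic as SchedulerStepEpsilonModel.forward, on plain arrays.
    sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
    sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
    dt = sigma_down - sigma
    pred_original_sample = latent - sigma * noise_pred
    derivative = (latent - pred_original_sample) / sigma
    prev_sample = latent + derivative * dt
    return prev_sample + noise * sigma_up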

View File

@@ -2,12 +2,9 @@ import sys
import numpy as np
from typing import List, Optional, Tuple, Union
from diffusers import (
LMSDiscreteScheduler,
PNDMScheduler,
DDIMScheduler,
DPMSolverMultistepScheduler,
EulerDiscreteScheduler,
)
from diffusers.utils.torch_utils import randn_tensor
from diffusers.configuration_utils import register_to_config
from apps.stable_diffusion.src.utils import (
compile_through_fx,
@@ -27,6 +24,13 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
prediction_type: str = "epsilon",
interpolation_type: str = "linear",
use_karras_sigmas: bool = False,
sigma_min: Optional[float] = None,
sigma_max: Optional[float] = None,
timestep_spacing: str = "linspace",
timestep_type: str = "discrete",
steps_offset: int = 0,
):
super().__init__(
num_train_timesteps,
@@ -35,20 +39,29 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
beta_schedule,
trained_betas,
prediction_type,
interpolation_type,
use_karras_sigmas,
sigma_min,
sigma_max,
timestep_spacing,
timestep_type,
steps_offset,
)
# TODO: make it dynamic so we don't have to worry about batch size
self.batch_size = 1
def compile(self):
def compile(self, batch_size=1):
SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
BATCH_SIZE = args.batch_size
device = args.device.split(":", 1)[0].strip()
self.batch_size = batch_size
model_input = {
"euler": {
"latent": torch.randn(
BATCH_SIZE, 4, args.height // 8, args.width // 8
batch_size, 4, args.height // 8, args.width // 8
),
"output": torch.randn(
BATCH_SIZE, 4, args.height // 8, args.width // 8
batch_size, 4, args.height // 8, args.width // 8
),
"sigma": torch.tensor(1).to(torch.float32),
"dt": torch.tensor(1).to(torch.float32),
@@ -70,12 +83,32 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
def forward(self, latent, sigma):
return latent / ((sigma**2 + 1) ** 0.5)
class SchedulerStepModel(torch.nn.Module):
class SchedulerStepEpsilonModel(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, noise_pred, sigma_hat, latent, dt):
pred_original_sample = latent - sigma_hat * noise_pred
derivative = (latent - pred_original_sample) / sigma_hat
return latent + derivative * dt
class SchedulerStepSampleModel(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, noise_pred, sigma_hat, latent, dt):
pred_original_sample = noise_pred
derivative = (latent - pred_original_sample) / sigma_hat
return latent + derivative * dt
class SchedulerStepVPredictionModel(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, noise_pred, sigma, latent, dt):
pred_original_sample = latent - sigma * noise_pred
pred_original_sample = noise_pred * (
-sigma / (sigma**2 + 1) ** 0.5
) + (latent / (sigma**2 + 1))
derivative = (latent - pred_original_sample) / sigma
return latent + derivative * dt
@@ -84,25 +117,28 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
iree_flags.append(
f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
)
# Disable bindings fusion to work with moltenVK.
if sys.platform == "darwin":
iree_flags.append("-iree-stream-fuse-binding=false")
def _import(self):
scaling_model = ScalingModel()
self.scaling_model, _ = compile_through_fx(
model=scaling_model,
inputs=(example_latent, example_sigma),
extended_model_name=f"euler_scale_model_input_{BATCH_SIZE}_{args.height}_{args.width}_{device}_"
extended_model_name=f"euler_scale_model_input_{self.batch_size}_{args.height}_{args.width}_{device}_"
+ args.precision,
extra_args=iree_flags,
)
step_model = SchedulerStepModel()
pred_type_model_dict = {
"epsilon": SchedulerStepEpsilonModel(),
"v_prediction": SchedulerStepVPredictionModel(),
"sample": SchedulerStepSampleModel(),
"original_sample": SchedulerStepSampleModel(),
}
step_model = pred_type_model_dict[self.config.prediction_type]
self.step_model, _ = compile_through_fx(
step_model,
(example_output, example_sigma, example_latent, example_dt),
extended_model_name=f"euler_step_{BATCH_SIZE}_{args.height}_{args.width}_{device}_"
extended_model_name=f"euler_step_{self.config.prediction_type}_{self.batch_size}_{args.height}_{args.width}_{device}_"
+ args.precision,
extra_args=iree_flags,
)
@@ -112,6 +148,11 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
else:
try:
step_model_type = (
"sample"
if "sample" in self.config.prediction_type
else self.config.prediction_type
)
self.scaling_model = get_shark_model(
SCHEDULER_BUCKET,
"euler_scale_model_input_" + args.precision,
@@ -119,7 +160,7 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
)
self.step_model = get_shark_model(
SCHEDULER_BUCKET,
"euler_step_" + args.precision,
"euler_step_" + step_model_type + args.precision,
iree_flags,
)
except:
@@ -130,8 +171,9 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
_import(self)
def scale_model_input(self, sample, timestep):
step_index = (self.timesteps == timestep).nonzero().item()
sigma = self.sigmas[step_index]
if self.step_index is None:
self._init_step_index(timestep)
sigma = self.sigmas[self.step_index]
return self.scaling_model(
"forward",
(
@@ -141,15 +183,61 @@ class SharkEulerDiscreteScheduler(EulerDiscreteScheduler):
send_to_host=False,
)
def step(self, noise_pred, timestep, latent):
step_index = (self.timesteps == timestep).nonzero().item()
sigma = self.sigmas[step_index]
dt = self.sigmas[step_index + 1] - sigma
def step(
self,
noise_pred,
timestep,
latent,
s_churn: float = 0.0,
s_tmin: float = 0.0,
s_tmax: float = float("inf"),
s_noise: float = 1.0,
generator: Optional[torch.Generator] = None,
return_dict: Optional[bool] = False,
):
if self.step_index is None:
self._init_step_index(timestep)
sigma = self.sigmas[self.step_index]
gamma = (
min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1)
if s_tmin <= sigma <= s_tmax
else 0.0
)
sigma_hat = sigma * (gamma + 1)
noise_pred = (
torch.from_numpy(noise_pred)
if isinstance(noise_pred, np.ndarray)
else noise_pred
)
noise = randn_tensor(
torch.Size(noise_pred.shape),
dtype=torch.float16,
device="cpu",
generator=generator,
)
eps = noise * s_noise
if gamma > 0:
latent = latent + eps * (sigma_hat**2 - sigma**2) ** 0.5
if self.config.prediction_type == "v_prediction":
sigma_hat = sigma
dt = self.sigmas[self.step_index + 1] - sigma_hat
self._step_index += 1
return self.step_model(
"forward",
(
noise_pred,
sigma,
sigma_hat,
latent,
dt,
),

View File

@@ -13,6 +13,7 @@ from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
from apps.stable_diffusion.src.utils.stable_args import args
from apps.stable_diffusion.src.utils.stencils.stencil_utils import (
controlnet_hint_conversion,
controlnet_hint_reshaping,
get_stencil_model_id,
)
from apps.stable_diffusion.src.utils.utils import (
@@ -41,3 +42,8 @@ from apps.stable_diffusion.src.utils.utils import (
resize_stencil,
_compile_module,
)
from apps.stable_diffusion.src.utils.civitai import get_civitai_checkpoint
from apps.stable_diffusion.src.utils.resamplers import (
resamplers,
resampler_list,
)

View File

@@ -0,0 +1,42 @@
import re
import requests
from apps.stable_diffusion.src.utils.stable_args import args
from pathlib import Path
from tqdm import tqdm
def get_civitai_checkpoint(url: str):
with requests.get(url, allow_redirects=True, stream=True) as response:
response.raise_for_status()
# civitai api returns the filename in the content disposition
base_filename = re.findall(
'"([^"]*)"', response.headers["Content-Disposition"]
)[0]
destination_path = (
Path.cwd() / (args.ckpt_dir or "models") / base_filename
)
# we don't have this model downloaded yet
if not destination_path.is_file():
print(
f"downloading civitai model from {url} to {destination_path}"
)
size = int(response.headers["content-length"], 0)
progress_bar = tqdm(total=size, unit="iB", unit_scale=True)
with open(destination_path, "wb") as f:
for chunk in response.iter_content(chunk_size=65536):
f.write(chunk)
progress_bar.update(len(chunk))
progress_bar.close()
# we already have this model downloaded
else:
print(f"civitai model already downloaded to {destination_path}")
response.close()
return destination_path.as_posix()
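Illustrative use of get_civitai_checkpoint; the model-version URL below is a placeholder, not a guaranteed endpoint.

# Illustrative call; the model-version URL is a placeholder.
ckpt_path = get_civitai_checkpoint(
    "https://civitai.com/api/download/models/<model_version_id>"
)
print(f"checkpoint saved to {ckpt_path}")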

View File

@@ -0,0 +1,12 @@
import PIL.Image as Image
resamplers = {
"Lanczos": Image.Resampling.LANCZOS,
"Nearest Neighbor": Image.Resampling.NEAREST,
"Bilinear": Image.Resampling.BILINEAR,
"Bicubic": Image.Resampling.BICUBIC,
"Hamming": Image.Resampling.HAMMING,
"Box": Image.Resampling.BOX,
}
resampler_list = resamplers.keys()
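A small usage sketch for the resampler table above; "input.png" is a placeholder path.

from PIL import Image

# Resize an init image with one of the resamplers registered above.
img = Image.open("input.png")
resized = img.resize((512, 512), resample=resamplers["Lanczos"])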

View File

@@ -8,6 +8,15 @@
"dtype":"i64"
}
},
"sdxl_clip": {
"token" : {
"shape" : [
"1*batch_size",
"max_len"
],
"dtype":"i64"
}
},
"vae_encode": {
"image" : {
"shape" : [
@@ -179,9 +188,95 @@
"shape": [2],
"dtype": "i64"
}
},
"stabilityai/sdxl-turbo": {
"latents": {
"shape": [
"2*batch_size",
4,
"height",
"width"
],
"dtype": "f32"
},
"timesteps": {
"shape": [
1
],
"dtype": "f32"
},
"prompt_embeds": {
"shape": [
"2*batch_size",
"max_len",
2048
],
"dtype": "f32"
},
"text_embeds": {
"shape": [
"2*batch_size",
1280
],
"dtype": "f32"
},
"time_ids": {
"shape": [
"2*batch_size",
6
],
"dtype": "f32"
},
"guidance_scale": {
"shape": 1,
"dtype": "f32"
}
},
"stabilityai/stable-diffusion-xl-base-1.0": {
"latents": {
"shape": [
"2*batch_size",
4,
"height",
"width"
],
"dtype": "f32"
},
"timesteps": {
"shape": [
1
],
"dtype": "f32"
},
"prompt_embeds": {
"shape": [
"2*batch_size",
"max_len",
2048
],
"dtype": "f32"
},
"text_embeds": {
"shape": [
"2*batch_size",
1280
],
"dtype": "f32"
},
"time_ids": {
"shape": [
"2*batch_size",
6
],
"dtype": "f32"
},
"guidance_scale": {
"shape": 1,
"dtype": "f32"
}
}
},
"stencil_adaptor": {
"stencil_adapter": {
"latents": {
"shape": [
"1*batch_size",
@@ -208,6 +303,58 @@
"controlnet_hint": {
"shape": [1, 3, "8*height", "8*width"],
"dtype": "f32"
},
"acc1": {
"shape": [2, 320, "height", "width"],
"dtype": "f32"
},
"acc2": {
"shape": [2, 320, "height", "width"],
"dtype": "f32"
},
"acc3": {
"shape": [2, 320, "height", "width"],
"dtype": "f32"
},
"acc4": {
"shape": [2, 320, "height/2", "width/2"],
"dtype": "f32"
},
"acc5": {
"shape": [2, 640, "height/2", "width/2"],
"dtype": "f32"
},
"acc6": {
"shape": [2, 640, "height/2", "width/2"],
"dtype": "f32"
},
"acc7": {
"shape": [2, 640, "height/4", "width/4"],
"dtype": "f32"
},
"acc8": {
"shape": [2, 1280, "height/4", "width/4"],
"dtype": "f32"
},
"acc9": {
"shape": [2, 1280, "height/4", "width/4"],
"dtype": "f32"
},
"acc10": {
"shape": [2, 1280, "height/8", "width/8"],
"dtype": "f32"
},
"acc11": {
"shape": [2, 1280, "height/8", "width/8"],
"dtype": "f32"
},
"acc12": {
"shape": [2, 1280, "height/8", "width/8"],
"dtype": "f32"
},
"acc13": {
"shape": [2, 1280, "height/8", "width/8"],
"dtype": "f32"
}
},
"stencil_unet": {
@@ -290,7 +437,59 @@
"control13": {
"shape": [2, 1280, "height/8", "width/8"],
"dtype": "f32"
},
"scale1": {
"shape": 1,
"dtype": "f32"
},
"scale2": {
"shape": 1,
"dtype": "f32"
},
"scale3": {
"shape": 1,
"dtype": "f32"
},
"scale4": {
"shape": 1,
"dtype": "f32"
},
"scale5": {
"shape": 1,
"dtype": "f32"
},
"scale6": {
"shape": 1,
"dtype": "f32"
},
"scale7": {
"shape": 1,
"dtype": "f32"
},
"scale8": {
"shape": 1,
"dtype": "f32"
},
"scale9": {
"shape": 1,
"dtype": "f32"
},
"scale10": {
"shape": 1,
"dtype": "f32"
},
"scale11": {
"shape": 1,
"dtype": "f32"
},
"scale12": {
"shape": 1,
"dtype": "f32"
},
"scale13": {
"shape": 1,
"dtype": "f32"
}
}
}
}
}

View File

@@ -11,12 +11,12 @@
"untuned": {
"fp16": {
"default_compilation_flags": [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32}))"
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-global-opt-detach-elementwise-from-named-ops,iree-global-opt-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32}))"
]
},
"fp32": {
"default_compilation_flags": [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=16}))"
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-global-opt-detach-elementwise-from-named-ops,iree-global-opt-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=16}))"
]
}
}
@@ -28,7 +28,7 @@
"specified_compilation_flags": {
"cuda": [],
"default_device": [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32},iree-linalg-ext-convert-conv2d-to-winograd))"
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-global-opt-detach-elementwise-from-named-ops,iree-global-opt-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32},iree-linalg-ext-convert-conv2d-to-winograd))"
]
}
},
@@ -37,7 +37,7 @@
"specified_compilation_flags": {
"cuda": [],
"default_device": [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=16},iree-linalg-ext-convert-conv2d-to-winograd))"
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-global-opt-detach-elementwise-from-named-ops,iree-global-opt-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=16},iree-linalg-ext-convert-conv2d-to-winograd))"
]
}
}
@@ -45,12 +45,12 @@
"untuned": {
"fp16": {
"default_compilation_flags": [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32},iree-linalg-ext-convert-conv2d-to-winograd))"
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-global-opt-detach-elementwise-from-named-ops,iree-global-opt-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32},iree-linalg-ext-convert-conv2d-to-winograd))"
]
},
"fp32": {
"default_compilation_flags": [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-detach-elementwise-from-named-ops,iree-flow-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=16},iree-linalg-ext-convert-conv2d-to-winograd))"
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-global-opt-detach-elementwise-from-named-ops,iree-global-opt-convert-1x1-filter-conv2d-to-matmul,iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=16},iree-linalg-ext-convert-conv2d-to-winograd))"
]
}
}
@@ -59,24 +59,28 @@
"tuned": {
"fp16": {
"default_compilation_flags": [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-pad-linalg-ops{pad-size=16}))"
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-pad-linalg-ops{pad-size=16}))",
"--iree-opt-data-tiling=False"
]
},
"fp32": {
"default_compilation_flags": [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-pad-linalg-ops{pad-size=16}))"
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-pad-linalg-ops{pad-size=16}))",
"--iree-opt-data-tiling=False"
]
}
},
"untuned": {
"fp16": {
"default_compilation_flags": [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-pad-linalg-ops{pad-size=16}))"
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-pad-linalg-ops{pad-size=16}))",
"--iree-opt-data-tiling=False"
]
},
"fp32": {
"default_compilation_flags": [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-pad-linalg-ops{pad-size=16}))"
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-pad-linalg-ops{pad-size=16}))",
"--iree-opt-data-tiling=False"
]
}
}

View File

@@ -1,4 +1,5 @@
[["A high tech solarpunk utopia in the Amazon rainforest"],
["Astrophotography, the shark nebula, nebula with a tiny shark-like cloud in the middle in the middle, hubble telescope, vivid colors"],
["A pikachu fine dining with a view to the Eiffel Tower"],
["A mecha robot in a favela in expressionist style"],
["an insect robot preparing a delicious meal"],

View File

@@ -109,7 +109,7 @@ def load_lower_configs(base_model_id=None):
spec = spec.split("-")[0]
if args.annotation_model == "vae":
if not spec or spec in ["rdna3", "sm_80"]:
if not spec or spec in ["sm_80"]:
config_name = (
f"{args.annotation_model}_{args.precision}_{device}.json"
)
@@ -158,9 +158,9 @@ def load_lower_configs(base_model_id=None):
f"{spec}.json"
)
full_gs_url = config_bucket + config_name
lowering_config_dir = os.path.join(WORKDIR, "configs", config_name)
print("Loading lowering config file from ", lowering_config_dir)
full_gs_url = config_bucket + config_name
download_public_file(full_gs_url, lowering_config_dir, True)
return lowering_config_dir
@@ -203,8 +203,8 @@ def dump_after_mlir(input_mlir, use_winograd):
if use_winograd:
preprocess_flag = (
"--iree-preprocessing-pass-pipeline=builtin.module"
"(func.func(iree-flow-detach-elementwise-from-named-ops,"
"iree-flow-convert-1x1-filter-conv2d-to-matmul,"
"(func.func(iree-global-opt-detach-elementwise-from-named-ops,"
"iree-global-opt-convert-1x1-filter-conv2d-to-matmul,"
"iree-preprocessing-convert-conv2d-to-img2col,"
"iree-preprocessing-pad-linalg-ops{pad-size=32},"
"iree-linalg-ext-convert-conv2d-to-winograd))"
@@ -212,8 +212,8 @@ def dump_after_mlir(input_mlir, use_winograd):
else:
preprocess_flag = (
"--iree-preprocessing-pass-pipeline=builtin.module"
"(func.func(iree-flow-detach-elementwise-from-named-ops,"
"iree-flow-convert-1x1-filter-conv2d-to-matmul,"
"(func.func(iree-global-opt-detach-elementwise-from-named-ops,"
"iree-global-opt-convert-1x1-filter-conv2d-to-matmul,"
"iree-preprocessing-convert-conv2d-to-img2col,"
"iree-preprocessing-pad-linalg-ops{pad-size=32}))"
)

View File

@@ -2,6 +2,8 @@ import argparse
import os
from pathlib import Path
from apps.stable_diffusion.src.utils.resamplers import resampler_list
def path_expand(s):
return Path(s).expanduser().resolve()
@@ -83,7 +85,7 @@ p.add_argument(
"--height",
type=int,
default=512,
choices=range(128, 769, 8),
choices=range(128, 1025, 8),
help="The height of the output image.",
)
@@ -91,7 +93,7 @@ p.add_argument(
"--width",
type=int,
default=512,
choices=range(128, 769, 8),
choices=range(128, 1025, 8),
help="The width of the output image.",
)
@@ -132,6 +134,47 @@ p.add_argument(
"img2img.",
)
p.add_argument(
"--use_hiresfix",
type=bool,
default=False,
help="Use Hires Fix to do higher resolution images, while trying to "
"avoid the issues that come with it. This is accomplished by first "
"generating an image using txt2img, then running it through img2img.",
)
p.add_argument(
"--hiresfix_height",
type=int,
default=768,
choices=range(128, 769, 8),
help="The height of the Hires Fix image.",
)
p.add_argument(
"--hiresfix_width",
type=int,
default=768,
choices=range(128, 769, 8),
help="The width of the Hires Fix image.",
)
p.add_argument(
"--hiresfix_strength",
type=float,
default=0.6,
help="The denoising strength to apply for the Hires Fix.",
)
p.add_argument(
"--resample_type",
type=str,
default="Nearest Neighbor",
choices=resampler_list,
help="The resample type to use when resizing an image before being run "
"through stable diffusion.",
)
##############################################################################
# Stable Diffusion Training Params
##############################################################################
@@ -202,28 +245,30 @@ p.add_argument(
"--left",
default=False,
action=argparse.BooleanOptionalAction,
help="If expend left for outpainting.",
help="If extend left for outpainting.",
)
p.add_argument(
"--right",
default=False,
action=argparse.BooleanOptionalAction,
help="If expend right for outpainting.",
help="If extend right for outpainting.",
)
p.add_argument(
"--up",
"--top",
default=False,
action=argparse.BooleanOptionalAction,
help="If expend top for outpainting.",
help="If extend top for outpainting.",
)
p.add_argument(
"--down",
"--bottom",
default=False,
action=argparse.BooleanOptionalAction,
help="If expend bottom for outpainting.",
help="If extend bottom for outpainting.",
)
p.add_argument(
@@ -255,7 +300,7 @@ p.add_argument(
p.add_argument(
"--import_mlir",
default=False,
default=True,
action=argparse.BooleanOptionalAction,
help="Imports the model from torch module to shark_module otherwise "
"downloads the model from shark_tank.",
@@ -278,7 +323,7 @@ p.add_argument(
p.add_argument(
"--use_tuned",
default=True,
default=False,
action=argparse.BooleanOptionalAction,
help="Download and use the tuned version of the model if available.",
)
@@ -371,10 +416,17 @@ p.add_argument(
p.add_argument(
"--use_stencil",
choices=["canny", "openpose", "scribble"],
choices=["canny", "openpose", "scribble", "zoedepth"],
help="Enable the stencil feature.",
)
p.add_argument(
"--control_mode",
choices=["Prompt", "Balanced", "Controlnet"],
default="Balanced",
help="How Controlnet injection should be prioritized.",
)
p.add_argument(
"--use_lora",
type=str,
@@ -383,6 +435,13 @@ p.add_argument(
"file (~3 MB).",
)
p.add_argument(
"--lora_strength",
type=float,
default=1.0,
help="Strength (alpha) scaling factor to use when applying LoRA weights",
)
p.add_argument(
"--use_quantize",
type=str,
@@ -407,6 +466,21 @@ p.add_argument(
help="Specify your own huggingface authentication tokens for models like Llama2.",
)
p.add_argument(
"--device_allocator_heap_key",
type=str,
default="",
help="Specify heap key for device caching allocator."
"Expected form: max_allocation_size;max_allocation_capacity;max_free_allocation_count"
"Example: --device_allocator_heap_key='*;1gib' (will limit caching on device to 1 gigabyte)",
)
p.add_argument(
"--autogen",
type=bool,
default="False",
help="Only used for a gradio workaround.",
)
##############################################################################
# IREE - Vulkan supported flags
##############################################################################
@@ -519,6 +593,27 @@ p.add_argument(
"in shark importer. Does nothing if import_mlir is false (the default).",
)
p.add_argument(
"--compile_debug",
default=False,
action=argparse.BooleanOptionalAction,
help="Flag to toggle debug assert/verify flags for imported IR in the"
"iree-compiler. Default to false.",
)
p.add_argument(
"--iree_constant_folding",
default=True,
action=argparse.BooleanOptionalAction,
help="Controls constant folding in iree-compile for all SD models.",
)
p.add_argument(
"--data_tiling",
default=False,
action=argparse.BooleanOptionalAction,
help="Controls data tiling in iree-compile for all SD models.",
)
##############################################################################
# Web UI flags
@@ -568,6 +663,25 @@ p.add_argument(
help="Flag for enabling rest API.",
)
p.add_argument(
"--api_accept_origin",
action="append",
type=str,
help="An origin to be accepted by the REST api for Cross Origin"
"Resource Sharing (CORS). Use multiple times for multiple origins, "
'or use --api_accept_origin="*" to accept all origins. If no origins '
"are set no CORS headers will be returned by the api. Use, for "
"instance, if you need to access the REST api from Javascript running "
"in a web browser.",
)
p.add_argument(
"--debug",
default=False,
action=argparse.BooleanOptionalAction,
help="Flag for enabling debugging log in WebUI.",
)
p.add_argument(
"--output_gallery",
default=True,
@@ -645,6 +759,18 @@ p.add_argument(
help="Specifies whether the docuchat's web version is running or not.",
)
##############################################################################
# rocm Flags
##############################################################################
p.add_argument(
"--iree_rocm_target_chip",
type=str,
default="",
help="Add the rocm device architecture ex gfx1100, gfx90a, etc. Use `hipinfo` "
"or `iree-run-module --dump_devices=rocm` or `hipinfo` to get desired arch name",
)
args, unknown = p.parse_known_args()
if args.import_debug:
os.environ["IREE_SAVE_TEMPS"] = os.path.join(

View File

@@ -1,2 +1,3 @@
from apps.stable_diffusion.src.utils.stencils.canny import CannyDetector
from apps.stable_diffusion.src.utils.stencils.openpose import OpenposeDetector
from apps.stable_diffusion.src.utils.stencils.zoe import ZoeDetector

View File

@@ -1,14 +1,44 @@
import numpy as np
from PIL import Image
import torch
import os
from pathlib import Path
import torchvision
import time
from apps.stable_diffusion.src.utils.stencils import (
CannyDetector,
OpenposeDetector,
ZoeDetector,
)
stencil = {}
def save_img(img):
from apps.stable_diffusion.src.utils import (
get_generated_imgs_path,
get_generated_imgs_todays_subdir,
)
subdir = Path(get_generated_imgs_path(), "preprocessed_control_hints")
os.makedirs(subdir, exist_ok=True)
if isinstance(img, Image.Image):
img.save(
os.path.join(
subdir, "controlnet_" + str(int(time.time())) + ".png"
)
)
elif isinstance(img, np.ndarray):
img = Image.fromarray(img)
img.save(os.path.join(subdir, str(int(time.time())) + ".png"))
else:
converter = torchvision.transforms.ToPILImage()
for i in img:
converter(i).save(
os.path.join(subdir, str(int(time.time())) + ".png")
)
def HWC3(x):
assert x.dtype == np.uint8
if x.ndim == 2:
@@ -28,7 +58,7 @@ def HWC3(x):
return y
def controlnet_hint_shaping(
def controlnet_hint_reshaping(
controlnet_hint, height, width, dtype, num_images_per_prompt=1
):
channels = 3
@@ -47,10 +77,12 @@ def controlnet_hint_shaping(
)
return controlnet_hint
else:
raise ValueError(
f"Acceptble shape of `stencil` are any of ({channels}, {height}, {width}),"
+ f" (1, {channels}, {height}, {width}) or ({num_images_per_prompt}, "
+ f"{channels}, {height}, {width}) but is {controlnet_hint.shape}"
return controlnet_hint_reshaping(
Image.fromarray(controlnet_hint.detach().numpy()),
height,
width,
dtype,
num_images_per_prompt,
)
elif isinstance(controlnet_hint, np.ndarray):
# np.ndarray: acceptable shape is any of hw, hwc, bhwc(b==1) or bhwc(b==num_images_per_prompt)
@@ -77,29 +109,38 @@ def controlnet_hint_shaping(
) # b h w c -> b c h w
return controlnet_hint
else:
raise ValueError(
f"Acceptble shape of `stencil` are any of ({width}, {channels}), "
+ f"({height}, {width}, {channels}), "
+ f"(1, {height}, {width}, {channels}) or "
+ f"({num_images_per_prompt}, {channels}, {height}, {width}) but is {controlnet_hint.shape}"
return controlnet_hint_reshaping(
Image.fromarray(controlnet_hint),
height,
width,
dtype,
num_images_per_prompt,
)
elif isinstance(controlnet_hint, Image.Image):
controlnet_hint = controlnet_hint.convert(
"RGB"
) # make sure 3 channel RGB format
if controlnet_hint.size == (width, height):
controlnet_hint = controlnet_hint.convert(
"RGB"
) # make sure 3 channel RGB format
controlnet_hint = np.array(controlnet_hint) # to numpy
controlnet_hint = np.array(controlnet_hint).astype(
np.float16
) # to numpy
controlnet_hint = controlnet_hint[:, :, ::-1] # RGB -> BGR
return controlnet_hint_shaping(
controlnet_hint, height, width, num_images_per_prompt
return controlnet_hint_reshaping(
controlnet_hint, height, width, dtype, num_images_per_prompt
)
else:
raise ValueError(
f"Acceptable image size of `stencil` is ({width}, {height}) but is {controlnet_hint.size}"
)
(hint_w, hint_h) = controlnet_hint.size
left = int((hint_w - width) / 2)
right = left + height
controlnet_hint = controlnet_hint.crop((left, 0, right, hint_h))
controlnet_hint = controlnet_hint.resize((width, height))
return controlnet_hint_reshaping(
controlnet_hint, height, width, dtype, num_images_per_prompt
)
else:
raise ValueError(
f"Acceptable type of `stencil` are any of torch.Tensor, np.ndarray, PIL.Image.Image but is {type(controlnet_hint)}"
f"Acceptible controlnet input types are any of torch.Tensor, np.ndarray, PIL.Image.Image but is {type(controlnet_hint)}"
)
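A rough usage sketch of the renamed helper (illustrative only; the pose.png path and sizes are placeholders): a PIL hint of any size is cropped/resized, converted to a BGR float16 array, and then re-run through the ndarray branch to reach the batched layout shown in the hunk above.
# Illustrative only: feed an arbitrarily sized PIL hint through the reshaper.
import torch
from PIL import Image

hint = Image.open("pose.png")  # placeholder path; any size is accepted
shaped = controlnet_hint_reshaping(
    hint,
    height=512,
    width=512,
    dtype=torch.float16,
    num_images_per_prompt=1,
)
# PIL inputs recurse through the np.ndarray branch, ending up in the
# batched (num_images_per_prompt, channels, height, width) layout.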
@@ -109,17 +150,26 @@ def controlnet_hint_conversion(
controlnet_hint = None
match use_stencil:
case "canny":
print("Detecting edge with canny")
print(
"Converting controlnet hint to edge detection mask with canny preprocessor."
)
controlnet_hint = hint_canny(image)
case "openpose":
print("Detecting human pose")
print(
"Detecting human pose in controlnet hint with openpose preprocessor."
)
controlnet_hint = hint_openpose(image)
case "scribble":
print("Working with scribble")
print("Using your scribble as a controlnet hint.")
controlnet_hint = hint_scribble(image)
case "zoedepth":
print(
"Converting controlnet hint to a depth mapping with ZoeDepth."
)
controlnet_hint = hint_zoedepth(image)
case _:
return None
controlnet_hint = controlnet_hint_shaping(
controlnet_hint = controlnet_hint_reshaping(
controlnet_hint, height, width, dtype, num_images_per_prompt
)
return controlnet_hint
@@ -127,7 +177,7 @@ def controlnet_hint_conversion(
stencil_to_model_id_map = {
"canny": "lllyasviel/control_v11p_sd15_canny",
"depth": "lllyasviel/control_v11p_sd15_depth",
"zoedepth": "lllyasviel/control_v11f1p_sd15_depth",
"hed": "lllyasviel/sd-controlnet-hed",
"mlsd": "lllyasviel/control_v11p_sd15_mlsd",
"normal": "lllyasviel/control_v11p_sd15_normalbae",
@@ -157,6 +207,7 @@ def hint_canny(
detected_map = stencil["canny"](
input_image, low_threshold, high_threshold
)
save_img(detected_map)
detected_map = HWC3(detected_map)
return detected_map
@@ -172,6 +223,7 @@ def hint_openpose(
stencil["openpose"] = OpenposeDetector()
detected_map, _ = stencil["openpose"](input_image)
save_img(detected_map)
detected_map = HWC3(detected_map)
return detected_map
@@ -183,4 +235,19 @@ def hint_scribble(image: Image.Image):
detected_map = np.zeros_like(input_image, dtype=np.uint8)
detected_map[np.min(input_image, axis=2) < 127] = 255
save_img(detected_map)
return detected_map
# Stencil 4. Depth (Only Zoe Preprocessing)
def hint_zoedepth(image: Image.Image):
with torch.no_grad():
input_image = np.array(image)
if not "depth" in stencil:
stencil["depth"] = ZoeDetector()
detected_map = stencil["depth"](input_image)
save_img(detected_map)
detected_map = HWC3(detected_map)
return detected_map

View File

@@ -0,0 +1,64 @@
import numpy as np
import torch
from pathlib import Path
import requests
from einops import rearrange
remote_model_path = (
"https://huggingface.co/lllyasviel/Annotators/resolve/main/ZoeD_M12_N.pt"
)
class ZoeDetector:
def __init__(self):
cwd = Path.cwd()
ckpt_path = Path(cwd, "stencil_annotator")
ckpt_path.mkdir(parents=True, exist_ok=True)
modelpath = ckpt_path / "ZoeD_M12_N.pt"
with requests.get(remote_model_path, stream=True) as r:
r.raise_for_status()
with open(modelpath, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
model = torch.hub.load(
"monorimet/ZoeDepth:torch_update",
"ZoeD_N",
pretrained=False,
force_reload=False,
)
# Hack to fix the ZoeDepth import issue
model_keys = model.state_dict().keys()
loaded_dict = torch.load(modelpath, map_location=model.device)["model"]
loaded_keys = loaded_dict.keys()
for key in loaded_keys - model_keys:
loaded_dict.pop(key)
model.load_state_dict(loaded_dict)
model.eval()
self.model = model
def __call__(self, input_image):
assert input_image.ndim == 3
image_depth = input_image
with torch.no_grad():
image_depth = torch.from_numpy(image_depth).float()
image_depth = image_depth / 255.0
image_depth = rearrange(image_depth, "h w c -> 1 c h w")
depth = self.model.infer(image_depth)
depth = depth[0, 0].cpu().numpy()
vmin = np.percentile(depth, 2)
vmax = np.percentile(depth, 85)
depth -= vmin
depth /= vmax - vmin
depth = 1.0 - depth
depth_image = (depth * 255.0).clip(0, 255).astype(np.uint8)
return depth_image
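For reference, the normalization in __call__ rescales raw depth between its 2nd and 85th percentiles and inverts it, so nearer regions come out brighter in the 8-bit hint. The same math on a synthetic array (illustrative only):
import numpy as np

depth = np.random.rand(64, 64) * 10.0          # stand-in for self.model.infer() output
vmin, vmax = np.percentile(depth, 2), np.percentile(depth, 85)
norm = 1.0 - (depth - vmin) / (vmax - vmin)    # invert so near objects are bright
hint = (norm * 255.0).clip(0, 255).astype(np.uint8)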

View File

@@ -6,6 +6,7 @@ from PIL import PngImagePlugin
from PIL import Image
from datetime import datetime as dt
from csv import DictWriter
from dataclasses import dataclass
from pathlib import Path
import numpy as np
from random import (
@@ -18,14 +19,14 @@ import tempfile
import torch
from safetensors.torch import load_file
from shark.shark_inference import SharkInference
from shark.shark_importer import import_with_fx
from shark.shark_importer import import_with_fx, save_mlir
from shark.iree_utils.vulkan_utils import (
set_iree_vulkan_runtime_flags,
get_vulkan_target_triple,
get_iree_vulkan_runtime_flags,
)
from shark.iree_utils.metal_utils import get_metal_target_triple
from shark.iree_utils.gpu_utils import get_cuda_sm_cc
from shark.iree_utils.gpu_utils import get_cuda_sm_cc, get_iree_rocm_args
from apps.stable_diffusion.src.utils.stable_args import args
from apps.stable_diffusion.src.utils.resources import opt_flags
from apps.stable_diffusion.src.utils.sd_annotation import sd_model_annotation
@@ -78,7 +79,7 @@ def _compile_module(shark_module, model_name, extra_args=[]):
)
)
path = shark_module.save_module(
os.getcwd(), model_name, extra_args
os.getcwd(), model_name, extra_args, debug=args.compile_debug
)
shark_module.load_module(path, extra_args=extra_args)
else:
@@ -118,7 +119,7 @@ def compile_through_fx(
is_f16=False,
f16_input_mask=None,
use_tuned=False,
save_dir=tempfile.gettempdir(),
save_dir="",
debug=False,
generate_vmfb=True,
extra_args=None,
@@ -154,8 +155,8 @@ def compile_through_fx(
f16_input_mask=f16_input_mask,
debug=debug,
model_name=extended_model_name,
save_dir=save_dir,
)
if use_tuned:
if "vae" in extended_model_name.split("_")[0]:
args.annotation_model = "vae"
@@ -168,6 +169,14 @@ def compile_through_fx(
mlir_module, extended_model_name, base_model_id
)
if not os.path.isdir(save_dir):
save_dir = ""
mlir_module = save_mlir(
mlir_module,
model_name=extended_model_name,
dir=save_dir,
)
shark_module = SharkInference(
mlir_module,
device=args.device if device is None else device,
@@ -179,17 +188,22 @@ def compile_through_fx(
mlir_module,
)
del mlir_module
gc.collect()
def set_iree_runtime_flags():
# TODO: This function should be device-agnostic and piped properly
# to general runtime driver init.
vulkan_runtime_flags = get_iree_vulkan_runtime_flags()
if args.enable_rgp:
vulkan_runtime_flags += [
f"--enable_rgp=true",
f"--vulkan_debug_utils=true",
]
if args.device_allocator_heap_key:
vulkan_runtime_flags += [
f"--device_allocator=caching:device_local={args.device_allocator_heap_key}",
]
set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
@@ -464,18 +478,38 @@ def get_available_devices():
f"{device_name} => {driver_name.replace('local', 'cpu')}"
)
else:
device_list.append(f"{device_name} => {driver_name}://{i}")
# for drivers with single devices
# let the default device be selected without any indexing
if len(device_list_dict) == 1:
device_list.append(f"{device_name} => {driver_name}")
else:
device_list.append(
f"{device_name} => {driver_name}://{i}"
)
return device_list
set_iree_runtime_flags()
available_devices = []
vulkan_devices = get_devices_by_name("vulkan")
from shark.iree_utils.vulkan_utils import (
get_all_vulkan_devices,
)
vulkaninfo_list = get_all_vulkan_devices()
vulkan_devices = []
id = 0
for device in vulkaninfo_list:
vulkan_devices.append(f"{device.strip()} => vulkan://{id}")
id += 1
if id != 0:
print(f"vulkan devices are available.")
available_devices.extend(vulkan_devices)
metal_devices = get_devices_by_name("metal")
available_devices.extend(metal_devices)
cuda_devices = get_devices_by_name("cuda")
available_devices.extend(cuda_devices)
rocm_devices = get_devices_by_name("rocm")
available_devices.extend(rocm_devices)
cpu_device = get_devices_by_name("cpu-sync")
available_devices.extend(cpu_device)
cpu_device = get_devices_by_name("cpu-task")
@@ -499,10 +533,17 @@ def get_opt_flags(model, precision="fp16"):
iree_flags.append(
f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
)
# Disable bindings fusion to work with moltenVK.
if sys.platform == "darwin":
iree_flags.append("-iree-stream-fuse-binding=false")
if "rocm" in args.device:
rocm_args = get_iree_rocm_args()
iree_flags.extend(rocm_args)
print(iree_flags)
if args.iree_constant_folding == False:
iree_flags.append("--iree-opt-const-expr-hoisting=False")
iree_flags.append(
"--iree-codegen-linalg-max-constant-fold-elements=9223372036854775807"
)
if args.data_tiling == False:
iree_flags.append("--iree-opt-data-tiling=False")
if "default_compilation_flags" in opt_flags[model][is_tuned][precision]:
iree_flags += opt_flags[model][is_tuned][precision][
@@ -525,6 +566,10 @@ def get_opt_flags(model, precision="fp16"):
iree_flags += opt_flags[model][is_tuned][precision][
"specified_compilation_flags"
][device]
if "vae" not in model:
# Due to lack of support for multi-reduce, we always collapse reduction
# dims before dispatch formation right now.
iree_flags += ["--iree-flow-collapse-reduction-dims"]
return iree_flags
@@ -566,7 +611,7 @@ def preprocessCKPT(custom_weights, is_inpaint=False):
)
num_in_channels = 9 if is_inpaint else 4
pipe = download_from_original_stable_diffusion_ckpt(
checkpoint_path=custom_weights,
checkpoint_path_or_dict=custom_weights,
extract_ema=extract_ema,
from_safetensors=from_safetensors,
num_in_channels=num_in_channels,
@@ -594,30 +639,51 @@ def convert_original_vae(vae_checkpoint):
return converted_vae_checkpoint
def processLoRA(model, use_lora, splitting_prefix):
@dataclass
class LoRAweight:
up: torch.tensor
down: torch.tensor
mid: torch.tensor
alpha: torch.float32 = 1.0
def processLoRA(model, use_lora, splitting_prefix, lora_strength):
state_dict = ""
if ".safetensors" in use_lora:
state_dict = load_file(use_lora)
else:
state_dict = torch.load(use_lora)
alpha = 0.75
visited = []
# directly update weight in model
process_unet = "te" not in splitting_prefix
# gather the weights from the LoRA in a more convenient form, assumes
# everything will have an up.weight. Unsure if this is a safe assumption.
weight_dict: dict[str, LoRAweight] = {}
for key in state_dict:
if ".alpha" in key or key in visited:
continue
if key.startswith(splitting_prefix) and key.endswith("up.weight"):
stem = key.split("up.weight")[0]
weight_key = stem.removesuffix(".lora_")
weight_key = weight_key.removesuffix("_lora_")
weight_key = weight_key.removesuffix(".lora_linear_layer.")
if weight_key not in weight_dict:
weight_dict[weight_key] = LoRAweight(
state_dict[f"{stem}up.weight"],
state_dict[f"{stem}down.weight"],
state_dict.get(f"{stem}mid.weight", None),
state_dict[f"{weight_key}.alpha"]
/ state_dict[f"{stem}up.weight"].shape[1]
if f"{weight_key}.alpha" in state_dict
else 1.0,
)
# Directly update weight in model
# Mostly adaptations of https://github.com/kohya-ss/sd-scripts/blob/main/networks/merge_lora.py
# and similar code in https://github.com/huggingface/diffusers/issues/3064
# TODO: handle mid weights (how do they even work?)
for key, lora_weight in weight_dict.items():
curr_layer = model
if ("text" not in key and process_unet) or (
"text" in key and not process_unet
):
layer_infos = (
key.split(".")[0].split(splitting_prefix)[-1].split("_")
)
else:
continue
layer_infos = key.split(".")[0].split(splitting_prefix)[-1].split("_")
# find the target layer
temp_name = layer_infos.pop(0)
@@ -634,46 +700,46 @@ def processLoRA(model, use_lora, splitting_prefix):
else:
temp_name = layer_infos.pop(0)
pair_keys = []
if "lora_down" in key:
pair_keys.append(key.replace("lora_down", "lora_up"))
pair_keys.append(key)
else:
pair_keys.append(key)
pair_keys.append(key.replace("lora_up", "lora_down"))
# update weight
if len(state_dict[pair_keys[0]].shape) == 4:
weight_up = (
state_dict[pair_keys[0]]
.squeeze(3)
.squeeze(2)
.to(torch.float32)
)
weight = curr_layer.weight.data
scale = lora_weight.alpha * lora_strength
if len(weight.size()) == 2:
if len(lora_weight.up.shape) == 4:
weight_up = (
lora_weight.up.squeeze(3).squeeze(2).to(torch.float32)
)
weight_down = (
lora_weight.down.squeeze(3).squeeze(2).to(torch.float32)
)
change = (
torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
)
else:
change = torch.mm(lora_weight.up, lora_weight.down)
elif lora_weight.down.size()[2:4] == (1, 1):
weight_up = lora_weight.up.squeeze(3).squeeze(2).to(torch.float32)
weight_down = (
state_dict[pair_keys[1]]
.squeeze(3)
.squeeze(2)
.to(torch.float32)
lora_weight.down.squeeze(3).squeeze(2).to(torch.float32)
)
curr_layer.weight.data += alpha * torch.mm(
weight_up, weight_down
).unsqueeze(2).unsqueeze(3)
change = torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
else:
weight_up = state_dict[pair_keys[0]].to(torch.float32)
weight_down = state_dict[pair_keys[1]].to(torch.float32)
curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down)
# update visited list
for item in pair_keys:
visited.append(item)
change = torch.nn.functional.conv2d(
lora_weight.down.permute(1, 0, 2, 3),
lora_weight.up,
).permute(1, 0, 2, 3)
curr_layer.weight.data += change * scale
return model
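The rewritten merge boils down to a rank-decomposed update: each targeted layer gains up @ down (or the conv2d equivalent) scaled by the stored alpha (already divided by rank when building LoRAweight) and the new lora_strength. A minimal numeric sketch of the linear case (illustrative only; shapes are placeholders):
import torch

out_features, in_features, rank = 320, 768, 4
weight = torch.zeros(out_features, in_features)   # stands in for curr_layer.weight.data
up = torch.randn(out_features, rank)              # LoRAweight.up
down = torch.randn(rank, in_features)             # LoRAweight.down
alpha, lora_strength = 0.5, 0.8                   # alpha here is stored_alpha / rank

change = torch.mm(up, down)                       # low-rank delta, same shape as weight
weight += change * (alpha * lora_strength)        # mirrors weight.data += change * scale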
def update_lora_weight_for_unet(unet, use_lora):
def update_lora_weight_for_unet(unet, use_lora, lora_strength):
extensions = [".bin", ".safetensors", ".pt"]
if not any([extension in use_lora for extension in extensions]):
# We assume it is a HF ID with standalone LoRA weights.
unet.load_attn_procs(use_lora)
print(
f"updated unet weights via diffusers load_attn_procs from LoRA: {use_lora}"
)
return unet
main_file_name = get_path_stem(use_lora)
@@ -689,16 +755,21 @@ def update_lora_weight_for_unet(unet, use_lora):
try:
dir_name = os.path.dirname(use_lora)
unet.load_attn_procs(dir_name, weight_name=main_file_name)
print(
f"updated unet weights via diffusers load_attn_procs from LoRA: {use_lora}"
)
return unet
except:
return processLoRA(unet, use_lora, "lora_unet_")
print(f"updated unet weights manually from LoRA: {use_lora}")
return processLoRA(unet, use_lora, "lora_unet_", lora_strength)
def update_lora_weight(model, use_lora, model_name):
def update_lora_weight(model, use_lora, model_name, lora_strength):
if "unet" in model_name:
return update_lora_weight_for_unet(model, use_lora)
return update_lora_weight_for_unet(model, use_lora, lora_strength)
try:
return processLoRA(model, use_lora, "lora_te_")
print(f"updating CLIP weights from LoRA: {use_lora}")
return processLoRA(model, use_lora, "lora_te_", lora_strength)
except:
return None
@@ -773,11 +844,12 @@ def batch_seeds(
seeds = seeds[:batch_count] + [-1] * (batch_count - len(seeds))
if repeatable:
# set seed for the rng based on what we have so far
saved_random_state = random_getstate()
if all(seed < 0 for seed in seeds):
seeds[0] = sanitize_seed(seeds[0])
seed_random(str(seeds))
# set seed for the rng based on what we have so far
saved_random_state = random_getstate()
seed_random(str([n for n in seeds if n > -1]))
# generate any seeds that are unspecified
seeds = [sanitize_seed(seed) for seed in seeds]
@@ -816,6 +888,8 @@ def clear_all():
elif os.name == "unix":
shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
shutil.rmtree(os.path.join(home, ".local/shark_tank"))
if args.local_tank_cache != "":
shutil.rmtree(args.local_tank_cache)
def get_generated_imgs_path() -> Path:
@@ -851,7 +925,7 @@ def save_output_img(output_img, img_seed, extra_info=None):
img_lora = None
if args.use_lora:
img_lora = Path(os.path.basename(args.use_lora)).stem
img_lora = f"{Path(os.path.basename(args.use_lora)).stem}:{args.lora_strength}"
if args.output_img_format == "jpg":
out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
@@ -861,6 +935,13 @@ def save_output_img(output_img, img_seed, extra_info=None):
pngInfo = PngImagePlugin.PngInfo()
if args.write_metadata_to_png:
# Using a conditional expression caused problems, so setting a new
# variable for now.
if args.use_hiresfix:
png_size_text = f"{args.hiresfix_width}x{args.hiresfix_height}"
else:
png_size_text = f"{args.width}x{args.height}"
pngInfo.add_text(
"parameters",
f"{args.prompts[0]}"
@@ -869,7 +950,7 @@ def save_output_img(output_img, img_seed, extra_info=None):
f"Sampler: {args.scheduler}, "
f"CFG scale: {args.guidance_scale}, "
f"Seed: {img_seed},"
f"Size: {args.width}x{args.height}, "
f"Size: {png_size_text}, "
f"Model: {img_model}, "
f"VAE: {img_vae}, "
f"LoRA: {img_lora}",
@@ -896,8 +977,10 @@ def save_output_img(output_img, img_seed, extra_info=None):
"CFG_SCALE": args.guidance_scale,
"PRECISION": args.precision,
"STEPS": args.steps,
"HEIGHT": args.height,
"WIDTH": args.width,
"HEIGHT": args.height
if not args.use_hiresfix
else args.hiresfix_height,
"WIDTH": args.width if not args.use_hiresfix else args.hiresfix_width,
"MAX_LENGTH": args.max_length,
"OUTPUT": out_img_path,
"VAE": img_vae,
@@ -935,6 +1018,10 @@ def get_generation_text_info(seeds, device):
)
text_output += (
f"\nsize={args.height}x{args.width}, "
if not args.use_hiresfix
else f"\nsize={args.hiresfix_height}x{args.hiresfix_width}, "
)
text_output += (
f"batch_count={args.batch_count}, "
f"batch_size={args.batch_size}, "
f"max_length={args.max_length}"
@@ -948,8 +1035,7 @@ def get_generation_text_info(seeds, device):
# Both width and height should be in the range of [128, 768] and multiple of 8.
# This utility function performs the transformation on the input image while
# also maintaining the aspect ratio before sending it to the stencil pipeline.
def resize_stencil(image: Image.Image):
width, height = image.size
def resize_stencil(image: Image.Image, width, height):
aspect_ratio = width / height
min_size = min(width, height)
if min_size < 128:

View File

@@ -19,6 +19,9 @@ a = Analysis(
win_private_assemblies=False,
cipher=block_cipher,
noarchive=False,
module_collection_mode={
'gradio': 'py', # Collect gradio package as source .py files
},
)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

View File

@@ -0,0 +1 @@
from apps.stable_diffusion.web.api.sdapi_v1 import sdapi

View File

@@ -0,0 +1,574 @@
import os
from collections import defaultdict
from enum import Enum
from fastapi import FastAPI
from pydantic import BaseModel, Field, conlist, model_validator
from apps.stable_diffusion.web.api.utils import (
frozen_args,
sampler_aliases,
encode_pil_to_base64,
decode_base64_to_image,
get_model_from_request,
get_scheduler_from_request,
get_device,
GenerationInputData,
GenerationResponseData,
)
from apps.stable_diffusion.web.ui.utils import (
get_custom_model_files,
get_custom_model_pathfile,
predefined_models,
predefined_paint_models,
predefined_upscaler_models,
scheduler_list,
)
from apps.stable_diffusion.web.ui.txt2img_ui import txt2img_inf
from apps.stable_diffusion.web.ui.img2img_ui import img2img_inf
from apps.stable_diffusion.web.ui.inpaint_ui import inpaint_inf
from apps.stable_diffusion.web.ui.outpaint_ui import outpaint_inf
from apps.stable_diffusion.web.ui.upscaler_ui import upscaler_inf
sdapi = FastAPI()
# Rest API: /sdapi/v1/sd-models (lists available models)
class AppParam(str, Enum):
txt2img = "txt2img"
img2img = "img2img"
inpaint = "inpaint"
outpaint = "outpaint"
upscaler = "upscaler"
@sdapi.get(
"/v1/sd-models",
summary="lists available models",
description=(
"This is all the models that this server currently knows about.\n "
"Models listed may still have a compilation and build pending that "
"will be triggered the first time they are used."
),
)
def sd_models_api(app: AppParam = frozen_args.app):
match app:
case "inpaint" | "outpaint":
checkpoint_type = "inpainting"
predefined = predefined_paint_models
case "upscaler":
checkpoint_type = "upscaler"
predefined = predefined_upscaler_models
case _:
checkpoint_type = ""
predefined = predefined_models
return [
{
"title": model_file,
"model_name": model_file,
"hash": None,
"sha256": None,
"filename": get_custom_model_pathfile(model_file),
"config": None,
}
for model_file in get_custom_model_files(
custom_checkpoint_type=checkpoint_type
)
] + [
{
"title": model,
"model_name": model,
"hash": None,
"sha256": None,
"filename": None,
"config": None,
}
for model in predefined
]
# Rest API: /sdapi/v1/samplers (lists schedulers)
@sdapi.get(
"/v1/samplers",
summary="lists available schedulers/samplers",
description=(
"These are all the Schedulers defined and available. Not "
"every scheduler is compatible with all apis. Aliases are "
"equivalent samplers in A1111 if they are known."
),
)
def sd_samplers_api():
reverse_sampler_aliases = defaultdict(list)
for key, value in sampler_aliases.items():
reverse_sampler_aliases[value].append(key)
return (
{
"name": scheduler,
"aliases": reverse_sampler_aliases.get(scheduler, []),
"options": {},
}
for scheduler in scheduler_list
)
# Rest API: /sdapi/v1/options (lists application level options)
@sdapi.get(
"/v1/options",
summary="lists current settings of application level options",
description=(
"A subset of the command line arguments set at startup renamed "
"to correspond to the A1111 naming. Only a small subset of A1111 "
"options are returned."
),
)
def options_api():
# This is mostly just enough to support what Koboldcpp wants, with a
# few other things that seemed obvious
return {
"samples_save": True,
"samples_format": frozen_args.output_img_format,
"sd_model_checkpoint": os.path.basename(frozen_args.ckpt_loc)
if frozen_args.ckpt_loc
else frozen_args.hf_model_id,
"sd_lora": frozen_args.use_lora,
"sd_vae": frozen_args.custom_vae or "Automatic",
"enable_pnginfo": frozen_args.write_metadata_to_png,
}
# Rest API: /sdapi/v1/cmd-flags (lists command line argument settings)
@sdapi.get(
"/v1/cmd-flags",
summary="lists the command line arguments value that were set on startup.",
)
def cmd_flags_api():
return vars(frozen_args)
# Rest API: /sdapi/v1/txt2img (Text to image)
class ModelOverrideSettings(BaseModel):
sd_model_checkpoint: str = get_model_from_request(
fallback_model="stabilityai/stable-diffusion-2-1-base"
)
class Txt2ImgInputData(GenerationInputData):
enable_hr: bool = frozen_args.use_hiresfix
hr_resize_y: int = Field(
default=frozen_args.hiresfix_height, ge=128, le=768, multiple_of=8
)
hr_resize_x: int = Field(
default=frozen_args.hiresfix_width, ge=128, le=768, multiple_of=8
)
override_settings: ModelOverrideSettings = None
@sdapi.post(
"/v1/txt2img",
summary="Does text to image generation",
response_model=GenerationResponseData,
)
def txt2img_api(InputData: Txt2ImgInputData):
model_id = get_model_from_request(
InputData,
fallback_model="stabilityai/stable-diffusion-2-1-base",
)
scheduler = get_scheduler_from_request(
InputData, "txt2img_hires" if InputData.enable_hr else "txt2img"
)
print(
f"Prompt: {InputData.prompt}, "
f"Negative Prompt: {InputData.negative_prompt}, "
f"Seed: {InputData.seed},"
f"Model: {model_id}, "
f"Scheduler: {scheduler}. "
)
res = txt2img_inf(
InputData.prompt,
InputData.negative_prompt,
InputData.height,
InputData.width,
InputData.steps,
InputData.cfg_scale,
InputData.seed,
batch_count=InputData.n_iter,
batch_size=1,
scheduler=scheduler,
model_id=model_id,
custom_vae=frozen_args.custom_vae or "None",
precision="fp16",
device=get_device(frozen_args.device),
max_length=frozen_args.max_length,
save_metadata_to_json=frozen_args.save_metadata_to_json,
save_metadata_to_png=frozen_args.write_metadata_to_png,
lora_weights=frozen_args.use_lora,
lora_strength=frozen_args.lora_strength,
ondemand=frozen_args.ondemand,
repeatable_seeds=False,
use_hiresfix=InputData.enable_hr,
hiresfix_height=InputData.hr_resize_y,
hiresfix_width=InputData.hr_resize_x,
hiresfix_strength=frozen_args.hiresfix_strength,
resample_type=frozen_args.resample_type,
)
# Since we're not streaming we just want the last generator result
for items_so_far in res:
items = items_so_far
return {
"images": encode_pil_to_base64(items[0]),
"parameters": {},
"info": items[1],
}
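A hedged client-side example of calling this endpoint; the host and port are placeholders (the port is whatever --server_port was set to), and only fields defined on Txt2ImgInputData/GenerationInputData above are sent:
import base64
import requests

payload = {
    "prompt": "a lighthouse at dawn",
    "negative_prompt": "",
    "height": 512,
    "width": 512,
    "steps": 20,
    "cfg_scale": 7.5,
    "seed": -1,
    "n_iter": 1,
}
# sdapi is mounted under /sdapi/ in index.py, so the full path is /sdapi/v1/txt2img
resp = requests.post("http://localhost:8080/sdapi/v1/txt2img", json=payload)
resp.raise_for_status()
with open("txt2img_out.png", "wb") as f:          # extension depends on output_img_format
    f.write(base64.b64decode(resp.json()["images"][0]))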
# Rest API: /sdapi/v1/img2img (Image to image)
class StencilParam(str, Enum):
canny = "canny"
openpose = "openpose"
scribble = "scribble"
zoedepth = "zoedepth"
class Img2ImgInputData(GenerationInputData):
init_images: conlist(str, min_length=1, max_length=2)
denoising_strength: float = frozen_args.strength
use_stencil: StencilParam = frozen_args.use_stencil
override_settings: ModelOverrideSettings = None
@model_validator(mode="after")
def check_image_supplied_for_scribble_stencil(self) -> "Img2ImgInputData":
if (
self.use_stencil == StencilParam.scribble
and len(self.init_images) < 2
):
raise ValueError(
"a second image must be supplied for the controlnet:scribble stencil"
)
return self
@sdapi.post(
"/v1/img2img",
summary="Does image to image generation",
response_model=GenerationResponseData,
)
def img2img_api(
InputData: Img2ImgInputData,
):
model_id = get_model_from_request(
InputData,
fallback_model="stabilityai/stable-diffusion-2-1-base",
)
scheduler = get_scheduler_from_request(InputData, "img2img")
init_image = decode_base64_to_image(InputData.init_images[0])
mask_image = (
decode_base64_to_image(InputData.init_images[1])
if len(InputData.init_images) > 1
else None
)
print(
f"Prompt: {InputData.prompt}, "
f"Negative Prompt: {InputData.negative_prompt}, "
f"Seed: {InputData.seed}, "
f"Model: {model_id}, "
f"Scheduler: {scheduler}."
)
res = img2img_inf(
InputData.prompt,
InputData.negative_prompt,
{"image": init_image, "mask": mask_image},
InputData.height,
InputData.width,
InputData.steps,
InputData.denoising_strength,
InputData.cfg_scale,
InputData.seed,
batch_count=InputData.n_iter,
batch_size=1,
scheduler=scheduler,
model_id=model_id,
custom_vae=frozen_args.custom_vae or "None",
precision="fp16",
device=get_device(frozen_args.device),
max_length=frozen_args.max_length,
use_stencil=InputData.use_stencil,
save_metadata_to_json=frozen_args.save_metadata_to_json,
save_metadata_to_png=frozen_args.write_metadata_to_png,
lora_weights=frozen_args.use_lora,
lora_strength=frozen_args.lora_strength,
ondemand=frozen_args.ondemand,
repeatable_seeds=False,
resample_type=frozen_args.resample_type,
)
# Since we're not streaming we just want the last generator result
for items_so_far in res:
items = items_so_far
return {
"images": encode_pil_to_base64(items[0]),
"parameters": {},
"info": items[1],
}
# Rest API: /sdapi/v1/inpaint (Inpainting)
class PaintModelOverideSettings(BaseModel):
sd_model_checkpoint: str = get_model_from_request(
checkpoint_type="inpainting",
fallback_model="stabilityai/stable-diffusion-2-inpainting",
)
class InpaintInputData(GenerationInputData):
image: str = Field(description="Base64 encoded input image")
mask: str = Field(description="Base64 encoded mask image")
is_full_res: bool = False # Is this setting backwards in the UI?
full_res_padding: int = Field(default=32, ge=0, le=256, multiple_of=4)
denoising_strength: float = frozen_args.strength
use_stencil: StencilParam = frozen_args.use_stencil
override_settings: PaintModelOverideSettings = None
@sdapi.post(
"/v1/inpaint",
summary="Does inpainting generation on an image",
response_model=GenerationResponseData,
)
def inpaint_api(
InputData: InpaintInputData,
):
model_id = get_model_from_request(
InputData,
checkpoint_type="inpainting",
fallback_model="stabilityai/stable-diffusion-2-inpainting",
)
scheduler = get_scheduler_from_request(InputData, "inpaint")
init_image = decode_base64_to_image(InputData.image)
mask = decode_base64_to_image(InputData.mask)
print(
f"Prompt: {InputData.prompt}, "
f"Negative Prompt: {InputData.negative_prompt}, "
f"Seed: {InputData.seed}, "
f"Model: {model_id}, "
f"Scheduler: {scheduler}."
)
res = inpaint_inf(
InputData.prompt,
InputData.negative_prompt,
init_image,
mask,
InputData.height,
InputData.width,
InputData.is_full_res,
InputData.full_res_padding,
InputData.steps,
InputData.cfg_scale,
InputData.seed,
batch_count=InputData.n_iter,
batch_size=1,
scheduler=scheduler,
model_id=model_id,
custom_vae=frozen_args.custom_vae or "None",
precision="fp16",
device=get_device(frozen_args.device),
max_length=frozen_args.max_length,
save_metadata_to_json=frozen_args.save_metadata_to_json,
save_metadata_to_png=frozen_args.write_metadata_to_png,
lora_weights=frozen_args.use_lora,
lora_strength=frozen_args.lora_strength,
ondemand=frozen_args.ondemand,
repeatable_seeds=False,
)
# Since we're not streaming we just want the last generator result
for items_so_far in res:
items = items_so_far
return {
"images": encode_pil_to_base64(items[0]),
"parameters": {},
"info": items[1],
}
# Rest API: /sdapi/v1/outpaint (Outpainting)
class DirectionParam(str, Enum):
left = "left"
right = "right"
up = "up"
down = "down"
class OutpaintInputData(GenerationInputData):
init_images: list[str]
pixels: int = Field(
default=frozen_args.pixels, ge=8, le=256, multiple_of=8
)
mask_blur: int = Field(default=frozen_args.mask_blur, ge=0, le=64)
directions: set[DirectionParam] = [
direction
for direction in ["left", "right", "up", "down"]
if vars(frozen_args)[direction]
]
noise_q: float = frozen_args.noise_q
color_variation: float = frozen_args.color_variation
override_settings: PaintModelOverideSettings = None
@sdapi.post(
"/v1/outpaint",
summary="Does outpainting generation on an image",
response_model=GenerationResponseData,
)
def outpaint_api(
InputData: OutpaintInputData,
):
model_id = get_model_from_request(
InputData,
checkpoint_type="inpainting",
fallback_model="stabilityai/stable-diffusion-2-inpainting",
)
scheduler = get_scheduler_from_request(InputData, "outpaint")
init_image = decode_base64_to_image(InputData.init_images[0])
print(
f"Prompt: {InputData.prompt}, "
f"Negative Prompt: {InputData.negative_prompt}, "
f"Seed: {InputData.seed}, "
f"Model: {model_id}, "
f"Scheduler: {scheduler}."
)
res = outpaint_inf(
InputData.prompt,
InputData.negative_prompt,
init_image,
InputData.pixels,
InputData.mask_blur,
InputData.directions,
InputData.noise_q,
InputData.color_variation,
InputData.height,
InputData.width,
InputData.steps,
InputData.cfg_scale,
InputData.seed,
batch_count=InputData.n_iter,
batch_size=1,
scheduler=scheduler,
model_id=model_id,
custom_vae=frozen_args.custom_vae or "None",
precision="fp16",
device=get_device(frozen_args.device),
max_length=frozen_args.max_length,
save_metadata_to_json=frozen_args.save_metadata_to_json,
save_metadata_to_png=frozen_args.write_metadata_to_png,
lora_weights=frozen_args.use_lora,
lora_strength=frozen_args.lora_strength,
ondemand=frozen_args.ondemand,
repeatable_seeds=False,
)
# Since we're not streaming we just want the last generator result
for items_so_far in res:
items = items_so_far
return {
"images": encode_pil_to_base64(items[0]),
"parameters": {},
"info": items[1],
}
# Rest API: /sdapi/v1/upscaler (Upscaling)
class UpscalerModelOverideSettings(BaseModel):
sd_model_checkpoint: str = get_model_from_request(
checkpoint_type="upscaler",
fallback_model="stabilityai/stable-diffusion-x4-upscaler",
)
class UpscalerInputData(GenerationInputData):
init_images: list[str] = Field(
description="Base64 encoded image to upscale"
)
noise_level: int = frozen_args.noise_level
override_settings: UpscalerModelOverideSettings = None
@sdapi.post(
"/v1/upscaler",
summary="Does image upscaling",
response_model=GenerationResponseData,
)
def upscaler_api(
InputData: UpscalerInputData,
):
model_id = get_model_from_request(
InputData,
checkpoint_type="upscaler",
fallback_model="stabilityai/stable-diffusion-x4-upscaler",
)
scheduler = get_scheduler_from_request(InputData, "upscaler")
init_image = decode_base64_to_image(InputData.init_images[0])
print(
f"Prompt: {InputData.prompt}, "
f"Negative Prompt: {InputData.negative_prompt}, "
f"Seed: {InputData.seed}, "
f"Model: {model_id}, "
f"Scheduler: {scheduler}."
)
res = upscaler_inf(
InputData.prompt,
InputData.negative_prompt,
init_image,
InputData.height,
InputData.width,
InputData.steps,
InputData.noise_level,
InputData.cfg_scale,
InputData.seed,
batch_count=InputData.n_iter,
batch_size=1,
scheduler=scheduler,
model_id=model_id,
custom_vae=frozen_args.custom_vae or "None",
precision="fp16",
device=get_device(frozen_args.device),
max_length=frozen_args.max_length,
save_metadata_to_json=frozen_args.save_metadata_to_json,
save_metadata_to_png=frozen_args.write_metadata_to_png,
lora_weights=frozen_args.use_lora,
lora_strength=frozen_args.lora_strength,
ondemand=frozen_args.ondemand,
repeatable_seeds=False,
)
# Since we're not streaming we just want the last generator result
for items_so_far in res:
items = items_so_far
return {
"images": encode_pil_to_base64(items[0]),
"parameters": {},
"info": items[1],
}

View File

@@ -0,0 +1,200 @@
import base64
import pickle
from argparse import Namespace
from fastapi.exceptions import HTTPException
from io import BytesIO
from PIL import Image
from pydantic import BaseModel, Field
from apps.stable_diffusion.src import args
from apps.stable_diffusion.web.ui.utils import (
available_devices,
get_custom_model_files,
predefined_models,
predefined_paint_models,
predefined_upscaler_models,
scheduler_list,
scheduler_list_cpu_only,
)
# Probably overly cautious, but try to ensure we only use the starting
# args in each api call, as the code does `args.<whatever> = <changed_value>`
# in lots of places and in testing, it seemed to me, these changes leaked
# into subsequent api calls.
# Roundtripping through pickle for deepcopy, there is probably a better way
frozen_args = Namespace(**(pickle.loads(pickle.dumps(vars(args)))))
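As the comment concedes, the pickle round-trip is just a deep copy of the parsed args; for the plain values argparse produces, an equivalent and arguably clearer sketch would be:
import copy
from argparse import Namespace

# same freeze of the startup args, without the pickle round-trip
frozen_args = Namespace(**copy.deepcopy(vars(args)))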
# an attempt to map some of the A1111 sampler names to scheduler names
# https://github.com/huggingface/diffusers/issues/4167 is where the
# (not so obvious) ones come from
sampler_aliases = {
# a1111/onnx (these point to diffusers classes in A1111)
"pndm": "PNDM",
"heun": "HeunDiscrete",
"ddim": "DDIM",
"ddpm": "DDPM",
"euler": "EulerDiscrete",
"euler-ancestral": "EulerAncestralDiscrete",
"dpm": "DPMSolverMultistep",
# a1111/k_diffusion (the obvious ones)
"Euler a": "EulerAncestralDiscrete",
"Euler": "EulerDiscrete",
"LMS": "LMSDiscrete",
"Heun": "HeunDiscrete",
# a1111/k_diffusion (not so obvious)
"DPM++ 2M": "DPMSolverMultistep",
"DPM++ 2M Karras": "DPMSolverMultistepKarras",
"DPM++ 2M SDE": "DPMSolverMultistep++",
"DPM++ 2M SDE Karras": "DPMSolverMultistepKarras++",
"DPM2": "KDPM2Discrete",
"DPM2 a": "KDPM2AncestralDiscrete",
}
allowed_schedulers = {
"txt2img": {
"schedulers": scheduler_list,
"fallback": "SharkEulerDiscrete",
},
"txt2img_hires": {
"schedulers": scheduler_list_cpu_only,
"fallback": "DEISMultistep",
},
"img2img": {
"schedulers": scheduler_list_cpu_only,
"fallback": "EulerDiscrete",
},
"inpaint": {
"schedulers": scheduler_list_cpu_only,
"fallback": "DDIM",
},
"outpaint": {
"schedulers": scheduler_list_cpu_only,
"fallback": "DDIM",
},
"upscaler": {
"schedulers": scheduler_list_cpu_only,
"fallback": "DDIM",
},
}
# base pydantic model for sd generation apis
class GenerationInputData(BaseModel):
prompt: str = ""
negative_prompt: str = ""
hf_model_id: str | None = None
height: int = Field(
default=frozen_args.height, ge=128, le=768, multiple_of=8
)
width: int = Field(
default=frozen_args.width, ge=128, le=768, multiple_of=8
)
sampler_name: str = frozen_args.scheduler
cfg_scale: float = Field(default=frozen_args.guidance_scale, ge=1)
steps: int = Field(default=frozen_args.steps, ge=1, le=100)
seed: int = frozen_args.seed
n_iter: int = Field(default=frozen_args.batch_count)
class GenerationResponseData(BaseModel):
images: list[str] = Field(description="Generated images, Base64 encoded")
properties: dict = {}
info: str
# image encoding/decoding
def encode_pil_to_base64(images: list[Image.Image]):
encoded_imgs = []
for image in images:
with BytesIO() as output_bytes:
if frozen_args.output_img_format.lower() == "png":
image.save(output_bytes, format="PNG")
elif frozen_args.output_img_format.lower() in ("jpg", "jpeg"):
image.save(output_bytes, format="JPEG")
else:
raise HTTPException(
status_code=500, detail="Invalid image format"
)
bytes_data = output_bytes.getvalue()
encoded_imgs.append(base64.b64encode(bytes_data))
return encoded_imgs
def decode_base64_to_image(encoding: str):
if encoding.startswith("data:image/"):
encoding = encoding.split(";", 1)[1].split(",", 1)[1]
try:
image = Image.open(BytesIO(base64.b64decode(encoding)))
return image
except Exception as err:
print(err)
raise HTTPException(status_code=400, detail="Invalid encoded image")
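decode_base64_to_image accepts either a bare base64 string or a full data URI, so a client can build init_images entries in whichever form is convenient (illustrative sketch; the file name is a placeholder):
import base64

with open("init.png", "rb") as f:
    b64 = base64.b64encode(f.read()).decode("ascii")

init_images = [b64]                                # bare base64 works
init_images = ["data:image/png;base64," + b64]     # so does the data URI form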
# get valid sd models/vaes/schedulers etc.
def get_predefined_models(custom_checkpoint_type: str):
match custom_checkpoint_type:
case "inpainting":
return predefined_paint_models
case "upscaler":
return predefined_upscaler_models
case _:
return predefined_models
def get_model_from_request(
request_data=None,
checkpoint_type: str = "",
fallback_model: str = "",
):
model = None
if request_data:
if request_data.hf_model_id:
model = request_data.hf_model_id
elif request_data.override_settings:
model = request_data.override_settings.sd_model_checkpoint
# if the request didn't specify a model try the command line args
result = model or frozen_args.ckpt_loc or frozen_args.hf_model_id
# make sure whatever we have is a valid model for the checkpoint type
if result in get_custom_model_files(
custom_checkpoint_type=checkpoint_type
) + get_predefined_models(checkpoint_type):
return result
# if not return what was specified as the fallback
else:
return fallback_model
def get_scheduler_from_request(
request_data: GenerationInputData, operation: str
):
allowed = allowed_schedulers[operation]
requested = request_data.sampler_name
requested = sampler_aliases.get(requested, requested)
return (
requested
if requested in allowed["schedulers"]
else allowed["fallback"]
)
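Resolution therefore goes A1111 alias -> scheduler name -> per-operation fallback. For example, assuming EulerAncestralDiscrete is present in scheduler_list (illustrative only):
class _Req:                    # stand-in for a GenerationInputData instance
    sampler_name = "Euler a"

get_scheduler_from_request(_Req(), "txt2img")    # -> "EulerAncestralDiscrete" via alias
_Req.sampler_name = "NotAScheduler"
get_scheduler_from_request(_Req(), "img2img")    # -> "EulerDiscrete" (img2img fallback)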
def get_device(device_str: str):
# first substring match in the list of available devices, falling back
# to the first device when none match
return next(
(device for device in available_devices if device_str in device),
available_devices[0],
)

View File

@@ -1,6 +1,8 @@
from multiprocessing import Process, freeze_support
from multiprocessing import freeze_support
import os
import sys
import logging
import apps.stable_diffusion.web.utils.app as app
if sys.platform == "darwin":
# import before IREE to avoid torch-MLIR library issues
@@ -20,78 +22,74 @@ if args.clear_all:
clear_all()
def launch_app(address):
from tkinter import Tk
import webview
window = Tk()
# get screen width and height of display and make it more reasonably
# sized as we aren't making it full-screen or maximized
width = int(window.winfo_screenwidth() * 0.81)
height = int(window.winfo_screenheight() * 0.91)
webview.create_window(
"SHARK AI Studio",
url=address,
width=width,
height=height,
text_select=True,
)
webview.start(private_mode=False)
if __name__ == "__main__":
if args.debug:
logging.basicConfig(level=logging.DEBUG)
# required to do multiprocessing in a pyinstaller freeze
freeze_support()
if args.api or "api" in args.ui.split(","):
from apps.stable_diffusion.web.ui import (
txt2img_api,
img2img_api,
upscaler_api,
inpaint_api,
outpaint_api,
llm_chat_api,
)
from apps.stable_diffusion.web.api import sdapi
from fastapi import FastAPI, APIRouter
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
# init global sd pipeline and config
global_obj._init()
app = FastAPI()
app.add_api_route("/sdapi/v1/txt2img", txt2img_api, methods=["post"])
app.add_api_route("/sdapi/v1/img2img", img2img_api, methods=["post"])
app.add_api_route("/sdapi/v1/inpaint", inpaint_api, methods=["post"])
app.add_api_route("/sdapi/v1/outpaint", outpaint_api, methods=["post"])
app.add_api_route("/sdapi/v1/upscaler", upscaler_api, methods=["post"])
api = FastAPI()
api.mount("/sdapi/", sdapi)
# chat APIs needed for compatibility with multiple extensions using OpenAI API
app.add_api_route(
api.add_api_route(
"/v1/chat/completions", llm_chat_api, methods=["post"]
)
app.add_api_route("/v1/completions", llm_chat_api, methods=["post"])
app.add_api_route("/chat/completions", llm_chat_api, methods=["post"])
app.add_api_route("/completions", llm_chat_api, methods=["post"])
app.add_api_route(
api.add_api_route("/v1/completions", llm_chat_api, methods=["post"])
api.add_api_route("/chat/completions", llm_chat_api, methods=["post"])
api.add_api_route("/completions", llm_chat_api, methods=["post"])
api.add_api_route(
"/v1/engines/codegen/completions", llm_chat_api, methods=["post"]
)
app.include_router(APIRouter())
uvicorn.run(app, host="0.0.0.0", port=args.server_port)
api.include_router(APIRouter())
# deal with CORS requests if CORS accept origins are set
if args.api_accept_origin:
print(
f"API Configured for CORS. Accepting origins: { args.api_accept_origin }"
)
api.add_middleware(
CORSMiddleware,
allow_origins=args.api_accept_origin,
allow_methods=["GET", "POST"],
allow_headers=["*"],
)
else:
print("API not configured for CORS")
uvicorn.run(api, host="0.0.0.0", port=args.server_port)
sys.exit(0)
# Setup to use shark_tmp for gradio's temporary image files and clear any
# existing temporary images there if they exist. Then we can import gradio.
# It has to be in this order or gradio ignores what we've set up.
from apps.stable_diffusion.web.utils.gradio_configs import (
config_gradio_tmp_imgs_folder,
from apps.stable_diffusion.web.utils.tmp_configs import (
config_tmp,
shark_tmp,
)
config_gradio_tmp_imgs_folder()
config_tmp()
import gradio as gr
# Create custom models folders if they don't exist
from apps.stable_diffusion.web.ui.utils import create_custom_models_folders
from apps.stable_diffusion.web.ui.utils import (
create_custom_models_folders,
nodicon_loc,
mask_editor_value_for_gallery_data,
mask_editor_value_for_image_file,
)
create_custom_models_folders()
@@ -102,12 +100,9 @@ if __name__ == "__main__":
)
return os.path.join(base_path, relative_path)
dark_theme = resource_path("ui/css/sd_dark_theme.css")
from apps.stable_diffusion.web.ui import (
txt2img_web,
txt2img_custom_model,
txt2img_hf_model_id,
txt2img_gallery,
txt2img_png_info_img,
txt2img_status,
@@ -115,10 +110,20 @@ if __name__ == "__main__":
txt2img_sendto_inpaint,
txt2img_sendto_outpaint,
txt2img_sendto_upscaler,
h2ogpt_web,
# SDXL
txt2img_sdxl_web,
txt2img_sdxl_custom_model,
txt2img_sdxl_gallery,
txt2img_sdxl_png_info_img,
txt2img_sdxl_status,
txt2img_sdxl_sendto_img2img,
txt2img_sdxl_sendto_inpaint,
txt2img_sdxl_sendto_outpaint,
txt2img_sdxl_sendto_upscaler,
# h2ogpt_upload,
# h2ogpt_web,
img2img_web,
img2img_custom_model,
img2img_hf_model_id,
img2img_gallery,
img2img_init_image,
img2img_status,
@@ -127,7 +132,6 @@ if __name__ == "__main__":
img2img_sendto_upscaler,
inpaint_web,
inpaint_custom_model,
inpaint_hf_model_id,
inpaint_gallery,
inpaint_init_image,
inpaint_status,
@@ -136,7 +140,6 @@ if __name__ == "__main__":
inpaint_sendto_upscaler,
outpaint_web,
outpaint_custom_model,
outpaint_hf_model_id,
outpaint_gallery,
outpaint_init_image,
outpaint_status,
@@ -145,15 +148,15 @@ if __name__ == "__main__":
outpaint_sendto_upscaler,
upscaler_web,
upscaler_custom_model,
upscaler_hf_model_id,
upscaler_gallery,
upscaler_init_image,
upscaler_status,
upscaler_sendto_img2img,
upscaler_sendto_inpaint,
upscaler_sendto_outpaint,
lora_train_web,
model_web,
# lora_train_web,
# model_web,
model_config_web,
hf_models,
modelmanager_sendto_txt2img,
modelmanager_sendto_img2img,
@@ -167,6 +170,7 @@ if __name__ == "__main__":
outputgallery_watch,
outputgallery_filename,
outputgallery_sendto_txt2img,
outputgallery_sendto_txt2img_sdxl,
outputgallery_sendto_img2img,
outputgallery_sendto_inpaint,
outputgallery_sendto_outpaint,
@@ -176,11 +180,21 @@ if __name__ == "__main__":
# init global sd pipeline and config
global_obj._init()
def register_button_click(button, selectedid, inputs, outputs):
def register_sendto_click(button, selectedid, inputs, outputs):
button.click(
lambda x: (
x[0]["name"] if len(x) != 0 else None,
gr.Tabs.update(selected=selectedid),
x.root[0].image.path if len(x.root) != 0 else None,
gr.Tabs(selected=selectedid),
),
inputs,
outputs,
)
def register_sendto_editor_click(button, selectedid, inputs, outputs):
button.click(
lambda x: (
mask_editor_value_for_gallery_data(x),
gr.Tabs(selected=selectedid),
),
inputs,
outputs,
@@ -191,26 +205,56 @@ if __name__ == "__main__":
lambda x: (
"None",
x,
gr.Tabs.update(selected=selectedid),
gr.Tabs(selected=selectedid),
),
inputs,
outputs,
queue=False,
)
def register_outputgallery_button(button, selectedid, inputs, outputs):
def register_outputgallery_sendto_button(
button, selectedid, inputs, outputs
):
button.click(
lambda x: (
x,
gr.Tabs.update(selected=selectedid),
gr.Tabs(selected=selectedid),
),
inputs,
outputs,
)
def register_outputgallery_sendto_editor_button(
button, selectedid, inputs, outputs
):
button.click(
lambda x: (
mask_editor_value_for_image_file(x),
gr.Tabs(selected=selectedid),
),
inputs,
outputs,
)
dark_theme = resource_path("ui/css/sd_dark_theme.css")
gradio_workarounds = resource_path("ui/js/sd_gradio_workarounds.js")
with gr.Blocks(
css=dark_theme, analytics_enabled=False, title="Stable Diffusion"
css=dark_theme,
js=gradio_workarounds,
analytics_enabled=False,
title="SHARK AI Studio",
) as sd_web:
with gr.Tabs() as tabs:
# NOTE: If adding, removing, or re-ordering tabs, make sure that they
# have a unique id that doesn't clash with any of the other tabs,
# and that the order in the code here is the order they should
# appear in the ui, as the id value doesn't determine the order.
# Where possible, avoid changing the id of any tab that is the
# destination of one of the 'send to' buttons. If you do have to change
# that id, make sure you update the relevant register_button_click calls
# further down with the new id.
with gr.TabItem(label="Text-to-Image", id=0):
txt2img_web.render()
with gr.TabItem(label="Image-to-Image", id=1):
@@ -224,197 +268,217 @@ if __name__ == "__main__":
if args.output_gallery:
with gr.TabItem(label="Output Gallery", id=5) as og_tab:
outputgallery_web.render()
# extra output gallery configuration
outputgallery_tab_select(og_tab.select)
outputgallery_watch(
[
txt2img_status,
img2img_status,
inpaint_status,
outpaint_status,
upscaler_status,
]
)
with gr.TabItem(label="Model Manager", id=6):
model_web.render()
with gr.TabItem(label="LoRA Training (Experimental)", id=8):
lora_train_web.render()
with gr.TabItem(label="Chat Bot (Experimental)", id=7):
# with gr.TabItem(label="Model Manager", id=6):
# model_web.render()
# with gr.TabItem(label="LoRA Training (Experimental)", id=7):
# lora_train_web.render()
with gr.TabItem(label="Chat Bot", id=8):
stablelm_chat.render()
with gr.TabItem(label="MultiModal (Experimental)", id=9):
minigpt4_web.render()
with gr.TabItem(label="DocuChat(Experimental)", id=10):
h2ogpt_web.render()
# with gr.TabItem(
# label="Generate Sharding Config (Experimental)", id=9
# ):
# model_config_web.render()
# with gr.TabItem(label="MultiModal (Experimental)", id=10):
# minigpt4_web.render()
# with gr.TabItem(label="DocuChat Upload", id=11):
# h2ogpt_upload.render()
# with gr.TabItem(label="DocuChat(Experimental)", id=12):
# h2ogpt_web.render()
with gr.TabItem(label="Text-to-Image (SDXL)", id=13):
txt2img_sdxl_web.render()
# extra output gallery configuration
outputgallery_tab_select(og_tab.select)
outputgallery_watch(
[
txt2img_status,
img2img_status,
inpaint_status,
outpaint_status,
upscaler_status,
txt2img_sdxl_status,
],
)
actual_port = app.usable_port()
if actual_port != args.server_port:
sd_web.load(
fn=lambda: gr.Info(
f"Port {args.server_port} is in use by another application. "
f"Shark is running on port {actual_port} instead."
)
)
# send to buttons
register_button_click(
register_sendto_click(
txt2img_sendto_img2img,
1,
[txt2img_gallery],
[img2img_init_image, tabs],
)
register_button_click(
register_sendto_editor_click(
txt2img_sendto_inpaint,
2,
[txt2img_gallery],
[inpaint_init_image, tabs],
)
register_button_click(
register_sendto_click(
txt2img_sendto_outpaint,
3,
[txt2img_gallery],
[outpaint_init_image, tabs],
)
register_button_click(
register_sendto_click(
txt2img_sendto_upscaler,
4,
[txt2img_gallery],
[upscaler_init_image, tabs],
)
register_button_click(
register_sendto_editor_click(
img2img_sendto_inpaint,
2,
[img2img_gallery],
[inpaint_init_image, tabs],
)
register_button_click(
register_sendto_click(
img2img_sendto_outpaint,
3,
[img2img_gallery],
[outpaint_init_image, tabs],
)
register_button_click(
register_sendto_click(
img2img_sendto_upscaler,
4,
[img2img_gallery],
[upscaler_init_image, tabs],
)
register_button_click(
register_sendto_click(
inpaint_sendto_img2img,
1,
[inpaint_gallery],
[img2img_init_image, tabs],
)
register_button_click(
register_sendto_click(
inpaint_sendto_outpaint,
3,
[inpaint_gallery],
[outpaint_init_image, tabs],
)
register_button_click(
register_sendto_click(
inpaint_sendto_upscaler,
4,
[inpaint_gallery],
[upscaler_init_image, tabs],
)
register_button_click(
register_sendto_click(
outpaint_sendto_img2img,
1,
[outpaint_gallery],
[img2img_init_image, tabs],
)
register_button_click(
register_sendto_editor_click(
outpaint_sendto_inpaint,
2,
[outpaint_gallery],
[inpaint_init_image, tabs],
)
register_button_click(
register_sendto_click(
outpaint_sendto_upscaler,
4,
[outpaint_gallery],
[upscaler_init_image, tabs],
)
register_button_click(
register_sendto_click(
upscaler_sendto_img2img,
1,
[upscaler_gallery],
[img2img_init_image, tabs],
)
register_button_click(
register_sendto_editor_click(
upscaler_sendto_inpaint,
2,
[upscaler_gallery],
[inpaint_init_image, tabs],
)
register_button_click(
register_sendto_click(
upscaler_sendto_outpaint,
3,
[upscaler_gallery],
[outpaint_init_image, tabs],
)
if args.output_gallery:
register_outputgallery_button(
register_outputgallery_sendto_button(
outputgallery_sendto_txt2img,
0,
[outputgallery_filename],
[txt2img_png_info_img, tabs],
)
register_outputgallery_button(
register_outputgallery_sendto_button(
outputgallery_sendto_img2img,
1,
[outputgallery_filename],
[img2img_init_image, tabs],
)
register_outputgallery_button(
register_outputgallery_sendto_editor_button(
outputgallery_sendto_inpaint,
2,
[outputgallery_filename],
[inpaint_init_image, tabs],
)
register_outputgallery_button(
register_outputgallery_sendto_button(
outputgallery_sendto_outpaint,
3,
[outputgallery_filename],
[outpaint_init_image, tabs],
)
register_outputgallery_button(
register_outputgallery_sendto_button(
outputgallery_sendto_upscaler,
4,
[outputgallery_filename],
[upscaler_init_image, tabs],
)
register_outputgallery_sendto_button(
outputgallery_sendto_txt2img_sdxl,
0,
[outputgallery_filename],
[txt2img_sdxl_png_info_img, tabs],
)
register_modelmanager_button(
modelmanager_sendto_txt2img,
0,
[hf_models],
[txt2img_custom_model, txt2img_hf_model_id, tabs],
[txt2img_custom_model, tabs],
)
register_modelmanager_button(
modelmanager_sendto_img2img,
1,
[hf_models],
[img2img_custom_model, img2img_hf_model_id, tabs],
[img2img_custom_model, tabs],
)
register_modelmanager_button(
modelmanager_sendto_inpaint,
2,
[hf_models],
[inpaint_custom_model, inpaint_hf_model_id, tabs],
[inpaint_custom_model, tabs],
)
register_modelmanager_button(
modelmanager_sendto_outpaint,
3,
[hf_models],
[outpaint_custom_model, outpaint_hf_model_id, tabs],
[outpaint_custom_model, tabs],
)
register_modelmanager_button(
modelmanager_sendto_upscaler,
4,
[hf_models],
[upscaler_custom_model, upscaler_hf_model_id, tabs],
[upscaler_custom_model, tabs],
)
sd_web.queue()
if args.ui == "app":
t = Process(
target=launch_app, args=[f"http://localhost:{args.server_port}"]
)
t.start()
sd_web.launch(
share=args.share,
inbrowser=args.ui == "web",
inbrowser=not app.launch(actual_port),
server_name="0.0.0.0",
server_port=args.server_port,
server_port=actual_port,
favicon_path=nodicon_loc,
)

View File

@@ -1,9 +1,7 @@
from apps.stable_diffusion.web.ui.txt2img_ui import (
txt2img_inf,
txt2img_api,
txt2img_web,
txt2img_custom_model,
txt2img_hf_model_id,
txt2img_gallery,
txt2img_png_info_img,
txt2img_status,
@@ -12,12 +10,22 @@ from apps.stable_diffusion.web.ui.txt2img_ui import (
txt2img_sendto_outpaint,
txt2img_sendto_upscaler,
)
from apps.stable_diffusion.web.ui.txt2img_sdxl_ui import (
txt2img_sdxl_inf,
txt2img_sdxl_web,
txt2img_sdxl_custom_model,
txt2img_sdxl_gallery,
txt2img_sdxl_status,
txt2img_sdxl_png_info_img,
txt2img_sdxl_sendto_img2img,
txt2img_sdxl_sendto_inpaint,
txt2img_sdxl_sendto_outpaint,
txt2img_sdxl_sendto_upscaler,
)
from apps.stable_diffusion.web.ui.img2img_ui import (
img2img_inf,
img2img_api,
img2img_web,
img2img_custom_model,
img2img_hf_model_id,
img2img_gallery,
img2img_init_image,
img2img_status,
@@ -27,10 +35,8 @@ from apps.stable_diffusion.web.ui.img2img_ui import (
)
from apps.stable_diffusion.web.ui.inpaint_ui import (
inpaint_inf,
inpaint_api,
inpaint_web,
inpaint_custom_model,
inpaint_hf_model_id,
inpaint_gallery,
inpaint_init_image,
inpaint_status,
@@ -40,10 +46,8 @@ from apps.stable_diffusion.web.ui.inpaint_ui import (
)
from apps.stable_diffusion.web.ui.outpaint_ui import (
outpaint_inf,
outpaint_api,
outpaint_web,
outpaint_custom_model,
outpaint_hf_model_id,
outpaint_gallery,
outpaint_init_image,
outpaint_status,
@@ -53,10 +57,8 @@ from apps.stable_diffusion.web.ui.outpaint_ui import (
)
from apps.stable_diffusion.web.ui.upscaler_ui import (
upscaler_inf,
upscaler_api,
upscaler_web,
upscaler_custom_model,
upscaler_hf_model_id,
upscaler_gallery,
upscaler_init_image,
upscaler_status,
@@ -78,7 +80,7 @@ from apps.stable_diffusion.web.ui.stablelm_ui import (
stablelm_chat,
llm_chat_api,
)
from apps.stable_diffusion.web.ui.h2ogpt import h2ogpt_web
from apps.stable_diffusion.web.ui.generate_config import model_config_web
from apps.stable_diffusion.web.ui.minigpt4_ui import minigpt4_web
from apps.stable_diffusion.web.ui.outputgallery_ui import (
outputgallery_web,
@@ -86,6 +88,7 @@ from apps.stable_diffusion.web.ui.outputgallery_ui import (
outputgallery_watch,
outputgallery_filename,
outputgallery_sendto_txt2img,
outputgallery_sendto_txt2img_sdxl,
outputgallery_sendto_img2img,
outputgallery_sendto_inpaint,
outputgallery_sendto_outpaint,

View File

@@ -0,0 +1,64 @@
import gradio as gr
from apps.stable_diffusion.web.ui.utils import (
HSLHue,
hsl_color,
get_lora_metadata,
)
# Answers HTML to show the most frequent tags used when a LoRA was trained,
# taken from the metadata of its .safetensors file.
def lora_changed(lora_file):
# tag frequency percentage that gets the maximum amount of the starting hue
TAG_COLOR_THRESHOLD = 0.55
# tag frequency percentage, above which a tag is displayed
TAG_DISPLAY_THRESHOLD = 0.65
# template for the html used to display a tag
TAG_HTML_TEMPLATE = '<span class="lora-tag" style="border: 1px solid {color};">{tag}</span>'
if lora_file == "None":
return ["<div><i>No LoRA selected</i></div>"]
elif not lora_file.lower().endswith(".safetensors"):
return [
"<div><i>Only metadata queries for .safetensors files are currently supported</i></div>"
]
else:
metadata = get_lora_metadata(lora_file)
if metadata:
frequencies = metadata["frequencies"]
return [
"".join(
[
f'<div class="lora-model">Trained against weights in: {metadata["model"]}</div>'
]
+ [
TAG_HTML_TEMPLATE.format(
color=hsl_color(
(tag[1] - TAG_COLOR_THRESHOLD)
/ (1 - TAG_COLOR_THRESHOLD),
start=HSLHue.RED,
end=HSLHue.GREEN,
),
tag=tag[0],
)
for tag in frequencies
if tag[1] > TAG_DISPLAY_THRESHOLD
],
)
]
elif metadata is None:
return [
"<div><i>This LoRA does not publish tag frequency metadata</i></div>"
]
else:
return [
"<div><i>This LoRA has empty tag frequency metadata, or we could not parse it</i></div>"
]
def lora_strength_changed(strength):
if strength > 1.0:
return gr.Number(elem_classes="value-out-of-range")
else:
return gr.Number(elem_classes="")

View File

@@ -105,7 +105,19 @@ body {
background-color: var(--background-fill-primary);
}
/* display in full width for desktop devices */
.generating.svelte-zlszon.svelte-zlszon {
border: none;
}
.generating {
border: none !important;
}
#chatbot {
height: 100% !important;
}
/* display in full width for desktop devices, but see below */
@media (min-width: 1536px)
{
.gradio-container {
@@ -113,12 +125,17 @@ body {
}
}
.gradio-container .contain {
padding: 0 var(--size-4) !important;
/* media rules in custom css don't appear to be applied in
gradio versions > 4.7, so we have to define a class which
we will manually need to add and remove using javascript.
Remove this once this is fixed in gradio.
*/
.gradio-container-size-full {
max-width: var(--size-full) !important;
}
#ui_title {
padding: var(--size-2) 0 0 var(--size-1);
.gradio-container .contain {
padding: 0 var(--size-4) !important;
}
#top_logo {
@@ -128,6 +145,10 @@ body {
border: 0;
}
#ui_title {
padding: var(--size-2) 0 0 var(--size-1);
}
#demo_title_outer {
border-radius: 0;
}
@@ -170,6 +191,8 @@ footer {
aspect-ratio: unset;
max-height: calc(55vh - (2 * var(--spacing-lg)));
}
/* fix width and height of gallery items when on very large desktop screens, but see below */
@media (min-width: 1921px) {
/* Force a 768px_height + 4px_margin_height + navbar_height for the gallery */
#gallery .grid-wrap, #gallery .preview{
@@ -181,6 +204,20 @@ footer {
max-height: 770px !important;
}
}
/* media rules in custom css don't appear to be applied in
gradio versions > 4.7, so we have to define classes which
we will manually need to add and remove using javascript.
Remove this once this is fixed in gradio.
*/
.gallery-force-height768 .grid-wrap, .gallery-force-height768 .preview {
min-height: calc(768px + 4px + var(--size-14)) !important;
max-height: calc(768px + 4px + var(--size-14)) !important;
}
.gallery-limit-height768 .thumbnail-item.thumbnail-lg {
max-height: 770px !important;
}
/* Don't upscale when viewing in solo image mode */
#gallery .preview img {
object-fit: scale-down;
@@ -222,18 +259,19 @@ footer {
display:none;
}
/* Hide the download icon from the nod logo */
#top_logo button {
display: none;
}
/* workarounds for container=false not currently working for dropdowns */
.dropdown_no_container {
padding: 0 !important;
}
#output_subdir_container :first-child {
border: none;
#output_subdir_container {
background-color: var(--block-background-fill);
padding-right: 8px;
}
/* number input value is out of range */
.value-out-of-range input[type="number"] {
color: red !important;
}
/* reduced animation load when generating */
@@ -246,16 +284,54 @@ footer {
background-color: var(--block-label-background-fill);
}
/* lora tag pills */
.lora-tags {
border: 1px solid var(--border-color-primary);
color: var(--block-info-text-color) !important;
padding: var(--block-padding);
}
.lora-tag {
display: inline-block;
height: 2em;
color: rgb(212 212 212) !important;
margin-right: 5pt;
margin-bottom: 5pt;
padding: 2pt 5pt;
border-radius: 5pt;
white-space: nowrap;
}
.lora-model {
margin-bottom: var(--spacing-lg);
color: var(--block-info-text-color) !important;
line-height: var(--line-sm);
}
/* output gallery tab */
.output_parameters_dataframe table.table {
/* works around a gradio bug that always shows scrollbars */
overflow: clip auto;
}
.output_parameters_dataframe .cell-wrap span {
/* inadequate workaround for gradio issue #6086 */
user-select:text !important;
-moz-user-select:text !important;
-webkit-user-select:text !important;
-o-user-select:text !important;
-ms-user-select:text !important;
}
.output_parameters_dataframe tbody td {
font-size: small;
line-height: var(--line-xs)
line-height: var(--line-xs);
}
.output_icon_button {
max-width: 30px;
align-self: end;
padding-bottom: 8px;
padding-bottom: 16px !important;
}
.outputgallery_sendto {
@@ -272,6 +348,11 @@ footer {
object-fit: contain !important;
}
/* use the whole gallery area for previews */
#outputgallery_gallery .preview {
width: inherit;
}
/* centered logo for when there are no images */
#top_logo.logo_centered {
height: 100%;


@@ -0,0 +1,41 @@
import gradio as gr
import torch
from transformers import AutoTokenizer
from apps.language_models.src.model_wrappers.vicuna_model import CombinedModel
from shark.shark_generate_model_config import GenerateConfigFile
def get_model_config():
hf_model_path = "TheBloke/vicuna-7B-1.1-HF"
tokenizer = AutoTokenizer.from_pretrained(hf_model_path, use_fast=False)
compilation_prompt = "".join(["0" for _ in range(17)])
compilation_input_ids = tokenizer(
compilation_prompt,
return_tensors="pt",
).input_ids
compilation_input_ids = torch.tensor(compilation_input_ids).reshape(
[1, 19]
)
firstVicunaCompileInput = (compilation_input_ids,)
model = CombinedModel()
c = GenerateConfigFile(model, 1, ["gpu_id"], firstVicunaCompileInput)
return c.split_into_layers()
with gr.Blocks() as model_config_web:
with gr.Row():
hf_models = gr.Dropdown(
label="Model List",
choices=["Vicuna"],
value="Vicuna",
visible=True,
)
get_model_config_btn = gr.Button(value="Get Model Config")
json_view = gr.JSON()
get_model_config_btn.click(
fn=get_model_config,
inputs=[],
outputs=[json_view],
)


@@ -12,6 +12,10 @@ from apps.language_models.langchain.enums import (
LangChainAction,
)
import apps.language_models.langchain.gen as gen
from gpt_langchain import (
path_to_docs,
create_or_update_db,
)
from apps.stable_diffusion.src import args
@@ -33,8 +37,15 @@ start_message = """
def create_prompt(history):
system_message = start_message
for item in history:
print("His item: ", item)
conversation = "".join(["".join([item[0], item[1]]) for item in history])
conversation = "<|endoftext|>".join(
[
"<|endoftext|><|answer|>".join([item[0], item[1]])
for item in history
]
)
msg = system_message + conversation
msg = msg.strip()
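A quick worked example of the separator format built above, using a hypothetical two-turn history:
history = [["Hi", "Hello!"], ["How are you?", "Fine."]]
conversation = "<|endoftext|>".join(
    "<|endoftext|><|answer|>".join([q, a]) for q, a in history
)
print(conversation)
# Hi<|endoftext|><|answer|>Hello!<|endoftext|>How are you?<|endoftext|><|answer|>Fine.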
@@ -44,10 +55,12 @@ def create_prompt(history):
def chat(curr_system_message, history, device, precision):
args.run_docuchat_web = True
global h2ogpt_model
global sharkModel
global h2ogpt_tokenizer
global model_state
global langchain
global userpath_selector
from apps.language_models.langchain.h2oai_pipeline import generate_token
if h2ogpt_model == 0:
if "cuda" in device:
@@ -102,9 +115,14 @@ def chat(curr_system_message, history, device, precision):
prompt_type=None,
prompt_dict=None,
)
from apps.language_models.langchain.h2oai_pipeline import (
H2OGPTSHARKModel,
)
sharkModel = H2OGPTSHARKModel()
prompt = create_prompt(history)
output = langchain.evaluate(
output_dict = langchain.evaluate(
model_state=model_state,
my_db_state=None,
instruction=prompt,
@@ -164,14 +182,22 @@ def chat(curr_system_message, history, device, precision):
model_lock=True,
user_path=userpath_selector.value,
)
for partial_text in output:
history[-1][1] = partial_text["response"]
yield history
output = generate_token(sharkModel, **output_dict)
for partial_text in output:
history[-1][1] = partial_text
yield history
return history
with gr.Blocks(title="H2OGPT") as h2ogpt_web:
userpath_selector = gr.Textbox(
label="Document Directory",
value=str(os.path.abspath("apps/language_models/langchain/user_path/")),
interactive=True,
container=True,
)
with gr.Blocks(title="DocuChat") as h2ogpt_web:
with gr.Row():
supported_devices = available_devices
enabled = len(supported_devices) > 0
@@ -186,6 +212,7 @@ with gr.Blocks(title="H2OGPT") as h2ogpt_web:
else "Only CUDA Supported for now",
choices=supported_devices,
interactive=enabled,
allow_custom_value=True,
)
precision = gr.Radio(
label="Precision",
@@ -198,14 +225,6 @@ with gr.Blocks(title="H2OGPT") as h2ogpt_web:
],
visible=True,
)
userpath_selector = gr.Textbox(
label="Document Directory",
value=str(
os.path.abspath("apps/language_models/langchain/user_path/")
),
interactive=True,
container=True,
)
chatbot = gr.Chatbot(height=500)
with gr.Row():
with gr.Column():
@@ -249,3 +268,100 @@ with gr.Blocks(title="H2OGPT") as h2ogpt_web:
queue=False,
)
clear.click(lambda: None, None, [chatbot], queue=False)
with gr.Blocks(title="DocuChat Upload") as h2ogpt_upload:
import pathlib
upload_path = None
database = None
database_directory = os.path.abspath(
"apps/language_models/langchain/db_path/"
)
def read_path():
global upload_path
filenames = [
[f]
for f in os.listdir(upload_path)
if os.path.isfile(os.path.join(upload_path, f))
]
filenames.sort()
return filenames
def upload_file(f):
names = []
for tmpfile in f:
name = tmpfile.name.split("/")[-1]
basename = os.path.join(upload_path, name)
with open(basename, "wb") as w:
with open(tmpfile.name, "rb") as r:
w.write(r.read())
update_or_create_db()
return read_path()
def update_userpath(newpath):
global upload_path
upload_path = newpath
pathlib.Path(upload_path).mkdir(parents=True, exist_ok=True)
return read_path()
def update_or_create_db():
global database
global upload_path
sources = path_to_docs(
upload_path,
verbose=True,
fail_any_exception=False,
n_jobs=-1,
chunk=True,
chunk_size=512,
url=None,
enable_captions=False,
captions_model=None,
caption_loader=None,
enable_ocr=False,
)
pathlib.Path(database_directory).mkdir(parents=True, exist_ok=True)
database = create_or_update_db(
"chroma",
database_directory,
"UserData",
sources,
False,
True,
True,
"sentence-transformers/all-MiniLM-L6-v2",
)
def first_run():
global database
if database is None:
update_or_create_db()
update_userpath(
os.path.abspath("apps/language_models/langchain/user_path/")
)
h2ogpt_upload.load(fn=first_run)
h2ogpt_web.load(fn=first_run)
with gr.Column():
text = gr.DataFrame(
col_count=(1, "fixed"),
type="array",
label="Documents",
value=read_path(),
)
with gr.Row():
upload = gr.UploadButton(
label="Upload documents",
file_count="multiple",
)
upload.upload(fn=upload_file, inputs=upload, outputs=text)
userpath_selector.render()
userpath_selector.input(
fn=update_userpath, inputs=userpath_selector, outputs=text
).then(fn=update_or_create_db)

File diff suppressed because it is too large


@@ -3,10 +3,15 @@ import torch
import time
import sys
import gradio as gr
import PIL.ImageOps
from PIL import Image
import base64
from io import BytesIO
from fastapi.exceptions import HTTPException
from gradio.components.image_editor import (
Brush,
Eraser,
EditorData,
EditorValue,
)
from apps.stable_diffusion.web.ui.utils import (
available_devices,
nodlogo_loc,
@@ -16,6 +21,10 @@ from apps.stable_diffusion.web.ui.utils import (
predefined_paint_models,
cancel_sd,
)
from apps.stable_diffusion.web.ui.common_ui_events import (
lora_changed,
lora_strength_changed,
)
from apps.stable_diffusion.src import (
args,
InpaintPipeline,
@@ -38,11 +47,53 @@ init_use_tuned = args.use_tuned
init_import_mlir = args.import_mlir
def set_image_states(editor_data):
input_mask = editor_data["layers"][0]
# inpaint_inf wants white mask on black background (?), whilst ImageEditor
# delivers black mask on transparent (0 opacity) background
inference_mask = Image.new(
mode="RGB", size=input_mask.size, color=(255, 255, 255)
)
inference_mask.paste(input_mask, input_mask)
inference_mask = PIL.ImageOps.invert(inference_mask)
return (
# we set the ImageEditor data again, because it likes to clear
# the image layers (which include the mask) if the user hasn't
# used the upload button, and we sent it an image
# TODO: work out what is going wrong in that case so we don't have
# to do this
{
"background": editor_data["background"],
"layers": [input_mask],
"composite": None,
},
editor_data["background"],
input_mask,
inference_mask,
)
def reload_image_editor(editor_image, editor_mask):
# we set the ImageEditor data again, because it likes to clear
# the image layers (which include the mask) if the user hasn't
# used the upload button, and we sent it the image
# TODO: work out what is going wrong in that case so we don't have
# to do this
return {
"background": editor_image,
"layers": [editor_mask],
"composite": None,
}
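As a self-contained illustration (Pillow only) of the conversion set_image_states performs, turning the editor's black-on-transparent layer into the white-on-black mask the pipeline expects:
import PIL.ImageOps
from PIL import Image

layer = Image.new("RGBA", (4, 4), (0, 0, 0, 0))       # transparent editor layer
layer.putpixel((1, 1), (0, 0, 0, 255))                # one painted (black) pixel

mask = Image.new("RGB", layer.size, (255, 255, 255))  # start all white
mask.paste(layer, layer)                              # painted pixels become black
mask = PIL.ImageOps.invert(mask)                      # painted -> white, rest -> black

print(mask.getpixel((1, 1)), mask.getpixel((0, 0)))   # (255, 255, 255) (0, 0, 0)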
# Exposed to UI.
def inpaint_inf(
prompt: str,
negative_prompt: str,
image_dict,
image,
mask_image,
height: int,
width: int,
inpaint_full_res: bool,
@@ -53,8 +104,7 @@ def inpaint_inf(
batch_count: int,
batch_size: int,
scheduler: str,
custom_model: str,
hf_model_id: str,
model_id: str,
custom_vae: str,
precision: str,
device: str,
@@ -62,7 +112,7 @@ def inpaint_inf(
save_metadata_to_json: bool,
save_metadata_to_png: bool,
lora_weights: str,
lora_hf_id: str,
lora_strength: float,
ondemand: bool,
repeatable_seeds: int,
):
@@ -89,27 +139,22 @@ def inpaint_inf(
args.ckpt_loc = ""
args.hf_model_id = ""
args.custom_vae = ""
if custom_model == "None":
if not hf_model_id:
return (
None,
"Please provide either custom model or huggingface model ID, "
"both must not be empty.",
)
if "civitai" in hf_model_id:
args.ckpt_loc = hf_model_id
else:
args.hf_model_id = hf_model_id
elif ".ckpt" in custom_model or ".safetensors" in custom_model:
args.ckpt_loc = get_custom_model_pathfile(custom_model)
# .safetensors or .ckpt on the custom model path
if model_id in get_custom_model_files(custom_checkpoint_type="inpainting"):
args.ckpt_loc = get_custom_model_pathfile(model_id)
# civitai download
elif "civitai" in model_id:
args.ckpt_loc = model_id
# either predefined or huggingface
else:
args.hf_model_id = custom_model
args.hf_model_id = model_id
if custom_vae != "None":
args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
args.use_lora = get_custom_vae_or_lora_weights(
lora_weights, lora_hf_id, "lora"
)
args.use_lora = get_custom_vae_or_lora_weights(lora_weights, "lora")
args.lora_strength = lora_strength
args.save_metadata_to_json = save_metadata_to_json
args.write_metadata_to_png = save_metadata_to_png
@@ -128,7 +173,8 @@ def inpaint_inf(
width,
device,
use_lora=args.use_lora,
use_stencil=None,
lora_strength=args.lora_strength,
stencils=[],
ondemand=ondemand,
)
if (
@@ -172,6 +218,7 @@ def inpaint_inf(
low_cpu_mem_usage=args.low_cpu_mem_usage,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
lora_strength=args.lora_strength,
ondemand=args.ondemand,
)
)
@@ -181,8 +228,6 @@ def inpaint_inf(
start_time = time.time()
global_obj.get_sd_obj().log = ""
generated_imgs = []
image = image_dict["image"]
mask_image = image_dict["mask"]
text_output = ""
try:
seeds = utils.batch_seeds(seed, batch_count, repeatable_seeds)
@@ -228,87 +273,10 @@ def inpaint_inf(
return generated_imgs, text_output
def decode_base64_to_image(encoding):
if encoding.startswith("data:image/"):
encoding = encoding.split(";", 1)[1].split(",", 1)[1]
try:
image = Image.open(BytesIO(base64.b64decode(encoding)))
return image
except Exception as err:
print(err)
raise HTTPException(status_code=500, detail="Invalid encoded image")
def encode_pil_to_base64(images):
encoded_imgs = []
for image in images:
with BytesIO() as output_bytes:
if args.output_img_format.lower() == "png":
image.save(output_bytes, format="PNG")
elif args.output_img_format.lower() in ("jpg", "jpeg"):
image.save(output_bytes, format="JPEG")
else:
raise HTTPException(
status_code=500, detail="Invalid image format"
)
bytes_data = output_bytes.getvalue()
encoded_imgs.append(base64.b64encode(bytes_data))
return encoded_imgs
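For context, these helpers are a plain base64 round trip; a minimal standalone sketch of the same idea (Pillow assumed):
import base64
from io import BytesIO
from PIL import Image

img = Image.new("RGB", (2, 2), (255, 0, 0))
buf = BytesIO()
img.save(buf, format="PNG")
encoded = base64.b64encode(buf.getvalue()).decode()       # what the API returns

decoded = Image.open(BytesIO(base64.b64decode(encoded)))  # what the API accepts
print(decoded.size, decoded.getpixel((0, 0)))             # (2, 2) (255, 0, 0)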
# Inpaint Rest API.
def inpaint_api(
InputData: dict,
):
print(
f'Prompt: {InputData["prompt"]}, '
f'Negative Prompt: {InputData["negative_prompt"]}, '
f'Seed: {InputData["seed"]}.'
)
init_image = decode_base64_to_image(InputData["image"])
mask = decode_base64_to_image(InputData["mask"])
res = inpaint_inf(
InputData["prompt"],
InputData["negative_prompt"],
{"image": init_image, "mask": mask},
InputData["height"],
InputData["width"],
InputData["is_full_res"],
InputData["full_res_padding"],
InputData["steps"],
InputData["cfg_scale"],
InputData["seed"],
batch_count=1,
batch_size=1,
scheduler="EulerDiscrete",
custom_model="None",
hf_model_id=InputData["hf_model_id"]
if "hf_model_id" in InputData.keys()
else "stabilityai/stable-diffusion-2-inpainting",
custom_vae="None",
precision="fp16",
device=available_devices[0],
max_length=64,
save_metadata_to_json=False,
save_metadata_to_png=False,
lora_weights="None",
lora_hf_id="",
ondemand=False,
repeatable_seeds=False,
)
# Converts generator type to subscriptable
res = next(res)
return {
"images": encode_pil_to_base64(res[0]),
"parameters": {},
"info": res[1],
}
with gr.Blocks(title="Inpainting") as inpaint_web:
editor_image = gr.State()
editor_mask = gr.State()
inference_mask = gr.State()
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
with gr.Row():
@@ -317,6 +285,7 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
value=nod_logo,
show_label=False,
interactive=False,
show_download_button=False,
elem_id="top_logo",
width=150,
height=50,
@@ -324,37 +293,34 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
with gr.Row(elem_id="ui_body"):
with gr.Row():
with gr.Column(scale=1, min_width=600):
inpaint_init_image = gr.Sketchpad(
label="Masked Image",
type="pil",
sources=("clipboard", "upload"),
interactive=True,
brush=Brush(
colors=["#000000"],
color_mode="fixed",
),
)
with gr.Row():
# janky fix for overflowing text
inpaint_model_info = (
str(get_custom_model_path())
).replace("\\", "\n\\")
inpaint_model_info = (
f"Custom Model Path: {inpaint_model_info}"
f"Custom Model Path: {str(get_custom_model_path())}"
)
inpaint_custom_model = gr.Dropdown(
label=f"Models",
info=inpaint_model_info,
info="Select, or enter HuggingFace Model ID or Civitai model download URL",
elem_id="custom_model",
value=os.path.basename(args.ckpt_loc)
if args.ckpt_loc
else "stabilityai/stable-diffusion-2-inpainting",
choices=["None"]
+ get_custom_model_files(
choices=get_custom_model_files(
custom_checkpoint_type="inpainting"
)
+ predefined_paint_models,
)
inpaint_hf_model_id = gr.Textbox(
elem_id="hf_model_id",
placeholder="Select 'None' in the Models dropdown "
"on the left and enter model ID here "
"e.g: ghunkins/stable-diffusion-liberty-inpainting, "
"https://civitai.com/api/download/models/3433",
value="",
label="HuggingFace Model ID or Civitai model "
"download URL",
lines=3,
allow_custom_value=True,
scale=2,
)
# janky fix for overflowing text
inpaint_vae_info = (
@@ -369,6 +335,8 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
if args.custom_vae
else "None",
choices=["None"] + get_custom_model_files("vae"),
allow_custom_value=True,
scale=1,
)
with gr.Group(elem_id="prompt_box_outer"):
@@ -384,38 +352,32 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
lines=2,
elem_id="negative_prompt_box",
)
inpaint_init_image = gr.Image(
label="Masked Image",
source="upload",
tool="sketch",
type="pil",
height=350,
)
with gr.Accordion(label="LoRA Options", open=False):
with gr.Row():
# janky fix for overflowing text
inpaint_lora_info = (
str(get_custom_model_path("lora"))
).replace("\\", "\n\\")
inpaint_lora_info = f"LoRA Path: {inpaint_lora_info}"
lora_weights = gr.Dropdown(
label=f"Standalone LoRA Weights",
info=inpaint_lora_info,
label=f"LoRA Weights",
info=f"Select from LoRA in {str(get_custom_model_path('lora'))}, or enter HuggingFace Model ID",
elem_id="lora_weights",
value="None",
choices=["None"] + get_custom_model_files("lora"),
allow_custom_value=True,
scale=3,
)
lora_hf_id = gr.Textbox(
elem_id="lora_hf_id",
placeholder="Select 'None' in the Standalone LoRA "
"weights dropdown on the left if you want to use "
"a standalone HuggingFace model ID for LoRA here "
"e.g: sayakpaul/sd-model-finetuned-lora-t4",
value="",
label="HuggingFace Model ID",
lines=3,
lora_strength = gr.Number(
label="LoRA Strength",
info="Will be baked into the .vmfb",
step=0.01,
# the value is checked on change, so the minimum must allow 0
# or it becomes impossible to type values like 0.5
minimum=0.0,
maximum=2.0,
value=args.lora_strength,
scale=1,
)
with gr.Row():
lora_tags = gr.HTML(
value="<div><i>No LoRA selected</i></div>",
elem_classes="lora-tags",
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
@@ -424,6 +386,7 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
label="Scheduler",
value="EulerDiscrete",
choices=scheduler_list_cpu_only,
allow_custom_value=True,
)
with gr.Group():
save_metadata_to_png = gr.Checkbox(
@@ -527,17 +490,8 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
label="Device",
value=available_devices[0],
choices=available_devices,
allow_custom_value=True,
)
with gr.Row():
random_seed = gr.Button("Randomize Seed")
random_seed.click(
lambda: -1,
inputs=[],
outputs=[seed],
queue=False,
)
stop_batch = gr.Button("Stop Batch")
stable_diffusion = gr.Button("Generate Image(s)")
with gr.Column(scale=1, min_width=600):
with gr.Group():
@@ -547,16 +501,30 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
elem_id="gallery",
columns=[2],
object_fit="contain",
# TODO: Re-enable download when fixed in Gradio
show_download_button=False,
)
std_output = gr.Textbox(
value=f"Images will be saved at "
value=f"{inpaint_model_info}\n"
"Images will be saved at "
f"{get_generated_imgs_path()}",
lines=1,
lines=2,
elem_id="std_output",
show_label=False,
)
inpaint_status = gr.Textbox(visible=False)
with gr.Row():
stable_diffusion = gr.Button("Generate Image(s)")
random_seed = gr.Button("Randomize Seed")
random_seed.click(
lambda: -1,
inputs=[],
outputs=[seed],
queue=False,
)
stop_batch = gr.Button("Stop Batch")
with gr.Row():
blank_thing_for_row = None
with gr.Row():
inpaint_sendto_img2img = gr.Button(value="SendTo Img2Img")
inpaint_sendto_outpaint = gr.Button(
@@ -571,7 +539,8 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
inputs=[
prompt,
negative_prompt,
inpaint_init_image,
editor_image,
inference_mask,
height,
width,
inpaint_full_res,
@@ -583,7 +552,6 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
batch_size,
scheduler,
inpaint_custom_model,
inpaint_hf_model_id,
custom_vae,
precision,
device,
@@ -591,7 +559,7 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
save_metadata_to_json,
save_metadata_to_png,
lora_weights,
lora_hf_id,
lora_strength,
ondemand,
repeatable_seeds,
],
@@ -602,14 +570,64 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
fn=lambda bc, bs: status_label("Inpaint", 0, bc, bs),
inputs=[batch_count, batch_size],
outputs=inpaint_status,
show_progress="none",
)
set_image_states_args = dict(
fn=set_image_states,
inputs=[inpaint_init_image],
outputs=[
inpaint_init_image,
editor_image,
editor_mask,
inference_mask,
],
show_progress="none",
)
reload_image_editor_args = dict(
fn=reload_image_editor,
inputs=[editor_image, editor_mask],
outputs=[inpaint_init_image],
show_progress="none",
)
prompt_submit = prompt.submit(**status_kwargs).then(**kwargs)
neg_prompt_submit = negative_prompt.submit(**status_kwargs).then(
**kwargs
# all these trigger generation
prompt_submit = (
prompt.submit(**set_image_states_args)
.then(**status_kwargs)
.then(**kwargs)
.then(**reload_image_editor_args)
)
generate_click = stable_diffusion.click(**status_kwargs).then(**kwargs)
neg_prompt_submit = (
negative_prompt.submit(**set_image_states_args)
.then(**status_kwargs)
.then(**kwargs)
.then(**reload_image_editor_args)
)
generate_click = (
stable_diffusion.click(**set_image_states_args)
.then(**status_kwargs)
.then(**kwargs)
.then(**reload_image_editor_args)
)
# Attempts to cancel generation
stop_batch.click(
fn=cancel_sd,
cancels=[prompt_submit, neg_prompt_submit, generate_click],
)
# Updates LoRA information when one is selected
lora_weights.change(
fn=lora_changed,
inputs=[lora_weights],
outputs=[lora_tags],
queue=True,
)
lora_strength.change(
fn=lora_strength_changed,
inputs=lora_strength,
outputs=lora_strength,
queue=False,
show_progress=False,
)


@@ -0,0 +1,49 @@
// workaround for gradio after 4.7 not applying any @media rules from the custom .css file
() => {
console.log(`innerWidth: ${window.innerWidth}` )
// 1536px rules
const mediaQuery1536 = window.matchMedia('(min-width: 1536px)')
function handleWidth1536(event) {
// display in full width for desktop devices
document.querySelectorAll(".gradio-container")
.forEach( (node) => {
if (event.matches) {
node.classList.add("gradio-container-size-full");
} else {
node.classList.remove("gradio-container-size-full")
}
});
}
mediaQuery1536.addEventListener("change", handleWidth1536);
mediaQuery1536.dispatchEvent(new MediaQueryListEvent("change", {matches: window.innerWidth >= 1536}));
// 1921px rules
const mediaQuery1921 = window.matchMedia('(min-width: 1921px)')
function handleWidth1921(event) {
/* Force a 768px_height + 4px_margin_height + navbar_height for the gallery */
/* Limit height to 768px_height + 2px_margin_height for the thumbnails */
document.querySelectorAll("#gallery")
.forEach( (node) => {
if (event.matches) {
node.classList.add("gallery-force-height768");
node.classList.add("gallery-limit-height768");
} else {
node.classList.remove("gallery-force-height768");
node.classList.remove("gallery-limit-height768");
}
});
}
mediaQuery1921.addEventListener("change", handleWidth1921);
mediaQuery1921.dispatchEvent(new MediaQueryListEvent("change", {matches: window.innerWidth >= 1921}));
}

Binary file not shown (image; new size 16 KiB)


@@ -23,6 +23,7 @@ with gr.Blocks(title="Lora Training") as lora_train_web:
value=nod_logo,
show_label=False,
interactive=False,
show_download_button=False,
elem_id="top_logo",
width=150,
height=50,
@@ -50,6 +51,7 @@ with gr.Blocks(title="Lora Training") as lora_train_web:
choices=["None"]
+ get_custom_model_files()
+ predefined_models,
allow_custom_value=True,
)
hf_model_id = gr.Textbox(
elem_id="hf_model_id",
@@ -73,6 +75,7 @@ with gr.Blocks(title="Lora Training") as lora_train_web:
elem_id="lora_weights",
value="None",
choices=["None"] + get_custom_model_files("lora"),
allow_custom_value=True,
)
lora_hf_id = gr.Textbox(
elem_id="lora_hf_id",
@@ -105,6 +108,7 @@ with gr.Blocks(title="Lora Training") as lora_train_web:
label="Scheduler",
value=args.scheduler,
choices=scheduler_list,
allow_custom_value=True,
)
with gr.Row():
height = gr.Slider(
@@ -177,6 +181,7 @@ with gr.Blocks(title="Lora Training") as lora_train_web:
label="Device",
value=available_devices[0],
choices=available_devices,
allow_custom_value=True,
)
with gr.Row():
with gr.Column(scale=2):
@@ -233,9 +238,7 @@ with gr.Blocks(title="Lora Training") as lora_train_web:
max_length,
training_images_dir,
output_loc,
get_custom_vae_or_lora_weights(
lora_weights, lora_hf_id, "lora"
),
get_custom_vae_or_lora_weights(lora_weights, "lora"),
],
outputs=[std_output],
show_progress="minimal" if args.progress_bar else "none",


@@ -109,7 +109,7 @@ with gr.Blocks() as minigpt4_web:
gr.Markdown(description)
with gr.Row():
with gr.Column(scale=0.5):
with gr.Column():
image = gr.Image(type="pil")
upload_button = gr.Button(
value="Upload & Start Chat",
@@ -143,6 +143,7 @@ with gr.Blocks() as minigpt4_web:
# else "Only CUDA Supported for now",
choices=["cuda"],
interactive=False,
allow_custom_value=True,
)
with gr.Column():


@@ -98,13 +98,14 @@ with gr.Blocks() as model_web:
choices=None,
value=None,
visible=False,
allow_custom_value=True,
)
# TODO: select and SendTo
civit_models = gr.Gallery(
label="Civitai Model Gallery",
value=None,
interactive=True,
visible=False,
show_download_button=False,
)
with gr.Row(visible=False) as sendto_btns:


@@ -3,9 +3,11 @@ import torch
import time
import gradio as gr
from PIL import Image
import base64
from io import BytesIO
from fastapi.exceptions import HTTPException
from apps.stable_diffusion.web.ui.common_ui_events import (
lora_changed,
lora_strength_changed,
)
from apps.stable_diffusion.web.ui.utils import (
available_devices,
nodlogo_loc,
@@ -53,8 +55,7 @@ def outpaint_inf(
batch_count: int,
batch_size: int,
scheduler: str,
custom_model: str,
hf_model_id: str,
model_id: str,
custom_vae: str,
precision: str,
device: str,
@@ -62,7 +63,7 @@ def outpaint_inf(
save_metadata_to_json: bool,
save_metadata_to_png: bool,
lora_weights: str,
lora_hf_id: str,
lora_strength: float,
ondemand: bool,
repeatable_seeds: bool,
):
@@ -88,27 +89,22 @@ def outpaint_inf(
args.ckpt_loc = ""
args.hf_model_id = ""
args.custom_vae = ""
if custom_model == "None":
if not hf_model_id:
return (
None,
"Please provide either custom model or huggingface model ID, "
"both must not be empty.",
)
if "civitai" in hf_model_id:
args.ckpt_loc = hf_model_id
else:
args.hf_model_id = hf_model_id
elif ".ckpt" in custom_model or ".safetensors" in custom_model:
args.ckpt_loc = get_custom_model_pathfile(custom_model)
# .safetensors or .ckpt on the custom model path
if model_id in get_custom_model_files(custom_checkpoint_type="inpainting"):
args.ckpt_loc = get_custom_model_pathfile(model_id)
# civitai download
elif "civitai" in model_id:
args.ckpt_loc = model_id
# either predefined or huggingface
else:
args.hf_model_id = custom_model
args.hf_model_id = model_id
if custom_vae != "None":
args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
args.use_lora = get_custom_vae_or_lora_weights(
lora_weights, lora_hf_id, "lora"
)
args.use_lora = get_custom_vae_or_lora_weights(lora_weights, "lora")
args.lora_strength = lora_strength
args.save_metadata_to_json = save_metadata_to_json
args.write_metadata_to_png = save_metadata_to_png
@@ -127,7 +123,8 @@ def outpaint_inf(
width,
device,
use_lora=args.use_lora,
use_stencil=None,
lora_strength=args.lora_strength,
stencils=[],
ondemand=ondemand,
)
if (
@@ -169,6 +166,7 @@ def outpaint_inf(
args.use_base_vae,
args.use_tuned,
use_lora=args.use_lora,
lora_strength=args.lora_strength,
ondemand=args.ondemand,
)
)
@@ -233,88 +231,6 @@ def outpaint_inf(
return generated_imgs, text_output, ""
def decode_base64_to_image(encoding):
if encoding.startswith("data:image/"):
encoding = encoding.split(";", 1)[1].split(",", 1)[1]
try:
image = Image.open(BytesIO(base64.b64decode(encoding)))
return image
except Exception as err:
print(err)
raise HTTPException(status_code=500, detail="Invalid encoded image")
def encode_pil_to_base64(images):
encoded_imgs = []
for image in images:
with BytesIO() as output_bytes:
if args.output_img_format.lower() == "png":
image.save(output_bytes, format="PNG")
elif args.output_img_format.lower() in ("jpg", "jpeg"):
image.save(output_bytes, format="JPEG")
else:
raise HTTPException(
status_code=500, detail="Invalid image format"
)
bytes_data = output_bytes.getvalue()
encoded_imgs.append(base64.b64encode(bytes_data))
return encoded_imgs
# Inpaint Rest API.
def outpaint_api(
InputData: dict,
):
print(
f'Prompt: {InputData["prompt"]}, '
f'Negative Prompt: {InputData["negative_prompt"]}, '
f'Seed: {InputData["seed"]}'
)
init_image = decode_base64_to_image(InputData["init_images"][0])
res = outpaint_inf(
InputData["prompt"],
InputData["negative_prompt"],
init_image,
InputData["pixels"],
InputData["mask_blur"],
InputData["directions"],
InputData["noise_q"],
InputData["color_variation"],
InputData["height"],
InputData["width"],
InputData["steps"],
InputData["cfg_scale"],
InputData["seed"],
batch_count=1,
batch_size=1,
scheduler="EulerDiscrete",
custom_model="None",
hf_model_id=InputData["hf_model_id"]
if "hf_model_id" in InputData.keys()
else "stabilityai/stable-diffusion-2-inpainting",
custom_vae="None",
precision="fp16",
device=available_devices[0],
max_length=64,
save_metadata_to_json=False,
save_metadata_to_png=False,
lora_weights="None",
lora_hf_id="",
ondemand=False,
repeatable_seeds=False,
)
# Convert Generator to Subscriptable
res = next(res)
return {
"images": encode_pil_to_base64(res[0]),
"parameters": {},
"info": res[1],
}
with gr.Blocks(title="Outpainting") as outpaint_web:
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
@@ -324,6 +240,7 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
value=nod_logo,
show_label=False,
interactive=False,
show_download_button=False,
elem_id="top_logo",
width=150,
height=50,
@@ -331,37 +248,26 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
with gr.Row(elem_id="ui_body"):
with gr.Row():
with gr.Column(scale=1, min_width=600):
outpaint_init_image = gr.Image(
label="Input Image", type="pil", sources=["upload"]
)
with gr.Row():
# janky fix for overflowing text
outpaint_model_info = (
str(get_custom_model_path())
).replace("\\", "\n\\")
outpaint_model_info = (
f"Custom Model Path: {outpaint_model_info}"
f"Custom Model Path: {str(get_custom_model_path())}"
)
outpaint_custom_model = gr.Dropdown(
label=f"Models",
info=outpaint_model_info,
info="Select, or enter HuggingFace Model ID or Civitai model download URL",
elem_id="custom_model",
value=os.path.basename(args.ckpt_loc)
if args.ckpt_loc
else "stabilityai/stable-diffusion-2-inpainting",
choices=["None"]
+ get_custom_model_files(
choices=get_custom_model_files(
custom_checkpoint_type="inpainting"
)
+ predefined_paint_models,
)
outpaint_hf_model_id = gr.Textbox(
elem_id="hf_model_id",
placeholder="Select 'None' in the Models dropdown "
"on the left and enter model ID here "
"e.g: ghunkins/stable-diffusion-liberty-inpainting, "
"https://civitai.com/api/download/models/3433",
value="",
label="HuggingFace Model ID or Civitai model "
"download URL",
lines=3,
allow_custom_value=True,
scale=2,
)
# janky fix for overflowing text
outpaint_vae_info = (
@@ -376,8 +282,9 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
if args.custom_vae
else "None",
choices=["None"] + get_custom_model_files("vae"),
allow_custom_value=True,
scale=1,
)
with gr.Group(elem_id="prompt_box_outer"):
prompt = gr.Textbox(
label="Prompt",
@@ -391,36 +298,32 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
lines=2,
elem_id="negative_prompt_box",
)
outpaint_init_image = gr.Image(
label="Input Image",
type="pil",
height=300,
)
with gr.Accordion(label="LoRA Options", open=False):
with gr.Row():
# janky fix for overflowing text
outpaint_lora_info = (
str(get_custom_model_path("lora"))
).replace("\\", "\n\\")
outpaint_lora_info = f"LoRA Path: {outpaint_lora_info}"
lora_weights = gr.Dropdown(
label=f"Standalone LoRA Weights",
info=outpaint_lora_info,
label=f"LoRA Weights",
info=f"Select from LoRA in {str(get_custom_model_path('lora'))}, or enter HuggingFace Model ID",
elem_id="lora_weights",
value="None",
choices=["None"] + get_custom_model_files("lora"),
allow_custom_value=True,
scale=3,
)
lora_hf_id = gr.Textbox(
elem_id="lora_hf_id",
placeholder="Select 'None' in the Standalone LoRA "
"weights dropdown on the left if you want to use "
"a standalone HuggingFace model ID for LoRA here "
"e.g: sayakpaul/sd-model-finetuned-lora-t4",
value="",
label="HuggingFace Model ID",
lines=3,
lora_strength = gr.Number(
label="LoRA Strength",
info="Will be baked into the .vmfb",
step=0.01,
# the value is checked on change, so the minimum must allow 0
# or it becomes impossible to type values like 0.5
minimum=0.0,
maximum=2.0,
value=args.lora_strength,
scale=1,
)
with gr.Row():
lora_tags = gr.HTML(
value="<div><i>No LoRA selected</i></div>",
elem_classes="lora-tags",
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
@@ -429,6 +332,7 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
label="Scheduler",
value="EulerDiscrete",
choices=scheduler_list_cpu_only,
allow_custom_value=True,
)
with gr.Group():
save_metadata_to_png = gr.Checkbox(
@@ -555,17 +459,8 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
label="Device",
value=available_devices[0],
choices=available_devices,
allow_custom_value=True,
)
with gr.Row():
random_seed = gr.Button("Randomize Seed")
random_seed.click(
lambda: -1,
inputs=[],
outputs=[seed],
queue=False,
)
stop_batch = gr.Button("Stop Batch")
stable_diffusion = gr.Button("Generate Image(s)")
with gr.Column(scale=1, min_width=600):
with gr.Group():
@@ -577,13 +472,26 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
object_fit="contain",
)
std_output = gr.Textbox(
value=f"Images will be saved at "
value=f"{outpaint_model_info}\n"
f"Images will be saved at "
f"{get_generated_imgs_path()}",
lines=1,
lines=2,
elem_id="std_output",
show_label=False,
)
outpaint_status = gr.Textbox(visible=False)
with gr.Row():
stable_diffusion = gr.Button("Generate Image(s)")
random_seed = gr.Button("Randomize Seed")
random_seed.click(
lambda: -1,
inputs=[],
outputs=[seed],
queue=False,
)
stop_batch = gr.Button("Stop Batch")
with gr.Row():
blank_thing_for_row = None
with gr.Row():
outpaint_sendto_img2img = gr.Button(value="SendTo Img2Img")
outpaint_sendto_inpaint = gr.Button(value="SendTo Inpaint")
@@ -611,7 +519,6 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
batch_size,
scheduler,
outpaint_custom_model,
outpaint_hf_model_id,
custom_vae,
precision,
device,
@@ -619,7 +526,7 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
save_metadata_to_json,
save_metadata_to_png,
lora_weights,
lora_hf_id,
lora_strength,
ondemand,
repeatable_seeds,
],
@@ -641,3 +548,18 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
fn=cancel_sd,
cancels=[prompt_submit, neg_prompt_submit, generate_click],
)
lora_weights.change(
fn=lora_changed,
inputs=[lora_weights],
outputs=[lora_tags],
queue=True,
)
lora_strength.change(
fn=lora_strength_changed,
inputs=lora_strength,
outputs=lora_strength,
queue=False,
show_progress=False,
)


@@ -80,27 +80,26 @@ with gr.Blocks() as outputgallery_web:
label="Getting subdirectories...",
value=nod_logo,
interactive=False,
show_download_button=False,
visible=True,
show_label=True,
elem_id="top_logo",
elem_classes="logo_centered",
)
gallery = gr.Gallery(
label="",
value=gallery_files.value,
visible=False,
show_label=True,
columns=2,
columns=4,
)
with gr.Column(scale=4):
with gr.Box():
with gr.Row():
with gr.Group():
with gr.Row(elem_id="output_subdir_container"):
with gr.Column(
scale=15,
min_width=160,
elem_id="output_subdir_container",
):
subdirectories = gr.Dropdown(
label=f"Subdirectories of {output_dir}",
@@ -108,7 +107,8 @@ with gr.Blocks() as outputgallery_web:
choices=subdirectory_paths.value,
value="",
interactive=True,
elem_classes="dropdown_no_container",
# elem_classes="dropdown_no_container",
allow_custom_value=True,
)
with gr.Column(
scale=1,
@@ -147,10 +147,12 @@ with gr.Blocks() as outputgallery_web:
) as parameters_accordian:
image_parameters = gr.DataFrame(
headers=["Parameter", "Value"],
col_count=2,
col_count=(2, "fixed"),
row_count=(1, "fixed"),
wrap=True,
elem_classes="output_parameters_dataframe",
value=[["Status", "No image selected"]],
interactive=False,
)
with gr.Accordion(label="Send To", open=True):
@@ -161,6 +163,12 @@ with gr.Blocks() as outputgallery_web:
elem_classes="outputgallery_sendto",
size="sm",
)
outputgallery_sendto_txt2img_sdxl = gr.Button(
value="Txt2Img XL",
interactive=False,
elem_classes="outputgallery_sendto",
size="sm",
)
outputgallery_sendto_img2img = gr.Button(
value="Img2Img",
@@ -194,15 +202,18 @@ with gr.Blocks() as outputgallery_web:
def on_clear_gallery():
return [
gr.Gallery.update(
gr.Gallery(
value=[],
visible=False,
),
gr.Image.update(
gr.Image(
visible=True,
),
]
def on_image_columns_change(columns):
return gr.Gallery(columns=columns)
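This is the Gradio 4 idiom used throughout this diff: event handlers return freshly constructed components, with only the explicitly passed properties applied as updates, replacing the older gr.Component.update() calls. A hedged minimal sketch:
import gradio as gr

def hide_gallery():
    # Only 'visible' is treated as an update to the existing gallery;
    # other properties keep their current values.
    return gr.Gallery(visible=False)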
def on_select_subdir(subdir) -> list:
# evt.value is the subdirectory name
new_images = outputgallery_filenames(subdir)
@@ -211,12 +222,12 @@ with gr.Blocks() as outputgallery_web:
)
return [
new_images,
gr.Gallery.update(
gr.Gallery(
value=new_images,
label=new_label,
visible=len(new_images) > 0,
),
gr.Image.update(
gr.Image(
label=new_label,
visible=len(new_images) == 0,
),
@@ -250,16 +261,16 @@ with gr.Blocks() as outputgallery_web:
)
return [
gr.Dropdown.update(
gr.Dropdown(
choices=refreshed_subdirs,
value=new_subdir,
),
refreshed_subdirs,
new_images,
gr.Gallery.update(
gr.Gallery(
value=new_images, label=new_label, visible=len(new_images) > 0
),
gr.Image.update(
gr.Image(
label=new_label,
visible=len(new_images) == 0,
),
@@ -285,12 +296,12 @@ with gr.Blocks() as outputgallery_web:
return [
new_images,
gr.Gallery.update(
gr.Gallery(
value=new_images,
label=new_label,
visible=len(new_images) > 0,
),
gr.Image.update(
gr.Image(
label=new_label,
visible=len(new_images) == 0,
),
@@ -315,12 +326,18 @@ with gr.Blocks() as outputgallery_web:
else:
return [
filename,
list(map(list, params["parameters"].items())),
gr.DataFrame(
value=list(map(list, params["parameters"].items())),
row_count=(len(params["parameters"]), "fixed"),
),
]
return [
filename,
[["Status", "No parameters found"]],
gr.DataFrame(
value=[["Status", "No parameters found"]],
row_count=(1, "fixed"),
),
]
def on_outputgallery_filename_change(filename: str) -> list:
@@ -328,12 +345,12 @@ with gr.Blocks() as outputgallery_web:
return [
# disable or enable each of the sendto button based on whether
# an image is selected
gr.Button.update(interactive=exists),
gr.Button.update(interactive=exists),
gr.Button.update(interactive=exists),
gr.Button.update(interactive=exists),
gr.Button.update(interactive=exists),
gr.Button.update(interactive=exists),
gr.Button(interactive=exists),
gr.Button(interactive=exists),
gr.Button(interactive=exists),
gr.Button(interactive=exists),
gr.Button(interactive=exists),
gr.Button(interactive=exists),
]
# The first time our tab is selected we need to do an initial refresh
@@ -364,53 +381,6 @@ with gr.Blocks() as outputgallery_web:
gr.update(),
)
# Unfortunately as of gradio 3.34.0 gr.update against Galleries doesn't
# support things set with .style, nor the elem_classes kwarg, so we have
# to directly set things up via JavaScript if we want the client to take
# notice of our changes to the number of columns after it decides to put
# them back to the original number when we change something
def js_set_columns_in_browser(timeout_length):
return f"""
(new_cols) => {{
setTimeout(() => {{
required_style = "auto ".repeat(new_cols).trim();
gallery = document.querySelector('#outputgallery_gallery .grid-container');
if (gallery) {{
gallery.style.gridTemplateColumns = required_style
}}
}}, {timeout_length});
return []; // prevents console error from gradio
}}
"""
# --- Wire handlers up to the actions
# Many actions reset the number of columns shown in the gallery on the
# browser end, so we have to set them back to what we think they should
# be after the initial action.
#
# None of the actions on this tab trigger inference, and we want the
# user to be able to do them whilst other tabs have ongoing inference
# running. Waiting in the queue behind inference jobs would mean the UI
# can't fully respond until the inference tasks complete,
# hence queue=False on all of these.
set_gallery_columns_immediate = dict(
fn=None,
inputs=[image_columns],
# gradio blanks the UI on Chrome on Linux on gallery select if
# I don't put an output here
outputs=[dev_null],
_js=js_set_columns_in_browser(0),
queue=False,
)
# setting columns after selecting a gallery item needs a real
# timeout length for the number of columns to actually be applied.
# Not really sure why, maybe something has to finish animating?
set_gallery_columns_delayed = dict(
set_gallery_columns_immediate, _js=js_set_columns_in_browser(250)
)
# clearing images when we need to completely change what's in the
# gallery avoids current images being shown replacing piecemeal and
# prevents weirdness and errors if the user selects an image during the
@@ -422,38 +392,42 @@ with gr.Blocks() as outputgallery_web:
queue=False,
)
image_columns.change(**set_gallery_columns_immediate)
subdirectories.select(**clear_gallery).then(
on_select_subdir,
[subdirectories],
[gallery_files, gallery, logo],
queue=False,
).then(**set_gallery_columns_immediate)
)
open_subdir.click(
on_open_subdir, inputs=[subdirectories], queue=False
).then(**set_gallery_columns_immediate)
open_subdir.click(on_open_subdir, inputs=[subdirectories], queue=False)
refresh.click(**clear_gallery).then(
on_refresh,
[subdirectories],
[subdirectories, subdirectory_paths, gallery_files, gallery, logo],
queue=False,
).then(**set_gallery_columns_immediate)
)
image_columns.change(
fn=on_image_columns_change,
inputs=[image_columns],
outputs=[gallery],
queue=False,
)
gallery.select(
on_select_image,
[gallery_files],
[outputgallery_filename, image_parameters],
queue=False,
).then(**set_gallery_columns_delayed)
)
outputgallery_filename.change(
on_outputgallery_filename_change,
[outputgallery_filename],
[
outputgallery_sendto_txt2img,
outputgallery_sendto_txt2img_sdxl,
outputgallery_sendto_img2img,
outputgallery_sendto_inpaint,
outputgallery_sendto_outpaint,
@@ -476,16 +450,17 @@ with gr.Blocks() as outputgallery_web:
open_subdir,
],
queue=False,
).then(**set_gallery_columns_immediate)
)
# We should have been passed a list of components on other tabs that update
# when a new image has been generated on that tab, so set things up so the user
# will see that new image if they are looking at today's subdirectory
def outputgallery_watch(components: gr.Textbox):
def outputgallery_watch(components: gr.Textbox, queued_components=[]):
for component in components:
component.change(
on_new_image,
inputs=[subdirectories, subdirectory_paths, component],
outputs=[gallery_files, gallery, logo],
queue=False,
).then(**set_gallery_columns_immediate)
queue=component in queued_components,
show_progress="none",
)


@@ -6,7 +6,10 @@ from transformers import (
AutoModelForCausalLM,
)
from apps.stable_diffusion.web.ui.utils import available_devices
from shark.iree_utils.compile_utils import clean_device_info
from datetime import datetime as dt
import json
import sys
def user(message, history):
@@ -22,81 +25,81 @@ past_key_values = None
model_map = {
"llama2_7b": "meta-llama/Llama-2-7b-chat-hf",
"llama2_13b": "meta-llama/Llama-2-13b-chat-hf",
"llama2_70b": "meta-llama/Llama-2-70b-chat-hf",
"codegen": "Salesforce/codegen25-7b-multi",
"vicuna1p3": "lmsys/vicuna-7b-v1.3",
"vicuna": "TheBloke/vicuna-7B-1.1-HF",
"StableLM": "stabilityai/stablelm-tuned-alpha-3b",
}
# NOTE: Each `model_name` should have its own start message
start_message = {
"llama2_7b": (
"System: You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
"content. Please ensure that your responses are socially unbiased and positive "
"in nature. If a question does not make any sense, or is not factually coherent, "
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
"You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or "
"illegal content. Please ensure that your responses are socially "
"unbiased and positive in nature. If a question does not make any "
"sense, or is not factually coherent, explain why instead of "
"answering something not correct. If you don't know the answer "
"to a question, please don't share false information."
),
"llama2_13b": (
"You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or "
"illegal content. Please ensure that your responses are socially "
"unbiased and positive in nature. If a question does not make any "
"sense, or is not factually coherent, explain why instead of "
"answering something not correct. If you don't know the answer "
"to a question, please don't share false information."
),
"llama2_70b": (
"System: You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
"content. Please ensure that your responses are socially unbiased and positive "
"in nature. If a question does not make any sense, or is not factually coherent, "
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
),
"StableLM": (
"<|SYSTEM|># StableLM Tuned (Alpha version)"
"\n- StableLM is a helpful and harmless open-source AI language model "
"developed by StabilityAI."
"\n- StableLM is excited to be able to help the user, but will refuse "
"to do anything that could be considered harmful to the user."
"\n- StableLM is more than just an information source, StableLM is also "
"able to write poetry, short stories, and make jokes."
"\n- StableLM will refuse to participate in anything that "
"could harm a human."
"You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or "
"illegal content. Please ensure that your responses are socially "
"unbiased and positive in nature. If a question does not make any "
"sense, or is not factually coherent, explain why instead of "
"answering something not correct. If you don't know the answer "
"to a question, please don't share false information."
),
"vicuna": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
"A chat between a curious user and an artificial intelligence "
"assistant. The assistant gives helpful, detailed, and "
"polite answers to the user's questions.\n"
),
"vicuna1p3": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"codegen": "",
}
def create_prompt(model_name, history):
system_message = start_message[model_name]
def create_prompt(model_name, history, prompt_prefix):
system_message = ""
if prompt_prefix:
system_message = start_message[model_name]
if model_name in [
"StableLM",
"vicuna",
"vicuna1p3",
"llama2_7b",
"llama2_70b",
]:
if "llama2" in model_name:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
conversation = "".join(
[f"{B_INST} {item[0]} {E_INST} {item[1]} " for item in history[1:]]
)
if prompt_prefix:
msg = f"{B_INST} {B_SYS}{system_message}{E_SYS}{history[0][0]} {E_INST} {history[0][1]} {conversation}"
else:
msg = f"{B_INST} {history[0][0]} {E_INST} {history[0][1]} {conversation}"
elif model_name in ["vicuna"]:
conversation = "".join(
[
"".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
for item in history
]
)
msg = system_message + conversation
msg = msg.strip()
else:
conversation = "".join(
["".join([item[0], item[1]]) for item in history]
)
msg = system_message + conversation
msg = msg.strip()
msg = system_message + conversation
msg = msg.strip()
return msg
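To make the llama2 branch concrete, a worked example with a hypothetical two-turn history and the system prompt enabled:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
system_message = "Be brief."
history = [["Hi", "Hello!"], ["2+2?", ""]]
conversation = "".join(
    f"{B_INST} {item[0]} {E_INST} {item[1]} " for item in history[1:]
)
msg = f"{B_INST} {B_SYS}{system_message}{E_SYS}{history[0][0]} {E_INST} {history[0][1]} {conversation}"
print(msg)
# [INST] <<SYS>>
# Be brief.
# <</SYS>>
#
# Hi [/INST] Hello! [INST] 2+2? [/INST]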
@@ -105,84 +108,182 @@ def set_vicuna_model(model):
vicuna_model = model
def get_default_config():
import torch
from transformers import AutoTokenizer
hf_model_path = "TheBloke/vicuna-7B-1.1-HF"
tokenizer = AutoTokenizer.from_pretrained(hf_model_path, use_fast=False)
compilation_prompt = "".join(["0" for _ in range(17)])
compilation_input_ids = tokenizer(
compilation_prompt,
return_tensors="pt",
).input_ids
compilation_input_ids = torch.tensor(compilation_input_ids).reshape(
[1, 19]
)
firstVicunaCompileInput = (compilation_input_ids,)
from apps.language_models.src.model_wrappers.vicuna_model import (
CombinedModel,
)
from shark.shark_generate_model_config import GenerateConfigFile
model = CombinedModel()
c = GenerateConfigFile(model, 1, ["gpu_id"], firstVicunaCompileInput)
c.split_into_layers()
model_vmfb_key = ""
# TODO: Make chat reusable for UI and API
def chat(curr_system_message, history, model, device, precision, cli=True):
def chat(
prompt_prefix,
history,
model,
backend,
devices,
sharded,
precision,
download_vmfb,
config_file,
cli=False,
progress=gr.Progress(),
):
global past_key_values
global model_vmfb_key
global vicuna_model
model_name, model_path = list(map(str.strip, model.split("=>")))
device, device_id = clean_device_info(devices[0])
no_of_devices = len(devices)
if model_name in [
"vicuna",
"vicuna1p3",
"codegen",
"llama2_7b",
"llama2_70b",
]:
from apps.language_models.scripts.vicuna import (
UnshardedVicuna,
from apps.language_models.scripts.vicuna import ShardedVicuna
from apps.language_models.scripts.vicuna import UnshardedVicuna
from apps.stable_diffusion.src import args
new_model_vmfb_key = f"{model_name}#{model_path}#{device}#{device_id}#{precision}#{download_vmfb}"
if vicuna_model is None or new_model_vmfb_key != model_vmfb_key:
model_vmfb_key = new_model_vmfb_key
max_toks = 128 if model_name == "codegen" else 512
# get iree flags that need to be overridden, from commandline args
_extra_args = []
# vulkan target triple
vulkan_target_triple = args.iree_vulkan_target_triple
from shark.iree_utils.vulkan_utils import (
get_all_vulkan_devices,
get_vulkan_target_triple,
)
from apps.stable_diffusion.src import args
if vicuna_model == 0:
if "cuda" in device:
device = "cuda"
elif "sync" in device:
device = "cpu-sync"
elif "task" in device:
device = "cpu-task"
elif "vulkan" in device:
device = "vulkan"
else:
print("unrecognized device")
_extra_args = _extra_args + [
"--iree-opt-const-eval=false",
"--iree-opt-data-tiling=false",
]
max_toks = 128 if model_name == "codegen" else 512
if device == "vulkan":
vulkaninfo_list = get_all_vulkan_devices()
if vulkan_target_triple == "":
# We already have the device_id extracted via WebUI, so we directly use
# that to find the target triple.
vulkan_target_triple = get_vulkan_target_triple(
vulkaninfo_list[device_id]
)
_extra_args.append(
f"-iree-vulkan-target-triple={vulkan_target_triple}"
)
if "rdna" in vulkan_target_triple:
flags_to_add = [
"--iree-spirv-index-bits=64",
]
_extra_args = _extra_args + flags_to_add
if device_id is None:
id = 0
for device in vulkaninfo_list:
target_triple = get_vulkan_target_triple(
vulkaninfo_list[id]
)
if target_triple == vulkan_target_triple:
device_id = id
break
id += 1
assert (
device_id
), f"no vulkan hardware for target-triple '{vulkan_target_triple}' exists"
print(f"Will use vulkan target triple : {vulkan_target_triple}")
elif "rocm" in device:
# add iree rocm flags
if args.iree_rocm_target_chip != "":
_extra_args.append(
f"--iree-rocm-target-chip={args.iree_rocm_target_chip}"
)
print(f"extra args = {_extra_args}")
if sharded:
vicuna_model = ShardedVicuna(
model_name,
hf_model_path=model_path,
device=device,
precision=precision,
max_num_tokens=max_toks,
compressed=True,
extra_args_cmd=_extra_args,
n_devices=no_of_devices,
)
else:
# if config_file is None:
vicuna_model = UnshardedVicuna(
model_name,
hf_model_path=model_path,
hf_auth_token=args.hf_auth_token,
device=device,
vulkan_target_triple=vulkan_target_triple,
precision=precision,
max_num_tokens=max_toks,
download_vmfb=download_vmfb,
load_mlir_from_shark_tank=True,
extra_args_cmd=_extra_args,
device_id=device_id,
)
prompt = create_prompt(model_name, history)
for partial_text in vicuna_model.generate(prompt, cli=cli):
history[-1][1] = partial_text
yield history
if vicuna_model is None:
sys.exit("Unable to instantiate the model object, exiting.")
return history
# else Model is StableLM
global sharkModel
from apps.language_models.src.pipelines.stablelm_pipeline import (
SharkStableLM,
)
if sharkModel == 0:
# max_new_tokens=512
shark_slm = SharkStableLM(
model_name
) # pass elements from UI as required
# Construct the input message string for the model by concatenating the
# current system message and conversation history
if len(curr_system_message.split()) > 160:
print("clearing context")
prompt = create_prompt(model_name, history)
generate_kwargs = dict(prompt=prompt)
words_list = shark_slm.generate(**generate_kwargs)
prompt = create_prompt(model_name, history, prompt_prefix)
partial_text = ""
for new_text in words_list:
print(new_text)
partial_text += new_text
history[-1][1] = partial_text
# Yield an empty string to clean up the message textbox and the updated
# conversation history
yield history
return words_list
token_count = 0
total_time_ms = 0.001 # In order to avoid divide by zero error
prefill_time = 0
is_first = True
# for text, msg, exec_time in progress.tqdm(
# vicuna_model.generate(prompt, cli=cli),
# desc="generating response",
# ):
for text, msg, exec_time in vicuna_model.generate(prompt, cli=cli):
if msg is None:
if is_first:
prefill_time = exec_time / 1000
is_first = False
else:
total_time_ms += exec_time
token_count += 1
partial_text += text + " "
history[-1][1] = partial_text
yield history, f"Prefill: {prefill_time:.2f}"
elif "formatted" in msg:
history[-1][1] = text
tokens_per_sec = (token_count / total_time_ms) * 1000
yield history, f"Prefill: {prefill_time:.2f} seconds\n Decode: {tokens_per_sec:.2f} tokens/sec"
else:
sys.exit(
"unexpected message from the vicuna generate call, exiting."
)
return history, ""
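The throughput figures reported above are simple arithmetic over the streamed timings; a worked example with hypothetical numbers:
token_count = 30
total_time_ms = 1500.0           # accumulated decode time in milliseconds
tokens_per_sec = (token_count / total_time_ms) * 1000
print(f"Decode: {tokens_per_sec:.2f} tokens/sec")   # Decode: 20.00 tokens/sec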
def llm_chat_api(InputData: dict):
@@ -218,17 +319,9 @@ def llm_chat_api(InputData: dict):
UnshardedVicuna,
)
device_id = None
if vicuna_model == 0:
if "cuda" in device:
device = "cuda"
elif "sync" in device:
device = "cpu-sync"
elif "task" in device:
device = "cpu-task"
elif "vulkan" in device:
device = "vulkan"
else:
print("unrecognized device")
device, device_id = clean_device_info(device)
vicuna_model = UnshardedVicuna(
model_name,
@@ -236,6 +329,9 @@ def llm_chat_api(InputData: dict):
device=device,
precision=precision,
max_num_tokens=max_toks,
download_vmfb=True,
load_mlir_from_shark_tank=True,
device_id=device_id,
)
# TODO: add role dict for different models
@@ -291,6 +387,16 @@ def view_json_file(file_obj):
return content
filtered_devices = dict()
def change_backend(backend):
new_choices = gr.Dropdown(
choices=filtered_devices[backend], label=f"{backend} devices"
)
return new_choices
with gr.Blocks(title="Chatbot") as stablelm_chat:
with gr.Row():
model_choices = list(
@@ -300,41 +406,69 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
label="Select Model",
value=model_choices[0],
choices=model_choices,
allow_custom_value=True,
)
supported_devices = available_devices
enabled = len(supported_devices) > 0
# show cpu-task device first in list for chatbot
supported_devices = supported_devices[-1:] + supported_devices[:-1]
supported_devices = [x for x in supported_devices if "sync" not in x]
print(supported_devices)
backend_list = ["cpu", "cuda", "vulkan", "rocm"]
for x in backend_list:
filtered_devices[x] = [y for y in supported_devices if x in y]
print(filtered_devices)
backend = gr.Radio(
label="backend",
value="cpu",
choices=backend_list,
)
device = gr.Dropdown(
label="Device",
value=supported_devices[0]
if enabled
else "Only CUDA Supported for now",
choices=supported_devices,
interactive=enabled,
label="cpu devices",
choices=filtered_devices["cpu"],
interactive=True,
allow_custom_value=True,
multiselect=True,
)
precision = gr.Radio(
label="Precision",
value="fp16",
value="int4",
choices=[
"int4",
"int8",
"fp16",
"fp32",
],
visible=True,
visible=False,
)
with gr.Row():
tokens_time = gr.Textbox(label="Tokens generated per second")
with gr.Column():
download_vmfb = gr.Checkbox(
label="Download vmfb from Shark tank if available",
value=True,
interactive=True,
)
prompt_prefix = gr.Checkbox(
label="Add System Prompt",
value=False,
interactive=True,
)
sharded = gr.Checkbox(
label="Shard Model",
value=False,
interactive=True,
)
with gr.Row(visible=False):
with gr.Group():
config_file = gr.File(label="Upload sharding configuration")
json_view_button = gr.Button("View as JSON")
json_view = gr.JSON()
config_file = gr.File(
label="Upload sharding configuration", visible=False
)
json_view_button = gr.Button(value="View as JSON", visible=False)
json_view = gr.JSON(visible=False)
json_view_button.click(
fn=view_json_file, inputs=[config_file], outputs=[json_view]
)
chatbot = gr.Chatbot(height=500)
chatbot = gr.Chatbot(elem_id="chatbot")
with gr.Row():
with gr.Column():
msg = gr.Textbox(
@@ -349,24 +483,58 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
submit = gr.Button("Submit", interactive=enabled)
stop = gr.Button("Stop", interactive=enabled)
clear = gr.Button("Clear", interactive=enabled)
system_msg = gr.Textbox(
start_message, label="System Message", interactive=False, visible=False
backend.change(
fn=change_backend,
inputs=[backend],
outputs=[device],
show_progress=False,
)
submit_event = msg.submit(
fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
fn=user,
inputs=[msg, chatbot],
outputs=[msg, chatbot],
show_progress=False,
queue=False,
).then(
fn=chat,
inputs=[system_msg, chatbot, model, device, precision],
outputs=[chatbot],
inputs=[
prompt_prefix,
chatbot,
model,
backend,
device,
sharded,
precision,
download_vmfb,
config_file,
],
outputs=[chatbot, tokens_time],
show_progress=False,
queue=True,
)
submit_click_event = submit.click(
fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
fn=user,
inputs=[msg, chatbot],
outputs=[msg, chatbot],
show_progress=False,
queue=False,
).then(
fn=chat,
inputs=[system_msg, chatbot, model, device, precision],
outputs=[chatbot],
inputs=[
prompt_prefix,
chatbot,
model,
backend,
device,
sharded,
precision,
download_vmfb,
config_file,
],
outputs=[chatbot, tokens_time],
show_progress=False,
queue=True,
)
stop.click(

View File

@@ -0,0 +1,661 @@
import os
import torch
import time
import sys
import gradio as gr
from PIL import Image
from math import ceil
from apps.stable_diffusion.web.ui.utils import (
available_devices,
nodlogo_loc,
get_custom_model_path,
get_custom_model_files,
scheduler_list,
predefined_sdxl_models,
cancel_sd,
set_model_default_configs,
)
from apps.stable_diffusion.web.ui.common_ui_events import (
lora_changed,
lora_strength_changed,
)
from apps.stable_diffusion.web.utils.metadata import import_png_metadata
from apps.stable_diffusion.web.utils.common_label_calc import status_label
from apps.stable_diffusion.src import (
args,
Text2ImageSDXLPipeline,
get_schedulers,
set_init_device_flags,
utils,
save_output_img,
prompt_examples,
Image2ImagePipeline,
)
from apps.stable_diffusion.src.utils import (
get_generated_imgs_path,
get_generation_text_info,
)
# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
init_iree_metal_target_platform = args.iree_metal_target_platform
init_use_tuned = args.use_tuned
init_import_mlir = args.import_mlir
def txt2img_sdxl_inf(
prompt: str,
negative_prompt: str,
height: int,
width: int,
steps: int,
guidance_scale: float,
seed: str | int,
batch_count: int,
batch_size: int,
scheduler: str,
model_id: str,
custom_vae: str,
precision: str,
device: str,
max_length: int,
save_metadata_to_json: bool,
save_metadata_to_png: bool,
lora_weights: str,
lora_strength: float,
ondemand: bool,
repeatable_seeds: bool,
):
from apps.stable_diffusion.web.ui.utils import (
get_custom_model_pathfile,
get_custom_vae_or_lora_weights,
Config,
)
import apps.stable_diffusion.web.utils.global_obj as global_obj
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
SD_STATE_CANCEL,
)
if precision != "fp16":
print("currently we support fp16 for SDXL")
precision = "fp16"
args.prompts = [prompt]
args.negative_prompts = [negative_prompt]
args.guidance_scale = guidance_scale
args.steps = steps
args.scheduler = scheduler
args.ondemand = ondemand
# set ckpt_loc and hf_model_id.
args.ckpt_loc = ""
args.hf_model_id = ""
args.custom_vae = ""
# .safetensor or .chkpt on the custom model path
if model_id in get_custom_model_files():
args.ckpt_loc = get_custom_model_pathfile(model_id)
# civitai download
elif "civitai" in model_id:
args.ckpt_loc = model_id
# either predefined or huggingface
else:
args.hf_model_id = model_id
if custom_vae:
args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
args.save_metadata_to_json = save_metadata_to_json
args.write_metadata_to_png = save_metadata_to_png
args.use_lora = get_custom_vae_or_lora_weights(lora_weights, "lora")
args.lora_strength = lora_strength
dtype = torch.float32 if precision == "fp32" else torch.half
cpu_scheduling = not scheduler.startswith("Shark")
new_config_obj = Config(
"txt2img_sdxl",
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
precision,
batch_size,
max_length,
height,
width,
device,
use_lora=args.use_lora,
lora_strength=args.lora_strength,
stencils=None,
ondemand=ondemand,
)
if (
not global_obj.get_sd_obj()
or global_obj.get_cfg_obj() != new_config_obj
):
global_obj.clear_cache()
global_obj.set_cfg_obj(new_config_obj)
args.precision = precision
args.batch_count = batch_count
args.batch_size = batch_size
args.max_length = max_length
args.height = height
args.width = width
args.device = device.split("=>", 1)[1].strip()
args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
args.iree_metal_target_platform = init_iree_metal_target_platform
args.use_tuned = init_use_tuned
args.import_mlir = init_import_mlir
args.img_path = None
set_init_device_flags()
model_id = (
args.hf_model_id
if args.hf_model_id
else "stabilityai/stable-diffusion-xl-base-1.0"
)
global_obj.set_schedulers(get_schedulers(model_id))
scheduler_obj = global_obj.get_scheduler(scheduler)
if global_obj.get_cfg_obj().ondemand:
print("Running txt2img in memory efficient mode.")
global_obj.set_sd_obj(
Text2ImageSDXLPipeline.from_pretrained(
scheduler=scheduler_obj,
import_mlir=args.import_mlir,
model_id=args.hf_model_id,
ckpt_loc=args.ckpt_loc,
precision=precision,
max_length=max_length,
batch_size=batch_size,
height=height,
width=width,
use_base_vae=args.use_base_vae,
use_tuned=args.use_tuned,
custom_vae=args.custom_vae,
low_cpu_mem_usage=args.low_cpu_mem_usage,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
lora_strength=args.lora_strength,
use_quantize=args.use_quantize,
ondemand=global_obj.get_cfg_obj().ondemand,
)
)
global_obj.set_sd_scheduler(scheduler)
start_time = time.time()
global_obj.get_sd_obj().log = ""
generated_imgs = []
text_output = ""
try:
seeds = utils.batch_seeds(seed, batch_count, repeatable_seeds)
except TypeError as error:
raise gr.Error(str(error)) from None
for current_batch in range(batch_count):
out_imgs = global_obj.get_sd_obj().generate_images(
prompt,
negative_prompt,
batch_size,
height,
width,
steps,
guidance_scale,
seeds[current_batch],
args.max_length,
dtype,
args.use_base_vae,
cpu_scheduling,
args.max_embeddings_multiples,
)
total_time = time.time() - start_time
text_output = get_generation_text_info(
seeds[: current_batch + 1], device
)
text_output += "\n" + global_obj.get_sd_obj().log
text_output += f"\nTotal image(s) generation time: {total_time:.4f}sec"
if global_obj.get_sd_status() == SD_STATE_CANCEL:
break
else:
save_output_img(out_imgs[0], seeds[current_batch])
generated_imgs.extend(out_imgs)
yield generated_imgs, text_output, status_label(
"Text-to-Image-SDXL",
current_batch + 1,
batch_count,
batch_size,
)
return generated_imgs, text_output, ""
theme = gr.themes.Glass(
primary_hue="slate",
secondary_hue="gray",
)
with gr.Blocks(title="Text-to-Image-SDXL", theme=theme) as txt2img_sdxl_web:
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
with gr.Row():
with gr.Column(scale=1, elem_id="demo_title_outer"):
gr.Image(
value=nod_logo,
show_label=False,
interactive=False,
show_download_button=False,
elem_id="top_logo",
width=150,
height=50,
)
with gr.Row(elem_id="ui_body"):
with gr.Row():
with gr.Column(scale=1, min_width=600):
with gr.Row():
with gr.Column(scale=10):
with gr.Row():
t2i_sdxl_model_info = f"Custom Model Path: {str(get_custom_model_path())}"
txt2img_sdxl_custom_model = gr.Dropdown(
label=f"Models",
info="Select, or enter HuggingFace Model ID or Civitai model download URL",
elem_id="custom_model",
value=os.path.basename(args.ckpt_loc)
if args.ckpt_loc
else "stabilityai/stable-diffusion-xl-base-1.0",
choices=predefined_sdxl_models
+ get_custom_model_files(
custom_checkpoint_type="sdxl"
),
allow_custom_value=True,
scale=11,
)
t2i_sdxl_vae_info = (
str(get_custom_model_path("vae"))
).replace("\\", "\n\\")
t2i_sdxl_vae_info = (
f"VAE Path: {t2i_sdxl_vae_info}"
)
custom_vae = gr.Dropdown(
label=f"VAE Models",
info=t2i_sdxl_vae_info,
elem_id="custom_model",
value="madebyollin/sdxl-vae-fp16-fix",
choices=[
None,
"madebyollin/sdxl-vae-fp16-fix",
]
+ get_custom_model_files(
"vae", custom_checkpoint_type="sdxl"
),
allow_custom_value=True,
scale=4,
)
txt2img_sdxl_png_info_img = gr.Image(
scale=1,
label="Import PNG info",
elem_id="txt2img_prompt_image",
type="pil",
visible=True,
sources=["upload"],
)
with gr.Group(elem_id="prompt_box_outer"):
txt2img_sdxl_autogen = gr.Checkbox(
label="Auto-Generate Images",
value=False,
visible=False,
)
prompt = gr.Textbox(
label="Prompt",
value=args.prompts[0],
lines=2,
elem_id="prompt_box",
show_copy_button=True,
)
negative_prompt = gr.Textbox(
label="Negative Prompt",
value=args.negative_prompts[0],
lines=2,
elem_id="negative_prompt_box",
show_copy_button=True,
)
with gr.Accordion(label="LoRA Options", open=False):
with gr.Row():
lora_weights = gr.Dropdown(
label=f"LoRA Weights",
info=f"Select from LoRA in {str(get_custom_model_path('lora'))}, or enter HuggingFace Model ID",
elem_id="lora_weights",
value="None",
choices=["None"] + get_custom_model_files("lora"),
allow_custom_value=True,
scale=3,
)
lora_strength = gr.Number(
label="LoRA Strength",
info="Will be baked into the .vmfb",
step=0.01,
# the value is validated on every change, so the minimum must allow 0,
# otherwise fractional values like 0.5 can't be typed in
minimum=0.0,
maximum=2.0,
value=args.lora_strength,
scale=1,
)
with gr.Row():
lora_tags = gr.HTML(
value="<div><i>No LoRA selected</i></div>",
elem_classes="lora-tags",
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
scheduler = gr.Dropdown(
elem_id="scheduler",
label="Scheduler",
value="EulerDiscrete",
choices=[
"DDIM",
"EulerAncestralDiscrete",
"EulerDiscrete",
"LCMScheduler",
],
allow_custom_value=True,
visible=True,
)
with gr.Column():
save_metadata_to_png = gr.Checkbox(
label="Save prompt information to PNG",
value=args.write_metadata_to_png,
interactive=True,
)
save_metadata_to_json = gr.Checkbox(
label="Save prompt information to JSON file",
value=args.save_metadata_to_json,
interactive=True,
)
with gr.Row():
height = gr.Slider(
512,
1024,
value=768,
step=256,
label="Height",
visible=True,
interactive=True,
)
width = gr.Slider(
512,
1024,
value=768,
step=256,
label="Width",
visible=True,
interactive=True,
)
precision = gr.Radio(
label="Precision",
value="fp16",
choices=[
"fp16",
],
visible=False,
)
max_length = gr.Radio(
label="Max Length",
value=77,
choices=[
64,
77,
],
visible=False,
)
with gr.Row():
with gr.Column(scale=3):
steps = gr.Slider(
1, 100, value=args.steps, step=1, label="Steps"
)
with gr.Column(scale=3):
guidance_scale = gr.Slider(
0,
50,
value=args.guidance_scale,
step=0.1,
label="Guidance Scale",
)
ondemand = gr.Checkbox(
value=args.ondemand,
label="Low VRAM",
interactive=True,
)
with gr.Row():
with gr.Column(scale=3):
batch_count = gr.Slider(
1,
100,
value=args.batch_count,
step=1,
label="Batch Count",
interactive=True,
)
with gr.Column(scale=3):
batch_size = gr.Slider(
1,
4,
value=args.batch_size,
step=1,
label="Batch Size",
interactive=False,
visible=False,
)
repeatable_seeds = gr.Checkbox(
args.repeatable_seeds,
label="Repeatable Seeds",
)
with gr.Row():
seed = gr.Textbox(
value=args.seed,
label="Seed",
info="An integer or a JSON list of integers, -1 for random",
)
device = gr.Dropdown(
elem_id="device",
label="Device",
value=available_devices[0],
choices=available_devices,
allow_custom_value=True,
)
with gr.Accordion(label="Prompt Examples!", open=False):
ex = gr.Examples(
examples=prompt_examples,
inputs=prompt,
cache_examples=False,
elem_id="prompt_examples",
)
with gr.Column(scale=1, min_width=600):
with gr.Group():
txt2img_sdxl_gallery = gr.Gallery(
label="Generated images",
show_label=False,
elem_id="gallery",
columns=[2],
object_fit="scale_down",
)
std_output = gr.Textbox(
value=f"{t2i_sdxl_model_info}\n"
f"Images will be saved at "
f"{get_generated_imgs_path()}",
lines=1,
elem_id="std_output",
show_label=False,
)
txt2img_sdxl_status = gr.Textbox(visible=False)
with gr.Row():
stable_diffusion = gr.Button("Generate Image(s)")
random_seed = gr.Button("Randomize Seed")
random_seed.click(
lambda: -1,
inputs=[],
outputs=[seed],
queue=False,
)
stop_batch = gr.Button("Stop Batch")
with gr.Row():
txt2img_sdxl_sendto_img2img = gr.Button(
value="Send To Img2Img",
visible=False,
)
txt2img_sdxl_sendto_inpaint = gr.Button(
value="Send To Inpaint",
visible=False,
)
txt2img_sdxl_sendto_outpaint = gr.Button(
value="Send To Outpaint",
visible=False,
)
txt2img_sdxl_sendto_upscaler = gr.Button(
value="Send To Upscaler",
visible=False,
)
kwargs = dict(
fn=txt2img_sdxl_inf,
inputs=[
prompt,
negative_prompt,
height,
width,
steps,
guidance_scale,
seed,
batch_count,
batch_size,
scheduler,
txt2img_sdxl_custom_model,
custom_vae,
precision,
device,
max_length,
save_metadata_to_json,
save_metadata_to_png,
lora_weights,
lora_strength,
ondemand,
repeatable_seeds,
],
outputs=[txt2img_sdxl_gallery, std_output, txt2img_sdxl_status],
show_progress="minimal" if args.progress_bar else "none",
queue=True,
)
status_kwargs = dict(
fn=lambda bc, bs: status_label("Text-to-Image-SDXL", 0, bc, bs),
inputs=[batch_count, batch_size],
outputs=txt2img_sdxl_status,
concurrency_limit=1,
)
def autogen_changed(checked):
args.autogen = checked
def check_last_input(prompt):
if not prompt.endswith(" "):
return True
elif not args.autogen:
return True
else:
return False
auto_gen_kwargs = dict(
fn=check_last_input,
inputs=[negative_prompt],
outputs=[txt2img_sdxl_status],
concurrency_limit=1,
)
txt2img_sdxl_autogen.change(
fn=autogen_changed,
inputs=[txt2img_sdxl_autogen],
outputs=None,
)
prompt_submit = prompt.submit(**status_kwargs).then(**kwargs)
neg_prompt_submit = negative_prompt.submit(**status_kwargs).then(
**kwargs
)
generate_click = stable_diffusion.click(**status_kwargs).then(**kwargs)
stop_batch.click(
fn=cancel_sd,
cancels=[
prompt_submit,
neg_prompt_submit,
generate_click,
],
)
txt2img_sdxl_png_info_img.change(
fn=import_png_metadata,
inputs=[
txt2img_sdxl_png_info_img,
prompt,
negative_prompt,
steps,
scheduler,
guidance_scale,
seed,
width,
height,
txt2img_sdxl_custom_model,
lora_weights,
custom_vae,
],
outputs=[
txt2img_sdxl_png_info_img,
prompt,
negative_prompt,
steps,
scheduler,
guidance_scale,
seed,
width,
height,
txt2img_sdxl_custom_model,
lora_weights,
custom_vae,
],
)
txt2img_sdxl_custom_model.change(
fn=set_model_default_configs,
inputs=[
txt2img_sdxl_custom_model,
],
outputs=[
prompt,
negative_prompt,
steps,
scheduler,
guidance_scale,
width,
height,
custom_vae,
txt2img_sdxl_autogen,
],
)
lora_weights.change(
fn=lora_changed,
inputs=[lora_weights],
outputs=[lora_tags],
queue=True,
)
lora_strength.change(
fn=lora_strength_changed,
inputs=lora_strength,
outputs=lora_strength,
queue=False,
show_progress=False,
)

View File

@@ -1,21 +1,27 @@
import json
import os
import warnings
import torch
import time
import sys
import gradio as gr
from PIL import Image
import base64
from io import BytesIO
from fastapi.exceptions import HTTPException
from math import ceil
from apps.stable_diffusion.web.ui.utils import (
available_devices,
nodlogo_loc,
get_custom_model_path,
get_custom_model_files,
scheduler_list,
scheduler_list_cpu_only,
predefined_models,
cancel_sd,
)
from apps.stable_diffusion.web.ui.common_ui_events import (
lora_changed,
lora_strength_changed,
)
from apps.stable_diffusion.web.utils.metadata import import_png_metadata
from apps.stable_diffusion.web.utils.common_label_calc import status_label
from apps.stable_diffusion.src import (
@@ -26,12 +32,42 @@ from apps.stable_diffusion.src import (
utils,
save_output_img,
prompt_examples,
Image2ImagePipeline,
)
from apps.stable_diffusion.src.utils import (
get_generated_imgs_path,
get_generation_text_info,
resampler_list,
)
# Names of all interactive fields that can be edited by user
all_gradio_labels = [
"txt2img_custom_model",
"custom_vae",
"prompt",
"negative_prompt",
"lora_weights",
"lora_strength",
"scheduler",
"save_metadata_to_png",
"save_metadata_to_json",
"height",
"width",
"steps",
"guidance_scale",
"Low VRAM",
"use_hiresfix",
"resample_type",
"hiresfix_height",
"hiresfix_width",
"hiresfix_strength",
"batch_count",
"batch_size",
"repeatable_seeds",
"seed",
"device",
]
# set initial values of iree_vulkan_target_triple, use_tuned and import_mlir.
init_iree_vulkan_target_triple = args.iree_vulkan_target_triple
init_iree_metal_target_platform = args.iree_metal_target_platform
@@ -50,8 +86,7 @@ def txt2img_inf(
batch_count: int,
batch_size: int,
scheduler: str,
custom_model: str,
hf_model_id: str,
model_id: str,
custom_vae: str,
precision: str,
device: str,
@@ -59,9 +94,14 @@ def txt2img_inf(
save_metadata_to_json: bool,
save_metadata_to_png: bool,
lora_weights: str,
lora_hf_id: str,
lora_strength: float,
ondemand: bool,
repeatable_seeds: bool,
use_hiresfix: bool,
hiresfix_height: int,
hiresfix_width: int,
hiresfix_strength: float,
resample_type: str,
):
from apps.stable_diffusion.web.ui.utils import (
get_custom_model_pathfile,
@@ -84,30 +124,25 @@ def txt2img_inf(
args.ckpt_loc = ""
args.hf_model_id = ""
args.custom_vae = ""
if custom_model == "None":
if not hf_model_id:
return (
None,
"Please provide either custom model or huggingface model ID, "
"both must not be empty",
)
if "civitai" in hf_model_id:
args.ckpt_loc = hf_model_id
else:
args.hf_model_id = hf_model_id
elif ".ckpt" in custom_model or ".safetensors" in custom_model:
args.ckpt_loc = get_custom_model_pathfile(custom_model)
# .safetensor or .chkpt on the custom model path
if model_id in get_custom_model_files():
args.ckpt_loc = get_custom_model_pathfile(model_id)
# civitai download
elif "civitai" in model_id:
args.ckpt_loc = model_id
# either predefined or huggingface
else:
args.hf_model_id = custom_model
args.hf_model_id = model_id
if custom_vae != "None":
args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
args.save_metadata_to_json = save_metadata_to_json
args.write_metadata_to_png = save_metadata_to_png
args.use_lora = get_custom_vae_or_lora_weights(
lora_weights, lora_hf_id, "lora"
)
args.use_lora = get_custom_vae_or_lora_weights(lora_weights, "lora")
args.lora_strength = lora_strength
dtype = torch.float32 if precision == "fp32" else torch.half
cpu_scheduling = not scheduler.startswith("Shark")
@@ -123,7 +158,8 @@ def txt2img_inf(
width,
device,
use_lora=args.use_lora,
use_stencil=None,
lora_strength=args.lora_strength,
stencils=[],
ondemand=ondemand,
)
if (
@@ -138,6 +174,11 @@ def txt2img_inf(
args.max_length = max_length
args.height = height
args.width = width
args.use_hiresfix = use_hiresfix
args.hiresfix_height = hiresfix_height
args.hiresfix_width = hiresfix_width
args.hiresfix_strength = hiresfix_strength
args.resample_type = resample_type
args.device = device.split("=>", 1)[1].strip()
args.iree_vulkan_target_triple = init_iree_vulkan_target_triple
args.iree_metal_target_platform = init_iree_metal_target_platform
@@ -169,6 +210,7 @@ def txt2img_inf(
low_cpu_mem_usage=args.low_cpu_mem_usage,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
lora_strength=args.lora_strength,
ondemand=args.ondemand,
)
)
@@ -200,6 +242,85 @@ def txt2img_inf(
cpu_scheduling,
args.max_embeddings_multiples,
)
# TODO: allow user to save original image
# TODO: add option to let user keep both pipelines loaded, and unload
# either at will
# TODO: add custom step value slider
# TODO: add option to use secondary model for the img2img pass
if use_hiresfix is True:
new_config_obj = Config(
"img2img",
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
precision,
1,
max_length,
height,
width,
device,
use_lora=args.use_lora,
lora_strength=args.lora_strength,
stencils=[],
ondemand=ondemand,
)
global_obj.clear_cache()
global_obj.set_cfg_obj(new_config_obj)
set_init_device_flags()
model_id = (
args.hf_model_id
if args.hf_model_id
else "stabilityai/stable-diffusion-2-1-base"
)
global_obj.set_schedulers(get_schedulers(model_id))
scheduler_obj = global_obj.get_scheduler(args.scheduler)
global_obj.set_sd_obj(
Image2ImagePipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
1,
hiresfix_height,
hiresfix_width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
lora_strength=args.lora_strength,
ondemand=args.ondemand,
)
)
global_obj.set_sd_scheduler(args.scheduler)
out_imgs = global_obj.get_sd_obj().generate_images(
prompt,
negative_prompt,
out_imgs[0],
batch_size,
hiresfix_height,
hiresfix_width,
ceil(steps / hiresfix_strength),
hiresfix_strength,
guidance_scale,
seeds[current_batch],
args.max_length,
dtype,
args.use_base_vae,
cpu_scheduling,
args.max_embeddings_multiples,
stencils=[],
images=None,
control_mode=None,
resample_type=resample_type,
)
total_time = time.time() - start_time
text_output = get_generation_text_info(
seeds[: current_batch + 1], device
@@ -219,71 +340,92 @@ def txt2img_inf(
return generated_imgs, text_output, ""
def encode_pil_to_base64(images):
encoded_imgs = []
for image in images:
with BytesIO() as output_bytes:
if args.output_img_format.lower() == "png":
image.save(output_bytes, format="PNG")
elif args.output_img_format.lower() in ("jpg", "jpeg"):
image.save(output_bytes, format="JPEG")
else:
raise HTTPException(
status_code=500, detail="Invalid image format"
)
bytes_data = output_bytes.getvalue()
encoded_imgs.append(base64.b64encode(bytes_data))
return encoded_imgs
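# Hedged sketch of a client decoding images returned by the REST API via
# encode_pil_to_base64(); the endpoint URL and port here are assumptions,
# not taken from this file:
#
#   import base64, io, requests
#   from PIL import Image
#
#   resp = requests.post("http://localhost:8080/sdapi/v1/txt2img", json={...}).json()
#   first_img = Image.open(io.BytesIO(base64.b64decode(resp["images"][0])))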
# Text2Img Rest API.
def txt2img_api(
InputData: dict,
):
print(
f'Prompt: {InputData["prompt"]}, '
f'Negative Prompt: {InputData["negative_prompt"]}, '
f'Seed: {InputData["seed"]}.'
def resource_path(relative_path):
"""Get absolute path to resource, works for dev and for PyInstaller"""
base_path = getattr(
sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
)
res = txt2img_inf(
InputData["prompt"],
InputData["negative_prompt"],
InputData["height"],
InputData["width"],
InputData["steps"],
InputData["cfg_scale"],
InputData["seed"],
batch_count=1,
batch_size=1,
scheduler="EulerDiscrete",
custom_model="None",
hf_model_id=InputData["hf_model_id"]
if "hf_model_id" in InputData.keys()
else "stabilityai/stable-diffusion-2-1-base",
custom_vae="None",
precision="fp16",
device=available_devices[0],
max_length=64,
save_metadata_to_json=False,
save_metadata_to_png=False,
lora_weights="None",
lora_hf_id="",
ondemand=False,
repeatable_seeds=False,
)
# Convert Generator to Subscriptable
res = next(res)
return {
"images": encode_pil_to_base64(res[0]),
"parameters": {},
"info": res[1],
}
return os.path.join(base_path, relative_path)
with gr.Blocks(title="Text-to-Image") as txt2img_web:
dark_theme = resource_path("ui/css/sd_dark_theme.css")
# Exports the values of all user-editable fields to the settings.json file in the ui folder
def export_settings(*values):
settings_list = list(zip(all_gradio_labels, values))
settings = {}
for label, value in settings_list:
settings[label] = value
settings = {"txt2img": settings}
with open("./ui/settings.json", "w") as json_file:
json.dump(settings, json_file, indent=4)
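# A sketch of the file export_settings() writes to ./ui/settings.json: the
# keys come from all_gradio_labels, and the values shown here are purely
# illustrative.
#
# {
#     "txt2img": {
#         "txt2img_custom_model": "stabilityai/stable-diffusion-2-1-base",
#         "custom_vae": "None",
#         "prompt": "a photo of an astronaut riding a horse",
#         "lora_weights": "None",
#         "lora_strength": 1.0,
#         "height": 512,
#         "width": 512,
#         "steps": 50,
#         ...
#     }
# }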
# Loads the values of all user-editable fields from the settings.json file in the ui folder
def load_settings():
try:
with open("./ui/settings.json", "r") as json_file:
loaded_settings = json.load(json_file)["txt2img"]
except (FileNotFoundError, KeyError):
warnings.warn(
"Settings.json file not found or 'txt2img' key is missing. Using default values for fields."
)
loaded_settings = (
{}
) # json file not existing or the data wasn't saved yet
return [
loaded_settings.get(
"txt2img_custom_model",
os.path.basename(args.ckpt_loc)
if args.ckpt_loc
else "stabilityai/stable-diffusion-2-1-base",
),
loaded_settings.get(
"custom_vae",
os.path.basename(args.custom_vae) if args.custom_vae else "None",
),
loaded_settings.get("prompt", args.prompts[0]),
loaded_settings.get("negative_prompt", args.negative_prompts[0]),
loaded_settings.get("lora_weights", "None"),
loaded_settings.get("lora_strength", args.lora_strength),
loaded_settings.get("scheduler", args.scheduler),
loaded_settings.get(
"save_metadata_to_png", args.write_metadata_to_png
),
loaded_settings.get(
"save_metadata_to_json", args.save_metadata_to_json
),
loaded_settings.get("height", args.height),
loaded_settings.get("width", args.width),
loaded_settings.get("steps", args.steps),
loaded_settings.get("guidance_scale", args.guidance_scale),
loaded_settings.get("Low VRAM", args.ondemand),
loaded_settings.get("use_hiresfix", args.use_hiresfix),
loaded_settings.get("resample_type", args.resample_type),
loaded_settings.get("hiresfix_height", args.hiresfix_height),
loaded_settings.get("hiresfix_width", args.hiresfix_width),
loaded_settings.get("hiresfix_strength", args.hiresfix_strength),
loaded_settings.get("batch_count", args.batch_count),
loaded_settings.get("batch_size", args.batch_size),
loaded_settings.get("repeatable_seeds", args.repeatable_seeds),
loaded_settings.get("seed", args.seed),
loaded_settings.get("device", available_devices[0]),
]
# Loads the user's exported default settings at program start
def onload_load_settings():
loaded_data = load_settings()
structured_data = list(zip(all_gradio_labels, loaded_data))
return dict(structured_data)
default_settings = onload_load_settings()
with gr.Blocks(title="Text-to-Image", css=dark_theme) as txt2img_web:
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
with gr.Row():
@@ -292,6 +434,7 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
value=nod_logo,
show_label=False,
interactive=False,
show_download_button=False,
elem_id="top_logo",
width=150,
height=50,
@@ -300,34 +443,20 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
with gr.Row():
with gr.Column(scale=1, min_width=600):
with gr.Row():
with gr.Column(scale=10):
with gr.Column():
with gr.Row():
# janky fix for overflowing text
t2i_model_info = (
str(get_custom_model_path())
).replace("\\", "\n\\")
t2i_model_info = (
f"Custom Model Path: {t2i_model_info}"
)
t2i_model_info = f"Custom Model Path: {str(get_custom_model_path())}"
txt2img_custom_model = gr.Dropdown(
label=f"Models",
info=t2i_model_info,
info="Select, or enter HuggingFace Model ID or Civitai model download URL",
elem_id="custom_model",
value=os.path.basename(args.ckpt_loc)
if args.ckpt_loc
else "stabilityai/stable-diffusion-2-1-base",
choices=["None"]
+ get_custom_model_files()
value=default_settings.get(
"txt2img_custom_model"
),
choices=get_custom_model_files()
+ predefined_models,
)
txt2img_hf_model_id = gr.Textbox(
elem_id="hf_model_id",
placeholder="Select 'None' in the dropdown "
"on the left and enter model ID here.",
value="",
label="HuggingFace Model ID or Civitai model "
"download URL.",
lines=3,
allow_custom_value=True,
scale=11,
)
# janky fix for overflowing text
t2i_vae_info = (
@@ -338,89 +467,101 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
label=f"VAE Models",
info=t2i_vae_info,
elem_id="custom_model",
value=os.path.basename(args.custom_vae)
if args.custom_vae
else "None",
value=default_settings.get("custom_vae"),
choices=["None"]
+ get_custom_model_files("vae"),
allow_custom_value=True,
scale=4,
)
txt2img_png_info_img = gr.Image(
label="Import PNG info",
elem_id="txt2img_prompt_image",
type="pil",
visible=True,
sources=["upload"],
scale=1,
)
with gr.Column(scale=1, min_width=170):
txt2img_png_info_img = gr.Image(
label="Import PNG info",
elem_id="txt2img_prompt_image",
type="pil",
tool="None",
visible=True,
)
with gr.Group(elem_id="prompt_box_outer"):
prompt = gr.Textbox(
label="Prompt",
value=args.prompts[0],
value=default_settings.get("prompt"),
lines=2,
elem_id="prompt_box",
)
# TODO: coming soon
autogen = gr.Checkbox(
label="Continuous Generation",
visible=False,
)
negative_prompt = gr.Textbox(
label="Negative Prompt",
value=args.negative_prompts[0],
value=default_settings.get("negative_prompt"),
lines=2,
elem_id="negative_prompt_box",
)
with gr.Accordion(label="LoRA Options", open=False):
with gr.Row():
# janky fix for overflowing text
t2i_lora_info = (
str(get_custom_model_path("lora"))
).replace("\\", "\n\\")
t2i_lora_info = f"LoRA Path: {t2i_lora_info}"
lora_weights = gr.Dropdown(
label=f"Standalone LoRA Weights",
info=t2i_lora_info,
label=f"LoRA Weights",
info=f"Select from LoRA in {str(get_custom_model_path('lora'))}, or enter HuggingFace Model ID",
elem_id="lora_weights",
value="None",
value=default_settings.get("lora_weights"),
choices=["None"] + get_custom_model_files("lora"),
allow_custom_value=True,
scale=3,
)
lora_hf_id = gr.Textbox(
elem_id="lora_hf_id",
placeholder="Select 'None' in the Standalone LoRA "
"weights dropdown on the left if you want to use "
"a standalone HuggingFace model ID for LoRA here "
"e.g: sayakpaul/sd-model-finetuned-lora-t4",
value="",
label="HuggingFace Model ID",
lines=3,
lora_strength = gr.Number(
label="LoRA Strength",
info="Will be baked into the .vmfb",
step=0.01,
# the value is validated on every change, so the minimum must allow 0,
# otherwise fractional values like 0.5 can't be typed in
minimum=0.0,
maximum=2.0,
value=default_settings.get("lora_strength"),
scale=1,
)
with gr.Row():
lora_tags = gr.HTML(
value="<div><i>No LoRA selected</i></div>",
elem_classes="lora-tags",
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
scheduler = gr.Dropdown(
elem_id="scheduler",
label="Scheduler",
value=args.scheduler,
value=default_settings.get("scheduler"),
choices=scheduler_list,
allow_custom_value=True,
)
with gr.Column():
save_metadata_to_png = gr.Checkbox(
label="Save prompt information to PNG",
value=args.write_metadata_to_png,
value=default_settings.get(
"save_metadata_to_png"
),
interactive=True,
)
save_metadata_to_json = gr.Checkbox(
label="Save prompt information to JSON file",
value=args.save_metadata_to_json,
value=default_settings.get(
"save_metadata_to_json"
),
interactive=True,
)
with gr.Row():
height = gr.Slider(
384,
768,
value=args.height,
value=default_settings.get("height"),
step=8,
label="Height",
)
width = gr.Slider(
384,
768,
value=args.width,
value=default_settings.get("width"),
step=8,
label="Width",
)
@@ -445,18 +586,22 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
with gr.Row():
with gr.Column(scale=3):
steps = gr.Slider(
1, 100, value=args.steps, step=1, label="Steps"
1,
100,
value=default_settings.get("steps"),
step=1,
label="Steps",
)
with gr.Column(scale=3):
guidance_scale = gr.Slider(
0,
50,
value=args.guidance_scale,
value=default_settings.get("guidance_scale"),
step=0.1,
label="CFG Scale",
)
ondemand = gr.Checkbox(
value=args.ondemand,
value=default_settings.get("Low VRAM"),
label="Low VRAM",
interactive=True,
)
@@ -465,7 +610,7 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
batch_count = gr.Slider(
1,
100,
value=args.batch_count,
value=default_settings.get("batch_count"),
step=1,
label="Batch Count",
interactive=True,
@@ -476,35 +621,62 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
4,
value=args.batch_size,
step=1,
label="Batch Size",
label=default_settings.get("batch_size"),
interactive=True,
visible=False,
)
repeatable_seeds = gr.Checkbox(
args.repeatable_seeds,
default_settings.get("repeatable_seeds"),
label="Repeatable Seeds",
)
with gr.Accordion(label="Hires Fix Options", open=False):
with gr.Group():
with gr.Row():
use_hiresfix = gr.Checkbox(
value=default_settings.get("use_hiresfix"),
label="Use Hires Fix",
interactive=True,
)
resample_type = gr.Dropdown(
value=default_settings.get("resample_type"),
choices=resampler_list,
label="Resample Type",
allow_custom_value=False,
)
hiresfix_height = gr.Slider(
384,
768,
value=default_settings.get("hiresfix_height"),
step=8,
label="Hires Fix Height",
)
hiresfix_width = gr.Slider(
384,
768,
value=default_settings.get("hiresfix_width"),
step=8,
label="Hires Fix Width",
)
hiresfix_strength = gr.Slider(
0,
1,
value=default_settings.get("hiresfix_strength"),
step=0.01,
label="Hires Fix Denoising Strength",
)
with gr.Row():
seed = gr.Textbox(
value=args.seed,
value=default_settings.get("seed"),
label="Seed",
info="An integer or a JSON list of integers, -1 for random",
)
device = gr.Dropdown(
elem_id="device",
label="Device",
value=available_devices[0],
value=default_settings.get("device"),
choices=available_devices,
allow_custom_value=True,
)
with gr.Row():
random_seed = gr.Button("Randomize Seed")
random_seed.click(
lambda: -1,
inputs=[],
outputs=[seed],
queue=False,
)
stop_batch = gr.Button("Stop Batch")
stable_diffusion = gr.Button("Generate Image(s)")
with gr.Accordion(label="Prompt Examples!", open=False):
ex = gr.Examples(
examples=prompt_examples,
@@ -523,13 +695,26 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
object_fit="contain",
)
std_output = gr.Textbox(
value=f"Images will be saved at "
value=f"{t2i_model_info}\n"
f"Images will be saved at "
f"{get_generated_imgs_path()}",
lines=1,
elem_id="std_output",
show_label=False,
)
txt2img_status = gr.Textbox(visible=False)
with gr.Row():
stable_diffusion = gr.Button("Generate Image(s)")
random_seed = gr.Button("Randomize Seed")
random_seed.click(
lambda: -1,
inputs=[],
outputs=[seed],
queue=False,
)
stop_batch = gr.Button("Stop Batch")
with gr.Row():
blank_thing_for_row = None
with gr.Row():
txt2img_sendto_img2img = gr.Button(value="SendTo Img2Img")
txt2img_sendto_inpaint = gr.Button(value="SendTo Inpaint")
@@ -539,6 +724,75 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
txt2img_sendto_upscaler = gr.Button(
value="SendTo Upscaler"
)
with gr.Row():
with gr.Column(scale=2):
export_defaults = gr.Button(
value="Load Default Settings"
)
export_defaults.click(
fn=load_settings,
inputs=[],
outputs=[
txt2img_custom_model,
custom_vae,
prompt,
negative_prompt,
lora_weights,
lora_strength,
scheduler,
save_metadata_to_png,
save_metadata_to_json,
height,
width,
steps,
guidance_scale,
ondemand,
use_hiresfix,
resample_type,
hiresfix_height,
hiresfix_width,
hiresfix_strength,
batch_count,
batch_size,
repeatable_seeds,
seed,
device,
],
)
with gr.Column(scale=2):
export_defaults = gr.Button(
value="Export Default Settings"
)
export_defaults.click(
fn=export_settings,
inputs=[
txt2img_custom_model,
custom_vae,
prompt,
negative_prompt,
lora_weights,
lora_strength,
scheduler,
save_metadata_to_png,
save_metadata_to_json,
height,
width,
steps,
guidance_scale,
ondemand,
use_hiresfix,
resample_type,
hiresfix_height,
hiresfix_width,
hiresfix_strength,
batch_count,
batch_size,
repeatable_seeds,
seed,
device,
],
outputs=[],
)
kwargs = dict(
fn=txt2img_inf,
@@ -554,7 +808,6 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
batch_size,
scheduler,
txt2img_custom_model,
txt2img_hf_model_id,
custom_vae,
precision,
device,
@@ -562,9 +815,14 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
save_metadata_to_json,
save_metadata_to_png,
lora_weights,
lora_hf_id,
lora_strength,
ondemand,
repeatable_seeds,
use_hiresfix,
hiresfix_height,
hiresfix_width,
hiresfix_strength,
resample_type,
],
outputs=[txt2img_gallery, std_output, txt2img_status],
show_progress="minimal" if args.progress_bar else "none",
@@ -599,9 +857,7 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
width,
height,
txt2img_custom_model,
txt2img_hf_model_id,
lora_weights,
lora_hf_id,
custom_vae,
],
outputs=[
@@ -615,9 +871,43 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
width,
height,
txt2img_custom_model,
txt2img_hf_model_id,
lora_weights,
lora_hf_id,
lora_strength,
custom_vae,
],
)
# SharkEulerDiscrete doesn't work with img2img which hires_fix uses
def set_compatible_schedulers(hires_fix_selected):
if hires_fix_selected:
return gr.Dropdown(
choices=scheduler_list_cpu_only,
value="DEISMultistep",
)
else:
return gr.Dropdown(
choices=scheduler_list,
value="SharkEulerDiscrete",
)
use_hiresfix.change(
fn=set_compatible_schedulers,
inputs=[use_hiresfix],
outputs=[scheduler],
queue=False,
)
lora_weights.change(
fn=lora_changed,
inputs=[lora_weights],
outputs=[lora_tags],
queue=True,
)
lora_strength.change(
fn=lora_strength_changed,
inputs=lora_strength,
outputs=lora_strength,
queue=False,
show_progress=False,
)

View File

@@ -3,9 +3,7 @@ import torch
import time
import gradio as gr
from PIL import Image
import base64
from io import BytesIO
from fastapi.exceptions import HTTPException
from apps.stable_diffusion.web.ui.utils import (
available_devices,
nodlogo_loc,
@@ -15,6 +13,10 @@ from apps.stable_diffusion.web.ui.utils import (
predefined_upscaler_models,
cancel_sd,
)
from apps.stable_diffusion.web.ui.common_ui_events import (
lora_changed,
lora_strength_changed,
)
from apps.stable_diffusion.web.utils.common_label_calc import status_label
from apps.stable_diffusion.src import (
args,
@@ -46,8 +48,7 @@ def upscaler_inf(
batch_count: int,
batch_size: int,
scheduler: str,
custom_model: str,
hf_model_id: str,
model_id: str,
custom_vae: str,
precision: str,
device: str,
@@ -55,7 +56,7 @@ def upscaler_inf(
save_metadata_to_json: bool,
save_metadata_to_png: bool,
lora_weights: str,
lora_hf_id: str,
lora_strength: float,
ondemand: bool,
repeatable_seeds: bool,
):
@@ -85,30 +86,25 @@ def upscaler_inf(
args.ckpt_loc = ""
args.hf_model_id = ""
args.custom_vae = ""
if custom_model == "None":
if not hf_model_id:
return (
None,
"Please provide either custom model or huggingface model ID, "
"both must not be empty.",
)
if "civitai" in hf_model_id:
args.ckpt_loc = hf_model_id
else:
args.hf_model_id = hf_model_id
elif ".ckpt" in custom_model or ".safetensors" in custom_model:
args.ckpt_loc = get_custom_model_pathfile(custom_model)
# .safetensor or .chkpt on the custom model path
if model_id in get_custom_model_files(custom_checkpoint_type="upscaler"):
args.ckpt_loc = get_custom_model_pathfile(model_id)
# civitai download
elif "civitai" in model_id:
args.ckpt_loc = model_id
# either predefined or huggingface
else:
args.hf_model_id = custom_model
args.hf_model_id = model_id
if custom_vae != "None":
args.custom_vae = get_custom_model_pathfile(custom_vae, model="vae")
args.save_metadata_to_json = save_metadata_to_json
args.write_metadata_to_png = save_metadata_to_png
args.use_lora = get_custom_vae_or_lora_weights(
lora_weights, lora_hf_id, "lora"
)
args.use_lora = get_custom_vae_or_lora_weights(lora_weights, "lora")
args.lora_strength = lora_strength
dtype = torch.float32 if precision == "fp32" else torch.half
cpu_scheduling = not scheduler.startswith("Shark")
@@ -126,7 +122,8 @@ def upscaler_inf(
args.width,
device,
use_lora=args.use_lora,
use_stencil=None,
lora_strength=args.lora_strength,
stencils=[],
ondemand=ondemand,
)
if (
@@ -165,6 +162,7 @@ def upscaler_inf(
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
use_lora=args.use_lora,
lora_strength=args.lora_strength,
ondemand=args.ondemand,
)
)
@@ -252,83 +250,6 @@ def upscaler_inf(
yield generated_imgs, text_output, ""
def decode_base64_to_image(encoding):
if encoding.startswith("data:image/"):
encoding = encoding.split(";", 1)[1].split(",", 1)[1]
try:
image = Image.open(BytesIO(base64.b64decode(encoding)))
return image
except Exception as err:
print(err)
raise HTTPException(status_code=500, detail="Invalid encoded image")
def encode_pil_to_base64(images):
encoded_imgs = []
for image in images:
with BytesIO() as output_bytes:
if args.output_img_format.lower() == "png":
image.save(output_bytes, format="PNG")
elif args.output_img_format.lower() in ("jpg", "jpeg"):
image.save(output_bytes, format="JPEG")
else:
raise HTTPException(
status_code=500, detail="Invalid image format"
)
bytes_data = output_bytes.getvalue()
encoded_imgs.append(base64.b64encode(bytes_data))
return encoded_imgs
# Upscaler Rest API.
def upscaler_api(
InputData: dict,
):
print(
f'Prompt: {InputData["prompt"]}, '
f'Negative Prompt: {InputData["negative_prompt"]}, '
f'Seed: {InputData["seed"]}'
)
init_image = decode_base64_to_image(InputData["init_images"][0])
res = upscaler_inf(
InputData["prompt"],
InputData["negative_prompt"],
init_image,
InputData["height"],
InputData["width"],
InputData["steps"],
InputData["noise_level"],
InputData["cfg_scale"],
InputData["seed"],
batch_count=1,
batch_size=1,
scheduler="EulerDiscrete",
custom_model="None",
hf_model_id=InputData["hf_model_id"]
if "hf_model_id" in InputData.keys()
else "stabilityai/stable-diffusion-2-1-base",
custom_vae="None",
precision="fp16",
device=available_devices[0],
max_length=64,
save_metadata_to_json=False,
save_metadata_to_png=False,
lora_weights="None",
lora_hf_id="",
ondemand=False,
repeatable_seeds=False,
)
# Converts generator type to subscriptable
res = next(res)
return {
"images": encode_pil_to_base64(res[0]),
"parameters": {},
"info": res[1],
}
with gr.Blocks(title="Upscaler") as upscaler_web:
with gr.Row(elem_id="ui_title"):
nod_logo = Image.open(nodlogo_loc)
@@ -338,6 +259,7 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
value=nod_logo,
show_label=False,
interactive=False,
show_download_button=False,
elem_id="top_logo",
width=150,
height=50,
@@ -345,37 +267,28 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
with gr.Row(elem_id="ui_body"):
with gr.Row():
with gr.Column(scale=1, min_width=600):
upscaler_init_image = gr.Image(
label="Input Image",
type="pil",
sources=["upload"],
)
with gr.Row():
# janky fix for overflowing text
upscaler_model_info = (
str(get_custom_model_path())
).replace("\\", "\n\\")
upscaler_model_info = (
f"Custom Model Path: {upscaler_model_info}"
f"Custom Model Path: {str(get_custom_model_path())}"
)
upscaler_custom_model = gr.Dropdown(
label=f"Models",
info=upscaler_model_info,
info="Select, or enter HuggingFace Model ID or Civitai model download URL",
elem_id="custom_model",
value=os.path.basename(args.ckpt_loc)
if args.ckpt_loc
else "stabilityai/stable-diffusion-x4-upscaler",
choices=["None"]
+ get_custom_model_files(
choices=get_custom_model_files(
custom_checkpoint_type="upscaler"
)
+ predefined_upscaler_models,
)
upscaler_hf_model_id = gr.Textbox(
elem_id="hf_model_id",
placeholder="Select 'None' in the Models dropdown "
"on the left and enter model ID here "
"e.g: SG161222/Realistic_Vision_V1.3, "
"https://civitai.com/api/download/models/15236",
value="",
label="HuggingFace Model ID or Civitai model "
"download URL",
lines=3,
allow_custom_value=True,
scale=2,
)
# janky fix for overflowing text
upscaler_vae_info = (
@@ -390,6 +303,8 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
if args.custom_vae
else "None",
choices=["None"] + get_custom_model_files("vae"),
allow_custom_value=True,
scale=1,
)
with gr.Group(elem_id="prompt_box_outer"):
@@ -405,36 +320,32 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
lines=2,
elem_id="negative_prompt_box",
)
upscaler_init_image = gr.Image(
label="Input Image",
type="pil",
height=300,
)
with gr.Accordion(label="LoRA Options", open=False):
with gr.Row():
# janky fix for overflowing text
upscaler_lora_info = (
str(get_custom_model_path("lora"))
).replace("\\", "\n\\")
upscaler_lora_info = f"LoRA Path: {upscaler_lora_info}"
lora_weights = gr.Dropdown(
label=f"Standalone LoRA Weights",
info=upscaler_lora_info,
label=f"LoRA Weights",
info=f"Select from LoRA in {str(get_custom_model_path('lora'))}, or enter HuggingFace Model ID",
elem_id="lora_weights",
value="None",
choices=["None"] + get_custom_model_files("lora"),
allow_custom_value=True,
scale=3,
)
lora_hf_id = gr.Textbox(
elem_id="lora_hf_id",
placeholder="Select 'None' in the Standalone LoRA "
"weights dropdown on the left if you want to use "
"a standalone HuggingFace model ID for LoRA here "
"e.g: sayakpaul/sd-model-finetuned-lora-t4",
value="",
label="HuggingFace Model ID",
lines=3,
lora_strength = gr.Number(
label="LoRA Strength",
info="Will be baked into the .vmfb",
step=0.01,
# the value is validated on every change, so the minimum must allow 0,
# otherwise fractional values like 0.5 can't be typed in
minimum=0.0,
maximum=2.0,
value=args.lora_strength,
scale=1,
)
with gr.Row():
lora_tags = gr.HTML(
value="<div><i>No LoRA selected</i></div>",
elem_classes="lora-tags",
)
with gr.Accordion(label="Advanced Options", open=False):
with gr.Row():
@@ -443,6 +354,7 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
label="Scheduler",
value="DDIM",
choices=scheduler_list_cpu_only,
allow_custom_value=True,
)
with gr.Group():
save_metadata_to_png = gr.Checkbox(
@@ -547,17 +459,8 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
label="Device",
value=available_devices[0],
choices=available_devices,
allow_custom_value=True,
)
with gr.Row():
random_seed = gr.Button("Randomize Seed")
random_seed.click(
lambda: -1,
inputs=[],
outputs=[seed],
queue=False,
)
stop_batch = gr.Button("Stop Batch")
stable_diffusion = gr.Button("Generate Image(s)")
with gr.Column(scale=1, min_width=600):
with gr.Group():
@@ -569,14 +472,26 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
object_fit="contain",
)
std_output = gr.Textbox(
value=f"Images will be saved at "
value=f"{upscaler_model_info}\n"
f"Images will be saved at "
f"{get_generated_imgs_path()}",
lines=1,
lines=2,
elem_id="std_output",
show_label=False,
)
upscaler_status = gr.Textbox(visible=False)
with gr.Row():
stable_diffusion = gr.Button("Generate Image(s)")
random_seed = gr.Button("Randomize Seed")
random_seed.click(
lambda: -1,
inputs=[],
outputs=[seed],
queue=False,
)
stop_batch = gr.Button("Stop Batch")
with gr.Row():
blank_thing_for_row = None
with gr.Row():
upscaler_sendto_img2img = gr.Button(value="SendTo Img2Img")
upscaler_sendto_inpaint = gr.Button(value="SendTo Inpaint")
@@ -600,7 +515,6 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
batch_size,
scheduler,
upscaler_custom_model,
upscaler_hf_model_id,
custom_vae,
precision,
device,
@@ -608,7 +522,7 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
save_metadata_to_json,
save_metadata_to_png,
lora_weights,
lora_hf_id,
lora_strength,
ondemand,
repeatable_seeds,
],
@@ -630,3 +544,18 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
fn=cancel_sd,
cancels=[prompt_submit, neg_prompt_submit, generate_click],
)
lora_weights.change(
fn=lora_changed,
inputs=[lora_weights],
outputs=[lora_tags],
queue=True,
)
lora_strength.change(
fn=lora_strength_changed,
inputs=lora_strength,
outputs=lora_strength,
queue=False,
show_progress=False,
)

View File

@@ -1,10 +1,19 @@
import os
import sys
from apps.stable_diffusion.src import get_available_devices
import glob
import math
import json
import safetensors
import gradio as gr
import PIL.Image as Image
from pathlib import Path
from apps.stable_diffusion.src import args
from dataclasses import dataclass
from enum import IntEnum
from gradio.components.image_editor import EditorValue
from apps.stable_diffusion.src import get_available_devices
import apps.stable_diffusion.web.utils.global_obj as global_obj
from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
SD_STATE_CANCEL,
@@ -24,8 +33,18 @@ class Config:
width: int
device: str
use_lora: str
use_stencil: str
ondemand: str
lora_strength: float
stencils: list[str]
ondemand: str # should this be expecting a bool instead?
class HSLHue(IntEnum):
RED = 0
YELLOW = 60
GREEN = 120
CYAN = 180
BLUE = 240
MAGENTA = 300
custom_model_filetypes = (
@@ -49,9 +68,11 @@ scheduler_list_cpu_only = [
"DPMSolverSinglestep",
"DDPM",
"HeunDiscrete",
"LCMScheduler",
]
scheduler_list = scheduler_list_cpu_only + [
"SharkEulerDiscrete",
"SharkEulerAncestralDiscrete",
]
predefined_models = [
@@ -72,6 +93,10 @@ predefined_paint_models = [
predefined_upscaler_models = [
"stabilityai/stable-diffusion-x4-upscaler",
]
predefined_sdxl_models = [
"stabilityai/sdxl-turbo",
"stabilityai/stable-diffusion-xl-base-1.0",
]
def resource_path(relative_path):
@@ -125,6 +150,12 @@ def get_custom_model_files(model="models", custom_checkpoint_type=""):
)
]
match custom_checkpoint_type:
case "sdxl":
files = [
val
for val in files
if any(x in val for x in ["XL", "xl", "Xl"])
]
case "inpainting":
files = [
val
@@ -150,17 +181,82 @@ def get_custom_model_files(model="models", custom_checkpoint_type=""):
return sorted(ckpt_files, key=str.casefold)
def get_custom_vae_or_lora_weights(weights, hf_id, model):
use_weight = ""
if weights == "None" and not hf_id:
def get_custom_vae_or_lora_weights(weights, model):
if weights == "None":
use_weight = ""
elif not hf_id:
use_weight = get_custom_model_pathfile(weights, model)
else:
use_weight = hf_id
custom_weights = get_custom_model_pathfile(str(weights), model)
if os.path.isfile(custom_weights):
use_weight = custom_weights
else:
use_weight = weights
return use_weight
def hsl_color(alpha: float, start, end):
b = (end - start) * (alpha if alpha > 0 else 0)
result = b + start
# Return a CSS HSL string
return f"hsl({math.floor(result)}, 80%, 35%)"
def get_lora_metadata(lora_filename):
# get the metadata from the file
filename = get_custom_model_pathfile(lora_filename, "lora")
with safetensors.safe_open(filename, framework="pt", device="cpu") as f:
metadata = f.metadata()
# guard clause in case there isn't any metadata
if not metadata:
return None
# metadata is a dictionary of strings, the values of the keys we're
# interested in are actually json, and need to be loaded as such
tag_frequencies = json.loads(metadata.get("ss_tag_frequency", str("{}")))
dataset_dirs = json.loads(metadata.get("ss_dataset_dirs", str("{}")))
tag_dirs = [dir for dir in tag_frequencies.keys()]
# gather the tag frequency information for all the datasets trained
all_frequencies = {}
for dataset in tag_dirs:
frequencies = sorted(
[entry for entry in tag_frequencies[dataset].items()],
reverse=True,
key=lambda x: x[1],
)
# get a figure for the total number of images processed for this dataset:
# either the number listed in its dataset_dirs entry, or the highest
# tag frequency if that entry doesn't exist
img_count = dataset_dirs.get(dataset, {}).get(
"img_count", frequencies[0][1]
)
# add the dataset frequencies to the overall frequencies replacing the
# frequency counts on the tags with a percentage/ratio
all_frequencies.update(
[(entry[0], entry[1] / img_count) for entry in frequencies]
)
trained_model_id = " ".join(
[
metadata.get("ss_sd_model_hash", ""),
metadata.get("ss_sd_model_name", ""),
metadata.get("ss_base_model_version", ""),
]
).strip()
# return the topmost <count> of all frequencies in all datasets
return {
"model": trained_model_id,
"frequencies": sorted(
all_frequencies.items(), reverse=True, key=lambda x: x[1]
),
}
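# Hedged sketch of consuming get_lora_metadata(); the real rendering lives in
# the lora_changed handler in common_ui_events.py, so treat summarize_lora and
# the HTML it builds as illustrative only.
def summarize_lora(lora_filename, tag_count=5):
    meta = get_lora_metadata(lora_filename)
    if meta is None:
        return "<div><i>No LoRA metadata found</i></div>"
    tags = ", ".join(
        f"{tag} ({ratio:.0%})" for tag, ratio in meta["frequencies"][:tag_count]
    )
    model = meta["model"] or "unknown base model"
    return f"<div>Trained on {model}: {tags}</div>"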
def cancel_sd():
# Try catch it, as gc can delete global_obj.sd_obj while switching model
try:
@@ -169,5 +265,116 @@ def cancel_sd():
pass
def set_model_default_configs(model_ckpt_or_id, jsonconfig=None):
import gradio as gr
config_modelname = default_config_exists(model_ckpt_or_id)
if jsonconfig:
return get_config_from_json(jsonconfig)
elif config_modelname:
return default_configs[config_modelname]
# TODO: Use HF metadata to setup pipeline if available
# elif is_valid_hf_id(model_ckpt_or_id):
# return get_HF_default_configs(model_ckpt_or_id)
else:
# We don't have default metadata to setup a good config. Do not change configs.
return [
gr.Textbox(label="Prompt", interactive=True, visible=True),
gr.Textbox(label="Negative Prompt", interactive=True),
gr.update(),
gr.update(),
gr.update(),
gr.update(),
gr.update(),
gr.update(),
gr.Checkbox(
label="Auto-Generate",
visible=False,
interactive=False,
value=False,
),
]
def get_config_from_json(model_ckpt_or_id, jsonconfig):
# TODO: make this work properly. It is currently not user-exposed.
cfgdata = json.load(jsonconfig)
return [
cfgdata["prompt_box_behavior"],
cfgdata["neg_prompt_box_behavior"],
cfgdata["steps"],
cfgdata["scheduler"],
cfgdata["guidance_scale"],
cfgdata["width"],
cfgdata["height"],
cfgdata["custom_vae"],
]
def default_config_exists(model_ckpt_or_id):
if model_ckpt_or_id in default_configs.keys():
return model_ckpt_or_id
elif "turbo" in model_ckpt_or_id.lower():
return "stabilityai/sdxl-turbo"
else:
return None
def mask_editor_value_for_image_file(filepath):
image = Image.open(filepath)
mask = Image.new(mode="RGBA", size=image.size, color=(0, 0, 0, 0))
return {"background": image, "layers": [mask], "composite": image}
def mask_editor_value_for_gallery_data(gallery_data):
filepath = (
gallery_data.root[0].image.path
if len(gallery_data.root) != 0
else None
)
if filepath and os.path.isfile(filepath):
return mask_editor_value_for_image_file(filepath)
return EditorValue()
default_configs = {
"stabilityai/sdxl-turbo": [
gr.Textbox(label="", interactive=False, value=None, visible=False),
gr.Textbox(
label="Prompt",
value="masterpiece, a graceful shark leaping out of the water to catch a fish, eclipsing the sunset, epic, rays of light, silhouette",
),
gr.Slider(0, 10, value=2),
"EulerAncestralDiscrete",
gr.Slider(0, value=0),
512,
512,
"madebyollin/sdxl-vae-fp16-fix",
gr.Checkbox(
label="Auto-Generate", visible=False, interactive=True, value=False
),
],
"stabilityai/stable-diffusion-xl-base-1.0": [
gr.Textbox(label="Prompt", interactive=True, visible=True),
gr.Textbox(label="Negative Prompt", interactive=True),
40,
"EulerDiscrete",
7.5,
gr.Slider(value=768, interactive=True),
gr.Slider(value=768, interactive=True),
"madebyollin/sdxl-vae-fp16-fix",
gr.Checkbox(
label="Auto-Generate",
visible=False,
interactive=False,
value=False,
),
],
}
nodlogo_loc = resource_path("logos/nod-logo.png")
nodicon_loc = resource_path("logos/nod-icon.png")
available_devices = get_available_devices()

View File

@@ -0,0 +1,105 @@
import os
import sys
import webview
import webview.util
import socket
from contextlib import closing
from multiprocessing import Process
from apps.stable_diffusion.src import args
def webview2_installed():
if sys.platform != "win32":
return False
# On windows we want to ensure we have MS webview2 available so we don't fall back
# to MSHTML (aka ye olde Internet Explorer) which is deprecated by pywebview, and
# apparently causes SHARK not to load properly.
# Checking these registry entries is how Microsoft says to detect a webview2 installation:
# https://learn.microsoft.com/en-us/microsoft-edge/webview2/concepts/distribution
import winreg
path = r"SOFTWARE\WOW6432Node\Microsoft\EdgeUpdate\Clients\{F3017226-FE2A-4295-8BDF-00C3A9A7E4C5}"
# the only way to find out whether a registry entry exists is to try to open it
try:
# check for an all user install
with winreg.OpenKey(
winreg.HKEY_LOCAL_MACHINE,
path,
0,
winreg.KEY_QUERY_VALUE | winreg.KEY_WOW64_64KEY,
) as registry_key:
value, type = winreg.QueryValueEx(registry_key, "pv")
# if it didn't exist, we want to continue on...
except WindowsError:
try:
# ...to check for a current user install
with winreg.OpenKey(
winreg.HKEY_CURRENT_USER,
path,
0,
winreg.KEY_QUERY_VALUE | winreg.KEY_WOW64_64KEY,
) as registry_key:
value, _ = winreg.QueryValueEx(registry_key, "pv")
except WindowsError:
value = None
finally:
return (value is not None) and value != "" and value != "0.0.0.0"
def window(address):
from tkinter import Tk
window = Tk()
# get the screen width and height of the display and size the window more
# reasonably, since we aren't making it full-screen or maximized
width = int(window.winfo_screenwidth() * 0.81)
height = int(window.winfo_screenheight() * 0.91)
webview.create_window(
"SHARK AI Studio",
url=address,
width=width,
height=height,
text_select=True,
)
webview.start(private_mode=False, storage_path=os.getcwd())
def usable_port():
# Make sure we can actually use the port given in args.server_port. If not,
# ask the OS for a free port and return that as the port to use.
port = args.server_port
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
try:
sock.bind(("0.0.0.0", port))
except OSError:
with closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as sock:
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.bind(("0.0.0.0", 0))
return sock.getsockname()[1]
return port
def launch(port):
# Launch as an app if app mode has been requested and we're able to do it,
# returning whether we succeeded.
if args.ui == "app" and (sys.platform != "win32" or webview2_installed()):
try:
t = Process(target=window, args=[f"http://localhost:{port}"])
t.start()
return True
except webview.util.WebViewException:
return False
else:
return False
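
For context, a hedged sketch of how these helpers could be tied together in an entrypoint; the `__main__` block and the fallback message are illustrative assumptions, not the launcher shipped in this change.

```python
if __name__ == "__main__":
    # Hypothetical wiring of the helpers above: find a bindable port, try to
    # open the native app window, and fall back to plain browser/server mode.
    port = usable_port()
    if not launch(port):
        print(f"App window not available; open http://localhost:{port} in a browser.")
```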

View File

@@ -122,20 +122,26 @@ def find_vae_from_png_metadata(
def find_lora_from_png_metadata(
key: str, metadata: dict[str, str | int]
) -> tuple[str, str]:
lora_hf_id = ""
) -> tuple[str, float]:
lora_custom = ""
lora_strength = 1.0
if key in metadata:
lora_file = metadata[key]
split_metadata = metadata[key].split(":")
lora_file = split_metadata[0]
if len(split_metadata) == 2:
try:
lora_strength = float(split_metadata[1])
except ValueError:
pass
lora_custom = try_find_model_base_from_png_metadata(lora_file, "lora")
# If nothing matched, check for a vendor/hf_model_id style reference
if not lora_custom and lora_file.count("/"):
lora_hf_id = lora_file
lora_custom = lora_file
# LoRA input is optional; don't print or raise an error if it's missing
return lora_custom, lora_hf_id
return lora_custom, lora_strength
def import_png_metadata(
@@ -149,9 +155,7 @@ def import_png_metadata(
width,
height,
custom_model,
hf_model_id,
custom_lora,
hf_lora_id,
custom_vae,
):
try:
@@ -161,9 +165,10 @@ def import_png_metadata(
(png_custom_model, png_hf_model_id) = find_model_from_png_metadata(
"Model", metadata
)
(lora_custom_model, lora_hf_model_id) = find_lora_from_png_metadata(
"LoRA", metadata
)
(
custom_lora,
custom_lora_strength,
) = find_lora_from_png_metadata("LoRA", metadata)
vae_custom_model = find_vae_from_png_metadata("VAE", metadata)
negative_prompt = metadata["Negative prompt"]
@@ -175,17 +180,11 @@ def import_png_metadata(
if "Model" in metadata and png_custom_model:
custom_model = png_custom_model
hf_model_id = ""
if "Model" in metadata and png_hf_model_id:
custom_model = "None"
hf_model_id = png_hf_model_id
elif "Model" in metadata and png_hf_model_id:
custom_model = png_hf_model_id
if "LoRA" in metadata and lora_custom_model:
custom_lora = lora_custom_model
hf_lora_id = ""
if "LoRA" in metadata and lora_hf_model_id:
if "LoRA" in metadata and not custom_lora:
custom_lora = "None"
hf_lora_id = lora_hf_model_id
if "VAE" in metadata and vae_custom_model:
custom_vae = vae_custom_model
@@ -217,8 +216,7 @@ def import_png_metadata(
width,
height,
custom_model,
hf_model_id,
custom_lora,
hf_lora_id,
custom_lora_strength,
custom_vae,
)
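
The LoRA metadata entry now carries an optional strength after a colon. Below is a standalone sketch of that "name:strength" convention, mirroring the split(":") logic in `find_lora_from_png_metadata` above; the helper name and the sample values are made up for illustration.

```python
def split_lora_entry(entry: str) -> tuple[str, float]:
    # Mirror of the parsing above: the part before the first ":" is the LoRA
    # reference, an optional numeric suffix is its strength, defaulting to 1.0.
    parts = entry.split(":")
    name, strength = parts[0], 1.0
    if len(parts) == 2:
        try:
            strength = float(parts[1])
        except ValueError:
            pass
    return name, strength

print(split_lora_entry("my_style_lora.safetensors:0.6"))  # ('my_style_lora.safetensors', 0.6)
print(split_lora_entry("vendor/some-lora-id"))            # ('vendor/some-lora-id', 1.0)
```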

View File

@@ -5,11 +5,25 @@ from time import time
shark_tmp = os.path.join(os.getcwd(), "shark_tmp/")
def config_gradio_tmp_imgs_folder():
# create shark_tmp if it does not exist
if not os.path.exists(shark_tmp):
os.mkdir(shark_tmp)
def clear_tmp_mlir():
cleanup_start = time()
print(
"Clearing .mlir temporary files from a prior run. This may take some time..."
)
mlir_files = [
filename
for filename in os.listdir(shark_tmp)
if os.path.isfile(os.path.join(shark_tmp, filename))
and filename.endswith(".mlir")
]
for filename in mlir_files:
os.remove(os.path.join(shark_tmp, filename))
print(
f"Clearing .mlir temporary files took {time() - cleanup_start:.4f} seconds."
)
def clear_tmp_imgs():
# tell gradio to use a directory under shark_tmp for its temporary
# image files unless one has already been set elsewhere
if "GRADIO_TEMP_DIR" not in os.environ:
@@ -52,3 +66,12 @@ def config_gradio_tmp_imgs_folder():
)
else:
print("No temporary images files to clear.")
def config_tmp():
# create shark_tmp if it does not exist
if not os.path.exists(shark_tmp):
os.mkdir(shark_tmp)
clear_tmp_mlir()
clear_tmp_imgs()
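
A small, hypothetical usage sketch for the new combined cleanup; the import path shown is an assumption about where this module lives, for illustration only.

```python
# Assumed module path, for illustration only.
from apps.stable_diffusion.web.utils.tmp_configs import config_tmp

if __name__ == "__main__":
    # Clear stale .mlir files and old Gradio temp images before starting the UI.
    config_tmp()
    # ... launch the web UI afterwards
```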

View File

@@ -129,12 +129,12 @@ pytest_benchmark_param = pytest.mark.parametrize(
pytest.param(True, "cpu", marks=pytest.mark.skip),
pytest.param(
False,
"gpu",
"cuda",
marks=pytest.mark.skipif(
check_device_drivers("gpu"), reason="nvidia-smi not found"
check_device_drivers("cuda"), reason="nvidia-smi not found"
),
),
pytest.param(True, "gpu", marks=pytest.mark.skip),
pytest.param(True, "cuda", marks=pytest.mark.skip),
pytest.param(
False,
"vulkan",

View File

@@ -24,13 +24,13 @@ def get_image(url, local_filename):
shutil.copyfileobj(res.raw, f)
def compare_images(new_filename, golden_filename):
def compare_images(new_filename, golden_filename, upload=False):
new = np.array(Image.open(new_filename)) / 255.0
golden = np.array(Image.open(golden_filename)) / 255.0
diff = np.abs(new - golden)
mean = np.mean(diff)
if mean > 0.1:
if os.name != "nt":
if os.name != "nt" and upload == True:
subprocess.run(
[
"gsutil",
@@ -39,7 +39,7 @@ def compare_images(new_filename, golden_filename):
"gs://shark_tank/testdata/builder/",
]
)
raise SystemExit("new and golden not close")
raise AssertionError("new and golden not close")
else:
print("SUCCESS")

View File

@@ -1,5 +1,6 @@
#!/bin/bash
IMPORTER=1 BENCHMARK=1 ./setup_venv.sh
IMPORTER=1 BENCHMARK=1 NO_BREVITAS=1 ./setup_venv.sh
source $GITHUB_WORKSPACE/shark.venv/bin/activate
python build_tools/stable_diffusion_testing.py --gen
python tank/generate_sharktank.py

View File

@@ -63,7 +63,14 @@ def get_inpaint_inputs():
open("./test_images/inputs/mask.png", "wb").write(mask.content)
def test_loop(device="vulkan", beta=False, extra_flags=[]):
def test_loop(
device="vulkan",
beta=False,
extra_flags=[],
upload_bool=True,
exit_on_fail=True,
do_gen=False,
):
# Get golden values from tank
shutil.rmtree("./test_images", ignore_errors=True)
model_metrics = []
@@ -71,7 +78,10 @@ def test_loop(device="vulkan", beta=False, extra_flags=[]):
os.mkdir("./test_images/golden")
get_inpaint_inputs()
hf_model_names = model_config_dicts[0].values()
tuned_options = ["--no-use_tuned", "--use_tuned"]
tuned_options = [
"--no-use_tuned",
"--use_tuned",
]
import_options = ["--import_mlir", "--no-import_mlir"]
prompt_text = "--prompt=cyberpunk forest by Salvador Dali"
inpaint_prompt_text = "--prompt=Face of a yellow cat, high resolution, sitting on a park bench"
@@ -81,6 +91,8 @@ def test_loop(device="vulkan", beta=False, extra_flags=[]):
if beta:
extra_flags.append("--beta_models=True")
extra_flags.append("--no-progress_bar")
if do_gen:
extra_flags.append("--import_debug")
to_skip = [
"Linaqruf/anything-v3.0",
"prompthero/openjourney",
@@ -103,6 +115,8 @@ def test_loop(device="vulkan", beta=False, extra_flags=[]):
and use_tune == tuned_options[1]
):
continue
elif use_tune == tuned_options[1]:
continue
command = (
[
executable, # executable is the python from the venv used to run this
@@ -181,7 +195,14 @@ def test_loop(device="vulkan", beta=False, extra_flags=[]):
"./test_images/golden/" + model_name + "/*.png"
)
golden_file = glob(golden_path)[0]
compare_images(test_file, golden_file)
try:
compare_images(
test_file, golden_file, upload=upload_bool
)
except AssertionError as e:
print(e)
if exit_on_fail:
raise
else:
print(command)
print("failed to generate image for this configuration")
@@ -200,6 +221,9 @@ def test_loop(device="vulkan", beta=False, extra_flags=[]):
extra_flags.remove(
"--iree_vulkan_target_triple=rdna2-unknown-windows"
)
if do_gen:
prepare_artifacts()
with open(os.path.join(os.getcwd(), "sd_testing_metrics.csv"), "w+") as f:
header = "model_name;device;use_tune;import_opt;Clip Inference time(ms);Average Step (ms/it);VAE Inference time(ms);total image generation(s);command\n"
f.write(header)
@@ -218,15 +242,49 @@ def test_loop(device="vulkan", beta=False, extra_flags=[]):
f.write(";".join(output) + "\n")
def prepare_artifacts():
gen_path = os.path.join(os.getcwd(), "gen_shark_tank")
if not os.path.isdir(gen_path):
os.mkdir(gen_path)
for dirname in os.listdir(os.getcwd()):
for modelname in ["clip", "unet", "vae"]:
if modelname in dirname and "vmfb" not in dirname:
if not os.path.isdir(os.path.join(gen_path, dirname)):
shutil.move(os.path.join(os.getcwd(), dirname), gen_path)
print(f"Moved dir: {dirname} to {gen_path}.")
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--device", default="vulkan")
parser.add_argument(
"-b", "--beta", action=argparse.BooleanOptionalAction, default=False
)
parser.add_argument("-e", "--extra_args", type=str, default=None)
parser.add_argument(
"-u", "--upload", action=argparse.BooleanOptionalAction, default=True
)
parser.add_argument(
"-x", "--exit_on_fail", action=argparse.BooleanOptionalAction, default=True
)
parser.add_argument(
"-g", "--gen", action=argparse.BooleanOptionalAction, default=False
)
if __name__ == "__main__":
args = parser.parse_args()
print(args)
test_loop(args.device, args.beta, [])
extra_args = []
if args.extra_args:
for arg in args.extra_args.split(","):
extra_args.append(arg)
test_loop(
args.device,
args.beta,
extra_args,
args.upload,
args.exit_on_fail,
args.gen,
)
if args.gen:
prepare_artifacts()

View File

@@ -27,7 +27,7 @@ include(FetchContent)
FetchContent_Declare(
iree
GIT_REPOSITORY https://github.com/nod-ai/shark-runtime.git
GIT_REPOSITORY https://github.com/nod-ai/srt.git
GIT_TAG shark
GIT_SUBMODULES_RECURSE OFF
GIT_SHALLOW OFF

View File

@@ -40,7 +40,7 @@ cmake --build build/
*Prepare the model*
```bash
wget https://storage.googleapis.com/shark_tank/latest/resnet50_tf/resnet50_tf.mlir
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvmcpu-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 resnet50_tf.mlir -o resnet50_tf.vmfb
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --iree-llvmcpu-embedded-linker-path=`python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])'`/iree/compiler/tools/../_mlir_libs/iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=ist/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux resnet50_tf.mlir -o resnet50_tf.vmfb
```
*Prepare the input*
@@ -65,18 +65,18 @@ A tool for benchmarking other models is built and can be invoked with a command
see `./build/vulkan_gui/iree-vulkan-gui --help` for an explanation on the function input. For example, stable diffusion unet can be tested with the following commands:
```bash
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/stable_diff_tf.mlir
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 stable_diff_tf.mlir -o stable_diff_tf.vmfb
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux stable_diff_tf.mlir -o stable_diff_tf.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=2x4x64x64xf32 --function_input=1xf32 --function_input=2x77x768xf32
```
VAE and Autoencoder are also available
```bash
# VAE
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/vae_tf/vae.mlir
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 vae.mlir -o vae.vmfb
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux vae.mlir -o vae.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x4x64x64xf32
# CLIP Autoencoder
wget https://storage.googleapis.com/shark_tank/quinn/stable_diff_tf/clip_tf/clip_autoencoder.mlir
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 clip_autoencoder.mlir -o clip_autoencoder.vmfb
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=vulkan --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-cpu-features=host -iree-vulkan-target-triple=rdna2-unknown-linux clip_autoencoder.mlir -o clip_autoencoder.vmfb
./build/vulkan_gui/iree-vulkan-gui --module-file=stable_diff_tf.vmfb --function_input=1x77xi32 --function_input=1x77xi32
```

View File

@@ -23,6 +23,7 @@ with gr.Blocks(title="Dataset Annotation Tool", css=demo_css) as shark_web:
value=nod_logo,
show_label=False,
interactive=False,
show_download_button=False,
elem_id="top_logo",
width=150,
height=100,

View File

@@ -55,7 +55,7 @@ The command line for compilation will start something like this, where the `-` n
The `-o output_filename.vmfb` flag can be used to specify the location to save the compiled vmfb. Note that a dump of the
dispatches that can be compiled and run in isolation can be generated by adding `--iree-hal-dump-executable-benchmarks-to=/some/directory`. For example, if they are in the `benchmarks` directory, the following compile/run commands would work for Vulkan on RDNA3.
```
iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna3-unknown-linux --iree-stream-resource-index-bits=64 --iree-vm-target-index-bits=64 benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.mlir -o benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb
iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna3-unknown-linux benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.mlir -o benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb
iree-benchmark-module --module=benchmarks/module_forward_dispatch_${NUM}_vulkan_spirv_fb.vmfb --function=forward --device=vulkan
```
@@ -63,8 +63,8 @@ Where `${NUM}` is the dispatch number that you want to benchmark/profile in isol
### Enabling Tracy for Vulkan profiling
To begin profiling with Tracy, a build of IREE runtime with tracing enabled is needed. SHARK-Runtime builds an
instrumented version alongside the normal version nightly (.whls typically found [here](https://github.com/nod-ai/SHARK-Runtime/releases)), however this is only available for Linux. For Windows, tracing can be enabled by enabling a CMake flag.
To begin profiling with Tracy, a build of IREE runtime with tracing enabled is needed. SHARK-Runtime (SRT) builds an
instrumented version alongside the normal version nightly (.whls typically found [here](https://github.com/nod-ai/SRT/releases)), however this is only available for Linux. For Windows, tracing can be enabled by enabling a CMake flag.
```
$env:IREE_ENABLE_RUNTIME_TRACING="ON"
```

Some files were not shown because too many files have changed in this diff.