Compare commits

...

56 Commits

Author SHA1 Message Date
saienduri
4529fd0461 Update requirements.txt 2024-08-06 19:29:40 -07:00
saienduri
4c2bb4b7b4 Update requirements.txt 2024-08-06 17:15:42 -07:00
saienduri
d5013fd13e Update requirements.txt (#2157) 2024-06-18 13:41:35 -07:00
Ean Garvey
26f80ccbbb Fixes to UI config defaults, config loading, and warnings. (#2153) 2024-05-31 18:14:27 -04:00
Ean Garvey
d2c3752dc7 Fix batch count and tweaks to chatbot. (#2151)
* Fix batch count

* Add button to unload models manually.

* Add compiled pipeline option

* Add brevitas to requirements

* Tweaks to chatbot

* Change script loading trigger
2024-05-31 18:48:28 +05:30
Ean Garvey
4505c4549f Force inlined weights on igpu for now, small fixes to chatbot (#2149)
* Add igpu and custom triple support.

* Small fixes to igpu, SDXL-turbo

* custom pipe loading

* formatting

* Remove old nodlogo import.
2024-05-30 11:40:42 -05:00
Gaurav Shukla
793495c9c6 [ui] Add AMD logo in shark studio
Signed-Off-by: Gaurav Shukla <gaurav.shukla@amd.com>
2024-05-30 21:43:15 +05:30
Ean Garvey
13e1d8d98a Add igpu and custom triple support. (#2148) 2024-05-29 17:39:36 -05:00
Ean Garvey
2074df40ad Point to nod fork of diffusers. (#2146) 2024-05-29 00:56:21 -05:00
Ean Garvey
7b30582408 Point to SRT links for windows. (#2145) 2024-05-29 01:20:30 -04:00
Ean Garvey
151195ab74 Add a few requirements for ensured parity with turbine-models requirements. (#2142)
* Add scipy to requirements.

Adds diffusers req and a note for torchsde.
2024-05-28 15:37:31 -05:00
Ean Garvey
8146f0bd2f Remove leftover merge conflict line from setup script. (#2141) 2024-05-28 11:04:45 -07:00
Ean Garvey
68e9281778 (Studio2) Refactors SD pipeline to rely on turbine-models pipeline, fixes to LLM, gitignore (#2129)
* Shark Studio SDXL support, HIP driver support, simpler device info, small fixes

* Fixups to llm API/UI and ignore user config files.

* Small fixes for unifying pipelines.

* Update requirements.txt for iree-turbine (#2130)

* Fix Llama2 on CPU (#2133)

* Filesystem cleanup and custom model fixes (#2127)

* Fix some formatting issues

* Remove IREE pin (fixes exe issue) (#2126)

* Update find links for IREE packages (#2136)

* Shark Studio SDXL support, HIP driver support, simpler device info, small fixes

* Abstract out SD pipelines from Studio Webui (WIP)

* Switch from pin to minimum torch version and fix index url

* Fix device parsing.

* Fix linux setup

* Fix custom weights.

---------

Co-authored-by: saienduri <77521230+saienduri@users.noreply.github.com>
Co-authored-by: gpetters-amd <159576198+gpetters-amd@users.noreply.github.com>
Co-authored-by: gpetters94 <gpetters@protonmail.com>
2024-05-28 13:18:31 -04:00
Ean Garvey
fd07cae991 Update find links for IREE packages (#2136) 2024-05-13 11:43:17 -05:00
gpetters94
6cb86a843e Remove IREE pin (fixes exe issue) (#2126)
* Diagnose a build issue

* Remove IREE pin

* Revert the build on pull request change
2024-04-30 12:27:30 -05:00
gpetters-amd
7db1612a5c Filesystem cleanup and custom model fixes (#2127)
* Initial filesystem cleanup

* More filesystem cleanup

* Fix some formatting issues

* Address comments
2024-04-30 11:18:33 -05:00
gpetters-amd
81d6e059ac Fix Llama2 on CPU (#2133) 2024-04-29 12:18:16 -05:00
saienduri
e003d0abe8 Update requirements.txt for iree-turbine (#2130)
* Update requirements.txt to iree-turbine creation

* Update requirements.txt

* Update requirements.txt

* Update requirements.txt
2024-04-29 12:28:14 -04:00
Quinn Dawkins
cf2513e7b1 Update IREE discord link (#2118)
Discord links for IREE were purged, so update the link on the readme.
2024-04-15 12:54:27 -07:00
Ean Garvey
60d8591e95 Change shark-turbine requirement target branch to main. (#2116) 2024-04-11 19:31:39 -04:00
gpetters-amd
ff91982168 Remove target env (#2114) 2024-04-08 16:52:45 -05:00
powderluv
a6a9e524c1 Drop linux nightly for now 2024-04-05 12:04:36 -07:00
powderluv
732df2e263 Updated signtool key 2024-04-05 12:01:42 -07:00
gpetters-amd
1ee16bd256 Fix the nightly build (#2111) 2024-04-05 19:22:33 +05:30
gpetters-amd
752d775fbd Fix a typo in the nightly build script (#2110) 2024-03-30 17:31:51 -07:00
gpetters-amd
4d1a6a204d Fix builder issue (#2109) 2024-03-30 16:21:55 -07:00
Ean Garvey
0eff62a468 (Studio 2.0) add Stable Diffusion features (#2037)
* (WIP): Studio2 app infra and SD API

UI/app structure and utility implementation.

- Initializers for webui/API launch
- Schedulers file for SD scheduling utilities
- Additions to API-level utilities
- Added embeddings module for LoRA, Lycoris, yada yada
- Added image_processing module for resamplers, resize tools,
  transforms, and any image annotation (PNG metadata)
- shared_cmd_opts module -- sorry, this is stable_args.py. It lives on.
  We still want to have some global control over the app exclusively
  from the command-line. At least we will be free from shark_args.
- Moving around some utility pieces.
- Try to make api+webui concurrency possible in index.py
- SD UI -- this is just img2imgUI but hopefully a little better.
- UI utilities for your nod logos and your gradio temps.

Enable UI / bugfixes / tweaks

* Studio2/SD: Use more correct LoRA alpha calculation (#2034)

* Updates ProcessLoRA to use both the embedded LoRA alpha and the lora_strength
optional parameter (default 1.0) when applying LoRA weights.
* Updates ProcessLoRA to cover more dim cases.
* This brings ProcessLoRA into line with PR #2015 against Studio1
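A minimal sketch of the scaling described above for the plain 2-D case, assuming a standard LoRA decomposition; the function and tensor names here are illustrative, not the actual ProcessLoRA code:

```python
import torch

def merge_lora(weight: torch.Tensor,
               lora_up: torch.Tensor,     # (out_features, rank)
               lora_down: torch.Tensor,   # (rank, in_features)
               alpha: float,              # alpha embedded in the LoRA file
               lora_strength: float = 1.0) -> torch.Tensor:
    # W' = W + strength * (alpha / rank) * (up @ down)
    rank = lora_down.shape[0]
    scale = lora_strength * (alpha / rank)
    return weight + scale * (lora_up @ lora_down)
```

Conv and other higher-rank weights need the extra reshaping that the "more dim cases" bullet refers to.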

* Studio2: Remove duplications from api/utils.py (#2035)

* Remove duplicate os import
* Remove duplicate parse_seed_input function

Migrating to JSON requests in SD UI

More UI and app flow improvements, logging, shared device cache

Model loading

Complete SD pipeline.

Tweaks to VAE, pipeline states

Pipeline tweaks, add cmd_opts parsing to sd api

* Add test for SD

* Small cleanup

* Shark2/SD/UI: Respect ckpt_dir, share and server_port args (#2070)

* Takes whether to generate a gradio live link from the existing --share command
line parameter, rather than hardcoding as True.
* Takes server port from existing --server_port command line parameter, rather than
hardcoding as 11911.
* Default --ckpt_dir parameter to '../models'
* Use --ckpt_dir rather than hardcoding ../models as the base directory for
checkpoints, vae, and lora, etc
* Add a 'checkpoints' directory below --ckpt_dir to match ComfyUI folder structure.
Read custom_weights choices from there, and/or subfolders below there matching
the selected base model.
* Fix --ckpt_dir possibly not working correctly when an absolute rather than relative path
is specified.
* Relabel "Custom Weights" to "Custom Weights Checkpoint" in the UI

* Add StreamingLLM support to studio2 chat (#2060)

* Streaming LLM

* Update precision and add gpu support

* (studio2) Separate weights generation for quantization support

* Adapt prompt changes to studio flow

* Remove outdated flag from llm compile flags.

* (studio2) use turbine vmfbRunner

* tweaks to prompts

* Update CPU path and llm api test.

* Change device in test to cpu.

* Fixes to runner, device names, vmfb mgmt

* Use small test without external weights.

* HF-Reference LLM mode + Update test result to match latest Turbine. (#2080)

* HF-Reference LLM mode.

* Fixup test to match current output from Turbine.

* lint

* Fix test error message + Only initialize HF torch model when used.

* Remove redundant format_out change.

* Add rest API endpoint from LanguageModel API

* Add StreamingLLM support to studio2 chat (#2060)

* Streaming LLM

* Update precision and add gpu support

* (studio2) Separate weights generation for quantization support

* Adapt prompt changes to studio flow

* Remove outdated flag from llm compile flags.

* (studio2) use turbine vmfbRunner

* tweaks to prompts

* Update CPU path and llm api test.

* Change device in test to cpu.

* Fixes to runner, device names, vmfb mgmt

* Use small test without external weights.

* Formatting and init files.

* Remove unused import.

* Small fixes

* Studio2/SD/UI: Improve various parts of the UI for Stable Diffusion (#2074)

* Studio2/SD/UI: Improve various parts of the UI of Shark 2

* Update Gradio pin to 4.15.0.
* Port workarounds for Gradio >4.8.0 main container sizing from Shark 1.0.
* Move nod Logo out of the SD tab and onto the top right of the main tab bar.
* Set nod logo icon as the favicon (as current Shark 1.0).
* Create a tabbed right hand panel within the SD UI sized to the viewport height.
* Make Input Image tab 1 in the right hand panel.
* Make output images, generation log, and generation buttons, tab 2 in the
right hand panel
* Make config JSON display, with config load, save and clear, tab 3 in the
right hand panel
* Make gallery area of the Output tab take up all vertical space the other controls
on the tab do not.
* Tidy up the controls on the Config tab somewhat.

* Studio2/SD/UI: Reorganise inputs on Left Panel of SD tab

* Rename previously added Right Panel Output tab to 'Generate'.
* Move Batch Count, Batch Size, and Repeatable Seeds, off of Left Panel and onto 'Generate' Tab.
* On 'Generate' tab, rename 'Generate Image(s)' button to 'Start', and 'Stop Batch' button to 'Stop'. They are now below the Batch inputs on a Generate tab so don't need the specificity.
* Move Device, Low VRAM, and Precision inputs into their own 'Device Settings' Accordion control. (starts closed)
* Rename 'Custom Weights Checkpoint' to 'Checkpoint Weights'
* Move Checkpoint Weights, VAE Model, Standalone Lora Weights, and Embeddings Options controls, into their own 'Model Weights' Accordion control.  (starts closed)
* Move Denoising Strength, and Resample Type controls into their own 'Input Image Processing' Accordion. (starts closed)
* Move any remaining controls in the 'Advanced Options' Accordion directly onto the left panel, and remove the Accordion.
* Enable the copy button for all text boxes on the SD tab.
* Add emoji/unicode glyphs to all top level controls and Accordions on the SD Left Panel.
* Start with the 'Generate' as the initially selected tab in the SD Right Panel, working around Gradio issue #7805
* Tweaks to SD Right Tab Panel vertical height.

* Studio2/SD/UI: Sizing tweaks for Right Panel, and >1920 width

* Set height of right panel using vmin rather than vh, with explicit affordances
for fixed areas above and below.
* Port >1920 width Gradio >4.8 CSS workaround from Shark 1.0.

* Studio2/SD: Fix sd pipeline up to "Windows not supported" (#2082)

* Studio2/SD: Fix sd pipeline up to "Windows not supported"

A number of fixes to the SD pipeline as run from the UI, up until the point that dynamo
complains "Windows not yet supported for torch.compile".

* Remove separate install of iree-runtime and iree-compile in setup_venv.ps1, and rely on the
versions installed via the Turbine requirements.txt. Fixes #2063 for me.
* Replace any "None" strings with python None when pulling the config in the UI.
* Add 'hf_auth_token' param to api StableDiffusion class, defaulting to None, and then pass
that in to the various Models where it is required and wasn't already being done before.
* Fix clip custom_weight_params being passed to export_clip_model as "external_weight_file"
rather than "external_weights"
* Don't pass non-existing "custom_vae" parameter to the Turbine Vae Model, instead
pass custom_vae as the "hf_model_id" if it is set. (this may be wrong in the custom vae
case, but stops the code *always* breaking).

* Studio2/SD/UI: Improve UI config None handling

* When populating the UI from a JSON Config set controls to "None" for null/None
values.
* When generating a JSON Config from the UI set props to null/None for controls
set to "None".
* Use null rather than string 'None' in the default config

---------

Co-authored-by: Ean Garvey <87458719+monorimet@users.noreply.github.com>

* Studio2/SD/UI: Further sd ui pipeline fixes (#2091)

On Windows, this gets us all the way to failing in iree compile with SD 2.1 base.

- Fix merge errors with sd right pane config UI tab.
- Remove non-requirement.txt install/build of torch/mlir/iree/SRT in setup_venv.ps1, fixing "torch.compile not supported on Windows" error.
- Fix gradio deprecation warning for `root=` FileExplorer kwarg.
- Comment out `precision` and `max_length` kwargs being passed to unet, as not yet supported on main Turbine branch. Avoids keyword argument error.

* Tweak compile-time flags for SD submodels.

* Small fixes to sd, pin mpmath

* Add pyinstaller spec and imports script.

* Fix the .exe (#2101)

* Fix _IREE_TARGET_MAP (#2103) (#2108)

- Change target passed to iree for vulkan from 'vulkan'
to 'vulkan-spirv', as 'vulkan' is not a valid value for
--iree-hal-target-backends with the current iree compiler.

Co-authored-by: Stefan Kapusniak <121311569+one-lithe-rune@users.noreply.github.com>

* Cleanup sd model map.

* Update dependencies.

* Studio2/SD/UI: Update gradio to 4.19.2 (sd-studio2) (#2097)

- Move pin for gradio from 4.15 -> 4.19.2 on the sd-studio2 branch

* fix formatting and disable explicit vulkan env settings.

---------

Co-authored-by: Stefan Kapusniak <121311569+one-lithe-rune@users.noreply.github.com>
Co-authored-by: Stanley Winata <68087699+raikonenfnu@users.noreply.github.com>
Co-authored-by: gpetters-amd <159576198+gpetters-amd@users.noreply.github.com>
Co-authored-by: gpetters94 <gpetters@protonmail.com>
2024-03-29 18:13:21 -04:00
dependabot[bot]
5a5de545c9 Bump gradio from 3.34.0 to 4.19.2 in /dataset (#2093)
Bumps [gradio](https://github.com/gradio-app/gradio) from 3.34.0 to 4.19.2.
- [Release notes](https://github.com/gradio-app/gradio/releases)
- [Changelog](https://github.com/gradio-app/gradio/blob/main/CHANGELOG.md)
- [Commits](https://github.com/gradio-app/gradio/compare/v3.34.0...gradio@4.19.2)

---
updated-dependencies:
- dependency-name: gradio
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Ean Garvey <87458719+monorimet@users.noreply.github.com>
2024-03-28 10:01:26 -05:00
Stefan Kapusniak
58f194a450 Fix _IREE_TARGET_MAP (#2103)
- Change target passed to iree for vulkan from 'vulkan'
to 'vulkan-spirv', as 'vulkan' is not a valid value for
--iree-hal-target-backends with the current iree compiler.
2024-03-18 00:21:44 -05:00
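A minimal sketch of the kind of mapping this commit fixes, assuming _IREE_TARGET_MAP is a plain dict from SHARK device names to --iree-hal-target-backends values; every entry except the vulkan one is illustrative rather than the repository's actual table:

```python
# Illustrative sketch only -- not SHARK's actual table.
_IREE_TARGET_MAP = {
    "cpu": "llvm-cpu",
    "cuda": "cuda",
    "vulkan": "vulkan-spirv",  # was "vulkan", which the IREE compiler rejects
    "rocm": "rocm",
}

def iree_target_backend(device: str) -> str:
    # e.g. passed to the compiler as --iree-hal-target-backends=<value>
    return _IREE_TARGET_MAP[device]
```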
Stefan Kapusniak
c5cf005292 Add *.safetensors to .gitignore (#2089)
- shark2 is putting base model .safetensors files in model-specific subfolders. Easiest to just ignore
.safetensors completely.
2024-02-17 21:37:47 -06:00
Stefan Kapusniak
12094ec49c Update README to recommend SHARK-1.0 for now (#2087)
- Add a note at the top of the README to use SHARK-1.0 whilst the
rewrite with Turbine is ongoing.
- Update installation section to point at SHARK-1.0 branch.
- Suggest the latest SHARK-1.0 pre-release as well as stable.
- Recommend running the .exe from the command line.
2024-02-08 14:22:27 -06:00
Daniel Garvey
100e5b8244 address refactor in turbine (#2086)
python/turbine_models -> models
shark-turbine -> core
2024-02-05 13:05:01 -08:00
Stanley Winata
6bf51f1f1d HF-Reference LLM mode + Update test result to match latest Turbine. (#2080)
* HF-Reference LLM mode.

* Fixup test to match current output from Turbine.

* lint

* Fix test error message + Only initialize HF torch model when used.

* Remove redundant format_out change.
2024-02-01 11:46:22 -06:00
Ean Garvey
05b498267e Add StreamingLLM support to studio2 chat (#2060)
* Streaming LLM 

* Update precision and add gpu support

* (studio2) Separate weights generation for quantization support

* Adapt prompt changes to studio flow

* Remove outdated flag from llm compile flags.

* (studio2) use turbine vmfbRunner

* tweaks to prompts

* Update CPU path and llm api test.

* Change device in test to cpu.

* Fixes to runner, device names, vmfb mgmt

* Use small test without external weights.
2024-01-18 19:01:07 -06:00
Ean Garvey
fa95ed30d1 Relocate quantized matmul reassociation flag (#2047)
* Remove quantized matmul reassociation flag

This flag should be a model/use-case specific addition, not a default CPU compile flag.
2023-12-20 12:48:40 -08:00
Daniel Garvey
788cc9157c Remove SHARK 1.0 implementations (#2042)
Any reimplementation of these features should be tracked in https://github.com/nod-ai/SHARK/issues/1931.
These implementations are preserved in the SHARK-1.0 branch: https://github.com/nod-ai/SHARK/tree/SHARK-1.0
2023-12-19 11:47:18 -06:00
Daniel Garvey
ebfcfec338 remove shark 1.0 tests, add support for 2.0 llm
* add support for external weights

* add tests and edit deps
2023-12-14 21:44:37 -06:00
Stefan Kapusniak
f692a012e1 UI: Fixes for Gradio 4.7.1/4.8.0 update (#2024)
* Upgrade Gradio pin from 4.7.1 to 4.8.0.
* Make Nod AI logos visible again.
* Remove image toolbars from png import boxes.
* Set Input Images on img2img, outpaint and upscaler tabs to be upload
only.
* Change Image control to an ImageEditor control for masking on the
inpaint tab. Remove previous height restriction as this hides the
editing controls.
* Move Input Image/Masked Image on img2img, inpaint, outpaint and
upscaler tabs to be the first control on their tabs.
* Remove download buttons from all galleries as they download some
html rather than the image (gradio issue #6595)
* Remove add new row and column from Output Gallery parameters
dataframe.
* Add partial workaround for not being able to select text in the Output
Gallery parameters dataframe (gradio issue #6086)
* Fix uglified formatting of subdirectory selection dropdown, refresh
button, and open folder buttons on the Output Gallery tab.
* Force Output Gallery to use the full width of the Gallery control
for the preview overlay when an image is selected, rather than
an overlay the width of the selected image.
* Fix sendto buttons.
* Reset Inpaint ImageEditor control with the Mask Layer after generation
is complete, as it gets lost if the image was sent to the tab from
another tab rather than being uploaded. Also rework queuing and
progress rendering along this codepath. This doesn't solve the
underlying problem of the Mask Layer being removed, but does get inpaint
fully working with the Gradio update.
2023-12-14 14:56:37 -06:00
Vivek Khandelwal
3cc643b2de Add support for StableLM-3B model (#2019)
* Add support for StableLM-3B model

* Add support for Quantized StableLM-3B model

* Update stablelm_pipeline.py
2023-12-12 22:39:50 +05:30
Phaneesh Barwaria
bf70e80d20 vulkan device id fix (#2028) 2023-12-08 19:00:26 -06:00
Ean Garvey
7159698496 (Studio) Fix controlnet switching. (#2026)
* Fix controlnet switching.

* Fix txt2img + control adapters
2023-12-07 00:52:36 -06:00
gpetters94
7e12d1782a Fix stencil pipeline to use input image (#2027) 2023-12-07 00:25:18 -06:00
Ean Garvey
bb5f133e1c Many UI fixes and controlnet improvements (#2025)
* multi-controlnet UI and perf fixes

* Controlnet fixes
2023-12-06 20:10:06 -06:00
Richard Pastirčák
3af0c6c658 #1843 - Add Export Default settings button (#2016)
* #1843 - Add Export Default settings button

* #1843 reformatting unit tests

---------

Co-authored-by: Richard Pastirčák <richard.pastircak@student.tuke.sk>
2023-12-06 14:58:17 -06:00
Ean Garvey
3322b7264f (vicuna.py) Move enable_tracy_tracing outside of BenchmarkRunInfo (#2011) 2023-12-06 14:57:32 -06:00
Ean Garvey
eeb7bdd143 Fix nodlogo (#2023) 2023-12-06 14:57:16 -06:00
Ean Garvey
2d6f48821d Fix SharkEulerDiscrete (#2022) 2023-12-06 12:25:06 -06:00
Gaurav Shukla
c74b55f24e [ui] Add UI for sharding
Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>
2023-12-06 17:25:49 +05:30
Elias Joseph
1a723645fb finalized fixes for sharded llama2 2023-12-06 15:35:29 +05:30
Eliasj42
dfdd3b1f78 improved sharded performance and fixed issue with lmhead on rocm (#2008)
* improved sharded performance and fixed issue with lmhead on rocm

* mmap shards + disable sharing of device arrays across devices

* fix device_idx for non-layer vmfbs

* fix time calc for sharded

---------

Co-authored-by: Elias Joseph <elias@nod-labs.com>
Co-authored-by: PhaneeshB <b.phaneesh@gmail.com>
2023-12-05 11:53:44 -08:00
Ean Garvey
6384780d16 Fixes to llama2 cpu compilation and studio UI, schedulers (#2013)
* Fix some issues with defaults

Fixes to llama2 cpu compilation (turns off data tiling for old argmax
mode)

---------

Co-authored-by: Max Dawkins <max.dawkins@gmail.com>
2023-12-05 11:19:19 -05:00
gpetters94
db0c53ae59 Fix zoedepth (#2010) 2023-12-05 04:31:50 -05:00
Ean Garvey
ce9ce3a7c8 (SD) Fix schedulers and multi-controlnet. (#2006)
* (SD) Fixes schedulers if receiving noise preds as numpy arrays

* Fix schedulers and stencil name

* Multicontrolnet fixes
2023-12-05 03:29:18 -06:00
Ean Garvey
d72da3801f (Studio) Update gradio and multicontrolnet UI. (#2001)
* (Studio) Update gradio and multicontrolnet UI.

* Fixes for outputgallery, exe build

* Fix image return types.

* Update Gradio to 4.7.1

* Fix send buttons and hiresfix

* Various bugfixes and SDXL additions.

* More UI fixes and txt2img_sdxl presets.

* enable SDXL-Turbo and custom models, custom VAE for sdxl

* img2img ui tweaks
2023-12-04 12:37:51 -06:00
Eliasj42
9c50edc664 fixed functionality of sharded vicuna/llama2 (#1982)
Co-authored-by: Elias Joseph <elias@nod-labs.com>
2023-12-04 09:11:52 -08:00
Abhishek Varma
a1b7110550 [SDXL] Add SDXL pipeline to SHARK (#1941)
* [SDXL] Add SDXL pipeline to SHARK

-- This commit adds SDXL pipeline to SHARK.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>

* (SDXL) Fix --ondemand and vae scale factor use, and fix VAE flags.

---------

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
Co-authored-by: Ean Garvey <87458719+monorimet@users.noreply.github.com>
2023-12-02 03:15:15 -06:00
189 changed files with 5648 additions and 43799 deletions

@@ -50,10 +50,11 @@ jobs:
shell: powershell
run: |
./setup_venv.ps1
$env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
pip wheel -v -w dist . --pre -f https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html
python process_skipfiles.py
pyinstaller .\apps\stable_diffusion\shark_sd.spec
$env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
pip install -e .
pip freeze -l
pyinstaller .\apps\shark_studio\shark_studio.spec
mv ./dist/nodai_shark_studio.exe ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
signtool sign /f c:\g\shark_02152023.cer /fd certHash /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
@@ -74,80 +75,3 @@ jobs:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
release_id: ${{ steps.create_release.outputs.id }}
linux-build:
runs-on: a100
strategy:
fail-fast: false
matrix:
python-version: ["3.11"]
backend: [IREE, SHARK]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Setup pip cache
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
- name: Install dependencies
run: |
echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
python -m pip install --upgrade pip
python -m pip install flake8 pytest toml
if [ -f requirements.txt ]; then pip install -r requirements.txt -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude shark.venv,lit.cfg.py
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude shark.venv,lit.cfg.py
- name: Build and validate the IREE package
if: ${{ matrix.backend == 'IREE' }}
continue-on-error: true
run: |
cd $GITHUB_WORKSPACE
USE_IREE=1 VENV_DIR=iree.venv ./setup_venv.sh
source iree.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://openxla.github.io/iree/pip-release-links.html
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
/bin/bash "$GITHUB_WORKSPACE/build_tools/populate_sharktank_ci.sh"
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="./gen_shark_tank/" -k "not metal" |
tail -n 1 |
tee -a pytest_results.txt
if !(grep -Fxq " failed" pytest_results.txt)
then
export SHA=$(git log -1 --format='%h')
gsutil -m cp -r $GITHUB_WORKSPACE/gen_shark_tank/* gs://shark_tank/${DATE}_$SHA
gsutil -m cp -r gs://shark_tank/${DATE}_$SHA/* gs://shark_tank/nightly/
fi
rm -rf ./wheelhouse/nodai*
- name: Build and validate the SHARK Runtime package
if: ${{ matrix.backend == 'SHARK' }}
run: |
cd $GITHUB_WORKSPACE
./setup_venv.sh
source shark.venv/bin/activate
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
SHARK_PACKAGE_VERSION=${package_version} \
pip wheel -v -w wheelhouse . --pre -f https://download.pytorch.org/whl/nightly/torch -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html
# Install the built wheel
pip install ./wheelhouse/nodai*
# Validate the Models
pytest --ci --ci_sha=${SHORT_SHA} -k "not metal" |
tail -n 1 |
tee -a pytest_results.txt

@@ -1,164 +0,0 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Validate Models on Shark Runtime
on:
push:
branches: [ main ]
paths-ignore:
- '**.md'
- 'shark/examples/**'
pull_request:
branches: [ main ]
paths-ignore:
- '**.md'
- 'shark/examples/**'
workflow_dispatch:
# Ensure that only a single job or workflow using the same
# concurrency group will run at a time. This would cancel
# any in-progress jobs in the same github workflow and github
# ref (e.g. refs/heads/main or refs/pull/<pr_number>/merge).
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build-validate:
strategy:
fail-fast: true
matrix:
os: [7950x, icelake, a100, MacStudio, ubuntu-latest]
suite: [cpu,cuda,vulkan]
python-version: ["3.11"]
include:
- os: ubuntu-latest
suite: lint
- os: MacStudio
suite: metal
exclude:
- os: ubuntu-latest
suite: vulkan
- os: ubuntu-latest
suite: cuda
- os: ubuntu-latest
suite: cpu
- os: MacStudio
suite: cuda
- os: MacStudio
suite: cpu
- os: MacStudio
suite: vulkan
- os: icelake
suite: vulkan
- os: icelake
suite: cuda
- os: a100
suite: cpu
- os: 7950x
suite: cpu
- os: 7950x
suite: cuda
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- name: Set Environment Variables
if: matrix.os != '7950x'
run: |
echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
- name: Set up Python Version File ${{ matrix.python-version }}
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest' || matrix.os == 'icelake'
run: |
# See https://github.com/actions/setup-python/issues/433
echo ${{ matrix.python-version }} >> $GITHUB_WORKSPACE/.python-version
- name: Set up Python ${{ matrix.python-version }}
if: matrix.os == 'a100' || matrix.os == 'ubuntu-latest' || matrix.os == 'icelake'
uses: actions/setup-python@v4
with:
python-version: '${{ matrix.python-version }}'
#cache: 'pip'
#cache-dependency-path: |
# **/requirements-importer.txt
# **/requirements.txt
- name: Install dependencies
if: matrix.suite == 'lint'
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest toml black
- name: Lint with flake8
if: matrix.suite == 'lint'
run: |
# black format check
black --version
black --check .
# stop the build if there are Python syntax errors or undefined names
flake8 . --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --isolated --count --exit-zero --max-complexity=10 --max-line-length=127 \
--statistics --exclude lit.cfg.py
- name: Validate Models on CPU
if: matrix.suite == 'cpu'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --benchmark=native --update_tank -k cpu
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv
python build_tools/vicuna_testing.py
- name: Validate Models on NVIDIA GPU
if: matrix.suite == 'cuda'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --benchmark=native --update_tank -k cuda
gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv
gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cuda_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cuda_latest.csv
# Disabled due to black image bug
# python build_tools/stable_diffusion_testing.py --device=cuda
- name: Validate Vulkan Models (MacOS)
if: matrix.suite == 'metal' && matrix.os == 'MacStudio'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
source shark.venv/bin/activate
echo $PATH
pip list | grep -E "torch|iree"
# disabled due to a low-visibility memory issue with pytest on macos.
# pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" --tank_url="gs://shark_tank/nightly/" -k metal
- name: Validate Vulkan Models (a100)
if: matrix.suite == 'vulkan' && matrix.os == 'a100'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} IMPORTER=1 ./setup_venv.sh
source shark.venv/bin/activate
pytest --update_tank -k vulkan
python build_tools/stable_diffusion_testing.py --device=vulkan --no-exit_on_fail
- name: Validate Vulkan Models (Windows)
if: matrix.suite == 'vulkan' && matrix.os == '7950x'
run: |
./setup_venv.ps1
pytest -k vulkan -s --ci
- name: Validate Stable Diffusion Models (Windows)
if: matrix.suite == 'vulkan' && matrix.os == '7950x'
run: |
./setup_venv.ps1
python process_skipfiles.py
pyinstaller .\apps\stable_diffusion\shark_sd.spec
python build_tools/stable_diffusion_testing.py --device=vulkan

.github/workflows/test-studio.yml (new file)

@@ -0,0 +1,85 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Validate Shark Studio
on:
push:
branches: [ main ]
paths-ignore:
- '**.md'
- 'shark/examples/**'
pull_request:
branches: [ main ]
paths-ignore:
- '**.md'
- 'shark/examples/**'
workflow_dispatch:
# Ensure that only a single job or workflow using the same
# concurrency group will run at a time. This would cancel
# any in-progress jobs in the same github workflow and github
# ref (e.g. refs/heads/main or refs/pull/<pr_number>/merge).
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build-validate:
strategy:
fail-fast: true
matrix:
os: [nodai-ubuntu-builder-large]
suite: [cpu] #,cuda,vulkan]
python-version: ["3.11"]
include:
- os: nodai-ubuntu-builder-large
suite: lint
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- name: Set Environment Variables
run: |
echo "SHORT_SHA=`git rev-parse --short=4 HEAD`" >> $GITHUB_ENV
echo "DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
- name: Set up Python Version File ${{ matrix.python-version }}
run: |
echo ${{ matrix.python-version }} >> $GITHUB_WORKSPACE/.python-version
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: '${{ matrix.python-version }}'
- name: Install dependencies
if: matrix.suite == 'lint'
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest toml black
- name: Lint with flake8
if: matrix.suite == 'lint'
run: |
# black format check
black --version
black --check apps/shark_studio
# stop the build if there are Python syntax errors or undefined names
flake8 . --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --isolated --count --exit-zero --max-complexity=10 --max-line-length=127 \
--statistics --exclude lit.cfg.py
- name: Validate Models on CPU
if: matrix.suite == 'cpu'
run: |
cd $GITHUB_WORKSPACE
python${{ matrix.python-version }} -m venv shark.venv
source shark.venv/bin/activate
pip install -r requirements.txt --no-cache-dir
pip install -e .
# Disabled due to hang when exporting test llama2
# python apps/shark_studio/tests/api_test.py

.gitignore

@@ -164,7 +164,7 @@ cython_debug/
# vscode related
.vscode
# Shark related artefacts
# Shark related artifacts
*venv/
shark_tmp/
*.vmfb
@@ -172,6 +172,7 @@ shark_tmp/
tank/dict_configs.py
*.csv
reproducers/
apps/shark_studio/web/configs
# ORT related artefacts
cache_models/
@@ -183,10 +184,16 @@ generated_imgs/
# Custom model related artefacts
variants.json
/models/
*.safetensors
# models folder
apps/stable_diffusion/web/models/
# model artifacts (SHARK)
*.tempfile
*.mlir
*.vmfb
# Stencil annotators.
stencil_annotator/

@@ -2,18 +2,20 @@
High Performance Machine Learning Distribution
*We are currently rebuilding SHARK to take advantage of [Turbine](https://github.com/nod-ai/SHARK-Turbine). Until that is complete make sure you use an .exe release or a checkout of the `SHARK-1.0` branch, for a working SHARK*
[![Nightly Release](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
[![Validate torch-models on Shark Runtime](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)
<details>
<summary>Prerequisites - Drivers </summary>
#### Install your Windows hardware drivers
* [AMD RDNA Users] Download the latest driver (23.2.1 is the oldest supported) [here](https://www.amd.com/en/support).
* [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work.
* [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work.
* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)
#### Linux Drivers
* MESA / RADV drivers wont work with FP16. Please use the latest AMGPU-PRO drivers (non-pro OSS drivers also wont work) or the latest NVidia Linux Drivers.
@@ -22,23 +24,23 @@ Other users please ensure you have your latest vendor drivers and Vulkan SDK fro
</details>
### Quick Start for SHARK Stable Diffusion for Windows 10/11 Users
Install the Driver from [Prerequisites](https://github.com/nod-ai/SHARK#install-your-hardware-drivers) above
Install the Driver from (Prerequisites)[https://github.com/nod-ai/SHARK#install-your-hardware-drivers] above
Download the [stable release](https://github.com/nod-ai/shark/releases/latest)
Download the [stable release](https://github.com/nod-ai/shark/releases/latest) or the most recent [SHARK 1.0 pre-release](https://github.com/nod-ai/shark/releases).
Double click the .exe and you should have the [UI](http://localhost:8080/) in the browser.
Double click the .exe, or [run from the command line](#running) (recommended), and you should have the [UI](http://localhost:8080/) in the browser.
If you have custom models put them in a `models/` directory where the .exe is.
If you have custom models put them in a `models/` directory where the .exe is.
Enjoy.
Enjoy.
<details>
<summary>More installation notes</summary>
* We recommend that you download EXE in a new folder, whenever you download a new EXE version. If you download it in the same folder as a previous install, you must delete the old `*.vmfb` files with `rm *.vmfb`. You can also use `--clear_all` flag once to clean all the old files.
* If you recently updated the driver or this binary (EXE file), we recommend you clear all the local artifacts with `--clear_all`
* We recommend that you download EXE in a new folder, whenever you download a new EXE version. If you download it in the same folder as a previous install, you must delete the old `*.vmfb` files with `rm *.vmfb`. You can also use `--clear_all` flag once to clean all the old files.
* If you recently updated the driver or this binary (EXE file), we recommend you clear all the local artifacts with `--clear_all`
## Running
@@ -46,17 +48,22 @@ Enjoy.
* The first run may take few minutes when the models are downloaded and compiled. Your patience is appreciated. The download could be about 5GB.
* You will likely see a Windows Defender message asking you to give permission to open a web server port. Accept it.
* Open a browser to access the Stable Diffusion web server. By default, the port is 8080, so you can go to http://localhost:8080/.
* If you prefer to always run in the browser, use the `--ui=web` command argument when running the EXE.
## Stopping
* Select the command prompt that's running the EXE. Press CTRL-C and wait a moment or close the terminal.
* Select the command prompt that's running the EXE. Press CTRL-C and wait a moment or close the terminal.
</details>
<details>
<summary>Advanced Installation (Only for developers)</summary>
## Advanced Installation (Windows, Linux and macOS) for developers
### Windows 10/11 Users
* Install Git for Windows from [here](https://git-scm.com/download/win) if you don't already have it.
## Check out the code
```shell
@@ -64,14 +71,22 @@ git clone https://github.com/nod-ai/SHARK.git
cd SHARK
```
## Switch to the Correct Branch (IMPORTANT!)
Currently SHARK is being rebuilt for [Turbine](https://github.com/nod-ai/SHARK-Turbine) on the `main` branch. For now you are strongly discouraged from using `main` unless you are working on the rebuild effort, and should not expect the code there to produce a working application for Image Generation, So for now you'll need switch over to the `SHARK-1.0` branch and use the stable code.
```shell
git checkout SHARK-1.0
```
The following setup instructions assume you are on this branch.
## Setup your Python VirtualEnvironment and Dependencies
### Windows 10/11 Users
* Install the latest Python 3.11.x version from [here](https://www.python.org/downloads/windows/)
* Install Git for Windows from [here](https://git-scm.com/download/win)
#### Allow the install script to run in Powershell
```powershell
set-executionpolicy remotesigned
@@ -86,21 +101,20 @@ set-executionpolicy remotesigned
```shell
./setup_venv.sh
source shark.venv/bin/activate
source shark1.venv/bin/activate
```
### Run Stable Diffusion on your device - WebUI
#### Windows 10/11 Users
```powershell
(shark.venv) PS C:\g\shark> cd .\apps\stable_diffusion\web\
(shark.venv) PS C:\g\shark\apps\stable_diffusion\web> python .\index.py
(shark1.venv) PS C:\g\shark> cd .\apps\stable_diffusion\web\
(shark1.venv) PS C:\g\shark\apps\stable_diffusion\web> python .\index.py
```
#### Linux / macOS Users
```shell
(shark.venv) > cd apps/stable_diffusion/web
(shark.venv) > python index.py
(shark1.venv) > cd apps/stable_diffusion/web
(shark1.venv) > python index.py
```
#### Access Stable Diffusion on http://localhost:8080/?__theme=dark
@@ -114,7 +128,7 @@ source shark.venv/bin/activate
#### Windows 10/11 Users
```powershell
(shark.venv) PS C:\g\shark> python .\apps\stable_diffusion\scripts\main.py --app="txt2img" --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
(shark1.venv) PS C:\g\shark> python .\apps\stable_diffusion\scripts\main.py --app="txt2img" --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
```
#### Linux / macOS Users
@@ -142,7 +156,7 @@ Here are some samples generated:
![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware.
Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware.
<details>
@@ -205,7 +219,7 @@ python ./minilm_jit.py --device="cpu" #use cuda or vulkan or metal
If you want to use Python3.11 and with TF Import tools you can use the environment variables like:
Set `USE_IREE=1` to use upstream IREE
```
# PYTHON=python3.11 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh
# PYTHON=python3.11 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh
```
### Run any of the hundreds of SHARK tank models via the test framework
@@ -214,7 +228,7 @@ python -m shark.examples.shark_inference.resnet50_script --device="cpu" # Use g
# Or a pytest
pytest tank/test_models.py -k "MiniLM"
```
### How to use your locally built IREE / Torch-MLIR with SHARK
If you are a *Torch-mlir developer or an IREE developer* and want to test local changes you can uninstall
the provided packages with `pip uninstall torch-mlir` and / or `pip uninstall iree-compiler iree-runtime` and build locally
@@ -240,12 +254,12 @@ Now the SHARK will use your locally build Torch-MLIR repo.
## Benchmarking Dispatches
To produce benchmarks of individual dispatches, you can add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your pytest command line argument.
To produce benchmarks of individual dispatches, you can add `--dispatch_benchmarks=All --dispatch_benchmarks_dir=<output_dir>` to your pytest command line argument.
If you only want to compile specific dispatches, you can specify them with a space seperated string instead of `"All"`. E.G. `--dispatch_benchmarks="0 1 2 10"`
For example, to generate and run dispatch benchmarks for MiniLM on CUDA:
```
pytest -k "MiniLM and torch and static and cuda" --benchmark_dispatches=All -s --dispatch_benchmarks_dir=./my_dispatch_benchmarks
pytest -k "MiniLM and torch and static and cuda" --benchmark_dispatches=All -s --dispatch_benchmarks_dir=./my_dispatch_benchmarks
```
The given command will populate `<dispatch_benchmarks_dir>/<model_name>/` with an `ordered_dispatches.txt` that lists and orders the dispatches and their latencies, as well as folders for each dispatch that contain .mlir, .vmfb, and results of the benchmark for that dispatch.
@@ -264,7 +278,7 @@ shark_module = SharkInference(
Output will include:
- An ordered list ordered-dispatches.txt of all the dispatches with their runtime
- Inside the specified directory, there will be a directory for each dispatch (there will be mlir files for all dispatches, but only compiled binaries and benchmark data for the specified dispatches)
- An .mlir file containing the dispatch benchmark
- An .mlir file containing the dispatch benchmark
- A compiled .vmfb file containing the dispatch benchmark
- An .mlir file containing just the hal executable
- A compiled .vmfb file of the hal executable
@@ -332,7 +346,7 @@ result = shark_module.forward((arg0, arg1))
## Supported and Validated Models
SHARK is maintained to support the latest innovations in ML Models:
SHARK is maintained to support the latest innovations in ML Models:
| TF HuggingFace Models | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------|----------|-------------|
@@ -358,7 +372,7 @@ For a complete list of the models supported in SHARK, please refer to [tank/READ
* [Upstream IREE issues](https://github.com/google/iree/issues): Feature requests,
bugs, and other work tracking
* [Upstream IREE Discord server](https://discord.gg/26P4xW4): Daily development
* [Upstream IREE Discord server](https://discord.gg/wEWh6Z9nMU): Daily development
discussions with the core team and collaborators
* [iree-discuss email list](https://groups.google.com/forum/#!forum/iree-discuss):
Announcements, general and low-priority discussion
@@ -373,7 +387,7 @@ For a complete list of the models supported in SHARK, please refer to [tank/READ
* Weekly meetings on Mondays 9AM PST. See [here](https://discourse.llvm.org/t/community-meeting-developer-hour-refactoring-recurring-meetings/62575) for more information.
* [MLIR topic within LLVM Discourse](https://llvm.discourse.group/c/llvm-project/mlir/31) SHARK and IREE is enabled by and heavily relies on [MLIR](https://mlir.llvm.org).
</details>
## License
nod.ai SHARK is licensed under the terms of the Apache 2.0 License with LLVM Exceptions.

@@ -1,16 +0,0 @@
## CodeGen Setup using SHARK-server
### Setup Server
- clone SHARK and setup the venv
- host the server using `python apps/stable_diffusion/web/index.py --api --server_port=<PORT>`
- default server address is `http://0.0.0.0:8080`
### Setup Client
1. fauxpilot-vscode (VSCode Extension):
- Code for the extension can be found [here](https://github.com/Venthe/vscode-fauxpilot)
- PreReq: VSCode extension (will need [`nodejs` and `npm`](https://nodejs.org/en/download) to compile and run the extension)
- Compile and Run the extension on VSCode (press F5 on VSCode), this opens a new VSCode window with the extension running
- Open VSCode settings, search for fauxpilot in settings and modify `server : http://<IP>:<PORT>`, `Model : codegen` , `Max Lines : 30`
2. Others (REST API curl, OpenAI Python bindings) as shown [here](https://github.com/fauxpilot/fauxpilot/blob/main/documentation/client.md)
- using Github Copilot VSCode extension with SHARK-server needs more work to be functional.

@@ -1,18 +0,0 @@
# Langchain
## How to run the model
1.) Install all the dependencies by running:
```shell
pip install -r apps/language_models/langchain/langchain_requirements.txt
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libtesseract-dev libreoffice
```
2.) Create a folder named `user_path` in `apps/language_models/langchain/` directory.
Now, you are ready to use the model.
3.) To run the model, run the following command:
```shell
python apps/language_models/langchain/gen.py --cli=True
```

@@ -1,186 +0,0 @@
import copy
import torch
from evaluate_params import eval_func_param_names
from gen import Langchain
from prompter import non_hf_types
from utils import clear_torch_cache, NullContext, get_kwargs
def run_cli( # for local function:
base_model=None,
lora_weights=None,
inference_server=None,
debug=None,
chat_context=None,
examples=None,
memory_restriction_level=None,
# for get_model:
score_model=None,
load_8bit=None,
load_4bit=None,
load_half=None,
load_gptq=None,
use_safetensors=None,
infer_devices=None,
tokenizer_base_model=None,
gpu_id=None,
local_files_only=None,
resume_download=None,
use_auth_token=None,
trust_remote_code=None,
offload_folder=None,
compile_model=None,
# for some evaluate args
stream_output=None,
prompt_type=None,
prompt_dict=None,
temperature=None,
top_p=None,
top_k=None,
num_beams=None,
max_new_tokens=None,
min_new_tokens=None,
early_stopping=None,
max_time=None,
repetition_penalty=None,
num_return_sequences=None,
do_sample=None,
chat=None,
langchain_mode=None,
langchain_action=None,
document_choice=None,
top_k_docs=None,
chunk=None,
chunk_size=None,
# for evaluate kwargs
src_lang=None,
tgt_lang=None,
concurrency_count=None,
save_dir=None,
sanitize_bot_response=None,
model_state0=None,
max_max_new_tokens=None,
is_public=None,
max_max_time=None,
raise_generate_gpu_exceptions=None,
load_db_if_exists=None,
dbs=None,
user_path=None,
detect_user_path_changes_every_query=None,
use_openai_embedding=None,
use_openai_model=None,
hf_embedding_model=None,
db_type=None,
n_jobs=None,
first_para=None,
text_limit=None,
verbose=None,
cli=None,
reverse_docs=None,
use_cache=None,
auto_reduce_chunks=None,
max_chunks=None,
model_lock=None,
force_langchain_evaluate=None,
model_state_none=None,
# unique to this function:
cli_loop=None,
):
Langchain.check_locals(**locals())
score_model = "" # FIXME: For now, so user doesn't have to pass
n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
device = "cpu" if n_gpus == 0 else "cuda"
context_class = NullContext if n_gpus > 1 or n_gpus == 0 else torch.device
with context_class(device):
from functools import partial
# get score model
smodel, stokenizer, sdevice = Langchain.get_score_model(
reward_type=True,
**get_kwargs(
Langchain.get_score_model,
exclude_names=["reward_type"],
**locals()
)
)
model, tokenizer, device = Langchain.get_model(
reward_type=False,
**get_kwargs(
Langchain.get_model, exclude_names=["reward_type"], **locals()
)
)
model_dict = dict(
base_model=base_model,
tokenizer_base_model=tokenizer_base_model,
lora_weights=lora_weights,
inference_server=inference_server,
prompt_type=prompt_type,
prompt_dict=prompt_dict,
)
model_state = dict(model=model, tokenizer=tokenizer, device=device)
model_state.update(model_dict)
my_db_state = [None]
fun = partial(
Langchain.evaluate,
model_state,
my_db_state,
**get_kwargs(
Langchain.evaluate,
exclude_names=["model_state", "my_db_state"]
+ eval_func_param_names,
**locals()
)
)
example1 = examples[-1] # pick reference example
all_generations = []
while True:
clear_torch_cache()
instruction = input("\nEnter an instruction: ")
if instruction == "exit":
break
eval_vars = copy.deepcopy(example1)
eval_vars[eval_func_param_names.index("instruction")] = eval_vars[
eval_func_param_names.index("instruction_nochat")
] = instruction
eval_vars[eval_func_param_names.index("iinput")] = eval_vars[
eval_func_param_names.index("iinput_nochat")
] = "" # no input yet
eval_vars[
eval_func_param_names.index("context")
] = "" # no context yet
# grab other parameters, like langchain_mode
for k in eval_func_param_names:
if k in locals():
eval_vars[eval_func_param_names.index(k)] = locals()[k]
gener = fun(*tuple(eval_vars))
outr = ""
res_old = ""
for gen_output in gener:
res = gen_output["response"]
extra = gen_output["sources"]
if base_model not in non_hf_types or base_model in ["llama"]:
if not stream_output:
print(res)
else:
# then stream output for gradio that has full output each generation, so need here to show only new chars
diff = res[len(res_old) :]
print(diff, end="", flush=True)
res_old = res
outr = res # don't accumulate
else:
outr += res # just is one thing
if extra:
# show sources at end after model itself had streamed to std rest of response
print(extra, flush=True)
all_generations.append(outr + "\n")
if not cli_loop:
break
return all_generations

File diff suppressed because it is too large.

@@ -1,103 +0,0 @@
from enum import Enum
class PromptType(Enum):
custom = -1
plain = 0
instruct = 1
quality = 2
human_bot = 3
dai_faq = 4
summarize = 5
simple_instruct = 6
instruct_vicuna = 7
instruct_with_end = 8
human_bot_orig = 9
prompt_answer = 10
open_assistant = 11
wizard_lm = 12
wizard_mega = 13
instruct_vicuna2 = 14
instruct_vicuna3 = 15
wizard2 = 16
wizard3 = 17
instruct_simple = 18
wizard_vicuna = 19
openai = 20
openai_chat = 21
gptj = 22
prompt_answer_openllama = 23
vicuna11 = 24
mptinstruct = 25
mptchat = 26
falcon = 27
class DocumentChoices(Enum):
All_Relevant = 0
All_Relevant_Only_Sources = 1
Only_All_Sources = 2
Just_LLM = 3
non_query_commands = [
DocumentChoices.All_Relevant_Only_Sources.name,
DocumentChoices.Only_All_Sources.name,
]
class LangChainMode(Enum):
"""LangChain mode"""
DISABLED = "Disabled"
CHAT_LLM = "ChatLLM"
LLM = "LLM"
ALL = "All"
WIKI = "wiki"
WIKI_FULL = "wiki_full"
USER_DATA = "UserData"
MY_DATA = "MyData"
GITHUB_H2OGPT = "github h2oGPT"
H2O_DAI_DOCS = "DriverlessAI docs"
class LangChainAction(Enum):
"""LangChain action"""
QUERY = "Query"
# WIP:
# SUMMARIZE_MAP = "Summarize_map_reduce"
SUMMARIZE_MAP = "Summarize"
SUMMARIZE_ALL = "Summarize_all"
SUMMARIZE_REFINE = "Summarize_refine"
no_server_str = no_lora_str = no_model_str = "[None/Remove]"
# from site-packages/langchain/llms/openai.py
# but needed since ChatOpenAI doesn't have this information
model_token_mapping = {
"gpt-4": 8192,
"gpt-4-0314": 8192,
"gpt-4-32k": 32768,
"gpt-4-32k-0314": 32768,
"gpt-3.5-turbo": 4096,
"gpt-3.5-turbo-16k": 16 * 1024,
"gpt-3.5-turbo-0301": 4096,
"text-ada-001": 2049,
"ada": 2049,
"text-babbage-001": 2040,
"babbage": 2049,
"text-curie-001": 2049,
"curie": 2049,
"davinci": 2049,
"text-davinci-003": 4097,
"text-davinci-002": 4097,
"code-davinci-002": 8001,
"code-davinci-001": 8001,
"code-cushman-002": 2048,
"code-cushman-001": 2048,
}
source_prefix = "Sources [Score | Link]:"
source_postfix = "End Sources<p>"

@@ -1,53 +0,0 @@
no_default_param_names = [
"instruction",
"iinput",
"context",
"instruction_nochat",
"iinput_nochat",
]
gen_hyper = [
"temperature",
"top_p",
"top_k",
"num_beams",
"max_new_tokens",
"min_new_tokens",
"early_stopping",
"max_time",
"repetition_penalty",
"num_return_sequences",
"do_sample",
]
eval_func_param_names = (
[
"instruction",
"iinput",
"context",
"stream_output",
"prompt_type",
"prompt_dict",
]
+ gen_hyper
+ [
"chat",
"instruction_nochat",
"iinput_nochat",
"langchain_mode",
"langchain_action",
"top_k_docs",
"chunk",
"chunk_size",
"document_choice",
]
)
# form evaluate defaults for submit_nochat_api
eval_func_param_names_defaults = eval_func_param_names.copy()
for k in no_default_param_names:
if k in eval_func_param_names_defaults:
eval_func_param_names_defaults.remove(k)
eval_extra_columns = ["prompt", "response", "score"]

@@ -1,846 +0,0 @@
from __future__ import annotations
from typing import (
Any,
Mapping,
Optional,
Dict,
List,
Sequence,
Tuple,
Union,
Protocol,
)
import inspect
import json
import warnings
from pathlib import Path
import yaml
from abc import ABC, abstractmethod
import langchain
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.base import BaseCallbackManager
from langchain.chains.question_answering import stuff_prompt
from langchain.prompts.base import BasePromptTemplate
from langchain.docstore.document import Document
from langchain.callbacks.manager import (
CallbackManager,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.load.serializable import Serializable
from langchain.schema import RUN_KEY, BaseMemory, RunInfo
from langchain.input import get_colored_text
from langchain.load.dump import dumpd
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import LLMResult, PromptValue
from pydantic import Extra, Field, root_validator, validator
def _get_verbosity() -> bool:
return langchain.verbose
def format_document(doc: Document, prompt: BasePromptTemplate) -> str:
"""Format a document into a string based on a prompt template."""
base_info = {"page_content": doc.page_content}
base_info.update(doc.metadata)
missing_metadata = set(prompt.input_variables).difference(base_info)
if len(missing_metadata) > 0:
required_metadata = [
iv for iv in prompt.input_variables if iv != "page_content"
]
raise ValueError(
f"Document prompt requires documents to have metadata variables: "
f"{required_metadata}. Received document with missing metadata: "
f"{list(missing_metadata)}."
)
document_info = {k: base_info[k] for k in prompt.input_variables}
return prompt.format(**document_info)
class Chain(Serializable, ABC):
"""Base interface that all chains should implement."""
memory: Optional[BaseMemory] = None
callbacks: Callbacks = Field(default=None, exclude=True)
callback_manager: Optional[BaseCallbackManager] = Field(
default=None, exclude=True
)
verbose: bool = Field(
default_factory=_get_verbosity
) # Whether to print the response text
tags: Optional[List[str]] = None
class Config:
"""Configuration for this pydantic object."""
arbitrary_types_allowed = True
@property
def _chain_type(self) -> str:
raise NotImplementedError("Saving not supported for this chain type.")
@root_validator()
def raise_deprecation(cls, values: Dict) -> Dict:
"""Raise deprecation warning if callback_manager is used."""
if values.get("callback_manager") is not None:
warnings.warn(
"callback_manager is deprecated. Please use callbacks instead.",
DeprecationWarning,
)
values["callbacks"] = values.pop("callback_manager", None)
return values
@validator("verbose", pre=True, always=True)
def set_verbose(cls, verbose: Optional[bool]) -> bool:
"""If verbose is None, set it.
This allows users to pass in None as verbose to access the global setting.
"""
if verbose is None:
return _get_verbosity()
else:
return verbose
@property
@abstractmethod
def input_keys(self) -> List[str]:
"""Input keys this chain expects."""
@property
@abstractmethod
def output_keys(self) -> List[str]:
"""Output keys this chain expects."""
def _validate_inputs(self, inputs: Dict[str, Any]) -> None:
"""Check that all inputs are present."""
missing_keys = set(self.input_keys).difference(inputs)
if missing_keys:
raise ValueError(f"Missing some input keys: {missing_keys}")
def _validate_outputs(self, outputs: Dict[str, Any]) -> None:
missing_keys = set(self.output_keys).difference(outputs)
if missing_keys:
raise ValueError(f"Missing some output keys: {missing_keys}")
@abstractmethod
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Run the logic of this chain and return the output."""
def __call__(
self,
inputs: Union[Dict[str, Any], Any],
return_only_outputs: bool = False,
callbacks: Callbacks = None,
*,
tags: Optional[List[str]] = None,
include_run_info: bool = False,
) -> Dict[str, Any]:
"""Run the logic of this chain and add to output if desired.
Args:
inputs: Dictionary of inputs, or single input if chain expects
only one param.
return_only_outputs: boolean for whether to return only outputs in the
response. If True, only new keys generated by this chain will be
returned. If False, both input keys and new keys generated by this
chain will be returned. Defaults to False.
callbacks: Callbacks to use for this chain run. If not provided, will
use the callbacks provided to the chain.
include_run_info: Whether to include run info in the response. Defaults
to False.
"""
input_docs = inputs["input_documents"]
missing_keys = set(self.input_keys).difference(inputs)
if missing_keys:
raise ValueError(f"Missing some input keys: {missing_keys}")
callback_manager = CallbackManager.configure(
callbacks, self.callbacks, self.verbose, tags, self.tags
)
run_manager = callback_manager.on_chain_start(
dumpd(self),
inputs,
)
if "is_first" in inputs.keys() and not inputs["is_first"]:
run_manager_ = run_manager
input_list = [inputs]
stop = None
prompts = []
for inputs in input_list:
selected_inputs = {
k: inputs[k] for k in self.prompt.input_variables
}
prompt = self.prompt.format_prompt(**selected_inputs)
_colored_text = get_colored_text(prompt.to_string(), "green")
_text = "Prompt after formatting:\n" + _colored_text
if run_manager_:
run_manager_.on_text(_text, end="\n", verbose=self.verbose)
if "stop" in inputs and inputs["stop"] != stop:
raise ValueError(
"If `stop` is present in any inputs, should be present in all."
)
prompts.append(prompt)
prompt_strings = [p.to_string() for p in prompts]
prompts = prompt_strings
callbacks = run_manager_.get_child() if run_manager_ else None
tags = None
"""Run the LLM on the given prompt and input."""
# If string is passed in directly no errors will be raised but outputs will
# not make sense.
if not isinstance(prompts, list):
raise ValueError(
"Argument 'prompts' is expected to be of type List[str], received"
f" argument of type {type(prompts)}."
)
params = self.llm.dict()
params["stop"] = stop
options = {"stop": stop}
disregard_cache = self.llm.cache is not None and not self.llm.cache
callback_manager = CallbackManager.configure(
callbacks,
self.llm.callbacks,
self.llm.verbose,
tags,
self.llm.tags,
)
if langchain.llm_cache is None or disregard_cache:
# This happens when langchain.llm_cache is None, but self.llm.cache is True
if self.llm.cache is not None and self.llm.cache:
raise ValueError(
"Asked to cache, but no cache found at `langchain.cache`."
)
run_manager_ = callback_manager.on_llm_start(
dumpd(self),
prompts,
invocation_params=params,
options=options,
)
generations = []
for prompt in prompts:
inputs_ = prompt
num_workers = None
batch_size = None
if num_workers is None:
if self.llm.pipeline._num_workers is None:
num_workers = 0
else:
num_workers = self.llm.pipeline._num_workers
if batch_size is None:
if self.llm.pipeline._batch_size is None:
batch_size = 1
else:
batch_size = self.llm.pipeline._batch_size
preprocess_params = {}
generate_kwargs = {}
preprocess_params.update(generate_kwargs)
forward_params = generate_kwargs
postprocess_params = {}
# Fuse __init__ params and __call__ params without modifying the __init__ ones.
preprocess_params = {
**self.llm.pipeline._preprocess_params,
**preprocess_params,
}
forward_params = {
**self.llm.pipeline._forward_params,
**forward_params,
}
postprocess_params = {
**self.llm.pipeline._postprocess_params,
**postprocess_params,
}
self.llm.pipeline.call_count += 1
if (
self.llm.pipeline.call_count > 10
and self.llm.pipeline.framework == "pt"
and self.llm.pipeline.device.type == "cuda"
):
warnings.warn(
"You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a"
" dataset",
UserWarning,
)
model_inputs = self.llm.pipeline.preprocess(
inputs_, **preprocess_params
)
model_outputs = self.llm.pipeline.forward(
model_inputs, **forward_params
)
model_outputs["process"] = False
return model_outputs
output = LLMResult(generations=generations)
run_manager_.on_llm_end(output)
if run_manager_:
output.run = RunInfo(run_id=run_manager_.run_id)
response = output
outputs = [
# Get the text of the top generated string.
{self.output_key: generation[0].text}
for generation in response.generations
][0]
run_manager.on_chain_end(outputs)
final_outputs: Dict[str, Any] = self.prep_outputs(
inputs, outputs, return_only_outputs
)
if include_run_info:
final_outputs[RUN_KEY] = RunInfo(run_id=run_manager.run_id)
return final_outputs
else:
_run_manager = (
run_manager or CallbackManagerForChainRun.get_noop_manager()
)
docs = inputs[self.input_key]
# Other keys are assumed to be needed for LLM prediction
other_keys = {
k: v for k, v in inputs.items() if k != self.input_key
}
doc_strings = [
format_document(doc, self.document_prompt) for doc in docs
]
# Join the documents together to put them in the prompt.
inputs = {
k: v
for k, v in other_keys.items()
if k in self.llm_chain.prompt.input_variables
}
inputs[self.document_variable_name] = self.document_separator.join(
doc_strings
)
inputs["is_first"] = False
inputs["input_documents"] = input_docs
# Call predict on the LLM.
output = self.llm_chain(inputs, callbacks=_run_manager.get_child())
if "process" in output.keys() and not output["process"]:
return output
output = output[self.llm_chain.output_key]
extra_return_dict = {}
extra_return_dict[self.output_key] = output
outputs = extra_return_dict
run_manager.on_chain_end(outputs)
final_outputs: Dict[str, Any] = self.prep_outputs(
inputs, outputs, return_only_outputs
)
if include_run_info:
final_outputs[RUN_KEY] = RunInfo(run_id=run_manager.run_id)
return final_outputs
def prep_outputs(
self,
inputs: Dict[str, str],
outputs: Dict[str, str],
return_only_outputs: bool = False,
) -> Dict[str, str]:
"""Validate and prep outputs."""
self._validate_outputs(outputs)
if self.memory is not None:
self.memory.save_context(inputs, outputs)
if return_only_outputs:
return outputs
else:
return {**inputs, **outputs}
def prep_inputs(
self, inputs: Union[Dict[str, Any], Any]
) -> Dict[str, str]:
"""Validate and prep inputs."""
if not isinstance(inputs, dict):
_input_keys = set(self.input_keys)
if self.memory is not None:
# If there are multiple input keys, but some get set by memory so that
# only one is not set, we can still figure out which key it is.
_input_keys = _input_keys.difference(
self.memory.memory_variables
)
if len(_input_keys) != 1:
raise ValueError(
f"A single string input was passed in, but this chain expects "
f"multiple inputs ({_input_keys}). When a chain expects "
f"multiple inputs, please call it by passing in a dictionary, "
"eg `chain({'foo': 1, 'bar': 2})`"
)
inputs = {list(_input_keys)[0]: inputs}
if self.memory is not None:
external_context = self.memory.load_memory_variables(inputs)
inputs = dict(inputs, **external_context)
self._validate_inputs(inputs)
return inputs
def apply(
self, input_list: List[Dict[str, Any]], callbacks: Callbacks = None
) -> List[Dict[str, str]]:
"""Call the chain on all inputs in the list."""
return [self(inputs, callbacks=callbacks) for inputs in input_list]
def run(
self,
*args: Any,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
**kwargs: Any,
) -> str:
"""Run the chain as text in, text out or multiple variables, text out."""
if len(self.output_keys) != 1:
raise ValueError(
f"`run` not supported when there is not exactly "
f"one output key. Got {self.output_keys}."
)
if args and not kwargs:
if len(args) != 1:
raise ValueError(
"`run` supports only one positional argument."
)
return self(args[0], callbacks=callbacks, tags=tags)[
self.output_keys[0]
]
if kwargs and not args:
return self(kwargs, callbacks=callbacks, tags=tags)[
self.output_keys[0]
]
if not kwargs and not args:
raise ValueError(
"`run` supported with either positional arguments or keyword arguments,"
" but none were provided."
)
raise ValueError(
f"`run` supported with either positional arguments or keyword arguments"
f" but not both. Got args: {args} and kwargs: {kwargs}."
)
def dict(self, **kwargs: Any) -> Dict:
"""Return dictionary representation of chain."""
if self.memory is not None:
raise ValueError("Saving of memory is not yet supported.")
_dict = super().dict()
_dict["_type"] = self._chain_type
return _dict
def save(self, file_path: Union[Path, str]) -> None:
"""Save the chain.
Args:
file_path: Path to file to save the chain to.
Example:
.. code-block:: python
chain.save(file_path="path/chain.yaml")
"""
# Convert file to Path object.
if isinstance(file_path, str):
save_path = Path(file_path)
else:
save_path = file_path
directory_path = save_path.parent
directory_path.mkdir(parents=True, exist_ok=True)
# Fetch dictionary to save
chain_dict = self.dict()
if save_path.suffix == ".json":
with open(file_path, "w") as f:
json.dump(chain_dict, f, indent=4)
elif save_path.suffix == ".yaml":
with open(file_path, "w") as f:
yaml.dump(chain_dict, f, default_flow_style=False)
else:
raise ValueError(f"{save_path} must be json or yaml")
class BaseCombineDocumentsChain(Chain, ABC):
"""Base interface for chains combining documents."""
input_key: str = "input_documents" #: :meta private:
output_key: str = "output_text" #: :meta private:
@property
def input_keys(self) -> List[str]:
"""Expect input key.
:meta private:
"""
return [self.input_key]
@property
def output_keys(self) -> List[str]:
"""Return output key.
:meta private:
"""
return [self.output_key]
def prompt_length(
self, docs: List[Document], **kwargs: Any
) -> Optional[int]:
"""Return the prompt length given the documents passed in.
Returns None if the method does not depend on the prompt length.
"""
return None
def _call(
self,
inputs: Dict[str, List[Document]],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, str]:
_run_manager = (
run_manager or CallbackManagerForChainRun.get_noop_manager()
)
docs = inputs[self.input_key]
# Other keys are assumed to be needed for LLM prediction
other_keys = {k: v for k, v in inputs.items() if k != self.input_key}
doc_strings = [
format_document(doc, self.document_prompt) for doc in docs
]
# Join the documents together to put them in the prompt.
inputs = {
k: v
for k, v in other_keys.items()
if k in self.llm_chain.prompt.input_variables
}
inputs[self.document_variable_name] = self.document_separator.join(
doc_strings
)
# Call predict on the LLM.
output, extra_return_dict = (
self.llm_chain(inputs, callbacks=_run_manager.get_child())[
self.llm_chain.output_key
],
{},
)
extra_return_dict[self.output_key] = output
return extra_return_dict
from pydantic import BaseModel
class Generation(Serializable):
"""Output of a single generation."""
text: str
"""Generated text output."""
generation_info: Optional[Dict[str, Any]] = None
"""Raw generation info response from the provider"""
"""May include things like reason for finishing (e.g. in OpenAI)"""
# TODO: add log probs
VALID_TASKS = ("text2text-generation", "text-generation", "summarization")
class LLMChain(Chain):
"""Chain to run queries against LLMs.
Example:
.. code-block:: python
from langchain import LLMChain, OpenAI, PromptTemplate
prompt_template = "Tell me a {adjective} joke"
prompt = PromptTemplate(
input_variables=["adjective"], template=prompt_template
)
llm = LLMChain(llm=OpenAI(), prompt=prompt)
"""
@property
def lc_serializable(self) -> bool:
return True
prompt: BasePromptTemplate
"""Prompt object to use."""
llm: BaseLanguageModel
output_key: str = "text" #: :meta private:
class Config:
"""Configuration for this pydantic object."""
extra = Extra.forbid
arbitrary_types_allowed = True
@property
def input_keys(self) -> List[str]:
"""Will be whatever keys the prompt expects.
:meta private:
"""
return self.prompt.input_variables
@property
def output_keys(self) -> List[str]:
"""Will always return text key.
:meta private:
"""
return [self.output_key]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, str]:
prompts, stop = self.prep_prompts([inputs], run_manager=run_manager)
response = self.llm.generate_prompt(
prompts,
stop,
callbacks=run_manager.get_child() if run_manager else None,
)
return self.create_outputs(response)[0]
def prep_prompts(
self,
input_list: List[Dict[str, Any]],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Tuple[List[PromptValue], Optional[List[str]]]:
"""Prepare prompts from inputs."""
stop = None
if "stop" in input_list[0]:
stop = input_list[0]["stop"]
prompts = []
for inputs in input_list:
selected_inputs = {
k: inputs[k] for k in self.prompt.input_variables
}
prompt = self.prompt.format_prompt(**selected_inputs)
_colored_text = get_colored_text(prompt.to_string(), "green")
_text = "Prompt after formatting:\n" + _colored_text
if run_manager:
run_manager.on_text(_text, end="\n", verbose=self.verbose)
if "stop" in inputs and inputs["stop"] != stop:
raise ValueError(
"If `stop` is present in any inputs, should be present in all."
)
prompts.append(prompt)
return prompts, stop
def apply(
self, input_list: List[Dict[str, Any]], callbacks: Callbacks = None
) -> List[Dict[str, str]]:
"""Utilize the LLM generate method for speed gains."""
callback_manager = CallbackManager.configure(
callbacks, self.callbacks, self.verbose
)
run_manager = callback_manager.on_chain_start(
dumpd(self),
{"input_list": input_list},
)
try:
response = self.generate(input_list, run_manager=run_manager)
except (KeyboardInterrupt, Exception) as e:
run_manager.on_chain_error(e)
raise e
outputs = self.create_outputs(response)
run_manager.on_chain_end({"outputs": outputs})
return outputs
def create_outputs(self, response: LLMResult) -> List[Dict[str, str]]:
"""Create outputs from response."""
return [
# Get the text of the top generated string.
{self.output_key: generation[0].text}
for generation in response.generations
]
def predict_and_parse(
self, callbacks: Callbacks = None, **kwargs: Any
) -> Union[str, List[str], Dict[str, Any]]:
"""Call predict and then parse the results."""
result = self.predict(callbacks=callbacks, **kwargs)
if self.prompt.output_parser is not None:
return self.prompt.output_parser.parse(result)
else:
return result
def apply_and_parse(
self, input_list: List[Dict[str, Any]], callbacks: Callbacks = None
) -> Sequence[Union[str, List[str], Dict[str, str]]]:
"""Call apply and then parse the results."""
result = self.apply(input_list, callbacks=callbacks)
return self._parse_result(result)
def _parse_result(
self, result: List[Dict[str, str]]
) -> Sequence[Union[str, List[str], Dict[str, str]]]:
if self.prompt.output_parser is not None:
return [
self.prompt.output_parser.parse(res[self.output_key])
for res in result
]
else:
return result
@property
def _chain_type(self) -> str:
return "llm_chain"
@classmethod
def from_string(cls, llm: BaseLanguageModel, template: str) -> LLMChain:
"""Create LLMChain from LLM and template."""
prompt_template = PromptTemplate.from_template(template)
return cls(llm=llm, prompt=prompt_template)
def _get_default_document_prompt() -> PromptTemplate:
return PromptTemplate(
input_variables=["page_content"], template="{page_content}"
)
class StuffDocumentsChain(BaseCombineDocumentsChain):
"""Chain that combines documents by stuffing into context."""
llm_chain: LLMChain
"""LLM wrapper to use after formatting documents."""
document_prompt: BasePromptTemplate = Field(
default_factory=_get_default_document_prompt
)
"""Prompt to use to format each document."""
document_variable_name: str
"""The variable name in the llm_chain to put the documents in.
If only one variable in the llm_chain, this need not be provided."""
document_separator: str = "\n\n"
"""The string with which to join the formatted documents"""
class Config:
"""Configuration for this pydantic object."""
extra = Extra.forbid
arbitrary_types_allowed = True
@root_validator(pre=True)
def get_default_document_variable_name(cls, values: Dict) -> Dict:
"""Get default document variable name, if not provided."""
llm_chain_variables = values["llm_chain"].prompt.input_variables
if "document_variable_name" not in values:
if len(llm_chain_variables) == 1:
values["document_variable_name"] = llm_chain_variables[0]
else:
raise ValueError(
"document_variable_name must be provided if there are "
"multiple llm_chain_variables"
)
else:
if values["document_variable_name"] not in llm_chain_variables:
raise ValueError(
f"document_variable_name {values['document_variable_name']} was "
f"not found in llm_chain input_variables: {llm_chain_variables}"
)
return values
def _get_inputs(self, docs: List[Document], **kwargs: Any) -> dict:
# Format each document according to the prompt
doc_strings = [
format_document(doc, self.document_prompt) for doc in docs
]
# Join the documents together to put them in the prompt.
inputs = {
k: v
for k, v in kwargs.items()
if k in self.llm_chain.prompt.input_variables
}
inputs[self.document_variable_name] = self.document_separator.join(
doc_strings
)
return inputs
def prompt_length(
self, docs: List[Document], **kwargs: Any
) -> Optional[int]:
"""Get the prompt length by formatting the prompt."""
inputs = self._get_inputs(docs, **kwargs)
prompt = self.llm_chain.prompt.format(**inputs)
return self.llm_chain.llm.get_num_tokens(prompt)
@property
def _chain_type(self) -> str:
return "stuff_documents_chain"
class LoadingCallable(Protocol):
"""Interface for loading the combine documents chain."""
def __call__(
self, llm: BaseLanguageModel, **kwargs: Any
) -> BaseCombineDocumentsChain:
"""Callable to load the combine documents chain."""
def _load_stuff_chain(
llm: BaseLanguageModel,
prompt: Optional[BasePromptTemplate] = None,
document_variable_name: str = "context",
verbose: Optional[bool] = None,
callback_manager: Optional[BaseCallbackManager] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> StuffDocumentsChain:
_prompt = prompt or stuff_prompt.PROMPT_SELECTOR.get_prompt(llm)
llm_chain = LLMChain(
llm=llm,
prompt=_prompt,
verbose=verbose,
callback_manager=callback_manager,
callbacks=callbacks,
)
# TODO: document prompt
return StuffDocumentsChain(
llm_chain=llm_chain,
document_variable_name=document_variable_name,
verbose=verbose,
callback_manager=callback_manager,
**kwargs,
)
def load_qa_chain(
llm: BaseLanguageModel,
chain_type: str = "stuff",
verbose: Optional[bool] = None,
callback_manager: Optional[BaseCallbackManager] = None,
**kwargs: Any,
) -> BaseCombineDocumentsChain:
"""Load question answering chain.
Args:
llm: Language Model to use in the chain.
chain_type: Type of document combining chain to use. Should be one of "stuff",
"map_reduce", "map_rerank", and "refine".
verbose: Whether chains should be run in verbose mode or not. Note that this
applies to all chains that make up the final chain.
callback_manager: Callback manager to use for the chain.
Returns:
A chain to use for question answering.
"""
loader_mapping: Mapping[str, LoadingCallable] = {
"stuff": _load_stuff_chain,
}
if chain_type not in loader_mapping:
raise ValueError(
f"Got unsupported chain type: {chain_type}. "
f"Should be one of {loader_mapping.keys()}"
)
return loader_mapping[chain_type](
llm, verbose=verbose, callback_manager=callback_manager, **kwargs
)
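# Hedged usage sketch (not part of the original file): how this patched stuff chain is
# meant to be driven. The inlined __call__ above reaches into self.llm.pipeline, so an
# LLM wrapping a HuggingFace pipeline is assumed; the pipeline itself is built elsewhere
# (e.g. in h2oai_pipeline.py) and the question/document text here is illustrative only.
def _example_load_qa_chain(hf_pipeline):
    from langchain.llms import HuggingFacePipeline
    llm = HuggingFacePipeline(pipeline=hf_pipeline)
    chain = load_qa_chain(llm, chain_type="stuff")
    docs = [Document(page_content="SHARK compiles models to vmfb for IREE.", metadata={})]
    return chain(
        {"input_documents": docs, "question": "What does SHARK compile models to?"},
        return_only_outputs=True,
    )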

File diff suppressed because it is too large


@@ -1,380 +0,0 @@
import inspect
import os
from functools import partial
from typing import Dict, Any, Optional, List
from langchain.callbacks.manager import CallbackManagerForLLMRun
from pydantic import root_validator
from langchain.llms import gpt4all
from dotenv import dotenv_values
from utils import FakeTokenizer
def get_model_tokenizer_gpt4all(base_model, **kwargs):
# defaults (some of these are generation parameters, so need to be passed in at generation time)
model_kwargs = dict(
n_threads=os.cpu_count() // 2,
temp=kwargs.get("temperature", 0.2),
top_p=kwargs.get("top_p", 0.75),
top_k=kwargs.get("top_k", 40),
n_ctx=2048 - 256,
)
env_gpt4all_file = ".env_gpt4all"
model_kwargs.update(dotenv_values(env_gpt4all_file))
# make int or float if can to satisfy types for class
for k, v in model_kwargs.items():
try:
if float(v) == int(v):
model_kwargs[k] = int(v)
else:
model_kwargs[k] = float(v)
except (TypeError, ValueError):
pass
if base_model == "llama":
if "model_path_llama" not in model_kwargs:
raise ValueError("No model_path_llama in %s" % env_gpt4all_file)
model_path = model_kwargs.pop("model_path_llama")
# FIXME: GPT4All version of llama doesn't handle new quantization, so use llama_cpp_python
from llama_cpp import Llama
# llama sets some things at init model time, not generation time
func_names = list(inspect.signature(Llama.__init__).parameters)
model_kwargs = {
k: v for k, v in model_kwargs.items() if k in func_names
}
model_kwargs["n_ctx"] = int(model_kwargs["n_ctx"])
model = Llama(model_path=model_path, **model_kwargs)
elif base_model == "gpt4all_llama":
if (
"model_name_gpt4all_llama" not in model_kwargs
and "model_path_gpt4all_llama" not in model_kwargs
):
raise ValueError(
"No model_name_gpt4all_llama or model_path_gpt4all_llama in %s"
% env_gpt4all_file
)
model_name = model_kwargs.pop("model_name_gpt4all_llama")
model_type = "llama"
from gpt4all import GPT4All as GPT4AllModel
model = GPT4AllModel(model_name=model_name, model_type=model_type)
elif base_model == "gptj":
if (
"model_name_gptj" not in model_kwargs
and "model_path_gptj" not in model_kwargs
):
raise ValueError(
"No model_name_gpt4j or model_path_gpt4j in %s"
% env_gpt4all_file
)
model_name = model_kwargs.pop("model_name_gptj")
model_type = "gptj"
from gpt4all import GPT4All as GPT4AllModel
model = GPT4AllModel(model_name=model_name, model_type=model_type)
else:
raise ValueError("No such base_model %s" % base_model)
return model, FakeTokenizer(), "cpu"
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
class H2OStreamingStdOutCallbackHandler(StreamingStdOutCallbackHandler):
def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
"""Run on new LLM token. Only available when streaming is enabled."""
# streaming to std already occurs without this
# sys.stdout.write(token)
# sys.stdout.flush()
pass
def get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=[]):
# default from class
model_kwargs = {
k: v.default
for k, v in dict(inspect.signature(cls).parameters).items()
if k not in exclude_list
}
# from our defaults
model_kwargs.update(default_kwargs)
# from user defaults
model_kwargs.update(env_kwargs)
# ensure only valid keys
func_names = list(inspect.signature(cls).parameters)
model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names}
return model_kwargs
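# Hedged illustration (not part of the original file) of how get_model_kwargs merges
# class defaults, our defaults, and user .env values; _DummyLLM and the values are
# hypothetical.
def _example_get_model_kwargs():
    class _DummyLLM:
        def __init__(self, temp=0.7, top_k=40, n_ctx=2048):
            self.temp, self.top_k, self.n_ctx = temp, top_k, n_ctx

    env_kwargs = {"top_k": "50"}  # e.g. parsed from .env_gpt4all
    default_kwargs = {"temp": 0.1, "n_predict": 256}  # n_predict dropped: not a _DummyLLM arg
    return get_model_kwargs(env_kwargs, default_kwargs, _DummyLLM)
    # -> {"temp": 0.1, "top_k": "50", "n_ctx": 2048}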
def get_llm_gpt4all(
model_name,
model=None,
max_new_tokens=256,
temperature=0.1,
repetition_penalty=1.0,
top_k=40,
top_p=0.7,
streaming=False,
callbacks=None,
prompter=None,
verbose=False,
):
assert prompter is not None
env_gpt4all_file = ".env_gpt4all"
env_kwargs = dotenv_values(env_gpt4all_file)
n_ctx = env_kwargs.pop("n_ctx", 2048 - max_new_tokens)
default_kwargs = dict(
context_erase=0.5,
n_batch=1,
n_ctx=n_ctx,
n_predict=max_new_tokens,
repeat_last_n=64 if repetition_penalty != 1.0 else 0,
repeat_penalty=repetition_penalty,
temp=temperature,
temperature=temperature,
top_k=top_k,
top_p=top_p,
use_mlock=True,
verbose=verbose,
)
if model_name == "llama":
cls = H2OLlamaCpp
model_path = (
env_kwargs.pop("model_path_llama") if model is None else model
)
model_kwargs = get_model_kwargs(
env_kwargs, default_kwargs, cls, exclude_list=["lc_kwargs"]
)
model_kwargs.update(
dict(
model_path=model_path,
callbacks=callbacks,
streaming=streaming,
prompter=prompter,
)
)
llm = cls(**model_kwargs)
llm.client.verbose = verbose
elif model_name == "gpt4all_llama":
cls = H2OGPT4All
model_path = (
env_kwargs.pop("model_path_gpt4all_llama")
if model is None
else model
)
model_kwargs = get_model_kwargs(
env_kwargs, default_kwargs, cls, exclude_list=["lc_kwargs"]
)
model_kwargs.update(
dict(
model=model_path,
backend="llama",
callbacks=callbacks,
streaming=streaming,
prompter=prompter,
)
)
llm = cls(**model_kwargs)
elif model_name == "gptj":
cls = H2OGPT4All
model_path = (
env_kwargs.pop("model_path_gptj") if model is None else model
)
model_kwargs = get_model_kwargs(
env_kwargs, default_kwargs, cls, exclude_list=["lc_kwargs"]
)
model_kwargs.update(
dict(
model=model_path,
backend="gptj",
callbacks=callbacks,
streaming=streaming,
prompter=prompter,
)
)
llm = cls(**model_kwargs)
else:
raise RuntimeError("No such model_name %s" % model_name)
return llm
class H2OGPT4All(gpt4all.GPT4All):
model: Any
prompter: Any
"""Path to the pre-trained GPT4All model file."""
@root_validator()
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that the python package exists in the environment."""
try:
if isinstance(values["model"], str):
from gpt4all import GPT4All as GPT4AllModel
full_path = values["model"]
model_path, delimiter, model_name = full_path.rpartition("/")
model_path += delimiter
values["client"] = GPT4AllModel(
model_name=model_name,
model_path=model_path or None,
model_type=values["backend"],
allow_download=False,
)
if values["n_threads"] is not None:
# set n_threads
values["client"].model.set_thread_count(
values["n_threads"]
)
else:
values["client"] = values["model"]
try:
values["backend"] = values["client"].model_type
except AttributeError:
# The below is for compatibility with GPT4All Python bindings <= 0.2.3.
values["backend"] = values["client"].model.model_type
except ImportError:
raise ValueError(
"Could not import gpt4all python package. "
"Please install it with `pip install gpt4all`."
)
return values
def _call(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs,
) -> str:
# Roughly 4 chars per token if natural language
prompt = prompt[-self.n_ctx * 4 :]
# use instruct prompting
data_point = dict(context="", instruction=prompt, input="")
prompt = self.prompter.generate_prompt(data_point)
verbose = False
if verbose:
print("_call prompt: %s" % prompt, flush=True)
# FIXME: GPT4ALl doesn't support yield during generate, so cannot support streaming except via itself to stdout
return super()._call(prompt, stop=stop, run_manager=run_manager)
from langchain.llms import LlamaCpp
class H2OLlamaCpp(LlamaCpp):
model_path: Any
prompter: Any
"""Path to the pre-trained GPT4All model file."""
@root_validator()
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that llama-cpp-python library is installed."""
if isinstance(values["model_path"], str):
model_path = values["model_path"]
model_param_names = [
"lora_path",
"lora_base",
"n_ctx",
"n_parts",
"seed",
"f16_kv",
"logits_all",
"vocab_only",
"use_mlock",
"n_threads",
"n_batch",
"use_mmap",
"last_n_tokens_size",
]
model_params = {k: values[k] for k in model_param_names}
# For backwards compatibility, only include if non-null.
if values["n_gpu_layers"] is not None:
model_params["n_gpu_layers"] = values["n_gpu_layers"]
try:
from llama_cpp import Llama
values["client"] = Llama(model_path, **model_params)
except ImportError:
raise ModuleNotFoundError(
"Could not import llama-cpp-python library. "
"Please install the llama-cpp-python library to "
"use this embedding model: pip install llama-cpp-python"
)
except Exception as e:
raise ValueError(
f"Could not load Llama model from path: {model_path}. "
f"Received error {e}"
)
else:
values["client"] = values["model_path"]
return values
def _call(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs,
) -> str:
verbose = False
# tokenize twice, just to count tokens, since llama cpp python wrapper has no way to truncate
# still have to avoid crazy sizes, else hit llama_tokenize: too many tokens -- might still hit, not fatal
prompt = prompt[-self.n_ctx * 4 :]
prompt_tokens = self.client.tokenize(b" " + prompt.encode("utf-8"))
num_prompt_tokens = len(prompt_tokens)
if num_prompt_tokens > self.n_ctx:
# conservative by using int()
chars_per_token = int(len(prompt) / num_prompt_tokens)
prompt = prompt[-self.n_ctx * chars_per_token :]
if verbose:
print(
"reducing tokens, assuming average of %s chars/token: %s"
% chars_per_token,
flush=True,
)
prompt_tokens2 = self.client.tokenize(
b" " + prompt.encode("utf-8")
)
num_prompt_tokens2 = len(prompt_tokens2)
print(
"reduced tokens from %d -> %d"
% (num_prompt_tokens, num_prompt_tokens2),
flush=True,
)
# use instruct prompting
data_point = dict(context="", instruction=prompt, input="")
prompt = self.prompter.generate_prompt(data_point)
if verbose:
print("_call prompt: %s" % prompt, flush=True)
if self.streaming:
text_callback = None
if run_manager:
text_callback = partial(
run_manager.on_llm_new_token, verbose=self.verbose
)
# parent handler of streamer expects to see prompt first else output="" and lose if prompt=None in prompter
if text_callback:
text_callback(prompt)
text = ""
for token in self.stream(
prompt=prompt, stop=stop, run_manager=run_manager
):
text_chunk = token["choices"][0]["text"]
# self.stream already calls text_callback
# if text_callback:
# text_callback(text_chunk)
text += text_chunk
return text
else:
params = self._get_parameters(stop)
params = {**params, **kwargs}
result = self.client(prompt=prompt, **params)
return result["choices"][0]["text"]
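# Hedged usage sketch (not part of the original file): wiring get_llm_gpt4all with a local
# llama.cpp model. The model path is hypothetical; when model=None the path would instead
# come from model_path_llama in .env_gpt4all, and prompter must be a Prompter built elsewhere.
def _example_llamacpp_llm(prompter):
    llm = get_llm_gpt4all(
        "llama",
        model="models/llama-7b.ggmlv3.q4_0.bin",  # hypothetical local path
        max_new_tokens=128,
        temperature=0.2,
        prompter=prompter,
    )
    return llm("Explain what a context window is in one sentence.")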

File diff suppressed because it is too large


@@ -1,93 +0,0 @@
import traceback
from typing import Callable
import os
from gradio_client.client import Job
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
from gradio_client import Client
class GradioClient(Client):
"""
Subclass of the gradio Client that automatically refreshes the client
if it detects that the gradio server has changed
"""
def __init__(self, *args, **kwargs):
self.args = args
self.kwargs = kwargs
super().__init__(*args, **kwargs)
self.server_hash = self.get_server_hash()
def get_server_hash(self):
"""
Get the server hash via super() without triggering any refresh action
Returns: git hash of the gradio server
"""
return super().submit(api_name="/system_hash").result()
def refresh_client_if_should(self):
# get current hash in order to update api_name -> fn_index map in case gradio server changed
# FIXME: Could add cli api as hash
server_hash = self.get_server_hash()
if self.server_hash != server_hash:
self.refresh_client()
self.server_hash = server_hash
else:
self.reset_session()
def refresh_client(self):
"""
Ensure every client call is independent
Also ensure map between api_name and fn_index is updated in case server changed (e.g. restarted with new code)
Returns:
"""
# need session hash to be new every time, to avoid "generator already executing"
self.reset_session()
client = Client(*self.args, **self.kwargs)
for k, v in client.__dict__.items():
setattr(self, k, v)
def submit(
self,
*args,
api_name: str | None = None,
fn_index: int | None = None,
result_callbacks: Callable | list[Callable] | None = None,
) -> Job:
# Note predict calls submit
try:
self.refresh_client_if_should()
job = super().submit(*args, api_name=api_name, fn_index=fn_index)
except Exception as e:
print("Hit e=%s" % str(e), flush=True)
# force reconfig in case only that
self.refresh_client()
job = super().submit(*args, api_name=api_name, fn_index=fn_index)
# see if immediately failed
e = job.future._exception
if e is not None:
print(
"GR job failed: %s %s"
% (str(e), "".join(traceback.format_tb(e.__traceback__))),
flush=True,
)
# force reconfig in case only that
self.refresh_client()
job = super().submit(*args, api_name=api_name, fn_index=fn_index)
e2 = job.future._exception
if e2 is not None:
print(
"GR job failed again: %s\n%s"
% (
str(e2),
"".join(traceback.format_tb(e2.__traceback__)),
),
flush=True,
)
return job
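# Hedged usage sketch (not part of the original file): pointing GradioClient at a running
# h2oGPT gradio server. The URL and api_name are assumptions; the client transparently
# re-creates itself if the server hash changes between calls.
def _example_gradio_client():
    client = GradioClient("http://localhost:7860")
    job = client.submit("Who are you?", api_name="/submit_nochat")
    return job.result()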


@@ -1,765 +0,0 @@
import os
from apps.stable_diffusion.src.utils.utils import _compile_module
from io import BytesIO
import torch_mlir
from stopping import get_stopping
from prompter import Prompter, PromptType
from transformers import TextGenerationPipeline
from transformers.pipelines.text_generation import ReturnType
from transformers.generation import (
GenerationConfig,
LogitsProcessorList,
StoppingCriteriaList,
)
import copy
import torch
from transformers import AutoConfig, AutoModelForCausalLM
import gc
from pathlib import Path
from shark.shark_inference import SharkInference
from shark.shark_downloader import download_public_file
from shark.shark_importer import import_with_fx, save_mlir
from apps.stable_diffusion.src import args
# Brevitas
from typing import List, Tuple
from brevitas_examples.common.generative.quantize import quantize_model
from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
# fmt: off
def quantmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rhs_scale: List[int], rhs_zero_point: List[int], rhs_bit_width: int, rhs_group_size: int) -> List[int]:
if len(lhs) == 3 and len(rhs) == 2:
return [lhs[0], lhs[1], rhs[0]]
elif len(lhs) == 2 and len(rhs) == 2:
return [lhs[0], rhs[0]]
else:
raise ValueError("Input shapes not supported.")
def quantmatmul_rhs_group_quant〡dtype(lhs_rank_dtype: Tuple[int, int], rhs_rank_dtype: Tuple[int, int], rhs_scale_rank_dtype: Tuple[int, int], rhs_zero_point_rank_dtype: Tuple[int, int], rhs_bit_width: int, rhs_group_size: int) -> int:
# output dtype is the dtype of the lhs float input
lhs_rank, lhs_dtype = lhs_rank_dtype
return lhs_dtype
def quantmatmul_rhs_group_quant〡has_value_semantics(lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size) -> None:
return
brevitas_matmul_rhs_group_quant_library = [
quantmatmul_rhs_group_quant〡shape,
quantmatmul_rhs_group_quant〡dtype,
quantmatmul_rhs_group_quant〡has_value_semantics]
# fmt: on
global_device = "cuda"
global_precision = "fp16"
if not args.run_docuchat_web:
args.device = global_device
args.precision = global_precision
tensor_device = "cpu" if args.device == "cpu" else "cuda"
class H2OGPTModel(torch.nn.Module):
def __init__(self, device, precision):
super().__init__()
torch_dtype = (
torch.float32
if precision == "fp32" or device == "cpu"
else torch.float16
)
device_map = {"": "cpu"} if device == "cpu" else {"": 0}
model_kwargs = {
"local_files_only": False,
"torch_dtype": torch_dtype,
"resume_download": True,
"use_auth_token": False,
"trust_remote_code": True,
"offload_folder": "offline_folder",
"device_map": device_map,
}
config = AutoConfig.from_pretrained(
"h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
use_auth_token=False,
trust_remote_code=True,
offload_folder="offline_folder",
)
self.model = AutoModelForCausalLM.from_pretrained(
"h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
config=config,
**model_kwargs,
)
if precision in ["int4", "int8"]:
print("Applying weight quantization..")
weight_bit_width = 4 if precision == "int4" else 8
quantize_model(
self.model.transformer.h,
dtype=torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=128,
quantize_weight_zero_point=False,
)
print("Weight quantization applied.")
def forward(self, input_ids, attention_mask):
input_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": None,
"use_cache": True,
}
output = self.model(
**input_dict,
return_dict=True,
output_attentions=False,
output_hidden_states=False,
)
return output.logits[:, -1, :]
class H2OGPTSHARKModel(torch.nn.Module):
def __init__(self):
super().__init__()
model_name = "h2ogpt_falcon_7b"
extended_model_name = (
model_name + "_" + args.precision + "_" + args.device
)
vmfb_path = Path(extended_model_name + ".vmfb")
mlir_path = Path(model_name + "_" + args.precision + ".mlir")
shark_module = None
need_to_compile = False
if not vmfb_path.exists():
need_to_compile = True
# Downloading VMFB from shark_tank
print("Trying to download pre-compiled vmfb from shark tank.")
download_public_file(
"gs://shark_tank/langchain/" + str(vmfb_path),
vmfb_path.absolute(),
single_file=True,
)
if vmfb_path.exists():
print(
"Pre-compiled vmfb downloaded from shark tank successfully."
)
need_to_compile = False
if need_to_compile:
if not mlir_path.exists():
print("Trying to download pre-generated mlir from shark tank.")
# Downloading MLIR from shark_tank
download_public_file(
"gs://shark_tank/langchain/" + str(mlir_path),
mlir_path.absolute(),
single_file=True,
)
if mlir_path.exists():
with open(mlir_path, "rb") as f:
bytecode = f.read()
else:
# Generating the mlir
bytecode = self.get_bytecode(tensor_device, args.precision)
shark_module = SharkInference(
mlir_module=bytecode,
device=args.device,
mlir_dialect="linalg",
)
print(f"[DEBUG] generating vmfb.")
shark_module = _compile_module(
shark_module, extended_model_name, []
)
print("Saved newly generated vmfb.")
if shark_module is None:
if vmfb_path.exists():
print("Compiled vmfb found. Loading it from: ", vmfb_path)
shark_module = SharkInference(
None, device=args.device, mlir_dialect="linalg"
)
shark_module.load_module(str(vmfb_path))
print("Compiled vmfb loaded successfully.")
else:
raise ValueError("Unable to download/generate a vmfb.")
self.model = shark_module
def get_bytecode(self, device, precision):
h2ogpt_model = H2OGPTModel(device, precision)
compilation_input_ids = torch.randint(
low=1, high=10000, size=(1, 400)
).to(device=device)
compilation_attention_mask = torch.ones(1, 400, dtype=torch.int64).to(
device=device
)
h2ogptCompileInput = (
compilation_input_ids,
compilation_attention_mask,
)
print(f"[DEBUG] generating torchscript graph")
ts_graph = import_with_fx(
h2ogpt_model,
h2ogptCompileInput,
is_f16=False,
precision=precision,
f16_input_mask=[False, False],
mlir_type="torchscript",
)
del h2ogpt_model
print(f"[DEBUG] generating torch mlir")
if precision in ["int4", "int8"]:
from torch_mlir.compiler_utils import (
run_pipeline_with_repro_report,
)
module = torch_mlir.compile(
ts_graph,
[*h2ogptCompileInput],
output_type=torch_mlir.OutputType.TORCH,
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
)
print(f"[DEBUG] converting torch to linalg")
run_pipeline_with_repro_report(
module,
"builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
)
else:
module = torch_mlir.compile(
ts_graph,
[*h2ogptCompileInput],
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
del ts_graph
print(f"[DEBUG] converting to bytecode")
bytecode_stream = BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
del module
bytecode = save_mlir(
bytecode,
model_name=f"h2ogpt_{precision}",
frontend="torch",
)
return bytecode
def forward(self, input_ids, attention_mask):
result = torch.from_numpy(
self.model(
"forward",
(input_ids.to(device="cpu"), attention_mask.to(device="cpu")),
)
).to(device=tensor_device)
return result
def decode_tokens(tokenizer, res_tokens):
for i in range(len(res_tokens)):
if not isinstance(res_tokens[i], int):
res_tokens[i] = int(res_tokens[i][0])
res_str = tokenizer.decode(res_tokens, skip_special_tokens=True)
return res_str
def generate_token(h2ogpt_shark_model, model, tokenizer, **generate_kwargs):
del generate_kwargs["max_time"]
generate_kwargs["input_ids"] = generate_kwargs["input_ids"].to(
device=tensor_device
)
generate_kwargs["attention_mask"] = generate_kwargs["attention_mask"].to(
device=tensor_device
)
truncated_input_ids = []
stopping_criteria = generate_kwargs["stopping_criteria"]
generation_config_ = GenerationConfig.from_model_config(model.config)
generation_config = copy.deepcopy(generation_config_)
model_kwargs = generation_config.update(**generate_kwargs)
logits_processor = LogitsProcessorList()
stopping_criteria = (
stopping_criteria
if stopping_criteria is not None
else StoppingCriteriaList()
)
eos_token_id = generation_config.eos_token_id
generation_config.pad_token_id = eos_token_id
(
inputs_tensor,
model_input_name,
model_kwargs,
) = model._prepare_model_inputs(
None, generation_config.bos_token_id, model_kwargs
)
model_kwargs["output_attentions"] = generation_config.output_attentions
model_kwargs[
"output_hidden_states"
] = generation_config.output_hidden_states
model_kwargs["use_cache"] = generation_config.use_cache
input_ids = (
inputs_tensor
if model_input_name == "input_ids"
else model_kwargs.pop("input_ids")
)
input_ids_seq_length = input_ids.shape[-1]
generation_config.max_length = (
generation_config.max_new_tokens + input_ids_seq_length
)
logits_processor = model._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids_seq_length,
encoder_input_ids=inputs_tensor,
prefix_allowed_tokens_fn=None,
logits_processor=logits_processor,
)
stopping_criteria = model._get_stopping_criteria(
generation_config=generation_config,
stopping_criteria=stopping_criteria,
)
logits_warper = model._get_logits_warper(generation_config)
(
input_ids,
model_kwargs,
) = model._expand_inputs_for_generation(
input_ids=input_ids,
expand_size=generation_config.num_return_sequences, # 1
is_encoder_decoder=model.config.is_encoder_decoder, # False
**model_kwargs,
)
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
eos_token_id_tensor = (
torch.tensor(eos_token_id).to(device=tensor_device)
if eos_token_id is not None
else None
)
pad_token_id = generation_config.pad_token_id
eos_token_id = eos_token_id
output_scores = generation_config.output_scores # False
return_dict_in_generate = (
generation_config.return_dict_in_generate # False
)
# init attention / hidden states / scores tuples
scores = () if (return_dict_in_generate and output_scores) else None
# keep track of which sequences are already finished
unfinished_sequences = torch.ones(
input_ids.shape[0],
dtype=torch.long,
device=input_ids.device,
)
timesRan = 0
import time
start = time.time()
print("\n")
res_tokens = []
while True:
model_inputs = model.prepare_inputs_for_generation(
input_ids, **model_kwargs
)
outputs = h2ogpt_shark_model.forward(
model_inputs["input_ids"], model_inputs["attention_mask"]
)
if args.precision == "fp16":
outputs = outputs.to(dtype=torch.float32)
next_token_logits = outputs
# pre-process distribution
next_token_scores = logits_processor(input_ids, next_token_logits)
next_token_scores = logits_warper(input_ids, next_token_scores)
# sample
probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
# finished sentences should have their next token be a padding token
if eos_token_id is not None:
if pad_token_id is None:
raise ValueError(
"If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
)
next_token = next_token * unfinished_sequences + pad_token_id * (
1 - unfinished_sequences
)
input_ids = torch.cat([input_ids, next_token[:, None]], dim=-1)
model_kwargs["past_key_values"] = None
if "attention_mask" in model_kwargs:
attention_mask = model_kwargs["attention_mask"]
model_kwargs["attention_mask"] = torch.cat(
[
attention_mask,
attention_mask.new_ones((attention_mask.shape[0], 1)),
],
dim=-1,
)
truncated_input_ids.append(input_ids[:, 0])
input_ids = input_ids[:, 1:]
model_kwargs["attention_mask"] = model_kwargs["attention_mask"][:, 1:]
new_word = tokenizer.decode(
next_token.cpu().numpy(),
add_special_tokens=False,
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)
res_tokens.append(next_token)
if new_word == "<0x0A>":
print("\n", end="", flush=True)
else:
print(f"{new_word}", end=" ", flush=True)
part_str = decode_tokens(tokenizer, res_tokens)
yield part_str
# if eos_token was found in one sentence, set sentence to finished
if eos_token_id_tensor is not None:
unfinished_sequences = unfinished_sequences.mul(
next_token.tile(eos_token_id_tensor.shape[0], 1)
.ne(eos_token_id_tensor.unsqueeze(1))
.prod(dim=0)
)
# stop when each sentence is finished
if unfinished_sequences.max() == 0 or stopping_criteria(
input_ids, scores
):
break
timesRan = timesRan + 1
end = time.time()
print(
"\n\nTime taken is {:.2f} seconds/token\n".format(
(end - start) / timesRan
)
)
torch.cuda.empty_cache()
gc.collect()
res_str = decode_tokens(tokenizer, res_tokens)
yield res_str
def pad_or_truncate_inputs(
input_ids, attention_mask, max_padding_length=400, do_truncation=False
):
inp_shape = input_ids.shape
if inp_shape[1] < max_padding_length:
# do padding
num_add_token = max_padding_length - inp_shape[1]
padded_input_ids = torch.cat(
[
torch.tensor([[11] * num_add_token]).to(device=tensor_device),
input_ids,
],
dim=1,
)
padded_attention_mask = torch.cat(
[
torch.tensor([[0] * num_add_token]).to(device=tensor_device),
attention_mask,
],
dim=1,
)
return padded_input_ids, padded_attention_mask
elif inp_shape[1] > max_padding_length or do_truncation:
# do truncation
num_remove_token = inp_shape[1] - max_padding_length
truncated_input_ids = input_ids[:, num_remove_token:]
truncated_attention_mask = attention_mask[:, num_remove_token:]
return truncated_input_ids, truncated_attention_mask
else:
return input_ids, attention_mask
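# Hedged illustration (not part of the original file) of pad_or_truncate_inputs: short
# prompts are left-padded with token id 11 and mask 0, long ones are left-truncated.
def _example_pad_or_truncate():
    ids = torch.ones(1, 3, dtype=torch.int64).to(device=tensor_device)
    mask = torch.ones(1, 3, dtype=torch.int64).to(device=tensor_device)
    padded_ids, padded_mask = pad_or_truncate_inputs(ids, mask, max_padding_length=5)
    # padded_ids.shape == (1, 5); the first two positions hold pad id 11 with mask 0
    return padded_ids, padded_mask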
class H2OTextGenerationPipeline(TextGenerationPipeline):
def __init__(
self,
*args,
debug=False,
chat=False,
stream_output=False,
sanitize_bot_response=False,
use_prompter=True,
prompter=None,
prompt_type=None,
prompt_dict=None,
max_input_tokens=2048 - 256,
**kwargs,
):
"""
HF-like pipeline, but handles instruction prompting and stopping (for some models)
:param args:
:param debug:
:param chat:
:param stream_output:
:param sanitize_bot_response:
:param use_prompter: Whether to use a prompter. If prompt_type is passed, a prompter will be created.
:param prompter: Prompter instance; can be passed if one already exists.
:param prompt_type: prompt_type, e.g. human_bot. See the prompt_type-to-model mapping in prompter.py.
If use_prompter, a prompter will be created and used.
:param prompt_dict: dict of get_prompt(, return_dict=True) for prompt_type=custom
:param max_input_tokens:
:param kwargs:
"""
super().__init__(*args, **kwargs)
self.prompt_text = None
self.use_prompter = use_prompter
self.prompt_type = prompt_type
self.prompt_dict = prompt_dict
self.prompter = prompter
if self.use_prompter:
if self.prompter is not None:
assert self.prompter.prompt_type is not None
else:
self.prompter = Prompter(
self.prompt_type,
self.prompt_dict,
debug=debug,
chat=chat,
stream_output=stream_output,
)
self.human = self.prompter.humanstr
self.bot = self.prompter.botstr
self.can_stop = True
else:
self.prompter = None
self.human = None
self.bot = None
self.can_stop = False
self.sanitize_bot_response = sanitize_bot_response
self.max_input_tokens = (
max_input_tokens # not for generate, so ok that not kwargs
)
@staticmethod
def limit_prompt(prompt_text, tokenizer, max_prompt_length=None):
verbose = bool(int(os.getenv("VERBOSE_PIPELINE", "0")))
if hasattr(tokenizer, "model_max_length"):
# model_max_length only defined for generate.py, not raw use of h2oai_pipeline.py
model_max_length = tokenizer.model_max_length
if max_prompt_length is not None:
model_max_length = min(model_max_length, max_prompt_length)
# cut at some upper likely limit to avoid excessive tokenization etc
# upper bound of 10 chars/token, e.g. special chars sometimes are long
if len(prompt_text) > model_max_length * 10:
len0 = len(prompt_text)
prompt_text = prompt_text[-model_max_length * 10 :]
if verbose:
print(
"Cut of input: %s -> %s" % (len0, len(prompt_text)),
flush=True,
)
else:
# unknown
model_max_length = None
num_prompt_tokens = None
if model_max_length is not None:
# can't wait for "hole" if not plain prompt_type, since would lose prefix like <human>:
# For https://github.com/h2oai/h2ogpt/issues/192
for trial in range(0, 3):
prompt_tokens = tokenizer(prompt_text)["input_ids"]
num_prompt_tokens = len(prompt_tokens)
if num_prompt_tokens > model_max_length:
# conservative by using int()
chars_per_token = int(len(prompt_text) / num_prompt_tokens)
# keep tail, where question is if using langchain
prompt_text = prompt_text[
-model_max_length * chars_per_token :
]
if verbose:
print(
"reducing %s tokens, assuming average of %s chars/token for %s characters"
% (
num_prompt_tokens,
chars_per_token,
len(prompt_text),
),
flush=True,
)
else:
if verbose:
print(
"using %s tokens with %s chars"
% (num_prompt_tokens, len(prompt_text)),
flush=True,
)
break
return prompt_text, num_prompt_tokens
def preprocess(
self,
prompt_text,
prefix="",
handle_long_generation=None,
**generate_kwargs,
):
(
prompt_text,
num_prompt_tokens,
) = H2OTextGenerationPipeline.limit_prompt(prompt_text, self.tokenizer)
data_point = dict(context="", instruction=prompt_text, input="")
if self.prompter is not None:
prompt_text = self.prompter.generate_prompt(data_point)
self.prompt_text = prompt_text
if handle_long_generation is None:
# forces truncation of inputs to avoid critical failure
handle_long_generation = None # disable with new approaches
return super().preprocess(
prompt_text,
prefix=prefix,
handle_long_generation=handle_long_generation,
**generate_kwargs,
)
def postprocess(
self,
model_outputs,
return_type=ReturnType.FULL_TEXT,
clean_up_tokenization_spaces=True,
):
records = super().postprocess(
model_outputs,
return_type=return_type,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
)
for rec in records:
if self.use_prompter:
outputs = rec["generated_text"]
outputs = self.prompter.get_response(
outputs,
prompt=self.prompt_text,
sanitize_bot_response=self.sanitize_bot_response,
)
elif self.bot and self.human:
outputs = (
rec["generated_text"]
.split(self.bot)[1]
.split(self.human)[0]
)
else:
outputs = rec["generated_text"]
rec["generated_text"] = outputs
print(
"prompt: %s\noutputs: %s\n\n" % (self.prompt_text, outputs),
flush=True,
)
return records
def _forward(self, model_inputs, **generate_kwargs):
if self.can_stop:
stopping_criteria = get_stopping(
self.prompt_type,
self.prompt_dict,
self.tokenizer,
self.device,
human=self.human,
bot=self.bot,
model_max_length=self.tokenizer.model_max_length,
)
generate_kwargs["stopping_criteria"] = stopping_criteria
# return super()._forward(model_inputs, **generate_kwargs)
return self.__forward(model_inputs, **generate_kwargs)
# FIXME: Copy-paste of original _forward, but removed copy.deepcopy()
# FIXME: https://github.com/h2oai/h2ogpt/issues/172
def __forward(self, model_inputs, **generate_kwargs):
input_ids = model_inputs["input_ids"]
attention_mask = model_inputs.get("attention_mask", None)
# Allow empty prompts
if input_ids.shape[1] == 0:
input_ids = None
attention_mask = None
in_b = 1
else:
in_b = input_ids.shape[0]
prompt_text = model_inputs.pop("prompt_text")
## If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying
## generate_kwargs, as some of the parameterization may come from the initialization of the pipeline.
# generate_kwargs = copy.deepcopy(generate_kwargs)
prefix_length = generate_kwargs.pop("prefix_length", 0)
if prefix_length > 0:
has_max_new_tokens = "max_new_tokens" in generate_kwargs or (
"generation_config" in generate_kwargs
and generate_kwargs["generation_config"].max_new_tokens
is not None
)
if not has_max_new_tokens:
generate_kwargs["max_length"] = (
generate_kwargs.get("max_length")
or self.model.config.max_length
)
generate_kwargs["max_length"] += prefix_length
has_min_new_tokens = "min_new_tokens" in generate_kwargs or (
"generation_config" in generate_kwargs
and generate_kwargs["generation_config"].min_new_tokens
is not None
)
if not has_min_new_tokens and "min_length" in generate_kwargs:
generate_kwargs["min_length"] += prefix_length
# BS x SL
# pad or truncate the input_ids and attention_mask
max_padding_length = 400
input_ids, attention_mask = pad_or_truncate_inputs(
input_ids, attention_mask, max_padding_length=max_padding_length
)
return_dict = {
"model": self.model,
"tokenizer": self.tokenizer,
"input_ids": input_ids,
"attention_mask": attention_mask,
"attention_mask": attention_mask,
}
return_dict = {**return_dict, **generate_kwargs}
return return_dict
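# Hedged end-to-end sketch (not part of the original file): driving generate_token with the
# SHARK-compiled model directly. The prompt and generation settings are assumptions, model
# weights must be available locally or downloadable, and real use goes through
# H2OTextGenerationPipeline above rather than calling generate_token by hand.
def _example_generate_token():
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3"
    )
    hf_model = H2OGPTModel(args.device, args.precision).model  # HF model for generation utils
    shark_model = H2OGPTSHARKModel()  # compiled vmfb wrapper used for the forward pass
    enc = tokenizer("What is weight quantization?", return_tensors="pt")
    input_ids, attention_mask = pad_or_truncate_inputs(
        enc["input_ids"].to(device=tensor_device),
        enc["attention_mask"].to(device=tensor_device),
    )
    for partial_text in generate_token(
        shark_model,
        model=hf_model,
        tokenizer=tokenizer,
        input_ids=input_ids,
        attention_mask=attention_mask,
        stopping_criteria=StoppingCriteriaList(),
        max_new_tokens=64,
        max_time=60,
    ):
        print(partial_text, flush=True)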


@@ -1,247 +0,0 @@
"""
Based upon ImageCaptionLoader in LangChain version: langchain/document_loaders/image_captions.py
But accepts preloaded model to avoid slowness in use and CUDA forking issues
Loader that loads image captions
By default, the loader utilizes the pre-trained BLIP image captioning model.
https://huggingface.co/Salesforce/blip-image-captioning-base
"""
from typing import List, Union, Any, Tuple
import requests
from langchain.docstore.document import Document
from langchain.document_loaders import ImageCaptionLoader
from utils import get_device, NullContext
import pkg_resources
try:
assert pkg_resources.get_distribution("bitsandbytes") is not None
have_bitsandbytes = True
except (pkg_resources.DistributionNotFound, AssertionError):
have_bitsandbytes = False
class H2OImageCaptionLoader(ImageCaptionLoader):
"""Loader that loads the captions of an image"""
def __init__(
self,
path_images: Union[str, List[str]] = None,
blip_processor: str = None,
blip_model: str = None,
caption_gpu=True,
load_in_8bit=True,
# True doesn't seem to work, even though https://huggingface.co/Salesforce/blip2-flan-t5-xxl#in-8-bit-precision-int8
load_half=False,
load_gptq="",
use_safetensors=False,
min_new_tokens=20,
max_tokens=50,
):
if blip_processor is None or blip_model is None:
blip_processor = "Salesforce/blip-image-captioning-base"
blip_model = "Salesforce/blip-image-captioning-base"
super().__init__(path_images, blip_processor, blip_model)
self.blip_processor = blip_processor
self.blip_model = blip_model
self.processor = None
self.model = None
self.caption_gpu = caption_gpu
self.context_class = NullContext
self.device = "cpu"
self.load_in_8bit = (
load_in_8bit and have_bitsandbytes
) # only for blip2
self.load_half = load_half
self.load_gptq = load_gptq
self.use_safetensors = use_safetensors
self.gpu_id = "auto"
# default prompt
self.prompt = "image of"
self.min_new_tokens = min_new_tokens
self.max_tokens = max_tokens
def set_context(self):
if get_device() == "cuda" and self.caption_gpu:
import torch
n_gpus = (
torch.cuda.device_count() if torch.cuda.is_available() else 0
)
if n_gpus > 0:
self.context_class = torch.device
self.device = "cuda"
def load_model(self):
try:
import transformers
except ImportError:
raise ValueError(
"`transformers` package not found, please install with "
"`pip install transformers`."
)
self.set_context()
if self.caption_gpu:
if self.gpu_id == "auto":
# blip2 has issues with multi-GPU. Error says need to somehow set language model in device map
# device_map = 'auto'
device_map = {"": 0}
else:
if self.device == "cuda":
device_map = {"": self.gpu_id}
else:
device_map = {"": "cpu"}
else:
device_map = {"": "cpu"}
import torch
with torch.no_grad():
with self.context_class(self.device):
context_class_cast = (
NullContext if self.device == "cpu" else torch.autocast
)
with context_class_cast(self.device):
if "blip2" in self.blip_processor.lower():
from transformers import (
Blip2Processor,
Blip2ForConditionalGeneration,
)
if self.load_half and not self.load_in_8bit:
self.processor = Blip2Processor.from_pretrained(
self.blip_processor, device_map=device_map
).half()
self.model = (
Blip2ForConditionalGeneration.from_pretrained(
self.blip_model, device_map=device_map
).half()
)
else:
self.processor = Blip2Processor.from_pretrained(
self.blip_processor,
load_in_8bit=self.load_in_8bit,
device_map=device_map,
)
self.model = (
Blip2ForConditionalGeneration.from_pretrained(
self.blip_model,
load_in_8bit=self.load_in_8bit,
device_map=device_map,
)
)
else:
from transformers import (
BlipForConditionalGeneration,
BlipProcessor,
)
self.load_half = False # not supported
if self.caption_gpu:
if device_map == "auto":
# Blip doesn't support device_map='auto'
if self.device == "cuda":
if self.gpu_id == "auto":
device_map = {"": 0}
else:
device_map = {"": self.gpu_id}
else:
device_map = {"": "cpu"}
else:
device_map = {"": "cpu"}
self.processor = BlipProcessor.from_pretrained(
self.blip_processor, device_map=device_map
)
self.model = (
BlipForConditionalGeneration.from_pretrained(
self.blip_model, device_map=device_map
)
)
return self
def set_image_paths(self, path_images: Union[str, List[str]]):
"""
Load from a list of image files
"""
if isinstance(path_images, str):
self.image_paths = [path_images]
else:
self.image_paths = path_images
def load(self, prompt=None) -> List[Document]:
if self.processor is None or self.model is None:
self.load_model()
results = []
for path_image in self.image_paths:
caption, metadata = self._get_captions_and_metadata(
model=self.model,
processor=self.processor,
path_image=path_image,
prompt=prompt,
)
doc = Document(page_content=caption, metadata=metadata)
results.append(doc)
return results
def _get_captions_and_metadata(
self, model: Any, processor: Any, path_image: str, prompt=None
) -> Tuple[str, dict]:
"""
Helper function for getting the captions and metadata of an image
"""
if prompt is None:
prompt = self.prompt
try:
from PIL import Image
except ImportError:
raise ValueError(
"`PIL` package not found, please install with `pip install pillow`"
)
try:
if path_image.startswith("http://") or path_image.startswith(
"https://"
):
image = Image.open(
requests.get(path_image, stream=True).raw
).convert("RGB")
else:
image = Image.open(path_image).convert("RGB")
except Exception:
raise ValueError(f"Could not get image data for {path_image}")
import torch
with torch.no_grad():
with self.context_class(self.device):
context_class_cast = (
NullContext if self.device == "cpu" else torch.autocast
)
with context_class_cast(self.device):
if self.load_half:
inputs = processor(
image, prompt, return_tensors="pt"
).half()
else:
inputs = processor(image, prompt, return_tensors="pt")
min_length = len(prompt) // 4 + self.min_new_tokens
self.max_tokens = max(self.max_tokens, min_length)
output = model.generate(
**inputs,
min_length=min_length,
max_length=self.max_tokens,
)
caption: str = processor.decode(
output[0], skip_special_tokens=True
)
prompti = caption.find(prompt)
if prompti >= 0:
caption = caption[prompti + len(prompt) :]
metadata: dict = {"image_path": path_image}
return caption, metadata
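
A minimal usage sketch for the loader above (not part of this diff); the image paths, prompt, and CPU-only settings are illustrative assumptions:

# Hypothetical usage of H2OImageCaptionLoader; paths and settings are
# illustrative, not taken from this diff.
from image_captions import H2OImageCaptionLoader

loader = H2OImageCaptionLoader(
    path_images=["cat.jpg", "https://example.com/dog.png"],
    caption_gpu=False,  # keep the sketch CPU-only
    load_in_8bit=False,
)
loader.load_model()  # loads the BLIP processor and model
docs = loader.load(prompt="image of")
for doc in docs:
    print(doc.metadata["image_path"], "->", doc.page_content)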


@@ -1,120 +0,0 @@
# for generate (gradio server) and finetune
datasets==2.13.0
sentencepiece==0.1.99
huggingface_hub==0.16.4
appdirs==1.4.4
fire==0.5.0
docutils==0.20.1
evaluate==0.4.0
rouge_score==0.1.2
sacrebleu==2.3.1
scikit-learn==1.2.2
alt-profanity-check==1.2.2
better-profanity==0.7.0
numpy==1.24.3
pandas==2.0.2
matplotlib==3.7.1
loralib==0.1.1
bitsandbytes==0.39.0
accelerate==0.20.3
peft==0.4.0
# 4.31.0+ breaks load_in_8bit=True (https://github.com/huggingface/transformers/issues/25026)
transformers==4.30.2
tokenizers==0.13.3
APScheduler==3.10.1
# optional for generate
pynvml==11.5.0
psutil==5.9.5
boto3==1.26.101
botocore==1.29.101
# optional for finetune
tensorboard==2.13.0
neptune==1.2.0
# for gradio client
gradio_client==0.2.10
beautifulsoup4==4.12.2
markdown==3.4.3
# data and testing
pytest==7.2.2
pytest-xdist==3.2.1
nltk==3.8.1
textstat==0.7.3
# pandoc==2.3
pypandoc==1.11; sys_platform == "darwin" and platform_machine == "arm64"
pypandoc_binary==1.11; platform_machine == "x86_64"
pypandoc_binary==1.11; sys_platform == "win32"
openpyxl==3.1.2
lm_dataformat==0.0.20
bioc==2.0
# falcon
einops==0.6.1
instructorembedding==1.0.1
# for gpt4all .env file, but avoid worrying about imports
python-dotenv==1.0.0
text-generation==0.6.0
# for tokenization when don't have HF tokenizer
tiktoken==0.4.0
# optional: for OpenAI endpoint or embeddings (requires key)
openai==0.27.8
# optional for chat with PDF
langchain==0.0.329
pypdf==3.17.0
# avoid textract, requires old six
#textract==1.6.5
# for HF embeddings
sentence_transformers==2.2.2
# local vector db
chromadb==0.3.25
# server vector db
#pymilvus==2.2.8
# weak url support; use if opencv etc. can't be installed. If commenting this in, comment out the unstructured[local-inference] pin below
# unstructured==0.8.1
# strong support for images
# Requires on Ubuntu: sudo apt-get install libmagic-dev poppler-utils tesseract-ocr libtesseract-dev libreoffice
unstructured[local-inference]==0.7.4
#pdf2image==1.16.3
#pytesseract==0.3.10
pillow
pdfminer.six==20221105
urllib3
requests_file
#pdf2image==1.16.3
#pytesseract==0.3.10
tabulate==0.9.0
# FYI pandoc already part of requirements.txt
# JSONLoader, but makes some trouble for some users
# jq==1.4.1
# to check licenses
# Run: pip-licenses|grep -v 'BSD\|Apache\|MIT'
pip-licenses==4.3.0
# weaviate vector db
weaviate-client==3.22.1
gpt4all==1.0.5
llama-cpp-python==0.1.73
arxiv==1.4.8
pymupdf==1.22.5 # AGPL license
# extract-msg==0.41.1 # GPL3
# sometimes unstructured fails, these work in those cases. See https://github.com/h2oai/h2ogpt/issues/320
playwright==1.36.0
# requires Chrome binary to be in path
selenium==4.10.0


@@ -1,124 +0,0 @@
from typing import List, Optional, Tuple
import torch
import transformers
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
from einops import rearrange
from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
from flash_attn.bert_padding import unpad_input, pad_input
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
) -> Tuple[
torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]
]:
"""Input shape: Batch x Time x Channel
attention_mask: [bsz, q_len]
"""
bsz, q_len, _ = hidden_states.size()
query_states = (
self.q_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
key_states = (
self.k_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
value_states = (
self.v_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
# [bsz, q_len, nh, hd]
# [bsz, nh, q_len, hd]
kv_seq_len = key_states.shape[-2]
assert past_key_value is None, "past_key_value is not supported"
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(
query_states, key_states, cos, sin, position_ids
)
# [bsz, nh, t, hd]
assert not output_attentions, "output_attentions is not supported"
assert not use_cache, "use_cache is not supported"
# Flash attention codes from
# https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
# transform the data into the format required by flash attention
qkv = torch.stack(
[query_states, key_states, value_states], dim=2
) # [bsz, nh, 3, q_len, hd]
qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
# We have disabled _prepare_decoder_attention_mask in LlamaModel
# the attention_mask should be the same as the key_padding_mask
key_padding_mask = attention_mask
if key_padding_mask is None:
qkv = rearrange(qkv, "b s ... -> (b s) ...")
max_s = q_len
cu_q_lens = torch.arange(
0,
(bsz + 1) * q_len,
step=q_len,
dtype=torch.int32,
device=qkv.device,
)
output = flash_attn_unpadded_qkvpacked_func(
qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
)
output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
else:
nheads = qkv.shape[-2]
x = rearrange(qkv, "b s three h d -> b s (three h d)")
x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
x_unpad = rearrange(
x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads
)
output_unpad = flash_attn_unpadded_qkvpacked_func(
x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
)
output = rearrange(
pad_input(
rearrange(output_unpad, "nnz h d -> nnz (h d)"),
indices,
bsz,
q_len,
),
"b s (h d) -> b s h d",
h=nheads,
)
return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, None
# Disable the transformation of the attention mask in LlamaModel as the flash attention
# requires the attention mask to be the same as the key_padding_mask
def _prepare_decoder_attention_mask(
self, attention_mask, input_shape, inputs_embeds, past_key_values_length
):
# [bsz, seq_len]
return attention_mask
def replace_llama_attn_with_flash_attn():
print(
"Replacing original LLaMa attention with flash attention", flush=True
)
transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (
_prepare_decoder_attention_mask
)
transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
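
A hedged sketch of how the monkey patch above is typically applied; the model id and dtype are illustrative, and the patch must run before the LLaMA weights are loaded so the patched forward is in place:

# Hypothetical usage of replace_llama_attn_with_flash_attn(); the model id
# and dtype are assumptions, not part of this diff.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

replace_llama_attn_with_flash_attn()  # patch attention before loading weights

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
model = AutoModelForCausalLM.from_pretrained(
    "huggyllama/llama-7b", torch_dtype=torch.float16, device_map="auto"
)
# generation now routes attention through flash_attn_unpadded_qkvpacked_func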


@@ -1,109 +0,0 @@
import functools
def get_loaders(model_name, reward_type, llama_type=None, load_gptq=""):
# NOTE: Some models need specific new prompt_type
    # E.g. t5_xxl_true_nli_mixture has input format: "premise: PREMISE_TEXT hypothesis: HYPOTHESIS_TEXT".
if load_gptq:
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
use_triton = False
functools.partial(
AutoGPTQForCausalLM.from_quantized,
quantize_config=None,
use_triton=use_triton,
)
return AutoGPTQForCausalLM.from_quantized, AutoTokenizer
if llama_type is None:
llama_type = "llama" in model_name.lower()
if llama_type:
from transformers import LlamaForCausalLM, LlamaTokenizer
return LlamaForCausalLM.from_pretrained, LlamaTokenizer
elif "distilgpt2" in model_name.lower():
from transformers import AutoModelForCausalLM, AutoTokenizer
return AutoModelForCausalLM.from_pretrained, AutoTokenizer
elif "gpt2" in model_name.lower():
from transformers import GPT2LMHeadModel, GPT2Tokenizer
return GPT2LMHeadModel.from_pretrained, GPT2Tokenizer
elif "mbart-" in model_name.lower():
from transformers import (
MBartForConditionalGeneration,
MBart50TokenizerFast,
)
return (
MBartForConditionalGeneration.from_pretrained,
MBart50TokenizerFast,
)
elif (
"t5" == model_name.lower()
or "t5-" in model_name.lower()
or "flan-" in model_name.lower()
):
from transformers import AutoTokenizer, T5ForConditionalGeneration
return T5ForConditionalGeneration.from_pretrained, AutoTokenizer
elif "bigbird" in model_name:
from transformers import (
BigBirdPegasusForConditionalGeneration,
AutoTokenizer,
)
return (
BigBirdPegasusForConditionalGeneration.from_pretrained,
AutoTokenizer,
)
elif (
"bart-large-cnn-samsum" in model_name
or "flan-t5-base-samsum" in model_name
):
from transformers import pipeline
return pipeline, "summarization"
elif (
reward_type
or "OpenAssistant/reward-model".lower() in model_name.lower()
):
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
)
return (
AutoModelForSequenceClassification.from_pretrained,
AutoTokenizer,
)
else:
from transformers import AutoTokenizer, AutoModelForCausalLM
model_loader = AutoModelForCausalLM
tokenizer_loader = AutoTokenizer
return model_loader.from_pretrained, tokenizer_loader
def get_tokenizer(
tokenizer_loader,
tokenizer_base_model,
local_files_only,
resume_download,
use_auth_token,
):
tokenizer = tokenizer_loader.from_pretrained(
tokenizer_base_model,
local_files_only=local_files_only,
resume_download=resume_download,
use_auth_token=use_auth_token,
padding_side="left",
)
tokenizer.pad_token_id = 0 # different from the eos token
# when generating, we will use the logits of right-most token to predict the next token
# so the padding should be on the left,
# e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
tokenizer.padding_side = "left" # Allow batched inference
return tokenizer
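
A brief usage sketch for get_loaders and get_tokenizer above; the model id and download flags are illustrative assumptions:

# Hypothetical usage of get_loaders / get_tokenizer; model id and flags
# are illustrative assumptions.
model_loader, tokenizer_loader = get_loaders(
    model_name="h2oai/h2ogpt-oig-oasst1-512-6_9b", reward_type=False
)
tokenizer = get_tokenizer(
    tokenizer_loader,
    "h2oai/h2ogpt-oig-oasst1-512-6_9b",
    local_files_only=False,
    resume_download=True,
    use_auth_token=False,
)
model = model_loader("h2oai/h2ogpt-oig-oasst1-512-6_9b")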


@@ -1,203 +0,0 @@
import os
from gpt_langchain import (
path_to_docs,
get_some_dbs_from_hf,
all_db_zips,
some_db_zips,
create_or_update_db,
)
from utils import get_ngpus_vis
def glob_to_db(
user_path,
chunk=True,
chunk_size=512,
verbose=False,
fail_any_exception=False,
n_jobs=-1,
url=None,
enable_captions=True,
captions_model=None,
caption_loader=None,
enable_ocr=False,
):
sources1 = path_to_docs(
user_path,
verbose=verbose,
fail_any_exception=fail_any_exception,
n_jobs=n_jobs,
chunk=chunk,
chunk_size=chunk_size,
url=url,
enable_captions=enable_captions,
captions_model=captions_model,
caption_loader=caption_loader,
enable_ocr=enable_ocr,
)
return sources1
def make_db_main(
use_openai_embedding: bool = False,
hf_embedding_model: str = None,
persist_directory: str = "db_dir_UserData",
user_path: str = "user_path",
url: str = None,
add_if_exists: bool = True,
collection_name: str = "UserData",
verbose: bool = False,
chunk: bool = True,
chunk_size: int = 512,
fail_any_exception: bool = False,
download_all: bool = False,
download_some: bool = False,
download_one: str = None,
download_dest: str = "./",
n_jobs: int = -1,
enable_captions: bool = True,
captions_model: str = "Salesforce/blip-image-captioning-base",
pre_load_caption_model: bool = False,
caption_gpu: bool = True,
enable_ocr: bool = False,
db_type: str = "chroma",
):
"""
# To make UserData db for generate.py, put pdfs, etc. into path user_path and run:
python make_db.py
# once db is made, can use in generate.py like:
python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b --langchain_mode=UserData
or zip-up the db_dir_UserData and share:
zip -r db_dir_UserData.zip db_dir_UserData
# To get all db files (except large wiki_full) do:
python make_db.py --download_some=True
# To get a single db file from HF:
python make_db.py --download_one=db_dir_DriverlessAI_docs.zip
:param use_openai_embedding: Whether to use OpenAI embedding
:param hf_embedding_model: HF embedding model to use. Like generate.py, uses 'hkunlp/instructor-large' if have GPUs, else "sentence-transformers/all-MiniLM-L6-v2"
:param persist_directory: where to persist db
    :param user_path: where to pull documents from (if None, url must be provided; if url is provided, this is ignored)
    :param url: url to generate documents from (if None, user_path must be provided)
:param add_if_exists: Add to db if already exists, but will not add duplicate sources
:param collection_name: Collection name for new db if not adding
:param verbose: whether to show verbose messages
:param chunk: whether to chunk data
:param chunk_size: chunk size for chunking
:param fail_any_exception: whether to fail if any exception hit during ingestion of files
:param download_all: whether to download all (including 23GB Wikipedia) example databases from h2o.ai HF
:param download_some: whether to download some small example databases from h2o.ai HF
:param download_one: whether to download one chosen example databases from h2o.ai HF
:param download_dest: Destination for downloads
:param n_jobs: Number of cores to use for ingesting multiple files
:param enable_captions: Whether to enable captions on images
:param captions_model: See generate.py
:param pre_load_caption_model: See generate.py
:param caption_gpu: Caption images on GPU if present
:param enable_ocr: Whether to enable OCR on images
    :param db_type: Type of db to create. Currently only 'chroma' and 'weaviate' are supported.
:return: None
"""
db = None
# match behavior of main() in generate.py for non-HF case
n_gpus = get_ngpus_vis()
if n_gpus == 0:
if hf_embedding_model is None:
# if no GPUs, use simpler embedding model to avoid cost in time
hf_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
else:
if hf_embedding_model is None:
# if still None, then set default
hf_embedding_model = "hkunlp/instructor-large"
if download_all:
print("Downloading all (and unzipping): %s" % all_db_zips, flush=True)
get_some_dbs_from_hf(download_dest, db_zips=all_db_zips)
if verbose:
print("DONE", flush=True)
return db, collection_name
elif download_some:
print(
"Downloading some (and unzipping): %s" % some_db_zips, flush=True
)
get_some_dbs_from_hf(download_dest, db_zips=some_db_zips)
if verbose:
print("DONE", flush=True)
return db, collection_name
elif download_one:
print("Downloading %s (and unzipping)" % download_one, flush=True)
get_some_dbs_from_hf(
download_dest, db_zips=[[download_one, "", "Unknown License"]]
)
if verbose:
print("DONE", flush=True)
return db, collection_name
if enable_captions and pre_load_caption_model:
        # Preload, else loading can be too slow or hit CUDA context issues on GPU.
        # Inside ingestion, this disables parallel loading of multiple other kinds of docs.
        # However, with many images, all of them are handled more quickly by the preloaded model on GPU.
from image_captions import H2OImageCaptionLoader
caption_loader = H2OImageCaptionLoader(
None,
blip_model=captions_model,
blip_processor=captions_model,
caption_gpu=caption_gpu,
).load_model()
else:
if enable_captions:
caption_loader = "gpu" if caption_gpu else "cpu"
else:
caption_loader = False
if verbose:
print("Getting sources", flush=True)
assert (
user_path is not None or url is not None
), "Can't have both user_path and url as None"
if not url:
assert os.path.isdir(user_path), (
"user_path=%s does not exist" % user_path
)
sources = glob_to_db(
user_path,
chunk=chunk,
chunk_size=chunk_size,
verbose=verbose,
fail_any_exception=fail_any_exception,
n_jobs=n_jobs,
url=url,
enable_captions=enable_captions,
captions_model=captions_model,
caption_loader=caption_loader,
enable_ocr=enable_ocr,
)
exceptions = [x for x in sources if x.metadata.get("exception")]
print("Exceptions: %s" % exceptions, flush=True)
sources = [x for x in sources if "exception" not in x.metadata]
assert len(sources) > 0, "No sources found"
db = create_or_update_db(
db_type,
persist_directory,
collection_name,
sources,
use_openai_embedding,
add_if_exists,
verbose,
hf_embedding_model,
)
assert db is not None
if verbose:
print("DONE", flush=True)
return db, collection_name
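
A hedged sketch of calling make_db_main directly from Python, mirroring the CLI defaults documented above; the paths are illustrative:

# Hypothetical direct call to make_db_main; arguments mirror the defaults
# documented in its docstring and are illustrative.
db, collection_name = make_db_main(
    user_path="user_path",
    persist_directory="db_dir_UserData",
    collection_name="UserData",
    chunk_size=512,
    verbose=True,
)
print("Created/updated collection:", collection_name)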

File diff suppressed because it is too large


@@ -1,403 +0,0 @@
"""Load Data from a MediaWiki dump xml."""
import ast
import glob
import pickle
import uuid
from typing import List, Optional
import os
import bz2
import csv
import numpy as np
import pandas as pd
import pytest
from matplotlib import pyplot as plt
from langchain.docstore.document import Document
from langchain.document_loaders import MWDumpLoader
# path where downloaded wiki files exist, to be processed
root_path = "/data/jon/h2o-llm"
def unescape(x):
try:
x = ast.literal_eval(x)
    except Exception:
try:
x = x.encode("ascii", "ignore").decode("unicode_escape")
        except Exception:
pass
return x
def get_views():
# views = pd.read_csv('wiki_page_views_more_1000month.csv')
views = pd.read_csv("wiki_page_views_more_5000month.csv")
views.index = views["title"]
views = views["views"]
views = views.to_dict()
views = {str(unescape(str(k))): v for k, v in views.items()}
views2 = {k.replace("_", " "): v for k, v in views.items()}
# views has _ but pages has " "
views.update(views2)
return views
class MWDumpDirectLoader(MWDumpLoader):
def __init__(
self,
data: str,
encoding: Optional[str] = "utf8",
title_words_limit=None,
use_views=True,
verbose=True,
):
"""Initialize with file path."""
self.data = data
self.encoding = encoding
self.title_words_limit = title_words_limit
self.verbose = verbose
if use_views:
# self.views = get_views()
# faster to use global shared values
self.views = global_views
else:
self.views = None
def load(self) -> List[Document]:
"""Load from file path."""
import mwparserfromhell
import mwxml
dump = mwxml.Dump.from_page_xml(self.data)
docs = []
for page in dump.pages:
if self.views is not None and page.title not in self.views:
if self.verbose:
print("Skipped %s low views" % page.title, flush=True)
continue
for revision in page:
if self.title_words_limit is not None:
num_words = len(" ".join(page.title.split("_")).split(" "))
if num_words > self.title_words_limit:
if self.verbose:
print("Skipped %s" % page.title, flush=True)
continue
if self.verbose:
if self.views is not None:
print(
"Kept %s views: %s"
% (page.title, self.views[page.title]),
flush=True,
)
else:
print("Kept %s" % page.title, flush=True)
code = mwparserfromhell.parse(revision.text)
text = code.strip_code(
normalize=True, collapse=True, keep_template_params=False
)
title_url = str(page.title).replace(" ", "_")
metadata = dict(
title=page.title,
source="https://en.wikipedia.org/wiki/" + title_url,
id=page.id,
redirect=page.redirect,
views=self.views[page.title]
if self.views is not None
else -1,
)
metadata = {k: v for k, v in metadata.items() if v is not None}
docs.append(Document(page_content=text, metadata=metadata))
return docs
def search_index(search_term, index_filename):
byte_flag = False
data_length = start_byte = 0
index_file = open(index_filename, "r")
csv_reader = csv.reader(index_file, delimiter=":")
for line in csv_reader:
if not byte_flag and search_term == line[2]:
start_byte = int(line[0])
byte_flag = True
elif byte_flag and int(line[0]) != start_byte:
data_length = int(line[0]) - start_byte
break
index_file.close()
return start_byte, data_length
def get_start_bytes(index_filename):
index_file = open(index_filename, "r")
csv_reader = csv.reader(index_file, delimiter=":")
start_bytes = set()
for line in csv_reader:
start_bytes.add(int(line[0]))
index_file.close()
return sorted(start_bytes)
def get_wiki_filenames():
# requires
# wget http://ftp.acc.umu.se/mirror/wikimedia.org/dumps/enwiki/20230401/enwiki-20230401-pages-articles-multistream-index.txt.bz2
base_path = os.path.join(
root_path, "enwiki-20230401-pages-articles-multistream"
)
index_file = "enwiki-20230401-pages-articles-multistream-index.txt"
index_filename = os.path.join(base_path, index_file)
wiki_filename = os.path.join(
base_path, "enwiki-20230401-pages-articles-multistream.xml.bz2"
)
return index_filename, wiki_filename
def get_documents_by_search_term(search_term):
index_filename, wiki_filename = get_wiki_filenames()
start_byte, data_length = search_index(search_term, index_filename)
with open(wiki_filename, "rb") as wiki_file:
wiki_file.seek(start_byte)
data = bz2.BZ2Decompressor().decompress(wiki_file.read(data_length))
loader = MWDumpDirectLoader(data.decode())
documents = loader.load()
return documents
def get_one_chunk(
wiki_filename,
start_byte,
end_byte,
return_file=True,
title_words_limit=None,
use_views=True,
):
data_length = end_byte - start_byte
with open(wiki_filename, "rb") as wiki_file:
wiki_file.seek(start_byte)
data = bz2.BZ2Decompressor().decompress(wiki_file.read(data_length))
loader = MWDumpDirectLoader(
data.decode(), title_words_limit=title_words_limit, use_views=use_views
)
documents1 = loader.load()
if return_file:
base_tmp = "temp_wiki"
if not os.path.isdir(base_tmp):
os.makedirs(base_tmp, exist_ok=True)
filename = os.path.join(base_tmp, str(uuid.uuid4()) + ".tmp.pickle")
with open(filename, "wb") as f:
pickle.dump(documents1, f)
return filename
return documents1
from joblib import Parallel, delayed
global_views = get_views()
def get_all_documents(small_test=2, n_jobs=None, use_views=True):
print("DO get all wiki docs: %s" % small_test, flush=True)
index_filename, wiki_filename = get_wiki_filenames()
start_bytes = get_start_bytes(index_filename)
end_bytes = start_bytes[1:]
start_bytes = start_bytes[:-1]
if small_test:
start_bytes = start_bytes[:small_test]
end_bytes = end_bytes[:small_test]
if n_jobs is None:
n_jobs = 5
else:
if n_jobs is None:
n_jobs = os.cpu_count() // 4
# default loky backend leads to name space conflict problems
return_file = True # large return from joblib hangs
documents = Parallel(n_jobs=n_jobs, verbose=10, backend="multiprocessing")(
delayed(get_one_chunk)(
wiki_filename,
start_byte,
end_byte,
return_file=return_file,
use_views=use_views,
)
for start_byte, end_byte in zip(start_bytes, end_bytes)
)
if return_file:
# then documents really are files
files = documents.copy()
documents = []
for fil in files:
with open(fil, "rb") as f:
documents.extend(pickle.load(f))
os.remove(fil)
else:
from functools import reduce
from operator import concat
documents = reduce(concat, documents)
assert isinstance(documents, list)
print("DONE get all wiki docs", flush=True)
return documents
def test_by_search_term():
search_term = "Apollo"
assert len(get_documents_by_search_term(search_term)) == 100
search_term = "Abstract (law)"
assert len(get_documents_by_search_term(search_term)) == 100
search_term = "Artificial languages"
assert len(get_documents_by_search_term(search_term)) == 100
def test_start_bytes():
index_filename, wiki_filename = get_wiki_filenames()
assert len(get_start_bytes(index_filename)) == 227850
def test_get_all_documents():
small_test = 20 # 227850
n_jobs = os.cpu_count() // 4
assert (
len(
get_all_documents(
small_test=small_test, n_jobs=n_jobs, use_views=False
)
)
== small_test * 100
)
assert (
len(
get_all_documents(
small_test=small_test, n_jobs=n_jobs, use_views=True
)
)
== 429
)
def get_one_pageviews(fil):
df1 = pd.read_csv(
fil,
sep=" ",
header=None,
names=["region", "title", "views", "foo"],
quoting=csv.QUOTE_NONE,
)
df1.index = df1["title"]
df1 = df1[df1["region"] == "en"]
df1 = df1.drop("region", axis=1)
df1 = df1.drop("foo", axis=1)
df1 = df1.drop("title", axis=1) # already index
base_tmp = "temp_wiki_pageviews"
if not os.path.isdir(base_tmp):
os.makedirs(base_tmp, exist_ok=True)
filename = os.path.join(base_tmp, str(uuid.uuid4()) + ".tmp.csv")
df1.to_csv(filename, index=True)
return filename
def test_agg_pageviews(gen_files=False):
if gen_files:
path = os.path.join(
root_path,
"wiki_pageviews/dumps.wikimedia.org/other/pageviews/2023/2023-04",
)
files = glob.glob(os.path.join(path, "pageviews*.gz"))
# files = files[:2] # test
n_jobs = os.cpu_count() // 2
csv_files = Parallel(
n_jobs=n_jobs, verbose=10, backend="multiprocessing"
)(delayed(get_one_pageviews)(fil) for fil in files)
else:
# to continue without redoing above
csv_files = glob.glob(
os.path.join(root_path, "temp_wiki_pageviews/*.csv")
)
df_list = []
for csv_file in csv_files:
print(csv_file)
df1 = pd.read_csv(csv_file)
df_list.append(df1)
df = pd.concat(df_list, axis=0)
df = df.groupby("title")["views"].sum().reset_index()
df.to_csv("wiki_page_views.csv", index=True)
def test_reduce_pageview():
filename = "wiki_page_views.csv"
df = pd.read_csv(filename)
df = df[df["views"] < 1e7]
#
plt.hist(df["views"], bins=100, log=True)
views_avg = np.mean(df["views"])
views_median = np.median(df["views"])
plt.title("Views avg: %s median: %s" % (views_avg, views_median))
plt.savefig(filename.replace(".csv", ".png"))
plt.close()
#
views_limit = 5000
df = df[df["views"] > views_limit]
filename = "wiki_page_views_more_5000month.csv"
df.to_csv(filename, index=True)
#
plt.hist(df["views"], bins=100, log=True)
views_avg = np.mean(df["views"])
views_median = np.median(df["views"])
plt.title("Views avg: %s median: %s" % (views_avg, views_median))
plt.savefig(filename.replace(".csv", ".png"))
plt.close()
@pytest.mark.skip("Only if doing full processing again, some manual steps")
def test_do_wiki_full_all():
# Install other requirements for wiki specific conversion:
# pip install -r reqs_optional/requirements_optional_wikiprocessing.txt
# Use "Transmission" in Ubuntu to get wiki dump using torrent:
# See: https://meta.wikimedia.org/wiki/Data_dump_torrents
# E.g. magnet:?xt=urn:btih:b2c74af2b1531d0b63f1166d2011116f44a8fed0&dn=enwiki-20230401-pages-articles-multistream.xml.bz2&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337
# Get index
os.system(
"wget http://ftp.acc.umu.se/mirror/wikimedia.org/dumps/enwiki/20230401/enwiki-20230401-pages-articles-multistream-index.txt.bz2"
)
# Test that can use LangChain to get docs from subset of wiki as sampled out of full wiki directly using bzip multistream
test_get_all_documents()
# Check can search wiki multistream
test_by_search_term()
# Test can get all start bytes in index
test_start_bytes()
# Get page views, e.g. for entire month of April 2023
os.system(
"wget -b -m -k -o wget.log -e robots=off https://dumps.wikimedia.org/other/pageviews/2023/2023-04/"
)
# Aggregate page views from many files into single file
test_agg_pageviews(gen_files=True)
# Reduce page views to some limit, so processing of full wiki is not too large
test_reduce_pageview()
    # Start generate.py requesting wiki_full in prep. This will use page views as referenced in get_views.
    # Note: get_views is run once as a module-level global, which is required to avoid very slow processing.
    # WARNING: Requires a lot of memory; processing used up to 300GB system RAM at peak.
"""
python generate.py --langchain_mode='wiki_full' --visible_langchain_modes="['wiki_full', 'UserData', 'MyData', 'github h2oGPT', 'DriverlessAI docs']" &> lc_out.log
"""


@@ -1,121 +0,0 @@
import torch
from transformers import StoppingCriteria, StoppingCriteriaList
from enums import PromptType
class StoppingCriteriaSub(StoppingCriteria):
def __init__(
self, stops=[], encounters=[], device="cuda", model_max_length=None
):
super().__init__()
assert (
len(stops) % len(encounters) == 0
), "Number of stops and encounters must match"
self.encounters = encounters
self.stops = [stop.to(device) for stop in stops]
self.num_stops = [0] * len(stops)
self.model_max_length = model_max_length
def __call__(
self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
) -> bool:
for stopi, stop in enumerate(self.stops):
if torch.all((stop == input_ids[0][-len(stop) :])).item():
self.num_stops[stopi] += 1
if (
self.num_stops[stopi]
>= self.encounters[stopi % len(self.encounters)]
):
# print("Stopped", flush=True)
return True
if (
self.model_max_length is not None
and input_ids[0].shape[0] >= self.model_max_length
):
# critical limit
return True
# print("Tokens: %s" % input_ids[0].cpu().numpy(), flush=True)
# print("Stop Tokens: %s" % [x.cpu().numpy() for x in self.stops], flush=True)
return False
def get_stopping(
prompt_type,
prompt_dict,
tokenizer,
device,
human="<human>:",
bot="<bot>:",
model_max_length=None,
):
# FIXME: prompt_dict unused currently
if prompt_type in [
PromptType.human_bot.name,
PromptType.instruct_vicuna.name,
PromptType.instruct_with_end.name,
]:
if prompt_type == PromptType.human_bot.name:
# encounters = [prompt.count(human) + 1, prompt.count(bot) + 1]
# stopping only starts once output is beyond prompt
# 1 human is enough to trigger, but need 2 bots, because very first view back will be bot we added
stop_words = [human, bot, "\n" + human, "\n" + bot]
encounters = [1, 2]
elif prompt_type == PromptType.instruct_vicuna.name:
# even below is not enough, generic strings and many ways to encode
stop_words = [
"### Human:",
"""
### Human:""",
"""
### Human:
""",
"### Assistant:",
"""
### Assistant:""",
"""
### Assistant:
""",
]
encounters = [1, 2]
else:
# some instruct prompts have this as end, doesn't hurt to stop on it since not common otherwise
stop_words = ["### End"]
encounters = [1]
stop_words_ids = [
tokenizer(stop_word, return_tensors="pt")["input_ids"].squeeze()
for stop_word in stop_words
]
# handle single token case
stop_words_ids = [
x if len(x.shape) > 0 else torch.tensor([x])
for x in stop_words_ids
]
stop_words_ids = [x for x in stop_words_ids if x.shape[0] > 0]
# avoid padding in front of tokens
if (
tokenizer._pad_token
        ):  # use hidden variable to avoid annoying property logger bug
stop_words_ids = [
x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x
for x in stop_words_ids
]
# handle fake \n added
stop_words_ids = [
x[1:] if y[0] == "\n" else x
for x, y in zip(stop_words_ids, stop_words)
]
# build stopper
stopping_criteria = StoppingCriteriaList(
[
StoppingCriteriaSub(
stops=stop_words_ids,
encounters=encounters,
device=device,
model_max_length=model_max_length,
)
]
)
else:
stopping_criteria = StoppingCriteriaList()
return stopping_criteria
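
A hedged sketch of wiring get_stopping above into Hugging Face generation; the model and tokenizer objects, device, and prompt are assumptions, and PromptType is the enum imported at the top of this file:

# Hypothetical wiring of get_stopping into model.generate(); model and
# tokenizer are assumed to be an already-loaded HF causal LM pair on CUDA.
stopping_criteria = get_stopping(
    PromptType.human_bot.name,
    prompt_dict={},
    tokenizer=tokenizer,
    device="cuda",
    model_max_length=2048,
)
inputs = tokenizer("<human>: Hello\n<bot>:", return_tensors="pt").to("cuda")
outputs = model.generate(
    **inputs, max_new_tokens=128, stopping_criteria=stopping_criteria
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))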

File diff suppressed because it is too large


@@ -1,69 +0,0 @@
from typing import Any, Dict, List, Union, Optional
import time
import queue
from langchain.callbacks.base import BaseCallbackHandler
from langchain.schema import LLMResult
class StreamingGradioCallbackHandler(BaseCallbackHandler):
"""
Similar to H2OTextIteratorStreamer that is for HF backend, but here LangChain backend
"""
def __init__(self, timeout: Optional[float] = None, block=True):
super().__init__()
self.text_queue = queue.SimpleQueue()
self.stop_signal = None
self.do_stop = False
self.timeout = timeout
self.block = block
def on_llm_start(
self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
) -> None:
"""Run when LLM starts running. Clean the queue."""
while not self.text_queue.empty():
try:
self.text_queue.get(block=False)
except queue.Empty:
continue
def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
"""Run on new LLM token. Only available when streaming is enabled."""
self.text_queue.put(token)
def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
"""Run when LLM ends running."""
self.text_queue.put(self.stop_signal)
def on_llm_error(
self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
) -> None:
"""Run when LLM errors."""
self.text_queue.put(self.stop_signal)
def __iter__(self):
return self
def __next__(self):
while True:
try:
value = (
self.stop_signal
) # value looks unused in pycharm, not true
if self.do_stop:
print("hit stop", flush=True)
# could raise or break, maybe best to raise and make parent see if any exception in thread
raise StopIteration()
# break
value = self.text_queue.get(
block=self.block, timeout=self.timeout
)
break
except queue.Empty:
time.sleep(0.01)
if value == self.stop_signal:
raise StopIteration()
else:
return value
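
A small self-contained sketch of the handler's queue semantics: a producer thread stands in for the LangChain callbacks while the main thread consumes tokens the way a Gradio streaming loop would; the fake producer is purely illustrative:

# Illustrative driver for StreamingGradioCallbackHandler; the fake producer
# below plays the role of the LangChain callbacks.
from threading import Thread

handler = StreamingGradioCallbackHandler(timeout=5)

def fake_llm():
    handler.on_llm_start(serialized={}, prompts=["Tell me a joke"])
    for tok in ["Why ", "did ", "the ", "chicken ", "cross the road?"]:
        handler.on_llm_new_token(tok)
    handler.on_llm_end(response=None)  # enqueues the stop signal

Thread(target=fake_llm).start()
for token in handler:  # loop ends when stop_signal is dequeued
    print(token, end="", flush=True)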


@@ -1,442 +0,0 @@
from pathlib import Path
import argparse
from argparse import RawTextHelpFormatter
import re, gc
"""
This script can be used as a standalone utility to convert IRs to dynamic + combine them.
Following are the various ways this script can be used :-
a. To convert a single Linalg IR to dynamic IR:
--dynamic --first_ir_path=<PATH TO FIRST IR>
b. To convert two Linalg IRs to dynamic IR:
        --dynamic --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>
c. To combine two Linalg IRs into one:
--combine --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>
d. To convert both IRs into dynamic as well as combine the IRs:
--dynamic --combine --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>
NOTE: For dynamic you'll also need to provide the following set of flags:-
i. For First Llama : --dynamic_input_size (DEFAULT: 19)
ii. For Second Llama: --model_name (DEFAULT: llama2_7b)
--precision (DEFAULT: 'int4')
You may use --save_dynamic to also save the dynamic IR in option d above.
Else for option a. and b. the dynamic IR(s) will get saved by default.
"""
def combine_mlir_scripts(
first_vicuna_mlir,
second_vicuna_mlir,
output_name,
return_ir=True,
):
print(f"[DEBUG] combining first and second mlir")
print(f"[DEBUG] output_name = {output_name}")
maps1 = []
maps2 = []
constants = set()
f1 = []
f2 = []
print(f"[DEBUG] processing first vicuna mlir")
first_vicuna_mlir = first_vicuna_mlir.splitlines()
while first_vicuna_mlir:
line = first_vicuna_mlir.pop(0)
if re.search("#map\d*\s*=", line):
maps1.append(line)
elif re.search("arith.constant", line):
constants.add(line)
elif not re.search("module", line):
line = re.sub("forward", "first_vicuna_forward", line)
f1.append(line)
f1 = f1[:-1]
del first_vicuna_mlir
gc.collect()
for i, map_line in enumerate(maps1):
map_var = map_line.split(" ")[0]
map_line = re.sub(f"{map_var}(?!\d)", map_var + "_0", map_line)
maps1[i] = map_line
f1 = [
re.sub(f"{map_var}(?!\d)", map_var + "_0", func_line)
for func_line in f1
]
print(f"[DEBUG] processing second vicuna mlir")
second_vicuna_mlir = second_vicuna_mlir.splitlines()
while second_vicuna_mlir:
line = second_vicuna_mlir.pop(0)
if re.search("#map\d*\s*=", line):
maps2.append(line)
elif "global_seed" in line:
continue
elif re.search("arith.constant", line):
constants.add(line)
elif not re.search("module", line):
line = re.sub("forward", "second_vicuna_forward", line)
f2.append(line)
f2 = f2[:-1]
del second_vicuna_mlir
gc.collect()
for i, map_line in enumerate(maps2):
map_var = map_line.split(" ")[0]
map_line = re.sub(f"{map_var}(?!\d)", map_var + "_1", map_line)
maps2[i] = map_line
f2 = [
re.sub(f"{map_var}(?!\d)", map_var + "_1", func_line)
for func_line in f2
]
module_start = 'module attributes {torch.debug_module_name = "_lambda"} {'
module_end = "}"
global_vars = []
vnames = []
global_var_loading1 = []
global_var_loading2 = []
print(f"[DEBUG] processing constants")
counter = 0
constants = list(constants)
while constants:
constant = constants.pop(0)
vname, vbody = constant.split("=")
vname = re.sub("%", "", vname)
vname = vname.strip()
vbody = re.sub("arith.constant", "", vbody)
vbody = vbody.strip()
if len(vbody.split(":")) < 2:
print(constant)
vdtype = vbody.split(":")[-1].strip()
fixed_vdtype = vdtype
if "c1_i64" in vname:
print(constant)
counter += 1
if counter == 2:
counter = 0
print("detected duplicate")
continue
vnames.append(vname)
if "true" not in vname:
global_vars.append(
f"ml_program.global private @{vname}({vbody}) : {fixed_vdtype}"
)
global_var_loading1.append(
f"\t\t%{vname} = ml_program.global_load_const @{vname} : {fixed_vdtype}"
)
global_var_loading2.append(
f"\t\t%{vname} = ml_program.global_load_const @{vname} : {fixed_vdtype}"
)
else:
global_vars.append(
f"ml_program.global private @{vname}({vbody}) : i1"
)
global_var_loading1.append(
f"\t\t%{vname} = ml_program.global_load_const @{vname} : i1"
)
global_var_loading2.append(
f"\t\t%{vname} = ml_program.global_load_const @{vname} : i1"
)
new_f1, new_f2 = [], []
print(f"[DEBUG] processing f1")
for line in f1:
if "func.func" in line:
new_f1.append(line)
for global_var in global_var_loading1:
new_f1.append(global_var)
else:
new_f1.append(line)
print(f"[DEBUG] processing f2")
for line in f2:
if "func.func" in line:
new_f2.append(line)
for global_var in global_var_loading2:
if (
"c20_i64 = arith.addi %dim_i64, %c1_i64 : i64"
in global_var
):
print(global_var)
new_f2.append(global_var)
else:
new_f2.append(line)
f1 = new_f1
f2 = new_f2
del new_f1
del new_f2
gc.collect()
print(
[
"c20_i64 = arith.addi %dim_i64, %c1_i64 : i64" in x
for x in [maps1, maps2, global_vars, f1, f2]
]
)
# doing it this way rather than assembling the whole string
# to prevent OOM with 64GiB RAM when encoding the file.
print(f"[DEBUG] Saving mlir to {output_name}")
with open(output_name, "w+") as f_:
f_.writelines(line + "\n" for line in maps1)
f_.writelines(line + "\n" for line in maps2)
f_.writelines(line + "\n" for line in [module_start])
f_.writelines(line + "\n" for line in global_vars)
f_.writelines(line + "\n" for line in f1)
f_.writelines(line + "\n" for line in f2)
f_.writelines(line + "\n" for line in [module_end])
del maps1
del maps2
del module_start
del global_vars
del f1
del f2
del module_end
gc.collect()
if return_ir:
print(f"[DEBUG] Reading combined mlir back in")
with open(output_name, "rb") as f:
return f.read()
def write_in_dynamic_inputs0(module, dynamic_input_size):
print("[DEBUG] writing dynamic inputs to first vicuna")
# Current solution for ensuring mlir files support dynamic inputs
# TODO: find a more elegant way to implement this
new_lines = []
module = module.splitlines()
while module:
line = module.pop(0)
line = re.sub(f"{dynamic_input_size}x", "?x", line)
if "?x" in line:
line = re.sub("tensor.empty\(\)", "tensor.empty(%dim)", line)
line = re.sub(f" {dynamic_input_size},", " %dim,", line)
if "tensor.empty" in line and "?x?" in line:
line = re.sub(
"tensor.empty\(%dim\)", "tensor.empty(%dim, %dim)", line
)
if "arith.cmpi" in line:
line = re.sub(f"c{dynamic_input_size}", "dim", line)
if "%0 = tensor.empty(%dim) : tensor<?xi64>" in line:
new_lines.append("%dim = tensor.dim %arg0, %c1 : tensor<1x?xi64>")
if "%dim = tensor.dim %arg0, %c1 : tensor<1x?xi64>" in line:
continue
new_lines.append(line)
return "\n".join(new_lines)
def write_in_dynamic_inputs1(module, model_name, precision):
print("[DEBUG] writing dynamic inputs to second vicuna")
def remove_constant_dim(line):
if "c19_i64" in line:
line = re.sub("c19_i64", "dim_i64", line)
if "19x" in line:
line = re.sub("19x", "?x", line)
line = re.sub("tensor.empty\(\)", "tensor.empty(%dim)", line)
if "tensor.empty" in line and "?x?" in line:
line = re.sub(
"tensor.empty\(%dim\)",
"tensor.empty(%dim, %dim)",
line,
)
if "arith.cmpi" in line:
line = re.sub("c19", "dim", line)
if " 19," in line:
line = re.sub(" 19,", " %dim,", line)
if "x20x" in line or "<20x" in line:
line = re.sub("20x", "?x", line)
line = re.sub("tensor.empty\(\)", "tensor.empty(%dimp1)", line)
if " 20," in line:
line = re.sub(" 20,", " %dimp1,", line)
return line
module = module.splitlines()
new_lines = []
# Using a while loop and the pop method to avoid creating a copy of module
if "llama2_13b" in model_name:
pkv_tensor_shape = "tensor<1x40x?x128x"
elif "llama2_70b" in model_name:
pkv_tensor_shape = "tensor<1x8x?x128x"
else:
pkv_tensor_shape = "tensor<1x32x?x128x"
if precision in ["fp16", "int4", "int8"]:
pkv_tensor_shape += "f16>"
else:
pkv_tensor_shape += "f32>"
while module:
line = module.pop(0)
if "%c19_i64 = arith.constant 19 : i64" in line:
new_lines.append("%c2 = arith.constant 2 : index")
new_lines.append(
f"%dim_4_int = tensor.dim %arg1, %c2 : {pkv_tensor_shape}"
)
new_lines.append(
"%dim_i64 = arith.index_cast %dim_4_int : index to i64"
)
continue
if "%c2 = arith.constant 2 : index" in line:
continue
if "%c20_i64 = arith.constant 20 : i64" in line:
new_lines.append("%c1_i64 = arith.constant 1 : i64")
new_lines.append("%c20_i64 = arith.addi %dim_i64, %c1_i64 : i64")
new_lines.append(
"%dimp1 = arith.index_cast %c20_i64 : i64 to index"
)
continue
line = remove_constant_dim(line)
new_lines.append(line)
return "\n".join(new_lines)
def save_dynamic_ir(ir_to_save, output_file):
if not ir_to_save:
return
# We only get string output from the dynamic conversion utility.
from contextlib import redirect_stdout
with open(output_file, "w") as f:
with redirect_stdout(f):
print(ir_to_save)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="llama ir utility",
description="\tThis script can be used as a standalone utility to convert IRs to dynamic + combine them.\n"
+ "\tFollowing are the various ways this script can be used :-\n"
+ "\t\ta. To convert a single Linalg IR to dynamic IR:\n"
+ "\t\t\t--dynamic --first_ir_path=<PATH TO FIRST IR>\n"
+ "\t\tb. To convert two Linalg IRs to dynamic IR:\n"
+ "\t\t\t--dynamic --first_ir_path=<PATH TO SECOND IR> --first_ir_path=<PATH TO SECOND IR>\n"
+ "\t\tc. To combine two Linalg IRs into one:\n"
+ "\t\t\t--combine --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>\n"
+ "\t\td. To convert both IRs into dynamic as well as combine the IRs:\n"
+ "\t\t\t--dynamic --combine --first_ir_path=<PATH TO FIRST IR> --second_ir_path=<PATH TO SECOND IR>\n\n"
+ "\tNOTE: For dynamic you'll also need to provide the following set of flags:-\n"
+ "\t\t i. For First Llama : --dynamic_input_size (DEFAULT: 19)\n"
+ "\t\tii. For Second Llama: --model_name (DEFAULT: llama2_7b)\n"
+ "\t\t\t--precision (DEFAULT: 'int4')\n"
+ "\t You may use --save_dynamic to also save the dynamic IR in option d above.\n"
+ "\t Else for option a. and b. the dynamic IR(s) will get saved by default.\n",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument(
"--precision",
"-p",
default="int4",
choices=["fp32", "fp16", "int8", "int4"],
help="Precision of the concerned IR",
)
parser.add_argument(
"--model_name",
type=str,
default="llama2_7b",
choices=["vicuna", "llama2_7b", "llama2_13b", "llama2_70b"],
help="Specify which model to run.",
)
parser.add_argument(
"--first_ir_path",
default=None,
help="path to first llama mlir file",
)
parser.add_argument(
"--second_ir_path",
default=None,
help="path to second llama mlir file",
)
parser.add_argument(
"--dynamic_input_size",
type=int,
default=19,
help="Specify the static input size to replace with dynamic dim.",
)
parser.add_argument(
"--dynamic",
default=False,
action=argparse.BooleanOptionalAction,
help="Converts the IR(s) to dynamic",
)
parser.add_argument(
"--save_dynamic",
default=False,
action=argparse.BooleanOptionalAction,
help="Save the individual IR(s) after converting to dynamic",
)
parser.add_argument(
"--combine",
default=False,
action=argparse.BooleanOptionalAction,
help="Converts the IR(s) to dynamic",
)
args, unknown = parser.parse_known_args()
dynamic = args.dynamic
combine = args.combine
assert (
dynamic or combine
), "neither `dynamic` nor `combine` flag is turned on"
first_ir_path = args.first_ir_path
second_ir_path = args.second_ir_path
assert first_ir_path or second_ir_path, "no input ir has been provided"
if combine:
assert (
first_ir_path and second_ir_path
), "you will need to provide both IRs to combine"
precision = args.precision
model_name = args.model_name
dynamic_input_size = args.dynamic_input_size
save_dynamic = args.save_dynamic
print(f"Dynamic conversion utility is turned {'ON' if dynamic else 'OFF'}")
print(f"Combining IR utility is turned {'ON' if combine else 'OFF'}")
if dynamic and not combine:
save_dynamic = True
first_ir = None
first_dynamic_ir_name = None
second_ir = None
second_dynamic_ir_name = None
if first_ir_path:
first_dynamic_ir_name = f"{Path(first_ir_path).stem}_dynamic"
with open(first_ir_path, "r") as f:
first_ir = f.read()
if second_ir_path:
second_dynamic_ir_name = f"{Path(second_ir_path).stem}_dynamic"
with open(second_ir_path, "r") as f:
second_ir = f.read()
if dynamic:
first_ir = (
write_in_dynamic_inputs0(first_ir, dynamic_input_size)
if first_ir
else None
)
second_ir = (
write_in_dynamic_inputs1(second_ir, model_name, precision)
if second_ir
else None
)
if save_dynamic:
save_dynamic_ir(first_ir, f"{first_dynamic_ir_name}.mlir")
save_dynamic_ir(second_ir, f"{second_dynamic_ir_name}.mlir")
if combine:
combine_mlir_scripts(
first_ir,
second_ir,
f"{model_name}_{precision}.mlir",
return_ir=False,
)


@@ -1,211 +0,0 @@
import torch
import torch_mlir
from transformers import (
AutoTokenizer,
StoppingCriteria,
)
from io import BytesIO
from pathlib import Path
from apps.language_models.utils import (
get_torch_mlir_module_bytecode,
get_vmfb_from_path,
)
class StopOnTokens(StoppingCriteria):
def __call__(
self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
) -> bool:
stop_ids = [50278, 50279, 50277, 1, 0]
for stop_id in stop_ids:
if input_ids[0][-1] == stop_id:
return True
return False
def shouldStop(tokens):
stop_ids = [50278, 50279, 50277, 1, 0]
for stop_id in stop_ids:
if tokens[0][-1] == stop_id:
return True
return False
MAX_SEQUENCE_LENGTH = 256
def user(message, history):
# Append the user's message to the conversation history
return "", history + [[message, ""]]
def compile_stableLM(
model,
model_inputs,
model_name,
model_vmfb_name,
device="cuda",
precision="fp32",
debug=False,
):
from shark.shark_inference import SharkInference
# device = "cuda" # "cpu"
# TODO: vmfb and mlir name should include precision and device
vmfb_path = (
Path(model_name + f"_{device}.vmfb")
if model_vmfb_name is None
else Path(model_vmfb_name)
)
shark_module = get_vmfb_from_path(
vmfb_path, device, mlir_dialect="tm_tensor"
)
if shark_module is not None:
return shark_module
mlir_path = Path(model_name + ".mlir")
print(
f"[DEBUG] mlir path {mlir_path} {'exists' if mlir_path.exists() else 'does not exist'}"
)
if mlir_path.exists():
with open(mlir_path, "rb") as f:
bytecode = f.read()
else:
ts_graph = get_torch_mlir_module_bytecode(model, model_inputs)
module = torch_mlir.compile(
ts_graph,
[*model_inputs],
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
bytecode_stream = BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(model_name + ".mlir", "wb")
f_.write(bytecode)
print("Saved mlir")
f_.close()
shark_module = SharkInference(
mlir_module=bytecode, device=device, mlir_dialect="tm_tensor"
)
shark_module.compile()
path = shark_module.save_module(
vmfb_path.parent.absolute(), vmfb_path.stem, debug=debug
)
print("Saved vmfb at ", str(path))
return shark_module
class StableLMModel(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(self, input_ids, attention_mask):
combine_input_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
output = self.model(**combine_input_dict)
return output.logits
# Initialize a StopOnTokens object
system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""
def get_tokenizer():
model_path = "stabilityai/stablelm-tuned-alpha-3b"
tok = AutoTokenizer.from_pretrained(model_path)
tok.add_special_tokens({"pad_token": "<PAD>"})
print("Sucessfully loaded the tokenizer to the memory")
return tok
# sharkStableLM = compile_stableLM
# (
# None,
# tuple([input_ids, attention_mask]),
# "stableLM_linalg_f32_seqLen256",
# "/home/shark/vivek/stableLM_shark_f32_seqLen256"
# )
def generate(
new_text,
max_new_tokens,
sharkStableLM,
tokenizer=None,
):
if tokenizer is None:
tokenizer = get_tokenizer()
# Construct the input message string for the model by
# concatenating the current system message and conversation history
# Tokenize the messages string
# sharkStableLM = compile_stableLM
# (
# None,
# tuple([input_ids, attention_mask]),
# "stableLM_linalg_f32_seqLen256",
# "/home/shark/vivek/stableLM_shark_f32_seqLen256"
# )
words_list = []
for i in range(max_new_tokens):
# numWords = len(new_text.split())
# if(numWords>220):
# break
params = {
"new_text": new_text,
}
generated_token_op = generate_new_token(
sharkStableLM, tokenizer, params
)
detok = generated_token_op["detok"]
stop_generation = generated_token_op["stop_generation"]
if stop_generation:
break
print(detok, end="", flush=True)
words_list.append(detok)
if detok == "":
break
new_text = new_text + detok
return words_list
def generate_new_token(shark_model, tokenizer, params):
new_text = params["new_text"]
model_inputs = tokenizer(
[new_text],
padding="max_length",
max_length=MAX_SEQUENCE_LENGTH,
truncation=True,
return_tensors="pt",
)
sum_attentionmask = torch.sum(model_inputs.attention_mask)
# sharkStableLM = compile_stableLM(None, tuple([input_ids, attention_mask]), "stableLM_linalg_f32_seqLen256", "/home/shark/vivek/stableLM_shark_f32_seqLen256")
output = shark_model(
"forward", [model_inputs.input_ids, model_inputs.attention_mask]
)
output = torch.from_numpy(output)
next_toks = torch.topk(output, 1)
stop_generation = False
if shouldStop(next_toks.indices):
stop_generation = True
new_token = next_toks.indices[0][int(sum_attentionmask) - 1]
detok = tokenizer.decode(
new_token,
skip_special_tokens=True,
)
ret_dict = {
"new_token": new_token,
"detok": detok,
"stop_generation": stop_generation,
}
return ret_dict
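
A heavily hedged end-to-end sketch for the StableLM helpers above: compile the wrapped model with SHARK and stream tokens with generate(). The checkpoint matches get_tokenizer(), while the device, file name, and <|USER|>/<|ASSISTANT|> chat markers are assumptions:

# Hypothetical end-to-end use of compile_stableLM + generate; device, file
# name, and chat markers are assumptions, not part of this diff.
from transformers import AutoModelForCausalLM

tok = get_tokenizer()
hf_model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stablelm-tuned-alpha-3b"
)
wrapped = StableLMModel(hf_model)
example = tok(
    ["hello"],
    padding="max_length",
    max_length=MAX_SEQUENCE_LENGTH,
    truncation=True,
    return_tensors="pt",
)
shark_module = compile_stableLM(
    wrapped,
    (example.input_ids, example.attention_mask),
    "stableLM_linalg_f32_seqLen256",
    None,
    device="cpu",
)
prompt = system_prompt + "<|USER|>Tell me a joke<|ASSISTANT|>"
words = generate(prompt, 64, shark_module, tokenizer=tok)
print("".join(words))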

File diff suppressed because it is too large


@@ -1,94 +0,0 @@
# -*- mode: python ; coding: utf-8 -*-
from PyInstaller.utils.hooks import collect_data_files
from PyInstaller.utils.hooks import collect_submodules
from PyInstaller.utils.hooks import copy_metadata
import sys ; sys.setrecursionlimit(sys.getrecursionlimit() * 5)
datas = []
datas += collect_data_files('torch')
datas += copy_metadata('torch')
datas += copy_metadata('tqdm')
datas += copy_metadata('regex')
datas += copy_metadata('requests')
datas += copy_metadata('packaging')
datas += copy_metadata('filelock')
datas += copy_metadata('numpy')
datas += copy_metadata('tokenizers')
datas += copy_metadata('importlib_metadata')
datas += copy_metadata('torch-mlir')
datas += copy_metadata('omegaconf')
datas += copy_metadata('safetensors')
datas += copy_metadata('huggingface-hub')
datas += copy_metadata('sentencepiece')
datas += copy_metadata("pyyaml")
datas += collect_data_files("tokenizers")
datas += collect_data_files("tiktoken")
datas += collect_data_files("accelerate")
datas += collect_data_files('diffusers')
datas += collect_data_files('transformers')
datas += collect_data_files('opencv-python')
datas += collect_data_files('pytorch_lightning')
datas += collect_data_files('skimage')
datas += collect_data_files('gradio')
datas += collect_data_files('gradio_client')
datas += collect_data_files('iree')
datas += collect_data_files('google-cloud-storage')
datas += collect_data_files('py-cpuinfo')
datas += collect_data_files("shark", include_py_files=True)
datas += collect_data_files("timm", include_py_files=True)
datas += collect_data_files("tqdm")
datas += collect_data_files("tkinter")
datas += collect_data_files("webview")
datas += collect_data_files("sentencepiece")
datas += collect_data_files("jsonschema")
datas += collect_data_files("jsonschema_specifications")
datas += collect_data_files("cpuinfo")
datas += collect_data_files("langchain")
binaries = []
block_cipher = None
hiddenimports = ['shark', 'shark.shark_inference', 'apps']
hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]
a = Analysis(
['scripts/vicuna.py'],
pathex=['.'],
binaries=binaries,
datas=datas,
hiddenimports=hiddenimports,
hookspath=[],
hooksconfig={},
runtime_hooks=[],
excludes=[],
win_no_prefer_redirects=False,
win_private_assemblies=False,
cipher=block_cipher,
noarchive=False,
)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
exe = EXE(
pyz,
a.scripts,
a.binaries,
a.zipfiles,
a.datas,
[],
name='shark_llama_cli',
debug=False,
bootloader_ignore_signals=False,
strip=False,
upx=True,
upx_exclude=[],
runtime_tmpdir=None,
console=True,
disable_windowed_traceback=False,
argv_emulation=False,
target_arch=None,
codesign_identity=None,
entitlements_file=None,
)


@@ -1,22 +0,0 @@
import torch
class FalconModel(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(self, input_ids, attention_mask):
input_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": None,
"use_cache": True,
}
output = self.model(
**input_dict,
return_dict=True,
output_attentions=False,
output_hidden_states=False,
)[0]
return output[:, -1, :]
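
A hedged sketch showing how the wrapper above reduces a Hugging Face Falcon checkpoint to a next-token-logits function; the checkpoint id is illustrative:

# Hypothetical use of FalconModel; the checkpoint id is illustrative.
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
hf_model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b", trust_remote_code=True
)
wrapped = FalconModel(hf_model)
inputs = tok("The capital of France is", return_tensors="pt")
logits = wrapped(inputs.input_ids, inputs.attention_mask)  # [batch, vocab]
next_token_id = logits.argmax(dim=-1)
print(tok.decode(next_token_id))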


@@ -1,675 +0,0 @@
import torch
from typing import Optional, Tuple
class WordEmbeddingsLayer(torch.nn.Module):
def __init__(self, word_embedding_layer):
super().__init__()
self.model = word_embedding_layer
def forward(self, input_ids):
output = self.model.forward(input=input_ids)
return output
class CompiledWordEmbeddingsLayer(torch.nn.Module):
def __init__(self, compiled_word_embedding_layer):
super().__init__()
self.model = compiled_word_embedding_layer
def forward(self, input_ids):
input_ids = input_ids.detach().numpy()
new_input_ids = self.model("forward", input_ids)
new_input_ids = new_input_ids.reshape(
[1, new_input_ids.shape[0], new_input_ids.shape[1]]
)
return torch.tensor(new_input_ids)
class LNFEmbeddingLayer(torch.nn.Module):
def __init__(self, ln_f):
super().__init__()
self.model = ln_f
def forward(self, hidden_states):
output = self.model.forward(input=hidden_states)
return output
class CompiledLNFEmbeddingLayer(torch.nn.Module):
def __init__(self, ln_f):
super().__init__()
self.model = ln_f
def forward(self, hidden_states):
hidden_states = hidden_states.detach().numpy()
new_hidden_states = self.model("forward", (hidden_states,))
return torch.tensor(new_hidden_states)
class LMHeadEmbeddingLayer(torch.nn.Module):
def __init__(self, embedding_layer):
super().__init__()
self.model = embedding_layer
def forward(self, hidden_states):
output = self.model.forward(input=hidden_states)
return output
class CompiledLMHeadEmbeddingLayer(torch.nn.Module):
def __init__(self, lm_head):
super().__init__()
self.model = lm_head
def forward(self, hidden_states):
hidden_states = hidden_states.detach().numpy()
new_hidden_states = self.model("forward", (hidden_states,))
return torch.tensor(new_hidden_states)
class FourWayShardingDecoderLayer(torch.nn.Module):
def __init__(self, decoder_layer_model, falcon_variant):
super().__init__()
self.model = decoder_layer_model
self.falcon_variant = falcon_variant
def forward(self, hidden_states, attention_mask):
new_pkvs = []
for layer in self.model:
outputs = layer(
hidden_states=hidden_states,
alibi=None,
attention_mask=attention_mask,
use_cache=True,
)
hidden_states = outputs[0]
new_pkvs.append(
(
outputs[-1][0],
outputs[-1][1],
)
)
(
(new_pkv00, new_pkv01),
(new_pkv10, new_pkv11),
(new_pkv20, new_pkv21),
(new_pkv30, new_pkv31),
(new_pkv40, new_pkv41),
(new_pkv50, new_pkv51),
(new_pkv60, new_pkv61),
(new_pkv70, new_pkv71),
(new_pkv80, new_pkv81),
(new_pkv90, new_pkv91),
(new_pkv100, new_pkv101),
(new_pkv110, new_pkv111),
(new_pkv120, new_pkv121),
(new_pkv130, new_pkv131),
(new_pkv140, new_pkv141),
(new_pkv150, new_pkv151),
(new_pkv160, new_pkv161),
(new_pkv170, new_pkv171),
(new_pkv180, new_pkv181),
(new_pkv190, new_pkv191),
) = new_pkvs
result = (
hidden_states,
new_pkv00,
new_pkv01,
new_pkv10,
new_pkv11,
new_pkv20,
new_pkv21,
new_pkv30,
new_pkv31,
new_pkv40,
new_pkv41,
new_pkv50,
new_pkv51,
new_pkv60,
new_pkv61,
new_pkv70,
new_pkv71,
new_pkv80,
new_pkv81,
new_pkv90,
new_pkv91,
new_pkv100,
new_pkv101,
new_pkv110,
new_pkv111,
new_pkv120,
new_pkv121,
new_pkv130,
new_pkv131,
new_pkv140,
new_pkv141,
new_pkv150,
new_pkv151,
new_pkv160,
new_pkv161,
new_pkv170,
new_pkv171,
new_pkv180,
new_pkv181,
new_pkv190,
new_pkv191,
)
return result
class CompiledFourWayShardingDecoderLayer(torch.nn.Module):
def __init__(
self, layer_id, device_idx, falcon_variant, device, precision, model
):
super().__init__()
self.layer_id = layer_id
self.device_index = device_idx
self.falcon_variant = falcon_variant
self.device = device
self.precision = precision
self.model = model
def forward(
self,
hidden_states: torch.Tensor,
alibi: Optional[torch.Tensor],
attention_mask: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
use_cache: bool = False,
output_attentions: bool = False,
):
import gc
torch.cuda.empty_cache()
gc.collect()
if self.model is None:
raise ValueError("Layer vmfb not found")
hidden_states = hidden_states.to(torch.float32).detach().numpy()
attention_mask = attention_mask.to(torch.float32).detach().numpy()
if alibi is not None or layer_past is not None:
raise ValueError("Past Key Values and alibi should be None")
else:
output = self.model(
"forward",
(
hidden_states,
attention_mask,
),
)
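# The compiled shard returns a flat sequence: element 0 is the updated hidden state,
# followed by 20 interleaved key/value tensors that are regrouped into pairs below.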
result = (
torch.tensor(output[0]),
(
torch.tensor(output[1]),
torch.tensor(output[2]),
),
(
torch.tensor(output[3]),
torch.tensor(output[4]),
),
(
torch.tensor(output[5]),
torch.tensor(output[6]),
),
(
torch.tensor(output[7]),
torch.tensor(output[8]),
),
(
torch.tensor(output[9]),
torch.tensor(output[10]),
),
(
torch.tensor(output[11]),
torch.tensor(output[12]),
),
(
torch.tensor(output[13]),
torch.tensor(output[14]),
),
(
torch.tensor(output[15]),
torch.tensor(output[16]),
),
(
torch.tensor(output[17]),
torch.tensor(output[18]),
),
(
torch.tensor(output[19]),
torch.tensor(output[20]),
),
(
torch.tensor(output[21]),
torch.tensor(output[22]),
),
(
torch.tensor(output[23]),
torch.tensor(output[24]),
),
(
torch.tensor(output[25]),
torch.tensor(output[26]),
),
(
torch.tensor(output[27]),
torch.tensor(output[28]),
),
(
torch.tensor(output[29]),
torch.tensor(output[30]),
),
(
torch.tensor(output[31]),
torch.tensor(output[32]),
),
(
torch.tensor(output[33]),
torch.tensor(output[34]),
),
(
torch.tensor(output[35]),
torch.tensor(output[36]),
),
(
torch.tensor(output[37]),
torch.tensor(output[38]),
),
(
torch.tensor(output[39]),
torch.tensor(output[40]),
),
)
return result
class TwoWayShardingDecoderLayer(torch.nn.Module):
def __init__(self, decoder_layer_model, falcon_variant):
super().__init__()
self.model = decoder_layer_model
self.falcon_variant = falcon_variant
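# Same pattern as the four-way shard above, except each of the two shards wraps 40 decoder
# layers, so 40 (key, value) pairs are collected and flattened.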
def forward(self, hidden_states, attention_mask):
new_pkvs = []
for layer in self.model:
outputs = layer(
hidden_states=hidden_states,
alibi=None,
attention_mask=attention_mask,
use_cache=True,
)
hidden_states = outputs[0]
new_pkvs.append(
(
outputs[-1][0],
outputs[-1][1],
)
)
(
(new_pkv00, new_pkv01),
(new_pkv10, new_pkv11),
(new_pkv20, new_pkv21),
(new_pkv30, new_pkv31),
(new_pkv40, new_pkv41),
(new_pkv50, new_pkv51),
(new_pkv60, new_pkv61),
(new_pkv70, new_pkv71),
(new_pkv80, new_pkv81),
(new_pkv90, new_pkv91),
(new_pkv100, new_pkv101),
(new_pkv110, new_pkv111),
(new_pkv120, new_pkv121),
(new_pkv130, new_pkv131),
(new_pkv140, new_pkv141),
(new_pkv150, new_pkv151),
(new_pkv160, new_pkv161),
(new_pkv170, new_pkv171),
(new_pkv180, new_pkv181),
(new_pkv190, new_pkv191),
(new_pkv200, new_pkv201),
(new_pkv210, new_pkv211),
(new_pkv220, new_pkv221),
(new_pkv230, new_pkv231),
(new_pkv240, new_pkv241),
(new_pkv250, new_pkv251),
(new_pkv260, new_pkv261),
(new_pkv270, new_pkv271),
(new_pkv280, new_pkv281),
(new_pkv290, new_pkv291),
(new_pkv300, new_pkv301),
(new_pkv310, new_pkv311),
(new_pkv320, new_pkv321),
(new_pkv330, new_pkv331),
(new_pkv340, new_pkv341),
(new_pkv350, new_pkv351),
(new_pkv360, new_pkv361),
(new_pkv370, new_pkv371),
(new_pkv380, new_pkv381),
(new_pkv390, new_pkv391),
) = new_pkvs
result = (
hidden_states,
new_pkv00,
new_pkv01,
new_pkv10,
new_pkv11,
new_pkv20,
new_pkv21,
new_pkv30,
new_pkv31,
new_pkv40,
new_pkv41,
new_pkv50,
new_pkv51,
new_pkv60,
new_pkv61,
new_pkv70,
new_pkv71,
new_pkv80,
new_pkv81,
new_pkv90,
new_pkv91,
new_pkv100,
new_pkv101,
new_pkv110,
new_pkv111,
new_pkv120,
new_pkv121,
new_pkv130,
new_pkv131,
new_pkv140,
new_pkv141,
new_pkv150,
new_pkv151,
new_pkv160,
new_pkv161,
new_pkv170,
new_pkv171,
new_pkv180,
new_pkv181,
new_pkv190,
new_pkv191,
new_pkv200,
new_pkv201,
new_pkv210,
new_pkv211,
new_pkv220,
new_pkv221,
new_pkv230,
new_pkv231,
new_pkv240,
new_pkv241,
new_pkv250,
new_pkv251,
new_pkv260,
new_pkv261,
new_pkv270,
new_pkv271,
new_pkv280,
new_pkv281,
new_pkv290,
new_pkv291,
new_pkv300,
new_pkv301,
new_pkv310,
new_pkv311,
new_pkv320,
new_pkv321,
new_pkv330,
new_pkv331,
new_pkv340,
new_pkv341,
new_pkv350,
new_pkv351,
new_pkv360,
new_pkv361,
new_pkv370,
new_pkv371,
new_pkv380,
new_pkv381,
new_pkv390,
new_pkv391,
)
return result
class CompiledTwoWayShardingDecoderLayer(torch.nn.Module):
def __init__(
self, layer_id, device_idx, falcon_variant, device, precision, model
):
super().__init__()
self.layer_id = layer_id
self.device_index = device_idx
self.falcon_variant = falcon_variant
self.device = device
self.precision = precision
self.model = model
def forward(
self,
hidden_states: torch.Tensor,
alibi: Optional[torch.Tensor],
attention_mask: torch.Tensor,
position_ids: Optional[torch.LongTensor] = None,
layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
head_mask: Optional[torch.Tensor] = None,
use_cache: bool = False,
output_attentions: bool = False,
):
import gc
torch.cuda.empty_cache()
gc.collect()
if self.model is None:
raise ValueError("Layer vmfb not found")
hidden_states = hidden_states.to(torch.float32).detach().numpy()
attention_mask = attention_mask.to(torch.float32).detach().numpy()
if alibi is not None or layer_past is not None:
raise ValueError("Past Key Values and alibi should be None")
else:
output = self.model(
"forward",
(
hidden_states,
attention_mask,
),
)
result = (
torch.tensor(output[0]),
(
torch.tensor(output[1]),
torch.tensor(output[2]),
),
(
torch.tensor(output[3]),
torch.tensor(output[4]),
),
(
torch.tensor(output[5]),
torch.tensor(output[6]),
),
(
torch.tensor(output[7]),
torch.tensor(output[8]),
),
(
torch.tensor(output[9]),
torch.tensor(output[10]),
),
(
torch.tensor(output[11]),
torch.tensor(output[12]),
),
(
torch.tensor(output[13]),
torch.tensor(output[14]),
),
(
torch.tensor(output[15]),
torch.tensor(output[16]),
),
(
torch.tensor(output[17]),
torch.tensor(output[18]),
),
(
torch.tensor(output[19]),
torch.tensor(output[20]),
),
(
torch.tensor(output[21]),
torch.tensor(output[22]),
),
(
torch.tensor(output[23]),
torch.tensor(output[24]),
),
(
torch.tensor(output[25]),
torch.tensor(output[26]),
),
(
torch.tensor(output[27]),
torch.tensor(output[28]),
),
(
torch.tensor(output[29]),
torch.tensor(output[30]),
),
(
torch.tensor(output[31]),
torch.tensor(output[32]),
),
(
torch.tensor(output[33]),
torch.tensor(output[34]),
),
(
torch.tensor(output[35]),
torch.tensor(output[36]),
),
(
torch.tensor(output[37]),
torch.tensor(output[38]),
),
(
torch.tensor(output[39]),
torch.tensor(output[40]),
),
(
torch.tensor(output[41]),
torch.tensor(output[42]),
),
(
torch.tensor(output[43]),
torch.tensor(output[44]),
),
(
torch.tensor(output[45]),
torch.tensor(output[46]),
),
(
torch.tensor(output[47]),
torch.tensor(output[48]),
),
(
torch.tensor(output[49]),
torch.tensor(output[50]),
),
(
torch.tensor(output[51]),
torch.tensor(output[52]),
),
(
torch.tensor(output[53]),
torch.tensor(output[54]),
),
(
torch.tensor(output[55]),
torch.tensor(output[56]),
),
(
torch.tensor(output[57]),
torch.tensor(output[58]),
),
(
torch.tensor(output[59]),
torch.tensor(output[60]),
),
(
torch.tensor(output[61]),
torch.tensor(output[62]),
),
(
torch.tensor(output[63]),
torch.tensor(output[64]),
),
(
torch.tensor(output[65]),
torch.tensor(output[66]),
),
(
torch.tensor(output[67]),
torch.tensor(output[68]),
),
(
torch.tensor(output[69]),
torch.tensor(output[70]),
),
(
torch.tensor(output[71]),
torch.tensor(output[72]),
),
(
torch.tensor(output[73]),
torch.tensor(output[74]),
),
(
torch.tensor(output[75]),
torch.tensor(output[76]),
),
(
torch.tensor(output[77]),
torch.tensor(output[78]),
),
(
torch.tensor(output[79]),
torch.tensor(output[80]),
),
)
return result
class ShardedFalconModel:
def __init__(self, model, layers, word_embeddings, ln_f, lm_head):
super().__init__()
self.model = model
self.model.transformer.h = torch.nn.modules.container.ModuleList(
layers
)
self.model.transformer.word_embeddings = word_embeddings
self.model.transformer.ln_f = ln_f
self.model.lm_head = lm_head
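# Reassemble the Falcon module graph from the provided (possibly compiled) shards; forward()
# then returns only the logits for the last position.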
def forward(
self,
input_ids,
attention_mask=None,
):
return self.model.forward(
input_ids=input_ids,
attention_mask=attention_mask,
).logits[:, -1, :]


@@ -1,503 +0,0 @@
import torch
import dataclasses
from enum import auto, Enum
from typing import List, Any
from transformers import StoppingCriteria
from brevitas_examples.common.generative.quantize import quantize_model
from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
class LayerNorm(torch.nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""
def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)
class VisionModel(torch.nn.Module):
def __init__(
self,
ln_vision,
visual_encoder,
precision="fp32",
weight_group_size=128,
):
super().__init__()
self.ln_vision = ln_vision
self.visual_encoder = visual_encoder
if precision in ["int4", "int8"]:
print("Vision Model applying weight quantization to ln_vision")
weight_bit_width = 4 if precision == "int4" else 8
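# Brevitas weight-only quantization: asymmetric, per-group (size weight_group_size) with
# float scales; activations are left in float32.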
quantize_model(
self.ln_vision,
dtype=torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
quantize_weight_zero_point=False,
)
print("Weight quantization applied.")
print(
"Vision Model applying weight quantization to visual_encoder"
)
quantize_model(
self.visual_encoder,
dtype=torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
quantize_weight_zero_point=False,
)
print("Weight quantization applied.")
def forward(self, image):
image_embeds = self.ln_vision(self.visual_encoder(image))
return image_embeds
class QformerBertModel(torch.nn.Module):
def __init__(self, qformer_bert):
super().__init__()
self.qformer_bert = qformer_bert
def forward(self, query_tokens, image_embeds, image_atts):
query_output = self.qformer_bert(
query_embeds=query_tokens,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_atts,
return_dict=True,
)
return query_output.last_hidden_state
class FirstLlamaModel(torch.nn.Module):
def __init__(self, model, precision="fp32", weight_group_size=128):
super().__init__()
self.model = model
print("SHARK: Loading LLAMA Done")
if precision in ["int4", "int8"]:
print("First Llama applying weight quantization")
weight_bit_width = 4 if precision == "int4" else 8
quantize_model(
self.model,
dtype=torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
quantize_weight_zero_point=False,
)
print("Weight quantization applied.")
def forward(self, inputs_embeds, position_ids, attention_mask):
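# Prefill step: no past_key_values yet; returns the logits followed by every layer's new
# (key, value) tensors as one flat tuple.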
print("************************************")
print(
"inputs_embeds: ",
inputs_embeds.shape,
" dtype: ",
inputs_embeds.dtype,
)
print(
"position_ids: ",
position_ids.shape,
" dtype: ",
position_ids.dtype,
)
print(
"attention_mask: ",
attention_mask.shape,
" dtype: ",
attention_mask.dtype,
)
print("************************************")
config = {
"inputs_embeds": inputs_embeds,
"position_ids": position_ids,
"past_key_values": None,
"use_cache": True,
"attention_mask": attention_mask,
}
output = self.model(
**config,
return_dict=True,
output_attentions=False,
output_hidden_states=False,
)
return_vals = []
return_vals.append(output.logits)
temp_past_key_values = output.past_key_values
for item in temp_past_key_values:
return_vals.append(item[0])
return_vals.append(item[1])
return tuple(return_vals)
class SecondLlamaModel(torch.nn.Module):
def __init__(self, model, precision="fp32", weight_group_size=128):
super().__init__()
self.model = model
print("SHARK: Loading LLAMA Done")
if precision in ["int4", "int8"]:
print("Second Llama applying weight quantization")
weight_bit_width = 4 if precision == "int4" else 8
quantize_model(
self.model,
dtype=torch.float32,
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float_scale",
weight_quant_type="asym",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
quantize_weight_zero_point=False,
)
print("Weight quantization applied.")
def forward(
self,
input_ids,
position_ids,
attention_mask,
i1,
i2,
i3,
i4,
i5,
i6,
i7,
i8,
i9,
i10,
i11,
i12,
i13,
i14,
i15,
i16,
i17,
i18,
i19,
i20,
i21,
i22,
i23,
i24,
i25,
i26,
i27,
i28,
i29,
i30,
i31,
i32,
i33,
i34,
i35,
i36,
i37,
i38,
i39,
i40,
i41,
i42,
i43,
i44,
i45,
i46,
i47,
i48,
i49,
i50,
i51,
i52,
i53,
i54,
i55,
i56,
i57,
i58,
i59,
i60,
i61,
i62,
i63,
i64,
):
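# Decode step: i1..i64 are the 32 decoder layers' cached key/value tensors, passed as
# individual arguments (presumably because the exported module cannot take nested tuples)
# and regrouped into per-layer pairs below.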
print("************************************")
print("input_ids: ", input_ids.shape, " dtype: ", input_ids.dtype)
print(
"position_ids: ",
position_ids.shape,
" dtype: ",
position_ids.dtype,
)
print(
"attention_mask: ",
attention_mask.shape,
" dtype: ",
attention_mask.dtype,
)
print("past_key_values: ", i1.shape, i2.shape, i63.shape, i64.shape)
print("past_key_values dtype: ", i1.dtype)
print("************************************")
config = {
"input_ids": input_ids,
"position_ids": position_ids,
"past_key_values": (
(i1, i2),
(
i3,
i4,
),
(
i5,
i6,
),
(
i7,
i8,
),
(
i9,
i10,
),
(
i11,
i12,
),
(
i13,
i14,
),
(
i15,
i16,
),
(
i17,
i18,
),
(
i19,
i20,
),
(
i21,
i22,
),
(
i23,
i24,
),
(
i25,
i26,
),
(
i27,
i28,
),
(
i29,
i30,
),
(
i31,
i32,
),
(
i33,
i34,
),
(
i35,
i36,
),
(
i37,
i38,
),
(
i39,
i40,
),
(
i41,
i42,
),
(
i43,
i44,
),
(
i45,
i46,
),
(
i47,
i48,
),
(
i49,
i50,
),
(
i51,
i52,
),
(
i53,
i54,
),
(
i55,
i56,
),
(
i57,
i58,
),
(
i59,
i60,
),
(
i61,
i62,
),
(
i63,
i64,
),
),
"use_cache": True,
"attention_mask": attention_mask,
}
output = self.model(
**config,
return_dict=True,
output_attentions=False,
output_hidden_states=False,
)
return_vals = []
return_vals.append(output.logits)
temp_past_key_values = output.past_key_values
for item in temp_past_key_values:
return_vals.append(item[0])
return_vals.append(item[1])
return tuple(return_vals)
class SeparatorStyle(Enum):
"""Different separator style."""
SINGLE = auto()
TWO = auto()
@dataclasses.dataclass
class Conversation:
"""A class that keeps all conversation history."""
system: str
roles: List[str]
messages: List[List[str]]
offset: int
sep_style: SeparatorStyle = SeparatorStyle.SINGLE
sep: str = "###"
sep2: str = None
skip_next: bool = False
conv_id: Any = None
def get_prompt(self):
if self.sep_style == SeparatorStyle.SINGLE:
ret = self.system + self.sep
for role, message in self.messages:
if message:
ret += role + ": " + message + self.sep
else:
ret += role + ":"
return ret
elif self.sep_style == SeparatorStyle.TWO:
seps = [self.sep, self.sep2]
ret = self.system + seps[0]
for i, (role, message) in enumerate(self.messages):
if message:
ret += role + ": " + message + seps[i % 2]
else:
ret += role + ":"
return ret
else:
raise ValueError(f"Invalid style: {self.sep_style}")
def append_message(self, role, message):
self.messages.append([role, message])
def to_gradio_chatbot(self):
ret = []
for i, (role, msg) in enumerate(self.messages[self.offset :]):
if i % 2 == 0:
ret.append([msg, None])
else:
ret[-1][-1] = msg
return ret
def copy(self):
return Conversation(
system=self.system,
roles=self.roles,
messages=[[x, y] for x, y in self.messages],
offset=self.offset,
sep_style=self.sep_style,
sep=self.sep,
sep2=self.sep2,
conv_id=self.conv_id,
)
def dict(self):
return {
"system": self.system,
"roles": self.roles,
"messages": self.messages,
"offset": self.offset,
"sep": self.sep,
"sep2": self.sep2,
"conv_id": self.conv_id,
}
class StoppingCriteriaSub(StoppingCriteria):
def __init__(self, stops=[], encounters=1):
super().__init__()
self.stops = stops
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
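# Stop generation as soon as the tail of the generated sequence matches any configured stop sequence.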
for stop in self.stops:
if torch.all((stop == input_ids[0][-len(stop) :])).item():
return True
return False
CONV_VISION = Conversation(
system="Give the following image: <Img>ImageContent</Img>. "
"You will be able to see the image once I provide it to you. Please answer my questions.",
roles=("Human", "Assistant"),
messages=[],
offset=2,
sep_style=SeparatorStyle.SINGLE,
sep="###",
)
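# Minimal usage sketch, assuming the Conversation class defined above:
#   conv = CONV_VISION.copy()
#   conv.append_message(conv.roles[0], "<Img><ImageHere></Img> What is in this image?")
#   conv.append_message(conv.roles[1], None)
#   prompt = conv.get_prompt()  # system text + "###Human: ...###Assistant:"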


@@ -1,15 +0,0 @@
import torch
class StableLMModel(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(self, input_ids, attention_mask):
combine_input_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
output = self.model(**combine_input_dict)
return output.logits


@@ -1,876 +0,0 @@
import argparse
import json
import re
from io import BytesIO
from pathlib import Path
from tqdm import tqdm
from typing import List, Optional, Tuple, Union
import numpy as np
import iree.runtime
import itertools
import subprocess
import torch
import torch_mlir
from torch_mlir import TensorPlaceholder
from torch_mlir.compiler_utils import run_pipeline_with_repro_report
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
LlamaPreTrainedModel,
)
from transformers.modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
SequenceClassifierOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from apps.language_models.src.pipelines.SharkLLMBase import SharkLLMBase
from apps.language_models.src.model_wrappers.vicuna_sharded_model import (
FirstVicunaLayer,
SecondVicunaLayer,
CompiledVicunaLayer,
ShardedVicunaModel,
LMHead,
LMHeadCompiled,
VicunaEmbedding,
VicunaEmbeddingCompiled,
VicunaNorm,
VicunaNormCompiled,
)
from apps.language_models.src.model_wrappers.vicuna_model import (
FirstVicuna,
SecondVicuna7B,
)
from apps.language_models.utils import (
get_vmfb_from_path,
)
from shark.shark_downloader import download_public_file
from shark.shark_importer import get_f16_inputs
from shark.shark_inference import SharkInference
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.llama.modeling_llama import (
LlamaDecoderLayer,
LlamaRMSNorm,
_make_causal_mask,
_expand_mask,
)
from torch import nn
from time import time
class LlamaModel(LlamaPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
Args:
config: LlamaConfig
"""
def __init__(self, config: LlamaConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(
config.vocab_size, config.hidden_size, self.padding_idx
)
self.layers = nn.ModuleList(
[
LlamaDecoderLayer(config)
for _ in range(config.num_hidden_layers)
]
)
self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
# Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
def _prepare_decoder_attention_mask(
self,
attention_mask,
input_shape,
inputs_embeds,
past_key_values_length,
):
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = None
if input_shape[-1] > 1:
combined_attention_mask = _make_causal_mask(
input_shape,
inputs_embeds.dtype,
device=inputs_embeds.device,
past_key_values_length=past_key_values_length,
)
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
expanded_attn_mask = _expand_mask(
attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
).to(inputs_embeds.device)
combined_attention_mask = (
expanded_attn_mask
if combined_attention_mask is None
else expanded_attn_mask + combined_attention_mask
)
return combined_attention_mask
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
t1 = time()
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
use_cache = (
use_cache if use_cache is not None else self.config.use_cache
)
return_dict = (
return_dict
if return_dict is not None
else self.config.use_return_dict
)
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
)
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError(
"You have to specify either decoder_input_ids or decoder_inputs_embeds"
)
seq_length_with_past = seq_length
past_key_values_length = 0
if past_key_values is not None:
past_key_values_length = past_key_values[0][0].shape[2]
seq_length_with_past = (
seq_length_with_past + past_key_values_length
)
if position_ids is None:
device = (
input_ids.device
if input_ids is not None
else inputs_embeds.device
)
position_ids = torch.arange(
past_key_values_length,
seq_length + past_key_values_length,
dtype=torch.long,
device=device,
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
else:
position_ids = position_ids.view(-1, seq_length).long()
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
# embed positions
if attention_mask is None:
attention_mask = torch.ones(
(batch_size, seq_length_with_past),
dtype=torch.bool,
device=inputs_embeds.device,
)
attention_mask = self._prepare_decoder_attention_mask(
attention_mask,
(batch_size, seq_length),
inputs_embeds,
past_key_values_length,
)
hidden_states = inputs_embeds
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = () if use_cache else None
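# Note: self.compressedlayers is not defined in __init__; the sharded pipeline presumably
# attaches it, with each entry wrapping 8 decoder layers (hence the 8-wide cache slices below).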
for idx, decoder_layer in enumerate(self.compressedlayers):
if output_hidden_states:
all_hidden_states += (hidden_states,)
past_key_value = (
past_key_values[8 * idx : 8 * (idx + 1)]
if past_key_values is not None
else None
)
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
# None for past_key_value
return module(*inputs, output_attentions, None)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(decoder_layer),
hidden_states,
attention_mask,
position_ids,
None,
)
else:
layer_outputs = decoder_layer.forward(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[1:],)
if output_attentions:
all_self_attns += (layer_outputs[1],)
try:
hidden_states = np.asarray(hidden_states, hidden_states.dtype)
except Exception:
# Best-effort conversion; fall through if hidden_states cannot be viewed as a numpy array.
pass
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
next_cache = tuple(itertools.chain.from_iterable(next_cache))
print(f"Token generated in {time() - t1} seconds")
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_cache,
all_hidden_states,
all_self_attns,
]
if v is not None
)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
class EightLayerLayerSV(torch.nn.Module):
def __init__(self, layers):
super().__init__()
assert len(layers) == 8
self.layers = layers
def forward(
self,
hidden_states,
attention_mask,
position_ids,
pkv00,
pkv01,
pkv10,
pkv11,
pkv20,
pkv21,
pkv30,
pkv31,
pkv40,
pkv41,
pkv50,
pkv51,
pkv60,
pkv61,
pkv70,
pkv71,
):
pkvs = [
(pkv00, pkv01),
(pkv10, pkv11),
(pkv20, pkv21),
(pkv30, pkv31),
(pkv40, pkv41),
(pkv50, pkv51),
(pkv60, pkv61),
(pkv70, pkv71),
]
new_pkvs = []
for layer, pkv in zip(self.layers, pkvs):
outputs = layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=(
pkv[0],
pkv[1],
),
use_cache=True,
)
hidden_states = outputs[0]
new_pkvs.append(
(
outputs[-1][0],
outputs[-1][1],
)
)
(
(new_pkv00, new_pkv01),
(new_pkv10, new_pkv11),
(new_pkv20, new_pkv21),
(new_pkv30, new_pkv31),
(new_pkv40, new_pkv41),
(new_pkv50, new_pkv51),
(new_pkv60, new_pkv61),
(new_pkv70, new_pkv71),
) = new_pkvs
return (
hidden_states,
new_pkv00,
new_pkv01,
new_pkv10,
new_pkv11,
new_pkv20,
new_pkv21,
new_pkv30,
new_pkv31,
new_pkv40,
new_pkv41,
new_pkv50,
new_pkv51,
new_pkv60,
new_pkv61,
new_pkv70,
new_pkv71,
)
class EightLayerLayerFV(torch.nn.Module):
def __init__(self, layers):
super().__init__()
assert len(layers) == 8
self.layers = layers
def forward(self, hidden_states, attention_mask, position_ids):
new_pkvs = []
for layer in self.layers:
outputs = layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=None,
use_cache=True,
)
hidden_states = outputs[0]
new_pkvs.append(
(
outputs[-1][0],
outputs[-1][1],
)
)
(
(new_pkv00, new_pkv01),
(new_pkv10, new_pkv11),
(new_pkv20, new_pkv21),
(new_pkv30, new_pkv31),
(new_pkv40, new_pkv41),
(new_pkv50, new_pkv51),
(new_pkv60, new_pkv61),
(new_pkv70, new_pkv71),
) = new_pkvs
return (
hidden_states,
new_pkv00,
new_pkv01,
new_pkv10,
new_pkv11,
new_pkv20,
new_pkv21,
new_pkv30,
new_pkv31,
new_pkv40,
new_pkv41,
new_pkv50,
new_pkv51,
new_pkv60,
new_pkv61,
new_pkv70,
new_pkv71,
)
class CompiledEightLayerLayerSV(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(
self,
hidden_states,
attention_mask,
position_ids,
past_key_value,
output_attentions=False,
use_cache=True,
):
hidden_states = hidden_states.detach()
attention_mask = attention_mask.detach()
position_ids = position_ids.detach()
(
(pkv00, pkv01),
(pkv10, pkv11),
(pkv20, pkv21),
(pkv30, pkv31),
(pkv40, pkv41),
(pkv50, pkv51),
(pkv60, pkv61),
(pkv70, pkv71),
) = past_key_value
pkv00 = pkv00.detach()
pkv01 = pkv01.detach()
pkv10 = pkv10.detach()
pkv11 = pkv11.detach()
pkv20 = pkv20.detach()
pkv21 = pkv21.detach()
pkv30 = pkv30.detach()
pkv31 = pkv31.detach()
pkv40 = pkv40.detach()
pkv41 = pkv41.detach()
pkv50 = pkv50.detach()
pkv51 = pkv51.detach()
pkv60 = pkv60.detach()
pkv61 = pkv61.detach()
pkv70 = pkv70.detach()
pkv71 = pkv71.detach()
output = self.model(
"forward",
(
hidden_states,
attention_mask,
position_ids,
pkv00,
pkv01,
pkv10,
pkv11,
pkv20,
pkv21,
pkv30,
pkv31,
pkv40,
pkv41,
pkv50,
pkv51,
pkv60,
pkv61,
pkv70,
pkv71,
),
send_to_host=False,
)
return (
output[0],
(output[1][0], output[1][1]),
(output[2][0], output[2][1]),
(output[3][0], output[3][1]),
(output[4][0], output[4][1]),
(output[5][0], output[5][1]),
(output[6][0], output[6][1]),
(output[7][0], output[7][1]),
(output[8][0], output[8][1]),
)
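# Note: forward_compressed references self.config, self.embed_tokens and self.norm, which this
# wrapper does not define; it appears intended for the LlamaModel above rather than this class.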
def forward_compressed(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
)
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError(
"You have to specify either decoder_input_ids or decoder_inputs_embeds"
)
seq_length_with_past = seq_length
past_key_values_length = 0
if past_key_values is not None:
past_key_values_length = past_key_values[0][0].shape[2]
seq_length_with_past = seq_length_with_past + past_key_values_length
if position_ids is None:
device = (
input_ids.device if input_ids is not None else inputs_embeds.device
)
position_ids = torch.arange(
past_key_values_length,
seq_length + past_key_values_length,
dtype=torch.long,
device=device,
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
else:
position_ids = position_ids.view(-1, seq_length).long()
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
# embed positions
if attention_mask is None:
attention_mask = torch.ones(
(batch_size, seq_length_with_past),
dtype=torch.bool,
device=inputs_embeds.device,
)
attention_mask = self._prepare_decoder_attention_mask(
attention_mask,
(batch_size, seq_length),
inputs_embeds,
past_key_values_length,
)
hidden_states = inputs_embeds
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = () if use_cache else None
for idx, decoder_layer in enumerate(self.compressedlayers):
if output_hidden_states:
all_hidden_states += (hidden_states,)
past_key_value = (
past_key_values[8 * idx : 8 * (idx + 1)]
if past_key_values is not None
else None
)
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
# None for past_key_value
return module(*inputs, output_attentions, None)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(decoder_layer),
hidden_states,
attention_mask,
position_ids,
None,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (
layer_outputs[2 if output_attentions else 1],
)
if output_attentions:
all_self_attns += (layer_outputs[1],)
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_cache,
all_hidden_states,
all_self_attns,
]
if v is not None
)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
class CompiledEightLayerLayer(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(
self,
hidden_states,
attention_mask,
position_ids,
past_key_value=None,
output_attentions=False,
use_cache=True,
):
t2 = time()
if past_key_value is None:
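# Prefill path: call the compiled module's "first_vicuna_forward" entry point; results stay
# on device (send_to_host=False) and the 16 returned key/value tensors are regrouped into
# 8 (key, value) pairs.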
try:
hidden_states = np.asarray(hidden_states, hidden_states.dtype)
except Exception:
# Best-effort conversion; keep hidden_states as-is if it cannot be viewed as a numpy array.
pass
attention_mask = attention_mask.detach()
position_ids = position_ids.detach()
t1 = time()
output = self.model(
"first_vicuna_forward",
(hidden_states, attention_mask, position_ids),
send_to_host=False,
)
output2 = (
output[0],
(
output[1],
output[2],
),
(
output[3],
output[4],
),
(
output[5],
output[6],
),
(
output[7],
output[8],
),
(
output[9],
output[10],
),
(
output[11],
output[12],
),
(
output[13],
output[14],
),
(
output[15],
output[16],
),
)
return output2
else:
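# Decode path: unpack the 8 cached (key, value) pairs, detach everything, and call
# "second_vicuna_forward" with the tensors as flat positional inputs.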
(
(pkv00, pkv01),
(pkv10, pkv11),
(pkv20, pkv21),
(pkv30, pkv31),
(pkv40, pkv41),
(pkv50, pkv51),
(pkv60, pkv61),
(pkv70, pkv71),
) = past_key_value
try:
hidden_states = hidden_states.detach()
attention_mask = attention_mask.detach()
position_ids = position_ids.detach()
pkv00 = pkv00.detach()
pkv01 = pkv01.detach()
pkv10 = pkv10.detach()
pkv11 = pkv11.detach()
pkv20 = pkv20.detach()
pkv21 = pkv21.detach()
pkv30 = pkv30.detach()
pkv31 = pkv31.detach()
pkv40 = pkv40.detach()
pkv41 = pkv41.detach()
pkv50 = pkv50.detach()
pkv51 = pkv51.detach()
pkv60 = pkv60.detach()
pkv61 = pkv61.detach()
pkv70 = pkv70.detach()
pkv71 = pkv71.detach()
except Exception:
# Inputs may already be IREE device arrays without detach(); ignore and continue.
pass
t1 = time()
if type(hidden_states) == iree.runtime.array_interop.DeviceArray:
hidden_states = np.array(hidden_states, hidden_states.dtype)
hidden_states = torch.tensor(hidden_states)
hidden_states = hidden_states.detach()
output = self.model(
"second_vicuna_forward",
(
hidden_states,
attention_mask,
position_ids,
pkv00,
pkv01,
pkv10,
pkv11,
pkv20,
pkv21,
pkv30,
pkv31,
pkv40,
pkv41,
pkv50,
pkv51,
pkv60,
pkv61,
pkv70,
pkv71,
),
send_to_host=False,
)
print(f"{time() - t1}")
del pkv00
del pkv01
del pkv10
del pkv11
del pkv20
del pkv21
del pkv30
del pkv31
del pkv40
del pkv41
del pkv50
del pkv51
del pkv60
del pkv61
del pkv70
del pkv71
output2 = (
output[0],
(
output[1],
output[2],
),
(
output[3],
output[4],
),
(
output[5],
output[6],
),
(
output[7],
output[8],
),
(
output[9],
output[10],
),
(
output[11],
output[12],
),
(
output[13],
output[14],
),
(
output[15],
output[16],
),
)
return output2

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,231 +0,0 @@
import torch
class FirstVicunaLayer(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(self, hidden_states, attention_mask, position_ids):
outputs = self.model(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
use_cache=True,
)
next_hidden_states = outputs[0]
past_key_value_out0, past_key_value_out1 = (
outputs[-1][0],
outputs[-1][1],
)
return (
next_hidden_states,
past_key_value_out0,
past_key_value_out1,
)
class SecondVicunaLayer(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(
self,
hidden_states,
attention_mask,
position_ids,
past_key_value0,
past_key_value1,
):
outputs = self.model(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=(
past_key_value0,
past_key_value1,
),
use_cache=True,
)
next_hidden_states = outputs[0]
past_key_value_out0, past_key_value_out1 = (
outputs[-1][0],
outputs[-1][1],
)
return (
next_hidden_states,
past_key_value_out0,
past_key_value_out1,
)
class ShardedVicunaModel(torch.nn.Module):
def __init__(self, model, layers, lmhead, embedding, norm):
super().__init__()
self.model = model
# assert len(layers) == len(model.model.layers)
self.model.model.config.use_cache = True
self.model.model.config.output_attentions = False
self.layers = layers
self.norm = norm
self.embedding = embedding
self.lmhead = lmhead
self.model.model.norm = self.norm
self.model.model.embed_tokens = self.embedding
self.model.lm_head = self.lmhead
self.model.model.layers = torch.nn.modules.container.ModuleList(
self.layers
)
def forward(
self,
input_ids,
is_first=True,
past_key_values=None,
attention_mask=None,
):
return self.model.forward(
input_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
)
class LMHead(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(self, hidden_states):
output = self.model(hidden_states)
return output
class LMHeadCompiled(torch.nn.Module):
def __init__(self, shark_module):
super().__init__()
self.model = shark_module
def forward(self, hidden_states):
hidden_states = hidden_states.detach()
output = self.model("forward", (hidden_states,))
output = torch.tensor(output)
return output
class VicunaNorm(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(self, hidden_states):
output = self.model(hidden_states)
return output
class VicunaNormCompiled(torch.nn.Module):
def __init__(self, shark_module):
super().__init__()
self.model = shark_module
def forward(self, hidden_states):
try:
hidden_states.detach()
except Exception:
pass
output = self.model("forward", (hidden_states,))
output = torch.tensor(output)
return output
class VicunaEmbedding(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(self, input_ids):
output = self.model(input_ids)
return output
class VicunaEmbeddingCompiled(torch.nn.Module):
def __init__(self, shark_module):
super().__init__()
self.model = shark_module
def forward(self, input_ids):
input_ids.detach()
output = self.model("forward", (input_ids,))
output = torch.tensor(output)
return output
class CompiledVicunaLayer(torch.nn.Module):
def __init__(self, shark_module):
super().__init__()
self.model = shark_module
def forward(
self,
hidden_states,
attention_mask,
position_ids,
past_key_value=None,
output_attentions=False,
use_cache=True,
):
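# Dispatch to the compiled module: "first_vicuna_forward" handles the prefill step (no cache),
# "second_vicuna_forward" handles decode steps with the cached (key, value) pair.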
if past_key_value is None:
hidden_states = hidden_states.detach()
attention_mask = attention_mask.detach()
position_ids = position_ids.detach()
output = self.model(
"first_vicuna_forward",
(
hidden_states,
attention_mask,
position_ids,
),
)
output0 = torch.tensor(output[0])
output1 = torch.tensor(output[1])
output2 = torch.tensor(output[2])
return (
output0,
(
output1,
output2,
),
)
else:
hidden_states = hidden_states.detach()
attention_mask = attention_mask.detach()
position_ids = position_ids.detach()
pkv0 = past_key_value[0].detach()
pkv1 = past_key_value[1].detach()
output = self.model(
"second_vicuna_forward",
(
hidden_states,
attention_mask,
position_ids,
pkv0,
pkv1,
),
)
output0 = torch.tensor(output[0])
output1 = torch.tensor(output[1])
output2 = torch.tensor(output[2])
return (
output0,
(
output1,
output2,
),
)


@@ -1,44 +0,0 @@
from abc import ABC, abstractmethod
class SharkLLMBase(ABC):
def __init__(
self,
model_name,
hf_model_path=None,
max_num_tokens=512,
) -> None:
self.model_name = model_name
self.hf_model_path = hf_model_path
self.max_num_tokens = max_num_tokens
self.shark_model = None
self.device = "cpu"
self.precision = "fp32"
@classmethod
@abstractmethod
def compile(self):
pass
@classmethod
@abstractmethod
def generate(self, prompt):
pass
@classmethod
@abstractmethod
def generate_new_token(self, params):
pass
@classmethod
@abstractmethod
def get_tokenizer(self):
pass
@classmethod
@abstractmethod
def get_src_model(self):
pass
def load_init_from_config(self):
pass

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,68 +0,0 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
from omegaconf import OmegaConf
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
class BaseProcessor:
def __init__(self):
self.transform = lambda x: x
return
def __call__(self, item):
return self.transform(item)
@classmethod
def from_config(cls, cfg=None):
return cls()
def build(self, **kwargs):
cfg = OmegaConf.create(kwargs)
return self.from_config(cfg)
class BlipImageBaseProcessor(BaseProcessor):
def __init__(self, mean=None, std=None):
if mean is None:
mean = (0.48145466, 0.4578275, 0.40821073)
if std is None:
std = (0.26862954, 0.26130258, 0.27577711)
self.normalize = transforms.Normalize(mean, std)
class Blip2ImageEvalProcessor(BlipImageBaseProcessor):
def __init__(self, image_size=224, mean=None, std=None):
super().__init__(mean=mean, std=std)
self.transform = transforms.Compose(
[
transforms.Resize(
(image_size, image_size),
interpolation=InterpolationMode.BICUBIC,
),
transforms.ToTensor(),
self.normalize,
]
)
def __call__(self, item):
return self.transform(item)
@classmethod
def from_config(cls, cfg=None):
if cfg is None:
cfg = OmegaConf.create()
image_size = cfg.get("image_size", 224)
mean = cfg.get("mean", None)
std = cfg.get("std", None)
return cls(image_size=image_size, mean=mean, std=std)


@@ -1,5 +0,0 @@
datasets:
cc_sbu_align:
data_type: images
build_info:
storage: /path/to/cc_sbu_align/


@@ -1,33 +0,0 @@
model:
arch: mini_gpt4
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
freeze_qformer: True
# Q-Former
num_query_token: 32
# Vicuna
llama_model: "lmsys/vicuna-7b-v1.3"
# generation configs
prompt: ""
preprocess:
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"


@@ -1,25 +0,0 @@
model:
arch: mini_gpt4
model_type: pretrain_vicuna
freeze_vit: True
freeze_qformer: True
max_txt_len: 160
end_sym: "###"
low_resource: False
prompt_path: "apps/language_models/src/pipelines/minigpt4_utils/prompts/alignment.txt"
prompt_template: '###Human: {} ###Assistant: '
ckpt: 'prerained_minigpt4_7b.pth'
datasets:
cc_sbu_align:
vis_processor:
train:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
run:
task: image_text_pretrain


@@ -1,629 +0,0 @@
# Based on EVA, BEIT, timm and DeiT code bases
# https://github.com/baaivision/EVA
# https://github.com/rwightman/pytorch-image-models/tree/master/timm
# https://github.com/microsoft/unilm/tree/master/beit
# https://github.com/facebookresearch/deit/
# https://github.com/facebookresearch/dino
# --------------------------------------------------------'
import math
import requests
from functools import partial
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import drop_path, to_2tuple, trunc_normal_
def _cfg(url="", **kwargs):
return {
"url": url,
"num_classes": 1000,
"input_size": (3, 224, 224),
"pool_size": None,
"crop_pct": 0.9,
"interpolation": "bicubic",
"mean": (0.5, 0.5, 0.5),
"std": (0.5, 0.5, 0.5),
**kwargs,
}
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class Mlp(nn.Module):
def __init__(
self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.0,
):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
# x = self.drop(x)
# re-enable the dropout above for the original BERT implementation
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Module):
def __init__(
self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.0,
proj_drop=0.0,
window_size=None,
attn_head_dim=None,
):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
if attn_head_dim is not None:
head_dim = attn_head_dim
all_head_dim = head_dim * self.num_heads
self.scale = qk_scale or head_dim**-0.5
self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
if qkv_bias:
self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
else:
self.q_bias = None
self.v_bias = None
if window_size:
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0] - 1) * (
2 * window_size[1] - 1
) + 3
self.relative_position_bias_table = nn.Parameter(
torch.zeros(self.num_relative_distance, num_heads)
) # 2*Wh-1 * 2*Ww-1, nH
# cls to token & token 2 cls & cls to cls
# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(window_size[0])
coords_w = torch.arange(window_size[1])
coords = torch.stack(
torch.meshgrid([coords_h, coords_w])
) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = (
coords_flatten[:, :, None] - coords_flatten[:, None, :]
) # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(
1, 2, 0
).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += (
window_size[0] - 1
) # shift to start from 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = torch.zeros(
size=(window_size[0] * window_size[1] + 1,) * 2,
dtype=relative_coords.dtype,
)
relative_position_index[1:, 1:] = relative_coords.sum(
-1
) # Wh*Ww, Wh*Ww
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer(
"relative_position_index", relative_position_index
)
else:
self.window_size = None
self.relative_position_bias_table = None
self.relative_position_index = None
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(all_head_dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, rel_pos_bias=None):
B, N, C = x.shape
qkv_bias = None
if self.q_bias is not None:
qkv_bias = torch.cat(
(
self.q_bias,
torch.zeros_like(self.v_bias, requires_grad=False),
self.v_bias,
)
)
# qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
q, k, v = (
qkv[0],
qkv[1],
qkv[2],
) # make torchscript happy (cannot use tensor as tuple)
q = q * self.scale
attn = q @ k.transpose(-2, -1)
if self.relative_position_bias_table is not None:
relative_position_bias = self.relative_position_bias_table[
self.relative_position_index.view(-1)
].view(
self.window_size[0] * self.window_size[1] + 1,
self.window_size[0] * self.window_size[1] + 1,
-1,
) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.permute(
2, 0, 1
).contiguous() # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)
if rel_pos_bias is not None:
attn = attn + rel_pos_bias
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
x = self.proj(x)
x = self.proj_drop(x)
return x
class Block(nn.Module):
def __init__(
self,
dim,
num_heads,
mlp_ratio=4.0,
qkv_bias=False,
qk_scale=None,
drop=0.0,
attn_drop=0.0,
drop_path=0.0,
init_values=None,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm,
window_size=None,
attn_head_dim=None,
):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop,
window_size=window_size,
attn_head_dim=attn_head_dim,
)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = (
DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
)
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop,
)
if init_values is not None and init_values > 0:
self.gamma_1 = nn.Parameter(
init_values * torch.ones((dim)), requires_grad=True
)
self.gamma_2 = nn.Parameter(
init_values * torch.ones((dim)), requires_grad=True
)
else:
self.gamma_1, self.gamma_2 = None, None
def forward(self, x, rel_pos_bias=None):
if self.gamma_1 is None:
x = x + self.drop_path(
self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
)
x = x + self.drop_path(self.mlp(self.norm2(x)))
else:
x = x + self.drop_path(
self.gamma_1
* self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
)
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
class PatchEmbed(nn.Module):
"""Image to Patch Embedding"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
num_patches = (img_size[1] // patch_size[1]) * (
img_size[0] // patch_size[0]
)
self.patch_shape = (
img_size[0] // patch_size[0],
img_size[1] // patch_size[1],
)
self.img_size = img_size
self.patch_size = patch_size
self.num_patches = num_patches
self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
)
def forward(self, x, **kwargs):
B, C, H, W = x.shape
# FIXME look at relaxing size constraints
assert (
H == self.img_size[0] and W == self.img_size[1]
), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x).flatten(2).transpose(1, 2)
return x
class RelativePositionBias(nn.Module):
def __init__(self, window_size, num_heads):
super().__init__()
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0] - 1) * (
2 * window_size[1] - 1
) + 3
self.relative_position_bias_table = nn.Parameter(
torch.zeros(self.num_relative_distance, num_heads)
) # 2*Wh-1 * 2*Ww-1, nH
# cls to token & token 2 cls & cls to cls
# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(window_size[0])
coords_w = torch.arange(window_size[1])
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = (
coords_flatten[:, :, None] - coords_flatten[:, None, :]
) # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(
1, 2, 0
).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = torch.zeros(
size=(window_size[0] * window_size[1] + 1,) * 2,
dtype=relative_coords.dtype,
)
relative_position_index[1:, 1:] = relative_coords.sum(
-1
) # Wh*Ww, Wh*Ww
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer(
"relative_position_index", relative_position_index
)
# trunc_normal_(self.relative_position_bias_table, std=.02)
def forward(self):
relative_position_bias = self.relative_position_bias_table[
self.relative_position_index.view(-1)
].view(
self.window_size[0] * self.window_size[1] + 1,
self.window_size[0] * self.window_size[1] + 1,
-1,
) # Wh*Ww,Wh*Ww,nH
return relative_position_bias.permute(
2, 0, 1
).contiguous() # nH, Wh*Ww, Wh*Ww
class VisionTransformer(nn.Module):
"""Vision Transformer with support for patch or hybrid CNN input stage"""
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
num_classes=1000,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.0,
qkv_bias=False,
qk_scale=None,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.0,
norm_layer=nn.LayerNorm,
init_values=None,
use_abs_pos_emb=True,
use_rel_pos_bias=False,
use_shared_rel_pos_bias=False,
use_mean_pooling=True,
init_scale=0.001,
use_checkpoint=False,
):
super().__init__()
self.image_size = img_size
self.num_classes = num_classes
self.num_features = (
self.embed_dim
) = embed_dim # num_features for consistency with other models
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
)
num_patches = self.patch_embed.num_patches
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
if use_abs_pos_emb:
self.pos_embed = nn.Parameter(
torch.zeros(1, num_patches + 1, embed_dim)
)
else:
self.pos_embed = None
self.pos_drop = nn.Dropout(p=drop_rate)
if use_shared_rel_pos_bias:
self.rel_pos_bias = RelativePositionBias(
window_size=self.patch_embed.patch_shape, num_heads=num_heads
)
else:
self.rel_pos_bias = None
self.use_checkpoint = use_checkpoint
dpr = [
x.item() for x in torch.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule
self.use_rel_pos_bias = use_rel_pos_bias
self.blocks = nn.ModuleList(
[
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
init_values=init_values,
window_size=self.patch_embed.patch_shape
if use_rel_pos_bias
else None,
)
for i in range(depth)
]
)
# self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
# self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
# self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
if self.pos_embed is not None:
trunc_normal_(self.pos_embed, std=0.02)
trunc_normal_(self.cls_token, std=0.02)
# trunc_normal_(self.mask_token, std=.02)
# if isinstance(self.head, nn.Linear):
# trunc_normal_(self.head.weight, std=.02)
self.apply(self._init_weights)
self.fix_init_weight()
# if isinstance(self.head, nn.Linear):
# self.head.weight.data.mul_(init_scale)
# self.head.bias.data.mul_(init_scale)
def fix_init_weight(self):
def rescale(param, layer_id):
param.div_(math.sqrt(2.0 * layer_id))
for layer_id, layer in enumerate(self.blocks):
rescale(layer.attn.proj.weight.data, layer_id + 1)
rescale(layer.mlp.fc2.weight.data, layer_id + 1)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=0.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
def get_classifier(self):
return self.head
def reset_classifier(self, num_classes, global_pool=""):
self.num_classes = num_classes
self.head = (
nn.Linear(self.embed_dim, num_classes)
if num_classes > 0
else nn.Identity()
)
def forward_features(self, x):
x = self.patch_embed(x)
batch_size, seq_len, _ = x.size()
cls_tokens = self.cls_token.expand(
batch_size, -1, -1
) # stole cls_tokens impl from Phil Wang, thanks
x = torch.cat((cls_tokens, x), dim=1)
if self.pos_embed is not None:
x = x + self.pos_embed
x = self.pos_drop(x)
rel_pos_bias = (
self.rel_pos_bias() if self.rel_pos_bias is not None else None
)
for blk in self.blocks:
if self.use_checkpoint:
x = checkpoint.checkpoint(blk, x, rel_pos_bias)
else:
x = blk(x, rel_pos_bias)
return x
# x = self.norm(x)
# if self.fc_norm is not None:
# t = x[:, 1:, :]
# return self.fc_norm(t.mean(1))
# else:
# return x[:, 0]
def forward(self, x):
x = self.forward_features(x)
# x = self.head(x)
return x
def get_intermediate_layers(self, x):
x = self.patch_embed(x)
batch_size, seq_len, _ = x.size()
cls_tokens = self.cls_token.expand(
batch_size, -1, -1
) # stole cls_tokens impl from Phil Wang, thanks
x = torch.cat((cls_tokens, x), dim=1)
if self.pos_embed is not None:
x = x + self.pos_embed
x = self.pos_drop(x)
features = []
rel_pos_bias = (
self.rel_pos_bias() if self.rel_pos_bias is not None else None
)
for blk in self.blocks:
x = blk(x, rel_pos_bias)
features.append(x)
return features
def interpolate_pos_embed(model, checkpoint_model):
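# Resize a checkpoint's position-embedding grid to this model's patch grid via bicubic
# interpolation, keeping the class/extra tokens unchanged.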
if "pos_embed" in checkpoint_model:
pos_embed_checkpoint = checkpoint_model["pos_embed"].float()
embedding_size = pos_embed_checkpoint.shape[-1]
num_patches = model.patch_embed.num_patches
num_extra_tokens = model.pos_embed.shape[-2] - num_patches
# height (== width) for the checkpoint position embedding
orig_size = int(
(pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5
)
# height (== width) for the new position embedding
new_size = int(num_patches**0.5)
# class_token and dist_token are kept unchanged
if orig_size != new_size:
print(
"Position interpolate from %dx%d to %dx%d"
% (orig_size, orig_size, new_size, new_size)
)
extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
# only the position tokens are interpolated
pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
pos_tokens = pos_tokens.reshape(
-1, orig_size, orig_size, embedding_size
).permute(0, 3, 1, 2)
pos_tokens = torch.nn.functional.interpolate(
pos_tokens,
size=(new_size, new_size),
mode="bicubic",
align_corners=False,
)
pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
checkpoint_model["pos_embed"] = new_pos_embed
def convert_weights_to_fp16(model: nn.Module):
"""Convert applicable model parameters to fp16"""
def _convert_weights_to_fp16(l):
if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
# l.weight.data = l.weight.data.half()
l.weight.data = l.weight.data
if l.bias is not None:
# l.bias.data = l.bias.data.half()
l.bias.data = l.bias.data
# if isinstance(l, (nn.MultiheadAttention, Attention)):
# for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
# tensor = getattr(l, attr)
# if tensor is not None:
# tensor.data = tensor.data.half()
model.apply(_convert_weights_to_fp16)
def create_eva_vit_g(
img_size=224, drop_path_rate=0.4, use_checkpoint=False, precision="fp16"
):
model = VisionTransformer(
img_size=img_size,
patch_size=14,
use_mean_pooling=False,
embed_dim=1408,
depth=39,
num_heads=1408 // 88,
mlp_ratio=4.3637,
qkv_bias=True,
drop_path_rate=drop_path_rate,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
use_checkpoint=use_checkpoint,
)
url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/eva_vit_g.pth"
local_filename = "eva_vit_g.pth"
response = requests.get(url)
if response.status_code == 200:
with open(local_filename, "wb") as f:
f.write(response.content)
print("File downloaded successfully.")
state_dict = torch.load(local_filename, map_location="cpu")
interpolate_pos_embed(model, state_dict)
incompatible_keys = model.load_state_dict(state_dict, strict=False)
if precision == "fp16":
# model.to("cuda")
convert_weights_to_fp16(model)
return model
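# Illustrative addition (not part of the original change): building the encoder and
# running a dummy batch through it. Note that create_eva_vit_g() re-downloads the
# multi-GB checkpoint on every call as written, so this is only a sketch.
def _example_create_eva_vit_g():
    vit = create_eva_vit_g(img_size=224, drop_path_rate=0.0, precision="fp32")
    images = torch.randn(2, 3, 224, 224)
    features = vit(images)
    # 224 / 14 = 16 patches per side -> 256 patch tokens + 1 cls token, embed_dim 1408
    print(features.shape)  # torch.Size([2, 257, 1408])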


@@ -1,4 +0,0 @@
<Img><ImageHere></Img> Describe this image in detail.
<Img><ImageHere></Img> Take a look at this image and describe what you notice.
<Img><ImageHere></Img> Please provide a detailed description of the picture.
<Img><ImageHere></Img> Could you describe the contents of this image for me?


@@ -1,187 +0,0 @@
import torch
import torch_mlir
from transformers import AutoTokenizer, StoppingCriteria, AutoModelForCausalLM
from io import BytesIO
from pathlib import Path
from apps.language_models.utils import (
get_torch_mlir_module_bytecode,
get_vmfb_from_path,
)
from apps.language_models.src.pipelines.SharkLLMBase import SharkLLMBase
from apps.language_models.src.model_wrappers.stablelm_model import (
StableLMModel,
)
class StopOnTokens(StoppingCriteria):
def __call__(
self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
) -> bool:
stop_ids = [50278, 50279, 50277, 1, 0]
for stop_id in stop_ids:
if input_ids[0][-1] == stop_id:
return True
return False
class SharkStableLM(SharkLLMBase):
def __init__(
self,
model_name,
hf_model_path="stabilityai/stablelm-tuned-alpha-3b",
max_num_tokens=512,
device="cuda",
precision="fp32",
debug="False",
) -> None:
super().__init__(model_name, hf_model_path, max_num_tokens)
self.max_sequence_len = 256
self.device = device
self.precision = precision
self.debug = debug
self.tokenizer = self.get_tokenizer()
self.shark_model = self.compile()
def shouldStop(self, tokens):
stop_ids = [50278, 50279, 50277, 1, 0]
for stop_id in stop_ids:
if tokens[0][-1] == stop_id:
return True
return False
def get_src_model(self):
model = AutoModelForCausalLM.from_pretrained(
self.hf_model_path, torch_dtype=torch.float32
)
return model
def get_model_inputs(self):
input_ids = torch.randint(3, (1, self.max_sequence_len))
attention_mask = torch.randint(3, (1, self.max_sequence_len))
return input_ids, attention_mask
def compile(self):
tmp_model_name = (
f"stableLM_linalg_{self.precision}_seqLen{self.max_sequence_len}"
)
# device = "cuda" # "cpu"
# TODO: vmfb and mlir name should include precision and device
model_vmfb_name = None
vmfb_path = (
Path(tmp_model_name + f"_{self.device}.vmfb")
if model_vmfb_name is None
else Path(model_vmfb_name)
)
shark_module = get_vmfb_from_path(
vmfb_path, self.device, mlir_dialect="tm_tensor"
)
if shark_module is not None:
return shark_module
mlir_path = Path(tmp_model_name + ".mlir")
print(
f"[DEBUG] mlir path {mlir_path} {'exists' if mlir_path.exists() else 'does not exist'}"
)
if mlir_path.exists():
with open(mlir_path, "rb") as f:
bytecode = f.read()
else:
model = StableLMModel(self.get_src_model())
model_inputs = self.get_model_inputs()
ts_graph = get_torch_mlir_module_bytecode(model, model_inputs)
module = torch_mlir.compile(
ts_graph,
[*model_inputs],
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
bytecode_stream = BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(tmp_model_name + ".mlir", "wb")
f_.write(bytecode)
print("Saved mlir")
f_.close()
from shark.shark_inference import SharkInference
shark_module = SharkInference(
mlir_module=bytecode, device=self.device, mlir_dialect="tm_tensor"
)
shark_module.compile()
path = shark_module.save_module(
vmfb_path.parent.absolute(), vmfb_path.stem, debug=self.debug
)
print("Saved vmfb at ", str(path))
return shark_module
def get_tokenizer(self):
tok = AutoTokenizer.from_pretrained(self.hf_model_path)
tok.add_special_tokens({"pad_token": "<PAD>"})
# print("[DEBUG] Sucessfully loaded the tokenizer to the memory")
return tok
def generate(self, prompt):
words_list = []
for i in range(self.max_num_tokens):
params = {
"new_text": prompt,
}
generated_token_op = self.generate_new_token(params)
detok = generated_token_op["detok"]
stop_generation = generated_token_op["stop_generation"]
if stop_generation:
break
print(detok, end="", flush=True) # this is for CLI and DEBUG
words_list.append(detok)
if detok == "":
break
prompt = prompt + detok
return words_list
def generate_new_token(self, params):
new_text = params["new_text"]
model_inputs = self.tokenizer(
[new_text],
padding="max_length",
max_length=self.max_sequence_len,
truncation=True,
return_tensors="pt",
)
sum_attentionmask = torch.sum(model_inputs.attention_mask)
output = self.shark_model(
"forward", [model_inputs.input_ids, model_inputs.attention_mask]
)
output = torch.from_numpy(output)
next_toks = torch.topk(output, 1)
stop_generation = False
if self.shouldStop(next_toks.indices):
stop_generation = True
new_token = next_toks.indices[0][int(sum_attentionmask) - 1]
detok = self.tokenizer.decode(
new_token,
skip_special_tokens=True,
)
ret_dict = {
"new_token": new_token,
"detok": detok,
"stop_generation": stop_generation,
}
return ret_dict
# Initialize a StopOnTokens object
system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""


@@ -1,48 +0,0 @@
import torch
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
from typing import List
from pathlib import Path
from shark.shark_downloader import download_public_file
# expects a Path / str as arg
# returns None if path not found or SharkInference module
def get_vmfb_from_path(vmfb_path, device, mlir_dialect, device_id=None):
if not isinstance(vmfb_path, Path):
vmfb_path = Path(vmfb_path)
from shark.shark_inference import SharkInference
if not vmfb_path.exists():
return None
print("Loading vmfb from: ", vmfb_path)
print("Device from get_vmfb_from_path - ", device)
shark_module = SharkInference(
None, device=device, mlir_dialect=mlir_dialect, device_idx=device_id
)
shark_module.load_module(vmfb_path)
print("Successfully loaded vmfb")
return shark_module
def get_vmfb_from_config(
shark_container,
model,
precision,
device,
vmfb_path,
padding=None,
device_id=None,
):
vmfb_url = (
f"gs://shark_tank/{shark_container}/{model}_{precision}_{device}"
)
if padding:
vmfb_url = vmfb_url + f"_{padding}"
vmfb_url = vmfb_url + ".vmfb"
download_public_file(vmfb_url, vmfb_path.absolute(), single_file=True)
return get_vmfb_from_path(
vmfb_path, device, "tm_tensor", device_id=device_id
)
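# Illustrative usage (added for reference): try a locally cached .vmfb first, and fall
# back to fetching one from the public shark_tank bucket. The container, model name
# and file name below are placeholders, not values used elsewhere in this change.
def _example_load_or_fetch_vmfb():
    vmfb_path = Path("llama2_7b_int4_cpu.vmfb")  # placeholder filename
    module = get_vmfb_from_path(vmfb_path, "cpu-task", mlir_dialect="tm_tensor")
    if module is None:
        module = get_vmfb_from_config(
            shark_container="llama2",  # placeholder gs://shark_tank folder
            model="llama2_7b",         # placeholder model name
            precision="int4",
            device="cpu",
            vmfb_path=vmfb_path,
        )
    return module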


@@ -0,0 +1,107 @@
# from turbine_models.custom_models.controlnet import control_adapter, preprocessors
import os
import PIL
import numpy as np
from apps.shark_studio.web.utils.file_utils import (
get_generated_imgs_path,
)
from datetime import datetime
from PIL import Image
from gradio.components.image_editor import (
EditorValue,
)
class control_adapter:
def __init__(
self,
model: str,
):
self.model = None
def export_control_adapter_model(model_keyword):
return None
def export_xl_control_adapter_model(model_keyword):
return None
class preprocessors:
def __init__(
self,
model: str,
):
self.model = None
def export_controlnet_model(model_keyword):
return None
control_adapter_map = {
"sd15": {
"canny": {"initializer": control_adapter.export_control_adapter_model},
"openpose": {"initializer": control_adapter.export_control_adapter_model},
"scribble": {"initializer": control_adapter.export_control_adapter_model},
"zoedepth": {"initializer": control_adapter.export_control_adapter_model},
},
"sdxl": {
"canny": {"initializer": control_adapter.export_xl_control_adapter_model},
},
}
preprocessor_model_map = {
"canny": {"initializer": preprocessors.export_controlnet_model},
"openpose": {"initializer": preprocessors.export_controlnet_model},
"scribble": {"initializer": preprocessors.export_controlnet_model},
"zoedepth": {"initializer": preprocessors.export_controlnet_model},
}
class PreprocessorModel:
def __init__(
self,
hf_model_id,
device="cpu",
):
self.model = hf_model_id
self.device = device
def compile(self):
print("compile not implemented for preprocessor.")
return
def run(self, inputs):
print("run not implemented for preprocessor.")
return inputs
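    def __call__(self, input_image, *args):
        # Assumed shim (not in the original change): cnet_preview below calls these
        # stub models directly, so forward to run() and ignore extra positional
        # arguments (e.g. canny thresholds) until real preprocessors are wired in.
        return self.run(input_image)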
def cnet_preview(model, input_image):
curr_datetime = datetime.now().strftime("%Y-%m-%d.%H-%M-%S")
control_imgs_path = os.path.join(get_generated_imgs_path(), "control_hints")
if not os.path.exists(control_imgs_path):
os.mkdir(control_imgs_path)
img_dest = os.path.join(control_imgs_path, model + curr_datetime + ".png")
match model:
case "canny":
canny = PreprocessorModel("canny")
result = canny(
np.array(input_image),
100,
200,
)
Image.fromarray(result).save(fp=img_dest)
return result, img_dest
case "openpose":
openpose = PreprocessorModel("openpose")
result = openpose(np.array(input_image))
Image.fromarray(result[0]).save(fp=img_dest)
return result, img_dest
case "zoedepth":
zoedepth = PreprocessorModel("ZoeDepth")
result = zoedepth(np.array(input_image))
Image.fromarray(result).save(fp=img_dest)
return result, img_dest
case "scribble":
input_image.save(fp=img_dest)
return input_image, img_dest
case _:
return None, None
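# Illustrative usage (added for reference): generating a control hint via the helper
# above. Only the "scribble" branch round-trips a real image while the other
# preprocessors remain stubs; the input image here is a placeholder.
def _example_cnet_preview():
    placeholder = Image.new("RGB", (512, 512))
    hint, hint_path = cnet_preview("scribble", placeholder)
    print(hint_path)  # .../generated_imgs/control_hints/scribble<timestamp>.png
    return hint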


@@ -0,0 +1,125 @@
import importlib
import os
import signal
import sys
import warnings
import json
from threading import Thread
from apps.shark_studio.modules.timer import startup_timer
from apps.shark_studio.web.utils.tmp_configs import (
config_tmp,
clear_tmp_mlir,
clear_tmp_imgs,
shark_tmp,
)
def imports():
import torch # noqa: F401
startup_timer.record("import torch")
warnings.filterwarnings(
action="ignore", category=DeprecationWarning, module="torch"
)
warnings.filterwarnings(action="ignore", category=UserWarning, module="torchvision")
warnings.filterwarnings(action="ignore", category=UserWarning, module="torch")
import gradio # noqa: F401
startup_timer.record("import gradio")
import apps.shark_studio.web.utils.globals as global_obj
global_obj._init()
startup_timer.record("initialize globals")
from apps.shark_studio.modules import (
img_processing,
) # noqa: F401
startup_timer.record("other imports")
def initialize():
configure_sigint_handler()
# Setup to use shark_tmp for gradio's temporary image files and clear any
# existing temporary images there if they exist. Then we can import gradio.
# It has to be in this order or gradio ignores what we've set up.
config_tmp()
# clear_tmp_mlir()
clear_tmp_imgs()
from apps.shark_studio.web.utils.file_utils import (
create_model_folders,
)
# Create custom models folders if they don't exist
create_model_folders()
import gradio as gr
# initialize_rest(reload_script_modules=False)
def initialize_rest(*, reload_script_modules=False):
"""
Called both from initialize() and when reloading the webui.
"""
# Keep this for adding reload options to the webUI.
def dumpstacks():
import threading
import traceback
id2name = {th.ident: th.name for th in threading.enumerate()}
code = []
for threadId, stack in sys._current_frames().items():
code.append(f"\n# Thread: {id2name.get(threadId, '')}({threadId})")
for filename, lineno, name, line in traceback.extract_stack(stack):
code.append(f"""File: "{filename}", line {lineno}, in {name}""")
if line:
code.append(" " + line.strip())
with open(os.path.join(shark_tmp, "stack_dump.log"), "w") as f:
f.write("\n".join(code))
def setup_middleware(app):
from starlette.middleware.gzip import GZipMiddleware
app.middleware_stack = (
None # reset current middleware to allow modifying user provided list
)
app.add_middleware(GZipMiddleware, minimum_size=1000)
configure_cors_middleware(app)
app.build_middleware_stack() # rebuild middleware stack on-the-fly
def configure_cors_middleware(app):
from starlette.middleware.cors import CORSMiddleware
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
cors_options = {
"allow_methods": ["*"],
"allow_headers": ["*"],
"allow_credentials": True,
}
if cmd_opts.api_accept_origin:
cors_options["allow_origins"] = cmd_opts.api_accept_origin.split(",")
app.add_middleware(CORSMiddleware, **cors_options)
def configure_sigint_handler():
# make the program just exit at ctrl+c without waiting for anything
def sigint_handler(sig, frame):
print(f"Interrupted with signal {sig} in {frame}")
dumpstacks()
os._exit(0)
signal.signal(signal.SIGINT, sigint_handler)
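# Illustrative usage (added for reference): the middleware helpers above are meant to
# wrap the FastAPI app that the web UI mounts; app creation here is only a sketch.
def _example_setup_middleware():
    from fastapi import FastAPI

    app = FastAPI()
    setup_middleware(app)  # adds GZip plus CORS configured from cmd_opts.api_accept_origin
    return app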


@@ -1,91 +1,475 @@
from turbine_models.custom_models import stateless_llama
from shark.iree_utils.compile_utils import get_iree_compiled_module
from apps.shark_studio.api.utils import get_resource_path
from turbine_models.model_runner import vmfbRunner
from turbine_models.gen_external_params.gen_external_params import gen_external_params
import time
from shark.iree_utils.compile_utils import compile_module_to_flatbuffer
from apps.shark_studio.web.utils.file_utils import (
get_resource_path,
get_checkpoints_path,
)
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
from apps.shark_studio.api.utils import parse_device
from urllib.request import urlopen
import iree.runtime as ireert
from itertools import chain
import gc
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
llm_model_map = {
"llama2_7b": {
"meta-llama/Llama-2-7b-chat-hf": {
"initializer": stateless_llama.export_transformer_model,
"hf_model_name": "meta-llama/Llama-2-7b-chat-hf",
"compile_flags": ["--iree-opt-const-expr-hoisting=False"],
"stop_token": 2,
"max_tokens": 4096,
}
"system_prompt": """<s>[INST] <<SYS>>Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>>""",
},
"Trelis/Llama-2-7b-chat-hf-function-calling-v2": {
"initializer": stateless_llama.export_transformer_model,
"hf_model_name": "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
"compile_flags": ["--iree-opt-const-expr-hoisting=False"],
"stop_token": 2,
"max_tokens": 4096,
"system_prompt": """<s>[INST] <<SYS>>Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>>""",
},
"TinyPixel/small-llama2": {
"initializer": stateless_llama.export_transformer_model,
"hf_model_name": "TinyPixel/small-llama2",
"compile_flags": ["--iree-opt-const-expr-hoisting=True"],
"stop_token": 2,
"max_tokens": 1024,
"system_prompt": """<s>[INST] <<SYS>>Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>>""",
},
}
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<s>", "</s>"
DEFAULT_CHAT_SYS_PROMPT = """<s>[INST] <<SYS>>
Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <</SYS>>\n\n
"""
def append_user_prompt(history, input_prompt):
user_prompt = f"{B_INST} {input_prompt} {E_INST}"
history += user_prompt
return history
class LanguageModel:
    def __init__(
        self,
        model_name,
        hf_auth_token=None,
        device=None,
        quantization="int4",
        precision="",
        external_weights=None,
        use_system_prompt=True,
        streaming_llm=False,
    ):
print(llm_model_map[model_name])
_, _, self.triple = parse_device(device)
self.hf_model_name = llm_model_map[model_name]["hf_model_name"]
self.device = device.split("=>")[-1].strip()
self.backend = self.device.split("://")[0]
self.driver = self.backend
if "cpu" in device:
self.device = "cpu"
self.backend = "llvm-cpu"
self.driver = "local-task"
print(f"Selected {self.backend} as IREE target backend.")
self.precision = "f32" if "cpu" in device else "f16"
self.quantization = quantization
self.safe_name = self.hf_model_name.replace("/", "_").replace("-", "_")
self.external_weight_file = None
# TODO: find a programmatic solution for model arch spec instead of hardcoding llama2
self.file_spec = "_".join(
[
self.safe_name,
self.precision,
]
)
if self.quantization != "None":
self.file_spec += "_" + self.quantization
if external_weights in ["safetensors", "gguf"]:
self.external_weight_file = get_resource_path(
os.path.join("..", self.file_spec + "." + external_weights)
)
else:
self.external_weights = None
self.external_weight_file = None
if streaming_llm:
# Add streaming suffix to file spec after setting external weights filename.
self.file_spec += "_streaming"
self.streaming_llm = streaming_llm
self.tempfile_name = get_resource_path(
os.path.join("..", f"{self.file_spec}.tempfile")
)
# TODO: Tag vmfb with target triple of device instead of HAL backend
self.vmfb_name = str(
get_resource_path(
os.path.join("..", f"{self.file_spec}_{self.backend}.vmfb.tempfile")
)
)
        self.max_tokens = llm_model_map[model_name]["max_tokens"]
self.use_system_prompt = use_system_prompt
self.global_iter = 0
self.prev_token_len = 0
self.first_input = True
self.hf_auth_token = hf_auth_token
if self.external_weight_file is not None:
if not os.path.exists(self.external_weight_file):
print(
f"External weight file {self.external_weight_file} does not exist. Generating..."
)
gen_external_params(
hf_model_name=self.hf_model_name,
quantization=self.quantization,
weight_path=self.external_weight_file,
hf_auth_token=hf_auth_token,
precision=self.precision,
)
else:
print(
f"External weight file {self.external_weight_file} found for {self.vmfb_name}"
)
self.external_weight_file = str(self.external_weight_file)
if os.path.exists(self.vmfb_name) and (
external_weights is None or os.path.exists(str(self.external_weight_file))
):
self.runner = vmfbRunner(
device=self.driver,
vmfb_path=self.vmfb_name,
external_weight_path=self.external_weight_file,
)
if self.streaming_llm:
self.model = self.runner.ctx.modules.streaming_state_update
else:
self.model = self.runner.ctx.modules.state_update
self.tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_name,
use_fast=False,
use_auth_token=hf_auth_token,
)
elif not os.path.exists(self.tempfile_name):
self.torch_ir, self.tokenizer = llm_model_map[self.hf_model_name][
"initializer"
](
self.hf_model_name,
hf_auth_token,
compile_to="torch",
external_weights=external_weights,
precision=self.precision,
quantization=self.quantization,
streaming_llm=self.streaming_llm,
decomp_attn=True,
)
with open(self.tempfile_name, "w+") as f:
f.write(self.torch_ir)
del self.torch_ir
gc.collect()
self.compile()
else:
self.tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_name,
use_fast=False,
use_auth_token=hf_auth_token,
)
self.compile()
# Reserved for running HF torch model as reference.
self.hf_mod = None
def compile(self) -> None:
# ONLY architecture/api-specific compile-time flags for each backend, if needed.
# hf_model_id-specific global flags currently in model map.
flags = []
if "cpu" in self.backend:
flags.extend(
[
"--iree-global-opt-enable-quantized-matmul-reassociation",
]
)
elif self.backend == "vulkan":
flags.extend(["--iree-stream-resource-max-allocation-size=4294967296"])
elif self.backend == "rocm":
flags.extend(
[
"--iree-codegen-llvmgpu-enable-transform-dialect-jit=false",
"--iree-llvmgpu-enable-prefetch=true",
"--iree-opt-outer-dim-concat=true",
"--iree-flow-enable-aggressive-fusion",
]
)
if "gfx9" in self.triple:
flags.extend(
[
f"--iree-codegen-transform-dialect-library={get_mfma_spec_path(self.triple, get_checkpoints_path())}",
"--iree-codegen-llvmgpu-use-vector-distribution=true",
]
)
flags.extend(llm_model_map[self.hf_model_name]["compile_flags"])
flatbuffer_blob = compile_module_to_flatbuffer(
self.tempfile_name,
device=self.device,
frontend="auto",
model_config_path=None,
extra_args=flags,
write_to=self.vmfb_name,
)
# TODO: delete the temp file
self.runner = vmfbRunner(
device=self.driver,
vmfb_path=self.vmfb_name,
external_weight_path=self.external_weight_file,
)
if self.streaming_llm:
self.model = self.runner.ctx.modules.streaming_state_update
else:
self.model = self.runner.ctx.modules.state_update
def sanitize_prompt(self, prompt):
if isinstance(prompt, list):
prompt = list(chain.from_iterable(prompt))
prompt = " ".join([x for x in prompt if isinstance(x, str)])
prompt = prompt.replace("\n", " ")
prompt = prompt.replace("\t", " ")
prompt = prompt.replace("\r", " ")
if self.use_system_prompt and self.global_iter == 0:
prompt = append_user_prompt(DEFAULT_CHAT_SYS_PROMPT, prompt)
return prompt
else:
return f"{B_INST} {prompt} {E_INST}"
def chat(self, prompt):
prompt = self.sanitize_prompt(prompt)
input_tensor = self.tokenizer(prompt, return_tensors="pt").input_ids
def format_out(results):
return torch.tensor(results.to_host()[0][0])
history = []
for iter in range(self.max_tokens):
input_tensor = self.tokenizer(
prompt, return_tensors="pt"
).input_ids
if self.streaming_llm:
token_slice = max(self.prev_token_len - 1, 0)
input_tensor = input_tensor[:, token_slice:]
if self.streaming_llm and self.model["get_seq_step"]() > 600:
print("Evicting cache space!")
self.model["evict_kvcache_space"]()
token_len = input_tensor.shape[-1]
device_inputs = [
ireert.asdevicearray(self.runner.config.device, input_tensor)
]
if self.first_input or not self.streaming_llm:
st_time = time.time()
token = self.model["run_initialize"](*device_inputs)
total_time = time.time() - st_time
token_len += 1
self.first_input = False
else:
st_time = time.time()
token = self.model["run_cached_initialize"](*device_inputs)
total_time = time.time() - st_time
token_len += 1
history.append(format_out(token))
while (
format_out(token) != llm_model_map[self.hf_model_name]["stop_token"]
and len(history) < self.max_tokens
):
dec_time = time.time()
if self.streaming_llm and self.model["get_seq_step"]() > 600:
print("Evicting cache space!")
self.model["evict_kvcache_space"]()
token = self.model["run_forward"](token)
history.append(format_out(token))
total_time = time.time() - dec_time
yield self.tokenizer.decode(history), total_time
if token == llm_model_map["llama2_7b"]["stop_token"]:
self.prev_token_len = token_len + len(history)
if format_out(token) == llm_model_map[self.hf_model_name]["stop_token"]:
break
for i in range(len(history)):
if type(history[i]) != int:
history[i] = int(history[i])
result_output = self.tokenizer.decode(history)
self.global_iter += 1
return result_output, total_time
# Reference HF model function for sanity checks.
def chat_hf(self, prompt):
if self.hf_mod is None:
self.hf_mod = AutoModelForCausalLM.from_pretrained(
self.hf_model_name,
torch_dtype=torch.float,
token=self.hf_auth_token,
)
prompt = self.sanitize_prompt(prompt)
input_tensor = self.tokenizer(prompt, return_tensors="pt").input_ids
history = []
for iter in range(self.max_tokens):
token_len = input_tensor.shape[-1]
if self.first_input:
st_time = time.time()
result = self.hf_mod(input_tensor)
token = torch.argmax(result.logits[:, -1, :], dim=1)
total_time = time.time() - st_time
token_len += 1
pkv = result.past_key_values
self.first_input = False
history.append(int(token))
while token != llm_model_map[self.hf_model_name]["stop_token"]:
dec_time = time.time()
result = self.hf_mod(token.reshape([1, 1]), past_key_values=pkv)
history.append(int(token))
total_time = time.time() - dec_time
token = torch.argmax(result.logits[:, -1, :], dim=1)
pkv = result.past_key_values
yield self.tokenizer.decode(history), total_time
self.prev_token_len = token_len + len(history)
if token == llm_model_map[self.hf_model_name]["stop_token"]:
break
for i in range(len(history)):
if type(history[i]) != int:
history[i] = int(history[i])
result_output = self.tokenizer.decode(history)
self.global_iter += 1
return result_output, total_time
def get_mfma_spec_path(target_chip, save_dir):
url = "https://raw.githubusercontent.com/iree-org/iree/main/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir"
attn_spec = urlopen(url).read().decode("utf-8")
spec_path = os.path.join(save_dir, "attention_and_matmul_spec_mfma.mlir")
if os.path.exists(spec_path):
return spec_path
with open(spec_path, "w") as f:
f.write(attn_spec)
return spec_path
def llm_chat_api(InputData: dict):
from datetime import datetime as dt
import apps.shark_studio.web.utils.globals as global_obj
print(f"Input keys : {InputData.keys()}")
# print(f"model : {InputData['model']}")
is_chat_completion_api = (
"messages" in InputData.keys()
) # else it is the legacy `completion` api
# For Debugging input data from API
if is_chat_completion_api:
print(f"message -> role : {InputData['messages'][0]['role']}")
print(f"message -> content : {InputData['messages'][0]['content']}")
else:
print(f"prompt : {InputData['prompt']}")
model_name = (
InputData["model"]
if "model" in InputData.keys()
else "meta-llama/Llama-2-7b-chat-hf"
)
model_path = llm_model_map[model_name]
device = InputData["device"] if "device" in InputData.keys() else "cpu"
precision = "fp16"
max_tokens = InputData["max_tokens"] if "max_tokens" in InputData.keys() else 4096
device_id = None
if not global_obj.get_llm_obj():
print("\n[LOG] Initializing new pipeline...")
global_obj.clear_cache()
gc.collect()
if "cuda" in device:
device = "cuda"
elif "vulkan" in device:
device_id = int(device.split("://")[1])
device = "vulkan"
elif "cpu" in device:
device = "cpu"
precision = "fp32"
else:
print("unrecognized device")
llm_model = LanguageModel(
model_name=model_name,
hf_auth_token=cmd_opts.hf_auth_token,
device=device,
quantization=cmd_opts.quantization,
external_weights="safetensors",
use_system_prompt=True,
streaming_llm=False,
)
global_obj.set_llm_obj(llm_model)
else:
llm_model = global_obj.get_llm_obj()
llm_model.max_tokens = max_tokens
# TODO: add role dict for different models
if is_chat_completion_api:
    # TODO: add functionality for multiple messages
prompt = append_user_prompt(
InputData["messages"][0]["role"], InputData["messages"][0]["content"]
)
else:
prompt = InputData["prompt"]
print("prompt = ", prompt)
for res_op, _ in llm_model.chat(prompt):
if is_chat_completion_api:
choices = [
{
"index": 0,
"message": {
"role": "assistant",
"content": res_op, # since we are yeilding the result
},
"finish_reason": "stop", # or length
}
]
else:
choices = [
{
"text": res_op,
"index": 0,
"logprobs": None,
"finish_reason": "stop", # or length
}
]
end_time = dt.now().strftime("%Y%m%d%H%M%S%f")
return {
"id": end_time,
"object": "chat.completion" if is_chat_completion_api else "text_completion",
"created": int(end_time),
"choices": choices,
}
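# Illustrative request payload (added for reference) in the shape llm_chat_api expects;
# all values below are placeholders. The first call builds and compiles the
# LanguageModel, so it is expensive.
def _example_llm_chat_request():
    return {
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "device": "cpu",
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "What is IREE?"}],
    }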
if __name__ == "__main__":
    lm = LanguageModel(
        "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
        hf_auth_token=None,
        device="cpu-task",
        external_weights="safetensors",
    )
    print("model loaded")
    for i in lm.chat("hi, what are you?"):
        print(i)

apps/shark_studio/api/sd.py

@@ -0,0 +1,505 @@
import gc
import torch
import gradio as gr
import time
import os
import json
import numpy as np
import copy
import importlib.util
import sys
from tqdm.auto import tqdm
from pathlib import Path
from random import randint
from turbine_models.custom_models.sd_inference.sd_pipeline import SharkSDPipeline
from turbine_models.custom_models.sdxl_inference.sdxl_compiled_pipeline import (
SharkSDXLPipeline,
)
from apps.shark_studio.api.controlnet import control_adapter_map
from apps.shark_studio.api.utils import parse_device
from apps.shark_studio.web.utils.state import status_label
from apps.shark_studio.web.utils.file_utils import (
safe_name,
get_resource_path,
get_checkpoints_path,
)
from apps.shark_studio.modules.img_processing import (
save_output_img,
)
from apps.shark_studio.modules.ckpt_processing import (
preprocessCKPT,
save_irpa,
)
EMPTY_SD_MAP = {
"clip": None,
"scheduler": None,
"unet": None,
"vae_decode": None,
}
EMPTY_SDXL_MAP = {
"prompt_encoder": None,
"scheduled_unet": None,
"vae_decode": None,
"pipeline": None,
"full_pipeline": None,
}
EMPTY_FLAGS = {
"clip": None,
"unet": None,
"vae": None,
"pipeline": None,
}
def load_script(source, module_name):
"""
reads file source and loads it as a module
:param source: file to load
:param module_name: name of module to register in sys.modules
:return: loaded module
"""
spec = importlib.util.spec_from_file_location(module_name, source)
module = importlib.util.module_from_spec(spec)
sys.modules[module_name] = module
spec.loader.exec_module(module)
return module
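# Illustrative usage (added for reference): loading a user-supplied pipeline script.
# The script name is hypothetical; StableDiffusion.__init__ below expects such a
# module to define StudioPipeline and MODEL_MAP.
def _example_load_custom_pipeline():
    custom_module = load_script(
        os.path.join(get_checkpoints_path("scripts"), "my_pipeline.py"),  # placeholder
        "custom_pipeline",
    )
    return custom_module.StudioPipeline, custom_module.MODEL_MAP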
class StableDiffusion:
# This class is responsible for executing image generation and creating
# /managing a set of compiled modules to run Stable Diffusion. The init
# aims to be as general as possible, and the class will infer and compile
# a list of necessary modules or a combined "pipeline module" for a
# specified job based on the inference task.
def __init__(
self,
base_model_id,
height: int,
width: int,
batch_size: int,
steps: int,
scheduler: str,
precision: str,
device: str,
target_triple: str = None,
custom_vae: str = None,
num_loras: int = 0,
import_ir: bool = True,
is_controlled: bool = False,
external_weights: str = "safetensors",
):
self.precision = precision
self.compiled_pipeline = False
self.base_model_id = base_model_id
self.custom_vae = custom_vae
self.is_sdxl = "xl" in self.base_model_id.lower()
self.is_custom = ".py" in self.base_model_id.lower()
if self.is_custom:
custom_module = load_script(
os.path.join(get_checkpoints_path("scripts"), self.base_model_id),
"custom_pipeline",
)
self.turbine_pipe = custom_module.StudioPipeline
self.model_map = custom_module.MODEL_MAP
elif self.is_sdxl:
self.turbine_pipe = SharkSDXLPipeline
self.model_map = EMPTY_SDXL_MAP
else:
self.turbine_pipe = SharkSDPipeline
self.model_map = EMPTY_SD_MAP
max_length = 64
target_backend, self.rt_device, triple = parse_device(device, target_triple)
pipe_id_list = [
safe_name(base_model_id),
str(batch_size),
str(max_length),
f"{str(height)}x{str(width)}",
precision,
triple,
]
if num_loras > 0:
pipe_id_list.append(str(num_loras) + "lora")
if is_controlled:
pipe_id_list.append("controlled")
if custom_vae:
pipe_id_list.append(custom_vae)
self.pipe_id = "_".join(pipe_id_list)
self.pipeline_dir = Path(os.path.join(get_checkpoints_path(), self.pipe_id))
self.weights_path = Path(
os.path.join(
get_checkpoints_path(), safe_name(self.base_model_id + "_" + precision)
)
)
if not os.path.exists(self.weights_path):
os.mkdir(self.weights_path)
decomp_attn = True
attn_spec = None
if triple in ["gfx940", "gfx942", "gfx90a"]:
decomp_attn = False
attn_spec = "mfma"
elif triple in ["gfx1100", "gfx1103", "gfx1150"]:
decomp_attn = False
attn_spec = "wmma"
if triple in ["gfx1103", "gfx1150"]:
# external weights have issues on igpu
external_weights = None
elif target_backend == "llvm-cpu":
decomp_attn = False
self.sd_pipe = self.turbine_pipe(
hf_model_name=base_model_id,
scheduler_id=scheduler,
height=height,
width=width,
precision=precision,
max_length=max_length,
batch_size=batch_size,
num_inference_steps=steps,
device=target_backend,
iree_target_triple=triple,
ireec_flags=EMPTY_FLAGS,
attn_spec=attn_spec,
decomp_attn=decomp_attn,
pipeline_dir=self.pipeline_dir,
external_weights_dir=self.weights_path,
external_weights=external_weights,
custom_vae=custom_vae,
)
print(f"\n[LOG] Pipeline initialized with pipe_id: {self.pipe_id}.")
gc.collect()
def prepare_pipe(
self, custom_weights, adapters, embeddings, is_img2img, compiled_pipeline
):
print(f"\n[LOG] Preparing pipeline...")
self.is_img2img = False
mlirs = copy.deepcopy(self.model_map)
vmfbs = copy.deepcopy(self.model_map)
weights = copy.deepcopy(self.model_map)
if not self.is_sdxl:
compiled_pipeline = False
self.compiled_pipeline = compiled_pipeline
if custom_weights:
custom_weights = os.path.join(
get_checkpoints_path("checkpoints"),
safe_name(self.base_model_id.split("/")[-1]),
custom_weights,
)
diffusers_weights_path = preprocessCKPT(custom_weights, self.precision)
for key in weights:
if key in ["scheduled_unet", "unet"]:
unet_weights_path = os.path.join(
diffusers_weights_path,
"unet",
"diffusion_pytorch_model.safetensors",
)
weights[key] = save_irpa(unet_weights_path, "unet.")
elif key in ["clip", "prompt_encoder"]:
if not self.is_sdxl:
sd1_path = os.path.join(
diffusers_weights_path, "text_encoder", "model.safetensors"
)
weights[key] = save_irpa(sd1_path, "text_encoder_model.")
else:
clip_1_path = os.path.join(
diffusers_weights_path, "text_encoder", "model.safetensors"
)
clip_2_path = os.path.join(
diffusers_weights_path,
"text_encoder_2",
"model.safetensors",
)
weights[key] = [
save_irpa(clip_1_path, "text_encoder_model_1."),
save_irpa(clip_2_path, "text_encoder_model_2."),
]
elif key in ["vae_decode"] and weights[key] is None:
vae_weights_path = os.path.join(
diffusers_weights_path,
"vae",
"diffusion_pytorch_model.safetensors",
)
weights[key] = save_irpa(vae_weights_path, "vae.")
vmfbs, weights = self.sd_pipe.check_prepared(
mlirs, vmfbs, weights, interactive=False
)
print(f"\n[LOG] Loading pipeline to device {self.rt_device}.")
self.sd_pipe.load_pipeline(
vmfbs, weights, self.rt_device, self.compiled_pipeline
)
print(
"\n[LOG] Pipeline successfully prepared for runtime. Generating images..."
)
return
def generate_images(
self,
prompt,
negative_prompt,
image,
strength,
guidance_scale,
seed,
ondemand,
resample_type,
control_mode,
hints,
):
img = self.sd_pipe.generate_images(
prompt,
negative_prompt,
1,
guidance_scale,
seed,
return_imgs=True,
)
return img
def shark_sd_fn_dict_input(
sd_kwargs: dict,
):
print("\n[LOG] Submitting Request...")
for key in sd_kwargs:
if sd_kwargs[key] in [None, []]:
sd_kwargs[key] = None
if sd_kwargs[key] in ["None"]:
sd_kwargs[key] = ""
if key == "seed":
sd_kwargs[key] = int(sd_kwargs[key])
# TODO: move these checks into the UI code so we don't have gradio warnings in a generalized dict input function.
if not sd_kwargs["device"]:
gr.Warning("No device specified. Please specify a device.")
return None, ""
if sd_kwargs["height"] not in [512, 1024]:
gr.Warning("Height must be 512 or 1024. This is a temporary limitation.")
return None, ""
if sd_kwargs["height"] != sd_kwargs["width"]:
gr.Warning("Height and width must be the same. This is a temporary limitation.")
return None, ""
if sd_kwargs["base_model_id"] == "stabilityai/sdxl-turbo":
if sd_kwargs["steps"] > 10:
gr.Warning("Max steps for sdxl-turbo is 10. 1 to 4 steps are recommended.")
return None, ""
if sd_kwargs["guidance_scale"] > 3:
gr.Warning(
"sdxl-turbo CFG scale should be less than 2.0 if using negative prompt, 0 otherwise."
)
return None, ""
if sd_kwargs["target_triple"] == "":
if parse_device(sd_kwargs["device"], sd_kwargs["target_triple"])[2] == "":
gr.Warning(
"Target device architecture could not be inferred. Please specify a target triple, e.g. 'gfx1100' for a Radeon 7900xtx."
)
return None, ""
generated_imgs = yield from shark_sd_fn(**sd_kwargs)
return generated_imgs
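# Illustrative input (added for reference): the dict passed to shark_sd_fn_dict_input
# mirrors shark_sd_fn's keyword arguments below; every value here is a placeholder,
# not a shipped default.
def _example_sd_kwargs():
    return {
        "prompt": "a photo of a corgi",
        "negative_prompt": "",
        "sd_init_image": [None],
        "height": 512,
        "width": 512,
        "steps": 20,
        "strength": 0.8,
        "guidance_scale": 7.5,
        "seed": "-1",
        "batch_count": 1,
        "batch_size": 1,
        "scheduler": "EulerDiscrete",
        "base_model_id": "stabilityai/stable-diffusion-2-1-base",
        "custom_weights": None,
        "custom_vae": None,
        "precision": "fp16",
        "device": "cpu-task",
        "target_triple": "x86_64-linux-gnu",
        "ondemand": False,
        "compiled_pipeline": False,
        "resample_type": "Nearest Neighbor",
        "controlnets": {},
        "embeddings": {},
    }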
def shark_sd_fn(
prompt,
negative_prompt,
sd_init_image: list,
height: int,
width: int,
steps: int,
strength: float,
guidance_scale: float,
seed: list,
batch_count: int,
batch_size: int,
scheduler: str,
base_model_id: str,
custom_weights: str,
custom_vae: str,
precision: str,
device: str,
target_triple: str,
ondemand: bool,
compiled_pipeline: bool,
resample_type: str,
controlnets: dict,
embeddings: dict,
):
sd_kwargs = locals()
if not isinstance(sd_init_image, list):
sd_init_image = [sd_init_image]
is_img2img = True if sd_init_image[0] is not None else False
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
import apps.shark_studio.web.utils.globals as global_obj
adapters = {}
is_controlled = False
control_mode = None
hints = []
num_loras = 0
import_ir = True
for i in embeddings:
num_loras += 1 if embeddings[i] else 0
if "model" in controlnets:
for i, model in enumerate(controlnets["model"]):
if "xl" not in base_model_id.lower():
adapters[f"control_adapter_{model}"] = {
"hf_id": control_adapter_map["runwayml/stable-diffusion-v1-5"][
model
],
"strength": controlnets["strength"][i],
}
else:
adapters[f"control_adapter_{model}"] = {
"hf_id": control_adapter_map["stabilityai/stable-diffusion-xl-1.0"][
model
],
"strength": controlnets["strength"][i],
}
if model is not None:
is_controlled = True
control_mode = controlnets["control_mode"]
for i in controlnets["hint"]:
            hints.append(i)
submit_pipe_kwargs = {
"base_model_id": base_model_id,
"height": height,
"width": width,
"batch_size": batch_size,
"precision": precision,
"device": device,
"target_triple": target_triple,
"custom_vae": custom_vae,
"num_loras": num_loras,
"import_ir": import_ir,
"is_controlled": is_controlled,
"steps": steps,
"scheduler": scheduler,
}
submit_prep_kwargs = {
"custom_weights": custom_weights,
"adapters": adapters,
"embeddings": embeddings,
"is_img2img": is_img2img,
"compiled_pipeline": compiled_pipeline,
}
submit_run_kwargs = {
"prompt": prompt,
"negative_prompt": negative_prompt,
"image": sd_init_image,
"strength": strength,
"guidance_scale": guidance_scale,
"seed": seed,
"ondemand": ondemand,
"resample_type": resample_type,
"control_mode": control_mode,
"hints": hints,
}
if (
not global_obj.get_sd_obj()
or global_obj.get_pipe_kwargs() != submit_pipe_kwargs
):
print("\n[LOG] Initializing new pipeline...")
global_obj.clear_cache()
gc.collect()
# Initializes the pipeline and retrieves IR based on all
# parameters that are static in the turbine output format,
# which is currently MLIR in the torch dialect.
sd_pipe = StableDiffusion(
**submit_pipe_kwargs,
)
global_obj.set_sd_obj(sd_pipe)
global_obj.set_pipe_kwargs(submit_pipe_kwargs)
if (
not global_obj.get_prep_kwargs()
or global_obj.get_prep_kwargs() != submit_prep_kwargs
):
global_obj.set_prep_kwargs(submit_prep_kwargs)
global_obj.get_sd_obj().prepare_pipe(**submit_prep_kwargs)
generated_imgs = []
for current_batch in range(batch_count):
start_time = time.time()
out_imgs = global_obj.get_sd_obj().generate_images(**submit_run_kwargs)
if not isinstance(out_imgs, list):
out_imgs = [out_imgs]
# total_time = time.time() - start_time
# text_output = f"Total image(s) generation time: {total_time:.4f}sec"
# print(f"\n[LOG] {text_output}")
# if global_obj.get_sd_status() == SD_STATE_CANCEL:
# break
# else:
for batch in range(batch_size):
save_output_img(
out_imgs[batch],
seed,
sd_kwargs,
)
generated_imgs.extend(out_imgs)
# TODO: make seed changes over batch counts more configurable.
submit_run_kwargs["seed"] = submit_run_kwargs["seed"] + 1
yield generated_imgs, status_label(
"Stable Diffusion", current_batch + 1, batch_count, batch_size
)
return (generated_imgs, "")
def unload_sd():
print("Unloading models.")
import apps.shark_studio.web.utils.globals as global_obj
global_obj.clear_cache()
gc.collect()
def cancel_sd():
print("Inject call to cancel longer API calls.")
return
def view_json_file(file_path):
content = ""
with open(file_path, "r") as fopen:
content = fopen.read()
return content
def safe_name(name):
return name.replace("/", "_").replace("\\", "_").replace(".", "_")
if __name__ == "__main__":
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
import apps.shark_studio.web.utils.globals as global_obj
global_obj._init()
sd_json = view_json_file(
get_resource_path(os.path.join(cmd_opts.config_dir, "default_sd_config.json"))
)
sd_kwargs = json.loads(sd_json)
for arg in vars(cmd_opts):
if arg in sd_kwargs:
sd_kwargs[arg] = getattr(cmd_opts, arg)
for i in shark_sd_fn_dict_input(sd_kwargs):
print(i)


@@ -1,14 +1,389 @@
import os
import sys
import numpy as np
import json
from random import (
randint,
seed as seed_random,
getstate as random_getstate,
setstate as random_setstate,
)
from pathlib import Path
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
from cpuinfo import get_cpu_info
# TODO: migrate these utils to studio
from shark.iree_utils.vulkan_utils import (
set_iree_vulkan_runtime_flags,
get_vulkan_target_triple,
get_iree_vulkan_runtime_flags,
)
def get_available_devices():
return ["cpu-task"]
def get_devices_by_name(driver_name):
from shark.iree_utils._common import iree_device_map
device_list = []
try:
driver_name = iree_device_map(driver_name)
device_list_dict = get_all_devices(driver_name)
print(f"{driver_name} devices are available.")
except:
print(f"{driver_name} devices are not available.")
else:
cpu_name = get_cpu_info()["brand_raw"]
for i, device in enumerate(device_list_dict):
device_name = (
cpu_name if device["name"] == "default" else device["name"]
)
if "local" in driver_name:
device_list.append(
f"{device_name} => {driver_name.replace('local', 'cpu')}"
)
else:
# for drivers with single devices
# let the default device be selected without any indexing
if len(device_list_dict) == 1:
device_list.append(f"{device_name} => {driver_name}")
else:
device_list.append(f"{device_name} => {driver_name}://{i}")
return device_list
set_iree_runtime_flags()
available_devices = []
rocm_devices = get_devices_by_name("rocm")
available_devices.extend(rocm_devices)
cpu_device = get_devices_by_name("cpu-sync")
available_devices.extend(cpu_device)
cpu_device = get_devices_by_name("cpu-task")
available_devices.extend(cpu_device)
from shark.iree_utils.vulkan_utils import (
get_all_vulkan_devices,
)
vulkaninfo_list = get_all_vulkan_devices()
vulkan_devices = []
id = 0
for device in vulkaninfo_list:
vulkan_devices.append(f"{device.strip()} => vulkan://{id}")
id += 1
if id != 0:
print(f"vulkan devices are available.")
available_devices.extend(vulkan_devices)
metal_devices = get_devices_by_name("metal")
available_devices.extend(metal_devices)
cuda_devices = get_devices_by_name("cuda")
available_devices.extend(cuda_devices)
hip_devices = get_devices_by_name("hip")
available_devices.extend(hip_devices)
for idx, device_str in enumerate(available_devices):
if "AMD Radeon(TM) Graphics =>" in device_str:
igpu_id_candidates = [
x.split("w/")[-1].split("=>")[0]
for x in available_devices
if "M Graphics" in x
]
for igpu_name in igpu_id_candidates:
if igpu_name:
available_devices[idx] = device_str.replace(
"AMD Radeon(TM) Graphics", igpu_name
)
break
return available_devices
def set_init_device_flags():
if "vulkan" in cmd_opts.device:
# set runtime flags for vulkan.
set_iree_runtime_flags()
# set triple flag to avoid multiple calls to get_vulkan_triple_flag
device_name, cmd_opts.device = map_device_to_name_path(cmd_opts.device)
if not cmd_opts.iree_vulkan_target_triple:
triple = get_vulkan_target_triple(device_name)
if triple is not None:
cmd_opts.iree_vulkan_target_triple = triple
print(
f"Found device {device_name}. Using target triple "
f"{cmd_opts.iree_vulkan_target_triple}."
)
elif "cuda" in cmd_opts.device:
cmd_opts.device = "cuda"
elif "metal" in cmd_opts.device:
device_name, cmd_opts.device = map_device_to_name_path(cmd_opts.device)
if not cmd_opts.iree_metal_target_platform:
from shark.iree_utils.metal_utils import get_metal_target_triple
triple = get_metal_target_triple(device_name)
if triple is not None:
cmd_opts.iree_metal_target_platform = triple.split("-")[-1]
print(
f"Found device {device_name}. Using target triple "
f"{cmd_opts.iree_metal_target_platform}."
)
elif "cpu" in cmd_opts.device:
cmd_opts.device = "cpu"
def set_iree_runtime_flags():
# TODO: This function should be device-agnostic and piped properly
# to general runtime driver init.
vulkan_runtime_flags = get_iree_vulkan_runtime_flags()
if cmd_opts.enable_rgp:
vulkan_runtime_flags += [
f"--enable_rgp=true",
f"--vulkan_debug_utils=true",
]
if cmd_opts.device_allocator_heap_key:
vulkan_runtime_flags += [
f"--device_allocator=caching:device_local={cmd_opts.device_allocator_heap_key}",
]
set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
def parse_device(device_str, target_override=""):
from shark.iree_utils.compile_utils import (
clean_device_info,
get_iree_target_triple,
iree_target_map,
)
rt_driver, device_id = clean_device_info(device_str)
target_backend = iree_target_map(rt_driver)
if device_id:
rt_device = f"{rt_driver}://{device_id}"
else:
rt_device = rt_driver
if target_override:
return target_backend, rt_device, target_override
match target_backend:
case "vulkan-spirv":
triple = get_iree_target_triple(device_str)
return target_backend, rt_device, triple
case "rocm":
triple = get_rocm_target_chip(device_str)
return target_backend, rt_device, triple
case "llvm-cpu":
return "llvm-cpu", "local-task", "x86_64-linux-gnu"
def get_rocm_target_chip(device_str):
# TODO: Use a data file to map device_str to target chip.
rocm_chip_map = {
"6700": "gfx1031",
"6800": "gfx1030",
"6900": "gfx1030",
"7900": "gfx1100",
"MI300X": "gfx942",
"MI300A": "gfx940",
"MI210": "gfx90a",
"MI250": "gfx90a",
"MI100": "gfx908",
"MI50": "gfx906",
"MI60": "gfx906",
"780M": "gfx1103",
}
for key in rocm_chip_map:
if key in device_str:
return rocm_chip_map[key]
raise AssertionError(
f"Device {device_str} not recognized. Please file an issue at https://github.com/nod-ai/SHARK/issues."
)
def get_all_devices(driver_name):
"""
Inputs: driver_name
Returns a list of all the available devices for a given driver sorted by
the iree path names of the device as in --list_devices option in iree.
"""
from iree.runtime import get_driver
driver = get_driver(driver_name)
device_list_src = driver.query_available_devices()
device_list_src.sort(key=lambda d: d["path"])
return device_list_src
def get_device_mapping(driver, key_combination=3):
"""This method ensures consistent device ordering when choosing
specific devices for execution
Args:
driver (str): execution driver (vulkan, cuda, rocm, etc)
key_combination (int, optional): choice for mapping value for
device name.
1 : path
2 : name
3 : (name, path)
Defaults to 3.
Returns:
dict: map to possible device names user can input mapped to desired
combination of name/path.
"""
from shark.iree_utils._common import iree_device_map
driver = iree_device_map(driver)
device_list = get_all_devices(driver)
device_map = dict()
def get_output_value(dev_dict):
if key_combination == 1:
return f"{driver}://{dev_dict['path']}"
if key_combination == 2:
return dev_dict["name"]
if key_combination == 3:
return dev_dict["name"], f"{driver}://{dev_dict['path']}"
# mapping driver name to default device (driver://0)
device_map[f"{driver}"] = get_output_value(device_list[0])
for i, device in enumerate(device_list):
# mapping with index
device_map[f"{driver}://{i}"] = get_output_value(device)
# mapping with full path
device_map[f"{driver}://{device['path']}"] = get_output_value(device)
return device_map
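# Illustrative output shape (added for reference; the device name and path are made up):
#   get_device_mapping("vulkan") ->
#       {
#           "vulkan": ("AMD Radeon RX 7900 XTX", "vulkan://<path-0>"),
#           "vulkan://0": ("AMD Radeon RX 7900 XTX", "vulkan://<path-0>"),
#           "vulkan://<path-0>": ("AMD Radeon RX 7900 XTX", "vulkan://<path-0>"),
#       }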
def get_opt_flags(model, precision="fp16"):
iree_flags = []
if len(cmd_opts.iree_vulkan_target_triple) > 0:
iree_flags.append(
f"-iree-vulkan-target-triple={cmd_opts.iree_vulkan_target_triple}"
)
if "rocm" in cmd_opts.device:
from shark.iree_utils.gpu_utils import get_iree_rocm_args
rocm_args = get_iree_rocm_args()
iree_flags.extend(rocm_args)
if cmd_opts.iree_constant_folding == False:
iree_flags.append("--iree-opt-const-expr-hoisting=False")
iree_flags.append(
"--iree-codegen-linalg-max-constant-fold-elements=9223372036854775807"
)
if cmd_opts.data_tiling == False:
iree_flags.append("--iree-opt-data-tiling=False")
if "vae" not in model:
# Due to lack of support for multi-reduce, we always collapse reduction
# dims before dispatch formation right now.
iree_flags += ["--iree-flow-collapse-reduction-dims"]
return iree_flags
def map_device_to_name_path(device, key_combination=3):
"""Gives the appropriate device data (supported name/path) for user
selected execution device
Args:
        device (str): user-selected execution device (e.g. "vulkan://0").
key_combination (int, optional): choice for mapping value for
device name.
1 : path
2 : name
3 : (name, path)
Defaults to 3.
Raises:
        ValueError: if the given device string does not map to an available device.
Returns:
str / tuple: returns the mapping str or tuple of mapping str for
the device depending on key_combination value
"""
driver = device.split("://")[0]
device_map = get_device_mapping(driver, key_combination)
try:
device_mapping = device_map[device]
except KeyError:
raise ValueError(f"Device '{device}' is not a valid device.")
return device_mapping
def get_devices_by_name(driver_name):
from shark.iree_utils._common import iree_device_map
device_list = []
try:
driver_name = iree_device_map(driver_name)
device_list_dict = get_all_devices(driver_name)
print(f"{driver_name} devices are available.")
except:
print(f"{driver_name} devices are not available.")
else:
cpu_name = get_cpu_info()["brand_raw"]
for i, device in enumerate(device_list_dict):
device_name = (
cpu_name if device["name"] == "default" else device["name"]
)
if "local" in driver_name:
device_list.append(
f"{device_name} => {driver_name.replace('local', 'cpu')}"
)
else:
# for drivers with single devices
# let the default device be selected without any indexing
if len(device_list_dict) == 1:
device_list.append(f"{device_name} => {driver_name}")
else:
device_list.append(f"{device_name} => {driver_name}://{i}")
return device_list
set_iree_runtime_flags()
available_devices = []
from shark.iree_utils.vulkan_utils import (
get_all_vulkan_devices,
)
vulkaninfo_list = get_all_vulkan_devices()
vulkan_devices = []
id = 0
for device in vulkaninfo_list:
vulkan_devices.append(f"{device.strip()} => vulkan://{id}")
id += 1
if id != 0:
print(f"vulkan devices are available.")
available_devices.extend(vulkan_devices)
metal_devices = get_devices_by_name("metal")
available_devices.extend(metal_devices)
cuda_devices = get_devices_by_name("cuda")
available_devices.extend(cuda_devices)
rocm_devices = get_devices_by_name("rocm")
available_devices.extend(rocm_devices)
cpu_device = get_devices_by_name("cpu-sync")
available_devices.extend(cpu_device)
cpu_device = get_devices_by_name("cpu-task")
available_devices.extend(cpu_device)
return available_devices
# Generate and return a new seed if the provided one is not in the
# supported range (including -1)
def sanitize_seed(seed: int | str):
seed = int(seed)
uint32_info = np.iinfo(np.uint32)
uint32_min, uint32_max = uint32_info.min, uint32_info.max
if seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
return seed
# take a seed expression in an input format and convert it to
# a list of integers, where possible
def parse_seed_input(seed_input: str | list | int):
if isinstance(seed_input, str):
try:
seed_input = json.loads(seed_input)
except (ValueError, TypeError):
seed_input = None
if isinstance(seed_input, int):
return [seed_input]
if isinstance(seed_input, list) and all(type(seed) is int for seed in seed_input):
return seed_input
raise TypeError(
"Seed input must be an integer or an array of integers in JSON format"
)
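# Illustrative behaviour of the two seed helpers above (added for reference):
#   sanitize_seed(-1)             -> a fresh random uint32 seed
#   sanitize_seed("1234")         -> 1234
#   parse_seed_input("42")        -> [42]
#   parse_seed_input("[1, 2, 3]") -> [1, 2, 3]
#   parse_seed_input("foo")       -> raises TypeError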


@@ -0,0 +1,145 @@
import os
import json
import re
import requests
import torch
import safetensors
from shark_turbine.aot.params import (
ParameterArchiveBuilder,
)
from io import BytesIO
from pathlib import Path
from tqdm import tqdm
from omegaconf import OmegaConf
from diffusers import StableDiffusionPipeline
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
download_from_original_stable_diffusion_ckpt,
create_vae_diffusers_config,
convert_ldm_vae_checkpoint,
)
def get_path_to_diffusers_checkpoint(custom_weights, precision="fp16"):
path = Path(custom_weights)
diffusers_path = path.parent.absolute()
diffusers_directory_name = os.path.join("diffusers", path.stem + f"_{precision}")
complete_path_to_diffusers = diffusers_path / diffusers_directory_name
complete_path_to_diffusers.mkdir(parents=True, exist_ok=True)
path_to_diffusers = complete_path_to_diffusers.as_posix()
return path_to_diffusers
def preprocessCKPT(custom_weights, precision="fp16", is_inpaint=False):
path_to_diffusers = get_path_to_diffusers_checkpoint(custom_weights, precision)
if next(Path(path_to_diffusers).iterdir(), None):
print("Checkpoint already loaded at : ", path_to_diffusers)
return path_to_diffusers
else:
print(
"Diffusers' checkpoint will be identified here : ",
path_to_diffusers,
)
from_safetensors = (
True if custom_weights.lower().endswith(".safetensors") else False
)
# EMA weights usually yield higher quality images for inference but
# non-EMA weights have been yielding better results in our case.
# TODO: Add an option `--ema` (`--no-ema`) for users to specify if
# they want to go for EMA weight extraction or not.
extract_ema = False
print("Loading diffusers' pipeline from original stable diffusion checkpoint")
num_in_channels = 9 if is_inpaint else 4
pipe = download_from_original_stable_diffusion_ckpt(
checkpoint_path_or_dict=custom_weights,
extract_ema=extract_ema,
from_safetensors=from_safetensors,
num_in_channels=num_in_channels,
)
if precision == "fp16":
pipe.to(dtype=torch.float16)
pipe.save_pretrained(path_to_diffusers)
del pipe
print("Loading complete")
return path_to_diffusers
def save_irpa(weights_path, prepend_str):
weights = safetensors.torch.load_file(weights_path)
archive = ParameterArchiveBuilder()
for key in weights.keys():
new_key = prepend_str + key
archive.add_tensor(new_key, weights[key])
irpa_file = weights_path.replace(".safetensors", ".irpa")
archive.save(irpa_file)
return irpa_file
def convert_original_vae(vae_checkpoint):
vae_state_dict = {}
for key in list(vae_checkpoint.keys()):
vae_state_dict["first_stage_model." + key] = vae_checkpoint.get(key)
config_url = (
"https://raw.githubusercontent.com/CompVis/stable-diffusion/"
"main/configs/stable-diffusion/v1-inference.yaml"
)
original_config_file = BytesIO(requests.get(config_url).content)
original_config = OmegaConf.load(original_config_file)
vae_config = create_vae_diffusers_config(original_config, image_size=512)
converted_vae_checkpoint = convert_ldm_vae_checkpoint(vae_state_dict, vae_config)
return converted_vae_checkpoint
def process_custom_pipe_weights(custom_weights):
if custom_weights != "":
if custom_weights.startswith("https://civitai.com/api/"):
# download the checkpoint from civitai if we don't already have it
weights_path = get_civitai_checkpoint(custom_weights)
# act as if we were given the local file as custom_weights originally
custom_weights_tgt = get_path_to_diffusers_checkpoint(weights_path)
custom_weights_params = weights_path
else:
assert custom_weights.lower().endswith(
(".ckpt", ".safetensors")
), "checkpoint files supported can be any of [.ckpt, .safetensors] type"
custom_weights_tgt = get_path_to_diffusers_checkpoint(custom_weights)
custom_weights_params = custom_weights
return custom_weights_params, custom_weights_tgt
def get_civitai_checkpoint(url: str):
with requests.get(url, allow_redirects=True, stream=True) as response:
response.raise_for_status()
# civitai api returns the filename in the content disposition
base_filename = re.findall(
'"([^"]*)"', response.headers["Content-Disposition"]
)[0]
destination_path = Path.cwd() / (cmd_opts.model_dir or "models") / base_filename
# we don't have this model downloaded yet
if not destination_path.is_file():
print(f"downloading civitai model from {url} to {destination_path}")
size = int(response.headers["content-length"])
progress_bar = tqdm(total=size, unit="iB", unit_scale=True)
with open(destination_path, "wb") as f:
for chunk in response.iter_content(chunk_size=65536):
f.write(chunk)
progress_bar.update(len(chunk))
progress_bar.close()
# we already have this model downloaded
else:
print(f"civitai model already downloaded to {destination_path}")
response.close()
return destination_path.as_posix()
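
A minimal sketch of how these helpers compose, assuming a local checkpoint file (the path below is a hypothetical placeholder):

# Hypothetical local checkpoint; a civitai API URL would be downloaded first instead.
custom_weights = "models/my_checkpoint_fp16.safetensors"

# Returns the original weights file and the diffusers target directory for it.
weights_file, diffusers_dir = process_custom_pipe_weights(custom_weights)

# Converts the checkpoint once; later calls find the populated directory and return early.
path_to_diffusers = preprocessCKPT(custom_weights, precision="fp16")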


@@ -0,0 +1,185 @@
import os
import sys
import torch
import json
import safetensors
from dataclasses import dataclass
from safetensors.torch import load_file
from apps.shark_studio.web.utils.file_utils import (
get_checkpoint_pathfile,
get_path_stem,
)
@dataclass
class LoRAweight:
up: torch.Tensor
down: torch.Tensor
mid: torch.Tensor
alpha: float = 1.0
def processLoRA(model, use_lora, splitting_prefix, lora_strength=0.75):
state_dict = ""
if ".safetensors" in use_lora:
state_dict = load_file(use_lora)
else:
state_dict = torch.load(use_lora)
# gather the weights from the LoRA in a more convenient form, assumes
# everything will have an up.weight.
weight_dict: dict[str, LoRAweight] = {}
for key in state_dict:
if key.startswith(splitting_prefix) and key.endswith("up.weight"):
stem = key.split("up.weight")[0]
weight_key = stem.removesuffix(".lora_")
weight_key = weight_key.removesuffix("_lora_")
weight_key = weight_key.removesuffix(".lora_linear_layer.")
if weight_key not in weight_dict:
weight_dict[weight_key] = LoRAweight(
state_dict[f"{stem}up.weight"],
state_dict[f"{stem}down.weight"],
state_dict.get(f"{stem}mid.weight", None),
(
state_dict[f"{weight_key}.alpha"]
/ state_dict[f"{stem}up.weight"].shape[1]
if f"{weight_key}.alpha" in state_dict
else 1.0
),
)
# Directly update weight in model
# Mostly adaptations of https://github.com/kohya-ss/sd-scripts/blob/main/networks/merge_lora.py
# and similar code in https://github.com/huggingface/diffusers/issues/3064
# TODO: handle mid weights (how do they even work?)
for key, lora_weight in weight_dict.items():
curr_layer = model
layer_infos = key.split(".")[0].split(splitting_prefix)[-1].split("_")
# find the target layer
temp_name = layer_infos.pop(0)
while len(layer_infos) > -1:
try:
curr_layer = curr_layer.__getattr__(temp_name)
if len(layer_infos) > 0:
temp_name = layer_infos.pop(0)
elif len(layer_infos) == 0:
break
except Exception:
if len(temp_name) > 0:
temp_name += "_" + layer_infos.pop(0)
else:
temp_name = layer_infos.pop(0)
weight = curr_layer.weight.data
scale = lora_weight.alpha * lora_strength
if len(weight.size()) == 2:
if len(lora_weight.up.shape) == 4:
weight_up = lora_weight.up.squeeze(3).squeeze(2).to(torch.float32)
weight_down = lora_weight.down.squeeze(3).squeeze(2).to(torch.float32)
change = torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
else:
change = torch.mm(lora_weight.up, lora_weight.down)
elif lora_weight.down.size()[2:4] == (1, 1):
weight_up = lora_weight.up.squeeze(3).squeeze(2).to(torch.float32)
weight_down = lora_weight.down.squeeze(3).squeeze(2).to(torch.float32)
change = torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
else:
change = torch.nn.functional.conv2d(
lora_weight.down.permute(1, 0, 2, 3),
lora_weight.up,
).permute(1, 0, 2, 3)
curr_layer.weight.data += change * scale
return model
def update_lora_weight_for_unet(unet, use_lora, lora_strength):
extensions = [".bin", ".safetensors", ".pt"]
if not any([extension in use_lora for extension in extensions]):
# We assume it is a HF model ID with standalone LoRA weights.
unet.load_attn_procs(use_lora)
return unet
main_file_name = get_path_stem(use_lora)
if ".bin" in use_lora:
main_file_name += ".bin"
elif ".safetensors" in use_lora:
main_file_name += ".safetensors"
elif ".pt" in use_lora:
main_file_name += ".pt"
else:
sys.exit("Only .bin and .safetensors format for LoRA is supported")
try:
dir_name = os.path.dirname(use_lora)
unet.load_attn_procs(dir_name, weight_name=main_file_name)
return unet
except Exception:
return processLoRA(unet, use_lora, "lora_unet_", lora_strength)
def update_lora_weight(model, use_lora, model_name, lora_strength=1.0):
if "unet" in model_name:
return update_lora_weight_for_unet(model, use_lora, lora_strength)
try:
return processLoRA(model, use_lora, "lora_te_", lora_strength)
except Exception:
return None
def get_lora_metadata(lora_filename):
# get the metadata from the file
filename = get_checkpoint_pathfile(lora_filename, "lora")
with safetensors.safe_open(filename, framework="pt", device="cpu") as f:
metadata = f.metadata()
# guard clause for if there isn't any metadata
if not metadata:
return None
# metadata is a dictionary of strings, the values of the keys we're
# interested in are actually json, and need to be loaded as such
tag_frequencies = json.loads(metadata.get("ss_tag_frequency", "{}"))
dataset_dirs = json.loads(metadata.get("ss_dataset_dirs", "{}"))
tag_dirs = list(tag_frequencies.keys())
# gather the tag frequency information for all the datasets trained
all_frequencies = {}
for dataset in tag_dirs:
frequencies = sorted(
[entry for entry in tag_frequencies[dataset].items()],
reverse=True,
key=lambda x: x[1],
)
# get a figure for the total number of images processed for this dataset:
# either the number listed in its dataset_dirs entry, or the highest
# tag frequency if that entry doesn't exist
img_count = dataset_dirs.get(dataset, {}).get("img_count", frequencies[0][1])
# add the dataset frequencies to the overall frequencies replacing the
# frequency counts on the tags with a percentage/ratio
all_frequencies.update(
[(entry[0], entry[1] / img_count) for entry in frequencies]
)
trained_model_id = " ".join(
[
metadata.get("ss_sd_model_hash", ""),
metadata.get("ss_sd_model_name", ""),
metadata.get("ss_base_model_version", ""),
]
).strip()
# return the tag frequencies across all datasets, sorted from most to least frequent
return {
"model": trained_model_id,
"frequencies": sorted(
all_frequencies.items(), reverse=True, key=lambda x: x[1]
),
}
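
A rough usage sketch, assuming a diffusers UNet is already in memory; the LoRA file names are hypothetical placeholders:

from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base", subfolder="unet"
)
# Merge a standalone LoRA checkpoint into the UNet weights in place.
unet = update_lora_weight(unet, "loras/my_style.safetensors", "unet", lora_strength=0.75)

# Read training metadata (base model, tag frequencies) for a LoRA stored in the lora folder.
info = get_lora_metadata("my_style.safetensors")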


@@ -0,0 +1,202 @@
import os
import re
import json
import torch
import numpy as np
from csv import DictWriter
from PIL import Image, PngImagePlugin
from pathlib import Path
from datetime import datetime as dt
from base64 import decode
resamplers = {
"Lanczos": Image.Resampling.LANCZOS,
"Nearest Neighbor": Image.Resampling.NEAREST,
"Bilinear": Image.Resampling.BILINEAR,
"Bicubic": Image.Resampling.BICUBIC,
"Hamming": Image.Resampling.HAMMING,
"Box": Image.Resampling.BOX,
}
resampler_list = resamplers.keys()
# save output images and the inputs corresponding to them.
def save_output_img(output_img, img_seed, extra_info=None):
from apps.shark_studio.web.utils.file_utils import (
get_generated_imgs_path,
get_generated_imgs_todays_subdir,
)
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
if extra_info is None:
extra_info = {}
generated_imgs_path = Path(
get_generated_imgs_path(), get_generated_imgs_todays_subdir()
)
generated_imgs_path.mkdir(parents=True, exist_ok=True)
csv_path = Path(generated_imgs_path, "imgs_details.csv")
prompt_slice = re.sub("[^a-zA-Z0-9]", "_", extra_info["prompt"][0][:15])
out_img_name = f"{dt.now().strftime('%H%M%S')}_{prompt_slice}_{img_seed}"
img_model = extra_info["base_model_id"]
if extra_info["custom_weights"] not in [None, "None"]:
img_model = Path(os.path.basename(extra_info["custom_weights"])).stem
img_vae = None
if extra_info["custom_vae"]:
img_vae = Path(os.path.basename(extra_info["custom_vae"])).stem
img_loras = None
if extra_info["embeddings"]:
img_lora = []
for i in extra_info["embeddings"]:
img_lora.append(Path(os.path.basename(cmd_opts.use_lora)).stem)
img_loras = ", ".join(img_lora)
if cmd_opts.output_img_format == "jpg":
out_img_path = Path(generated_imgs_path, f"{out_img_name}.jpg")
output_img.save(out_img_path, quality=95, subsampling=0)
else:
out_img_path = Path(generated_imgs_path, f"{out_img_name}.png")
pngInfo = PngImagePlugin.PngInfo()
if cmd_opts.write_metadata_to_png:
# Using a conditional expression caused problems, so setting a new
# variable for now.
# if cmd_opts.use_hiresfix:
# png_size_text = (
# f"{cmd_opts.hiresfix_width}x{cmd_opts.hiresfix_height}"
# )
# else:
png_size_text = f"{extra_info['width']}x{extra_info['height']}"
pngInfo.add_text(
"parameters",
f"{extra_info['prompt'][0]}"
f"\nNegative prompt: {extra_info['negative_prompt'][0]}"
f"\nSteps: {extra_info['steps']},"
f"Sampler: {extra_info['scheduler']}, "
f"CFG scale: {extra_info['guidance_scale']}, "
f"Seed: {img_seed},"
f"Size: {png_size_text}, "
f"Model: {img_model}, "
f"VAE: {img_vae}, "
f"LoRA: {img_loras}",
)
output_img.save(out_img_path, "PNG", pnginfo=pngInfo)
if cmd_opts.output_img_format not in ["png", "jpg"]:
print(
f"[ERROR] Format {cmd_opts.output_img_format} is not "
f"supported yet. Image saved as png instead."
f"Supported formats: png / jpg"
)
# To be as low-impact as possible to the existing CSV format, we append
# "VAE" and "LORA" to the end. However, it does not fit the hierarchy of
# importance for each data point. Something to consider.
new_entry = {}
new_entry.update(extra_info)
csv_mode = "a" if os.path.isfile(csv_path) else "w"
with open(csv_path, csv_mode, encoding="utf-8") as csv_obj:
dictwriter_obj = DictWriter(csv_obj, fieldnames=list(new_entry.keys()))
if csv_mode == "w":
dictwriter_obj.writeheader()
dictwriter_obj.writerow(new_entry)
json_path = Path(generated_imgs_path, f"{out_img_name}.json")
with open(json_path, "w") as f:
json.dump(new_entry, f, indent=4)
# For stencil, the input image can be of any size, but we need to ensure that
# it conforms with our model constraints:
# both width and height should be in the range [128, 768] and multiples of 8.
# This utility function performs the transformation on the input image while
# also maintaining the aspect ratio before sending it to the stencil pipeline.
def resize_stencil(image: Image.Image, width, height, resampler_type=None):
aspect_ratio = width / height
min_size = min(width, height)
if min_size < 128:
n_size = 128
if width == min_size:
width = n_size
height = n_size / aspect_ratio
else:
height = n_size
width = n_size * aspect_ratio
width = int(width)
height = int(height)
n_width = width // 8
n_height = height // 8
n_width *= 8
n_height *= 8
min_size = min(width, height)
if min_size > 768:
n_size = 768
if width == min_size:
height = n_size
width = n_size * aspect_ratio
else:
width = n_size
height = n_size / aspect_ratio
width = int(width)
height = int(height)
n_width = width // 8
n_height = height // 8
n_width *= 8
n_height *= 8
if resampler_type in resamplers:
resampler = resamplers[resampler_type]
else:
resampler = resamplers["Nearest Neighbor"]
new_image = image.resize((n_width, n_height), resampler=resampler)
return new_image, n_width, n_height
def process_sd_init_image(self, sd_init_image, resample_type):
if isinstance(sd_init_image, list):
images = []
for img in sd_init_image:
img, _ = self.process_sd_init_image(img, resample_type)
images.append(img)
is_img2img = True
return images, is_img2img
if isinstance(sd_init_image, str):
if os.path.isfile(sd_init_image):
sd_init_image = Image.open(sd_init_image, mode="r").convert("RGB")
image, is_img2img = self.process_sd_init_image(sd_init_image, resample_type)
else:
image = None
is_img2img = False
elif isinstance(sd_init_image, Image.Image):
image = sd_init_image.convert("RGB")
elif sd_init_image:
image = sd_init_image["image"].convert("RGB")
else:
image = None
is_img2img = False
if image:
resample_type = (
resamplers[resample_type]
if resample_type in resampler_list
# Fallback to Lanczos
else Image.Resampling.LANCZOS
)
image = image.resize((self.width, self.height), resample=resample_type)
image_arr = np.stack([np.array(i) for i in (image,)], axis=0)
image_arr = image_arr / 255.0
image_arr = torch.from_numpy(image_arr).permute(0, 3, 1, 2).to(self.dtype)
image_arr = 2 * (image_arr - 0.5)
is_img2img = True
image = image_arr
return image, is_img2img
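
As a concrete example of the constraint above, a 1600x900 input has both sides larger than 768, so resize_stencil scales it down until the longer side is 768 and then rounds both sides to multiples of 8:

from PIL import Image

img = Image.new("RGB", (1600, 900))  # synthetic stand-in for a user-supplied stencil
resized, w, h = resize_stencil(img, 1600, 900, resampler_type="Lanczos")
print(w, h)  # 768 432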


@@ -0,0 +1,37 @@
import sys
class Logger:
def __init__(self, filename, filter=None):
self.terminal = sys.stdout
self.log = open(filename, "w")
self.filter = filter
def write(self, message):
for x in message.split("\n"):
if self.filter in x:
self.log.write(message)
else:
self.terminal.write(message)
def flush(self):
self.terminal.flush()
self.log.flush()
def isatty(self):
return False
def logger_test(x):
print("[LOG] This is a test")
print(f"This is another test, without the filter")
return x
def read_sd_logs():
sys.stdout.flush()
with open("shark_tmp/sd.log", "r") as f:
return f.read()
sys.stdout = Logger("shark_tmp/sd.log", filter="[LOG]")
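
After the module-level reassignment above, print output is routed through the Logger: lines containing the filter string land in shark_tmp/sd.log, while everything else still reaches the terminal. A small sketch (assumes shark_tmp/ already exists):

print("[LOG] this line is captured in shark_tmp/sd.log")
print("this line goes to the terminal as usual")
captured = read_sd_logs()  # read back everything written to the log so far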


@@ -0,0 +1,205 @@
from shark.iree_utils.compile_utils import (
get_iree_compiled_module,
load_vmfb_using_mmap,
clean_device_info,
get_iree_target_triple,
)
from apps.shark_studio.web.utils.file_utils import (
get_checkpoints_path,
get_resource_path,
)
from apps.shark_studio.modules.shared_cmd_opts import (
cmd_opts,
)
from iree import runtime as ireert
from pathlib import Path
import gc
import os
class SharkPipelineBase:
# This class is a lightweight base for managing an
# inference API class. It should provide methods for:
# - compiling a set (model map) of torch IR modules
# - preparing weights for an inference job
# - loading weights for an inference job
# - utilities like benchmarks, tests
def __init__(
self,
model_map: dict,
base_model_id: str,
static_kwargs: dict,
device: str,
import_mlir: bool = True,
):
self.model_map = model_map
self.pipe_map = {}
self.static_kwargs = static_kwargs
self.base_model_id = base_model_id
self.triple = get_iree_target_triple(device)
self.device, self.device_id = clean_device_info(device)
self.import_mlir = import_mlir
self.iree_module_dict = {}
self.tmp_dir = get_resource_path(cmd_opts.tmp_dir)
if not os.path.exists(self.tmp_dir):
os.mkdir(self.tmp_dir)
self.tempfiles = {}
self.pipe_vmfb_path = ""
def get_compiled_map(self, pipe_id, submodel="None", init_kwargs={}) -> None:
# First checks whether we have precompiled .vmfbs, then populates the map
# with those precompiled executables and fetches executables for the rest of the map.
# The weights aren't static here anymore, so this function should be part of pipeline
# initialization. As soon as you have a pipeline ID unique to your static torch IR parameters,
# and your model map is populated with IR-unique model IDs and their static params,
# call this method to get the artifacts associated with your map.
self.pipe_id = self.safe_name(pipe_id)
self.pipe_vmfb_path = Path(os.path.join(get_checkpoints_path(), self.pipe_id))
self.pipe_vmfb_path.mkdir(parents=False, exist_ok=True)
if submodel == "None":
print("\n[LOG] Gathering any pre-compiled artifacts....")
for key in self.model_map:
self.get_compiled_map(pipe_id, submodel=key)
else:
self.pipe_map[submodel] = {}
self.get_precompiled(self.pipe_id, submodel)
ireec_flags = []
if submodel in self.iree_module_dict:
return
elif "vmfb_path" in self.pipe_map[submodel]:
return
elif submodel not in self.tempfiles:
print(
f"\n[LOG] Tempfile for {submodel} not found. Fetching torch IR..."
)
if submodel in self.static_kwargs:
init_kwargs = self.static_kwargs[submodel]
for key in self.static_kwargs["pipe"]:
if key not in init_kwargs:
init_kwargs[key] = self.static_kwargs["pipe"][key]
self.import_torch_ir(submodel, init_kwargs)
self.get_compiled_map(pipe_id, submodel)
else:
ireec_flags = (
self.model_map[submodel]["ireec_flags"]
if "ireec_flags" in self.model_map[submodel]
else []
)
weights_path = self.get_io_params(submodel)
if weights_path:
ireec_flags.append("--iree-opt-const-eval=False")
self.iree_module_dict[submodel] = get_iree_compiled_module(
self.tempfiles[submodel],
device=self.device,
frontend="torch",
mmap=True,
external_weight_file=weights_path,
extra_args=ireec_flags,
write_to=os.path.join(self.pipe_vmfb_path, submodel + ".vmfb"),
)
return
def get_io_params(self, submodel):
if "external_weight_file" in self.static_kwargs[submodel]:
# we are using custom weights
weights_path = self.static_kwargs[submodel]["external_weight_file"]
elif "external_weight_path" in self.static_kwargs[submodel]:
# we are using the default weights for the HF model
weights_path = self.static_kwargs[submodel]["external_weight_path"]
else:
# assume the torch IR contains the weights.
weights_path = None
return weights_path
def get_precompiled(self, pipe_id, submodel="None"):
if submodel == "None":
for model in self.model_map:
self.get_precompiled(pipe_id, model)
vmfbs = []
for dirpath, dirnames, filenames in os.walk(self.pipe_vmfb_path):
vmfbs.extend(filenames)
break
for file in vmfbs:
if submodel in file:
self.pipe_map[submodel]["vmfb_path"] = os.path.join(
self.pipe_vmfb_path, file
)
return
def import_torch_ir(self, submodel, kwargs):
torch_ir = self.model_map[submodel]["initializer"](
**self.safe_dict(kwargs), compile_to="torch"
)
if submodel == "clip":
# clip.export_clip_model returns (torch_ir, tokenizer)
torch_ir = torch_ir[0]
self.tempfiles[submodel] = os.path.join(
self.tmp_dir, f"{submodel}.torch.tempfile"
)
with open(self.tempfiles[submodel], "w+") as f:
f.write(torch_ir)
del torch_ir
gc.collect()
return
def load_submodels(self, submodels: list):
for submodel in submodels:
if submodel in self.iree_module_dict:
print(f"\n[LOG] {submodel} is ready for inference.")
continue
if "vmfb_path" in self.pipe_map[submodel]:
weights_path = self.get_io_params(submodel)
# print(
# f"\n[LOG] Loading .vmfb for {submodel} from {self.pipe_map[submodel]['vmfb_path']}"
# )
self.iree_module_dict[submodel] = {}
(
self.iree_module_dict[submodel]["vmfb"],
self.iree_module_dict[submodel]["config"],
self.iree_module_dict[submodel]["temp_file_to_unlink"],
) = load_vmfb_using_mmap(
self.pipe_map[submodel]["vmfb_path"],
self.device,
device_idx=0,
rt_flags=[],
external_weight_file=weights_path,
)
else:
self.get_compiled_map(self.pipe_id, submodel)
return
def unload_submodels(self, submodels: list):
for submodel in submodels:
if submodel in self.iree_module_dict:
del self.iree_module_dict[submodel]
gc.collect()
return
def run(self, submodel, inputs):
if not isinstance(inputs, list):
inputs = [inputs]
inp = [
ireert.asdevicearray(
self.iree_module_dict[submodel]["config"].device, input
)
for input in inputs
]
return self.iree_module_dict[submodel]["vmfb"]["main"](*inp)
def safe_name(self, name):
return name.replace("/", "_").replace("-", "_").replace("\\", "_")
def safe_dict(self, kwargs: dict):
flat_args = {}
for i in kwargs:
if isinstance(kwargs[i], dict) and "pass_dict" not in kwargs[i]:
flat_args[i] = [kwargs[i][j] for j in kwargs[i]]
else:
flat_args[i] = kwargs[i]
return flat_args
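
A rough sketch of how a concrete pipeline might drive this base class; the model map, initializer callables, device string, and input tensors below are hypothetical placeholders:

# Each model map entry supplies an initializer that returns torch IR when called
# with compile_to="torch", plus optional iree-compile flags.
model_map = {
    "clip": {"initializer": export_clip_ir},
    "unet": {"initializer": export_unet_ir, "ireec_flags": []},
}
static_kwargs = {"pipe": {"precision": "fp16"}, "clip": {}, "unet": {}}

pipe = SharkPipelineBase(
    model_map, "stabilityai/stable-diffusion-2-1-base", static_kwargs, device="vulkan://0"
)
pipe.get_compiled_map("sd21_fp16_vulkan")  # compile or fetch cached .vmfbs for every submodel
pipe.load_submodels(["clip", "unet"])      # mmap the precompiled modules for inference
noise_pred = pipe.run("unet", [sample, timestep, embeddings, guidance])  # placeholder inputs
pipe.unload_submodels(["unet"])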


@@ -0,0 +1,376 @@
from typing import List, Optional, Union
from iree import runtime as ireert
import re
import torch
import numpy as np
re_attention = re.compile(
r"""
\\\(|
\\\)|
\\\[|
\\]|
\\\\|
\\|
\(|
\[|
:([+-]?[.\d]+)\)|
\)|
]|
[^\\()\[\]:]+|
:
""",
re.X,
)
def parse_prompt_attention(text):
"""
Parses a string with attention tokens and returns a list of pairs:
text and its associated weight.
Accepted tokens are:
(abc) - increases attention to abc by a multiplier of 1.1
(abc:3.12) - increases attention to abc by a multiplier of 3.12
[abc] - decreases attention to abc by a multiplier of 1.1
\( - literal character '('
\[ - literal character '['
\) - literal character ')'
\] - literal character ']'
\\ - literal character '\'
anything else - just text
>>> parse_prompt_attention('normal text')
[['normal text', 1.0]]
>>> parse_prompt_attention('an (important) word')
[['an ', 1.0], ['important', 1.1], [' word', 1.0]]
>>> parse_prompt_attention('(unbalanced')
[['unbalanced', 1.1]]
>>> parse_prompt_attention('\(literal\]')
[['(literal]', 1.0]]
>>> parse_prompt_attention('(unnecessary)(parens)')
[['unnecessaryparens', 1.1]]
>>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
[['a ', 1.0],
['house', 1.5730000000000004],
[' ', 1.1],
['on', 1.0],
[' a ', 1.1],
['hill', 0.55],
[', sun, ', 1.1],
['sky', 1.4641000000000006],
['.', 1.1]]
"""
res = []
round_brackets = []
square_brackets = []
round_bracket_multiplier = 1.1
square_bracket_multiplier = 1 / 1.1
def multiply_range(start_position, multiplier):
for p in range(start_position, len(res)):
res[p][1] *= multiplier
for m in re_attention.finditer(text):
text = m.group(0)
weight = m.group(1)
if text.startswith("\\"):
res.append([text[1:], 1.0])
elif text == "(":
round_brackets.append(len(res))
elif text == "[":
square_brackets.append(len(res))
elif weight is not None and len(round_brackets) > 0:
multiply_range(round_brackets.pop(), float(weight))
elif text == ")" and len(round_brackets) > 0:
multiply_range(round_brackets.pop(), round_bracket_multiplier)
elif text == "]" and len(square_brackets) > 0:
multiply_range(square_brackets.pop(), square_bracket_multiplier)
else:
res.append([text, 1.0])
for pos in round_brackets:
multiply_range(pos, round_bracket_multiplier)
for pos in square_brackets:
multiply_range(pos, square_bracket_multiplier)
if len(res) == 0:
res = [["", 1.0]]
# merge runs of identical weights
i = 0
while i + 1 < len(res):
if res[i][1] == res[i + 1][1]:
res[i][0] += res[i + 1][0]
res.pop(i + 1)
else:
i += 1
return res
def get_prompts_with_weights(pipe, prompt: List[str], max_length: int):
r"""
Tokenize a list of prompts and return its tokens with weights of each token.
No padding, starting or ending token is included.
"""
tokens = []
weights = []
truncated = False
for text in prompt:
texts_and_weights = parse_prompt_attention(text)
text_token = []
text_weight = []
for word, weight in texts_and_weights:
# tokenize and discard the starting and the ending token
token = pipe.tokenizer(word).input_ids[1:-1]
text_token += token
# copy the weight by length of token
text_weight += [weight] * len(token)
# stop if the text is too long (longer than truncation limit)
if len(text_token) > max_length:
truncated = True
break
# truncate
if len(text_token) > max_length:
truncated = True
text_token = text_token[:max_length]
text_weight = text_weight[:max_length]
tokens.append(text_token)
weights.append(text_weight)
if truncated:
print(
"Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples"
)
return tokens, weights
def pad_tokens_and_weights(
tokens,
weights,
max_length,
bos,
eos,
no_boseos_middle=True,
chunk_length=77,
):
r"""
Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
"""
max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
weights_length = (
max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
)
for i in range(len(tokens)):
tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
if no_boseos_middle:
weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
else:
w = []
if len(weights[i]) == 0:
w = [1.0] * weights_length
else:
for j in range(max_embeddings_multiples):
w.append(1.0) # weight for starting token in this chunk
w += weights[i][
j
* (chunk_length - 2) : min(
len(weights[i]), (j + 1) * (chunk_length - 2)
)
]
w.append(1.0) # weight for ending token in this chunk
w += [1.0] * (weights_length - len(w))
weights[i] = w[:]
return tokens, weights
def get_unweighted_text_embeddings(
pipe,
text_input,
chunk_length: int,
no_boseos_middle: Optional[bool] = True,
):
"""
When the length of tokens is a multiple of the capacity of the text encoder,
it should be split into chunks and sent to the text encoder individually.
"""
max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
if max_embeddings_multiples > 1:
text_embeddings = []
for i in range(max_embeddings_multiples):
# extract the i-th chunk
text_input_chunk = text_input[
:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2
].clone()
# cover the head and the tail by the starting and the ending tokens
text_input_chunk[:, 0] = text_input[0, 0]
text_input_chunk[:, -1] = text_input[0, -1]
text_embedding = pipe.run("clip", text_input_chunk)[0].to_host()
if no_boseos_middle:
if i == 0:
# discard the ending token
text_embedding = text_embedding[:, :-1]
elif i == max_embeddings_multiples - 1:
# discard the starting token
text_embedding = text_embedding[:, 1:]
else:
# discard both starting and ending tokens
text_embedding = text_embedding[:, 1:-1]
text_embeddings.append(text_embedding)
# SHARK: Convert the result to tensor
# text_embeddings = torch.concat(text_embeddings, axis=1)
text_embeddings_np = np.concatenate(np.array(text_embeddings))
text_embeddings = torch.from_numpy(text_embeddings_np)
else:
text_embeddings = pipe.run("clip", text_input)[0]
text_embeddings = torch.from_numpy(text_embeddings.to_host())
return text_embeddings
# This function deals with NoneType values occurring in tokens after padding.
# It switches out None with 49407, as truncating None values causes matrix dimension errors.
def filter_nonetype_tokens(tokens: List[List]):
return [[49407 if token is None else token for token in tokens[0]]]
def get_weighted_text_embeddings(
pipe,
prompt: List[str],
uncond_prompt: List[str] = None,
max_embeddings_multiples: Optional[int] = 8,
no_boseos_middle: Optional[bool] = True,
skip_parsing: Optional[bool] = False,
skip_weighting: Optional[bool] = False,
):
max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2
if not skip_parsing:
prompt_tokens, prompt_weights = get_prompts_with_weights(
pipe, prompt, max_length - 2
)
if uncond_prompt is not None:
uncond_tokens, uncond_weights = get_prompts_with_weights(
pipe, uncond_prompt, max_length - 2
)
else:
prompt_tokens = [
token[1:-1]
for token in pipe.tokenizer(
prompt, max_length=max_length, truncation=True
).input_ids
]
prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
if uncond_prompt is not None:
if isinstance(uncond_prompt, str):
uncond_prompt = [uncond_prompt]
uncond_tokens = [
token[1:-1]
for token in pipe.tokenizer(
uncond_prompt, max_length=max_length, truncation=True
).input_ids
]
uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
# round up the longest length of tokens to a multiple of (model_max_length - 2)
max_length = max([len(token) for token in prompt_tokens])
if uncond_prompt is not None:
max_length = max(max_length, max([len(token) for token in uncond_tokens]))
max_embeddings_multiples = min(
max_embeddings_multiples,
(max_length - 1) // (pipe.model_max_length - 2) + 1,
)
max_embeddings_multiples = max(1, max_embeddings_multiples)
max_length = (pipe.model_max_length - 2) * max_embeddings_multiples + 2
# pad the length of tokens and weights
bos = pipe.tokenizer.bos_token_id
eos = pipe.tokenizer.eos_token_id
prompt_tokens, prompt_weights = pad_tokens_and_weights(
prompt_tokens,
prompt_weights,
max_length,
bos,
eos,
no_boseos_middle=no_boseos_middle,
chunk_length=pipe.model_max_length,
)
# FIXME: This is a hacky fix caused by tokenizer padding with None values
prompt_tokens = filter_nonetype_tokens(prompt_tokens)
# prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=pipe.device)
prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device="cpu")
if uncond_prompt is not None:
uncond_tokens, uncond_weights = pad_tokens_and_weights(
uncond_tokens,
uncond_weights,
max_length,
bos,
eos,
no_boseos_middle=no_boseos_middle,
chunk_length=pipe.model_max_length,
)
# FIXME: This is a hacky fix caused by tokenizer padding with None values
uncond_tokens = filter_nonetype_tokens(uncond_tokens)
# uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=pipe.device)
uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device="cpu")
# get the embeddings
text_embeddings = get_unweighted_text_embeddings(
pipe,
prompt_tokens,
pipe.model_max_length,
no_boseos_middle=no_boseos_middle,
)
# prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=pipe.device)
prompt_weights = torch.tensor(prompt_weights, dtype=torch.float, device="cpu")
if uncond_prompt is not None:
uncond_embeddings = get_unweighted_text_embeddings(
pipe,
uncond_tokens,
pipe.model_max_length,
no_boseos_middle=no_boseos_middle,
)
# uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=pipe.device)
uncond_weights = torch.tensor(uncond_weights, dtype=torch.float, device="cpu")
# assign weights to the prompts and normalize in the sense of mean
# TODO: should we normalize by chunk or in a whole (current implementation)?
if (not skip_parsing) and (not skip_weighting):
previous_mean = (
text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
)
text_embeddings *= prompt_weights.unsqueeze(-1)
current_mean = (
text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
)
text_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
if uncond_prompt is not None:
previous_mean = (
uncond_embeddings.float()
.mean(axis=[-2, -1])
.to(uncond_embeddings.dtype)
)
uncond_embeddings *= uncond_weights.unsqueeze(-1)
current_mean = (
uncond_embeddings.float()
.mean(axis=[-2, -1])
.to(uncond_embeddings.dtype)
)
uncond_embeddings *= (
(previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
)
if uncond_prompt is not None:
return text_embeddings, uncond_embeddings
return text_embeddings, None
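
A usage sketch, assuming pipe is a wrapper that exposes a tokenizer, a model_max_length attribute, and the run("clip", ...) method used above:

prompt = ["a (detailed:1.3) portrait of an astronaut, [blurry]"]
negative = ["low quality, watermark"]

cond, uncond = get_weighted_text_embeddings(
    pipe,  # hypothetical pipeline wrapper
    prompt,
    uncond_prompt=negative,
    max_embeddings_multiples=8,
)
# cond and uncond are torch tensors ready for classifier-free guidance.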


@@ -0,0 +1,118 @@
# from shark_turbine.turbine_models.schedulers import export_scheduler_model
from diffusers import (
LCMScheduler,
LMSDiscreteScheduler,
PNDMScheduler,
DDPMScheduler,
DDIMScheduler,
DPMSolverMultistepScheduler,
KDPM2DiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DEISMultistepScheduler,
DPMSolverSinglestepScheduler,
KDPM2AncestralDiscreteScheduler,
HeunDiscreteScheduler,
)
def get_schedulers(model_id):
# TODO: switch over to turbine and run all on GPU
print(f"\n[LOG] Initializing schedulers from model id: {model_id}")
schedulers = dict()
schedulers["PNDM"] = PNDMScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
# schedulers["DDPM"] = DDPMScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["KDPM2Discrete"] = KDPM2DiscreteScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["LMSDiscrete"] = LMSDiscreteScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["DDIM"] = DDIMScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["LCMScheduler"] = LCMScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["DPMSolverMultistep"] = DPMSolverMultistepScheduler.from_pretrained(
# model_id, subfolder="scheduler", algorithm_type="dpmsolver"
# )
# schedulers["DPMSolverMultistep++"] = DPMSolverMultistepScheduler.from_pretrained(
# model_id, subfolder="scheduler", algorithm_type="dpmsolver++"
# )
# schedulers["DPMSolverMultistepKarras"] = (
# DPMSolverMultistepScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# use_karras_sigmas=True,
# )
# )
# schedulers["DPMSolverMultistepKarras++"] = (
# DPMSolverMultistepScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# algorithm_type="dpmsolver++",
# use_karras_sigmas=True,
# )
# )
schedulers["EulerDiscrete"] = EulerDiscreteScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
schedulers["EulerAncestralDiscrete"] = (
EulerAncestralDiscreteScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
)
# schedulers["DEISMultistep"] = DEISMultistepScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["DPMSolverSinglestep"] = DPMSolverSinglestepScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# schedulers["KDPM2AncestralDiscrete"] = (
# KDPM2AncestralDiscreteScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
# )
# schedulers["HeunDiscrete"] = HeunDiscreteScheduler.from_pretrained(
# model_id,
# subfolder="scheduler",
# )
return schedulers
def export_scheduler_model(model):
return "None", "None"
scheduler_model_map = {
"PNDM": export_scheduler_model("PNDMScheduler"),
# "DPMSolverSDE": export_scheduler_model("DpmSolverSDEScheduler"),
"EulerDiscrete": export_scheduler_model("EulerDiscreteScheduler"),
"EulerAncestralDiscrete": export_scheduler_model("EulerAncestralDiscreteScheduler"),
# "LCM": export_scheduler_model("LCMScheduler"),
# "LMSDiscrete": export_scheduler_model("LMSDiscreteScheduler"),
# "DDPM": export_scheduler_model("DDPMScheduler"),
# "DDIM": export_scheduler_model("DDIMScheduler"),
# "DPMSolverMultistep": export_scheduler_model("DPMSolverMultistepScheduler"),
# "KDPM2Discrete": export_scheduler_model("KDPM2DiscreteScheduler"),
# "DEISMultistep": export_scheduler_model("DEISMultistepScheduler"),
# "DPMSolverSinglestep": export_scheduler_model("DPMSolverSingleStepScheduler"),
# "KDPM2AncestralDiscrete": export_scheduler_model("KDPM2AncestralDiscreteScheduler"),
# "HeunDiscrete": export_scheduler_model("HeunDiscreteScheduler"),
}
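
A short sketch of the intended call pattern (fetches the scheduler config from Hugging Face on first use):

schedulers = get_schedulers("stabilityai/stable-diffusion-2-1-base")
scheduler = schedulers["EulerDiscrete"]
scheduler.set_timesteps(30)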


@@ -0,0 +1,66 @@
import numpy as np
import json
from random import (
randint,
seed as seed_random,
getstate as random_getstate,
setstate as random_setstate,
)
# Generate and return a new seed if the provided one is not in the
# supported range (including -1)
def sanitize_seed(seed: int | str):
seed = int(seed)
uint32_info = np.iinfo(np.uint32)
uint32_min, uint32_max = uint32_info.min, uint32_info.max
if seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
return seed
# take a seed expression in an input format and convert it to
# a list of integers, where possible
def parse_seed_input(seed_input: str | list | int):
if isinstance(seed_input, str):
try:
seed_input = json.loads(seed_input)
except (ValueError, TypeError):
seed_input = None
if isinstance(seed_input, int):
return [seed_input]
if isinstance(seed_input, list) and all(type(seed) is int for seed in seed_input):
return seed_input
raise TypeError(
"Seed input must be an integer or an array of integers in JSON format"
)
# Generate a set of seeds from an input expression for batch_count batches,
# optionally using that input as the rng seed for any randomly generated seeds.
def batch_seeds(seed_input: str | list | int, batch_count: int, repeatable=False):
# turn the input into a list if possible
seeds = parse_seed_input(seed_input)
# slice or pad the list to be of batch_count length
seeds = seeds[:batch_count] + [-1] * (batch_count - len(seeds))
if repeatable:
if all(seed < 0 for seed in seeds):
seeds[0] = sanitize_seed(seeds[0])
# set seed for the rng based on what we have so far
saved_random_state = random_getstate()
seed_random(str([n for n in seeds if n > -1]))
# generate any seeds that are unspecified
seeds = [sanitize_seed(seed) for seed in seeds]
if repeatable:
# reset the rng back to normal
random_setstate(saved_random_state)
return seeds
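
For example, batch_seeds pads or trims the parsed seeds to batch_count and replaces every -1 with a fresh uint32 seed; with repeatable=True the same input always produces the same batch:

print(parse_seed_input("42"))          # [42]
print(parse_seed_input("[1, 2, -1]"))  # [1, 2, -1]

seeds_a = batch_seeds("[7, -1, -1]", batch_count=3, repeatable=True)
seeds_b = batch_seeds("[7, -1, -1]", batch_count=3, repeatable=True)
assert seeds_a == seeds_b  # repeatable runs regenerate the same random seeds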


@@ -2,7 +2,7 @@ import argparse
import os
from pathlib import Path
from apps.stable_diffusion.src.utils.resamplers import resampler_list
from apps.shark_studio.modules.img_processing import resampler_list
def path_expand(s):
@@ -32,11 +32,11 @@ p.add_argument(
)
p.add_argument(
"-p",
"--prompts",
"--prompt",
nargs="+",
default=[
"a photo taken of the front of a super-car drifting on a road near "
"mountains at high speeds with smokes coming off the tires, front "
"mountains at high speeds with smoke coming off the tires, front "
"angle, front point of view, trees in the mountains of the "
"background, ((sharp focus))"
],
@@ -44,7 +44,7 @@ p.add_argument(
)
p.add_argument(
"--negative_prompts",
"--negative_prompt",
nargs="+",
default=[
"watermark, signature, logo, text, lowres, ((monochrome, grayscale)), "
@@ -54,7 +54,7 @@ p.add_argument(
)
p.add_argument(
"--img_path",
"--sd_init_image",
type=str,
help="Path to the image input for img2img/inpainting.",
)
@@ -85,7 +85,7 @@ p.add_argument(
"--height",
type=int,
default=512,
choices=range(128, 769, 8),
choices=range(128, 1025, 8),
help="The height of the output image.",
)
@@ -93,7 +93,7 @@ p.add_argument(
"--width",
type=int,
default=512,
choices=range(128, 769, 8),
choices=range(128, 1025, 8),
help="The width of the output image.",
)
@@ -130,8 +130,7 @@ p.add_argument(
"--strength",
type=float,
default=0.8,
help="The strength of change applied on the given input image for "
"img2img.",
help="The strength of change applied on the given input image for " "img2img.",
)
p.add_argument(
@@ -290,9 +289,7 @@ p.add_argument(
# Model Config and Usage Params
##############################################################################
p.add_argument(
"--device", type=str, default="vulkan", help="Device to run the model."
)
p.add_argument("--device", type=str, default="vulkan", help="Device to run the model.")
p.add_argument(
"--precision", type=str, default="fp16", help="Precision to run the model."
@@ -306,21 +303,6 @@ p.add_argument(
"downloads the model from shark_tank.",
)
p.add_argument(
"--load_vmfb",
default=True,
action=argparse.BooleanOptionalAction,
help="Attempts to load the model from a precompiled flat-buffer "
"and compiles + saves it if not found.",
)
p.add_argument(
"--save_vmfb",
default=False,
action=argparse.BooleanOptionalAction,
help="Saves the compiled flat-buffer to the local directory.",
)
p.add_argument(
"--use_tuned",
default=False,
@@ -338,7 +320,7 @@ p.add_argument(
p.add_argument(
"--scheduler",
type=str,
default="SharkEulerDiscrete",
default="DDIM",
help="Other supported schedulers are [DDIM, PNDM, LMSDiscrete, "
"DPMSolverMultistep, DPMSolverMultistep++, DPMSolverMultistepKarras, "
"DPMSolverMultistepKarras++, EulerDiscrete, EulerAncestralDiscrete, "
@@ -357,7 +339,7 @@ p.add_argument(
p.add_argument(
"--output_dir",
type=str,
default=None,
default=os.path.join(os.getcwd(), "generated_imgs"),
help="Directory path to save the output images and json.",
)
@@ -365,8 +347,7 @@ p.add_argument(
"--batch_count",
type=int,
default=1,
help="Number of batches to be generated with random seeds in "
"single execution.",
help="Number of batches to be generated with random seeds in " "single execution.",
)
p.add_argument(
@@ -378,10 +359,10 @@ p.add_argument(
)
p.add_argument(
"--ckpt_loc",
"--custom_weights",
type=str,
default="",
help="Path to SD's .ckpt file.",
help="Path to a .safetensors or .ckpt file for SD pipeline weights.",
)
p.add_argument(
@@ -393,7 +374,7 @@ p.add_argument(
)
p.add_argument(
"--hf_model_id",
"--base_model_id",
type=str,
default="stabilityai/stable-diffusion-2-1-base",
help="The repo-id of hugging face.",
@@ -431,8 +412,7 @@ p.add_argument(
"--use_lora",
type=str,
default="",
help="Use standalone LoRA weight using a HF ID or a checkpoint "
"file (~3 MB).",
help="Use standalone LoRA weight using a HF ID or a checkpoint " "file (~3 MB).",
)
p.add_argument(
@@ -446,7 +426,7 @@ p.add_argument(
)
p.add_argument(
"--ondemand",
"--lowvram",
default=False,
action=argparse.BooleanOptionalAction,
help="Load and unload models for low VRAM.",
@@ -459,6 +439,13 @@ p.add_argument(
help="Specify your own huggingface authentication tokens for models like Llama2.",
)
p.add_argument(
"--external_weights",
type=str,
default=None,
help="What type of externalized weights to use. Currently options are 'safetensors' and defaults to inlined weights.",
)
p.add_argument(
"--device_allocator_heap_key",
type=str,
@@ -467,6 +454,7 @@ p.add_argument(
"Expected form: max_allocation_size;max_allocation_capacity;max_free_allocation_count"
"Example: --device_allocator_heap_key='*;1gib' (will limit caching on device to 1 gigabyte)",
)
##############################################################################
# IREE - Vulkan supported flags
##############################################################################
@@ -507,8 +495,7 @@ p.add_argument(
"--dump_isa",
default=False,
action="store_true",
help="When enabled call amdllpc to get ISA dumps. "
"Use with dispatch benchmarks.",
help="When enabled call amdllpc to get ISA dumps. " "Use with dispatch benchmarks.",
)
p.add_argument(
@@ -529,8 +516,7 @@ p.add_argument(
"--enable_rgp",
default=False,
action=argparse.BooleanOptionalAction,
help="Flag for inserting debug frames between iterations "
"for use with rgp.",
help="Flag for inserting debug frames between iterations " "for use with rgp.",
)
p.add_argument(
@@ -601,25 +587,53 @@ p.add_argument(
help="Controls data tiling in iree-compile for all SD models.",
)
p.add_argument(
"--quantization",
type=str,
default="None",
help="Quantization to be used for api-exposed model.",
)
##############################################################################
# Web UI flags
##############################################################################
p.add_argument(
"--progress_bar",
"--webui",
default=True,
action=argparse.BooleanOptionalAction,
help="Flag for removing the progress bar animation during "
"image generation.",
help="controls whether the webui is launched.",
)
p.add_argument(
"--ckpt_dir",
"--progress_bar",
default=True,
action=argparse.BooleanOptionalAction,
help="Flag for removing the progress bar animation during " "image generation.",
)
p.add_argument(
"--tmp_dir",
type=str,
default="",
default=os.path.join(os.getcwd(), "shark_tmp"),
help="Path to tmp directory",
)
p.add_argument(
"--config_dir",
type=str,
default=os.path.join(os.getcwd(), "configs"),
help="Path to config directory",
)
p.add_argument(
"--model_dir",
type=str,
default=os.path.join(os.getcwd(), "models"),
help="Path to directory where all .ckpts are stored in order to populate "
"them in the web UI.",
)
# TODO: replace API flag when these can be run together
p.add_argument(
"--ui",
@@ -676,6 +690,13 @@ p.add_argument(
"images under --output_dir in the UI.",
)
p.add_argument(
"--configs_path",
default=None,
type=str,
help="Path to .json config directory.",
)
p.add_argument(
"--output_gallery_followlinks",
default=False,
@@ -684,6 +705,12 @@ p.add_argument(
"follow symlinks when listing subdirectories under --output_dir.",
)
p.add_argument(
"--api_log",
default=False,
action=argparse.BooleanOptionalAction,
help="Enables Compatibility API logging.",
)
##############################################################################
# SD model auto-annotation flags
@@ -757,8 +784,8 @@ p.add_argument(
"or `iree-run-module --dump_devices=rocm` or `hipinfo` to get desired arch name",
)
args, unknown = p.parse_known_args()
if args.import_debug:
cmd_opts, unknown = p.parse_known_args()
if cmd_opts.import_debug:
os.environ["IREE_SAVE_TEMPS"] = os.path.join(
os.getcwd(), args.hf_model_id.replace("/", "_")
os.getcwd(), cmd_opts.hf_model_id.replace("/", "_")
)


@@ -0,0 +1,106 @@
import time
import argparse
class TimerSubcategory:
def __init__(self, timer, category):
self.timer = timer
self.category = category
self.start = None
self.original_base_category = timer.base_category
def __enter__(self):
self.start = time.time()
self.timer.base_category = self.original_base_category + self.category + "/"
self.timer.subcategory_level += 1
if self.timer.print_log:
print(f"{' ' * self.timer.subcategory_level}{self.category}:")
def __exit__(self, exc_type, exc_val, exc_tb):
elapsed_for_subcategory = time.time() - self.start
self.timer.base_category = self.original_base_category
self.timer.add_time_to_record(
self.original_base_category + self.category,
elapsed_for_subcategory,
)
self.timer.subcategory_level -= 1
self.timer.record(self.category, disable_log=True)
class Timer:
def __init__(self, print_log=False):
self.start = time.time()
self.records = {}
self.total = 0
self.base_category = ""
self.print_log = print_log
self.subcategory_level = 0
def elapsed(self):
end = time.time()
res = end - self.start
self.start = end
return res
def add_time_to_record(self, category, amount):
if category not in self.records:
self.records[category] = 0
self.records[category] += amount
def record(self, category, extra_time=0, disable_log=False):
e = self.elapsed()
self.add_time_to_record(self.base_category + category, e + extra_time)
self.total += e + extra_time
if self.print_log and not disable_log:
print(
f"{' ' * self.subcategory_level}{category}: done in {e + extra_time:.3f}s"
)
def subcategory(self, name):
self.elapsed()
subcat = TimerSubcategory(self, name)
return subcat
def summary(self):
res = f"{self.total:.1f}s"
additions = [
(category, time_taken)
for category, time_taken in self.records.items()
if time_taken >= 0.1 and "/" not in category
]
if not additions:
return res
res += " ("
res += ", ".join(
[f"{category}: {time_taken:.1f}s" for category, time_taken in additions]
)
res += ")"
return res
def dump(self):
return {"total": self.total, "records": self.records}
def reset(self):
self.__init__()
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument(
"--log-startup",
action="store_true",
help="print a detailed log of what's happening at startup",
)
args = parser.parse_known_args()[0]
startup_timer = Timer(print_log=args.log_startup)
startup_record = None
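
A quick usage sketch of the timer and its subcategory context manager:

import time

timer = Timer(print_log=True)
with timer.subcategory("load models"):
    time.sleep(0.2)  # stand-in for real work
    timer.record("unet")
timer.record("generate")
print(timer.summary())  # e.g. "0.2s (load models: 0.2s)"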


@@ -1,5 +1,5 @@
# -*- mode: python ; coding: utf-8 -*-
from apps.stable_diffusion.shark_studio_imports import pathex, datas, hiddenimports
from apps.shark_studio.studio_imports import pathex, datas, hiddenimports
binaries = []
@@ -19,6 +19,9 @@ a = Analysis(
win_private_assemblies=False,
cipher=block_cipher,
noarchive=False,
module_collection_mode={
'gradio': 'py', # Collect gradio package as source .py files
},
)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)


@@ -9,8 +9,6 @@ sys.setrecursionlimit(sys.getrecursionlimit() * 5)
# python path for pyinstaller
pathex = [
".",
"./apps/language_models/langchain",
"./apps/language_models/src/pipelines/minigpt4_utils",
]
# datafiles for pyinstaller
@@ -24,65 +22,47 @@ datas += copy_metadata("packaging")
datas += copy_metadata("filelock")
datas += copy_metadata("numpy")
datas += copy_metadata("importlib_metadata")
datas += copy_metadata("torch-mlir")
datas += copy_metadata("omegaconf")
datas += copy_metadata("safetensors")
datas += copy_metadata("Pillow")
datas += copy_metadata("sentencepiece")
datas += copy_metadata("pyyaml")
datas += copy_metadata("huggingface-hub")
datas += copy_metadata("gradio")
datas += copy_metadata("scipy")
datas += collect_data_files("torch")
datas += collect_data_files("tokenizers")
datas += collect_data_files("tiktoken")
datas += collect_data_files("accelerate")
datas += collect_data_files("diffusers")
datas += collect_data_files("transformers")
datas += collect_data_files("pytorch_lightning")
datas += collect_data_files("skimage")
datas += collect_data_files("gradio")
datas += collect_data_files("gradio_client")
datas += collect_data_files("iree")
datas += collect_data_files("iree", include_py_files=True)
datas += collect_data_files("shark", include_py_files=True)
datas += collect_data_files("timm", include_py_files=True)
datas += collect_data_files("tqdm")
datas += collect_data_files("tkinter")
datas += collect_data_files("webview")
datas += collect_data_files("sentencepiece")
datas += collect_data_files("jsonschema")
datas += collect_data_files("jsonschema_specifications")
datas += collect_data_files("cpuinfo")
datas += collect_data_files("langchain")
datas += collect_data_files("cv2")
datas += collect_data_files("einops")
datas += collect_data_files("scipy", include_py_files=True)
datas += [
("src/utils/resources/prompts.json", "resources"),
("src/utils/resources/model_db.json", "resources"),
("src/utils/resources/opt_flags.json", "resources"),
("src/utils/resources/base_model.json", "resources"),
("web/ui/css/*", "ui/css"),
("web/ui/js/*", "ui/js"),
("web/ui/logos/*", "logos"),
(
"../language_models/src/pipelines/minigpt4_utils/configs/*",
"minigpt4_utils/configs",
),
(
"../language_models/src/pipelines/minigpt4_utils/prompts/*",
"minigpt4_utils/prompts",
),
]
# hidden imports for pyinstaller
hiddenimports = ["shark", "shark.shark_inference", "apps"]
hiddenimports += [x for x in collect_submodules("skimage") if "tests" not in x]
hiddenimports += [
x for x in collect_submodules("diffusers") if "tests" not in x
]
hiddenimports = ["shark", "apps"]
hiddenimports += [x for x in collect_submodules("gradio") if "tests" not in x]
hiddenimports += [x for x in collect_submodules("diffusers") if "tests" not in x]
blacklist = ["tests", "convert"]
hiddenimports += [
x
for x in collect_submodules("transformers")
if not any(kw in x for kw in blacklist)
]
hiddenimports += [x for x in collect_submodules("iree") if "tests" not in x]
hiddenimports += ["iree._runtime", "iree.compiler._mlir_libs._mlir.ir"]
hiddenimports += [x for x in collect_submodules("iree") if "test" not in x]
hiddenimports += ["iree._runtime"]
hiddenimports += [x for x in collect_submodules("scipy") if "test" not in x]


@@ -0,0 +1,58 @@
# Copyright 2023 Nod Labs, Inc
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
import logging
import unittest
import json
import gc
from apps.shark_studio.api.llm import LanguageModel, llm_chat_api
from apps.shark_studio.api.sd import shark_sd_fn_dict_input, view_json_file
from apps.shark_studio.web.utils.file_utils import get_resource_path
# class SDAPITest(unittest.TestCase):
# def testSDSimple(self):
# from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
# import apps.shark_studio.web.utils.globals as global_obj
# global_obj._init()
# sd_json = view_json_file(get_resource_path("../configs/default_sd_config.json"))
# sd_kwargs = json.loads(sd_json)
# for arg in vars(cmd_opts):
# if arg in sd_kwargs:
# sd_kwargs[arg] = getattr(cmd_opts, arg)
# for i in shark_sd_fn_dict_input(sd_kwargs):
# print(i)
class LLMAPITest(unittest.TestCase):
def test01_LLMSmall(self):
lm = LanguageModel(
"TinyPixel/small-llama2",
hf_auth_token=None,
device="cpu",
precision="fp32",
quantization="None",
streaming_llm=True,
)
count = 0
label = "Turkishoure Turkish"
for msg, _ in lm.chat("hi, what are you?"):
# skip first token output
if count == 0:
count += 1
continue
assert (
msg.strip(" ") == label
), f"LLM API failed to return correct response, expected '{label}', received {msg}"
break
del lm
gc.collect()
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
unittest.main()


@@ -0,0 +1,41 @@
import torch
from diffusers import (
UNet2DConditionModel,
)
from torch.fx.experimental.proxy_tensor import make_fx
class UnetModel(torch.nn.Module):
def __init__(self, hf_model_name):
super().__init__()
self.unet = UNet2DConditionModel.from_pretrained(
hf_model_name,
subfolder="unet",
)
def forward(self, sample, timestep, encoder_hidden_states, guidance_scale):
samples = torch.cat([sample] * 2)
unet_out = self.unet.forward(
samples, timestep, encoder_hidden_states, return_dict=False
)[0]
noise_pred_uncond, noise_pred_text = unet_out.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (
noise_pred_text - noise_pred_uncond
)
return noise_pred
if __name__ == "__main__":
hf_model_name = "CompVis/stable-diffusion-v1-4"
unet = UnetModel(hf_model_name)
inputs = (torch.randn(1, 4, 64, 64), 1, torch.randn(2, 77, 768), 7.5)
fx_g = make_fx(
unet,
decomposition_table={},
tracing_mode="symbolic",
_allow_non_fake_inputs=True,
_allow_fake_constant=False,
)(*inputs)
print(fx_g)

Binary file not shown (new image asset, 347 KiB).


@@ -0,0 +1,45 @@
import requests
from PIL import Image
import base64
from io import BytesIO
import json
def llm_chat_test(verbose=False):
# Define values here
prompt = "What is the significance of the number 42?"
url = "http://127.0.0.1:8080/v1/chat/completions"
headers = {
"User-Agent": "PythonTest",
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate, br",
}
data = {
"model": "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
"messages": [
{
"role": "",
"content": prompt,
}
],
"device": "vulkan://0",
"max_tokens": 4096,
}
res = requests.post(url=url, json=data, headers=headers, timeout=1000)
res_dict = json.loads(res.content.decode("utf-8"))
print(f"[chat] response from server was : {res.status_code} {res.reason}")
if verbose or res.status_code != 200:
print(f"\n{res_dict['choices'][0]['message']['content']}\n")
if __name__ == "__main__":
# "Exercises the chatbot REST API of Shark. Make sure "
# "Shark is running in API mode on 127.0.0.1:8080 before running"
# "this script."
llm_chat_test(verbose=True)


@@ -0,0 +1,286 @@
import base64
import io
import os
import time
import datetime
import uvicorn
import ipaddress
import requests
import threading
import collections
import gradio as gr
from PIL import Image, PngImagePlugin
from threading import Lock
from io import BytesIO
from fastapi import APIRouter, Depends, FastAPI, Request, Response
from fastapi.security import HTTPBasic, HTTPBasicCredentials
from fastapi.exceptions import HTTPException
from fastapi.responses import JSONResponse
from fastapi.encoders import jsonable_encoder
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
# from sdapi_v1 import shark_sd_api
from apps.shark_studio.api.llm import llm_chat_api
def decode_base64_to_image(encoding):
if encoding.startswith("http://") or encoding.startswith("https://"):
headers = {}
response = requests.get(encoding, timeout=30, headers=headers)
try:
image = Image.open(BytesIO(response.content))
return image
except Exception as e:
raise HTTPException(status_code=500, detail="Invalid image url") from e
if encoding.startswith("data:image/"):
encoding = encoding.split(";")[1].split(",")[1]
try:
image = Image.open(BytesIO(base64.b64decode(encoding)))
return image
except Exception as e:
raise HTTPException(status_code=500, detail="Invalid encoded image") from e
def encode_pil_to_base64(image):
with io.BytesIO() as output_bytes:
use_metadata = False
metadata = PngImagePlugin.PngInfo()
for key, value in image.info.items():
if isinstance(key, str) and isinstance(value, str):
metadata.add_text(key, value)
use_metadata = True
image.save(
output_bytes,
format="PNG",
pnginfo=(metadata if use_metadata else None),
)
bytes_data = output_bytes.getvalue()
return base64.b64encode(bytes_data)
# reference: https://gist.github.com/vitaliyp/6d54dd76ca2c3cdfc1149d33007dc34a
class FIFOLock(object):
def __init__(self):
self._lock = threading.Lock()
self._inner_lock = threading.Lock()
self._pending_threads = collections.deque()
def acquire(self, blocking=True):
with self._inner_lock:
lock_acquired = self._lock.acquire(False)
if lock_acquired:
return True
elif not blocking:
return False
release_event = threading.Event()
self._pending_threads.append(release_event)
release_event.wait()
return self._lock.acquire()
def release(self):
with self._inner_lock:
if self._pending_threads:
release_event = self._pending_threads.popleft()
release_event.set()
self._lock.release()
__enter__ = acquire
def __exit__(self, t, v, tb):
self.release()
def api_middleware(app: FastAPI):
rich_available = False
try:
if os.environ.get("WEBUI_RICH_EXCEPTIONS", None) is not None:
import anyio # importing just so it can be placed on silent list
import starlette # importing just so it can be placed on silent list
from rich.console import Console
console = Console()
rich_available = True
except Exception:
pass
@app.middleware("http")
async def log_and_time(req: Request, call_next):
ts = time.time()
res: Response = await call_next(req)
duration = str(round(time.time() - ts, 4))
res.headers["X-Process-Time"] = duration
endpoint = req.scope.get("path", "err")
if cmd_opts.api_log and endpoint.startswith("/sdapi"):
print(
"API {t} {code} {prot}/{ver} {method} {endpoint} {cli} {duration}".format(
t=datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
code=res.status_code,
ver=req.scope.get("http_version", "0.0"),
cli=req.scope.get("client", ("0:0.0.0", 0))[0],
prot=req.scope.get("scheme", "err"),
method=req.scope.get("method", "err"),
endpoint=endpoint,
duration=duration,
)
)
return res
def handle_exception(request: Request, e: Exception):
err = {
"error": type(e).__name__,
"detail": vars(e).get("detail", ""),
"body": vars(e).get("body", ""),
"errors": str(e),
}
if not isinstance(
e, HTTPException
): # do not print backtrace on known httpexceptions
message = f"API error: {request.method}: {request.url} {err}"
if rich_available:
print(message)
console.print_exception(
show_locals=True,
max_frames=2,
extra_lines=1,
suppress=[anyio, starlette],
word_wrap=False,
width=min([console.width, 200]),
)
else:
print(message)
raise (e)
return JSONResponse(
status_code=vars(e).get("status_code", 500),
content=jsonable_encoder(err),
)
@app.middleware("http")
async def exception_handling(request: Request, call_next):
try:
return await call_next(request)
except Exception as e:
return handle_exception(request, e)
@app.exception_handler(Exception)
async def fastapi_exception_handler(request: Request, e: Exception):
return handle_exception(request, e)
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, e: HTTPException):
return handle_exception(request, e)
class ApiCompat:
def __init__(self, app: FastAPI, queue_lock: Lock):
self.router = APIRouter()
self.app = app
self.queue_lock = queue_lock
api_middleware(self.app)
# self.add_api_route("/sdapi/v1/txt2img", shark_sd_api, methods=["POST"])
# self.add_api_route("/sdapi/v1/img2img", shark_sd_api, methods=["POST"])
# self.add_api_route("/sdapi/v1/upscaler", self.upscaler_api, methods=["POST"])
# self.add_api_route("/sdapi/v1/extra-single-image", self.extras_single_image_api, methods=["POST"], response_model=models.ExtrasSingleImageResponse)
# self.add_api_route("/sdapi/v1/extra-batch-images", self.extras_batch_images_api, methods=["POST"], response_model=models.ExtrasBatchImagesResponse)
# self.add_api_route("/sdapi/v1/png-info", self.pnginfoapi, methods=["POST"], response_model=models.PNGInfoResponse)
# self.add_api_route("/sdapi/v1/progress", self.progressapi, methods=["GET"], response_model=models.ProgressResponse)
# self.add_api_route("/sdapi/v1/interrogate", self.interrogateapi, methods=["POST"])
# self.add_api_route("/sdapi/v1/interrupt", self.interruptapi, methods=["POST"])
# self.add_api_route("/sdapi/v1/skip", self.skip, methods=["POST"])
# self.add_api_route("/sdapi/v1/options", self.get_config, methods=["GET"], response_model=models.OptionsModel)
# self.add_api_route("/sdapi/v1/options", self.set_config, methods=["POST"])
# self.add_api_route("/sdapi/v1/cmd-flags", self.get_cmd_flags, methods=["GET"], response_model=models.FlagsModel)
# self.add_api_route("/sdapi/v1/samplers", self.get_samplers, methods=["GET"], response_model=List[models.SamplerItem])
# self.add_api_route("/sdapi/v1/upscalers", self.get_upscalers, methods=["GET"], response_model=List[models.UpscalerItem])
# self.add_api_route("/sdapi/v1/latent-upscale-modes", self.get_latent_upscale_modes, methods=["GET"], response_model=List[models.LatentUpscalerModeItem])
# self.add_api_route("/sdapi/v1/sd-models", self.get_sd_models, methods=["GET"], response_model=List[models.SDModelItem])
# self.add_api_route("/sdapi/v1/sd-vae", self.get_sd_vaes, methods=["GET"], response_model=List[models.SDVaeItem])
# self.add_api_route("/sdapi/v1/hypernetworks", self.get_hypernetworks, methods=["GET"], response_model=List[models.HypernetworkItem])
# self.add_api_route("/sdapi/v1/face-restorers", self.get_face_restorers, methods=["GET"], response_model=List[models.FaceRestorerItem])
# self.add_api_route("/sdapi/v1/realesrgan-models", self.get_realesrgan_models, methods=["GET"], response_model=List[models.RealesrganItem])
# self.add_api_route("/sdapi/v1/prompt-styles", self.get_prompt_styles, methods=["GET"], response_model=List[models.PromptStyleItem])
# self.add_api_route("/sdapi/v1/embeddings", self.get_embeddings, methods=["GET"], response_model=models.EmbeddingsResponse)
# self.add_api_route("/sdapi/v1/refresh-checkpoints", self.refresh_checkpoints, methods=["POST"])
# self.add_api_route("/sdapi/v1/refresh-vae", self.refresh_vae, methods=["POST"])
# self.add_api_route("/sdapi/v1/create/embedding", self.create_embedding, methods=["POST"], response_model=models.CreateResponse)
# self.add_api_route("/sdapi/v1/create/hypernetwork", self.create_hypernetwork, methods=["POST"], response_model=models.CreateResponse)
# self.add_api_route("/sdapi/v1/preprocess", self.preprocess, methods=["POST"], response_model=models.PreprocessResponse)
# self.add_api_route("/sdapi/v1/train/embedding", self.train_embedding, methods=["POST"], response_model=models.TrainResponse)
# self.add_api_route("/sdapi/v1/train/hypernetwork", self.train_hypernetwork, methods=["POST"], response_model=models.TrainResponse)
# self.add_api_route("/sdapi/v1/memory", self.get_memory, methods=["GET"], response_model=models.MemoryResponse)
# self.add_api_route("/sdapi/v1/unload-checkpoint", self.unloadapi, methods=["POST"])
# self.add_api_route("/sdapi/v1/reload-checkpoint", self.reloadapi, methods=["POST"])
# self.add_api_route("/sdapi/v1/scripts", self.get_scripts_list, methods=["GET"], response_model=models.ScriptsList)
# self.add_api_route("/sdapi/v1/script-info", self.get_script_info, methods=["GET"], response_model=List[models.ScriptInfo])
# chat APIs needed for compatibility with multiple extensions using OpenAI API
self.add_api_route("/v1/chat/completions", llm_chat_api, methods=["POST"])
self.add_api_route("/v1/completions", llm_chat_api, methods=["POST"])
self.add_api_route("/chat/completions", llm_chat_api, methods=["POST"])
self.add_api_route("/completions", llm_chat_api, methods=["POST"])
self.add_api_route(
"/v1/engines/codegen/completions", llm_chat_api, methods=["POST"]
)
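# Illustrative client call (an assumption, not part of the original file):
# these routes are meant to accept OpenAI-style payloads, so something like
#
#     import requests
#     requests.post(
#         f"http://localhost:{server_port}/v1/chat/completions",
#         json={"model": "codegen",
#               "messages": [{"role": "user", "content": "Hello"}]},
#     )
#
# where server_port matches cmd_opts.server_port, would be forwarded to
# llm_chat_api as a dict.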
self.default_script_arg_txt2img = []
self.default_script_arg_img2img = []
def add_api_route(self, path: str, endpoint, **kwargs):
return self.app.add_api_route(path, endpoint, **kwargs)
# def refresh_checkpoints(self):
# with self.queue_lock:
# studio_data.refresh_checkpoints()
# def refresh_vae(self):
# with self.queue_lock:
# studio_data.refresh_vae_list()
# def unloadapi(self):
# unload_model_weights()
# return {}
# def reloadapi(self):
# reload_model_weights()
# return {}
# def skip(self):
# studio.state.skip()
def launch(self, server_name, port, root_path):
self.app.include_router(self.router)
uvicorn.run(
self.app,
host=server_name,
port=port,
root_path=root_path,
)
# def kill_studio(self):
# restart.stop_program()
# def restart_studio(self):
# if restart.is_restartable():
# restart.restart_program()
# return Response(status_code=501)
# def preprocess(self, args: dict):
# try:
# studio.state.begin(job="preprocess")
# preprocess(**args)
# studio.state.end()
# return models.PreprocessResponse(info="preprocess complete")
# except:
# studio.state.end()
# def stop_studio(request):
# studio.state.server_command = "stop"
# return Response("Stopping.")

View File

@@ -0,0 +1 @@

View File

@@ -1,20 +1,59 @@
from multiprocessing import Process, freeze_support
freeze_support()
from PIL import Image
import os
import time
import sys
import logging
from ui.chat import chat_element
import apps.shark_studio.api.initializers as initialize
from apps.shark_studio.modules import timer
startup_timer = timer.startup_timer
startup_timer.record("launcher")
initialize.imports()
if sys.platform == "darwin":
os.environ["DYLD_LIBRARY_PATH"] = "/usr/local/lib"
# import before IREE to avoid MLIR library issues
import torch_mlir
# import PIL, transformers, sentencepiece # ensures inclusion in pyinstaller exe generation
# from apps.stable_diffusion.src import args, clear_all
# import apps.stable_diffusion.web.utils.global_obj as global_obj
def create_api(app):
from apps.shark_studio.web.api.compat import ApiCompat, FIFOLock
queue_lock = FIFOLock()
api = ApiCompat(app, queue_lock)
return api
def launch_app(address):
def api_only():
from fastapi import FastAPI
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
initialize.initialize()
app = FastAPI()
initialize.setup_middleware(app)
api = create_api(app)
# from modules import script_callbacks
# script_callbacks.before_ui_callback()
# script_callbacks.app_started_callback(None, app)
print(f"Startup time: {startup_timer.summary()}.")
api.launch(
server_name="0.0.0.0",
port=cmd_opts.server_port,
root_path="",
)
def launch_webui(address):
from tkinter import Tk
import webview
@@ -34,140 +73,78 @@ def launch_app(address):
webview.start(private_mode=False, storage_path=os.getcwd())
if __name__ == "__main__":
# if args.debug:
logging.basicConfig(level=logging.DEBUG)
def webui():
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
from apps.shark_studio.web.ui.utils import (
amdicon_loc,
amdlogo_loc,
)
launch_api = cmd_opts.api
initialize.initialize()
from ui.chat import chat_element
from ui.sd import sd_element
from ui.outputgallery import outputgallery_element
# required to do multiprocessing in a pyinstaller freeze
freeze_support()
# if args.api or "api" in args.ui.split(","):
# from apps.stable_diffusion.web.ui import (
# txt2img_api,
# img2img_api,
# upscaler_api,
# inpaint_api,
# outpaint_api,
# llm_chat_api,
# )
#
# from fastapi import FastAPI, APIRouter
# import uvicorn
#
# # init global sd pipeline and config
# global_obj._init()
#
# app = FastAPI()
# app.add_api_route("/sdapi/v1/txt2img", txt2img_api, methods=["post"])
# app.add_api_route("/sdapi/v1/img2img", img2img_api, methods=["post"])
# app.add_api_route("/sdapi/v1/inpaint", inpaint_api, methods=["post"])
# app.add_api_route("/sdapi/v1/outpaint", outpaint_api, methods=["post"])
# app.add_api_route("/sdapi/v1/upscaler", upscaler_api, methods=["post"])
#
# # chat APIs needed for compatibility with multiple extensions using OpenAI API
# app.add_api_route(
# "/v1/chat/completions", llm_chat_api, methods=["post"]
# )
# app.add_api_route("/v1/completions", llm_chat_api, methods=["post"])
# app.add_api_route("/chat/completions", llm_chat_api, methods=["post"])
# app.add_api_route("/completions", llm_chat_api, methods=["post"])
# app.add_api_route(
# "/v1/engines/codegen/completions", llm_chat_api, methods=["post"]
# )
# app.include_router(APIRouter())
# uvicorn.run(app, host="0.0.0.0", port=args.server_port)
# sys.exit(0)
#
# Setup to use shark_tmp for gradio's temporary image files and clear any
# existing temporary images there if they exist. Then we can import gradio.
# It has to be in this order or gradio ignores what we've set up.
# from apps.stable_diffusion.web.utils.gradio_configs import (
# config_gradio_tmp_imgs_folder,
# )
# config_gradio_tmp_imgs_folder()
# if args.api or "api" in args.ui.split(","):
# from apps.shark_studio.api.llm import (
# chat,
# )
# from apps.shark_studio.web.api import sdapi
#
# from fastapi import FastAPI, APIRouter
# from fastapi.middleware.cors import CORSMiddleware
# import uvicorn
#
# # init global sd pipeline and config
# global_obj._init()
#
# api = FastAPI()
# api.mount("/sdapi/", sdapi)
#
# # chat APIs needed for compatibility with multiple extensions using OpenAI API
# api.add_api_route(
# "/v1/chat/completions", llm_chat_api, methods=["post"]
# )
# api.add_api_route("/v1/completions", llm_chat_api, methods=["post"])
# api.add_api_route("/chat/completions", llm_chat_api, methods=["post"])
# api.add_api_route("/completions", llm_chat_api, methods=["post"])
# api.add_api_route(
# "/v1/engines/codegen/completions", llm_chat_api, methods=["post"]
# )
# api.include_router(APIRouter())
#
# # deal with CORS requests if CORS accept origins are set
# if args.api_accept_origin:
# print(
# f"API Configured for CORS. Accepting origins: { args.api_accept_origin }"
# )
# api.add_middleware(
# CORSMiddleware,
# allow_origins=args.api_accept_origin,
# allow_methods=["GET", "POST"],
# allow_headers=["*"],
# )
# else:
# print("API not configured for CORS")
#
# uvicorn.run(api, host="0.0.0.0", port=args.server_port)
# sys.exit(0)
import gradio as gr
# Create custom models folders if they don't exist
# from apps.stable_diffusion.web.ui.utils import create_custom_models_folders
# create_custom_models_folders()
def resource_path(relative_path):
"""Get absolute path to resource, works for dev and for PyInstaller"""
base_path = getattr(
sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__))
)
base_path = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__)))
return os.path.join(base_path, relative_path)
dark_theme = resource_path("ui/css/sd_dark_theme.css")
gradio_workarounds = resource_path("ui/js/sd_gradio_workarounds.js")
# from apps.stable_diffusion.web.ui import (
# txt2img_web,
# txt2img_custom_model,
# txt2img_gallery,
# txt2img_png_info_img,
# txt2img_status,
# txt2img_sendto_img2img,
# txt2img_sendto_inpaint,
# txt2img_sendto_outpaint,
# txt2img_sendto_upscaler,
## h2ogpt_upload,
## h2ogpt_web,
# img2img_web,
# img2img_custom_model,
# img2img_gallery,
# img2img_init_image,
# img2img_status,
# img2img_sendto_inpaint,
# img2img_sendto_outpaint,
# img2img_sendto_upscaler,
# inpaint_web,
# inpaint_custom_model,
# inpaint_gallery,
# inpaint_init_image,
# inpaint_status,
# inpaint_sendto_img2img,
# inpaint_sendto_outpaint,
# inpaint_sendto_upscaler,
# outpaint_web,
# outpaint_custom_model,
# outpaint_gallery,
# outpaint_init_image,
# outpaint_status,
# outpaint_sendto_img2img,
# outpaint_sendto_inpaint,
# outpaint_sendto_upscaler,
# upscaler_web,
# upscaler_custom_model,
# upscaler_gallery,
# upscaler_init_image,
# upscaler_status,
# upscaler_sendto_img2img,
# upscaler_sendto_inpaint,
# upscaler_sendto_outpaint,
## lora_train_web,
## model_web,
## model_config_web,
# hf_models,
# modelmanager_sendto_txt2img,
# modelmanager_sendto_img2img,
# modelmanager_sendto_inpaint,
# modelmanager_sendto_outpaint,
# modelmanager_sendto_upscaler,
# stablelm_chat,
# minigpt4_web,
# outputgallery_web,
# outputgallery_tab_select,
# outputgallery_watch,
# outputgallery_filename,
# outputgallery_sendto_txt2img,
# outputgallery_sendto_img2img,
# outputgallery_sendto_inpaint,
# outputgallery_sendto_outpaint,
# outputgallery_sendto_upscaler,
# )
# init global sd pipeline and config
# global_obj._init()
# from apps.shark_studio.web.ui import load_ui_from_script
def register_button_click(button, selectedid, inputs, outputs):
button.click(
@@ -179,17 +156,6 @@ if __name__ == "__main__":
outputs,
)
def register_modelmanager_button(button, selectedid, inputs, outputs):
button.click(
lambda x: (
"None",
x,
gr.Tabs.update(selected=selectedid),
),
inputs,
outputs,
)
def register_outputgallery_button(button, selectedid, inputs, outputs):
button.click(
lambda x: (
@@ -201,8 +167,19 @@ if __name__ == "__main__":
)
with gr.Blocks(
css=dark_theme, analytics_enabled=False, title="Stable Diffusion"
) as sd_web:
css=dark_theme,
js=gradio_workarounds,
analytics_enabled=False,
title="Shark Studio 2.0 Beta",
) as studio_web:
amd_logo = Image.open(amdlogo_loc)
gr.Image(
value=amd_logo,
show_label=False,
interactive=False,
elem_id="tab_bar_logo",
show_download_button=False,
)
with gr.Tabs() as tabs:
# NOTE: If adding, removing, or re-ordering tabs, make sure that they
# have a unique id that doesn't clash with any of the other tabs,
@@ -213,216 +190,33 @@ if __name__ == "__main__":
# destination of one of the 'send to' buttons. If you do have to change
# that id, make sure you update the relevant register_button_click calls
# further down with the new id.
# with gr.TabItem(label="Text-to-Image", id=0):
# txt2img_web.render()
# with gr.TabItem(label="Image-to-Image", id=1):
# img2img_web.render()
# with gr.TabItem(label="Inpainting", id=2):
# inpaint_web.render()
# with gr.TabItem(label="Outpainting", id=3):
# outpaint_web.render()
# with gr.TabItem(label="Upscaler", id=4):
# upscaler_web.render()
# if args.output_gallery:
# with gr.TabItem(label="Output Gallery", id=5) as og_tab:
# outputgallery_web.render()
# # extra output gallery configuration
# outputgallery_tab_select(og_tab.select)
# outputgallery_watch(
# [
# txt2img_status,
# img2img_status,
# inpaint_status,
# outpaint_status,
# upscaler_status,
# ]
# )
## with gr.TabItem(label="Model Manager", id=6):
## model_web.render()
## with gr.TabItem(label="LoRA Training (Experimental)", id=7):
## lora_train_web.render()
with gr.TabItem(label="Chat Bot", id=0):
with gr.TabItem(label="Stable Diffusion", id=0):
sd_element.render()
with gr.TabItem(label="Output Gallery", id=1):
outputgallery_element.render()
with gr.TabItem(label="Chat Bot", id=2):
chat_element.render()
## with gr.TabItem(
## label="Generate Sharding Config (Experimental)", id=9
## ):
## model_config_web.render()
# with gr.TabItem(label="MultiModal (Experimental)", id=10):
# minigpt4_web.render()
# with gr.TabItem(label="DocuChat Upload", id=11):
# h2ogpt_upload.render()
# with gr.TabItem(label="DocuChat(Experimental)", id=12):
# h2ogpt_web.render()
# send to buttons
# register_button_click(
# txt2img_sendto_img2img,
# 1,
# [txt2img_gallery],
# [img2img_init_image, tabs],
# )
# register_button_click(
# txt2img_sendto_inpaint,
# 2,
# [txt2img_gallery],
# [inpaint_init_image, tabs],
# )
# register_button_click(
# txt2img_sendto_outpaint,
# 3,
# [txt2img_gallery],
# [outpaint_init_image, tabs],
# )
# register_button_click(
# txt2img_sendto_upscaler,
# 4,
# [txt2img_gallery],
# [upscaler_init_image, tabs],
# )
# register_button_click(
# img2img_sendto_inpaint,
# 2,
# [img2img_gallery],
# [inpaint_init_image, tabs],
# )
# register_button_click(
# img2img_sendto_outpaint,
# 3,
# [img2img_gallery],
# [outpaint_init_image, tabs],
# )
# register_button_click(
# img2img_sendto_upscaler,
# 4,
# [img2img_gallery],
# [upscaler_init_image, tabs],
# )
# register_button_click(
# inpaint_sendto_img2img,
# 1,
# [inpaint_gallery],
# [img2img_init_image, tabs],
# )
# register_button_click(
# inpaint_sendto_outpaint,
# 3,
# [inpaint_gallery],
# [outpaint_init_image, tabs],
# )
# register_button_click(
# inpaint_sendto_upscaler,
# 4,
# [inpaint_gallery],
# [upscaler_init_image, tabs],
# )
# register_button_click(
# outpaint_sendto_img2img,
# 1,
# [outpaint_gallery],
# [img2img_init_image, tabs],
# )
# register_button_click(
# outpaint_sendto_inpaint,
# 2,
# [outpaint_gallery],
# [inpaint_init_image, tabs],
# )
# register_button_click(
# outpaint_sendto_upscaler,
# 4,
# [outpaint_gallery],
# [upscaler_init_image, tabs],
# )
# register_button_click(
# upscaler_sendto_img2img,
# 1,
# [upscaler_gallery],
# [img2img_init_image, tabs],
# )
# register_button_click(
# upscaler_sendto_inpaint,
# 2,
# [upscaler_gallery],
# [inpaint_init_image, tabs],
# )
# register_button_click(
# upscaler_sendto_outpaint,
# 3,
# [upscaler_gallery],
# [outpaint_init_image, tabs],
# )
# if args.output_gallery:
# register_outputgallery_button(
# outputgallery_sendto_txt2img,
# 0,
# [outputgallery_filename],
# [txt2img_png_info_img, tabs],
# )
# register_outputgallery_button(
# outputgallery_sendto_img2img,
# 1,
# [outputgallery_filename],
# [img2img_init_image, tabs],
# )
# register_outputgallery_button(
# outputgallery_sendto_inpaint,
# 2,
# [outputgallery_filename],
# [inpaint_init_image, tabs],
# )
# register_outputgallery_button(
# outputgallery_sendto_outpaint,
# 3,
# [outputgallery_filename],
# [outpaint_init_image, tabs],
# )
# register_outputgallery_button(
# outputgallery_sendto_upscaler,
# 4,
# [outputgallery_filename],
# [upscaler_init_image, tabs],
# )
# register_modelmanager_button(
# modelmanager_sendto_txt2img,
# 0,
# [hf_models],
# [txt2img_custom_model, tabs],
# )
# register_modelmanager_button(
# modelmanager_sendto_img2img,
# 1,
# [hf_models],
# [img2img_custom_model, tabs],
# )
# register_modelmanager_button(
# modelmanager_sendto_inpaint,
# 2,
# [hf_models],
# [inpaint_custom_model, tabs],
# )
# register_modelmanager_button(
# modelmanager_sendto_outpaint,
# 3,
# [hf_models],
# [outpaint_custom_model, tabs],
# )
# register_modelmanager_button(
# modelmanager_sendto_upscaler,
# 4,
# [hf_models],
# [upscaler_custom_model, tabs],
# )
studio_web.queue()
sd_web.queue()
# if args.ui == "app":
# t = Process(
# target=launch_app, args=[f"http://localhost:{args.server_port}"]
# )
# t.start()
sd_web.launch(
share=True,
studio_web.launch(
share=cmd_opts.share,
inbrowser=True,
server_name="0.0.0.0",
server_port=11911, # args.server_port,
server_port=cmd_opts.server_port,
favicon_path=amdicon_loc,
)
if __name__ == "__main__":
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
if cmd_opts.webui == False:
api_only()
else:
webui()

View File

@@ -1,16 +1,22 @@
import gradio as gr
import time
import os
from pathlib import Path
from datetime import datetime as dt
import json
import sys
from apps.shark_studio.api.utils import (
get_available_devices,
)
from apps.shark_studio.api.llm import (
llm_model_map,
LanguageModel,
)
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
import apps.shark_studio.web.utils.globals as global_obj
B_SYS, E_SYS = "<s>", "</s>"
def user(message, history):
@@ -18,107 +24,17 @@ def user(message, history):
return "", history + [[message, ""]]
def append_bot_prompt(history, input_prompt):
user_prompt = f"{input_prompt} {E_SYS} {E_SYS}"
history += user_prompt
return history
language_model = None
# NOTE: Each `model_name` should have its own start message
start_message = {
"llama2_7b": (
"You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or "
"illegal content. Please ensure that your responses are socially "
"unbiased and positive in nature. If a question does not make any "
"sense, or is not factually coherent, explain why instead of "
"answering something not correct. If you don't know the answer "
"to a question, please don't share false information."
),
"llama2_13b": (
"You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or "
"illegal content. Please ensure that your responses are socially "
"unbiased and positive in nature. If a question does not make any "
"sense, or is not factually coherent, explain why instead of "
"answering something not correct. If you don't know the answer "
"to a question, please don't share false information."
),
"llama2_70b": (
"You are a helpful, respectful and honest assistant. Always answer "
"as helpfully as possible, while being safe. Your answers should not "
"include any harmful, unethical, racist, sexist, toxic, dangerous, or "
"illegal content. Please ensure that your responses are socially "
"unbiased and positive in nature. If a question does not make any "
"sense, or is not factually coherent, explain why instead of "
"answering something not correct. If you don't know the answer "
"to a question, please don't share false information."
),
"vicuna": (
"A chat between a curious user and an artificial intelligence "
"assistant. The assistant gives helpful, detailed, and "
"polite answers to the user's questions.\n"
),
}
def create_prompt(model_name, history, prompt_prefix):
return ""
system_message = ""
if prompt_prefix:
system_message = start_message[model_name]
if "llama2" in model_name:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
conversation = "".join(
[f"{B_INST} {item[0]} {E_INST} {item[1]} " for item in history[1:]]
)
if prompt_prefix:
msg = f"{B_INST} {B_SYS}{system_message}{E_SYS}{history[0][0]} {E_INST} {history[0][1]} {conversation}"
else:
msg = f"{B_INST} {history[0][0]} {E_INST} {history[0][1]} {conversation}"
elif model_name in ["vicuna"]:
conversation = "".join(
[
"".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
for item in history
]
)
msg = system_message + conversation
msg = msg.strip()
else:
conversation = "".join(
["".join([item[0], item[1]]) for item in history]
)
msg = system_message + conversation
msg = msg.strip()
return msg
def get_default_config():
return False
import torch
from transformers import AutoTokenizer
hf_model_path = "TheBloke/vicuna-7B-1.1-HF"
tokenizer = AutoTokenizer.from_pretrained(hf_model_path, use_fast=False)
compilation_prompt = "".join(["0" for _ in range(17)])
compilation_input_ids = tokenizer(
compilation_prompt,
return_tensors="pt",
).input_ids
compilation_input_ids = torch.tensor(compilation_input_ids).reshape(
[1, 19]
)
firstVicunaCompileInput = (compilation_input_ids,)
from apps.language_models.src.model_wrappers.vicuna_model import (
CombinedModel,
)
from shark.shark_generate_model_config import GenerateConfigFile
model = CombinedModel()
c = GenerateConfigFile(model, 1, ["gpu_id"], firstVicunaCompileInput)
c.split_into_layers()
# model_vmfb_key = ""
@@ -132,259 +48,43 @@ def chat_fn(
precision,
download_vmfb,
config_file,
streaming_llm,
cli=False,
progress=gr.Progress(),
):
global language_model
if streaming_llm and prompt_prefix == "Clear":
language_model = None
return "Clearing history...", ""
if language_model is None:
history[-1][-1] = "Getting the model ready..."
yield history, ""
language_model = LanguageModel(
model, device=device, precision=precision
)
language_model.chat(prompt_prefix)
return "", ""
global past_key_values
global model_vmfb_key
device_id = None
model_name, model_path = list(map(str.strip, model.split("=>")))
if "cuda" in device:
device = "cuda"
elif "sync" in device:
device = "cpu-sync"
elif "task" in device:
device = "cpu-task"
elif "vulkan" in device:
device_id = int(device.split("://")[1])
device = "vulkan"
elif "rocm" in device:
device = "rocm"
else:
print("unrecognized device")
from apps.language_models.scripts.vicuna import ShardedVicuna
from apps.language_models.scripts.vicuna import UnshardedVicuna
from apps.stable_diffusion.src import args
new_model_vmfb_key = f"{model_name}#{model_path}#{device}#{device_id}#{precision}#{download_vmfb}"
if vicuna_model is None or new_model_vmfb_key != model_vmfb_key:
model_vmfb_key = new_model_vmfb_key
max_toks = 128 if model_name == "codegen" else 512
# get iree flags that need to be overridden, from commandline args
_extra_args = []
# vulkan target triple
vulkan_target_triple = args.iree_vulkan_target_triple
from shark.iree_utils.vulkan_utils import (
get_all_vulkan_devices,
get_vulkan_target_triple,
)
if device == "vulkan":
vulkaninfo_list = get_all_vulkan_devices()
if vulkan_target_triple == "":
# We already have the device_id extracted via WebUI, so we directly use
# that to find the target triple.
vulkan_target_triple = get_vulkan_target_triple(
vulkaninfo_list[device_id]
)
_extra_args.append(
f"-iree-vulkan-target-triple={vulkan_target_triple}"
)
if "rdna" in vulkan_target_triple:
flags_to_add = [
"--iree-spirv-index-bits=64",
]
_extra_args = _extra_args + flags_to_add
if device_id is None:
id = 0
for device in vulkaninfo_list:
target_triple = get_vulkan_target_triple(
vulkaninfo_list[id]
)
if target_triple == vulkan_target_triple:
device_id = id
break
id += 1
assert (
device_id
), f"no vulkan hardware for target-triple '{vulkan_target_triple}' exists"
print(f"Will use vulkan target triple : {vulkan_target_triple}")
elif "rocm" in device:
# add iree rocm flags
_extra_args.append(
f"--iree-rocm-target-chip={args.iree_rocm_target_chip}"
)
print(f"extra args = {_extra_args}")
if model_name == "vicuna4":
vicuna_model = ShardedVicuna(
model_name,
hf_model_path=model_path,
device=device,
precision=precision,
max_num_tokens=max_toks,
compressed=True,
extra_args_cmd=_extra_args,
)
else:
# if config_file is None:
vicuna_model = UnshardedVicuna(
model_name,
hf_model_path=model_path,
hf_auth_token=args.hf_auth_token,
device=device,
vulkan_target_triple=vulkan_target_triple,
precision=precision,
max_num_tokens=max_toks,
download_vmfb=download_vmfb,
load_mlir_from_shark_tank=True,
extra_args_cmd=_extra_args,
device_id=device_id,
)
if vicuna_model is None:
sys.exit("Unable to instantiate the model object, exiting.")
prompt = create_prompt(model_name, history, prompt_prefix)
partial_text = ""
token_count = 0
total_time_ms = 0.001 # In order to avoid divide by zero error
prefill_time = 0
is_first = True
for text, msg, exec_time in progress.tqdm(
vicuna_model.generate(prompt, cli=cli),
desc="generating response",
):
if msg is None:
if is_first:
prefill_time = exec_time
is_first = False
else:
total_time_ms += exec_time
token_count += 1
partial_text += text + " "
history[-1][1] = partial_text
yield history, f"Prefill: {prefill_time:.2f}"
elif "formatted" in msg:
history[-1][1] = text
tokens_per_sec = (token_count / total_time_ms) * 1000
yield history, f"Prefill: {prefill_time:.2f} seconds\n Decode: {tokens_per_sec:.2f} tokens/sec"
else:
sys.exit(
"unexpected message from the vicuna generate call, exiting."
)
return history, ""
def llm_chat_api(InputData: dict):
return None
print(f"Input keys : {InputData.keys()}")
# print(f"model : {InputData['model']}")
is_chat_completion_api = (
"messages" in InputData.keys()
) # else it is the legacy `completion` api
# For Debugging input data from API
# if is_chat_completion_api:
# print(f"message -> role : {InputData['messages'][0]['role']}")
# print(f"message -> content : {InputData['messages'][0]['content']}")
# else:
# print(f"prompt : {InputData['prompt']}")
# print(f"max_tokens : {InputData['max_tokens']}") # Default to 128 for now
global vicuna_model
model_name = (
InputData["model"] if "model" in InputData.keys() else "codegen"
)
model_path = llm_model_map[model_name]
device = "cpu-task"
precision = "fp16"
max_toks = (
None
if "max_tokens" not in InputData.keys()
else InputData["max_tokens"]
)
if max_toks is None:
max_toks = 128 if model_name == "codegen" else 512
# make it work for codegen first
from apps.language_models.scripts.vicuna import (
UnshardedVicuna,
)
device_id = None
if vicuna_model == 0:
if "cuda" in device:
device = "cuda"
elif "sync" in device:
device = "cpu-sync"
elif "task" in device:
device = "cpu-task"
elif "vulkan" in device:
device_id = int(device.split("://")[1])
device = "vulkan"
else:
print("unrecognized device")
vicuna_model = UnshardedVicuna(
model_name,
hf_model_path=model_path,
model,
device=device,
precision=precision,
max_num_tokens=max_toks,
download_vmfb=True,
load_mlir_from_shark_tank=True,
device_id=device_id,
external_weights="safetensors",
use_system_prompt=prompt_prefix,
streaming_llm=streaming_llm,
hf_auth_token=cmd_opts.hf_auth_token,
)
# TODO: add role dict for different models
if is_chat_completion_api:
# TODO: add functionality for multiple messages
prompt = create_prompt(
model_name, [(InputData["messages"][0]["content"], "")]
)
else:
prompt = InputData["prompt"]
print("prompt = ", prompt)
res = vicuna_model.generate(prompt)
res_op = None
for op in res:
res_op = op
if is_chat_completion_api:
choices = [
{
"index": 0,
"message": {
"role": "assistant",
"content": res_op, # since we are yeilding the result
},
"finish_reason": "stop", # or length
}
]
else:
choices = [
{
"text": res_op,
"index": 0,
"logprobs": None,
"finish_reason": "stop", # or length
}
]
end_time = dt.now().strftime("%Y%m%d%H%M%S%f")
return {
"id": end_time,
"object": "chat.completion"
if is_chat_completion_api
else "text_completion",
"created": int(end_time),
"choices": choices,
}
history[-1][-1] = "Getting the model ready... Done"
yield history, ""
history[-1][-1] = ""
token_count = 0
total_time = 0.001 # In order to avoid divide by zero error
prefill_time = 0
is_first = True
for text, exec_time in language_model.chat(history):
history[-1][-1] = f"{text}{E_SYS}"
if is_first:
prefill_time = exec_time
is_first = False
yield history, f"Prefill: {prefill_time:.2f}"
else:
total_time += exec_time
token_count += 1
tokens_per_sec = token_count / total_time
yield history, f"Prefill: {prefill_time:.2f} seconds\n Decode: {tokens_per_sec:.2f} tokens/sec"
def view_json_file(file_obj):
@@ -403,7 +103,7 @@ with gr.Blocks(title="Chat") as chat_element:
choices=model_choices,
allow_custom_value=True,
)
supported_devices = get_available_devices()
supported_devices = global_obj.get_device_list()
enabled = True
if len(supported_devices) == 0:
supported_devices = ["cpu-task"]
@@ -417,7 +117,7 @@ with gr.Blocks(title="Chat") as chat_element:
)
precision = gr.Radio(
label="Precision",
value="int4",
value="fp32",
choices=[
# "int4",
# "int8",
@@ -430,12 +130,19 @@ with gr.Blocks(title="Chat") as chat_element:
with gr.Column():
download_vmfb = gr.Checkbox(
label="Download vmfb from Shark tank if available",
value=True,
value=False,
interactive=True,
visible=False,
)
streaming_llm = gr.Checkbox(
label="Run in streaming mode (requires recompilation)",
value=True,
interactive=False,
visible=False,
)
prompt_prefix = gr.Checkbox(
label="Add System Prompt",
value=False,
value=True,
interactive=True,
)
@@ -457,11 +164,9 @@ with gr.Blocks(title="Chat") as chat_element:
with gr.Row(visible=False):
with gr.Group():
config_file = gr.File(
label="Upload sharding configuration", visible=False
)
json_view_button = gr.Button(label="View as JSON", visible=False)
json_view = gr.JSON(interactive=True, visible=False)
config_file = gr.File(label="Upload sharding configuration", visible=False)
json_view_button = gr.Button("View as JSON", visible=False)
json_view = gr.JSON(visible=False)
json_view_button.click(
fn=view_json_file, inputs=[config_file], outputs=[json_view]
)
@@ -481,6 +186,7 @@ with gr.Blocks(title="Chat") as chat_element:
precision,
download_vmfb,
config_file,
streaming_llm,
],
outputs=[chatbot, tokens_time],
show_progress=False,
@@ -502,6 +208,7 @@ with gr.Blocks(title="Chat") as chat_element:
precision,
download_vmfb,
config_file,
streaming_llm,
],
outputs=[chatbot, tokens_time],
show_progress=False,
@@ -514,4 +221,19 @@ with gr.Blocks(title="Chat") as chat_element:
cancels=[submit_event, submit_click_event],
queue=False,
)
clear.click(lambda: None, None, [chatbot], queue=False)
clear.click(
fn=chat_fn,
inputs=[
clear,
chatbot,
model,
device,
precision,
download_vmfb,
config_file,
streaming_llm,
],
outputs=[chatbot, tokens_time],
show_progress=False,
queue=True,
).then(lambda: None, None, [chatbot], queue=False)

View File

@@ -0,0 +1,67 @@
from apps.shark_studio.web.ui.utils import (
HSLHue,
hsl_color,
)
from apps.shark_studio.modules.embeddings import get_lora_metadata
# Returns HTML showing the most frequent tags used when a LoRA was trained,
# taken from the metadata of its .safetensors file.
def lora_changed(lora_files):
# tag frequency percentage that gets the maximum amount of the starting hue
TAG_COLOR_THRESHOLD = 0.55
# tag frequency percentage above which a tag is displayed
TAG_DISPLAY_THRESHOLD = 0.65
# template for the html used to display a tag
TAG_HTML_TEMPLATE = (
'<span class="lora-tag" style="border: 1px solid {color};">{tag}</span>'
)
output = []
for lora_file in lora_files:
if lora_file == "":
output.extend(["<div><i>No LoRA selected</i></div>"])
elif not lora_file.lower().endswith(".safetensors"):
output.extend(
[
"<div><i>Only metadata queries for .safetensors files are currently supported</i></div>"
]
)
else:
metadata = get_lora_metadata(lora_file)
if metadata:
frequencies = metadata["frequencies"]
output.extend(
[
"".join(
[
f'<div class="lora-model">Trained against weights in: {metadata["model"]}</div>'
]
+ [
TAG_HTML_TEMPLATE.format(
color=hsl_color(
(tag[1] - TAG_COLOR_THRESHOLD)
/ (1 - TAG_COLOR_THRESHOLD),
start=HSLHue.RED,
end=HSLHue.GREEN,
),
tag=tag[0],
)
for tag in frequencies
if tag[1] > TAG_DISPLAY_THRESHOLD
],
)
]
)
elif metadata is None:
output.extend(
[
"<div><i>This LoRA does not publish tag frequency metadata</i></div>"
]
)
else:
output.extend(
[
"<div><i>This LoRA has empty tag frequency metadata, or we could not parse it</i></div>"
]
)
return output
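# Note (added for clarity, based on the SD UI further down): this function is
# wired up with gr.on(triggers=[lora_opt.change], fn=lora_changed,
# inputs=[lora_opt], outputs=[lora_tags]), so the tag HTML refreshes whenever
# the selected LoRA files change.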

View File

@@ -117,7 +117,7 @@ body {
height: 100% !important;
}
/* display in full width for desktop devices */
/* display in full width for desktop devices, but see below */
@media (min-width: 1536px)
{
.gradio-container {
@@ -125,12 +125,17 @@ body {
}
}
.gradio-container .contain {
padding: 0 var(--size-4) !important;
/* media rules in custom css don't appear to be applied in
gradio versions > 4.7, so we have to define a class which
we will need to manually add and remove using javascript.
Remove this once this is fixed in gradio.
*/
.gradio-container-size-full {
max-width: var(--size-full) !important;
}
#ui_title {
padding: var(--size-2) 0 0 var(--size-1);
.gradio-container .contain {
padding: 0 var(--size-4) !important;
}
#top_logo {
@@ -140,6 +145,10 @@ body {
border: 0;
}
#ui_title {
padding: var(--size-2) 0 0 var(--size-1);
}
#demo_title_outer {
border-radius: 0;
}
@@ -182,6 +191,7 @@ footer {
aspect-ratio: unset;
max-height: calc(55vh - (2 * var(--spacing-lg)));
}
/* fix width and height of gallery items when on very large desktop screens, but see below */
@media (min-width: 1921px) {
/* Force a 768px_height + 4px_margin_height + navbar_height for the gallery */
#gallery .grid-wrap, #gallery .preview{
@@ -193,6 +203,20 @@ footer {
max-height: 770px !important;
}
}
/* media rules in custom css don't appear to be applied in
gradio versions > 4.7, so we have to define classes which
we will need to manually add and remove using javascript.
Remove this once this is fixed in gradio.
*/
.gallery-force-height768 .grid-wrap, .gallery-force-height768 .preview {
min-height: calc(768px + 4px + var(--size-14)) !important;
max-height: calc(768px + 4px + var(--size-14)) !important;
}
.gallery-limit-height768 .thumbnail-item.thumbnail-lg {
max-height: 770px !important;
}
/* Don't upscale when viewing in solo image mode */
#gallery .preview img {
object-fit: scale-down;
@@ -234,11 +258,6 @@ footer {
display:none;
}
/* Hide the download icon from the nod logo */
#top_logo button {
display: none;
}
/* workarounds for container=false not currently working for dropdowns */
.dropdown_no_container {
padding: 0 !important;
@@ -308,6 +327,15 @@ footer {
min-height: 89vh !important;
}
.sd-right-panel {
height: calc(100vmin - var(--size-32) - var(--size-10)) !important;
overflow-y: scroll;
}
.sd-right-panel .fill {
flex: 1;
}
/* don't stretch non-square images to be square, breaking their aspect ratio */
#outputgallery_gallery .thumbnail-item.thumbnail-lg > img {
object-fit: contain !important;
@@ -319,7 +347,7 @@ footer {
width: 100%;
}
#top_logo.logo_centered img{
#top_logo.logo_centered img {
object-fit: scale-down;
position: absolute;
width: 80%;
@@ -327,3 +355,19 @@ footer {
left: 50%;
transform: translate(-50%, -50%);
}
#tab_bar_logo {
overflow: visible !important;
border-width: 0 !important;
height: 0px !important;
padding: 0;
margin: 0;
}
#tab_bar_logo .image-container {
object-fit: scale-down;
position: absolute !important;
top: 10px;
right: 0px;
height: 36px;
}

View File

@@ -0,0 +1,49 @@
// workaround for gradio after 4.7 not applying any @media rules from the custom .css file
() => {
console.log(`innerWidth: ${window.innerWidth}` )
// 1536px rules
const mediaQuery1536 = window.matchMedia('(min-width: 1536px)')
function handleWidth1536(event) {
// display in full width for desktop devices
document.querySelectorAll(".gradio-container")
.forEach( (node) => {
if (event.matches) {
node.classList.add("gradio-container-size-full");
} else {
node.classList.remove("gradio-container-size-full")
}
});
}
mediaQuery1536.addEventListener("change", handleWidth1536);
mediaQuery1536.dispatchEvent(new MediaQueryListEvent("change", {matches: window.innerWidth >= 1536}));
// 1921px rules
const mediaQuery1921 = window.matchMedia('(min-width: 1921px)')
function handleWidth1921(event) {
/* Force a 768px_height + 4px_margin_height + navbar_height for the gallery */
/* Limit height to 768px_height + 2px_margin_height for the thumbnails */
document.querySelectorAll("#gallery")
.forEach( (node) => {
if (event.matches) {
node.classList.add("gallery-force-height768");
node.classList.add("gallery-limit-height768");
} else {
node.classList.remove("gallery-force-height768");
node.classList.remove("gallery-limit-height768");
}
});
}
mediaQuery1921.addEventListener("change", handleWidth1921);
mediaQuery1921.dispatchEvent(new MediaQueryListEvent("change", {matches: window.innerWidth >= 1921}));
}

Binary file not shown. (new image added, 7.1 KiB)

Binary file not shown. (new image added, 7.4 KiB)

View File

@@ -5,13 +5,13 @@ import subprocess
import sys
from PIL import Image
from apps.stable_diffusion.src import args
from apps.stable_diffusion.src.utils import (
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
from apps.shark_studio.web.utils.file_utils import (
get_generated_imgs_path,
get_generated_imgs_todays_subdir,
)
from apps.stable_diffusion.web.ui.utils import nodlogo_loc
from apps.stable_diffusion.web.utils.metadata import displayable_metadata
from apps.shark_studio.web.ui.utils import amdlogo_loc
from apps.shark_studio.web.utils.metadata import displayable_metadata
# -- Functions for file, directory and image info querying
@@ -22,8 +22,7 @@ def outputgallery_filenames(subdir) -> list[str]:
new_dir_path = os.path.join(output_dir, subdir)
if os.path.exists(new_dir_path):
filenames = [
glob.glob(new_dir_path + "/" + ext)
for ext in ("*.png", "*.jpg", "*.jpeg")
glob.glob(new_dir_path + "/" + ext) for ext in ("*.png", "*.jpg", "*.jpeg")
]
return sorted(sum(filenames, []), key=os.path.getmtime, reverse=True)
@@ -36,7 +35,7 @@ def output_subdirs() -> list[str]:
relative_paths = [
os.path.relpath(entry[0], output_dir)
for entry in os.walk(
output_dir, followlinks=args.output_gallery_followlinks
output_dir, followlinks=cmd_opts.output_gallery_followlinks
)
]
@@ -52,11 +51,7 @@ def output_subdirs() -> list[str]:
[path for path in relative_paths if path.isnumeric()], reverse=True
)
result_paths = generated_paths + sorted(
[
path
for path in relative_paths
if (not path.isnumeric()) and path != "."
]
[path for path in relative_paths if (not path.isnumeric()) and path != "."]
)
return result_paths
@@ -64,8 +59,8 @@ def output_subdirs() -> list[str]:
# --- Define UI layout for Gradio
with gr.Blocks() as outputgallery_web:
nod_logo = Image.open(nodlogo_loc)
with gr.Blocks() as outputgallery_element:
amd_logo = Image.open(amdlogo_loc)
with gr.Row(elem_id="outputgallery_gallery"):
# needed to work around gradio issue:
@@ -78,12 +73,13 @@ with gr.Blocks() as outputgallery_web:
with gr.Column(scale=6):
logo = gr.Image(
label="Getting subdirectories...",
value=nod_logo,
value=amd_logo,
interactive=False,
visible=True,
show_label=True,
elem_id="top_logo",
elem_classes="logo_centered",
show_download_button=False,
)
gallery = gr.Gallery(
@@ -95,7 +91,7 @@ with gr.Blocks() as outputgallery_web:
)
with gr.Column(scale=4):
with gr.Box():
with gr.Group():
with gr.Row():
with gr.Column(
scale=15,
@@ -152,40 +148,13 @@ with gr.Blocks() as outputgallery_web:
wrap=True,
elem_classes="output_parameters_dataframe",
value=[["Status", "No image selected"]],
interactive=True,
)
with gr.Accordion(label="Send To", open=True):
with gr.Row():
outputgallery_sendto_txt2img = gr.Button(
value="Txt2Img",
interactive=False,
elem_classes="outputgallery_sendto",
size="sm",
)
outputgallery_sendto_img2img = gr.Button(
value="Img2Img",
interactive=False,
elem_classes="outputgallery_sendto",
size="sm",
)
outputgallery_sendto_inpaint = gr.Button(
value="Inpaint",
interactive=False,
elem_classes="outputgallery_sendto",
size="sm",
)
outputgallery_sendto_outpaint = gr.Button(
value="Outpaint",
interactive=False,
elem_classes="outputgallery_sendto",
size="sm",
)
outputgallery_sendto_upscaler = gr.Button(
value="Upscaler",
outputgallery_sendto_sd = gr.Button(
value="Stable Diffusion",
interactive=False,
elem_classes="outputgallery_sendto",
size="sm",
@@ -195,32 +164,30 @@ with gr.Blocks() as outputgallery_web:
def on_clear_gallery():
return [
gr.Gallery.update(
gr.Gallery(
value=[],
visible=False,
),
gr.Image.update(
gr.Image(
visible=True,
),
]
def on_image_columns_change(columns):
return gr.Gallery.update(columns=columns)
return gr.Gallery(columns=columns)
def on_select_subdir(subdir) -> list:
# evt.value is the subdirectory name
new_images = outputgallery_filenames(subdir)
new_label = (
f"{len(new_images)} images in {os.path.join(output_dir, subdir)}"
)
new_label = f"{len(new_images)} images in {os.path.join(output_dir, subdir)}"
return [
new_images,
gr.Gallery.update(
gr.Gallery(
value=new_images,
label=new_label,
visible=len(new_images) > 0,
),
gr.Image.update(
gr.Image(
label=new_label,
visible=len(new_images) == 0,
),
@@ -249,21 +216,18 @@ with gr.Blocks() as outputgallery_web:
)
new_images = outputgallery_filenames(new_subdir)
new_label = (
f"{len(new_images)} images in "
f"{os.path.join(output_dir, new_subdir)}"
f"{len(new_images)} images in " f"{os.path.join(output_dir, new_subdir)}"
)
return [
gr.Dropdown.update(
gr.Dropdown(
choices=refreshed_subdirs,
value=new_subdir,
),
refreshed_subdirs,
new_images,
gr.Gallery.update(
value=new_images, label=new_label, visible=len(new_images) > 0
),
gr.Image.update(
gr.Gallery(value=new_images, label=new_label, visible=len(new_images) > 0),
gr.Image(
label=new_label,
visible=len(new_images) == 0,
),
@@ -289,12 +253,12 @@ with gr.Blocks() as outputgallery_web:
return [
new_images,
gr.Gallery.update(
gr.Gallery(
value=new_images,
label=new_label,
visible=len(new_images) > 0,
),
gr.Image.update(
gr.Image(
label=new_label,
visible=len(new_images) == 0,
),
@@ -332,12 +296,7 @@ with gr.Blocks() as outputgallery_web:
return [
# disable or enable each of the sendto buttons based on whether
# an image is selected
gr.Button.update(interactive=exists),
gr.Button.update(interactive=exists),
gr.Button.update(interactive=exists),
gr.Button.update(interactive=exists),
gr.Button.update(interactive=exists),
gr.Button.update(interactive=exists),
gr.Button(interactive=exists),
]
# The first time our tab is selected we need to do an initial refresh
@@ -413,11 +372,7 @@ with gr.Blocks() as outputgallery_web:
on_outputgallery_filename_change,
[outputgallery_filename],
[
outputgallery_sendto_txt2img,
outputgallery_sendto_img2img,
outputgallery_sendto_inpaint,
outputgallery_sendto_outpaint,
outputgallery_sendto_upscaler,
outputgallery_sendto_sd,
],
queue=False,
)

View File

@@ -0,0 +1,777 @@
import os
import json
import gradio as gr
import numpy as np
from inspect import signature
from PIL import Image
from pathlib import Path
from datetime import datetime as dt
from gradio.components.image_editor import (
EditorValue,
)
from apps.shark_studio.web.utils.file_utils import (
get_generated_imgs_path,
get_checkpoints_path,
get_checkpoints,
get_configs_path,
write_default_sd_configs,
)
from apps.shark_studio.api.sd import (
shark_sd_fn_dict_input,
cancel_sd,
unload_sd,
)
from apps.shark_studio.api.controlnet import (
cnet_preview,
)
from apps.shark_studio.modules.schedulers import (
scheduler_model_map,
)
from apps.shark_studio.modules.img_processing import (
resampler_list,
resize_stencil,
)
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
from apps.shark_studio.web.ui.utils import (
amdlogo_loc,
none_to_str_none,
str_none_to_none,
)
from apps.shark_studio.web.utils.state import (
status_label,
)
from apps.shark_studio.web.ui.common_events import lora_changed
from apps.shark_studio.modules import logger
import apps.shark_studio.web.utils.globals as global_obj
sd_default_models = [
"runwayml/stable-diffusion-v1-5",
"stabilityai/stable-diffusion-2-1-base",
"stabilityai/stable-diffusion-2-1",
"stabilityai/stable-diffusion-xl-base-1.0",
"stabilityai/sdxl-turbo",
]
def view_json_file(file_path):
content = ""
with open(file_path, "r") as fopen:
content = fopen.read()
return content
def submit_to_cnet_config(
stencil: str,
preprocessed_hint: str,
cnet_strength: int,
control_mode: str,
curr_config: dict,
):
if any(i in [None, ""] for i in [stencil, preprocessed_hint]):
return gr.update()
if curr_config is not None:
if "controlnets" in curr_config:
curr_config["controlnets"]["control_mode"] = control_mode
curr_config["controlnets"]["model"].append(stencil)
curr_config["controlnets"]["hint"].append(preprocessed_hint)
curr_config["controlnets"]["strength"].append(cnet_strength)
return curr_config
cnet_map = {}
cnet_map["controlnets"] = {
"control_mode": control_mode,
"model": [stencil],
"hint": [preprocessed_hint],
"strength": [cnet_strength],
}
return cnet_map
def update_embeddings_json(embedding):
return {"embeddings": [embedding]}
def submit_to_main_config(input_cfg: dict, main_cfg: dict):
if main_cfg in [None, "", {}]:
return input_cfg
for base_key in input_cfg:
main_cfg[base_key] = input_cfg[base_key]
return main_cfg
def pull_sd_configs(
prompt,
negative_prompt,
sd_init_image,
height,
width,
steps,
strength,
guidance_scale,
seed,
batch_count,
batch_size,
scheduler,
base_model_id,
custom_weights,
custom_vae,
precision,
device,
target_triple,
ondemand,
compiled_pipeline,
resample_type,
controlnets,
embeddings,
):
sd_args = str_none_to_none(locals())
sd_cfg = {}
for arg in sd_args:
if arg in [
"prompt",
"negative_prompt",
"sd_init_image",
]:
sd_cfg[arg] = [sd_args[arg]]
elif arg in ["controlnets", "embeddings"]:
if isinstance(arg, dict):
sd_cfg[arg] = json.loads(sd_args[arg])
else:
sd_cfg[arg] = {}
else:
sd_cfg[arg] = sd_args[arg]
return json.dumps(sd_cfg)
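# Note (added for clarity): pull_sd_configs serializes the current UI state to
# a JSON string whose keys mirror what load_sd_cfg below reads back out, so a
# config saved via save_sd_cfg can be round-tripped through the Config tab.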
def load_sd_cfg(sd_json: dict, load_sd_config: str):
new_sd_config = none_to_str_none(json.loads(view_json_file(load_sd_config)))
if sd_json:
for key in new_sd_config:
sd_json[key] = new_sd_config[key]
else:
sd_json = new_sd_config
for i in sd_json["sd_init_image"]:
if i is not None:
if os.path.isfile(i):
sd_image = [Image.open(i, mode="r")]
else:
sd_image = None
return [
sd_json["prompt"][0],
sd_json["negative_prompt"][0],
sd_image,
sd_json["height"],
sd_json["width"],
sd_json["steps"],
sd_json["strength"],
sd_json["guidance_scale"],
sd_json["seed"],
sd_json["batch_count"],
sd_json["batch_size"],
sd_json["scheduler"],
sd_json["base_model_id"],
sd_json["custom_weights"],
sd_json["custom_vae"],
sd_json["precision"],
sd_json["device"],
sd_json["target_triple"],
sd_json["ondemand"],
sd_json["compiled_pipeline"],
sd_json["resample_type"],
sd_json["controlnets"],
sd_json["embeddings"],
sd_json,
]
def save_sd_cfg(config: dict, save_name: str):
if os.path.exists(save_name):
filepath = save_name
elif cmd_opts.configs_path:
filepath = os.path.join(cmd_opts.configs_path, save_name)
else:
filepath = os.path.join(get_configs_path(), save_name)
if ".json" not in filepath:
filepath += ".json"
with open(filepath, mode="w") as f:
f.write(json.dumps(config))
return "..."
def create_canvas(width, height):
data = Image.fromarray(
np.zeros(
shape=(height, width, 3),
dtype=np.uint8,
)
+ 255
)
img_dict = {
"background": data,
"layers": [],
"composite": None,
}
return EditorValue(img_dict)
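# Note (added for clarity): create_canvas returns an all-white background
# (np.zeros(...) + 255) wrapped in a gradio EditorValue; the "Make Canvas!"
# button feeds this into cnet_input as a blank drawing surface.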
def import_original(original_img, width, height):
if original_img is None:
resized_img = create_canvas(width, height)
return resized_img
else:
resized_img, _, _ = resize_stencil(original_img, width, height)
img_dict = {
"background": resized_img,
"layers": [],
"composite": None,
}
return EditorValue(img_dict)
def base_model_changed(base_model_id):
new_choices = get_checkpoints(
os.path.join("checkpoints", os.path.basename(str(base_model_id)))
) + get_checkpoints(model_type="checkpoints")
return gr.Dropdown(
value=new_choices[0] if len(new_choices) > 0 else "None",
choices=["None"] + new_choices,
)
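# Note (added for clarity): when the base model changes, the checkpoint
# dropdown is repopulated from a checkpoints subfolder named after the base
# model id plus the general checkpoints folder, defaulting to "None" when
# nothing is found.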
with gr.Blocks(title="Stable Diffusion") as sd_element:
with gr.Column(elem_id="ui_body"):
with gr.Row():
with gr.Column(scale=2, min_width=600):
with gr.Accordion(
label="\U0001F4D0\U0000FE0F Device Settings", open=False
):
device = gr.Dropdown(
elem_id="device",
label="Device",
value=global_obj.get_device_list()[0],
choices=global_obj.get_device_list(),
allow_custom_value=False,
)
target_triple = gr.Textbox(
elem_id="target_triple",
label="Architecture",
value="",
)
with gr.Row():
ondemand = gr.Checkbox(
value=cmd_opts.lowvram,
label="Low VRAM",
interactive=True,
)
precision = gr.Radio(
label="Precision",
value=cmd_opts.precision,
choices=[
"fp16",
"fp32",
],
visible=True,
)
sd_model_info = f"Checkpoint Path: {str(get_checkpoints_path())}"
base_model_id = gr.Dropdown(
label="\U000026F0\U0000FE0F Base Model",
info="Select or enter HF model ID",
elem_id="custom_model",
value="stabilityai/stable-diffusion-2-1-base",
choices=sd_default_models,
allow_custom_value=True,
) # base_model_id
with gr.Row():
height = gr.Slider(
384,
1024,
value=cmd_opts.height,
step=8,
label="\U00002195\U0000FE0F Height",
)
width = gr.Slider(
384,
1024,
value=cmd_opts.width,
step=8,
label="\U00002194\U0000FE0F Width",
)
with gr.Accordion(
label="\U00002696\U0000FE0F Model Weights", open=False
):
with gr.Column():
custom_weights = gr.Dropdown(
label="Checkpoint Weights",
info="Select or enter HF model ID",
elem_id="custom_model",
value="None",
allow_custom_value=True,
choices=["None"]
+ get_checkpoints(os.path.basename(str(base_model_id))),
) # custom_weights
base_model_id.change(
fn=base_model_changed,
inputs=[base_model_id],
outputs=[custom_weights],
)
sd_vae_info = (str(get_checkpoints_path("vae"))).replace(
"\\", "\n\\"
)
sd_vae_info = f"VAE Path: {sd_vae_info}"
custom_vae = gr.Dropdown(
label=f"VAE Model",
info=sd_vae_info,
elem_id="custom_model",
value=(
os.path.basename(cmd_opts.custom_vae)
if cmd_opts.custom_vae
else "None"
),
choices=["None"] + get_checkpoints("vae"),
allow_custom_value=True,
scale=1,
)
sd_lora_info = (str(get_checkpoints_path("loras"))).replace(
"\\", "\n\\"
)
lora_opt = gr.Dropdown(
allow_custom_value=True,
label=f"Standalone LoRA Weights",
info=sd_lora_info,
elem_id="lora_weights",
value=None,
multiselect=True,
choices=[] + get_checkpoints("lora"),
scale=2,
)
lora_tags = gr.HTML(
value="<div><i>No LoRA selected</i></div>",
elem_classes="lora-tags",
)
embeddings_config = gr.JSON(
label="Embeddings Options", min_width=50, scale=1
)
gr.on(
triggers=[lora_opt.change],
fn=lora_changed,
inputs=[lora_opt],
outputs=[lora_tags],
queue=True,
show_progress=False,
).then(
fn=update_embeddings_json,
inputs=[lora_opt],
outputs=[embeddings_config],
show_progress=False,
)
with gr.Accordion(
label="\U0001F9EA\U0000FE0F Input Image Processing", open=False
):
strength = gr.Slider(
0,
1,
value=cmd_opts.strength,
step=0.01,
label="Denoising Strength",
)
resample_type = gr.Dropdown(
value=cmd_opts.resample_type,
choices=resampler_list,
label="Resample Type",
allow_custom_value=True,
)
with gr.Group(elem_id="prompt_box_outer"):
prompt = gr.Textbox(
label="\U00002795\U0000FE0F Prompt",
value=cmd_opts.prompt[0],
lines=2,
elem_id="prompt_box",
show_copy_button=True,
)
negative_prompt = gr.Textbox(
label="\U00002796\U0000FE0F Negative Prompt",
value=cmd_opts.negative_prompt[0],
lines=2,
elem_id="negative_prompt_box",
show_copy_button=True,
)
with gr.Row(equal_height=True):
seed = gr.Textbox(
value=cmd_opts.seed,
label="\U0001F331\U0000FE0F Seed",
info="An integer or a JSON list of integers, -1 for random",
show_copy_button=True,
)
scheduler = gr.Dropdown(
elem_id="scheduler",
label="\U0001F4C5\U0000FE0F Scheduler",
info="\U000E0020", # forces same height as seed
value="EulerDiscrete",
choices=scheduler_model_map.keys(),
allow_custom_value=False,
)
with gr.Row():
steps = gr.Slider(
1,
100,
value=cmd_opts.steps,
step=1,
label="\U0001F3C3\U0000FE0F Steps",
)
guidance_scale = gr.Slider(
0,
50,
value=cmd_opts.guidance_scale,
step=0.1,
label="\U0001F5C3\U0000FE0F CFG Scale",
)
with gr.Accordion(
label="Controlnet Options",
open=False,
visible=False,
):
preprocessed_hints = gr.State([])
with gr.Column():
sd_cnet_info = (
str(get_checkpoints_path("controlnet"))
).replace("\\", "\n\\")
with gr.Row():
cnet_config = gr.JSON()
with gr.Column():
clear_config = gr.ClearButton(
value="Clear Controlnet Config",
size="sm",
components=cnet_config,
)
control_mode = gr.Radio(
choices=["Prompt", "Balanced", "Controlnet"],
value="Balanced",
label="Control Mode",
)
with gr.Row():
with gr.Column(scale=1):
cnet_model = gr.Dropdown(
allow_custom_value=True,
label=f"Controlnet Model",
info=sd_cnet_info,
value="None",
choices=[
"None",
"canny",
"openpose",
"scribble",
"zoedepth",
]
+ get_checkpoints("controlnet"),
)
cnet_strength = gr.Slider(
label="Controlnet Strength",
minimum=0,
maximum=100,
value=50,
step=1,
)
with gr.Row():
canvas_width = gr.Slider(
label="Canvas Width",
minimum=256,
maximum=1024,
value=512,
step=8,
)
canvas_height = gr.Slider(
label="Canvas Height",
minimum=256,
maximum=1024,
value=512,
step=8,
)
make_canvas = gr.Button(
value="Make Canvas!",
)
use_input_img = gr.Button(
value="Use Original Image",
size="sm",
)
cnet_input = gr.Image(
value=None,
type="pil",
image_mode="RGB",
interactive=True,
)
with gr.Column(scale=1):
cnet_output = gr.Image(
value=None,
visible=True,
label="Preprocessed Hint",
interactive=False,
show_label=True,
)
cnet_gen = gr.Button(
value="Preprocess controlnet input",
)
use_result = gr.Button(
"Submit",
size="sm",
)
make_canvas.click(
fn=create_canvas,
inputs=[canvas_width, canvas_height],
outputs=[cnet_input],
queue=False,
)
cnet_gen.click(
fn=cnet_preview,
inputs=[
cnet_model,
cnet_input,
],
outputs=[
cnet_output,
preprocessed_hints,
],
)
use_result.click(
fn=submit_to_cnet_config,
inputs=[
cnet_model,
cnet_output,
cnet_strength,
control_mode,
cnet_config,
],
outputs=[
cnet_config,
],
queue=False,
)
with gr.Column(scale=3, min_width=600):
with gr.Tabs() as sd_tabs:
sd_element.load(
# Workaround for Gradio issue #7085
# TODO: revert to setting selected= in gr.Tabs declaration
# once this is resolved in Gradio
lambda: gr.Tabs(selected=101),
outputs=[sd_tabs],
)
with gr.Tab(label="Input Image", id=100) as sd_tab_init_image:
with gr.Column(elem_classes=["sd-right-panel"]):
with gr.Row(elem_classes=["fill"]):
# TODO: make this import image prompt info if it exists
sd_init_image = gr.Image(
type="pil",
interactive=True,
show_label=False,
)
use_input_img.click(
fn=import_original,
inputs=[
sd_init_image,
canvas_width,
canvas_height,
],
outputs=[cnet_input],
queue=False,
)
with gr.Tab(label="Generate Images", id=101) as sd_tab_gallery:
with gr.Column(elem_classes=["sd-right-panel"]):
with gr.Row(elem_classes=["fill"]):
sd_gallery = gr.Gallery(
label="Generated images",
show_label=False,
elem_id="gallery",
columns=2,
object_fit="fit",
preview=True,
)
with gr.Row():
batch_count = gr.Slider(
1,
100,
value=cmd_opts.batch_count,
step=1,
label="Batch Count",
interactive=True,
)
batch_size = gr.Slider(
1,
4,
value=cmd_opts.batch_size,
step=1,
label="Batch Size",
interactive=True,
visible=True,
)
compiled_pipeline = gr.Checkbox(
False,
label="Faster txt2img (SDXL only)",
)
with gr.Row():
stable_diffusion = gr.Button("Start")
unload = gr.Button("Unload Models")
unload.click(
fn=unload_sd,
queue=False,
show_progress=False,
)
stop_batch = gr.Button("Stop")
with gr.Tab(label="Config", id=102) as sd_tab_config:
with gr.Column(elem_classes=["sd-right-panel"]):
with gr.Row(elem_classes=["fill"]):
Path(get_configs_path()).mkdir(
parents=True, exist_ok=True
)
default_config_file = os.path.join(
get_configs_path(),
"default_sd_config.json",
)
write_default_sd_configs(get_configs_path())
sd_json = gr.JSON(
elem_classes=["fill"],
value=view_json_file(default_config_file),
)
with gr.Row():
with gr.Column(scale=3):
load_sd_config = gr.FileExplorer(
label="Load Config",
file_count="single",
root_dir=(
cmd_opts.configs_path
if cmd_opts.configs_path
else get_configs_path()
),
height=75,
)
with gr.Column(scale=1):
save_sd_config = gr.Button(
value="Save Config", size="sm"
)
clear_sd_config = gr.ClearButton(
value="Clear Config",
size="sm",
components=sd_json,
)
with gr.Row():
sd_config_name = gr.Textbox(
value="Config Name",
info="Name of the file this config will be saved to.",
interactive=True,
show_label=False,
)
load_sd_config.change(
fn=load_sd_cfg,
inputs=[sd_json, load_sd_config],
outputs=[
prompt,
negative_prompt,
sd_init_image,
height,
width,
steps,
strength,
guidance_scale,
seed,
batch_count,
batch_size,
scheduler,
base_model_id,
custom_weights,
custom_vae,
precision,
device,
target_triple,
ondemand,
compiled_pipeline,
resample_type,
cnet_config,
embeddings_config,
sd_json,
],
)
save_sd_config.click(
fn=save_sd_cfg,
inputs=[sd_json, sd_config_name],
outputs=[sd_config_name],
)
with gr.Tab(label="Log", id=103) as sd_tab_log:
with gr.Row():
std_output = gr.Textbox(
value=f"{sd_model_info}\n"
f"Images will be saved at "
f"{get_generated_imgs_path()}",
lines=2,
elem_id="std_output",
show_label=True,
label="Log",
show_copy_button=True,
)
sd_element.load(
logger.read_sd_logs, None, std_output, every=1
)
sd_status = gr.Textbox(visible=False)
pull_kwargs = dict(
fn=pull_sd_configs,
inputs=[
prompt,
negative_prompt,
sd_init_image,
height,
width,
steps,
strength,
guidance_scale,
seed,
batch_count,
batch_size,
scheduler,
base_model_id,
custom_weights,
custom_vae,
precision,
device,
target_triple,
ondemand,
compiled_pipeline,
resample_type,
cnet_config,
embeddings_config,
],
outputs=[
sd_json,
],
)
status_kwargs = dict(
fn=lambda bc, bs: status_label("Stable Diffusion", 0, bc, bs),
inputs=[batch_count, batch_size],
outputs=sd_status,
)
gen_kwargs = dict(
fn=shark_sd_fn_dict_input,
inputs=[sd_json],
outputs=[
sd_gallery,
sd_status,
],
)
prompt_submit = prompt.submit(**status_kwargs).then(**pull_kwargs)
neg_prompt_submit = negative_prompt.submit(**status_kwargs).then(**pull_kwargs)
generate_click = (
stable_diffusion.click(**status_kwargs).then(**pull_kwargs).then(**gen_kwargs)
)
stop_batch.click(
fn=cancel_sd,
cancels=[prompt_submit, neg_prompt_submit, generate_click],
)

View File

@@ -0,0 +1,43 @@
from enum import IntEnum
import math
import sys
import os
def resource_path(relative_path):
"""Get absolute path to resource, works for dev and for PyInstaller"""
base_path = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__)))
return os.path.join(base_path, relative_path)
amdlogo_loc = resource_path("logos/amd-logo.jpg")
amdicon_loc = resource_path("logos/amd-icon.jpg")
class HSLHue(IntEnum):
RED = 0
YELLOW = 60
GREEN = 120
CYAN = 180
BLUE = 240
MAGENTA = 300
def hsl_color(alpha: float, start, end):
b = (end - start) * (alpha if alpha > 0 else 0)
result = b + start
# Return a CSS HSL string
return f"hsl({math.floor(result)}, 80%, 35%)"
def none_to_str_none(props: dict):
for key in props:
props[key] = "None" if props[key] == None else props[key]
return props
def str_none_to_none(props: dict):
for key in props:
props[key] = None if props[key] == "None" else props[key]
return props
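Below is a minimal, hedged usage sketch for the helpers in this file. The import path is an assumption (the diff view does not show the file name); everything else follows directly from the definitions above.

```python
# Hedged sketch; the module path is assumed, not shown in this diff.
from apps.shark_studio.web.ui.utils import (  # assumed path
    HSLHue,
    hsl_color,
    none_to_str_none,
    str_none_to_none,
)

# hsl_color interpolates between two hue stops and returns a CSS HSL string.
print(hsl_color(0.0, HSLHue.RED, HSLHue.GREEN))  # hsl(0, 80%, 35%)
print(hsl_color(0.5, HSLHue.RED, HSLHue.GREEN))  # hsl(60, 80%, 35%)
print(hsl_color(1.0, HSLHue.RED, HSLHue.GREEN))  # hsl(120, 80%, 35%)

# The props helpers round-trip values between Python None and the string "None",
# which matters when values pass through JSON or Gradio components.
props = {"custom_vae": None, "precision": "fp16"}
as_strings = none_to_str_none(dict(props))  # {'custom_vae': 'None', 'precision': 'fp16'}
restored = str_none_to_none(as_strings)     # {'custom_vae': None, 'precision': 'fp16'}
```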

View File

@@ -0,0 +1,12 @@
import os
import sys
def get_available_devices():
return ["cpu-task"]
def get_resource_path(relative_path):
"""Get absolute path to resource, works for dev and for PyInstaller"""
base_path = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__)))
return os.path.join(base_path, relative_path)

View File

@@ -0,0 +1,95 @@
default_sd_config = r"""{
"prompt": [
"a photo taken of the front of a super-car drifting on a road near mountains at high speeds with smoke coming off the tires, front angle, front point of view, trees in the mountains of the background, ((sharp focus))"
],
"negative_prompt": [
"watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
],
"sd_init_image": [null],
"height": 512,
"width": 512,
"steps": 50,
"strength": 0.8,
"guidance_scale": 7.5,
"seed": "-1",
"batch_count": 1,
"batch_size": 1,
"scheduler": "EulerDiscrete",
"base_model_id": "stabilityai/stable-diffusion-2-1-base",
"custom_weights": null,
"custom_vae": null,
"precision": "fp16",
"device": "",
"target_triple": "",
"ondemand": false,
"compiled_pipeline": false,
"resample_type": "Nearest Neighbor",
"controlnets": {},
"embeddings": {}
}"""
sdxl_30steps = r"""{
"prompt": [
"a cat under the snow with blue eyes, covered by snow, cinematic style, medium shot, professional photo, animal"
],
"negative_prompt": [
"watermark, signature, logo, text, lowres, ((monochrome, grayscale)), blurry, ugly, blur, oversaturated, cropped"
],
"sd_init_image": [null],
"height": 1024,
"width": 1024,
"steps": 30,
"strength": 0.8,
"guidance_scale": 7.5,
"seed": "-1",
"batch_count": 1,
"batch_size": 1,
"scheduler": "EulerDiscrete",
"base_model_id": "stabilityai/stable-diffusion-xl-base-1.0",
"custom_weights": null,
"custom_vae": null,
"precision": "fp16",
"device": "",
"target_triple": "",
"ondemand": false,
"compiled_pipeline": true,
"resample_type": "Nearest Neighbor",
"controlnets": {},
"embeddings": {}
}"""
sdxl_turbo = r"""{
"prompt": [
"A cat wearing a hat that says 'TURBO' on it. The cat is sitting on a skateboard."
],
"negative_prompt": [
""
],
"sd_init_image": [null],
"height": 512,
"width": 512,
"steps": 2,
"strength": 0.8,
"guidance_scale": 0,
"seed": "-1",
"batch_count": 1,
"batch_size": 1,
"scheduler": "EulerAncestralDiscrete",
"base_model_id": "stabilityai/sdxl-turbo",
"custom_weights": null,
"custom_vae": null,
"precision": "fp16",
"device": "",
"target_triple": "",
"ondemand": false,
"compiled_pipeline": true,
"resample_type": "Nearest Neighbor",
"controlnets": {},
"embeddings": {}
}"""
default_sd_configs = {
"default_sd_config.json": default_sd_config,
"sdxl-30steps.json": sdxl_30steps,
"sdxl-turbo.json": sdxl_turbo,
}
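Each entry above is a raw JSON string, so callers can parse one straight into the keyword dictionary the UI consumes. A short sketch, assuming it runs where default_sd_configs is in scope (for example, appended to this module):

```python
# Minimal sketch; assumes default_sd_configs (defined above) is in scope.
import json

turbo_cfg = json.loads(default_sd_configs["sdxl-turbo.json"])
print(turbo_cfg["base_model_id"])                       # stabilityai/sdxl-turbo
print(turbo_cfg["steps"], turbo_cfg["guidance_scale"])  # 2 0
```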

View File

@@ -0,0 +1,102 @@
import os
import sys
import glob
from datetime import datetime as dt
from pathlib import Path
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
checkpoints_filetypes = (
"*.ckpt",
"*.safetensors",
)
from apps.shark_studio.web.utils.default_configs import default_sd_configs
def write_default_sd_configs(path):
for key in default_sd_configs.keys():
config_fpath = os.path.join(path, key)
with open(config_fpath, "w") as f:
f.write(default_sd_configs[key])
def safe_name(name):
return name.split("/")[-1].replace("-", "_")
def get_path_stem(path):
path = Path(path)
return path.stem
def get_resource_path(path):
"""Get absolute path to resource, works for dev and for PyInstaller"""
if os.path.isabs(path):
return path
else:
base_path = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__)))
result = Path(os.path.join(base_path, path)).resolve(strict=False)
return result
def get_configs_path() -> Path:
configs = get_resource_path(cmd_opts.config_dir)
if not os.path.exists(configs):
os.mkdir(configs)
return Path(configs)
def get_generated_imgs_path() -> Path:
outputs = get_resource_path(cmd_opts.output_dir)
if not os.path.exists(outputs):
os.mkdir(outputs)
return Path(outputs)
def get_tmp_path() -> Path:
tmpdir = get_resource_path(cmd_opts.model_dir)
if not os.path.exists(tmpdir):
os.mkdir(tmpdir)
return Path(tmpdir)
def get_generated_imgs_todays_subdir() -> str:
return dt.now().strftime("%Y%m%d")
def create_model_folders():
dir = ["checkpoints", "vae", "lora", "vmfb"]
if not os.path.isdir(cmd_opts.model_dir):
try:
os.makedirs(cmd_opts.model_dir)
except OSError:
sys.exit(
f"Invalid --model_dir argument, "
f"{cmd_opts.model_dir} folder does not exist, and cannot be created."
)
for subdir in model_subdirs:
Path(get_checkpoints_path(subdir)).mkdir(parents=True, exist_ok=True)
def get_checkpoints_path(model_type=""):
return get_resource_path(os.path.join(cmd_opts.model_dir, model_type))
def get_checkpoints(model_type="checkpoints"):
ckpt_files = []
file_types = checkpoints_filetypes
if model_type == "lora":
file_types = file_types + ("*.pt", "*.bin")
for extn in file_types:
files = [
os.path.basename(x)
for x in glob.glob(os.path.join(get_checkpoints_path(model_type), extn))
]
ckpt_files.extend(files)
return sorted(ckpt_files, key=str.casefold)
def get_checkpoint_pathfile(checkpoint_name, model_type="checkpoints"):
return os.path.join(get_checkpoints_path(model_type), checkpoint_name)
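A hedged sketch of how these helpers fit together, mirroring the Config tab above (which calls get_configs_path() and write_default_sd_configs()). The import path matches the one used elsewhere in this diff (apps.shark_studio.web.utils.file_utils):

```python
from pathlib import Path

from apps.shark_studio.web.utils.file_utils import (
    get_checkpoints,
    get_configs_path,
    write_default_sd_configs,
)

cfg_dir = get_configs_path()       # resolves --config_dir, creating it if missing
write_default_sd_configs(cfg_dir)  # writes default_sd_config.json, sdxl-30steps.json, sdxl-turbo.json
print(sorted(p.name for p in Path(cfg_dir).glob("*.json")))

# Checkpoint discovery scans --model_dir/<type> for known weight-file extensions.
print(get_checkpoints("lora"))     # *.ckpt / *.safetensors plus *.pt / *.bin for LoRA
```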

View File

@@ -0,0 +1,134 @@
import gc
from ...api.utils import get_available_devices
"""
The global objects include the SD pipeline and its config.
Keeping them global avoids creating extra pipeline objects when switching modes,
and clearing the cache avoids memory leaks when switching models.
"""
def _init():
global _sd_obj
global _llm_obj
global _devices
global _pipe_kwargs
global _prep_kwargs
global _gen_kwargs
global _schedulers
_sd_obj = None
_llm_obj = None
_devices = None
_pipe_kwargs = None
_prep_kwargs = None
_gen_kwargs = None
_schedulers = None
set_devices()
def set_sd_obj(value):
global _sd_obj
global _llm_obj
_llm_obj = None
_sd_obj = value
def set_llm_obj(value):
global _sd_obj
global _llm_obj
_llm_obj = value
_sd_obj = None
def set_devices():
global _devices
_devices = get_available_devices()
def set_sd_scheduler(key):
global _sd_obj
_sd_obj.scheduler = _schedulers[key]
def set_sd_status(value):
global _sd_obj
_sd_obj.status = value
def set_pipe_kwargs(value):
global _pipe_kwargs
_pipe_kwargs = value
def set_prep_kwargs(value):
global _prep_kwargs
_prep_kwargs = value
def set_gen_kwargs(value):
global _gen_kwargs
_gen_kwargs = value
def set_schedulers(value):
global _schedulers
_schedulers = value
def get_sd_obj():
global _sd_obj
return _sd_obj
def get_llm_obj():
global _llm_obj
return _llm_obj
def get_device_list():
global _devices
return _devices
def get_sd_status():
global _sd_obj
return _sd_obj.status
def get_pipe_kwargs():
global _pipe_kwargs
return _pipe_kwargs
def get_prep_kwargs():
global _prep_kwargs
return _prep_kwargs
def get_gen_kwargs():
global _gen_kwargs
return _gen_kwargs
def get_scheduler(key):
global _schedulers
return _schedulers[key]
def clear_cache():
global _sd_obj
global _llm_obj
global _pipe_kwargs
global _prep_kwargs
global _gen_kwargs
global _schedulers
del _sd_obj
del _llm_obj
del _schedulers
gc.collect()
_sd_obj = None
_llm_obj = None
_pipe_kwargs = None
_prep_kwargs = None
_gen_kwargs = None
_schedulers = None
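A minimal lifecycle sketch for this globals module, using the import alias seen elsewhere in this diff (import apps.shark_studio.web.utils.globals as global_obj). The pipeline object here is only a placeholder:

```python
import apps.shark_studio.web.utils.globals as global_obj

global_obj._init()                    # create the empty slots and enumerate devices
print(global_obj.get_device_list())   # e.g. ["cpu-task", ...], backend-dependent

fake_pipe = object()                  # placeholder; a real SD pipeline would go here
global_obj.set_sd_obj(fake_pipe)      # caching an SD pipeline drops any cached LLM
assert global_obj.get_llm_obj() is None

global_obj.clear_cache()              # release cached objects and force a gc pass
assert global_obj.get_sd_obj() is None
```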

View File

@@ -29,9 +29,7 @@ def parse_csv(image_filename: str):
has_header = csv.Sniffer().has_header(csv_file.read(2048))
csv_file.seek(0)
reader = (
csv.DictReader(csv_file) if has_header else csv.reader(csv_file)
)
reader = csv.DictReader(csv_file) if has_header else csv.reader(csv_file)
matches = [
# we rely on humanize and humanizable to work out the parsing of the individual .csv rows

View File

@@ -92,15 +92,11 @@ def compact(metadata: dict) -> dict:
result["Hires resize"] = f"{hires_y}x{hires_x}"
# remove VAE if it exists and is empty
if (result.keys() & {"VAE"}) and (
not result["VAE"] or result["VAE"] == "None"
):
if (result.keys() & {"VAE"}) and (not result["VAE"] or result["VAE"] == "None"):
result.pop("VAE")
# remove LoRA if it exists and is empty
if (result.keys() & {"LoRA"}) and (
not result["LoRA"] or result["LoRA"] == "None"
):
if (result.keys() & {"LoRA"}) and (not result["LoRA"] or result["LoRA"] == "None"):
result.pop("LoRA")
return result

View File

@@ -1,9 +1,12 @@
import re
from pathlib import Path
from apps.stable_diffusion.web.ui.utils import (
get_custom_model_pathfile,
scheduler_list,
predefined_models,
from apps.shark_studio.web.utils.file_utils import (
get_checkpoint_pathfile,
)
from apps.shark_studio.api.sd import EMPTY_SD_MAP as sd_model_map
from apps.shark_studio.modules.schedulers import (
scheduler_model_map,
)
re_param_code = r'\s*([\w ]+):\s*("(?:\\"[^,]|\\"|\\|[^\"])+"|[^,]*)(?:,|$)'
@@ -62,20 +65,16 @@ def parse_generation_parameters(x: str):
return res
def try_find_model_base_from_png_metadata(
file: str, folder: str = "models"
) -> str:
def try_find_model_base_from_png_metadata(file: str, folder: str = "models") -> str:
custom = ""
# Remove extension from file info
if file.endswith(".safetensors") or file.endswith(".ckpt"):
file = Path(file).stem
# Check for the file name match with one of the local ckpt or safetensors files
if Path(get_custom_model_pathfile(file + ".ckpt", folder)).is_file():
if Path(get_checkpoint_pathfile(file + ".ckpt", folder)).is_file():
custom = file + ".ckpt"
if Path(
get_custom_model_pathfile(file + ".safetensors", folder)
).is_file():
if Path(get_checkpoint_pathfile(file + ".safetensors", folder)).is_file():
custom = file + ".safetensors"
return custom
@@ -91,7 +90,7 @@ def find_model_from_png_metadata(
model_file = metadata[key]
png_custom = try_find_model_base_from_png_metadata(model_file)
# Check for a model match with one of the default model list (ex: "Linaqruf/anything-v3.0")
if model_file in predefined_models:
if model_file in sd_model_map:
png_custom = model_file
# If nothing had matched, check vendor/hf_model_id
if not png_custom and model_file.count("/"):
@@ -99,16 +98,13 @@ def find_model_from_png_metadata(
# No matching model was found
if not png_custom and not png_hf_id:
print(
"Import PNG info: Unable to find a matching model for %s"
% model_file
"Import PNG info: Unable to find a matching model for %s" % model_file
)
return png_custom, png_hf_id
def find_vae_from_png_metadata(
key: str, metadata: dict[str, str | int]
) -> str:
def find_vae_from_png_metadata(key: str, metadata: dict[str, str | int]) -> str:
vae_custom = ""
if key in metadata:
@@ -190,7 +186,7 @@ def import_png_metadata(
if "Prompt" in metadata:
prompt = metadata["Prompt"]
if "Sampler" in metadata:
if metadata["Sampler"] in scheduler_list:
if metadata["Sampler"] in scheduler_model_map:
sampler = metadata["Sampler"]
else:
print(

View File

@@ -0,0 +1,39 @@
import apps.shark_studio.web.utils.globals as global_obj
import gc
def status_label(tab_name, batch_index=0, batch_count=1, batch_size=1):
if batch_index < batch_count:
bs = f"x{batch_size}" if batch_size > 1 else ""
return f"{tab_name} generating {batch_index+1}/{batch_count}{bs}"
else:
return f"{tab_name} complete"
def get_generation_text_info(seeds, device):
cfg_dump = {}
# copy the current generation config values (assumes get_config_dict returns a dict)
for cfg, value in global_obj.get_config_dict().items():
cfg_dump[cfg] = value
text_output = f"prompt={cfg_dump['prompts']}"
text_output += f"\nnegative prompt={cfg_dump['negative_prompts']}"
text_output += (
f"\nmodel_id={cfg_dump['hf_model_id']}, " f"ckpt_loc={cfg_dump['ckpt_loc']}"
)
text_output += f"\nscheduler={cfg_dump['scheduler']}, " f"device={device}"
text_output += (
f"\nsteps={cfg_dump['steps']}, "
f"guidance_scale={cfg_dump['guidance_scale']}, "
f"seed={seeds}"
)
text_output += (
f"\nsize={cfg_dump['height']}x{cfg_dump['width']}, "
if not cfg_dump.get("use_hiresfix")
else f"\nsize={cfg_dump['hiresfix_height']}x{cfg_dump['hiresfix_width']}, "
)
text_output += (
f"batch_count={cfg_dump['batch_count']}, "
f"batch_size={cfg_dump['batch_size']}, "
f"max_length={cfg_dump['max_length']}"
)
return text_output
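For illustration, status_label produces the progress strings wired into sd_status above (via status_kwargs). A couple of example calls, assuming the function is in scope:

```python
print(status_label("Stable Diffusion", 0, 4, 2))  # Stable Diffusion generating 1/4x2
print(status_label("Stable Diffusion", 3, 4, 1))  # Stable Diffusion generating 4/4
print(status_label("Stable Diffusion", 4, 4, 1))  # Stable Diffusion complete
```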

View File

@@ -2,14 +2,14 @@ import os
import shutil
from time import time
shark_tmp = os.path.join(os.getcwd(), "shark_tmp/")
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
shark_tmp = cmd_opts.tmp_dir # os.path.join(os.getcwd(), "shark_tmp/")
def clear_tmp_mlir():
cleanup_start = time()
print(
"Clearing .mlir temporary files from a prior run. This may take some time..."
)
print("Clearing .mlir temporary files from a prior run. This may take some time...")
mlir_files = [
filename
for filename in os.listdir(shark_tmp)
@@ -17,10 +17,8 @@ def clear_tmp_mlir():
and filename.endswith(".mlir")
]
for filename in mlir_files:
os.remove(shark_tmp + filename)
print(
f"Clearing .mlir temporary files took {time() - cleanup_start:.4f} seconds."
)
os.remove(os.path.join(shark_tmp, filename))
print(f"Clearing .mlir temporary files took {time() - cleanup_start:.4f} seconds.")
def clear_tmp_imgs():

View File

@@ -1,87 +0,0 @@
Compile / Run Instructions:
To compile .vmfb for SD (vae, unet, CLIP), run the following commands with the .mlir in your local shark_tank cache (default location for Linux users is `~/.local/shark_tank`). These will be available once the script from [this README](https://github.com/nod-ai/SHARK/blob/main/shark/examples/shark_inference/stable_diffusion/README.md) is run once.
Running the script mentioned above with the `--save_vmfb` flag will also save the .vmfb in your SHARK base directory if you want to skip straight to benchmarks.
Compile Commands FP32/FP16:
```shell
Vulkan AMD:
iree-compile --iree-input-type=none --iree-hal-target-backends=vulkan --iree-vulkan-target-triple=rdna2-unknown-linux /path/to/input/mlir -o /path/to/output/vmfb
# add --mlir-print-debuginfo --mlir-print-op-on-diagnostic=true for debug
# use iree-input-type=auto or "mhlo_legacy" or "stablehlo" for TF models
CUDA NVIDIA:
iree-compile --iree-input-type=none --iree-hal-target-backends=cuda /path/to/input/mlir -o /path/to/output/vmfb
CPU:
iree-compile --iree-input-type=none --iree-hal-target-backends=llvm-cpu /path/to/input/mlir -o /path/to/output/vmfb
```
Run / Benchmark Command (FP32 - NCHW):
(Note: a batch size of 2 is required because classifier-free guidance performs two forward passes through the UNet.)
```shell
## Vulkan AMD:
iree-benchmark-module --module=/path/to/output/vmfb --function=forward --device=vulkan --input=1x4x64x64xf32 --input=1xf32 --input=2x77x768xf32 --input=f32=1.0 --input=f32=1.0
## CUDA:
iree-benchmark-module --module=/path/to/vmfb --function=forward --device=cuda --input=1x4x64x64xf32 --input=1xf32 --input=2x77x768xf32 --input=f32=1.0 --input=f32=1.0
## CPU:
iree-benchmark-module --module=/path/to/vmfb --function=forward --device=local-task --input=1x4x64x64xf32 --input=1xf32 --input=2x77x768xf32 --input=f32=1.0 --input=f32=1.0
```
Run via vulkan_gui for RGP Profiling:
To build the vulkan app for profiling UNet follow the instructions [here](https://github.com/nod-ai/SHARK/tree/main/cpp) and then run the following command from the cpp directory with your compiled stable_diff.vmfb
```shell
./build/vulkan_gui/iree-vulkan-gui --module=/path/to/unet.vmfb --input=1x4x64x64xf32 --input=1xf32 --input=2x77x768xf32 --input=f32=1.0 --input=f32=1.0
```
</details>
<details>
<summary>Debug Commands</summary>
## Debug commands and other advanced usage follow.
```shell
python txt2img.py --precision="fp32"|"fp16" --device="cpu"|"cuda"|"vulkan" --import_mlir|--no-import_mlir --prompt "enter the text"
```
## dump all dispatch .spv and isa using amdllpc
```shell
python txt2img.py --precision="fp16" --device="vulkan" --iree-vulkan-target-triple=rdna3-unknown-linux --no-load_vmfb --dispatch_benchmarks="all" --dispatch_benchmarks_dir="SD_dispatches" --dump_isa
```
## Compile and save the .vmfb (using vulkan fp16 as an example):
```shell
python txt2img.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb
```
## Capture an RGP trace
```shell
python txt2img.py --precision=fp16 --device=vulkan --steps=50 --save_vmfb --enable_rgp
```
## Run the vae module with iree-benchmark-module (NCHW, fp16, vulkan, for example):
```shell
iree-benchmark-module --module=/path/to/output/vmfb --function=forward --device=vulkan --input=1x4x64x64xf16
```
## Run the unet module with iree-benchmark-module (same config as above):
```shell
##if you want to use .npz inputs:
unzip ~/.local/shark_tank/<your unet>/inputs.npz
iree-benchmark-module --module=/path/to/output/vmfb --function=forward --input=@arr_0.npy --input=1xf16 --input=@arr_2.npy --input=@arr_3.npy --input=@arr_4.npy
```
</details>

View File

@@ -1 +0,0 @@
from apps.stable_diffusion.scripts.train_lora_word import lora_train

View File

@@ -1,128 +0,0 @@
import sys
import torch
import time
from PIL import Image
import transformers
from apps.stable_diffusion.src import (
args,
Image2ImagePipeline,
StencilPipeline,
resize_stencil,
get_schedulers,
set_init_device_flags,
utils,
clear_all,
save_output_img,
)
from apps.stable_diffusion.src.utils import get_generation_text_info
def main():
if args.clear_all:
clear_all()
if args.img_path is None:
print("Flag --img_path is required.")
exit()
image = Image.open(args.img_path).convert("RGB")
# When the models get uploaded, this should default to False.
args.import_mlir = True
use_stencil = args.use_stencil
if use_stencil:
args.scheduler = "DDIM"
args.hf_model_id = "runwayml/stable-diffusion-v1-5"
image, args.width, args.height = resize_stencil(image)
elif "Shark" in args.scheduler:
print(
f"Shark schedulers are not supported. Switching to EulerDiscrete scheduler"
)
args.scheduler = "EulerDiscrete"
cpu_scheduling = not args.scheduler.startswith("Shark")
dtype = torch.float32 if args.precision == "fp32" else torch.half
set_init_device_flags()
schedulers = get_schedulers(args.hf_model_id)
scheduler_obj = schedulers[args.scheduler]
seed = utils.sanitize_seed(args.seed)
# Adjust for height and width based on model
if use_stencil:
img2img_obj = StencilPipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
use_stencil=use_stencil,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
ondemand=args.ondemand,
)
else:
img2img_obj = Image2ImagePipeline.from_pretrained(
scheduler_obj,
args.import_mlir,
args.hf_model_id,
args.ckpt_loc,
args.custom_vae,
args.precision,
args.max_length,
args.batch_size,
args.height,
args.width,
args.use_base_vae,
args.use_tuned,
low_cpu_mem_usage=args.low_cpu_mem_usage,
debug=args.import_debug if args.import_mlir else False,
use_lora=args.use_lora,
ondemand=args.ondemand,
)
start_time = time.time()
generated_imgs = img2img_obj.generate_images(
args.prompts,
args.negative_prompts,
image,
args.batch_size,
args.height,
args.width,
args.steps,
args.strength,
args.guidance_scale,
seed,
args.max_length,
dtype,
args.use_base_vae,
cpu_scheduling,
args.max_embeddings_multiples,
use_stencil=use_stencil,
control_mode=args.control_mode,
)
total_time = time.time() - start_time
text_output = f"prompt={args.prompts}"
text_output += f"\nnegative prompt={args.negative_prompts}"
text_output += f"\nmodel_id={args.hf_model_id}, ckpt_loc={args.ckpt_loc}"
text_output += f"\nscheduler={args.scheduler}, device={args.device}"
text_output += f"\nsteps={args.steps}, strength={args.strength}, guidance_scale={args.guidance_scale}, seed={seed}, size={args.height}x{args.width}"
text_output += (
f", batch size={args.batch_size}, max_length={args.max_length}"
)
text_output += img2img_obj.log
text_output += f"\nTotal image generation time: {total_time:.4f}sec"
extra_info = {"STRENGTH": args.strength}
save_output_img(generated_imgs[0], seed, extra_info)
print(text_output)
if __name__ == "__main__":
main()

Some files were not shown because too many files have changed in this diff.