Clearly differentiate phases of loading modules to better understand if things are taking a long time. (#1733 )

gpt_langchain.py fixes for pydantic (#1722 )
Remove SD from all_models.csv (#1706 )
2026-04-20 03:00:34 -04:00 · 2023-08-07 14:03:12 -07:00 · 2023-08-07 00:55:38 -07:00 · 2023-08-05 15:55:52 -05:00 · 2023-08-05 21:33:53 +05:30 · 2023-08-04 18:22:06 -05:00
79 changed files with 20073 additions and 1003 deletions
--- a/.flake8
+++ b/.flake8
@@ -2,4 +2,4 @@
 count = 1
 show-source = 1
 select = E9,F63,F7,F82
-exclude = lit.cfg.py, apps/language_models/scripts/vicuna.py
+exclude = lit.cfg.py, apps/language_models/scripts/vicuna.py, apps/language_models/src/pipelines/minigpt4_pipeline.py, apps/language_models/langchain/h2oai_pipeline.py
--- a/.github/workflows/test-models.yml
+++ b/.github/workflows/test-models.yml
@@ -115,6 +115,7 @@ jobs:
        pytest --forked --benchmark=native --ci --ci_sha=${SHORT_SHA} --update_tank --tank_url="gs://shark_tank/nightly/" -k cpu 
        gsutil cp ./bench_results.csv gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv
        gsutil cp gs://shark-public/builder/bench_results/${DATE}/bench_results_cpu_${SHORT_SHA}.csv gs://shark-public/builder/bench_results/latest/bench_results_cpu_latest.csv
+        python build_tools/vicuna_testing.py

    - name: Validate Models on NVIDIA GPU
      if: matrix.suite == 'cuda'
--- a/.gitignore
+++ b/.gitignore
@@ -189,3 +189,7 @@ apps/stable_diffusion/web/models/

 # Stencil annotators.
 stencil_annotator/
+
+# For DocuChat
+apps/language_models/langchain/user_path/
+db_dir_UserData
--- a/apps/language_models/README.md
+++ b/apps/language_models/README.md
@@ -0,0 +1,16 @@
+## CodeGen Setup using SHARK-server
+
+### Setup Server
+- Clone SHARK and set up the venv
+- host the server using `python apps/stable_diffusion/web/index.py --api --server_port=<PORT>`
+- default server address is `http://0.0.0.0:8080`
+
+### Setup Client
+1. fauxpilot-vscode (VSCode Extension):
+- Code for the extension can be found [here](https://github.com/Venthe/vscode-fauxpilot)
+- PreReq: VSCode extension (will need [`nodejs` and `npm`](https://nodejs.org/en/download) to compile and run the extension)
+- Compile and Run the extension on VSCode (press F5 on VSCode), this opens a new VSCode window with the extension running
+- Open VSCode settings, search for fauxpilot in settings and modify `server : http://<IP>:<PORT>`, `Model : codegen` , `Max Lines : 30`
+
+2. Others (REST API curl, OpenAI Python bindings) as shown [here](https://github.com/fauxpilot/fauxpilot/blob/main/documentation/client.md)
+- Using the GitHub Copilot VSCode extension with SHARK-server needs more work to be functional.
--- a/apps/language_models/langchain/README.md
+++ b/apps/language_models/langchain/README.md
@@ -0,0 +1,18 @@
+# Langchain
+
+## How to run the model
+
+1.) Install all the dependencies by running:
+```shell
+pip install -r apps/language_models/langchain/langchain_requirements.txt
+sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libtesseract-dev libreoffice
+```
+
+2.) Create a folder named `user_path` in the `apps/language_models/langchain/` directory.
+
+Now, you are ready to use the model.
+
+3.) To run the model, run the following command:
+```shell
+python apps/language_models/langchain/gen.py --cli=True
+```
--- a/apps/language_models/langchain/cli.py
+++ b/apps/language_models/langchain/cli.py
@@ -0,0 +1,186 @@
+import copy
+import torch
+
+from evaluate_params import eval_func_param_names
+from gen import Langchain
+from prompter import non_hf_types
+from utils import clear_torch_cache, NullContext, get_kwargs
+
+
+def run_cli(  # for local function:
+    base_model=None,
+    lora_weights=None,
+    inference_server=None,
+    debug=None,
+    chat_context=None,
+    examples=None,
+    memory_restriction_level=None,
+    # for get_model:
+    score_model=None,
+    load_8bit=None,
+    load_4bit=None,
+    load_half=None,
+    load_gptq=None,
+    use_safetensors=None,
+    infer_devices=None,
+    tokenizer_base_model=None,
+    gpu_id=None,
+    local_files_only=None,
+    resume_download=None,
+    use_auth_token=None,
+    trust_remote_code=None,
+    offload_folder=None,
+    compile_model=None,
+    # for some evaluate args
+    stream_output=None,
+    prompt_type=None,
+    prompt_dict=None,
+    temperature=None,
+    top_p=None,
+    top_k=None,
+    num_beams=None,
+    max_new_tokens=None,
+    min_new_tokens=None,
+    early_stopping=None,
+    max_time=None,
+    repetition_penalty=None,
+    num_return_sequences=None,
+    do_sample=None,
+    chat=None,
+    langchain_mode=None,
+    langchain_action=None,
+    document_choice=None,
+    top_k_docs=None,
+    chunk=None,
+    chunk_size=None,
+    # for evaluate kwargs
+    src_lang=None,
+    tgt_lang=None,
+    concurrency_count=None,
+    save_dir=None,
+    sanitize_bot_response=None,
+    model_state0=None,
+    max_max_new_tokens=None,
+    is_public=None,
+    max_max_time=None,
+    raise_generate_gpu_exceptions=None,
+    load_db_if_exists=None,
+    dbs=None,
+    user_path=None,
+    detect_user_path_changes_every_query=None,
+    use_openai_embedding=None,
+    use_openai_model=None,
+    hf_embedding_model=None,
+    db_type=None,
+    n_jobs=None,
+    first_para=None,
+    text_limit=None,
+    verbose=None,
+    cli=None,
+    reverse_docs=None,
+    use_cache=None,
+    auto_reduce_chunks=None,
+    max_chunks=None,
+    model_lock=None,
+    force_langchain_evaluate=None,
+    model_state_none=None,
+    # unique to this function:
+    cli_loop=None,
+):
+    """Run an interactive command-line generation loop.
+
+    Loads the score model and the main model through ``Langchain``, then
+    reads instructions from stdin, evaluates each one and prints the
+    response.  The loop ends when the user types ``exit`` or, if
+    ``cli_loop`` is falsy, after the first instruction.
+
+    Every parameter defaults to ``None``.  The parameters are forwarded by
+    name, through ``locals()`` and ``get_kwargs``, to
+    ``Langchain.get_score_model``, ``Langchain.get_model`` and
+    ``Langchain.evaluate``.  Because of that, adding or renaming local
+    variables in this function changes what those helpers receive.
+
+    Note that ``examples`` is indexed with ``[-1]``, so it has to be a
+    non-empty sequence.
+
+    Returns:
+        A list with one string per processed instruction: the final
+        response text followed by a newline.
+    """
+    # Called before any other local is bound, so locals() holds only the
+    # function parameters at this point.
+    Langchain.check_locals(**locals())
+
+    score_model = ""  # FIXME: For now, so user doesn't have to pass
+    # is_available must be *called*; the bare function object is always truthy.
+    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
+    device = "cpu" if n_gpus == 0 else "cuda"
+    context_class = NullContext if n_gpus > 1 or n_gpus == 0 else torch.device
+
+    with context_class(device):
+        from functools import partial
+
+        # get score model
+        smodel, stokenizer, sdevice = Langchain.get_score_model(
+            reward_type=True,
+            **get_kwargs(
+                Langchain.get_score_model,
+                exclude_names=["reward_type"],
+                **locals()
+            )
+        )
+
+        model, tokenizer, device = Langchain.get_model(
+            reward_type=False,
+            **get_kwargs(
+                Langchain.get_model, exclude_names=["reward_type"], **locals()
+            )
+        )
+        model_dict = dict(
+            base_model=base_model,
+            tokenizer_base_model=tokenizer_base_model,
+            lora_weights=lora_weights,
+            inference_server=inference_server,
+            prompt_type=prompt_type,
+            prompt_dict=prompt_dict,
+        )
+        model_state = dict(model=model, tokenizer=tokenizer, device=device)
+        model_state.update(model_dict)
+        my_db_state = [None]
+        # Pre-bind everything except the per-call evaluate parameters, which
+        # are supplied positionally for each instruction below.
+        fun = partial(
+            Langchain.evaluate,
+            model_state,
+            my_db_state,
+            **get_kwargs(
+                Langchain.evaluate,
+                exclude_names=["model_state", "my_db_state"]
+                + eval_func_param_names,
+                **locals()
+            )
+        )
+
+        example1 = examples[-1]  # pick reference example
+        all_generations = []
+        while True:
+            clear_torch_cache()
+            instruction = input("\nEnter an instruction: ")
+            if instruction == "exit":
+                break
+
+            # Start from the reference example and overwrite the slots that
+            # depend on the current instruction.
+            eval_vars = copy.deepcopy(example1)
+            eval_vars[eval_func_param_names.index("instruction")] = eval_vars[
+                eval_func_param_names.index("instruction_nochat")
+            ] = instruction
+            eval_vars[eval_func_param_names.index("iinput")] = eval_vars[
+                eval_func_param_names.index("iinput_nochat")
+            ] = ""  # no input yet
+            eval_vars[
+                eval_func_param_names.index("context")
+            ] = ""  # no context yet
+
+            # grab other parameters, like langchain_mode
+            for k in eval_func_param_names:
+                if k in locals():
+                    eval_vars[eval_func_param_names.index(k)] = locals()[k]
+
+            gener = fun(*tuple(eval_vars))
+            outr = ""
+            res_old = ""
+            for gen_output in gener:
+                res = gen_output["response"]
+                extra = gen_output["sources"]
+                if base_model not in non_hf_types or base_model in ["llama"]:
+                    if not stream_output:
+                        print(res)
+                    else:
+                        # then stream output for gradio that has full output each generation, so need here to show only new chars
+                        diff = res[len(res_old) :]
+                        print(diff, end="", flush=True)
+                        res_old = res
+                    outr = res  # don't accumulate
+                else:
+                    outr += res  # just is one thing
+                    if extra:
+                        # show sources at end after model itself had streamed to std rest of response
+                        print(extra, flush=True)
+            all_generations.append(outr + "\n")
+            if not cli_loop:
+                break
+    return all_generations
--- a/apps/language_models/langchain/create_data.py
+++ b/apps/language_models/langchain/create_data.py
--- a/apps/language_models/langchain/enums.py
+++ b/apps/language_models/langchain/enums.py
@@ -0,0 +1,103 @@
+from enum import Enum
+
+
+class PromptType(Enum):
+    """Integer identifiers for the known prompt types.
+
+    ``custom`` uses -1; the remaining members are numbered consecutively
+    from 0 (``plain``) to 27 (``falcon``).
+    """
+
+    custom = -1
+    plain = 0
+    instruct = 1
+    quality = 2
+    human_bot = 3
+    dai_faq = 4
+    summarize = 5
+    simple_instruct = 6
+    instruct_vicuna = 7
+    instruct_with_end = 8
+    human_bot_orig = 9
+    prompt_answer = 10
+    open_assistant = 11
+    wizard_lm = 12
+    wizard_mega = 13
+    instruct_vicuna2 = 14
+    instruct_vicuna3 = 15
+    wizard2 = 16
+    wizard3 = 17
+    instruct_simple = 18
+    wizard_vicuna = 19
+    openai = 20
+    openai_chat = 21
+    gptj = 22
+    prompt_answer_openllama = 23
+    vicuna11 = 24
+    mptinstruct = 25
+    mptchat = 26
+    falcon = 27
+
+
+class DocumentChoices(Enum):
+    """Integer-coded document selection choices, numbered from 0 to 3."""
+
+    All_Relevant = 0
+    All_Relevant_Only_Sources = 1
+    Only_All_Sources = 2
+    Just_LLM = 3
+
+
+# Member names of the two "sources" choices, in declaration order.
+non_query_commands = [
+    choice.name
+    for choice in (
+        DocumentChoices.All_Relevant_Only_Sources,
+        DocumentChoices.Only_All_Sources,
+    )
+]
+
+
+class LangChainMode(Enum):
+    """LangChain mode.
+
+    Each member's value is a string label; note the labels are not
+    uniformly cased (e.g. ``"wiki"`` versus ``"UserData"``).
+    """
+
+    DISABLED = "Disabled"
+    CHAT_LLM = "ChatLLM"
+    LLM = "LLM"
+    ALL = "All"
+    WIKI = "wiki"
+    WIKI_FULL = "wiki_full"
+    USER_DATA = "UserData"
+    MY_DATA = "MyData"
+    GITHUB_H2OGPT = "github h2oGPT"
+    H2O_DAI_DOCS = "DriverlessAI docs"
+
+
+class LangChainAction(Enum):
+    """LangChain action.
+
+    Each member's value is a string label.  ``SUMMARIZE_MAP`` currently
+    carries the label ``"Summarize"``; the alternative label is kept
+    below as a commented-out work-in-progress line.
+    """
+
+    QUERY = "Query"
+    # WIP:
+    # SUMMARIZE_MAP = "Summarize_map_reduce"
+    SUMMARIZE_MAP = "Summarize"
+    SUMMARIZE_ALL = "Summarize_all"
+    SUMMARIZE_REFINE = "Summarize_refine"
+
+
+# One shared placeholder string bound to three names (server, LoRA, model).
+no_server_str = no_lora_str = no_model_str = "[None/Remove]"
+
+# from site-packages/langchain/llms/openai.py
+# but needed since ChatOpenAI doesn't have this information
+# Maps a model name to an integer limit (presumably the maximum context
+# size in tokens -- confirm against the upstream table).
+# NOTE(review): "text-babbage-001" is 2040 while its neighbours are 2049;
+# verify against upstream before relying on it.
+model_token_mapping = {
+    "gpt-4": 8192,
+    "gpt-4-0314": 8192,
+    "gpt-4-32k": 32768,
+    "gpt-4-32k-0314": 32768,
+    "gpt-3.5-turbo": 4096,
+    "gpt-3.5-turbo-16k": 16 * 1024,
+    "gpt-3.5-turbo-0301": 4096,
+    "text-ada-001": 2049,
+    "ada": 2049,
+    "text-babbage-001": 2040,
+    "babbage": 2049,
+    "text-curie-001": 2049,
+    "curie": 2049,
+    "davinci": 2049,
+    "text-davinci-003": 4097,
+    "text-davinci-002": 4097,
+    "code-davinci-002": 8001,
+    "code-davinci-001": 8001,
+    "code-cushman-002": 2048,
+    "code-cushman-001": 2048,
+}
+
+# Marker strings; presumably placed before and after a sources listing in
+# a response -- confirm at the use sites.
+source_prefix = "Sources [Score | Link]:"
+source_postfix = "End Sources<p>"
--- a/apps/language_models/langchain/evaluate_params.py
+++ b/apps/language_models/langchain/evaluate_params.py
@@ -0,0 +1,53 @@
+# Evaluate parameters that are excluded from the "defaults" list below.
+no_default_param_names = [
+    "instruction",
+    "iinput",
+    "context",
+    "instruction_nochat",
+    "iinput_nochat",
+]
+
+# Generation hyper-parameter names, in the order evaluate expects them.
+gen_hyper = [
+    "temperature",
+    "top_p",
+    "top_k",
+    "num_beams",
+    "max_new_tokens",
+    "min_new_tokens",
+    "early_stopping",
+    "max_time",
+    "repetition_penalty",
+    "num_return_sequences",
+    "do_sample",
+]
+
+# Full ordered list of evaluate parameter names: leading inputs, then the
+# generation hyper-parameters, then the chat / langchain options.
+eval_func_param_names = [
+    "instruction",
+    "iinput",
+    "context",
+    "stream_output",
+    "prompt_type",
+    "prompt_dict",
+    *gen_hyper,
+    "chat",
+    "instruction_nochat",
+    "iinput_nochat",
+    "langchain_mode",
+    "langchain_action",
+    "top_k_docs",
+    "chunk",
+    "chunk_size",
+    "document_choice",
+]
+
+# form evaluate defaults for submit_nochat_api:
+# the same names, in the same order, minus the ones without a default.
+eval_func_param_names_defaults = [
+    name
+    for name in eval_func_param_names
+    if name not in no_default_param_names
+]
+
+
+eval_extra_columns = ["prompt", "response", "score"]
--- a/apps/language_models/langchain/expanded_pipelines.py
+++ b/apps/language_models/langchain/expanded_pipelines.py
@@ -0,0 +1,432 @@
+"""Load question answering chains."""
+from __future__ import annotations
+from typing import (
+    Any,
+    Mapping,
+    Optional,
+    Dict,
+    List,
+    Sequence,
+    Tuple,
+    Union,
+    Protocol,
+)
+from langchain.base_language import BaseLanguageModel
+from langchain.callbacks.base import BaseCallbackManager
+from langchain.chains.question_answering import stuff_prompt
+from langchain.prompts.base import BasePromptTemplate
+from langchain.docstore.document import Document
+from abc import ABC, abstractmethod
+from langchain.chains.base import Chain
+from langchain.callbacks.manager import (
+    CallbackManager,
+    CallbackManagerForChainRun,
+    Callbacks,
+)
+from langchain.input import get_colored_text
+from langchain.load.dump import dumpd
+from langchain.prompts.prompt import PromptTemplate
+from langchain.schema import LLMResult, PromptValue
+from pydantic import Extra, Field, root_validator
+
+
+def format_document(doc: Document, prompt: BasePromptTemplate) -> str:
+    """Render one document through a prompt template.
+
+    The template may reference ``page_content`` and any key of the
+    document's metadata.  A ``ValueError`` is raised when the template
+    needs a variable the document does not provide.
+    """
+    available = {"page_content": doc.page_content}
+    available.update(doc.metadata)
+
+    wanted = prompt.input_variables
+    missing_metadata = set(wanted).difference(available)
+    if missing_metadata:
+        required_metadata = [
+            name for name in wanted if name != "page_content"
+        ]
+        raise ValueError(
+            f"Document prompt requires documents to have metadata variables: "
+            f"{required_metadata}. Received document with missing metadata: "
+            f"{list(missing_metadata)}."
+        )
+
+    # Pass only the variables the template asks for.
+    return prompt.format(**{name: available[name] for name in wanted})
+
+
+class BaseCombineDocumentsChain(Chain, ABC):
+    """Abstract chain that merges a list of documents into one output."""
+
+    input_key: str = "input_documents"  #: :meta private:
+    output_key: str = "output_text"  #: :meta private:
+
+    @property
+    def input_keys(self) -> List[str]:
+        """Single-element list holding the documents input key.
+
+        :meta private:
+        """
+        return [self.input_key]
+
+    @property
+    def output_keys(self) -> List[str]:
+        """Single-element list holding the output key.
+
+        :meta private:
+        """
+        return [self.output_key]
+
+    def prompt_length(
+        self, docs: List[Document], **kwargs: Any
+    ) -> Optional[int]:
+        """Prompt length for ``docs``; this base implementation returns None.
+
+        None means the method does not depend on the prompt length.
+        """
+        return None
+
+    @abstractmethod
+    def combine_docs(
+        self, docs: List[Document], **kwargs: Any
+    ) -> Tuple[str, dict]:
+        """Combine documents into a single string."""
+
+    def _call(
+        self,
+        inputs: Dict[str, List[Document]],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, str]:
+        if run_manager:
+            manager = run_manager
+        else:
+            manager = CallbackManagerForChainRun.get_noop_manager()
+        documents = inputs[self.input_key]
+        # Everything except the documents is forwarded to combine_docs.
+        passthrough = {
+            key: value
+            for key, value in inputs.items()
+            if key != self.input_key
+        }
+        combined, result = self.combine_docs(
+            documents, callbacks=manager.get_child(), **passthrough
+        )
+        result[self.output_key] = combined
+        return result
+
+
+class LLMChain(Chain):
+    """Chain to run queries against LLMs.
+
+    Example:
+        .. code-block:: python
+
+            from langchain import LLMChain, OpenAI, PromptTemplate
+            prompt_template = "Tell me a {adjective} joke"
+            prompt = PromptTemplate(
+                input_variables=["adjective"], template=prompt_template
+            )
+            llm = LLMChain(llm=OpenAI(), prompt=prompt)
+    """
+
+    @property
+    def lc_serializable(self) -> bool:
+        return True
+
+    prompt: BasePromptTemplate
+    """Prompt object to use."""
+    llm: BaseLanguageModel
+    output_key: str = "text"  #: :meta private:
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+        arbitrary_types_allowed = True
+
+    @property
+    def input_keys(self) -> List[str]:
+        """Will be whatever keys the prompt expects.
+
+        :meta private:
+        """
+        return self.prompt.input_variables
+
+    @property
+    def output_keys(self) -> List[str]:
+        """Will always return text key.
+
+        :meta private:
+        """
+        return [self.output_key]
+
+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, str]:
+        # A single call is a batch of one; return its only output.
+        response = self.generate([inputs], run_manager=run_manager)
+        return self.create_outputs(response)[0]
+
+    def generate(
+        self,
+        input_list: List[Dict[str, Any]],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> LLMResult:
+        """Generate LLM result from inputs."""
+        prompts, stop = self.prep_prompts(input_list, run_manager=run_manager)
+        return self.llm.generate_prompt(
+            prompts,
+            stop,
+            callbacks=run_manager.get_child() if run_manager else None,
+        )
+
+    def prep_prompts(
+        self,
+        input_list: List[Dict[str, Any]],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Tuple[List[PromptValue], Optional[List[str]]]:
+        """Prepare prompts from inputs.
+
+        Returns the formatted prompts and the shared ``stop`` value taken
+        from the first input (``None`` when absent).  An empty
+        ``input_list`` yields ``([], None)``.
+
+        Raises:
+            ValueError: if an input carries a ``stop`` value that differs
+                from the first input's.
+        """
+        stop = None
+        # Guard the empty batch: indexing input_list[0] would raise IndexError.
+        if len(input_list) == 0:
+            return [], stop
+        if "stop" in input_list[0]:
+            stop = input_list[0]["stop"]
+        prompts = []
+        for inputs in input_list:
+            selected_inputs = {
+                k: inputs[k] for k in self.prompt.input_variables
+            }
+            prompt = self.prompt.format_prompt(**selected_inputs)
+            _colored_text = get_colored_text(prompt.to_string(), "green")
+            _text = "Prompt after formatting:\n" + _colored_text
+            if run_manager:
+                run_manager.on_text(_text, end="\n", verbose=self.verbose)
+            if "stop" in inputs and inputs["stop"] != stop:
+                raise ValueError(
+                    "If `stop` is present in any inputs, should be present in all."
+                )
+            prompts.append(prompt)
+        return prompts, stop
+
+    def apply(
+        self, input_list: List[Dict[str, Any]], callbacks: Callbacks = None
+    ) -> List[Dict[str, str]]:
+        """Utilize the LLM generate method for speed gains."""
+        callback_manager = CallbackManager.configure(
+            callbacks, self.callbacks, self.verbose
+        )
+        run_manager = callback_manager.on_chain_start(
+            dumpd(self),
+            {"input_list": input_list},
+        )
+        try:
+            response = self.generate(input_list, run_manager=run_manager)
+        except (KeyboardInterrupt, Exception) as e:
+            # Report the failure to the callbacks, then propagate it.
+            run_manager.on_chain_error(e)
+            raise e
+        outputs = self.create_outputs(response)
+        run_manager.on_chain_end({"outputs": outputs})
+        return outputs
+
+    def create_outputs(self, response: LLMResult) -> List[Dict[str, str]]:
+        """Create outputs from response."""
+        return [
+            # Get the text of the top generated string.
+            {self.output_key: generation[0].text}
+            for generation in response.generations
+        ]
+
+    def predict(self, callbacks: Callbacks = None, **kwargs: Any) -> str:
+        """Format prompt with kwargs and pass to LLM.
+
+        Args:
+            callbacks: Callbacks to pass to LLMChain
+            **kwargs: Keys to pass to prompt template.
+
+        Returns:
+            Completion from LLM.
+
+        Example:
+            .. code-block:: python
+
+                completion = llm.predict(adjective="funny")
+        """
+        return self(kwargs, callbacks=callbacks)[self.output_key]
+
+    def predict_and_parse(
+        self, callbacks: Callbacks = None, **kwargs: Any
+    ) -> Union[str, List[str], Dict[str, Any]]:
+        """Call predict and then parse the results."""
+        result = self.predict(callbacks=callbacks, **kwargs)
+        if self.prompt.output_parser is not None:
+            return self.prompt.output_parser.parse(result)
+        else:
+            return result
+
+    def apply_and_parse(
+        self, input_list: List[Dict[str, Any]], callbacks: Callbacks = None
+    ) -> Sequence[Union[str, List[str], Dict[str, str]]]:
+        """Call apply and then parse the results."""
+        result = self.apply(input_list, callbacks=callbacks)
+        return self._parse_result(result)
+
+    def _parse_result(
+        self, result: List[Dict[str, str]]
+    ) -> Sequence[Union[str, List[str], Dict[str, str]]]:
+        # Without an output parser the raw result dicts are returned as-is.
+        if self.prompt.output_parser is not None:
+            return [
+                self.prompt.output_parser.parse(res[self.output_key])
+                for res in result
+            ]
+        else:
+            return result
+
+    @property
+    def _chain_type(self) -> str:
+        return "llm_chain"
+
+    @classmethod
+    def from_string(cls, llm: BaseLanguageModel, template: str) -> LLMChain:
+        """Create LLMChain from LLM and template."""
+        prompt_template = PromptTemplate.from_template(template)
+        return cls(llm=llm, prompt=prompt_template)
+
+
+def _get_default_document_prompt() -> PromptTemplate:
+    """Build the default per-document prompt: the page content verbatim."""
+    default_template = "{page_content}"
+    return PromptTemplate(
+        template=default_template, input_variables=["page_content"]
+    )
+
+
+class StuffDocumentsChain(BaseCombineDocumentsChain):
+    """Chain that combines documents by stuffing into context."""
+
+    llm_chain: LLMChain
+    """LLM wrapper to use after formatting documents."""
+    document_prompt: BasePromptTemplate = Field(
+        default_factory=_get_default_document_prompt
+    )
+    """Prompt to use to format each document."""
+    document_variable_name: str
+    """The variable name in the llm_chain to put the documents in.
+    If only one variable in the llm_chain, this need not be provided."""
+    document_separator: str = "\n\n"
+    """The string with which to join the formatted documents"""
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+        arbitrary_types_allowed = True
+
+    @root_validator(pre=True)
+    def get_default_document_variable_name(cls, values: Dict) -> Dict:
+        """Fill in or validate ``document_variable_name``.
+
+        When the name is absent it defaults to the llm_chain prompt's only
+        input variable; when present it must be one of those variables.
+        """
+        llm_chain_variables = values["llm_chain"].prompt.input_variables
+
+        # Explicit name given: it only needs to be validated.
+        if "document_variable_name" in values:
+            if values["document_variable_name"] not in llm_chain_variables:
+                raise ValueError(
+                    f"document_variable_name {values['document_variable_name']} was "
+                    f"not found in llm_chain input_variables: {llm_chain_variables}"
+                )
+            return values
+
+        # No name given: it can be inferred only from a single variable.
+        if len(llm_chain_variables) != 1:
+            raise ValueError(
+                "document_variable_name must be provided if there are "
+                "multiple llm_chain_variables"
+            )
+        values["document_variable_name"] = llm_chain_variables[0]
+        return values
+
+    def _get_inputs(self, docs: List[Document], **kwargs: Any) -> dict:
+        # Render every document with the per-document prompt first.
+        rendered = [
+            format_document(document, self.document_prompt)
+            for document in docs
+        ]
+        # Keep only the keyword arguments the LLM prompt actually uses.
+        accepted = self.llm_chain.prompt.input_variables
+        inputs = {
+            name: value for name, value in kwargs.items() if name in accepted
+        }
+        # The joined documents go under the configured variable name.
+        joined = self.document_separator.join(rendered)
+        inputs[self.document_variable_name] = joined
+        return inputs
+
+    def prompt_length(
+        self, docs: List[Document], **kwargs: Any
+    ) -> Optional[int]:
+        """Return the token count of the fully formatted prompt."""
+        formatted = self.llm_chain.prompt.format(
+            **self._get_inputs(docs, **kwargs)
+        )
+        return self.llm_chain.llm.get_num_tokens(formatted)
+
+    def combine_docs(
+        self, docs: List[Document], callbacks: Callbacks = None, **kwargs: Any
+    ) -> Tuple[str, dict]:
+        """Stuff all documents into one prompt and pass to LLM."""
+        llm_inputs = self._get_inputs(docs, **kwargs)
+        # Call predict on the LLM; there is no extra return data.
+        answer = self.llm_chain.predict(callbacks=callbacks, **llm_inputs)
+        return answer, {}
+
+    @property
+    def _chain_type(self) -> str:
+        return "stuff_documents_chain"
+
+
+class LoadingCallable(Protocol):
+    """Interface for loading the combine documents chain."""
+
+    # Structural type only: the body is a docstring, so any callable with
+    # this signature satisfies the protocol.
+    def __call__(
+        self, llm: BaseLanguageModel, **kwargs: Any
+    ) -> BaseCombineDocumentsChain:
+        """Callable to load the combine documents chain."""
+
+
+def _load_stuff_chain(
+    llm: BaseLanguageModel,
+    prompt: Optional[BasePromptTemplate] = None,
+    document_variable_name: str = "context",
+    verbose: Optional[bool] = None,
+    callback_manager: Optional[BaseCallbackManager] = None,
+    callbacks: Callbacks = None,
+    **kwargs: Any,
+) -> StuffDocumentsChain:
+    """Build a StuffDocumentsChain around a freshly created LLMChain.
+
+    When ``prompt`` is falsy the prompt is chosen by
+    ``stuff_prompt.PROMPT_SELECTOR`` for the given ``llm``.  ``callbacks``
+    is passed to the inner LLMChain only.
+    """
+    # Settings shared by the inner and the outer chain.
+    shared = dict(verbose=verbose, callback_manager=callback_manager)
+    if prompt:
+        chain_prompt = prompt
+    else:
+        chain_prompt = stuff_prompt.PROMPT_SELECTOR.get_prompt(llm)
+    inner_chain = LLMChain(
+        llm=llm, prompt=chain_prompt, callbacks=callbacks, **shared
+    )
+    # TODO: document prompt
+    return StuffDocumentsChain(
+        llm_chain=inner_chain,
+        document_variable_name=document_variable_name,
+        **shared,
+        **kwargs,
+    )
+
+
+def load_qa_chain(
+    llm: BaseLanguageModel,
+    chain_type: str = "stuff",
+    verbose: Optional[bool] = None,
+    callback_manager: Optional[BaseCallbackManager] = None,
+    **kwargs: Any,
+) -> BaseCombineDocumentsChain:
+    """Load question answering chain.
+
+    Args:
+        llm: Language Model to use in the chain.
+        chain_type: Type of document combining chain to use. Only "stuff"
+            is available in this module.
+        verbose: Whether chains should be run in verbose mode or not. Note that this
+            applies to all chains that make up the final chain.
+        callback_manager: Callback manager to use for the chain.
+        **kwargs: Extra keyword arguments forwarded to the chain loader.
+
+    Returns:
+        A chain to use for question answering.
+
+    Raises:
+        ValueError: if ``chain_type`` is not a supported chain type.
+    """
+    loader_mapping: Mapping[str, LoadingCallable] = {
+        "stuff": _load_stuff_chain,
+    }
+    if chain_type not in loader_mapping:
+        # list(...) so the message shows ['stuff'] rather than dict_keys([...]).
+        raise ValueError(
+            f"Got unsupported chain type: {chain_type}. "
+            f"Should be one of {list(loader_mapping)}"
+        )
+    return loader_mapping[chain_type](
+        llm, verbose=verbose, callback_manager=callback_manager, **kwargs
+    )
--- a/apps/language_models/langchain/gen.py
+++ b/apps/language_models/langchain/gen.py
--- a/apps/language_models/langchain/gpt4all_llm.py
+++ b/apps/language_models/langchain/gpt4all_llm.py
@@ -0,0 +1,380 @@
+import inspect
+import os
+from functools import partial
+from typing import Dict, Any, Optional, List
+from langchain.callbacks.manager import CallbackManagerForLLMRun
+from pydantic import root_validator
+from langchain.llms import gpt4all
+from dotenv import dotenv_values
+
+from utils import FakeTokenizer
+
+
+def _coerce_numeric(value):
+    """Return ``value`` as int/float if it parses as a finite number, else unchanged."""
+    try:
+        number = float(value)
+    except (TypeError, ValueError, OverflowError):
+        # e.g. None, file paths, or other non-numeric settings
+        return value
+    if number != number or abs(number) == float("inf"):
+        # leave "nan"/"inf" style values as they were given
+        return value
+    return int(number) if number.is_integer() else number
+
+
+def _load_gpt4all_model(model_kwargs, suffix, model_type, env_file):
+    """Build a gpt4all model from ``model_name_<suffix>`` or ``model_path_<suffix>``."""
+    name_key = "model_name_" + suffix
+    path_key = "model_path_" + suffix
+    if name_key not in model_kwargs and path_key not in model_kwargs:
+        raise ValueError(
+            "No %s or %s in %s" % (name_key, path_key, env_file)
+        )
+    from gpt4all import GPT4All as GPT4AllModel
+
+    if name_key in model_kwargs:
+        return GPT4AllModel(
+            model_name=model_kwargs.pop(name_key), model_type=model_type
+        )
+    # Only a full path was configured: split it into directory + file name,
+    # the same way H2OGPT4All.validate_environment does.
+    model_dir, delimiter, model_name = model_kwargs.pop(path_key).rpartition(
+        "/"
+    )
+    return GPT4AllModel(
+        model_name=model_name,
+        model_path=(model_dir + delimiter) or None,
+        model_type=model_type,
+    )
+
+
+def get_model_tokenizer_gpt4all(base_model, **kwargs):
+    """Load a llama.cpp or gpt4all model configured through ``.env_gpt4all``.
+
+    Built-in defaults are overlaid with the values read from ``.env_gpt4all``;
+    values that parse as numbers are converted to int/float.
+
+    :param base_model: one of "llama", "gpt4all_llama" or "gptj"
+    :param kwargs: may carry ``temperature``, ``top_p`` and ``top_k`` defaults
+    :return: tuple of (model, FakeTokenizer(), "cpu")
+    :raises ValueError: if the required model name/path setting is missing or
+        ``base_model`` is not one of the supported names
+    """
+    # defaults (some of these are generation parameters, so need to be passed in at generation time)
+    model_kwargs = dict(
+        # cpu_count() may be None, and halving a single core would give 0
+        n_threads=max(1, (os.cpu_count() or 2) // 2),
+        temp=kwargs.get("temperature", 0.2),
+        top_p=kwargs.get("top_p", 0.75),
+        top_k=kwargs.get("top_k", 40),
+        n_ctx=2048 - 256,
+    )
+    env_gpt4all_file = ".env_gpt4all"
+    model_kwargs.update(dotenv_values(env_gpt4all_file))
+    # make int or float if can to satisfy types for class
+    model_kwargs = {k: _coerce_numeric(v) for k, v in model_kwargs.items()}
+
+    if base_model == "llama":
+        if "model_path_llama" not in model_kwargs:
+            raise ValueError("No model_path_llama in %s" % env_gpt4all_file)
+        model_path = model_kwargs.pop("model_path_llama")
+        # FIXME: GPT4All version of llama doesn't handle new quantization, so use llama_cpp_python
+        from llama_cpp import Llama
+
+        # llama sets some things at init model time, not generation time
+        init_params = inspect.signature(Llama.__init__).parameters
+        model_kwargs = {
+            k: v for k, v in model_kwargs.items() if k in init_params
+        }
+        model_kwargs["n_ctx"] = int(model_kwargs["n_ctx"])
+        model = Llama(model_path=model_path, **model_kwargs)
+    elif base_model == "gpt4all_llama":
+        model = _load_gpt4all_model(
+            model_kwargs, "gpt4all_llama", "llama", env_gpt4all_file
+        )
+    elif base_model == "gptj":
+        model = _load_gpt4all_model(
+            model_kwargs, "gptj", "gptj", env_gpt4all_file
+        )
+    else:
+        raise ValueError("No such base_model %s" % base_model)
+    return model, FakeTokenizer(), "cpu"
+
+
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+
+
+class H2OStreamingStdOutCallbackHandler(StreamingStdOutCallbackHandler):
+    """Stdout streaming handler whose per-token hook does nothing."""
+
+    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
+        """Ignore the new token instead of echoing it.
+
+        The parent hook is intentionally not invoked: streaming to stdout
+        already occurs without it.
+        """
+        return None
+
+
+def get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=()):
+    """Assemble constructor kwargs for ``cls`` from layered defaults.
+
+    Precedence, lowest to highest: defaults declared in the signature of
+    ``cls`` (except names in ``exclude_list``), then ``default_kwargs``, then
+    ``env_kwargs``. Keys that are not parameters of ``cls`` are dropped.
+
+    :param env_kwargs: user-provided overrides
+    :param default_kwargs: our own defaults
+    :param cls: callable whose signature defines the valid keys
+    :param exclude_list: parameter names whose signature default is not
+        seeded (they can still be supplied by the two dicts)
+    :return: new dict of keyword arguments valid for ``cls``
+    """
+    # inspect once; the mapping serves both as default source and key filter
+    params = inspect.signature(cls).parameters
+    # default from class
+    model_kwargs = {
+        name: param.default
+        for name, param in params.items()
+        if name not in exclude_list
+    }
+    # from our defaults
+    model_kwargs.update(default_kwargs)
+    # from user defaults
+    model_kwargs.update(env_kwargs)
+    # ensure only valid keys
+    return {k: v for k, v in model_kwargs.items() if k in params}
+
+
+def get_llm_gpt4all(
+    model_name,
+    model=None,
+    max_new_tokens=256,
+    temperature=0.1,
+    repetition_penalty=1.0,
+    top_k=40,
+    top_p=0.7,
+    streaming=False,
+    callbacks=None,
+    prompter=None,
+    verbose=False,
+):
+    """Create the LLM wrapper for "llama", "gpt4all_llama" or "gptj".
+
+    Settings come from the wrapper class defaults, then the defaults built
+    here from the arguments, then ``.env_gpt4all``. When ``model`` is None the
+    model location is taken from the matching ``model_path_*`` env entry;
+    otherwise ``model`` itself is handed to the wrapper.
+
+    :param model_name: which wrapper/backend to build
+    :param model: optional model (or path) to use instead of the env entry
+    :param prompter: required; passed through to the wrapper
+    :return: H2OLlamaCpp or H2OGPT4All instance
+    :raises RuntimeError: for an unknown ``model_name``
+    """
+    assert prompter is not None
+    env_gpt4all_file = ".env_gpt4all"
+    env_kwargs = dotenv_values(env_gpt4all_file)
+    n_ctx = env_kwargs.pop("n_ctx", 2048 - max_new_tokens)
+    default_kwargs = {
+        "context_erase": 0.5,
+        "n_batch": 1,
+        "n_ctx": n_ctx,
+        "n_predict": max_new_tokens,
+        "repeat_last_n": 64 if repetition_penalty != 1.0 else 0,
+        "repeat_penalty": repetition_penalty,
+        "temp": temperature,
+        "temperature": temperature,
+        "top_k": top_k,
+        "top_p": top_p,
+        "use_mlock": True,
+        "verbose": verbose,
+    }
+
+    # (wrapper class, env key holding the path, kwarg receiving it, backend)
+    if model_name == "llama":
+        spec = (H2OLlamaCpp, "model_path_llama", "model_path", None)
+    elif model_name == "gpt4all_llama":
+        spec = (H2OGPT4All, "model_path_gpt4all_llama", "model", "llama")
+    elif model_name == "gptj":
+        spec = (H2OGPT4All, "model_path_gptj", "model", "gptj")
+    else:
+        raise RuntimeError("No such model_name %s" % model_name)
+    cls, env_path_key, path_kwarg, backend = spec
+
+    model_path = env_kwargs.pop(env_path_key) if model is None else model
+    model_kwargs = get_model_kwargs(
+        env_kwargs, default_kwargs, cls, exclude_list=["lc_kwargs"]
+    )
+
+    overrides = {path_kwarg: model_path}
+    if backend is not None:
+        overrides["backend"] = backend
+    overrides["callbacks"] = callbacks
+    overrides["streaming"] = streaming
+    overrides["prompter"] = prompter
+    model_kwargs.update(overrides)
+
+    llm = cls(**model_kwargs)
+    if model_name == "llama":
+        # only the llama.cpp client gets its verbosity set after construction
+        llm.client.verbose = verbose
+    return llm
+
+
+class H2OGPT4All(gpt4all.GPT4All):
+    # GPT4All LLM wrapper that accepts either a model file path or an already
+    # constructed client object in `model`, and wraps every prompt with
+    # `prompter` before generation.
+    model: Any
+    prompter: Any
+    """Path to the pre-trained GPT4All model file."""
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Populate values["client"] and values["backend"].
+
+        If values["model"] is a string it is split at the last "/" into a
+        directory and a file name and a gpt4all client is created from them
+        (downloads disabled); otherwise the given object is used as the
+        client. Raises ValueError if the gpt4all package cannot be imported.
+        """
+        try:
+            if isinstance(values["model"], str):
+                from gpt4all import GPT4All as GPT4AllModel
+
+                full_path = values["model"]
+                # "dir/sub/file.bin" -> ("dir/sub", "/", "file.bin")
+                model_path, delimiter, model_name = full_path.rpartition("/")
+                model_path += delimiter
+
+                values["client"] = GPT4AllModel(
+                    model_name=model_name,
+                    # empty string (no "/" in the path) becomes None
+                    model_path=model_path or None,
+                    model_type=values["backend"],
+                    allow_download=False,
+                )
+                if values["n_threads"] is not None:
+                    # set n_threads
+                    values["client"].model.set_thread_count(
+                        values["n_threads"]
+                    )
+            else:
+                # caller passed a ready-made client object instead of a path
+                values["client"] = values["model"]
+            try:
+                values["backend"] = values["client"].model_type
+            except AttributeError:
+                # The below is for compatibility with GPT4All Python bindings <= 0.2.3.
+                values["backend"] = values["client"].model.model_type
+
+        except ImportError:
+            raise ValueError(
+                "Could not import gpt4all python package. "
+                "Please install it with `pip install gpt4all`."
+            )
+        return values
+
+    def _call(
+        self,
+        prompt: str,
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs,
+    ) -> str:
+        """Truncate the prompt, apply instruct prompting, then defer to the parent.
+
+        Only the last ``n_ctx * 4`` characters of ``prompt`` are kept before
+        it is passed through ``self.prompter.generate_prompt``.
+        """
+        # Roughly 4 chars per token if natural language
+        prompt = prompt[-self.n_ctx * 4 :]
+
+        # use instruct prompting
+        data_point = dict(context="", instruction=prompt, input="")
+        prompt = self.prompter.generate_prompt(data_point)
+
+        # local debug switch, hard-coded off
+        verbose = False
+        if verbose:
+            print("_call prompt: %s" % prompt, flush=True)
+        # FIXME: GPT4ALl doesn't support yield during generate, so cannot support streaming except via itself to stdout
+        # NOTE(review): **kwargs is accepted but not forwarded to the parent
+        # call -- confirm whether that is intentional.
+        return super()._call(prompt, stop=stop, run_manager=run_manager)
+
+
+from langchain.llms import LlamaCpp
+
+
+class H2OLlamaCpp(LlamaCpp):
+    # LlamaCpp LLM wrapper that accepts either a model file path or an
+    # already constructed llama.cpp client in `model_path`, truncates
+    # over-long prompts and applies instruct prompting via `prompter`.
+
+    # Either a path (str) to the model file or a ready-made client object.
+    model_path: Any
+    # Object providing generate_prompt(data_point); used by _call.
+    prompter: Any
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Populate values["client"] with a llama.cpp client.
+
+        A string ``model_path`` is loaded with ``llama_cpp.Llama`` using the
+        model-level settings found in ``values``; any other object is used
+        as the client directly.
+
+        Raises:
+            ModuleNotFoundError: if llama-cpp-python cannot be imported.
+            ValueError: if the model cannot be loaded from the given path.
+        """
+        if isinstance(values["model_path"], str):
+            model_path = values["model_path"]
+            model_param_names = [
+                "lora_path",
+                "lora_base",
+                "n_ctx",
+                "n_parts",
+                "seed",
+                "f16_kv",
+                "logits_all",
+                "vocab_only",
+                "use_mlock",
+                "n_threads",
+                "n_batch",
+                "use_mmap",
+                "last_n_tokens_size",
+            ]
+            model_params = {k: values[k] for k in model_param_names}
+            # For backwards compatibility, only include if non-null.
+            if values["n_gpu_layers"] is not None:
+                model_params["n_gpu_layers"] = values["n_gpu_layers"]
+
+            try:
+                from llama_cpp import Llama
+
+                values["client"] = Llama(model_path, **model_params)
+            except ImportError as err:
+                raise ModuleNotFoundError(
+                    "Could not import llama-cpp-python library. "
+                    "Please install the llama-cpp-python library to "
+                    "use this embedding model: pip install llama-cpp-python"
+                ) from err
+            except Exception as e:
+                # keep the original failure attached as the cause
+                raise ValueError(
+                    f"Could not load Llama model from path: {model_path}. "
+                    f"Received error {e}"
+                ) from e
+        else:
+            # caller passed a ready-made client object instead of a path
+            values["client"] = values["model_path"]
+        return values
+
+    def _call(
+        self,
+        prompt: str,
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs,
+    ) -> str:
+        """Generate text for ``prompt`` after truncation and instruct prompting.
+
+        The prompt is first cut to its last ``n_ctx * 4`` characters, then, if
+        it still tokenizes to more than ``n_ctx`` tokens, cut again using the
+        measured characters-per-token ratio. In streaming mode the chunks
+        produced by ``self.stream`` are concatenated; otherwise the client is
+        called once with the generation parameters merged with ``kwargs``.
+        """
+        # local debug switch, hard-coded off
+        verbose = False
+        # tokenize twice, just to count tokens, since llama cpp python wrapper has no way to truncate
+        # still have to avoid crazy sizes, else hit llama_tokenize: too many tokens -- might still hit, not fatal
+        prompt = prompt[-self.n_ctx * 4 :]
+        prompt_tokens = self.client.tokenize(b" " + prompt.encode("utf-8"))
+        num_prompt_tokens = len(prompt_tokens)
+        if num_prompt_tokens > self.n_ctx:
+            # conservative by using int(); never below 1, because a ratio of
+            # 0 would turn the slice into prompt[-0:] and keep everything
+            chars_per_token = max(1, int(len(prompt) / num_prompt_tokens))
+            prompt = prompt[-self.n_ctx * chars_per_token :]
+            if verbose:
+                print(
+                    "reducing tokens, assuming average of %s chars/token"
+                    % chars_per_token,
+                    flush=True,
+                )
+                prompt_tokens2 = self.client.tokenize(
+                    b" " + prompt.encode("utf-8")
+                )
+                num_prompt_tokens2 = len(prompt_tokens2)
+                print(
+                    "reduced tokens from %d -> %d"
+                    % (num_prompt_tokens, num_prompt_tokens2),
+                    flush=True,
+                )
+
+        # use instruct prompting
+        data_point = dict(context="", instruction=prompt, input="")
+        prompt = self.prompter.generate_prompt(data_point)
+
+        if verbose:
+            print("_call prompt: %s" % prompt, flush=True)
+
+        if self.streaming:
+            text_callback = None
+            if run_manager:
+                text_callback = partial(
+                    run_manager.on_llm_new_token, verbose=self.verbose
+                )
+            # parent handler of streamer expects to see prompt first else output="" and lose if prompt=None in prompter
+            if text_callback:
+                text_callback(prompt)
+            text = ""
+            for token in self.stream(
+                prompt=prompt, stop=stop, run_manager=run_manager
+            ):
+                text_chunk = token["choices"][0]["text"]
+                # self.stream already calls text_callback, so it is not
+                # invoked again here
+                text += text_chunk
+            return text
+        else:
+            params = self._get_parameters(stop)
+            params = {**params, **kwargs}
+            result = self.client(prompt=prompt, **params)
+            return result["choices"][0]["text"]
--- a/apps/language_models/langchain/gpt_langchain.py
+++ b/apps/language_models/langchain/gpt_langchain.py
--- a/apps/language_models/langchain/gradio_utils/grclient.py
+++ b/apps/language_models/langchain/gradio_utils/grclient.py
@@ -0,0 +1,93 @@
+import traceback
+from typing import Callable
+import os
+
+from gradio_client.client import Job
+
+os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
+
+from gradio_client import Client
+
+
+class GradioClient(Client):
+    """
+    Parent class of gradio client
+    To handle automatically refreshing client if detect gradio server changed
+    """
+
+    def __init__(self, *args, **kwargs):
+        # keep the constructor arguments so refresh_client() can build an
+        # identical Client later
+        self.args = args
+        self.kwargs = kwargs
+        super().__init__(*args, **kwargs)
+        # hash reported by the server at connect time; compared on every submit
+        self.server_hash = self.get_server_hash()
+
+    def get_server_hash(self):
+        """
+        Get server hash using super without any refresh action triggered
+        Returns: git hash of gradio server
+        """
+        # calls the parent submit directly so the overridden submit() below
+        # (which itself calls this method) is not re-entered
+        return super().submit(api_name="/system_hash").result()
+
+    def refresh_client_if_should(self):
+        """Rebuild the client if the server hash changed, else only reset the session."""
+        # get current hash in order to update api_name -> fn_index map in case gradio server changed
+        # FIXME: Could add cli api as hash
+        server_hash = self.get_server_hash()
+        if self.server_hash != server_hash:
+            self.refresh_client()
+            self.server_hash = server_hash
+        else:
+            self.reset_session()
+
+    def refresh_client(self):
+        """
+        Ensure every client call is independent
+        Also ensure map between api_name and fn_index is updated in case server changed (e.g. restarted with new code)
+        Returns:
+        """
+        # need session hash to be new every time, to avoid "generator already executing"
+        self.reset_session()
+
+        # build a fresh Client from the saved constructor arguments and copy
+        # all of its instance attributes onto this object
+        client = Client(*self.args, **self.kwargs)
+        for k, v in client.__dict__.items():
+            setattr(self, k, v)
+
+    def submit(
+        self,
+        *args,
+        api_name: str | None = None,
+        fn_index: int | None = None,
+        result_callbacks: Callable | list[Callable] | None = None,
+    ) -> Job:
+        """Submit a job, refreshing the client and retrying once on failure.
+
+        NOTE(review): result_callbacks is accepted but never passed on to the
+        parent submit() -- confirm whether that is intentional.
+        """
+        # Note predict calls submit
+        try:
+            self.refresh_client_if_should()
+            job = super().submit(*args, api_name=api_name, fn_index=fn_index)
+        except Exception as e:
+            print("Hit e=%s" % str(e), flush=True)
+            # force reconfig in case only that
+            self.refresh_client()
+            job = super().submit(*args, api_name=api_name, fn_index=fn_index)
+
+        # see if immediately failed
+        # NOTE(review): reads the private `_exception` attribute of the job's
+        # future right after submission; presumably only failures that have
+        # already happened by this point are seen -- confirm.
+        e = job.future._exception
+        if e is not None:
+            print(
+                "GR job failed: %s %s"
+                % (str(e), "".join(traceback.format_tb(e.__traceback__))),
+                flush=True,
+            )
+            # force reconfig in case only that
+            self.refresh_client()
+            job = super().submit(*args, api_name=api_name, fn_index=fn_index)
+            e2 = job.future._exception
+            if e2 is not None:
+                # second failure is only logged; the failed job is still returned
+                print(
+                    "GR job failed again: %s\n%s"
+                    % (
+                        str(e2),
+                        "".join(traceback.format_tb(e2.__traceback__)),
+                    ),
+                    flush=True,
+                )
+
+        return job
--- a/apps/language_models/langchain/h2oai_pipeline.py
+++ b/apps/language_models/langchain/h2oai_pipeline.py
@@ -0,0 +1,802 @@
+import os
+from apps.stable_diffusion.src.utils.utils import _compile_module
+from io import BytesIO
+import torch_mlir
+
+from transformers import TextGenerationPipeline
+from transformers.pipelines.text_generation import ReturnType
+
+from stopping import get_stopping
+from prompter import Prompter, PromptType
+
+
+from transformers.generation import (
+    GenerationConfig,
+    LogitsProcessorList,
+    StoppingCriteriaList,
+)
+import copy
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM
+import gc
+from pathlib import Path
+from shark.shark_inference import SharkInference
+from shark.shark_downloader import download_public_file
+from shark.shark_importer import import_with_fx
+from apps.stable_diffusion.src import args
+
+# Brevitas
+from typing import List, Tuple
+from brevitas_examples.llm.llm_quant.quantize import quantize_model
+from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
+
+
+# Shape function for the brevitas.matmul_rhs_group_quant op; it is one of the
+# entries handed to torch_mlir.compile through `extra_library`.
+# For a rank-2 rhs, the output keeps the leading dims of lhs and takes its
+# last dim from rhs[0]; any other rank combination raises ValueError.
+def brevitas〇matmul_rhs_group_quant〡shape(
+    lhs: List[int],
+    rhs: List[int],
+    rhs_scale: List[int],
+    rhs_zero_point: List[int],
+    rhs_bit_width: int,
+    rhs_group_size: int,
+) -> List[int]:
+    if len(lhs) == 3 and len(rhs) == 2:
+        # rank-3 lhs with rank-2 rhs
+        return [lhs[0], lhs[1], rhs[0]]
+    elif len(lhs) == 2 and len(rhs) == 2:
+        # rank-2 lhs with rank-2 rhs
+        return [lhs[0], rhs[0]]
+    else:
+        raise ValueError("Input shapes not supported.")
+
+
+# Dtype function for the brevitas.matmul_rhs_group_quant op; it is one of the
+# entries handed to torch_mlir.compile through `extra_library`.
+# Each *_rank_dtype argument is a (rank, dtype) pair; only the lhs pair is
+# consulted.
+def brevitas〇matmul_rhs_group_quant〡dtype(
+    lhs_rank_dtype: Tuple[int, int],
+    rhs_rank_dtype: Tuple[int, int],
+    rhs_scale_rank_dtype: Tuple[int, int],
+    rhs_zero_point_rank_dtype: Tuple[int, int],
+    rhs_bit_width: int,
+    rhs_group_size: int,
+) -> int:
+    # output dtype is the dtype of the lhs float input
+    lhs_rank, lhs_dtype = lhs_rank_dtype
+    return lhs_dtype
+
+
+# Third entry of the brevitas.matmul_rhs_group_quant extra library; the body
+# is intentionally empty and returns None.
+# NOTE(review): presumably its mere presence marks the op as having value
+# semantics for torch-mlir -- confirm against the torch-mlir docs.
+def brevitas〇matmul_rhs_group_quant〡has_value_semantics(
+    lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size
+) -> None:
+    return
+
+
+# Functions describing the custom brevitas.matmul_rhs_group_quant op; passed
+# as `extra_library` to torch_mlir.compile for int4/int8 precision in
+# H2OGPTSHARKModel.get_bytecode.
+brevitas_matmul_rhs_group_quant_library = [
+    brevitas〇matmul_rhs_group_quant〡shape,
+    brevitas〇matmul_rhs_group_quant〡dtype,
+    brevitas〇matmul_rhs_group_quant〡has_value_semantics,
+]
+
+# Device/precision forced onto `args` when not running the DocuChat web UI.
+global_device = "cuda"
+global_precision = "fp16"
+
+# Runs at import time and mutates the shared `args` object.
+if not args.run_docuchat_web:
+    args.device = global_device
+    args.precision = global_precision
+# torch device for tensors built in this module: "cpu" only when args.device
+# is "cpu", otherwise "cuda"
+tensor_device = "cpu" if args.device == "cpu" else "cuda"
+
+
+class H2OGPTModel(torch.nn.Module):
+    """Torch wrapper around the h2oGPT falcon-7b causal LM.
+
+    ``forward`` returns only the logits of the last sequence position.
+    """
+
+    def __init__(self, device, precision):
+        """Load the pretrained model and optionally quantize its weights.
+
+        :param device: "cpu" places the model on CPU in float32; any other
+            value maps it to device index 0
+        :param precision: "fp32" selects float32 weights, "int4"/"int8"
+            additionally apply brevitas weight quantization
+        """
+        super().__init__()
+        model_id = "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3"
+        offload_dir = "offline_folder"
+
+        if precision == "fp32" or device == "cpu":
+            torch_dtype = torch.float32
+        else:
+            torch_dtype = torch.float16
+        if device == "cpu":
+            device_map = {"": "cpu"}
+        else:
+            device_map = {"": 0}
+
+        config = AutoConfig.from_pretrained(
+            model_id,
+            use_auth_token=False,
+            trust_remote_code=True,
+            offload_folder=offload_dir,
+        )
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            config=config,
+            local_files_only=False,
+            torch_dtype=torch_dtype,
+            resume_download=True,
+            use_auth_token=False,
+            trust_remote_code=True,
+            offload_folder=offload_dir,
+            device_map=device_map,
+        )
+
+        if precision not in ["int4", "int8"]:
+            return
+        print("Applying weight quantization..")
+        quantize_model(
+            self.model.transformer.h,
+            dtype=torch.float32,
+            weight_bit_width=4 if precision == "int4" else 8,
+            weight_param_method="stats",
+            weight_scale_precision="float",
+            weight_quant_type="asym",
+            weight_quant_granularity="per_group",
+            weight_group_size=128,
+            quantize_weight_zero_point=False,
+        )
+        print("Weight quantization applied.")
+
+    def forward(self, input_ids, attention_mask):
+        """Run the wrapped model and return ``logits[:, -1, :]``."""
+        output = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=None,
+            use_cache=True,
+            return_dict=True,
+            output_attentions=False,
+            output_hidden_states=False,
+        )
+        last_position_logits = output.logits[:, -1, :]
+        return last_position_logits
+
+
+class H2OGPTSHARKModel(torch.nn.Module):
+    """h2oGPT falcon-7b executed through a compiled SHARK module.
+
+    Construction obtains a ``.vmfb`` for the current ``args.precision`` and
+    ``args.device``: it uses a local file, tries to download one, or compiles
+    one from MLIR (downloaded, or generated from the torch model).
+    """
+
+    def __init__(self):
+        """Locate, download or compile the vmfb and load it as ``self.model``.
+
+        Raises:
+            ValueError: if no vmfb could be downloaded or generated.
+        """
+        super().__init__()
+        model_name = "h2ogpt_falcon_7b"
+        extended_model_name = (
+            model_name + "_" + args.precision + "_" + args.device
+        )
+        vmfb_path = Path(extended_model_name + ".vmfb")
+        mlir_path = Path(model_name + "_" + args.precision + ".mlir")
+        shark_module = None
+
+        need_to_compile = False
+        if not vmfb_path.exists():
+            need_to_compile = True
+            # Downloading VMFB from shark_tank
+            print("Trying to download pre-compiled vmfb from shark tank.")
+            download_public_file(
+                "gs://shark_tank/langchain/" + str(vmfb_path),
+                vmfb_path.absolute(),
+                single_file=True,
+            )
+            if vmfb_path.exists():
+                print(
+                    "Pre-compiled vmfb downloaded from shark tank successfully."
+                )
+                need_to_compile = False
+
+        if need_to_compile:
+            if not mlir_path.exists():
+                print("Trying to download pre-generated mlir from shark tank.")
+                # Downloading MLIR from shark_tank
+                download_public_file(
+                    "gs://shark_tank/langchain/" + str(mlir_path),
+                    mlir_path.absolute(),
+                    single_file=True,
+                )
+            if mlir_path.exists():
+                with open(mlir_path, "rb") as f:
+                    bytecode = f.read()
+            else:
+                # Generating the mlir
+                bytecode = self.get_bytecode(tensor_device, args.precision)
+
+            shark_module = SharkInference(
+                mlir_module=bytecode,
+                device=args.device,
+                mlir_dialect="linalg",
+            )
+            print("[DEBUG] generating vmfb.")
+            shark_module = _compile_module(
+                shark_module, extended_model_name, []
+            )
+            print("Saved newly generated vmfb.")
+
+        if shark_module is None:
+            # nothing was compiled above, so a vmfb must already be on disk
+            if vmfb_path.exists():
+                print("Compiled vmfb found. Loading it from: ", vmfb_path)
+                shark_module = SharkInference(
+                    None, device=args.device, mlir_dialect="linalg"
+                )
+                shark_module.load_module(str(vmfb_path))
+                print("Compiled vmfb loaded successfully.")
+            else:
+                raise ValueError("Unable to download/generate a vmfb.")
+
+        self.model = shark_module
+
+    def get_bytecode(self, device, precision):
+        """Trace the torch model and lower it to linalg MLIR bytecode.
+
+        :param device: torch device the example inputs are placed on
+        :param precision: "int4"/"int8" take the brevitas custom-op lowering
+            path; anything else lowers directly to linalg-on-tensors
+        :return: serialized MLIR bytecode (bytes)
+        """
+        h2ogpt_model = H2OGPTModel(device, precision)
+
+        # fixed-size example inputs: batch 1, sequence length 400
+        compilation_input_ids = torch.randint(
+            low=1, high=10000, size=(1, 400)
+        ).to(device=device)
+        compilation_attention_mask = torch.ones(1, 400, dtype=torch.int64).to(
+            device=device
+        )
+
+        h2ogptCompileInput = (
+            compilation_input_ids,
+            compilation_attention_mask,
+        )
+
+        print("[DEBUG] generating torchscript graph")
+        ts_graph = import_with_fx(
+            h2ogpt_model,
+            h2ogptCompileInput,
+            is_f16=False,
+            precision=precision,
+            f16_input_mask=[False, False],
+            mlir_type="torchscript",
+        )
+        del h2ogpt_model
+        # `src_model` is never assigned by this class, so an unconditional
+        # `del self.src_model` raises AttributeError; only drop it if present.
+        if hasattr(self, "src_model"):
+            del self.src_model
+
+        print("[DEBUG] generating torch mlir")
+        if precision in ["int4", "int8"]:
+            from torch_mlir.compiler_utils import (
+                run_pipeline_with_repro_report,
+            )
+
+            # keep the quantized matmul as a legal custom op at the torch
+            # level, then lower to linalg with an explicit pipeline
+            module = torch_mlir.compile(
+                ts_graph,
+                [*h2ogptCompileInput],
+                output_type=torch_mlir.OutputType.TORCH,
+                backend_legal_ops=["brevitas.matmul_rhs_group_quant"],
+                extra_library=brevitas_matmul_rhs_group_quant_library,
+                use_tracing=False,
+                verbose=False,
+            )
+            print("[DEBUG] converting torch to linalg")
+            run_pipeline_with_repro_report(
+                module,
+                "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+                description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
+            )
+        else:
+            module = torch_mlir.compile(
+                ts_graph,
+                [*h2ogptCompileInput],
+                torch_mlir.OutputType.LINALG_ON_TENSORS,
+                use_tracing=False,
+                verbose=False,
+            )
+        del ts_graph
+
+        print("[DEBUG] converting to bytecode")
+        bytecode_stream = BytesIO()
+        module.operation.write_bytecode(bytecode_stream)
+        bytecode = bytecode_stream.getvalue()
+        del module
+
+        return bytecode
+
+    def forward(self, input_ids, attention_mask):
+        """Run the compiled "forward" function and return a torch tensor.
+
+        Inputs are moved to CPU before the call; the numpy result is wrapped
+        in a tensor and moved to ``tensor_device``.
+        """
+        result = torch.from_numpy(
+            self.model(
+                "forward",
+                (input_ids.to(device="cpu"), attention_mask.to(device="cpu")),
+            )
+        ).to(device=tensor_device)
+        return result
+
+
+# Module-level instance: created at import time, so importing this module
+# runs H2OGPTSHARKModel.__init__ (vmfb lookup, download or compilation).
+h2ogpt_model = H2OGPTSHARKModel()
+
+
+def pad_or_truncate_inputs(
+    input_ids, attention_mask, max_padding_length=400, do_truncation=False
+):
+    """Left-pad or left-truncate the inputs along dim 1.
+
+    :param input_ids: tensor indexed as [:, sequence]
+    :param attention_mask: tensor indexed like ``input_ids``
+    :param max_padding_length: target length along dim 1
+    :param do_truncation: force the truncation path even when the length
+        already equals ``max_padding_length``
+    :return: (input_ids, attention_mask). Shorter inputs are padded on the
+        left (ids with 11, mask with 0); longer inputs keep only their tail;
+        inputs already at the target length are returned unchanged unless
+        ``do_truncation`` is set.
+    """
+    seq_len = input_ids.shape[1]
+    shortfall = max_padding_length - seq_len
+
+    if shortfall > 0:
+        # prepend padding so the real tokens stay at the end
+        id_pad = torch.tensor([[11] * shortfall]).to(device=tensor_device)
+        mask_pad = torch.tensor([[0] * shortfall]).to(device=tensor_device)
+        return (
+            torch.cat([id_pad, input_ids], dim=1),
+            torch.cat([mask_pad, attention_mask], dim=1),
+        )
+
+    if shortfall == 0 and not do_truncation:
+        return input_ids, attention_mask
+
+    # drop the oldest tokens, keeping the last max_padding_length positions
+    overflow = seq_len - max_padding_length
+    return input_ids[:, overflow:], attention_mask[:, overflow:]
+
+
+class H2OTextGenerationPipeline(TextGenerationPipeline):
+    def __init__(
+        self,
+        *args,
+        debug=False,
+        chat=False,
+        stream_output=False,
+        sanitize_bot_response=False,
+        use_prompter=True,
+        prompter=None,
+        prompt_type=None,
+        prompt_dict=None,
+        max_input_tokens=2048 - 256,
+        **kwargs,
+    ):
+        """
+        HF-like text-generation pipeline that adds instruction prompting and stopping support.
+
+        :param args: positional arguments forwarded to the parent pipeline
+        :param debug: forwarded to Prompter when one is created here
+        :param chat: forwarded to Prompter when one is created here
+        :param stream_output: forwarded to Prompter when one is created here
+        :param sanitize_bot_response: stored on the instance
+        :param use_prompter: if False, no prompter is kept and stopping is disabled
+        :param prompter: existing prompter to reuse; must have a prompt_type
+        :param prompt_type: prompt_type, e.g. human_bot, used to build a Prompter
+                            when use_prompter is set and no prompter was passed
+        :param prompt_dict: dict of get_prompt(, return_dict=True) for prompt_type=custom
+        :param max_input_tokens: stored on the instance; not a generate kwarg
+        :param kwargs: keyword arguments forwarded to the parent pipeline
+        """
+        super().__init__(*args, **kwargs)
+        self.prompt_text = None
+        self.use_prompter = use_prompter
+        self.prompt_type = prompt_type
+        self.prompt_dict = prompt_dict
+        self.prompter = prompter
+
+        if not self.use_prompter:
+            # prompting disabled: drop any prompter that was passed in
+            self.prompter = None
+            self.human = None
+            self.bot = None
+            self.can_stop = False
+        else:
+            if self.prompter is None:
+                self.prompter = Prompter(
+                    self.prompt_type,
+                    self.prompt_dict,
+                    debug=debug,
+                    chat=chat,
+                    stream_output=stream_output,
+                )
+            else:
+                assert self.prompter.prompt_type is not None
+            self.human = self.prompter.humanstr
+            self.bot = self.prompter.botstr
+            self.can_stop = True
+
+        self.sanitize_bot_response = sanitize_bot_response
+        # not for generate, so ok that not kwargs
+        self.max_input_tokens = max_input_tokens
+
+    @staticmethod
+    def limit_prompt(prompt_text, tokenizer, max_prompt_length=None):
+        """Shorten ``prompt_text`` from the front so it fits the tokenizer limit.
+
+        Nothing is changed when ``tokenizer`` has no ``model_max_length``.
+        Otherwise the text is first cut to ``10 * limit`` characters, then up
+        to three tokenize-and-cut rounds keep the tail of the text, using the
+        measured characters-per-token ratio.
+
+        :param prompt_text: text to limit
+        :param tokenizer: callable returning a mapping with "input_ids"
+        :param max_prompt_length: optional extra cap on the token limit
+        :return: (prompt_text, num_prompt_tokens); the count is from the last
+            tokenization performed, or None when the limit is unknown
+        """
+        verbose = bool(int(os.getenv("VERBOSE_PIPELINE", "0")))
+
+        if hasattr(tokenizer, "model_max_length"):
+            # model_max_length only defined for generate.py, not raw use of h2oai_pipeline.py
+            model_max_length = tokenizer.model_max_length
+            if max_prompt_length is not None:
+                model_max_length = min(model_max_length, max_prompt_length)
+            # cut at some upper likely limit to avoid excessive tokenization etc
+            # upper bound of 10 chars/token, e.g. special chars sometimes are long
+            if len(prompt_text) > model_max_length * 10:
+                len0 = len(prompt_text)
+                prompt_text = prompt_text[-model_max_length * 10 :]
+                if verbose:
+                    print(
+                        "Cut of input: %s -> %s" % (len0, len(prompt_text)),
+                        flush=True,
+                    )
+        else:
+            # unknown
+            model_max_length = None
+
+        num_prompt_tokens = None
+        if model_max_length is not None:
+            # can't wait for "hole" if not plain prompt_type, since would lose prefix like <human>:
+            # For https://github.com/h2oai/h2ogpt/issues/192
+            for trial in range(0, 3):
+                prompt_tokens = tokenizer(prompt_text)["input_ids"]
+                num_prompt_tokens = len(prompt_tokens)
+                if num_prompt_tokens > model_max_length:
+                    # conservative by using int(); never below 1, because a
+                    # ratio of 0 turns the slice into [-0:] and cuts nothing
+                    chars_per_token = max(
+                        1, int(len(prompt_text) / num_prompt_tokens)
+                    )
+                    # keep tail, where question is if using langchain
+                    prompt_text = prompt_text[
+                        -model_max_length * chars_per_token :
+                    ]
+                    if verbose:
+                        print(
+                            "reducing %s tokens, assuming average of %s chars/token for %s characters"
+                            % (
+                                num_prompt_tokens,
+                                chars_per_token,
+                                len(prompt_text),
+                            ),
+                            flush=True,
+                        )
+                else:
+                    if verbose:
+                        print(
+                            "using %s tokens with %s chars"
+                            % (num_prompt_tokens, len(prompt_text)),
+                            flush=True,
+                        )
+                    break
+
+        return prompt_text, num_prompt_tokens
+
+    def preprocess(
+        self,
+        prompt_text,
+        prefix="",
+        handle_long_generation=None,
+        **generate_kwargs,
+    ):
+        """Length-limit the prompt, apply the prompter template, then defer
+        to the parent pipeline's ``preprocess``.
+
+        The final prompt is stored on ``self.prompt_text``.
+        ``handle_long_generation`` is forwarded exactly as received.
+        """
+        prompt_text, _ = H2OTextGenerationPipeline.limit_prompt(
+            prompt_text, self.tokenizer
+        )
+
+        if self.prompter is not None:
+            # the (possibly shortened) prompt becomes the instruction field
+            prompt_text = self.prompter.generate_prompt(
+                dict(context="", instruction=prompt_text, input="")
+            )
+        self.prompt_text = prompt_text
+
+        return super().preprocess(
+            prompt_text,
+            prefix=prefix,
+            handle_long_generation=handle_long_generation,
+            **generate_kwargs,
+        )
+
+    def postprocess(
+        self,
+        model_outputs,
+        return_type=ReturnType.FULL_TEXT,
+        clean_up_tokenization_spaces=True,
+    ):
+        """Run the parent ``postprocess`` and reduce each record's
+        ``generated_text`` to the response part.
+
+        With ``use_prompter`` set the prompter extracts the response.
+        Otherwise, when both bot and human markers are set, the text between
+        the first bot marker and the following human marker is kept.  With
+        neither, the text is left untouched.  Every prompt/response pair is
+        printed.
+        """
+        records = super().postprocess(
+            model_outputs,
+            return_type=return_type,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+        )
+        for rec in records:
+            generated = rec["generated_text"]
+            if self.use_prompter:
+                outputs = self.prompter.get_response(
+                    generated,
+                    prompt=self.prompt_text,
+                    sanitize_bot_response=self.sanitize_bot_response,
+                )
+            elif self.bot and self.human:
+                after_bot = generated.split(self.bot)[1]
+                outputs = after_bot.split(self.human)[0]
+            else:
+                outputs = generated
+            rec["generated_text"] = outputs
+            print(
+                "prompt: %s\noutputs: %s\n\n" % (self.prompt_text, outputs),
+                flush=True,
+            )
+        return records
+
+    def generate_new_token(self):
+        """Sample one token and advance the generation state on ``self``.
+
+        Reads and updates ``self.input_ids`` and ``self.model_kwargs``, and
+        appends to ``self.truncated_input_ids``.  The forward pass goes
+        through the module-level ``h2ogpt_model``, not ``self.model``.
+
+        Returns:
+            The sampled token tensor (``torch.multinomial`` output with
+            dim 1 squeezed).
+
+        Raises:
+            ValueError: if an EOS id is set but no pad id is.
+        """
+        model_inputs = self.model.prepare_inputs_for_generation(
+            self.input_ids, **self.model_kwargs
+        )
+
+        outputs = h2ogpt_model.forward(
+            model_inputs["input_ids"], model_inputs["attention_mask"]
+        )
+
+        # fp16 outputs are upcast before the logits processing below
+        if args.precision == "fp16":
+            outputs = outputs.to(dtype=torch.float32)
+        # NOTE(review): the forward output is used directly as next-token
+        # logits, presumably shaped (batch, vocab) — confirm against h2ogpt_model
+        next_token_logits = outputs
+
+        # pre-process distribution
+        next_token_scores = self.logits_processor(
+            self.input_ids, next_token_logits
+        )
+        next_token_scores = self.logits_warper(
+            self.input_ids, next_token_scores
+        )
+
+        # sample
+        probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
+
+        next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
+
+        # finished sentences should have their next token be a padding token
+        if self.eos_token_id is not None:
+            if self.pad_token_id is None:
+                raise ValueError(
+                    "If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
+                )
+            next_token = (
+                next_token * self.unfinished_sequences
+                + self.pad_token_id * (1 - self.unfinished_sequences)
+            )
+
+        # append the new token as one more column
+        self.input_ids = torch.cat(
+            [self.input_ids, next_token[:, None]], dim=-1
+        )
+
+        # no KV cache is carried between steps
+        self.model_kwargs["past_key_values"] = None
+        if "attention_mask" in self.model_kwargs:
+            # extend the mask by one attended position for the new token
+            attention_mask = self.model_kwargs["attention_mask"]
+            self.model_kwargs["attention_mask"] = torch.cat(
+                [
+                    attention_mask,
+                    attention_mask.new_ones((attention_mask.shape[0], 1)),
+                ],
+                dim=-1,
+            )
+
+        # Drop the oldest column so the sequence length stays constant after
+        # the append above; the dropped ids are remembered so generate_token
+        # can stitch the full sequence back together.
+        self.truncated_input_ids.append(self.input_ids[:, 0])
+        self.input_ids = self.input_ids[:, 1:]
+        # NOTE(review): unlike the guarded extension above, this indexes
+        # "attention_mask" unconditionally and raises KeyError if it is absent
+        self.model_kwargs["attention_mask"] = self.model_kwargs[
+            "attention_mask"
+        ][:, 1:]
+
+        return next_token
+
+    def generate_token(self, **generate_kwargs):
+        """Run the sampling loop and return the full token sequence.
+
+        Builds a generation config from the model config plus
+        ``generate_kwargs``, prepares the logits processor, warper and
+        stopping criteria on ``self``, then calls ``generate_new_token``
+        until every sequence hit EOS or the stopping criteria fire.  Each
+        decoded token is printed as it is produced, followed by the average
+        time per generated token.
+
+        Args:
+            **generate_kwargs: generation settings and model inputs
+                (``input_ids``, ``attention_mask``, ...).  ``max_time`` is
+                accepted and discarded.
+
+        Returns:
+            ``self.input_ids`` with the columns dropped during generation
+            prepended again.
+        """
+        # max_time is not used by this loop; tolerate callers that omit it
+        generate_kwargs.pop("max_time", None)
+        self.truncated_input_ids = []
+
+        generation_config_ = GenerationConfig.from_model_config(
+            self.model.config
+        )
+        generation_config = copy.deepcopy(generation_config_)
+        # update() returns the kwargs the config did not consume
+        self.model_kwargs = generation_config.update(**generate_kwargs)
+
+        logits_processor = LogitsProcessorList()
+        self.stopping_criteria = (
+            self.stopping_criteria
+            if self.stopping_criteria is not None
+            else StoppingCriteriaList()
+        )
+
+        # the EOS id doubles as the pad id
+        eos_token_id = generation_config.eos_token_id
+        generation_config.pad_token_id = eos_token_id
+
+        (
+            inputs_tensor,
+            model_input_name,
+            self.model_kwargs,
+        ) = self.model._prepare_model_inputs(
+            None, generation_config.bos_token_id, self.model_kwargs
+        )
+
+        self.model_kwargs[
+            "output_attentions"
+        ] = generation_config.output_attentions
+        self.model_kwargs[
+            "output_hidden_states"
+        ] = generation_config.output_hidden_states
+        self.model_kwargs["use_cache"] = generation_config.use_cache
+
+        self.input_ids = (
+            inputs_tensor
+            if model_input_name == "input_ids"
+            else self.model_kwargs.pop("input_ids")
+        )
+
+        input_ids_seq_length = self.input_ids.shape[-1]
+
+        generation_config.max_length = (
+            generation_config.max_new_tokens + input_ids_seq_length
+        )
+
+        self.logits_processor = self.model._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids_seq_length,
+            encoder_input_ids=inputs_tensor,
+            prefix_allowed_tokens_fn=None,
+            logits_processor=logits_processor,
+        )
+
+        self.stopping_criteria = self.model._get_stopping_criteria(
+            generation_config=generation_config,
+            stopping_criteria=self.stopping_criteria,
+        )
+
+        self.logits_warper = self.model._get_logits_warper(generation_config)
+
+        (
+            self.input_ids,
+            self.model_kwargs,
+        ) = self.model._expand_inputs_for_generation(
+            input_ids=self.input_ids,
+            expand_size=generation_config.num_return_sequences,  # 1
+            is_encoder_decoder=self.model.config.is_encoder_decoder,  # False
+            **self.model_kwargs,
+        )
+
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        self.eos_token_id_tensor = (
+            torch.tensor(eos_token_id).to(device=tensor_device)
+            if eos_token_id is not None
+            else None
+        )
+
+        self.pad_token_id = generation_config.pad_token_id
+        self.eos_token_id = eos_token_id
+
+        output_scores = generation_config.output_scores  # False
+        return_dict_in_generate = (
+            generation_config.return_dict_in_generate  # False
+        )
+
+        # init scores tuple
+        self.scores = (
+            () if (return_dict_in_generate and output_scores) else None
+        )
+
+        # keep track of which sequences are already finished
+        self.unfinished_sequences = torch.ones(
+            self.input_ids.shape[0],
+            dtype=torch.long,
+            device=self.input_ids.device,
+        )
+
+        import time
+
+        tokens_generated = 0
+        start = time.time()
+        print("\n")
+
+        while True:
+            next_token = self.generate_new_token()
+            # Count every token, including the one that ends generation, so
+            # the average below never divides by zero.
+            tokens_generated += 1
+            new_word = self.tokenizer.decode(
+                next_token.cpu().numpy(),
+                add_special_tokens=False,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=True,
+            )
+
+            print(f"{new_word}", end="", flush=True)
+
+            # if eos_token was found in one sentence, set sentence to finished
+            if self.eos_token_id_tensor is not None:
+                self.unfinished_sequences = self.unfinished_sequences.mul(
+                    next_token.tile(self.eos_token_id_tensor.shape[0], 1)
+                    .ne(self.eos_token_id_tensor.unsqueeze(1))
+                    .prod(dim=0)
+                )
+
+            # Stop when each sentence is finished or the criteria fire.  This
+            # check is deliberately outside the EOS branch: without an EOS id
+            # the stopping criteria are the only way out of the loop.
+            if self.unfinished_sequences.max() == 0 or self.stopping_criteria(
+                self.input_ids, self.scores
+            ):
+                break
+
+        end = time.time()
+        print(
+            "\n\nTime taken is {:.2f} seconds/token\n".format(
+                (end - start) / tokens_generated
+            )
+        )
+
+        # re-attach the columns generate_new_token dropped from the front
+        self.input_ids = torch.cat(
+            [
+                torch.tensor(self.truncated_input_ids)
+                .to(device=tensor_device)
+                .unsqueeze(dim=0),
+                self.input_ids,
+            ],
+            dim=-1,
+        )
+
+        torch.cuda.empty_cache()
+        gc.collect()
+
+        return self.input_ids
+
+    def _forward(self, model_inputs, **generate_kwargs):
+        """Attach prompt-specific stopping criteria when ``can_stop`` is set,
+        then run this class's own forward implementation rather than the
+        parent pipeline's ``_forward``.
+        """
+        if self.can_stop:
+            generate_kwargs["stopping_criteria"] = get_stopping(
+                self.prompt_type,
+                self.prompt_dict,
+                self.tokenizer,
+                self.device,
+                human=self.human,
+                bot=self.bot,
+                model_max_length=self.tokenizer.model_max_length,
+            )
+        return self.__forward(model_inputs, **generate_kwargs)
+
+    # FIXME: Copy-paste of original _forward, but removed copy.deepcopy()
+    # FIXME: https://github.com/h2oai/h2ogpt/issues/172
+    def __forward(self, model_inputs, **generate_kwargs):
+        """Generate a sequence for ``model_inputs`` via ``generate_token``.
+
+        Args:
+            model_inputs: mapping with ``input_ids``, ``prompt_text`` and
+                optionally ``attention_mask``.  ``prompt_text`` is popped.
+            **generate_kwargs: generation settings.  ``prefix_length`` is
+                popped and, when positive, added to the length limits.  The
+                dict is modified in place (no deepcopy).
+
+        Returns:
+            dict with ``generated_sequence`` (reshaped to
+            ``(in_b, out_b // in_b, ...)``), the padded/truncated
+            ``input_ids``, and ``prompt_text``.
+        """
+        input_ids = model_inputs["input_ids"]
+        attention_mask = model_inputs.get("attention_mask", None)
+        # Allow empty prompts
+        if input_ids.shape[1] == 0:
+            input_ids = None
+            attention_mask = None
+            in_b = 1
+        else:
+            in_b = input_ids.shape[0]
+        prompt_text = model_inputs.pop("prompt_text")
+
+        # If there is a prefix, we may need to adjust the generation length.
+        # The upstream implementation deep-copies generate_kwargs first; that
+        # copy is intentionally omitted here (see FIXME above).
+        prefix_length = generate_kwargs.pop("prefix_length", 0)
+        if prefix_length > 0:
+            has_max_new_tokens = "max_new_tokens" in generate_kwargs or (
+                "generation_config" in generate_kwargs
+                and generate_kwargs["generation_config"].max_new_tokens
+                is not None
+            )
+            if not has_max_new_tokens:
+                generate_kwargs["max_length"] = (
+                    generate_kwargs.get("max_length")
+                    or self.model.config.max_length
+                )
+                generate_kwargs["max_length"] += prefix_length
+            has_min_new_tokens = "min_new_tokens" in generate_kwargs or (
+                "generation_config" in generate_kwargs
+                and generate_kwargs["generation_config"].min_new_tokens
+                is not None
+            )
+            if not has_min_new_tokens and "min_length" in generate_kwargs:
+                generate_kwargs["min_length"] += prefix_length
+
+        # BS x SL
+        # pad or truncate the input_ids and attention_mask
+        max_padding_length = 400
+        input_ids, attention_mask = pad_or_truncate_inputs(
+            input_ids, attention_mask, max_padding_length=max_padding_length
+        )
+        # _forward only supplies stopping criteria when can_stop is set;
+        # generate_token substitutes an empty list when this is None.
+        self.stopping_criteria = generate_kwargs.get("stopping_criteria")
+
+        generated_sequence = self.generate_token(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **generate_kwargs,
+        )
+        out_b = generated_sequence.shape[0]
+        generated_sequence = generated_sequence.reshape(
+            in_b, out_b // in_b, *generated_sequence.shape[1:]
+        )
+        return {
+            "generated_sequence": generated_sequence,
+            "input_ids": input_ids,
+            "prompt_text": prompt_text,
+        }
--- a/apps/language_models/langchain/image_captions.py
+++ b/apps/language_models/langchain/image_captions.py
@@ -0,0 +1,247 @@
+"""
+Based upon ImageCaptionLoader in LangChain version: langchain/document_loaders/image_captions.py
+But accepts preloaded model to avoid slowness in use and CUDA forking issues
+
+Loader that loads image captions
+By default, the loader utilizes the pre-trained BLIP image captioning model.
+https://huggingface.co/Salesforce/blip-image-captioning-base
+
+"""
+from typing import List, Union, Any, Tuple
+
+import requests
+from langchain.docstore.document import Document
+from langchain.document_loaders import ImageCaptionLoader
+
+from utils import get_device, NullContext
+
+import pkg_resources
+
+# Probe for the optional bitsandbytes distribution; the loader only honours
+# load_in_8bit when have_bitsandbytes is True.
+try:
+    assert pkg_resources.get_distribution("bitsandbytes") is not None
+    have_bitsandbytes = True
+except (pkg_resources.DistributionNotFound, AssertionError):
+    # not installed (or lookup returned None): fall back to no 8-bit loading
+    have_bitsandbytes = False
+
+
+class H2OImageCaptionLoader(ImageCaptionLoader):
+    """Loader that loads the captions of an image"""
+
+    def __init__(
+        self,
+        path_images: Union[str, List[str]] = None,
+        blip_processor: str = None,
+        blip_model: str = None,
+        caption_gpu=True,
+        load_in_8bit=True,
+        # True doesn't seem to work, even though https://huggingface.co/Salesforce/blip2-flan-t5-xxl#in-8-bit-precision-int8
+        load_half=False,
+        load_gptq="",
+        use_safetensors=False,
+        min_new_tokens=20,
+        max_tokens=50,
+    ):
+        """Store captioning settings; the model itself is loaded lazily.
+
+        Args:
+            path_images: image path/URL or list of them.
+            blip_processor: processor name; if this or ``blip_model`` is
+                missing, both fall back to the BLIP base captioning model.
+            blip_model: model name, see ``blip_processor``.
+            caption_gpu: allow captioning on GPU.
+            load_in_8bit: request 8-bit loading (only kept when bitsandbytes
+                is installed; only for blip2).
+            load_half: request half precision.
+            load_gptq: stored as-is.
+            use_safetensors: stored as-is.
+            min_new_tokens: tokens added on top of the prompt estimate when
+                computing the minimum generation length.
+            max_tokens: maximum generation length.
+        """
+        # Both names are needed downstream, so default the pair if either is
+        # missing (the processor was previously never checked).
+        if blip_processor is None or blip_model is None:
+            blip_processor = "Salesforce/blip-image-captioning-base"
+            blip_model = "Salesforce/blip-image-captioning-base"
+
+        super().__init__(path_images, blip_processor, blip_model)
+        self.blip_processor = blip_processor
+        self.blip_model = blip_model
+        self.processor = None
+        self.model = None
+        self.caption_gpu = caption_gpu
+        self.context_class = NullContext
+        self.device = "cpu"
+        self.load_in_8bit = (
+            load_in_8bit and have_bitsandbytes
+        )  # only for blip2
+        self.load_half = load_half
+        self.load_gptq = load_gptq
+        self.use_safetensors = use_safetensors
+        self.gpu_id = "auto"
+        # default prompt
+        self.prompt = "image of"
+        self.min_new_tokens = min_new_tokens
+        self.max_tokens = max_tokens
+
+    def set_context(self):
+        """Switch ``self.device``/``self.context_class`` to CUDA when allowed.
+
+        Nothing changes unless ``get_device()`` reports ``"cuda"``,
+        ``caption_gpu`` is set, and torch sees at least one CUDA device.
+        """
+        if get_device() == "cuda" and self.caption_gpu:
+            import torch
+
+            # is_available must be *called*; the bare function object is
+            # always truthy.
+            n_gpus = (
+                torch.cuda.device_count() if torch.cuda.is_available() else 0
+            )
+            if n_gpus > 0:
+                self.context_class = torch.device
+                self.device = "cuda"
+
+    def load_model(self):
+        """Load the BLIP/BLIP-2 processor and model onto the chosen device.
+
+        Sets ``self.processor`` and ``self.model`` and returns ``self``.
+        The BLIP-2 classes are used when the processor name contains
+        "blip2"; otherwise the plain BLIP classes are used.
+
+        Raises:
+            ValueError: if ``transformers`` is not installed.
+        """
+        try:
+            import transformers
+        except ImportError:
+            raise ValueError(
+                "`transformers` package not found, please install with "
+                "`pip install transformers`."
+            )
+        # may switch self.device / self.context_class to CUDA
+        self.set_context()
+        if self.caption_gpu:
+            if self.gpu_id == "auto":
+                # blip2 has issues with multi-GPU.  Error says need to somehow set language model in device map
+                # device_map = 'auto'
+                device_map = {"": 0}
+            else:
+                if self.device == "cuda":
+                    device_map = {"": self.gpu_id}
+                else:
+                    device_map = {"": "cpu"}
+        else:
+            device_map = {"": "cpu"}
+        import torch
+
+        with torch.no_grad():
+            with self.context_class(self.device):
+                # autocast only off-CPU; NullContext otherwise
+                context_class_cast = (
+                    NullContext if self.device == "cpu" else torch.autocast
+                )
+                with context_class_cast(self.device):
+                    if "blip2" in self.blip_processor.lower():
+                        from transformers import (
+                            Blip2Processor,
+                            Blip2ForConditionalGeneration,
+                        )
+
+                        if self.load_half and not self.load_in_8bit:
+                            # NOTE(review): .half() is called on the processor
+                            # as well as the model — confirm the processor
+                            # supports it
+                            self.processor = Blip2Processor.from_pretrained(
+                                self.blip_processor, device_map=device_map
+                            ).half()
+                            self.model = (
+                                Blip2ForConditionalGeneration.from_pretrained(
+                                    self.blip_model, device_map=device_map
+                                ).half()
+                            )
+                        else:
+                            self.processor = Blip2Processor.from_pretrained(
+                                self.blip_processor,
+                                load_in_8bit=self.load_in_8bit,
+                                device_map=device_map,
+                            )
+                            self.model = (
+                                Blip2ForConditionalGeneration.from_pretrained(
+                                    self.blip_model,
+                                    load_in_8bit=self.load_in_8bit,
+                                    device_map=device_map,
+                                )
+                            )
+                    else:
+                        from transformers import (
+                            BlipForConditionalGeneration,
+                            BlipProcessor,
+                        )
+
+                        self.load_half = False  # not supported
+                        if self.caption_gpu:
+                            # NOTE(review): device_map is always a dict by the
+                            # time we get here, so this comparison with "auto"
+                            # never matches in the current code
+                            if device_map == "auto":
+                                # Blip doesn't support device_map='auto'
+                                if self.device == "cuda":
+                                    if self.gpu_id == "auto":
+                                        device_map = {"": 0}
+                                    else:
+                                        device_map = {"": self.gpu_id}
+                                else:
+                                    device_map = {"": "cpu"}
+                        else:
+                            device_map = {"": "cpu"}
+                        self.processor = BlipProcessor.from_pretrained(
+                            self.blip_processor, device_map=device_map
+                        )
+                        self.model = (
+                            BlipForConditionalGeneration.from_pretrained(
+                                self.blip_model, device_map=device_map
+                            )
+                        )
+        return self
+
+    def set_image_paths(self, path_images: Union[str, List[str]]):
+        """
+        Set the images to caption; a single path string is wrapped in a list
+        """
+        single_path = isinstance(path_images, str)
+        self.image_paths = [path_images] if single_path else path_images
+
+    def load(self, prompt=None) -> List[Document]:
+        """Caption every path in ``self.image_paths`` and return one
+        ``Document`` per image, loading the model first if needed."""
+        model_missing = self.processor is None or self.model is None
+        if model_missing:
+            self.load_model()
+
+        def caption_document(path_image):
+            # one image -> Document(caption text, {"image_path": ...})
+            caption, metadata = self._get_captions_and_metadata(
+                model=self.model,
+                processor=self.processor,
+                path_image=path_image,
+                prompt=prompt,
+            )
+            return Document(page_content=caption, metadata=metadata)
+
+        return [
+            caption_document(path_image) for path_image in self.image_paths
+        ]
+
+    def _get_captions_and_metadata(
+        self, model: Any, processor: Any, path_image: str, prompt=None
+    ) -> Tuple[str, dict]:
+        """
+        Helper function for getting the captions and metadata of an image
+
+        Args:
+            model: captioning model exposing ``generate``.
+            processor: matching processor, used to encode and to decode.
+            path_image: local path or http(s) URL of the image.
+            prompt: text prefix for the caption; defaults to ``self.prompt``.
+
+        Returns:
+            ``(caption, metadata)`` where the caption has everything up to
+            and including the prompt stripped, and metadata is
+            ``{"image_path": path_image}``.
+
+        Raises:
+            ValueError: if PIL is missing or the image cannot be read; the
+                underlying error is attached as the cause.
+        """
+        if prompt is None:
+            prompt = self.prompt
+        try:
+            from PIL import Image
+        except ImportError as err:
+            raise ValueError(
+                "`PIL` package not found, please install with `pip install pillow`"
+            ) from err
+
+        try:
+            if path_image.startswith("http://") or path_image.startswith(
+                "https://"
+            ):
+                image = Image.open(
+                    requests.get(path_image, stream=True).raw
+                ).convert("RGB")
+            else:
+                image = Image.open(path_image).convert("RGB")
+        except Exception as err:
+            raise ValueError(
+                f"Could not get image data for {path_image}"
+            ) from err
+
+        import torch
+
+        with torch.no_grad():
+            with self.context_class(self.device):
+                context_class_cast = (
+                    NullContext if self.device == "cpu" else torch.autocast
+                )
+                with context_class_cast(self.device):
+                    if self.load_half:
+                        inputs = processor(
+                            image, prompt, return_tensors="pt"
+                        ).half()
+                    else:
+                        inputs = processor(image, prompt, return_tensors="pt")
+                    # rough prompt token estimate of 4 chars/token
+                    min_length = len(prompt) // 4 + self.min_new_tokens
+                    # Keep the widened limit local: a single long prompt must
+                    # not permanently raise the configured self.max_tokens.
+                    max_length = max(self.max_tokens, min_length)
+                    output = model.generate(
+                        **inputs,
+                        min_length=min_length,
+                        max_length=max_length,
+                    )
+
+                    caption: str = processor.decode(
+                        output[0], skip_special_tokens=True
+                    )
+                    # drop the echoed prompt from the decoded caption
+                    prompti = caption.find(prompt)
+                    if prompti >= 0:
+                        caption = caption[prompti + len(prompt) :]
+                    metadata: dict = {"image_path": path_image}
+
+        return caption, metadata
--- a/apps/language_models/langchain/langchain_requirements.txt
+++ b/apps/language_models/langchain/langchain_requirements.txt
@@ -0,0 +1,120 @@
+# for generate (gradio server) and finetune
+datasets==2.13.0
+sentencepiece==0.1.99
+huggingface_hub==0.16.4
+appdirs==1.4.4
+fire==0.5.0
+docutils==0.20.1
+evaluate==0.4.0
+rouge_score==0.1.2
+sacrebleu==2.3.1
+scikit-learn==1.2.2
+alt-profanity-check==1.2.2
+better-profanity==0.7.0
+numpy==1.24.3
+pandas==2.0.2
+matplotlib==3.7.1
+loralib==0.1.1
+bitsandbytes==0.39.0
+accelerate==0.20.3
+peft==0.4.0
+# 4.31.0+ breaks load_in_8bit=True (https://github.com/huggingface/transformers/issues/25026)
+transformers==4.30.2
+tokenizers==0.13.3
+APScheduler==3.10.1
+
+# optional for generate
+pynvml==11.5.0
+psutil==5.9.5
+boto3==1.26.101
+botocore==1.29.101
+
+# optional for finetune
+tensorboard==2.13.0
+neptune==1.2.0
+
+# for gradio client
+gradio_client==0.2.10
+beautifulsoup4==4.12.2
+markdown==3.4.3
+
+# data and testing
+pytest==7.2.2
+pytest-xdist==3.2.1
+nltk==3.8.1
+textstat==0.7.3
+# pandoc==2.3
+pypandoc==1.11; sys_platform == "darwin" and platform_machine == "arm64"
+pypandoc_binary==1.11; platform_machine == "x86_64"
+pypandoc_binary==1.11; sys_platform == "win32"
+openpyxl==3.1.2
+lm_dataformat==0.0.20
+bioc==2.0
+
+# falcon
+einops==0.6.1
+instructorembedding==1.0.1
+
+# for gpt4all .env file, but avoid worrying about imports
+python-dotenv==1.0.0
+
+text-generation==0.6.0
+# for tokenization when don't have HF tokenizer
+tiktoken==0.4.0
+# optional: for OpenAI endpoint or embeddings (requires key)
+openai==0.27.8
+
+# optional for chat with PDF
+langchain==0.0.202
+pypdf==3.12.2
+# avoid textract, requires old six
+#textract==1.6.5
+
+# for HF embeddings
+sentence_transformers==2.2.2
+
+# local vector db
+chromadb==0.3.25
+# server vector db
+#pymilvus==2.2.8
+
+# weak url support, if can't install opencv etc. If you uncomment this one, then comment out unstructured[local-inference]==0.7.4 below
+# unstructured==0.8.1
+
+# strong support for images
+# Requires on Ubuntu: sudo apt-get install libmagic-dev poppler-utils tesseract-ocr libtesseract-dev libreoffice
+unstructured[local-inference]==0.7.4
+#pdf2image==1.16.3
+#pytesseract==0.3.10
+pillow
+
+pdfminer.six==20221105
+urllib3
+requests_file
+
+#pdf2image==1.16.3
+#pytesseract==0.3.10
+tabulate==0.9.0
+# FYI pandoc already part of requirements.txt
+
+# JSONLoader, but makes some trouble for some users
+# jq==1.4.1
+
+# to check licenses
+# Run: pip-licenses|grep -v 'BSD\|Apache\|MIT'
+pip-licenses==4.3.0
+
+# weaviate vector db
+weaviate-client==3.22.1
+
+gpt4all==1.0.5
+llama-cpp-python==0.1.73
+
+arxiv==1.4.8
+pymupdf==1.22.5 # AGPL license
+# extract-msg==0.41.1  # GPL3
+
+# sometimes unstructured fails, these work in those cases.  See https://github.com/h2oai/h2ogpt/issues/320
+playwright==1.36.0
+# requires Chrome binary to be in path
+selenium==4.10.0
--- a/apps/language_models/langchain/llama_flash_attn_monkey_patch.py
+++ b/apps/language_models/langchain/llama_flash_attn_monkey_patch.py
@@ -0,0 +1,124 @@
+from typing import List, Optional, Tuple
+
+import torch
+
+import transformers
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+
+from einops import rearrange
+
+from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
+from flash_attn.bert_padding import unpad_input, pad_input
+
+
+def forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.Tensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+) -> Tuple[
+    torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]
+]:
+    """Attention forward pass routed through the flash-attention kernel.
+
+    Input shape: Batch x Time x Channel
+    attention_mask: [bsz, q_len], used directly as the key padding mask
+
+    ``past_key_value``, ``output_attentions`` and ``use_cache`` are not
+    supported and trip an assertion when set.
+
+    Returns ``(output, None, None)`` where output is ``o_proj`` applied to the
+    attention result flattened back to [bsz, q_len, nh * hd].
+    """
+    bsz, q_len, _ = hidden_states.size()
+
+    query_states = (
+        self.q_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    key_states = (
+        self.k_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    value_states = (
+        self.v_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    # after view:      [bsz, q_len, nh, hd]
+    # after transpose: [bsz, nh, q_len, hd]
+
+    kv_seq_len = key_states.shape[-2]
+    assert past_key_value is None, "past_key_value is not supported"
+
+    # rotary position embedding applied to queries and keys only
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    query_states, key_states = apply_rotary_pos_emb(
+        query_states, key_states, cos, sin, position_ids
+    )
+    # [bsz, nh, t, hd]
+    assert not output_attentions, "output_attentions is not supported"
+    assert not use_cache, "use_cache is not supported"
+
+    # Flash attention codes from
+    # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
+
+    # transform the data into the format required by flash attention
+    qkv = torch.stack(
+        [query_states, key_states, value_states], dim=2
+    )  # [bsz, nh, 3, q_len, hd]
+    qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
+    # We have disabled _prepare_decoder_attention_mask in LlamaModel
+    # the attention_mask should be the same as the key_padding_mask
+    key_padding_mask = attention_mask
+
+    if key_padding_mask is None:
+        # no padding: flatten batch and sequence, every sequence has q_len tokens
+        qkv = rearrange(qkv, "b s ... -> (b s) ...")
+        max_s = q_len
+        # cumulative sequence boundaries: 0, q_len, 2*q_len, ..., bsz*q_len
+        cu_q_lens = torch.arange(
+            0,
+            (bsz + 1) * q_len,
+            step=q_len,
+            dtype=torch.int32,
+            device=qkv.device,
+        )
+        # NOTE(review): the positional 0.0 is presumably the dropout
+        # probability — confirm against the flash_attn signature
+        output = flash_attn_unpadded_qkvpacked_func(
+            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
+        )
+        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
+    else:
+        # padded batch: strip padded positions, run the kernel on the packed
+        # tokens, then scatter the result back to [bsz, q_len, ...]
+        nheads = qkv.shape[-2]
+        x = rearrange(qkv, "b s three h d -> b s (three h d)")
+        x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
+        x_unpad = rearrange(
+            x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads
+        )
+        output_unpad = flash_attn_unpadded_qkvpacked_func(
+            x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
+        )
+        output = rearrange(
+            pad_input(
+                rearrange(output_unpad, "nnz h d -> nnz (h d)"),
+                indices,
+                bsz,
+                q_len,
+            ),
+            "b s (h d) -> b s h d",
+            h=nheads,
+        )
+    # attention weights and past key/values are never returned
+    return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, None
+
+
+# Flash attention needs the attention mask to equal the key_padding_mask, so
+# LlamaModel's usual mask transformation is replaced by this pass-through.
+def _prepare_decoder_attention_mask(
+    self, attention_mask, input_shape, inputs_embeds, past_key_values_length
+):
+    """Return ``attention_mask`` ([bsz, seq_len]) unchanged; the other
+    arguments are accepted only to match the replaced method's signature."""
+    return attention_mask
+
+
+def replace_llama_attn_with_flash_attn():
+    """Monkey-patch transformers' LLaMa classes: the decoder mask preparation
+    becomes a pass-through and attention uses this module's ``forward``."""
+    print(
+        "Replacing original LLaMa attention with flash attention", flush=True
+    )
+    llama_module = transformers.models.llama.modeling_llama
+    llama_module.LlamaModel._prepare_decoder_attention_mask = (
+        _prepare_decoder_attention_mask
+    )
+    llama_module.LlamaAttention.forward = forward
--- a/apps/language_models/langchain/loaders.py
+++ b/apps/language_models/langchain/loaders.py
@@ -0,0 +1,109 @@
+import functools
+
+
+def get_loaders(model_name, reward_type, llama_type=None, load_gptq=""):
+    """Pick the model-loading callable and tokenizer loader for a model.
+
+    Rules are checked in order and the first match wins: GPTQ, LLaMa, then
+    substring matches on ``model_name``, then the reward-model rule, and
+    finally a generic causal-LM fallback.
+
+    :param model_name: model name, matched by substring to choose a loader
+    :param reward_type: if truthy (and no earlier rule matched), select the
+        sequence-classification loader
+    :param llama_type: True/False forces/forbids the LLaMa loader; None means
+        infer it from "llama" appearing in ``model_name`` (case-insensitive)
+    :param load_gptq: if truthy, select the AutoGPTQ quantized loader
+        regardless of ``model_name``
+    :return: tuple ``(model_loader, tokenizer_loader)``. ``model_loader`` is a
+        callable; ``tokenizer_loader`` is a tokenizer class, except for the
+        samsum summarization rule where the pair is
+        ``(transformers.pipeline, "summarization")``
+    """
+    # NOTE: Some models need specific new prompt_type
+    # E.g. t5_xxl_true_nli_mixture has input format: "premise: PREMISE_TEXT hypothesis: HYPOTHESIS_TEXT".)
+    if load_gptq:
+        from transformers import AutoTokenizer
+        from auto_gptq import AutoGPTQForCausalLM
+
+        # Return the partial itself so the bound options actually reach
+        # from_quantized; previously the partial was built and discarded.
+        model_loader = functools.partial(
+            AutoGPTQForCausalLM.from_quantized,
+            quantize_config=None,
+            use_triton=False,
+        )
+        return model_loader, AutoTokenizer
+
+    if llama_type is None:
+        llama_type = "llama" in model_name.lower()
+    if llama_type:
+        from transformers import LlamaForCausalLM, LlamaTokenizer
+
+        return LlamaForCausalLM.from_pretrained, LlamaTokenizer
+
+    name_lower = model_name.lower()
+
+    # "distilgpt2" must be tested before "gpt2", which it contains.
+    if "distilgpt2" in name_lower:
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
+        return AutoModelForCausalLM.from_pretrained, AutoTokenizer
+
+    if "gpt2" in name_lower:
+        from transformers import GPT2LMHeadModel, GPT2Tokenizer
+
+        return GPT2LMHeadModel.from_pretrained, GPT2Tokenizer
+
+    if "mbart-" in name_lower:
+        from transformers import (
+            MBartForConditionalGeneration,
+            MBart50TokenizerFast,
+        )
+
+        return (
+            MBartForConditionalGeneration.from_pretrained,
+            MBart50TokenizerFast,
+        )
+
+    if "t5" == name_lower or "t5-" in name_lower or "flan-" in name_lower:
+        from transformers import AutoTokenizer, T5ForConditionalGeneration
+
+        return T5ForConditionalGeneration.from_pretrained, AutoTokenizer
+
+    # NOTE(review): unlike the rules above, the next two rules match the
+    # name case-sensitively; kept as-is to preserve existing behavior.
+    if "bigbird" in model_name:
+        from transformers import (
+            BigBirdPegasusForConditionalGeneration,
+            AutoTokenizer,
+        )
+
+        return (
+            BigBirdPegasusForConditionalGeneration.from_pretrained,
+            AutoTokenizer,
+        )
+
+    # NOTE(review): a lowercase "flan-t5-base-samsum" name is already caught
+    # by the "flan-" rule above, so only other spellings can reach this rule.
+    if (
+        "bart-large-cnn-samsum" in model_name
+        or "flan-t5-base-samsum" in model_name
+    ):
+        from transformers import pipeline
+
+        return pipeline, "summarization"
+
+    if reward_type or "OpenAssistant/reward-model".lower() in name_lower:
+        from transformers import (
+            AutoModelForSequenceClassification,
+            AutoTokenizer,
+        )
+
+        return (
+            AutoModelForSequenceClassification.from_pretrained,
+            AutoTokenizer,
+        )
+
+    # Generic fallback: plain causal language model.
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    return AutoModelForCausalLM.from_pretrained, AutoTokenizer
+
+
+def get_tokenizer(
+    tokenizer_loader,
+    tokenizer_base_model,
+    local_files_only,
+    resume_download,
+    use_auth_token,
+):
+    """Load a tokenizer and configure it for left-padded batched inference.
+
+    :param tokenizer_loader: object exposing ``from_pretrained``
+    :param tokenizer_base_model: first positional argument for ``from_pretrained``
+    :param local_files_only: forwarded to ``from_pretrained``
+    :param resume_download: forwarded to ``from_pretrained``
+    :param use_auth_token: forwarded to ``from_pretrained``
+    :return: the loaded tokenizer with ``pad_token_id`` set to 0 and
+        ``padding_side`` set to "left"
+    """
+    load_options = dict(
+        local_files_only=local_files_only,
+        resume_download=resume_download,
+        use_auth_token=use_auth_token,
+        padding_side="left",
+    )
+    tokenizer = tokenizer_loader.from_pretrained(
+        tokenizer_base_model, **load_options
+    )
+
+    # Pad with id 0 so that padding is different from the eos token.
+    tokenizer.pad_token_id = 0
+    # When generating, the logits of the right-most token predict the next
+    # token, so padding has to go on the left (this allows batched inference),
+    # e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
+    tokenizer.padding_side = "left"
+
+    return tokenizer
--- a/apps/language_models/langchain/make_db.py
+++ b/apps/language_models/langchain/make_db.py
@@ -0,0 +1,203 @@
+import os
+
+from gpt_langchain import (
+    path_to_docs,
+    get_some_dbs_from_hf,
+    all_db_zips,
+    some_db_zips,
+    create_or_update_db,
+)
+from utils import get_ngpus_vis
+
+
+def glob_to_db(
+    user_path,
+    chunk=True,
+    chunk_size=512,
+    verbose=False,
+    fail_any_exception=False,
+    n_jobs=-1,
+    url=None,
+    enable_captions=True,
+    captions_model=None,
+    caption_loader=None,
+    enable_ocr=False,
+):
+    """Collect documents for the db by delegating to ``path_to_docs``.
+
+    Every argument is forwarded unchanged (``user_path`` positionally, the
+    rest by keyword) and the result of ``path_to_docs`` is returned as-is.
+    """
+    forwarded = dict(
+        verbose=verbose,
+        fail_any_exception=fail_any_exception,
+        n_jobs=n_jobs,
+        chunk=chunk,
+        chunk_size=chunk_size,
+        url=url,
+        enable_captions=enable_captions,
+        captions_model=captions_model,
+        caption_loader=caption_loader,
+        enable_ocr=enable_ocr,
+    )
+    return path_to_docs(user_path, **forwarded)
+
+
+def make_db_main(
+    use_openai_embedding: bool = False,
+    hf_embedding_model: str = None,
+    persist_directory: str = "db_dir_UserData",
+    user_path: str = "user_path",
+    url: str = None,
+    add_if_exists: bool = True,
+    collection_name: str = "UserData",
+    verbose: bool = False,
+    chunk: bool = True,
+    chunk_size: int = 512,
+    fail_any_exception: bool = False,
+    download_all: bool = False,
+    download_some: bool = False,
+    download_one: str = None,
+    download_dest: str = "./",
+    n_jobs: int = -1,
+    enable_captions: bool = True,
+    captions_model: str = "Salesforce/blip-image-captioning-base",
+    pre_load_caption_model: bool = False,
+    caption_gpu: bool = True,
+    enable_ocr: bool = False,
+    db_type: str = "chroma",
+):
+    """
+    Build (or extend) a document database, or download prebuilt example databases.
+
+    # To make UserData db for generate.py, put pdfs, etc. into path user_path and run:
+    python make_db.py
+
+    # once db is made, can use in generate.py like:
+
+    python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6_9b --langchain_mode=UserData
+
+    or zip-up the db_dir_UserData and share:
+
+    zip -r db_dir_UserData.zip db_dir_UserData
+
+    # To get all db files (except large wiki_full) do:
+    python make_db.py --download_some=True
+
+    # To get a single db file from HF:
+    python make_db.py --download_one=db_dir_DriverlessAI_docs.zip
+
+    :param use_openai_embedding: Whether to use OpenAI embedding
+    :param hf_embedding_model: HF embedding model to use. Like generate.py, uses 'hkunlp/instructor-large' if have GPUs, else "sentence-transformers/all-MiniLM-L6-v2"
+    :param persist_directory: where to persist db
+    :param user_path: where to pull documents from (None means url is not None).  Must be an existing directory when url is not given.
+    :param url: url to generate documents from (None means user_path is not None)
+    :param add_if_exists: Add to db if already exists, but will not add duplicate sources
+    :param collection_name: Collection name for new db if not adding
+    :param verbose: whether to show verbose messages
+    :param chunk: whether to chunk data
+    :param chunk_size: chunk size for chunking
+    :param fail_any_exception: whether to fail if any exception hit during ingestion of files
+    :param download_all: whether to download all (including 23GB Wikipedia) example databases from h2o.ai HF
+    :param download_some: whether to download some small example databases from h2o.ai HF
+    :param download_one: name of one example database zip to download from h2o.ai HF (e.g. db_dir_DriverlessAI_docs.zip)
+    :param download_dest: Destination for downloads
+    :param n_jobs: Number of cores to use for ingesting multiple files
+    :param enable_captions: Whether to enable captions on images
+    :param captions_model: See generate.py
+    :param pre_load_caption_model: See generate.py
+    :param caption_gpu: Caption images on GPU if present
+    :param enable_ocr: Whether to enable OCR on images
+    :param db_type: Type of db to create. Currently only 'chroma' and 'weaviate' are supported.
+    :return: tuple (db, collection_name).  db is None when one of the download_* options was used, since no db is built then.
+    """
+    db = None
+
+    # match behavior of main() in generate.py for non-HF case
+    n_gpus = get_ngpus_vis()
+    if n_gpus == 0:
+        if hf_embedding_model is None:
+            # if no GPUs, use simpler embedding model to avoid cost in time
+            hf_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
+    else:
+        if hf_embedding_model is None:
+            # if still None, then set default
+            hf_embedding_model = "hkunlp/instructor-large"
+
+    # Download-only modes: fetch prebuilt databases and return early without
+    # ingesting anything (db stays None).  First matching option wins.
+    if download_all:
+        print("Downloading all (and unzipping): %s" % all_db_zips, flush=True)
+        get_some_dbs_from_hf(download_dest, db_zips=all_db_zips)
+        if verbose:
+            print("DONE", flush=True)
+        return db, collection_name
+    elif download_some:
+        print(
+            "Downloading some (and unzipping): %s" % some_db_zips, flush=True
+        )
+        get_some_dbs_from_hf(download_dest, db_zips=some_db_zips)
+        if verbose:
+            print("DONE", flush=True)
+        return db, collection_name
+    elif download_one:
+        print("Downloading %s (and unzipping)" % download_one, flush=True)
+        # Single entry in the same 3-item form that is passed for the other download modes.
+        get_some_dbs_from_hf(
+            download_dest, db_zips=[[download_one, "", "Unknown License"]]
+        )
+        if verbose:
+            print("DONE", flush=True)
+        return db, collection_name
+
+    # caption_loader ends up as one of: a preloaded loader object,
+    # the string "gpu"/"cpu", or False when captions are disabled.
+    if enable_captions and pre_load_caption_model:
+        # preload, else can be too slow or if on GPU have cuda context issues
+        # Inside ingestion, this will disable parallel loading of multiple other kinds of docs
+        # However, if have many images, all those images will be handled more quickly by preloaded model on GPU
+        from image_captions import H2OImageCaptionLoader
+
+        caption_loader = H2OImageCaptionLoader(
+            None,
+            blip_model=captions_model,
+            blip_processor=captions_model,
+            caption_gpu=caption_gpu,
+        ).load_model()
+    else:
+        if enable_captions:
+            caption_loader = "gpu" if caption_gpu else "cpu"
+        else:
+            caption_loader = False
+
+    if verbose:
+        print("Getting sources", flush=True)
+    # NOTE(review): validation is done with assert, which is skipped under python -O
+    assert (
+        user_path is not None or url is not None
+    ), "Can't have both user_path and url as None"
+    if not url:
+        assert os.path.isdir(user_path), (
+            "user_path=%s does not exist" % user_path
+        )
+    sources = glob_to_db(
+        user_path,
+        chunk=chunk,
+        chunk_size=chunk_size,
+        verbose=verbose,
+        fail_any_exception=fail_any_exception,
+        n_jobs=n_jobs,
+        url=url,
+        enable_captions=enable_captions,
+        captions_model=captions_model,
+        caption_loader=caption_loader,
+        enable_ocr=enable_ocr,
+    )
+    # Report sources whose metadata carries a truthy "exception" entry, then
+    # drop every source that has an "exception" key at all before building the db.
+    exceptions = [x for x in sources if x.metadata.get("exception")]
+    print("Exceptions: %s" % exceptions, flush=True)
+    sources = [x for x in sources if "exception" not in x.metadata]
+
+    assert len(sources) > 0, "No sources found"
+    db = create_or_update_db(
+        db_type,
+        persist_directory,
+        collection_name,
+        sources,
+        use_openai_embedding,
+        add_if_exists,
+        verbose,
+        hf_embedding_model,
+    )
+
+    assert db is not None
+    if verbose:
+        print("DONE", flush=True)
+    return db, collection_name
--- a/apps/language_models/langchain/prompter.py
+++ b/apps/language_models/langchain/prompter.py
--- a/apps/language_models/langchain/read_wiki_full.py
+++ b/apps/language_models/langchain/read_wiki_full.py
@@ -0,0 +1,403 @@
+"""Load Data from a MediaWiki dump xml."""
+import ast
+import glob
+import pickle
+import uuid
+from typing import List, Optional
+import os
+import bz2
+import csv
+import numpy as np
+import pandas as pd
+import pytest
+from matplotlib import pyplot as plt
+
+from langchain.docstore.document import Document
+from langchain.document_loaders import MWDumpLoader
+
+# Directory where the downloaded wiki files exist, to be processed.
+# NOTE(review): hard-coded absolute path; it has to be edited to match the
+# machine on which this module is run.
+root_path = "/data/jon/h2o-llm"
+
+
+def unescape(x):
+    """Best-effort decoding of a quoted or backslash-escaped value.
+
+    First tries to evaluate ``x`` as a Python literal.  If that fails, tries
+    to drop non-ASCII characters and interpret backslash escapes.  If that
+    fails too, ``x`` is returned unchanged.
+
+    :param x: value to decode (normally a str)
+    :return: the decoded value, or ``x`` itself when nothing applies
+    """
+    # "except Exception" keeps the deliberate best-effort behavior but, unlike
+    # a bare "except:", lets KeyboardInterrupt / SystemExit propagate.
+    try:
+        return ast.literal_eval(x)
+    except Exception:
+        pass
+    try:
+        return x.encode("ascii", "ignore").decode("unicode_escape")
+    except Exception:
+        return x
+
+
+def get_views():
+    """Load the page-view table as a ``{title: views}`` dict.
+
+    Reads "wiki_page_views_more_5000month.csv" from the current directory,
+    passes every title through ``unescape`` and registers each title both in
+    its unescaped form and with "_" replaced by " ".
+    """
+    # views = pd.read_csv('wiki_page_views_more_1000month.csv')
+    table = pd.read_csv("wiki_page_views_more_5000month.csv")
+    table.index = table["title"]
+    raw_counts = table["views"].to_dict()
+    views = {
+        str(unescape(str(title))): count
+        for title, count in raw_counts.items()
+    }
+    # views has _ but pages has " ", so add the space-separated spelling too
+    with_spaces = {
+        title.replace("_", " "): count for title, count in views.items()
+    }
+    views.update(with_spaces)
+    return views
+
+
+class MWDumpDirectLoader(MWDumpLoader):
+    """MWDumpLoader variant that is given the dump XML directly.
+
+    ``load`` parses ``self.data`` with ``mwxml.Dump.from_page_xml`` and can
+    filter pages by the shared page-view table and by title length.
+    """
+
+    def __init__(
+        self,
+        data: str,
+        encoding: Optional[str] = "utf8",
+        title_words_limit=None,
+        use_views=True,
+        verbose=True,
+    ):
+        """Initialize with the dump XML content.
+
+        :param data: page XML handed to ``mwxml.Dump.from_page_xml`` in ``load``
+        :param encoding: stored on the instance; not otherwise used in this class
+        :param title_words_limit: if not None, skip titles with more words than this
+        :param use_views: if True, keep only pages present in the module-level ``global_views``
+        :param verbose: print a line for every kept or skipped page
+        """
+        # NOTE(review): the parent __init__ is not called here.
+        self.data = data
+        self.encoding = encoding
+        self.title_words_limit = title_words_limit
+        self.verbose = verbose
+        if use_views:
+            # self.views = get_views()
+            # faster to use global shared values
+            self.views = global_views
+        else:
+            self.views = None
+
+    def load(self) -> List[Document]:
+        """Parse ``self.data`` and return one Document per kept revision."""
+        import mwparserfromhell
+        import mwxml
+
+        dump = mwxml.Dump.from_page_xml(self.data)
+
+        docs = []
+
+        for page in dump.pages:
+            # Page-view filter: pages missing from the views table are dropped.
+            if self.views is not None and page.title not in self.views:
+                if self.verbose:
+                    print("Skipped %s low views" % page.title, flush=True)
+                continue
+            for revision in page:
+                # Title-length filter, counting words split on "_" and " ".
+                if self.title_words_limit is not None:
+                    num_words = len(" ".join(page.title.split("_")).split(" "))
+                    if num_words > self.title_words_limit:
+                        if self.verbose:
+                            print("Skipped %s" % page.title, flush=True)
+                        continue
+                if self.verbose:
+                    if self.views is not None:
+                        print(
+                            "Kept %s views: %s"
+                            % (page.title, self.views[page.title]),
+                            flush=True,
+                        )
+                    else:
+                        print("Kept %s" % page.title, flush=True)
+
+                # Strip wiki markup down to plain text.
+                code = mwparserfromhell.parse(revision.text)
+                text = code.strip_code(
+                    normalize=True, collapse=True, keep_template_params=False
+                )
+                title_url = str(page.title).replace(" ", "_")
+                # views is -1 when the page-view table is not in use.
+                metadata = dict(
+                    title=page.title,
+                    source="https://en.wikipedia.org/wiki/" + title_url,
+                    id=page.id,
+                    redirect=page.redirect,
+                    views=self.views[page.title]
+                    if self.views is not None
+                    else -1,
+                )
+                # Drop entries whose value is None.
+                metadata = {k: v for k, v in metadata.items() if v is not None}
+                docs.append(Document(page_content=text, metadata=metadata))
+
+        return docs
+
+
+def search_index(search_term, index_filename):
+    """Find the byte range of the index block that contains ``search_term``.
+
+    The index is read as ":"-delimited rows whose first field is a byte
+    offset and whose third field onwards is the page title.
+
+    :param search_term: exact page title to look for
+    :param index_filename: path of the (uncompressed) index text file
+    :return: tuple ``(start_byte, data_length)``.  ``(0, 0)`` when the title
+        is not found; ``data_length`` is 0 when the title is in the last
+        block, because no later offset exists to measure against
+    """
+    start_byte = 0
+    data_length = 0
+    found = False
+    # "with" closes the file even when a row fails to parse.
+    with open(index_filename, "r") as index_file:
+        for row in csv.reader(index_file, delimiter=":"):
+            if not found:
+                # A title may itself contain ":" and is then spread over
+                # several fields, so re-join it before comparing.
+                if search_term == ":".join(row[2:]):
+                    start_byte = int(row[0])
+                    found = True
+            elif int(row[0]) != start_byte:
+                # First row with a different offset marks the next block.
+                data_length = int(row[0]) - start_byte
+                break
+    return start_byte, data_length
+
+
+def get_start_bytes(index_filename):
+    """Return the sorted, de-duplicated byte offsets listed in the index.
+
+    :param index_filename: path of the ":"-delimited index text file whose
+        first field on every row is an integer byte offset
+    :return: ascending list of distinct offsets
+    """
+    # "with" closes the file even when a row fails to parse.
+    with open(index_filename, "r") as index_file:
+        start_bytes = {
+            int(row[0]) for row in csv.reader(index_file, delimiter=":")
+        }
+    return sorted(start_bytes)
+
+
+def get_wiki_filenames():
+    """Return ``(index_filename, wiki_filename)`` for the 20230401 enwiki dump.
+
+    Both paths live in the "enwiki-20230401-pages-articles-multistream"
+    directory under ``root_path``.
+    """
+    # requires
+    # wget http://ftp.acc.umu.se/mirror/wikimedia.org/dumps/enwiki/20230401/enwiki-20230401-pages-articles-multistream-index.txt.bz2
+    dump_dir = os.path.join(
+        root_path, "enwiki-20230401-pages-articles-multistream"
+    )
+    return (
+        os.path.join(
+            dump_dir, "enwiki-20230401-pages-articles-multistream-index.txt"
+        ),
+        os.path.join(
+            dump_dir, "enwiki-20230401-pages-articles-multistream.xml.bz2"
+        ),
+    )
+
+
+def get_documents_by_search_term(search_term):
+    """Load the documents of the dump block that holds ``search_term``.
+
+    Looks the title up in the index, reads and bz2-decompresses the
+    matching byte range of the dump, and parses it with MWDumpDirectLoader.
+    """
+    index_filename, wiki_filename = get_wiki_filenames()
+    start_byte, data_length = search_index(search_term, index_filename)
+    with open(wiki_filename, "rb") as wiki_file:
+        wiki_file.seek(start_byte)
+        compressed = wiki_file.read(data_length)
+        xml_bytes = bz2.BZ2Decompressor().decompress(compressed)
+
+    return MWDumpDirectLoader(xml_bytes.decode()).load()
+
+
+def get_one_chunk(
+    wiki_filename,
+    start_byte,
+    end_byte,
+    return_file=True,
+    title_words_limit=None,
+    use_views=True,
+):
+    """Decompress one block of the dump and load its documents.
+
+    :param wiki_filename: path of the bz2 dump
+    :param start_byte: offset at which the block starts
+    :param end_byte: offset at which the block ends (exclusive)
+    :param return_file: if True, pickle the documents into a uniquely named
+        file under "temp_wiki" and return that file name instead
+    :param title_words_limit: forwarded to MWDumpDirectLoader
+    :param use_views: forwarded to MWDumpDirectLoader
+    :return: pickle file name when ``return_file`` is True, else the documents
+    """
+    data_length = end_byte - start_byte
+    with open(wiki_filename, "rb") as wiki_file:
+        wiki_file.seek(start_byte)
+        compressed = wiki_file.read(data_length)
+        xml_bytes = bz2.BZ2Decompressor().decompress(compressed)
+
+    chunk_docs = MWDumpDirectLoader(
+        xml_bytes.decode(),
+        title_words_limit=title_words_limit,
+        use_views=use_views,
+    ).load()
+    if not return_file:
+        return chunk_docs
+
+    base_tmp = "temp_wiki"
+    os.makedirs(base_tmp, exist_ok=True)
+    filename = os.path.join(base_tmp, str(uuid.uuid4()) + ".tmp.pickle")
+    with open(filename, "wb") as f:
+        pickle.dump(chunk_docs, f)
+    return filename
+
+
+from joblib import Parallel, delayed
+
+# Page-view table loaded once when this module is imported; every
+# MWDumpDirectLoader created with use_views=True shares this dict instead of
+# calling get_views() again.
+global_views = get_views()
+
+
+def get_all_documents(small_test=2, n_jobs=None, use_views=True):
+    """Load documents from the wiki dump, one parallel job per index block.
+
+    :param small_test: if truthy, only the first ``small_test`` blocks are
+        processed; if falsy, all blocks are processed
+    :param n_jobs: number of worker processes; None picks 5 for a small test
+        and a quarter of the CPUs (at least 1) otherwise
+    :param use_views: forwarded to ``get_one_chunk``
+    :return: flat list with the documents of all processed blocks
+    """
+    print("DO get all wiki docs: %s" % small_test, flush=True)
+    index_filename, wiki_filename = get_wiki_filenames()
+    start_bytes = get_start_bytes(index_filename)
+    # Block i spans [start_bytes[i], start_bytes[i + 1]); the last offset has
+    # no successor, so it only serves as an end marker.
+    end_bytes = start_bytes[1:]
+    start_bytes = start_bytes[:-1]
+
+    if small_test:
+        start_bytes = start_bytes[:small_test]
+        end_bytes = end_bytes[:small_test]
+        if n_jobs is None:
+            n_jobs = 5
+    else:
+        if n_jobs is None:
+            # os.cpu_count() may return None and "// 4" yields 0 below four
+            # cores; joblib rejects n_jobs == 0, so clamp to at least 1.
+            n_jobs = max(1, (os.cpu_count() or 1) // 4)
+
+    # default loky backend leads to name space conflict problems
+    return_file = True  # large return from joblib hangs
+    documents = Parallel(n_jobs=n_jobs, verbose=10, backend="multiprocessing")(
+        delayed(get_one_chunk)(
+            wiki_filename,
+            start_byte,
+            end_byte,
+            return_file=return_file,
+            use_views=use_views,
+        )
+        for start_byte, end_byte in zip(start_bytes, end_bytes)
+    )
+    if return_file:
+        # then documents really are files: load each pickle, then delete it
+        files = documents.copy()
+        documents = []
+        for fil in files:
+            with open(fil, "rb") as f:
+                documents.extend(pickle.load(f))
+            os.remove(fil)
+    else:
+        from functools import reduce
+        from operator import concat
+
+        documents = reduce(concat, documents)
+    assert isinstance(documents, list)
+
+    print("DONE get all wiki docs", flush=True)
+    return documents
+
+
+def test_by_search_term():
+    """Each sample title must resolve to a dump block of 100 documents."""
+    for search_term in ("Apollo", "Abstract (law)", "Artificial languages"):
+        found_docs = get_documents_by_search_term(search_term)
+        assert len(found_docs) == 100
+
+
+def test_start_bytes():
+    """The index of the 20230401 dump must list 227850 distinct offsets."""
+    index_filename, _ = get_wiki_filenames()
+    offsets = get_start_bytes(index_filename)
+    assert len(offsets) == 227850
+
+
+def test_get_all_documents():
+    """Check document counts for the first 20 blocks, without and with views."""
+    small_test = 20  # 227850
+    # os.cpu_count() may return None and "// 4" yields 0 below four cores;
+    # joblib rejects n_jobs == 0, so clamp to at least 1.
+    n_jobs = max(1, (os.cpu_count() or 1) // 4)
+
+    # Without the page-view filter every block yields 100 documents.
+    assert (
+        len(
+            get_all_documents(
+                small_test=small_test, n_jobs=n_jobs, use_views=False
+            )
+        )
+        == small_test * 100
+    )
+
+    # With the page-view filter only the pages in the views table remain.
+    assert (
+        len(
+            get_all_documents(
+                small_test=small_test, n_jobs=n_jobs, use_views=True
+            )
+        )
+        == 429
+    )
+
+
+def get_one_pageviews(fil):
+    """Extract the "en" rows of one page-view file into a temporary CSV.
+
+    :param fil: space-separated page-view file with four columns
+    :return: name of the CSV written under "temp_wiki_pageviews", holding the
+        title (as index) and the views of the "en" rows
+    """
+    column_names = ["region", "title", "views", "foo"]
+    frame = pd.read_csv(
+        fil,
+        sep=" ",
+        header=None,
+        names=column_names,
+        quoting=csv.QUOTE_NONE,
+    )
+    frame.index = frame["title"]
+    frame = frame[frame["region"] == "en"]
+    # "title" is already the index, so only "views" has to stay as a column
+    frame = frame.drop(["region", "foo", "title"], axis=1)
+
+    base_tmp = "temp_wiki_pageviews"
+    os.makedirs(base_tmp, exist_ok=True)
+    filename = os.path.join(base_tmp, str(uuid.uuid4()) + ".tmp.csv")
+    frame.to_csv(filename, index=True)
+    return filename
+
+
+def test_agg_pageviews(gen_files=False):
+    """Aggregate per-file page views into "wiki_page_views.csv".
+
+    :param gen_files: if True, first convert every downloaded
+        "pageviews*.gz" file with ``get_one_pageviews`` (in parallel);
+        if False, reuse the CSVs produced by an earlier run
+    """
+    if gen_files:
+        path = os.path.join(
+            root_path,
+            "wiki_pageviews/dumps.wikimedia.org/other/pageviews/2023/2023-04",
+        )
+        files = glob.glob(os.path.join(path, "pageviews*.gz"))
+        # files = files[:2]  # test
+        # os.cpu_count() may return None and "// 2" yields 0 on one core;
+        # joblib rejects n_jobs == 0, so clamp to at least 1.
+        n_jobs = max(1, (os.cpu_count() or 1) // 2)
+        csv_files = Parallel(
+            n_jobs=n_jobs, verbose=10, backend="multiprocessing"
+        )(delayed(get_one_pageviews)(fil) for fil in files)
+    else:
+        # to continue without redoing above
+        # NOTE(review): get_one_pageviews writes into "temp_wiki_pageviews"
+        # relative to the current directory, while this branch looks under
+        # root_path; confirm that both refer to the same directory.
+        csv_files = glob.glob(
+            os.path.join(root_path, "temp_wiki_pageviews/*.csv")
+        )
+
+    df_list = []
+    for csv_file in csv_files:
+        print(csv_file)
+        df_list.append(pd.read_csv(csv_file))
+    df = pd.concat(df_list, axis=0)
+    # The same title appears in many files; add its views up.
+    df = df.groupby("title")["views"].sum().reset_index()
+    df.to_csv("wiki_page_views.csv", index=True)
+
+
+def test_reduce_pageview():
+    """Reduce "wiki_page_views.csv" to the pages above 5000 views.
+
+    Writes "wiki_page_views_more_5000month.csv" and saves a log-scale
+    histogram PNG next to both the input and the reduced CSV.
+    """
+
+    def save_histogram(frame, csv_name):
+        # Histogram of the views, titled with their mean and median, saved
+        # under the CSV's name with a .png extension.
+        plt.hist(frame["views"], bins=100, log=True)
+        views_avg = np.mean(frame["views"])
+        views_median = np.median(frame["views"])
+        plt.title("Views avg: %s median: %s" % (views_avg, views_median))
+        plt.savefig(csv_name.replace(".csv", ".png"))
+        plt.close()
+
+    filename = "wiki_page_views.csv"
+    df = pd.read_csv(filename)
+    df = df[df["views"] < 1e7]
+    save_histogram(df, filename)
+
+    views_limit = 5000
+    df = df[df["views"] > views_limit]
+    filename = "wiki_page_views_more_5000month.csv"
+    df.to_csv(filename, index=True)
+    save_histogram(df, filename)
+
+
+@pytest.mark.skip("Only if doing full processing again, some manual steps")
+def test_do_wiki_full_all():
+    """Run the full wiki preparation sequence end to end.
+
+    Skipped by default.  Downloads the index and the page views with wget
+    and then calls the other test functions of this module in order.
+    """
+    # Install other requirements for wiki specific conversion:
+    # pip install -r reqs_optional/requirements_optional_wikiprocessing.txt
+
+    # Use "Transmission" in Ubuntu to get wiki dump using torrent:
+    # See: https://meta.wikimedia.org/wiki/Data_dump_torrents
+    # E.g. magnet:?xt=urn:btih:b2c74af2b1531d0b63f1166d2011116f44a8fed0&dn=enwiki-20230401-pages-articles-multistream.xml.bz2&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337
+
+    # Get index
+    os.system(
+        "wget http://ftp.acc.umu.se/mirror/wikimedia.org/dumps/enwiki/20230401/enwiki-20230401-pages-articles-multistream-index.txt.bz2"
+    )
+
+    # Test that can use LangChain to get docs from subset of wiki as sampled out of full wiki directly using bzip multistream
+    test_get_all_documents()
+
+    # Check can search wiki multistream
+    test_by_search_term()
+
+    # Test can get all start bytes in index
+    test_start_bytes()
+
+    # Get page views, e.g. for entire month of April 2023
+    # (wget -b starts the download in the background and returns at once)
+    os.system(
+        "wget -b -m -k -o wget.log -e robots=off https://dumps.wikimedia.org/other/pageviews/2023/2023-04/"
+    )
+
+    # Aggregate page views from many files into single file
+    test_agg_pageviews(gen_files=True)
+
+    # Reduce page views to some limit, so processing of full wiki is not too large
+    test_reduce_pageview()
+
+    # Start generate.py with requesting wiki_full in prep.  This will use page views as referenced in get_views.
+    # Note: get_views must be called only once, as a module-level global, to avoid very slow processing
+    # WARNING: Requires a lot of memory to handle, used up to 300GB system RAM at peak
+    """
+    python generate.py --langchain_mode='wiki_full' --visible_langchain_modes="['wiki_full', 'UserData', 'MyData', 'github h2oGPT', 'DriverlessAI docs']" &> lc_out.log
+    """
--- a/apps/language_models/langchain/stopping.py
+++ b/apps/language_models/langchain/stopping.py
@@ -0,0 +1,121 @@
+import torch
+from transformers import StoppingCriteria, StoppingCriteriaList
+
+from enums import PromptType
+
+
+class StoppingCriteriaSub(StoppingCriteria):
+    """Stop generation on repeated stop sequences or on a length limit.
+
+    Stop sequence ``i`` triggers once it has been seen at the end of the
+    first sequence ``encounters[i % len(encounters)]`` times.
+    """
+
+    def __init__(
+        self, stops=None, encounters=None, device="cuda", model_max_length=None
+    ):
+        """
+        :param stops: list of 1-D token-id tensors, one per stop sequence
+        :param encounters: required hit counts, cycled over ``stops``; its
+            length must divide ``len(stops)``
+        :param device: device the stop tensors are moved to
+        :param model_max_length: if not None, also stop once the first
+            sequence reaches this many tokens
+        """
+        super().__init__()
+        # None instead of [] as default: a mutable default list would be
+        # shared between all instances.
+        stops = [] if stops is None else stops
+        encounters = [] if encounters is None else encounters
+        # Only validate when there is something to stop on; with no stops
+        # the modulo below would divide by zero for empty encounters.
+        if stops:
+            assert (
+                encounters and len(stops) % len(encounters) == 0
+            ), "Number of stops and encounters must match"
+        self.encounters = encounters
+        self.stops = [stop.to(device) for stop in stops]
+        self.num_stops = [0] * len(stops)
+        self.model_max_length = model_max_length
+
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
+    ) -> bool:
+        """Return True when generation should stop.
+
+        Only the first sequence of ``input_ids`` is inspected.
+        """
+        sequence = input_ids[0]
+        for stopi, stop in enumerate(self.stops):
+            stop_len = len(stop)
+            # An empty stop, or one longer than what has been generated so
+            # far, cannot match; comparing anyway would raise on mismatched
+            # shapes or broadcast into a false match.
+            if stop_len == 0 or stop_len > sequence.shape[0]:
+                continue
+            if torch.all((stop == sequence[-stop_len:])).item():
+                self.num_stops[stopi] += 1
+                if (
+                    self.num_stops[stopi]
+                    >= self.encounters[stopi % len(self.encounters)]
+                ):
+                    # print("Stopped", flush=True)
+                    return True
+        if (
+            self.model_max_length is not None
+            and sequence.shape[0] >= self.model_max_length
+        ):
+            # critical limit
+            return True
+        # print("Tokens: %s" % input_ids[0].cpu().numpy(), flush=True)
+        # print("Stop Tokens: %s" % [x.cpu().numpy() for x in self.stops], flush=True)
+        return False
+
+
+def get_stopping(
+    prompt_type,
+    prompt_dict,
+    tokenizer,
+    device,
+    human="<human>:",
+    bot="<bot>:",
+    model_max_length=None,
+):
+    """Build the StoppingCriteriaList for a prompt type.
+
+    For the human_bot, instruct_vicuna and instruct_with_end prompt types a
+    StoppingCriteriaSub is built from tokenized stop words; for every other
+    prompt type an empty StoppingCriteriaList is returned.
+
+    :param prompt_type: prompt type name, compared against PromptType.<x>.name
+    :param prompt_dict: currently unused
+    :param tokenizer: callable tokenizer used to encode the stop words
+    :param device: device for the stop-word tensors
+    :param human: human-turn marker used for the human_bot prompt type
+    :param bot: bot-turn marker used for the human_bot prompt type
+    :param model_max_length: passed on to StoppingCriteriaSub
+    :return: StoppingCriteriaList
+    """
+    # FIXME: prompt_dict unused currently
+    if prompt_type in [
+        PromptType.human_bot.name,
+        PromptType.instruct_vicuna.name,
+        PromptType.instruct_with_end.name,
+    ]:
+        if prompt_type == PromptType.human_bot.name:
+            # encounters = [prompt.count(human) + 1, prompt.count(bot) + 1]
+            # stopping only starts once output is beyond prompt
+            # 1 human is enough to trigger, but need 2 bots, because very first view back will be bot we added
+            stop_words = [human, bot, "\n" + human, "\n" + bot]
+            encounters = [1, 2]
+        elif prompt_type == PromptType.instruct_vicuna.name:
+            # even below is not enough, generic strings and many ways to encode
+            stop_words = [
+                "### Human:",
+                """
+### Human:""",
+                """
+### Human:
+""",
+                "### Assistant:",
+                """
+### Assistant:""",
+                """
+### Assistant:
+""",
+            ]
+            encounters = [1, 2]
+        else:
+            # some instruct prompts have this as end, doesn't hurt to stop on it since not common otherwise
+            stop_words = ["### End"]
+            encounters = [1]
+        # Tokenize every stop word into a tensor of ids.
+        stop_words_ids = [
+            tokenizer(stop_word, return_tensors="pt")["input_ids"].squeeze()
+            for stop_word in stop_words
+        ]
+        # handle single token case
+        stop_words_ids = [
+            x if len(x.shape) > 0 else torch.tensor([x])
+            for x in stop_words_ids
+        ]
+        # NOTE(review): dropping empty entries here would shift the pairing
+        # with stop_words (zip below) and with encounters - confirm it cannot happen
+        stop_words_ids = [x for x in stop_words_ids if x.shape[0] > 0]
+        # avoid padding in front of tokens
+        if (
+            tokenizer._pad_token
+        ):  # use hidden variable to avoid annoying property logger bug
+            stop_words_ids = [
+                x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x
+                for x in stop_words_ids
+            ]
+        # handle fake \n added: for stop words that start with "\n", drop the first token id
+        stop_words_ids = [
+            x[1:] if y[0] == "\n" else x
+            for x, y in zip(stop_words_ids, stop_words)
+        ]
+        # build stopper
+        stopping_criteria = StoppingCriteriaList(
+            [
+                StoppingCriteriaSub(
+                    stops=stop_words_ids,
+                    encounters=encounters,
+                    device=device,
+                    model_max_length=model_max_length,
+                )
+            ]
+        )
+    else:
+        # No stop words for other prompt types.
+        stopping_criteria = StoppingCriteriaList()
+    return stopping_criteria
--- a/apps/language_models/langchain/utils.py
+++ b/apps/language_models/langchain/utils.py
--- a/apps/language_models/langchain/utils_langchain.py
+++ b/apps/language_models/langchain/utils_langchain.py
@@ -0,0 +1,69 @@
+from typing import Any, Dict, List, Union, Optional
+import time
+import queue
+
+from langchain.callbacks.base import BaseCallbackHandler
+from langchain.schema import LLMResult
+
+
+class StreamingGradioCallbackHandler(BaseCallbackHandler):
+    """
+    Similar to H2OTextIteratorStreamer that is for HF backend, but here LangChain backend
+
+    Tokens received through the LLM callbacks are put on a queue; iterating
+    over the handler yields them until the stop signal (None) is dequeued.
+    """
+
+    def __init__(self, timeout: Optional[float] = None, block=True):
+        """
+        :param timeout: timeout passed to the queue's get() in __next__
+        :param block: block flag passed to the queue's get() in __next__
+        """
+        super().__init__()
+        self.text_queue = queue.SimpleQueue()
+        # Sentinel put on the queue to mark the end of the stream.
+        self.stop_signal = None
+        # Set to True from outside to make the next __next__ call stop.
+        self.do_stop = False
+        self.timeout = timeout
+        self.block = block
+
+    def on_llm_start(
+        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
+    ) -> None:
+        """Run when LLM starts running. Clean the queue."""
+        while not self.text_queue.empty():
+            try:
+                self.text_queue.get(block=False)
+            except queue.Empty:
+                # queue was emptied between empty() and get(); re-check
+                continue
+
+    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
+        """Run on new LLM token. Only available when streaming is enabled."""
+        self.text_queue.put(token)
+
+    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
+        """Run when LLM ends running."""
+        self.text_queue.put(self.stop_signal)
+
+    def on_llm_error(
+        self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
+    ) -> None:
+        """Run when LLM errors."""
+        # The error itself is not forwarded; the stream is simply ended.
+        self.text_queue.put(self.stop_signal)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """Return the next token; raise StopIteration at the end of the stream."""
+        while True:
+            try:
+                value = (
+                    self.stop_signal
+                )  # value looks unused in pycharm, not true
+                # do_stop is only checked before each get(); with block=True
+                # and timeout=None the get() below waits until an item arrives.
+                if self.do_stop:
+                    print("hit stop", flush=True)
+                    # could raise or break, maybe best to raise and make parent see if any exception in thread
+                    raise StopIteration()
+                    # break
+                value = self.text_queue.get(
+                    block=self.block, timeout=self.timeout
+                )
+                break
+            except queue.Empty:
+                # nothing available yet: wait briefly, then poll again
+                time.sleep(0.01)
+        if value == self.stop_signal:
+            raise StopIteration()
+        else:
+            return value
--- a/apps/language_models/scripts/vicuna.py
+++ b/apps/language_models/scripts/vicuna.py
--- a/apps/language_models/src/model_wrappers/minigpt4.py
+++ b/apps/language_models/src/model_wrappers/minigpt4.py
@@ -0,0 +1,503 @@
+import torch
+import dataclasses
+from enum import auto, Enum
+from typing import List, Any
+from transformers import StoppingCriteria
+
+
+from brevitas_examples.llm.llm_quant.quantize import quantize_model
+from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
+
+
+class LayerNorm(torch.nn.LayerNorm):
+    """LayerNorm that computes in float32 and returns the input's dtype.
+
+    This lets half-precision inputs go through the normalization.
+    """
+
+    def forward(self, x: torch.Tensor):
+        input_dtype = x.dtype
+        normalized = super().forward(x.type(torch.float32))
+        return normalized.type(input_dtype)
+
+
+class VisionModel(torch.nn.Module):
+    """Visual encoder followed by ``ln_vision``, optionally weight-quantized."""
+
+    def __init__(
+        self,
+        ln_vision,
+        visual_encoder,
+        precision="fp32",
+        weight_group_size=128,
+    ):
+        """Store both sub-modules and quantize their weights if requested.
+
+        Args:
+            ln_vision: module applied to the encoder output.
+            visual_encoder: module applied to the input image.
+            precision: "int4" or "int8" runs ``quantize_model`` on both
+                sub-modules; any other value leaves them untouched.
+            weight_group_size: group size forwarded to ``quantize_model``.
+        """
+        super().__init__()
+        self.ln_vision = ln_vision
+        self.visual_encoder = visual_encoder
+        if precision not in ["int4", "int8"]:
+            return
+        bit_width = 4 if precision == "int4" else 8
+        targets = (
+            (
+                "Vision Model applying weight quantization to ln_vision",
+                self.ln_vision,
+            ),
+            (
+                "Vision Model applying weight quantization to visual_encoder",
+                self.visual_encoder,
+            ),
+        )
+        # Same quantization settings for both sub-modules, ln_vision first.
+        for banner, module in targets:
+            print(banner)
+            quantize_model(
+                module,
+                dtype=torch.float32,
+                weight_bit_width=bit_width,
+                weight_param_method="stats",
+                weight_scale_precision="float",
+                weight_quant_type="asym",
+                weight_quant_granularity="per_group",
+                weight_group_size=weight_group_size,
+                quantize_weight_zero_point=False,
+            )
+            print("Weight quantization applied.")
+
+    def forward(self, image):
+        """Encode ``image`` and apply ``ln_vision`` to the result."""
+        encoded = self.visual_encoder(image)
+        return self.ln_vision(encoded)
+
+
+class QformerBertModel(torch.nn.Module):
+    """Wrapper that returns only the last hidden state of ``qformer_bert``."""
+
+    def __init__(self, qformer_bert):
+        super().__init__()
+        self.qformer_bert = qformer_bert
+
+    def forward(self, query_tokens, image_embeds, image_atts):
+        """Run the wrapped model and return its ``last_hidden_state``.
+
+        ``query_tokens`` is passed as ``query_embeds``; ``image_embeds`` and
+        ``image_atts`` are passed as the encoder hidden states and mask.
+        """
+        bert_kwargs = dict(
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_atts,
+            return_dict=True,
+        )
+        return self.qformer_bert(**bert_kwargs).last_hidden_state
+
+
+class FirstLlamaModel(torch.nn.Module):
+    """Wrapper for the first LLaMA call, made without a key/value cache.
+
+    ``forward`` returns a flat tuple: the logits followed by the key and
+    value of every entry in the model's ``past_key_values`` output.
+    """
+
+    def __init__(self, model, precision="fp32", weight_group_size=128):
+        """Store ``model`` and quantize its weights for "int4"/"int8".
+
+        Args:
+            model: the wrapped causal LM.
+            precision: "int4" or "int8" runs ``quantize_model`` on ``model``.
+            weight_group_size: group size forwarded to ``quantize_model``.
+        """
+        super().__init__()
+        self.model = model
+        print("SHARK: Loading LLAMA Done")
+        if precision not in ["int4", "int8"]:
+            return
+        print("First Llama applying weight quantization")
+        bit_width = 4 if precision == "int4" else 8
+        quantize_model(
+            self.model,
+            dtype=torch.float32,
+            weight_bit_width=bit_width,
+            weight_param_method="stats",
+            weight_scale_precision="float",
+            weight_quant_type="asym",
+            weight_quant_granularity="per_group",
+            weight_group_size=weight_group_size,
+            quantize_weight_zero_point=False,
+        )
+        print("Weight quantization applied.")
+
+    def forward(self, inputs_embeds, position_ids, attention_mask):
+        """Run the model on embeddings and flatten logits plus cache."""
+        banner = "************************************"
+        print(banner)
+        # Debug dump of the shape and dtype of every input.
+        for label, tensor in (
+            ("inputs_embeds: ", inputs_embeds),
+            ("position_ids: ", position_ids),
+            ("attention_mask: ", attention_mask),
+        ):
+            print(label, tensor.shape, " dtype: ", tensor.dtype)
+        print(banner)
+        result = self.model(
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            past_key_values=None,
+            use_cache=True,
+            attention_mask=attention_mask,
+            return_dict=True,
+            output_attentions=False,
+            output_hidden_states=False,
+        )
+        flattened = [result.logits]
+        for layer_cache in result.past_key_values:
+            flattened.extend((layer_cache[0], layer_cache[1]))
+        return tuple(flattened)
+
+
+class SecondLlamaModel(torch.nn.Module):
+    """Wrapper for LLaMA calls that pass an explicit key/value cache.
+
+    The cache is passed to ``forward`` as 64 separate positional arguments
+    (``i1`` .. ``i64``), which are regrouped into 32 ``(i_odd, i_even)``
+    pairs. ``forward`` returns a flat tuple: the logits followed by the key
+    and value of every entry in the model's ``past_key_values`` output.
+    """
+
+    def __init__(self, model, precision="fp32", weight_group_size=128):
+        """Store ``model`` and quantize its weights for "int4"/"int8".
+
+        Args:
+            model: the wrapped causal LM.
+            precision: "int4" or "int8" runs ``quantize_model`` on ``model``.
+            weight_group_size: group size forwarded to ``quantize_model``.
+        """
+        super().__init__()
+        self.model = model
+        print("SHARK: Loading LLAMA Done")
+        if precision not in ["int4", "int8"]:
+            return
+        print("Second Llama applying weight quantization")
+        bit_width = 4 if precision == "int4" else 8
+        quantize_model(
+            self.model,
+            dtype=torch.float32,
+            weight_bit_width=bit_width,
+            weight_param_method="stats",
+            weight_scale_precision="float",
+            weight_quant_type="asym",
+            weight_quant_granularity="per_group",
+            weight_group_size=weight_group_size,
+            quantize_weight_zero_point=False,
+        )
+        print("Weight quantization applied.")
+
+    def forward(
+        self,
+        input_ids,
+        position_ids,
+        attention_mask,
+        i1, i2, i3, i4, i5, i6, i7, i8,
+        i9, i10, i11, i12, i13, i14, i15, i16,
+        i17, i18, i19, i20, i21, i22, i23, i24,
+        i25, i26, i27, i28, i29, i30, i31, i32,
+        i33, i34, i35, i36, i37, i38, i39, i40,
+        i41, i42, i43, i44, i45, i46, i47, i48,
+        i49, i50, i51, i52, i53, i54, i55, i56,
+        i57, i58, i59, i60, i61, i62, i63, i64,
+    ):
+        """Run the model on token ids plus cache; flatten logits and cache."""
+        banner = "************************************"
+        print(banner)
+        # Debug dump of the shape and dtype of every input.
+        for label, tensor in (
+            ("input_ids: ", input_ids),
+            ("position_ids: ", position_ids),
+            ("attention_mask: ", attention_mask),
+        ):
+            print(label, tensor.shape, " dtype: ", tensor.dtype)
+        print("past_key_values: ", i1.shape, i2.shape, i63.shape, i64.shape)
+        print("past_key_values dtype: ", i1.dtype)
+        print(banner)
+        flat_cache = (
+            i1, i2, i3, i4, i5, i6, i7, i8,
+            i9, i10, i11, i12, i13, i14, i15, i16,
+            i17, i18, i19, i20, i21, i22, i23, i24,
+            i25, i26, i27, i28, i29, i30, i31, i32,
+            i33, i34, i35, i36, i37, i38, i39, i40,
+            i41, i42, i43, i44, i45, i46, i47, i48,
+            i49, i50, i51, i52, i53, i54, i55, i56,
+            i57, i58, i59, i60, i61, i62, i63, i64,
+        )
+        # Pair consecutive arguments: (i1, i2), (i3, i4), ..., (i63, i64).
+        paired_cache = tuple(zip(flat_cache[0::2], flat_cache[1::2]))
+        result = self.model(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            past_key_values=paired_cache,
+            use_cache=True,
+            attention_mask=attention_mask,
+            return_dict=True,
+            output_attentions=False,
+            output_hidden_states=False,
+        )
+        flattened = [result.logits]
+        for layer_cache in result.past_key_values:
+            flattened.extend((layer_cache[0], layer_cache[1]))
+        return tuple(flattened)
+
+
+class SeparatorStyle(Enum):
+    """Separator layouts understood by ``Conversation.get_prompt``."""
+
+    # One separator, used after the system text and after every message.
+    SINGLE = 1
+    # Two separators, used alternately after successive messages.
+    TWO = 2
+
+
+@dataclasses.dataclass
+class Conversation:
+    """Chat history together with the settings used to render it."""
+
+    # Text placed at the very start of the rendered prompt.
+    system: str
+    roles: List[str]
+    # ``[role, message]`` pairs in the order they were appended.
+    messages: List[List[str]]
+    # Number of leading messages skipped by ``to_gradio_chatbot``.
+    offset: int
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    sep: str = "###"
+    # Second separator; only read when ``sep_style`` is ``TWO``.
+    sep2: str = None
+
+    skip_next: bool = False
+    conv_id: Any = None
+
+    def get_prompt(self):
+        """Render the system text and all messages into one prompt string.
+
+        A message with empty/falsy text is rendered as ``"<role>:"`` with no
+        trailing separator.
+
+        Raises:
+            ValueError: if ``sep_style`` is not a known ``SeparatorStyle``.
+        """
+        if self.sep_style == SeparatorStyle.SINGLE:
+            separators = [self.sep]
+        elif self.sep_style == SeparatorStyle.TWO:
+            separators = [self.sep, self.sep2]
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+        pieces = [self.system + separators[0]]
+        for index, (role, message) in enumerate(self.messages):
+            if message:
+                closing = separators[index % len(separators)]
+                pieces.append(role + ": " + message + closing)
+            else:
+                pieces.append(role + ":")
+        return "".join(pieces)
+
+    def append_message(self, role, message):
+        """Append a ``[role, message]`` entry to the history."""
+        entry = [role, message]
+        self.messages.append(entry)
+
+    def to_gradio_chatbot(self):
+        """Return the messages after ``offset`` as ``[first, second]`` pairs.
+
+        Even-positioned messages open a new pair with ``None`` as the second
+        element; the following message fills that slot.
+        """
+        pairs = []
+        visible = self.messages[self.offset :]
+        for index, (_role, text) in enumerate(visible):
+            if index % 2:
+                pairs[-1][-1] = text
+            else:
+                pairs.append([text, None])
+        return pairs
+
+    def copy(self):
+        """Return a new ``Conversation`` with a copied message list.
+
+        ``skip_next`` is not carried over; the copy gets the default value.
+        """
+        cloned_messages = [[role, text] for role, text in self.messages]
+        return Conversation(
+            system=self.system,
+            roles=self.roles,
+            messages=cloned_messages,
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            conv_id=self.conv_id,
+        )
+
+    def dict(self):
+        """Return a plain-dict view (``sep_style``/``skip_next`` excluded)."""
+        exported = ("system", "roles", "messages", "offset", "sep", "sep2")
+        snapshot = {field: getattr(self, field) for field in exported}
+        snapshot["conv_id"] = self.conv_id
+        return snapshot
+
+
+class StoppingCriteriaSub(StoppingCriteria):
+    """Stop generation once the sequence ends with any stop sequence."""
+
+    def __init__(self, stops=None, encounters=1):
+        """
+        Args:
+            stops: stop sequences to look for (presumably 1-D token-id
+                tensors — confirm against the caller); ``None`` means none.
+            encounters: accepted but not used by this class.
+        """
+        super().__init__()
+        # Fresh list per instance instead of a shared mutable default.
+        self.stops = [] if stops is None else stops
+
+    def __call__(
+        self,
+        input_ids: torch.LongTensor,
+        scores: torch.FloatTensor,
+        **kwargs,
+    ):
+        """Return True if the first row of ``input_ids`` ends with a stop.
+
+        Only ``input_ids[0]`` is inspected; ``scores`` and any extra keyword
+        arguments are ignored.
+        """
+        generated = input_ids[0]
+        for stop in self.stops:
+            stop_len = len(stop)
+            # An empty stop, or one longer than what has been generated,
+            # cannot match; comparing anyway would rely on broadcasting and
+            # either raise or report a bogus match.
+            if stop_len == 0 or stop_len > len(generated):
+                continue
+            if torch.all((stop == generated[-stop_len:])).item():
+                return True
+
+        return False
+
+
+# Conversation template with an image-oriented system prompt, Human/Assistant
+# roles, an empty message list and the single "###" separator style.
+CONV_VISION = Conversation(
+    system="Give the following image: <Img>ImageContent</Img>. "
+    "You will be able to see the image once I provide it to you. Please answer my questions.",
+    roles=("Human", "Assistant"),
+    messages=[],
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
--- a/apps/language_models/src/model_wrappers/vicuna4.py
+++ b/apps/language_models/src/model_wrappers/vicuna4.py
@@ -0,0 +1,879 @@
+import argparse
+import json
+import re
+from io import BytesIO
+from pathlib import Path
+from tqdm import tqdm
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import iree.runtime
+import itertools
+import subprocess
+
+import torch
+import torch_mlir
+from torch_mlir import TensorPlaceholder
+from torch_mlir.compiler_utils import run_pipeline_with_repro_report
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    LlamaPreTrainedModel,
+)
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+
+from apps.language_models.src.pipelines.SharkLLMBase import SharkLLMBase
+from apps.language_models.src.model_wrappers.vicuna_sharded_model import (
+    FirstVicunaLayer,
+    SecondVicunaLayer,
+    CompiledVicunaLayer,
+    ShardedVicunaModel,
+    LMHead,
+    LMHeadCompiled,
+    VicunaEmbedding,
+    VicunaEmbeddingCompiled,
+    VicunaNorm,
+    VicunaNormCompiled,
+)
+from apps.language_models.src.model_wrappers.vicuna_model import (
+    FirstVicuna,
+    SecondVicuna,
+)
+from apps.language_models.utils import (
+    get_vmfb_from_path,
+)
+from shark.shark_downloader import download_public_file
+from shark.shark_importer import get_f16_inputs
+from shark.shark_importer import import_with_fx
+from shark.shark_inference import SharkInference
+
+from brevitas_examples.llm.llm_quant.quantize import quantize_model
+from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
+from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.llama.modeling_llama import (
+    LlamaDecoderLayer,
+    LlamaRMSNorm,
+    _make_causal_mask,
+    _expand_mask,
+)
+from torch import nn
+from time import time
+
+
+class LlamaModel(LlamaPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
+
+    ``forward`` iterates over ``self.compressedlayers`` (not ``self.layers``)
+    and gives each entry a slice of eight items of ``past_key_values``.
+    NOTE(review): ``compressedlayers`` is not created in ``__init__``; it is
+    presumably attached to the instance before ``forward`` runs — confirm.
+
+    Args:
+        config: LlamaConfig
+    """
+
+    def __init__(self, config: LlamaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(
+            config.vocab_size, config.hidden_size, self.padding_idx
+        )
+        self.layers = nn.ModuleList(
+            [
+                LlamaDecoderLayer(config)
+                for _ in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the token embedding module."""
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module with ``value``."""
+        self.embed_tokens = value
+
+    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+    def _prepare_decoder_attention_mask(
+        self,
+        attention_mask,
+        input_shape,
+        inputs_embeds,
+        past_key_values_length,
+    ):
+        """Combine the causal mask with the expanded padding mask.
+
+        A causal mask is only built when more than one position is processed
+        (``input_shape[-1] > 1``). Returns ``None`` when neither mask exists.
+        """
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(
+                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            ).to(inputs_embeds.device)
+            combined_attention_mask = (
+                expanded_attn_mask
+                if combined_attention_mask is None
+                else expanded_attn_mask + combined_attention_mask
+            )
+
+        return combined_attention_mask
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        """Run the compressed decoder stack.
+
+        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given.
+        Options left as ``None`` fall back to the values in ``self.config``.
+
+        Returns:
+            ``BaseModelOutputWithPast`` when ``return_dict`` is true,
+            otherwise a tuple of the non-``None`` values among the final
+            hidden state, cache, hidden states and attentions. With
+            ``use_cache`` the cache is one flat tuple: the per-layer outputs
+            after the hidden state, concatenated in layer order. Without it
+            the cache is ``None``.
+
+        Raises:
+            ValueError: if both or neither of ``input_ids`` and
+                ``inputs_embeds`` are provided.
+        """
+        t1 = time()
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        use_cache = (
+            use_cache if use_cache is not None else self.config.use_cache
+        )
+
+        return_dict = (
+            return_dict
+            if return_dict is not None
+            else self.config.use_return_dict
+        )
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError(
+                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
+            )
+
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+
+        if past_key_values is not None:
+            # The cached length is read from dimension 2 of the first entry.
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = (
+                seq_length_with_past + past_key_values_length
+            )
+
+        if position_ids is None:
+            device = (
+                input_ids.device
+                if input_ids is not None
+                else inputs_embeds.device
+            )
+            position_ids = torch.arange(
+                past_key_values_length,
+                seq_length + past_key_values_length,
+                dtype=torch.long,
+                device=device,
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # embed positions
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past),
+                dtype=torch.bool,
+                device=inputs_embeds.device,
+            )
+
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask,
+            (batch_size, seq_length),
+            inputs_embeds,
+            past_key_values_length,
+        )
+
+        hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+
+        for idx, decoder_layer in enumerate(self.compressedlayers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            # Each compressed layer consumes eight consecutive cache entries.
+            past_key_value = (
+                past_key_values[8 * idx : 8 * (idx + 1)]
+                if past_key_values is not None
+                else None
+            )
+
+            if self.gradient_checkpointing and self.training:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, output_attentions, None)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    None,
+                )
+            else:
+                layer_outputs = decoder_layer.forward(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[1:],)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        try:
+            hidden_states = np.asarray(hidden_states, hidden_states.dtype)
+        except Exception:
+            # Best-effort conversion: keep ``hidden_states`` as it is when it
+            # cannot be turned into a NumPy array. Only ``Exception`` is
+            # caught so KeyboardInterrupt/SystemExit still propagate.
+            pass
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # Flatten the per-layer cache tuples into one flat tuple. Without
+        # caching there is nothing to flatten and the cache stays None
+        # (chaining over None would raise TypeError).
+        next_cache = None
+        if use_cache:
+            next_cache = tuple(
+                itertools.chain.from_iterable(next_decoder_cache)
+            )
+        print(f"Token generated in {time() - t1} seconds")
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_cache,
+                    all_hidden_states,
+                    all_self_attns,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+class EightLayerLayerSV(torch.nn.Module):
+    """Runs eight layers in sequence, each with its own cached key/value.
+
+    ``forward`` takes the cache as sixteen separate arguments
+    (``pkvN0``/``pkvN1`` for layer ``N``) and returns a flat tuple of the
+    final hidden state followed by the sixteen updated cache entries.
+    """
+
+    def __init__(self, layers):
+        super().__init__()
+        assert len(layers) == 8
+        self.layers = layers
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        pkv00,
+        pkv01,
+        pkv10,
+        pkv11,
+        pkv20,
+        pkv21,
+        pkv30,
+        pkv31,
+        pkv40,
+        pkv41,
+        pkv50,
+        pkv51,
+        pkv60,
+        pkv61,
+        pkv70,
+        pkv71,
+    ):
+        """Chain the layers and collect each layer's new cache pair."""
+        flat_past = (
+            pkv00, pkv01, pkv10, pkv11, pkv20, pkv21, pkv30, pkv31,
+            pkv40, pkv41, pkv50, pkv51, pkv60, pkv61, pkv70, pkv71,
+        )
+        flat_present = []
+        # Even slots are the first item of each pair, odd slots the second.
+        for layer, past_first, past_second in zip(
+            self.layers, flat_past[0::2], flat_past[1::2]
+        ):
+            layer_out = layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=(past_first, past_second),
+                use_cache=True,
+            )
+            hidden_states = layer_out[0]
+            # The last output element holds the layer's updated cache pair.
+            present = layer_out[-1]
+            flat_present.append(present[0])
+            flat_present.append(present[1])
+        return (hidden_states, *flat_present)
+
+
+class EightLayerLayerFV(torch.nn.Module):
+    """Runs eight layers in sequence without any incoming cache.
+
+    ``forward`` returns a flat tuple of the final hidden state followed by
+    the sixteen cache entries produced by the layers.
+    """
+
+    def __init__(self, layers):
+        super().__init__()
+        assert len(layers) == 8
+        self.layers = layers
+
+    def forward(self, hidden_states, attention_mask, position_ids):
+        """Chain the layers and collect each layer's new cache pair."""
+        flat_present = []
+        for layer in self.layers:
+            layer_out = layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=None,
+                use_cache=True,
+            )
+            hidden_states = layer_out[0]
+            # The last output element holds the layer's cache pair.
+            present = layer_out[-1]
+            flat_present.append(present[0])
+            flat_present.append(present[1])
+        # Unpacking fixes the result at exactly eight pairs.
+        (
+            k0, v0, k1, v1, k2, v2, k3, v3,
+            k4, v4, k5, v5, k6, v6, k7, v7,
+        ) = flat_present
+        return (
+            hidden_states,
+            k0, v0, k1, v1, k2, v2, k3, v3,
+            k4, v4, k5, v5, k6, v6, k7, v7,
+        )
+
+
+class CompiledEightLayerLayerSV(torch.nn.Module):
+    """Adapter that calls a compiled eight-layer module with a flat cache.
+
+    ``model`` is invoked as ``model("forward", args, send_to_host=False)``
+    where ``args`` is the hidden state, attention mask, position ids and the
+    sixteen cache tensors, all detached.
+    """
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        past_key_value,
+        output_attentions=False,
+        use_cache=True,
+    ):
+        """Run the compiled module.
+
+        Args:
+            hidden_states, attention_mask, position_ids: tensors forwarded
+                (detached) to the compiled module.
+            past_key_value: exactly eight ``(first, second)`` tensor pairs.
+            output_attentions, use_cache: accepted but not used here.
+
+        Returns:
+            A 9-tuple: ``output[0]`` followed by eight two-element tuples
+            taken from ``output[1]`` .. ``output[8]``.
+
+        Raises:
+            ValueError: if ``past_key_value`` does not hold eight pairs.
+        """
+        hidden_states = hidden_states.detach()
+        attention_mask = attention_mask.detach()
+        position_ids = position_ids.detach()
+        (
+            (pkv00, pkv01),
+            (pkv10, pkv11),
+            (pkv20, pkv21),
+            (pkv30, pkv31),
+            (pkv40, pkv41),
+            (pkv50, pkv51),
+            (pkv60, pkv61),
+            (pkv70, pkv71),
+        ) = past_key_value
+        # ``detach`` (previously misspelled ``detatch``, which raised
+        # AttributeError on every call) drops the autograd history before
+        # the tensors are handed to the compiled module.
+        cache_inputs = tuple(
+            tensor.detach()
+            for tensor in (
+                pkv00, pkv01, pkv10, pkv11, pkv20, pkv21, pkv30, pkv31,
+                pkv40, pkv41, pkv50, pkv51, pkv60, pkv61, pkv70, pkv71,
+            )
+        )
+
+        output = self.model(
+            "forward",
+            (hidden_states, attention_mask, position_ids, *cache_inputs),
+            send_to_host=False,
+        )
+        # NOTE(review): this assumes ``output[1]``..``output[8]`` are pairs.
+        # If the compiled module returns a flat sequence of 17 values like
+        # ``EightLayerLayerSV.forward`` does, the pairs should instead be
+        # ``(output[1], output[2])`` .. ``(output[15], output[16])`` —
+        # confirm against the compiled module's actual output layout.
+        return (
+            output[0],
+            *((output[i][0], output[i][1]) for i in range(1, 9)),
+        )
+
+
+# Model-level forward pass that iterates over self.compressedlayers instead
+# of individual decoder layers. It embeds the inputs, builds position ids and
+# the attention mask, runs every compressed layer, applies self.norm and
+# returns either a plain tuple or a BaseModelOutputWithPast.
+def forward_compressed(
+    self,
+    input_ids: torch.LongTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[List[torch.FloatTensor]] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+):
+    # Any flag left as None falls back to the value stored in self.config.
+    output_attentions = (
+        output_attentions
+        if output_attentions is not None
+        else self.config.output_attentions
+    )
+    output_hidden_states = (
+        output_hidden_states
+        if output_hidden_states is not None
+        else self.config.output_hidden_states
+    )
+    use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+    return_dict = (
+        return_dict if return_dict is not None else self.config.use_return_dict
+    )
+
+    # retrieve input_ids and inputs_embeds
+    # Exactly one of the two must be given; it determines batch_size/seq_length.
+    if input_ids is not None and inputs_embeds is not None:
+        raise ValueError(
+            "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+        )
+    elif input_ids is not None:
+        batch_size, seq_length = input_ids.shape
+    elif inputs_embeds is not None:
+        batch_size, seq_length, _ = inputs_embeds.shape
+    else:
+        raise ValueError(
+            "You have to specify either decoder_input_ids or decoder_inputs_embeds"
+        )
+
+    seq_length_with_past = seq_length
+    past_key_values_length = 0
+
+    if past_key_values is not None:
+        # The cached length is read from dim 2 of the first cache tensor
+        # (assumes a (batch, heads, seq, head_dim) layout - TODO confirm).
+        past_key_values_length = past_key_values[0][0].shape[2]
+        seq_length_with_past = seq_length_with_past + past_key_values_length
+
+    if position_ids is None:
+        # Default positions continue counting right after the cached tokens.
+        device = (
+            input_ids.device if input_ids is not None else inputs_embeds.device
+        )
+        position_ids = torch.arange(
+            past_key_values_length,
+            seq_length + past_key_values_length,
+            dtype=torch.long,
+            device=device,
+        )
+        position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+    else:
+        position_ids = position_ids.view(-1, seq_length).long()
+
+    if inputs_embeds is None:
+        inputs_embeds = self.embed_tokens(input_ids)
+    # embed positions
+    # Without a caller-supplied mask, every position (cached + new) is visible.
+    if attention_mask is None:
+        attention_mask = torch.ones(
+            (batch_size, seq_length_with_past),
+            dtype=torch.bool,
+            device=inputs_embeds.device,
+        )
+    attention_mask = self._prepare_decoder_attention_mask(
+        attention_mask,
+        (batch_size, seq_length),
+        inputs_embeds,
+        past_key_values_length,
+    )
+
+    hidden_states = inputs_embeds
+
+    # decoder layers
+    # The accumulators stay None when the corresponding output is not requested.
+    all_hidden_states = () if output_hidden_states else None
+    all_self_attns = () if output_attentions else None
+    next_decoder_cache = () if use_cache else None
+
+    for idx, decoder_layer in enumerate(self.compressedlayers):
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # Each compressed layer consumes a window of 8 consecutive cache
+        # entries from the flat past_key_values sequence.
+        past_key_value = (
+            past_key_values[8 * idx : 8 * (idx + 1)]
+            if past_key_values is not None
+            else None
+        )
+
+        if self.gradient_checkpointing and self.training:
+            # Checkpointed path: the layer is re-run during backward, and no
+            # past_key_value is forwarded to it.
+
+            def create_custom_forward(module):
+                def custom_forward(*inputs):
+                    # None for past_key_value
+                    return module(*inputs, output_attentions, None)
+
+                return custom_forward
+
+            layer_outputs = torch.utils.checkpoint.checkpoint(
+                create_custom_forward(decoder_layer),
+                hidden_states,
+                attention_mask,
+                position_ids,
+                None,
+            )
+        else:
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+
+        hidden_states = layer_outputs[0]
+
+        if use_cache:
+            # NOTE(review): a single element of layer_outputs is cached per
+            # compressed layer, while the slice above reads 8 entries per
+            # layer - confirm the layer's output layout matches.
+            next_decoder_cache += (
+                layer_outputs[2 if output_attentions else 1],
+            )
+
+        if output_attentions:
+            all_self_attns += (layer_outputs[1],)
+
+    hidden_states = self.norm(hidden_states)
+
+    # add hidden states from the last decoder layer
+    if output_hidden_states:
+        all_hidden_states += (hidden_states,)
+
+    next_cache = next_decoder_cache if use_cache else None
+    # Tuple form drops every entry that is None, so positions shift with the
+    # requested outputs.
+    if not return_dict:
+        return tuple(
+            v
+            for v in [
+                hidden_states,
+                next_cache,
+                all_hidden_states,
+                all_self_attns,
+            ]
+            if v is not None
+        )
+    return BaseModelOutputWithPast(
+        last_hidden_state=hidden_states,
+        past_key_values=next_cache,
+        hidden_states=all_hidden_states,
+        attentions=all_self_attns,
+    )
+
+
+class CompiledEightLayerLayer(torch.nn.Module):
+    """Adapter around a compiled module that runs eight decoder layers at once.
+
+    ``model`` is invoked as ``model(function_name, inputs, send_to_host=False)``
+    and its result is indexed as a flat sequence of 17 items: the new hidden
+    states followed by eight consecutive (key, value) cache entries.
+    """
+
+    # Number of decoder layers (and therefore cache pairs) handled per call.
+    _NUM_FUSED_LAYERS = 8
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    @staticmethod
+    def _detach_if_possible(value):
+        """Return ``value.detach()`` when the object offers it, else ``value``."""
+        detach = getattr(value, "detach", None)
+        return detach() if callable(detach) else value
+
+    @classmethod
+    def _pack_outputs(cls, output):
+        """Regroup the flat result into ``(hidden, (k0, v0), ..., (k7, v7))``."""
+        last_index = 2 * cls._NUM_FUSED_LAYERS
+        return (output[0],) + tuple(
+            (output[i], output[i + 1]) for i in range(1, last_index + 1, 2)
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        past_key_value=None,
+        output_attentions=False,
+        use_cache=True,
+    ):
+        """Run the fused layers and return hidden states plus eight cache pairs.
+
+        Without ``past_key_value`` the "first_vicuna_forward" entry point is
+        called; otherwise "second_vicuna_forward" is called with the eight
+        (key, value) pairs flattened after the three regular inputs.
+        ``output_attentions`` and ``use_cache`` are accepted for signature
+        compatibility and are not consulted.
+
+        Raises:
+            ValueError: if ``past_key_value`` does not hold exactly eight
+                two-element entries.
+        """
+        if past_key_value is None:
+            try:
+                hidden_states = np.asarray(hidden_states, hidden_states.dtype)
+            except Exception:
+                # Best effort: inputs that cannot be viewed as an ndarray with
+                # their own dtype are passed through unchanged.
+                pass
+            attention_mask = attention_mask.detach()
+            position_ids = position_ids.detach()
+
+            output = self.model(
+                "first_vicuna_forward",
+                (hidden_states, attention_mask, position_ids),
+                send_to_host=False,
+            )
+            return self._pack_outputs(output)
+
+        pairs = list(past_key_value)
+        if len(pairs) != self._NUM_FUSED_LAYERS:
+            raise ValueError(
+                f"expected {self._NUM_FUSED_LAYERS} (key, value) pairs, "
+                f"got {len(pairs)}"
+            )
+        cache = []
+        for key, value in pairs:
+            cache.extend((key, value))
+
+        # Detach every input independently: an object without ``detach`` must
+        # not prevent the remaining tensors from being detached.
+        hidden_states = self._detach_if_possible(hidden_states)
+        attention_mask = self._detach_if_possible(attention_mask)
+        position_ids = self._detach_if_possible(position_ids)
+        cache = [self._detach_if_possible(entry) for entry in cache]
+
+        t1 = time()
+        if isinstance(hidden_states, iree.runtime.array_interop.DeviceArray):
+            # Hidden states produced by a previous compiled call are converted
+            # to a torch tensor before being fed back in.
+            hidden_states = np.array(hidden_states, hidden_states.dtype)
+            hidden_states = torch.tensor(hidden_states)
+            hidden_states = hidden_states.detach()
+
+        output = self.model(
+            "second_vicuna_forward",
+            (hidden_states, attention_mask, position_ids, *cache),
+            send_to_host=False,
+        )
+        print(f"{time() - t1}")
+        # Drop the local references to the old cache entries promptly.
+        del cache, pairs
+        return self._pack_outputs(output)
--- a/apps/language_models/src/model_wrappers/vicuna_model.py
+++ b/apps/language_models/src/model_wrappers/vicuna_model.py
@@ -6,9 +6,18 @@ from brevitas_examples.llm.llm_quant.run_utils import get_model_impl


 class FirstVicuna(torch.nn.Module):
-    def __init__(self, model_path, precision="fp32", weight_group_size=128):
+    def __init__(
+        self,
+        model_path,
+        precision="fp32",
+        weight_group_size=128,
+        model_name="vicuna",
+        hf_auth_token: str = None,
+    ):
        super().__init__()
        kwargs = {"torch_dtype": torch.float32}
+        if "llama2" in model_name:
+            kwargs["use_auth_token"] = hf_auth_token
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **kwargs
        )
@@ -18,20 +27,13 @@ class FirstVicuna(torch.nn.Module):
            quantize_model(
                get_model_impl(self.model).layers,
                dtype=torch.float32,
-                weight_quant_type="asym",
                weight_bit_width=weight_bit_width,
                weight_param_method="stats",
                weight_scale_precision="float",
+                weight_quant_type="asym",
                weight_quant_granularity="per_group",
                weight_group_size=weight_group_size,
                quantize_weight_zero_point=False,
-                input_bit_width=None,
-                input_scale_type="float",
-                input_param_method="stats",
-                input_quant_type="asym",
-                input_quant_granularity="per_tensor",
-                quantize_input_zero_point=False,
-                seqlen=2048,
            )
            print("Weight quantization applied.")

@@ -47,9 +49,18 @@ class FirstVicuna(torch.nn.Module):


 class SecondVicuna(torch.nn.Module):
-    def __init__(self, model_path, precision="fp32", weight_group_size=128):
+    def __init__(
+        self,
+        model_path,
+        precision="fp32",
+        weight_group_size=128,
+        model_name="vicuna",
+        hf_auth_token: str = None,
+    ):
        super().__init__()
        kwargs = {"torch_dtype": torch.float32}
+        if "llama2" in model_name:
+            kwargs["use_auth_token"] = hf_auth_token
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **kwargs
        )
@@ -59,20 +70,13 @@ class SecondVicuna(torch.nn.Module):
            quantize_model(
                get_model_impl(self.model).layers,
                dtype=torch.float32,
-                weight_quant_type="asym",
                weight_bit_width=weight_bit_width,
                weight_param_method="stats",
                weight_scale_precision="float",
+                weight_quant_type="asym",
                weight_quant_granularity="per_group",
                weight_group_size=weight_group_size,
                quantize_weight_zero_point=False,
-                input_bit_width=None,
-                input_scale_type="float",
-                input_param_method="stats",
-                input_quant_type="asym",
-                input_quant_granularity="per_tensor",
-                quantize_input_zero_point=False,
-                seqlen=2048,
            )
            print("Weight quantization applied.")

@@ -297,12 +301,13 @@ class CombinedModel(torch.nn.Module):
        self.second_vicuna = SecondVicuna(second_vicuna_model_path)

    def forward(self, input_ids):
-        first_output = self.first_vicuna(input_ids=input_ids, use_cache=True)
-        logits = first_output[0]
-        pkv = first_output[1:]
-
-        token = torch.argmax(torch.tensor(logits)[:, -1, :], dim=1)
-        token = token.to(torch.int64).reshape([1, 1])
-        secondVicunaInput = (token,) + tuple(pkv)
-        second_output = self.second_vicuna(secondVicunaInput)
+        first_output = self.first_vicuna(input_ids=input_ids)
+        # generate second vicuna
+        compilation_input_ids = torch.zeros([1, 1], dtype=torch.int64)
+        pkv = tuple(
+            (torch.zeros([1, 32, 19, 128], dtype=torch.float32))
+            for _ in range(64)
+        )
+        secondVicunaCompileInput = (compilation_input_ids,) + pkv
+        second_output = self.second_vicuna(*secondVicunaCompileInput)
        return second_output
--- a/apps/language_models/src/model_wrappers/vicuna_sharded_model.py
+++ b/apps/language_models/src/model_wrappers/vicuna_sharded_model.py
@@ -66,7 +66,7 @@ class ShardedVicunaModel(torch.nn.Module):
    def __init__(self, model, layers, lmhead, embedding, norm):
        super().__init__()
        self.model = model
-        assert len(layers) == len(model.model.layers)
+        # assert len(layers) == len(model.model.layers)
        self.model.model.config.use_cache = True
        self.model.model.config.output_attentions = False
        self.layers = layers
@@ -132,7 +132,10 @@ class VicunaNormCompiled(torch.nn.Module):
        self.model = shark_module

    def forward(self, hidden_states):
-        hidden_states.detach()
+        try:
+            hidden_states.detach()
+        except:
+            pass
        output = self.model("forward", (hidden_states,))
        output = torch.tensor(output)
        return output
--- a/apps/language_models/src/pipelines/minigpt4_pipeline.py
+++ b/apps/language_models/src/pipelines/minigpt4_pipeline.py
--- a/apps/language_models/src/pipelines/minigpt4_utils/Qformer.py
+++ b/apps/language_models/src/pipelines/minigpt4_utils/Qformer.py
--- a/apps/language_models/src/pipelines/minigpt4_utils/blip_processors.py
+++ b/apps/language_models/src/pipelines/minigpt4_utils/blip_processors.py
@@ -0,0 +1,68 @@
+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+from omegaconf import OmegaConf
+from torchvision import transforms
+from torchvision.transforms.functional import InterpolationMode
+
+
+class BaseProcessor:
+    """Pass-through processor that subclasses specialise via ``transform``."""
+
+    def __init__(self):
+        # Identity by default; subclasses assign their own callable.
+        self.transform = lambda item: item
+
+    def __call__(self, item):
+        """Apply the configured transform to ``item``."""
+        return self.transform(item)
+
+    @classmethod
+    def from_config(cls, cfg=None):
+        """Create an instance; ``cfg`` is accepted but not consulted here."""
+        return cls()
+
+    def build(self, **kwargs):
+        """Wrap ``kwargs`` in an OmegaConf config and delegate to ``from_config``."""
+        return self.from_config(OmegaConf.create(kwargs))
+
+
+class BlipImageBaseProcessor(BaseProcessor):
+    """Base image processor that owns the channel-wise normalisation step.
+
+    ``mean`` and ``std`` fall back to the defaults below when left as None.
+    """
+
+    def __init__(self, mean=None, std=None):
+        # Initialise the base class so ``self.transform`` always exists;
+        # without it a direct instance fails in ``__call__``.
+        super().__init__()
+        if mean is None:
+            mean = (0.48145466, 0.4578275, 0.40821073)
+        if std is None:
+            std = (0.26862954, 0.26130258, 0.27577711)
+
+        self.normalize = transforms.Normalize(mean, std)
+
+
+class Blip2ImageEvalProcessor(BlipImageBaseProcessor):
+    """Evaluation-time image pipeline: bicubic resize, to-tensor, normalise."""
+
+    def __init__(self, image_size=224, mean=None, std=None):
+        super().__init__(mean=mean, std=std)
+
+        resize = transforms.Resize(
+            (image_size, image_size),
+            interpolation=InterpolationMode.BICUBIC,
+        )
+        steps = [resize, transforms.ToTensor(), self.normalize]
+        self.transform = transforms.Compose(steps)
+
+    def __call__(self, item):
+        """Run ``item`` through the composed transform."""
+        return self.transform(item)
+
+    @classmethod
+    def from_config(cls, cfg=None):
+        """Build a processor from ``cfg`` keys image_size / mean / std."""
+        cfg = OmegaConf.create() if cfg is None else cfg
+        return cls(
+            image_size=cfg.get("image_size", 224),
+            mean=cfg.get("mean", None),
+            std=cfg.get("std", None),
+        )
--- a/apps/language_models/src/pipelines/minigpt4_utils/configs/cc_sbu_align.yaml
+++ b/apps/language_models/src/pipelines/minigpt4_utils/configs/cc_sbu_align.yaml
@@ -0,0 +1,5 @@
+datasets:
+  cc_sbu_align:
+    data_type: images
+    build_info:
+      storage: /path/to/cc_sbu_align/
--- a/apps/language_models/src/pipelines/minigpt4_utils/configs/minigpt4.yaml
+++ b/apps/language_models/src/pipelines/minigpt4_utils/configs/minigpt4.yaml
@@ -0,0 +1,33 @@
+model:
+  arch: mini_gpt4
+
+  # vit encoder
+  image_size: 224
+  drop_path_rate: 0
+  use_grad_checkpoint: False
+  vit_precision: "fp16"
+  freeze_vit: True
+  freeze_qformer: True
+
+  # Q-Former
+  num_query_token: 32
+
+  # Vicuna
+  llama_model: "lmsys/vicuna-7b-v1.3"
+
+  # generation configs
+  prompt: ""
+
+preprocess:
+    vis_processor:
+        train:
+          name: "blip2_image_train"
+          image_size: 224
+        eval:
+          name: "blip2_image_eval"
+          image_size: 224
+    text_processor:
+        train:
+          name: "blip_caption"
+        eval:
+          name: "blip_caption"
--- a/apps/language_models/src/pipelines/minigpt4_utils/configs/minigpt4_eval.yaml
+++ b/apps/language_models/src/pipelines/minigpt4_utils/configs/minigpt4_eval.yaml
@@ -0,0 +1,25 @@
+model:
+  arch: mini_gpt4
+  model_type: pretrain_vicuna
+  freeze_vit: True
+  freeze_qformer: True
+  max_txt_len: 160
+  end_sym: "###"
+  low_resource: False
+  prompt_path: "apps/language_models/src/pipelines/minigpt4_utils/prompts/alignment.txt"
+  prompt_template: '###Human: {} ###Assistant: '
+  ckpt: 'prerained_minigpt4_7b.pth'
+
+
+datasets:
+  cc_sbu_align:
+    vis_processor:
+      train:
+        name: "blip2_image_eval"
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+
+run:
+  task: image_text_pretrain
--- a/apps/language_models/src/pipelines/minigpt4_utils/eva_vit.py
+++ b/apps/language_models/src/pipelines/minigpt4_utils/eva_vit.py
@@ -0,0 +1,629 @@
+# Based on EVA, BEIT, timm and DeiT code bases
+# https://github.com/baaivision/EVA
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm
+# https://github.com/microsoft/unilm/tree/master/beit
+# https://github.com/facebookresearch/deit/
+# https://github.com/facebookresearch/dino
+# --------------------------------------------------------'
+import math
+import requests
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from timm.models.layers import drop_path, to_2tuple, trunc_normal_
+
+
+def _cfg(url="", **kwargs):
+    """Return the default model config dict, with ``kwargs`` layered on top."""
+    config = dict(
+        url=url,
+        num_classes=1000,
+        input_size=(3, 224, 224),
+        pool_size=None,
+        crop_pct=0.9,
+        interpolation="bicubic",
+        mean=(0.5, 0.5, 0.5),
+        std=(0.5, 0.5, 0.5),
+    )
+    # Caller-supplied entries override defaults and append any new keys.
+    config.update(kwargs)
+    return config
+
+
+class DropPath(nn.Module):
+    """Stochastic-depth wrapper: drops whole paths per sample in residual branches."""
+
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        # Delegates to drop_path with this module's probability and training flag.
+        return drop_path(x, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class Mlp(nn.Module):
+    """Two-layer feed-forward block: fc1 -> activation -> fc2 -> dropout."""
+
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        drop=0.0,
+    ):
+        super().__init__()
+        # Unset (falsy) widths default to the input width.
+        out_features = out_features if out_features else in_features
+        hidden_features = hidden_features if hidden_features else in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        hidden = self.act(self.fc1(x))
+        # Dropout is applied only after fc2; the dropout right after the
+        # activation is disabled, following the original BERT implementation.
+        return self.drop(self.fc2(hidden))
+
+
+# Multi-head self-attention with an optional learned relative position bias
+# (enabled when window_size is given) and an optional externally supplied
+# bias (rel_pos_bias argument of forward).
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim,
+        num_heads=8,
+        qkv_bias=False,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        window_size=None,
+        attn_head_dim=None,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        # Per-head width defaults to dim // num_heads unless attn_head_dim
+        # overrides it.
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        self.scale = qk_scale or head_dim**-0.5
+
+        # The fused projection itself carries no bias; when qkv_bias is set,
+        # separate q and v biases are kept and assembled in forward().
+        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+        if qkv_bias:
+            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+        else:
+            self.q_bias = None
+            self.v_bias = None
+
+        if window_size:
+            self.window_size = window_size
+            self.num_relative_distance = (2 * window_size[0] - 1) * (
+                2 * window_size[1] - 1
+            ) + 3
+            self.relative_position_bias_table = nn.Parameter(
+                torch.zeros(self.num_relative_distance, num_heads)
+            )  # 2*Wh-1 * 2*Ww-1, nH
+            # cls to token & token 2 cls & cls to cls
+
+            # get pair-wise relative position index for each token inside the window
+            coords_h = torch.arange(window_size[0])
+            coords_w = torch.arange(window_size[1])
+            coords = torch.stack(
+                torch.meshgrid([coords_h, coords_w])
+            )  # 2, Wh, Ww
+            coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+            relative_coords = (
+                coords_flatten[:, :, None] - coords_flatten[:, None, :]
+            )  # 2, Wh*Ww, Wh*Ww
+            relative_coords = relative_coords.permute(
+                1, 2, 0
+            ).contiguous()  # Wh*Ww, Wh*Ww, 2
+            relative_coords[:, :, 0] += (
+                window_size[0] - 1
+            )  # shift to start from 0
+            relative_coords[:, :, 1] += window_size[1] - 1
+            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+            # One extra row and column (index 0) is reserved on top of the
+            # Wh*Ww grid positions; it is filled with the three extra table
+            # entries below.
+            relative_position_index = torch.zeros(
+                size=(window_size[0] * window_size[1] + 1,) * 2,
+                dtype=relative_coords.dtype,
+            )
+            relative_position_index[1:, 1:] = relative_coords.sum(
+                -1
+            )  # Wh*Ww, Wh*Ww
+            relative_position_index[0, 0:] = self.num_relative_distance - 3
+            relative_position_index[0:, 0] = self.num_relative_distance - 2
+            relative_position_index[0, 0] = self.num_relative_distance - 1
+
+            # Stored as a buffer, not a parameter.
+            self.register_buffer(
+                "relative_position_index", relative_position_index
+            )
+        else:
+            self.window_size = None
+            self.relative_position_bias_table = None
+            self.relative_position_index = None
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, rel_pos_bias=None):
+        B, N, C = x.shape
+        qkv_bias = None
+        if self.q_bias is not None:
+            # Bias layout follows the fused q/k/v output: learned q bias,
+            # zeros for k, learned v bias.
+            qkv_bias = torch.cat(
+                (
+                    self.q_bias,
+                    torch.zeros_like(self.v_bias, requires_grad=False),
+                    self.v_bias,
+                )
+            )
+        # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+        # Reshaped and permuted to (3, B, num_heads, N, head_dim).
+        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = (
+            qkv[0],
+            qkv[1],
+            qkv[2],
+        )  # make torchscript happy (cannot use tensor as tuple)
+
+        # Scaled dot-product scores.
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+
+        if self.relative_position_bias_table is not None:
+            # Look up this module's own learned bias and add it to every
+            # batch element.
+            relative_position_bias = self.relative_position_bias_table[
+                self.relative_position_index.view(-1)
+            ].view(
+                self.window_size[0] * self.window_size[1] + 1,
+                self.window_size[0] * self.window_size[1] + 1,
+                -1,
+            )  # Wh*Ww,Wh*Ww,nH
+            relative_position_bias = relative_position_bias.permute(
+                2, 0, 1
+            ).contiguous()  # nH, Wh*Ww, Wh*Ww
+            attn = attn + relative_position_bias.unsqueeze(0)
+
+        # An externally computed bias is added on top when provided.
+        if rel_pos_bias is not None:
+            attn = attn + rel_pos_bias
+
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        # Weighted sum of values, heads merged back, then output projection.
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Module):
+    """Pre-norm transformer block: attention and MLP, each on a residual path.
+
+    When ``init_values`` is a positive number, each branch output is scaled by
+    a learnable per-channel vector (``gamma_1`` / ``gamma_2``).
+    """
+
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        qk_scale=None,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        init_values=None,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+        window_size=None,
+        attn_head_dim=None,
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        attention_options = dict(
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            window_size=window_size,
+            attn_head_dim=attn_head_dim,
+        )
+        self.attn = Attention(dim, **attention_options)
+        # Stochastic depth is only instantiated for a positive rate.
+        if drop_path > 0.0:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            act_layer=act_layer,
+            drop=drop,
+        )
+
+        use_layer_scale = init_values is not None and init_values > 0
+        if use_layer_scale:
+            self.gamma_1 = nn.Parameter(
+                init_values * torch.ones((dim)), requires_grad=True
+            )
+            self.gamma_2 = nn.Parameter(
+                init_values * torch.ones((dim)), requires_grad=True
+            )
+        else:
+            self.gamma_1 = None
+            self.gamma_2 = None
+
+    def forward(self, x, rel_pos_bias=None):
+        # The presence of gamma_1 alone decides whether both branches are scaled.
+        scaled = self.gamma_1 is not None
+
+        attn_out = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
+        if scaled:
+            attn_out = self.gamma_1 * attn_out
+        x = x + self.drop_path(attn_out)
+
+        mlp_out = self.mlp(self.norm2(x))
+        if scaled:
+            mlp_out = self.gamma_2 * mlp_out
+        x = x + self.drop_path(mlp_out)
+        return x
+
+
+class PatchEmbed(nn.Module):
+    """Image to Patch Embedding"""
+
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        # Number of whole patches along each image axis.
+        rows = img_size[0] // patch_size[0]
+        cols = img_size[1] // patch_size[1]
+        self.patch_shape = (rows, cols)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_patches = cols * rows
+
+        # A strided convolution cuts the image into patches and projects each
+        # one to embed_dim in a single step.
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
+        )
+
+    def forward(self, x, **kwargs):
+        _, _, H, W = x.shape
+        # FIXME look at relaxing size constraints
+        size_matches = H == self.img_size[0] and W == self.img_size[1]
+        assert (
+            size_matches
+        ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+        patches = self.proj(x)
+        # Collapse the spatial grid into a token axis: (B, tokens, embed_dim).
+        return patches.flatten(2).transpose(1, 2)
+
+
+# Learned relative position bias table plus the index buffer used to look it
+# up. forward() takes no input and returns the bias arranged per head.
+class RelativePositionBias(nn.Module):
+    def __init__(self, window_size, num_heads):
+        super().__init__()
+        self.window_size = window_size
+        self.num_relative_distance = (2 * window_size[0] - 1) * (
+            2 * window_size[1] - 1
+        ) + 3
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros(self.num_relative_distance, num_heads)
+        )  # 2*Wh-1 * 2*Ww-1, nH
+        # cls to token & token 2 cls & cls to cls
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(window_size[0])
+        coords_w = torch.arange(window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = (
+            coords_flatten[:, :, None] - coords_flatten[:, None, :]
+        )  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(
+            1, 2, 0
+        ).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+        # One extra row and column (index 0) is reserved on top of the Wh*Ww
+        # grid positions; it is filled with the three extra table entries.
+        relative_position_index = torch.zeros(
+            size=(window_size[0] * window_size[1] + 1,) * 2,
+            dtype=relative_coords.dtype,
+        )
+        relative_position_index[1:, 1:] = relative_coords.sum(
+            -1
+        )  # Wh*Ww, Wh*Ww
+        relative_position_index[0, 0:] = self.num_relative_distance - 3
+        relative_position_index[0:, 0] = self.num_relative_distance - 2
+        relative_position_index[0, 0] = self.num_relative_distance - 1
+
+        # Stored as a buffer, not a parameter.
+        self.register_buffer(
+            "relative_position_index", relative_position_index
+        )
+
+        # trunc_normal_(self.relative_position_bias_table, std=.02)
+
+    def forward(self):
+        # Gather one table row per (query, key) index pair, then move the
+        # head axis to the front.
+        relative_position_bias = self.relative_position_bias_table[
+            self.relative_position_index.view(-1)
+        ].view(
+            self.window_size[0] * self.window_size[1] + 1,
+            self.window_size[0] * self.window_size[1] + 1,
+            -1,
+        )  # Wh*Ww,Wh*Ww,nH
+        return relative_position_bias.permute(
+            2, 0, 1
+        ).contiguous()  # nH, Wh*Ww, Wh*Ww
+
+
+class VisionTransformer(nn.Module):
+    """Vision Transformer with support for patch or hybrid CNN input stage.
+
+    This variant is used as a feature extractor: ``__init__`` builds the
+    patch embedding, class token, optional position embeddings and the
+    stack of blocks, but no final norm or classification head, so
+    ``forward`` returns the per-token output of the last block.
+
+    ``num_classes`` is only stored; ``use_mean_pooling`` and ``init_scale``
+    are accepted for signature compatibility and are not used here.
+    """
+
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        num_classes=1000,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        qk_scale=None,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.0,
+        norm_layer=nn.LayerNorm,
+        init_values=None,
+        use_abs_pos_emb=True,
+        use_rel_pos_bias=False,
+        use_shared_rel_pos_bias=False,
+        use_mean_pooling=True,
+        init_scale=0.001,
+        use_checkpoint=False,
+    ):
+        """Build the patch embedding, tokens and ``depth`` blocks.
+
+        Args:
+            img_size, patch_size, in_chans, embed_dim: forwarded to
+                ``PatchEmbed``.
+            depth: number of ``Block`` layers.
+            num_heads, mlp_ratio, qkv_bias, qk_scale, attn_drop_rate,
+                norm_layer, init_values: forwarded to every ``Block``.
+            drop_rate: dropout applied after the position embedding and
+                forwarded to every ``Block``.
+            drop_path_rate: upper end of the per-block drop-path schedule
+                (linearly spaced from 0 across the blocks).
+            use_abs_pos_emb: create a learnable ``pos_embed`` parameter.
+            use_rel_pos_bias: give each block its own window size.
+            use_shared_rel_pos_bias: create one ``RelativePositionBias``
+                whose output is passed to every block.
+            use_checkpoint: run blocks through ``checkpoint.checkpoint``
+                in ``forward_features``.
+        """
+        super().__init__()
+        self.image_size = img_size
+        self.num_classes = num_classes
+        self.num_features = (
+            self.embed_dim
+        ) = embed_dim  # num_features for consistency with other models
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        if use_abs_pos_emb:
+            # One position per patch plus one for the class token.
+            self.pos_embed = nn.Parameter(
+                torch.zeros(1, num_patches + 1, embed_dim)
+            )
+        else:
+            self.pos_embed = None
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        if use_shared_rel_pos_bias:
+            self.rel_pos_bias = RelativePositionBias(
+                window_size=self.patch_embed.patch_shape, num_heads=num_heads
+            )
+        else:
+            self.rel_pos_bias = None
+        self.use_checkpoint = use_checkpoint
+
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, depth)
+        ]  # stochastic depth decay rule
+        self.use_rel_pos_bias = use_rel_pos_bias
+        self.blocks = nn.ModuleList(
+            [
+                Block(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop_rate,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[i],
+                    norm_layer=norm_layer,
+                    init_values=init_values,
+                    window_size=self.patch_embed.patch_shape
+                    if use_rel_pos_bias
+                    else None,
+                )
+                for i in range(depth)
+            ]
+        )
+        # NOTE: the final norm / fc_norm / classification head are
+        # intentionally not created here; see reset_classifier().
+
+        if self.pos_embed is not None:
+            trunc_normal_(self.pos_embed, std=0.02)
+        trunc_normal_(self.cls_token, std=0.02)
+        self.apply(self._init_weights)
+        self.fix_init_weight()
+
+    def fix_init_weight(self):
+        """Scale each block's output projections down by sqrt(2 * layer)."""
+
+        def rescale(param, layer_id):
+            param.div_(math.sqrt(2.0 * layer_id))
+
+        # Layers are numbered from 1 so the first divisor is sqrt(2).
+        for layer_id, layer in enumerate(self.blocks):
+            rescale(layer.attn.proj.weight.data, layer_id + 1)
+            rescale(layer.mlp.fc2.weight.data, layer_id + 1)
+
+    def _init_weights(self, m):
+        """Initialise Linear and LayerNorm submodules (used via ``apply``)."""
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=0.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def get_classifier(self):
+        """Return the classification head, or ``None`` if none was set.
+
+        ``__init__`` does not create a head; one only exists after
+        ``reset_classifier`` has been called.
+        """
+        return getattr(self, "head", None)
+
+    def reset_classifier(self, num_classes, global_pool=""):
+        """Install a fresh head for ``num_classes`` outputs.
+
+        A non-positive ``num_classes`` installs ``nn.Identity``.
+        ``global_pool`` is accepted but unused.
+        """
+        self.num_classes = num_classes
+        self.head = (
+            nn.Linear(self.embed_dim, num_classes)
+            if num_classes > 0
+            else nn.Identity()
+        )
+
+    def _embed_tokens(self, x):
+        """Embed patches, prepend the class token, add positions, dropout."""
+        x = self.patch_embed(x)
+        batch_size, _, _ = x.size()
+
+        cls_tokens = self.cls_token.expand(
+            batch_size, -1, -1
+        )  # stole cls_tokens impl from Phil Wang, thanks
+        x = torch.cat((cls_tokens, x), dim=1)
+        if self.pos_embed is not None:
+            x = x + self.pos_embed
+        return self.pos_drop(x)
+
+    def _shared_rel_pos_bias(self):
+        """Evaluate the shared relative position bias, if one is configured."""
+        if self.rel_pos_bias is None:
+            return None
+        return self.rel_pos_bias()
+
+    def forward_features(self, x):
+        """Run ``x`` through every block and return the last block's output."""
+        x = self._embed_tokens(x)
+        rel_pos_bias = self._shared_rel_pos_bias()
+        for blk in self.blocks:
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x, rel_pos_bias)
+            else:
+                x = blk(x, rel_pos_bias)
+        return x
+
+    def forward(self, x):
+        """Return ``forward_features(x)``; no head is applied."""
+        return self.forward_features(x)
+
+    def get_intermediate_layers(self, x):
+        """Return a list with the output of every block, in order.
+
+        Unlike ``forward_features`` this never uses checkpointing.
+        """
+        x = self._embed_tokens(x)
+        rel_pos_bias = self._shared_rel_pos_bias()
+
+        features = []
+        for blk in self.blocks:
+            x = blk(x, rel_pos_bias)
+            features.append(x)
+
+        return features
+
+
+def interpolate_pos_embed(model, checkpoint_model):
+    """Resize a checkpoint's ``pos_embed`` in place to fit ``model``.
+
+    The position tokens are treated as a square grid and resampled with
+    bicubic interpolation when the checkpoint grid side differs from the
+    model's; the leading extra tokens (class / dist tokens) are copied
+    unchanged.
+
+    Args:
+        model: object exposing ``patch_embed.num_patches`` and ``pos_embed``.
+        checkpoint_model: state dict; its ``"pos_embed"`` entry is replaced
+            only when interpolation is needed.
+
+    Nothing happens when the checkpoint has no ``"pos_embed"`` entry or
+    the model has no absolute position embedding (``pos_embed`` is None).
+    """
+    if "pos_embed" not in checkpoint_model:
+        return
+    if getattr(model, "pos_embed", None) is None:
+        # Model was built without absolute position embeddings, so there
+        # is no target size to interpolate to.
+        return
+
+    pos_embed_checkpoint = checkpoint_model["pos_embed"].float()
+    embedding_size = pos_embed_checkpoint.shape[-1]
+    num_patches = model.patch_embed.num_patches
+    num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+    # height (== width) for the checkpoint position embedding
+    orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+    # height (== width) for the new position embedding
+    new_size = int(num_patches**0.5)
+    if orig_size == new_size:
+        return
+
+    print(
+        f"Position interpolate from {orig_size}x{orig_size} "
+        f"to {new_size}x{new_size}"
+    )
+    # class_token and dist_token are kept unchanged
+    extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+    # only the position tokens are interpolated
+    pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+    # (1, N, C) -> (1, C, H, W) as expected by interpolate
+    pos_tokens = pos_tokens.reshape(
+        -1, orig_size, orig_size, embedding_size
+    ).permute(0, 3, 1, 2)
+    pos_tokens = torch.nn.functional.interpolate(
+        pos_tokens,
+        size=(new_size, new_size),
+        mode="bicubic",
+        align_corners=False,
+    )
+    # back to (1, H*W, C)
+    pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+    checkpoint_model["pos_embed"] = torch.cat(
+        (extra_tokens, pos_tokens), dim=1
+    )
+
+
+def convert_weights_to_fp16(model: nn.Module):
+    """Visit every Conv1d/Conv2d/Linear submodule of ``model``.
+
+    NOTE: despite the name, the ``.half()`` calls are commented out, so
+    weights and biases are reassigned to themselves and keep their dtype.
+    NOTE(review): presumably disabled on purpose to keep the model in its
+    original precision - confirm before re-enabling.
+    """
+
+    def _convert_weights_to_fp16(l):
+        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+            # fp16 conversion disabled:
+            # l.weight.data = l.weight.data.half()
+            l.weight.data = l.weight.data
+            if l.bias is not None:
+                # fp16 conversion disabled:
+                # l.bias.data = l.bias.data.half()
+                l.bias.data = l.bias.data
+
+    # Attention-specific conversion, also disabled:
+    #         if isinstance(l, (nn.MultiheadAttention, Attention)):
+    #             for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
+    #                 tensor = getattr(l, attr)
+    #                 if tensor is not None:
+    #                     tensor.data = tensor.data.half()
+
+    # apply() calls the helper on every submodule and on the model itself.
+    model.apply(_convert_weights_to_fp16)
+
+
+def create_eva_vit_g(
+    img_size=224, drop_path_rate=0.4, use_checkpoint=False, precision="fp16"
+):
+    """Build the EVA ViT-g encoder and load its pretrained weights.
+
+    The checkpoint is downloaded to ``eva_vit_g.pth`` in the current
+    working directory on first use and reused on later calls.
+
+    Args:
+        img_size: input image size passed to ``VisionTransformer``.
+        drop_path_rate: drop-path rate passed to ``VisionTransformer``.
+        use_checkpoint: passed to ``VisionTransformer``.
+        precision: when ``"fp16"`` the model is passed through
+            ``convert_weights_to_fp16``.
+
+    Returns:
+        The ``VisionTransformer`` with the checkpoint loaded non-strictly.
+
+    Raises:
+        requests.HTTPError: if the checkpoint download is rejected.
+    """
+    import os
+
+    model = VisionTransformer(
+        img_size=img_size,
+        patch_size=14,
+        use_mean_pooling=False,
+        embed_dim=1408,
+        depth=39,
+        num_heads=1408 // 88,
+        mlp_ratio=4.3637,
+        qkv_bias=True,
+        drop_path_rate=drop_path_rate,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        use_checkpoint=use_checkpoint,
+    )
+    url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/eva_vit_g.pth"
+
+    local_filename = "eva_vit_g.pth"
+    if not os.path.isfile(local_filename):
+        # Stream to a temporary name so an interrupted download is never
+        # mistaken for a complete checkpoint on the next run.
+        partial_filename = local_filename + ".part"
+        with requests.get(url, stream=True, timeout=60) as response:
+            # Fail loudly instead of falling through to torch.load with a
+            # missing file.
+            response.raise_for_status()
+            with open(partial_filename, "wb") as f:
+                for chunk in response.iter_content(chunk_size=1 << 20):
+                    f.write(chunk)
+        os.replace(partial_filename, local_filename)
+        print("File downloaded successfully.")
+    # NOTE: torch.load unpickles the file; only load checkpoints from the
+    # trusted URL above.
+    state_dict = torch.load(local_filename, map_location="cpu")
+    interpolate_pos_embed(model, state_dict)
+
+    # strict=False: keys that do not match the model are tolerated.
+    model.load_state_dict(state_dict, strict=False)
+
+    if precision == "fp16":
+        convert_weights_to_fp16(model)
+    return model
--- a/apps/language_models/src/pipelines/minigpt4_utils/prompts/alignment.txt
+++ b/apps/language_models/src/pipelines/minigpt4_utils/prompts/alignment.txt
@@ -0,0 +1,4 @@
+<Img><ImageHere></Img> Describe this image in detail.
+<Img><ImageHere></Img> Take a look at this image and describe what you notice.
+<Img><ImageHere></Img> Please provide a detailed description of the picture.
+<Img><ImageHere></Img> Could you describe the contents of this image for me?
--- a/apps/language_models/utils.py
+++ b/apps/language_models/utils.py
@@ -3,6 +3,7 @@ from torch.fx.experimental.proxy_tensor import make_fx
 from torch._decomp import get_decompositions
 from typing import List
 from pathlib import Path
+from shark.shark_downloader import download_public_file


 # expects a Path / str as arg
@@ -17,9 +18,23 @@ def get_vmfb_from_path(vmfb_path, device, mlir_dialect):
        return None

    print("Loading vmfb from: ", vmfb_path)
+    print("Device from get_vmfb_from_path - ", device)
    shark_module = SharkInference(
        None, device=device, mlir_dialect=mlir_dialect
    )
    shark_module.load_module(vmfb_path)
    print("Successfully loaded vmfb")
    return shark_module
+
+
+def get_vmfb_from_config(
+    shark_container, model, precision, device, vmfb_path, padding=None
+):
+    """Download a vmfb from the shark_tank bucket and load it.
+
+    The remote object is
+    ``gs://shark_tank/<shark_container>/<model>_<precision>_<device>``
+    with ``_<padding>`` appended when ``padding`` is truthy, followed by
+    ``.vmfb``.
+
+    Args:
+        shark_container: folder inside the ``shark_tank`` bucket.
+        model, precision, device: components of the remote file name;
+            ``device`` is also used to load the module.
+        vmfb_path: local destination, as a ``Path`` or ``str``.
+        padding: optional suffix component of the remote file name.
+
+    Returns:
+        Whatever ``get_vmfb_from_path`` returns for the downloaded file.
+    """
+    # Accept plain strings too; the original required a Path here.
+    vmfb_path = Path(vmfb_path)
+
+    vmfb_name = f"{model}_{precision}_{device}"
+    if padding:
+        vmfb_name += f"_{padding}"
+    vmfb_url = f"gs://shark_tank/{shark_container}/{vmfb_name}.vmfb"
+
+    download_public_file(vmfb_url, vmfb_path.absolute(), single_file=True)
+    return get_vmfb_from_path(vmfb_path, device, "tm_tensor")
--- a/apps/stable_diffusion/shark_sd.spec
+++ b/apps/stable_diffusion/shark_sd.spec
@@ -1,5 +1,5 @@
 # -*- mode: python ; coding: utf-8 -*-
-from apps.stable_diffusion.shark_studio_imports import datas, hiddenimports
+from apps.stable_diffusion.shark_studio_imports import pathex, datas, hiddenimports

 binaries = []

@@ -7,7 +7,7 @@ block_cipher = None

 a = Analysis(
    ['web/index.py'],
-    pathex=['.'],
+    pathex=pathex,
    binaries=binaries,
    datas=datas,
    hiddenimports=hiddenimports,
--- a/apps/stable_diffusion/shark_studio_imports.py
+++ b/apps/stable_diffusion/shark_studio_imports.py
@@ -6,6 +6,13 @@ import sys

 sys.setrecursionlimit(sys.getrecursionlimit() * 5)

+# python path for pyinstaller
+pathex = [
+    ".",
+    "./apps/language_models/langchain",
+    "./apps/language_models/src/pipelines/minigpt4_utils",
+]
+
 # datafiles for pyinstaller
 datas = []
 datas += collect_data_files("torch")
@@ -22,6 +29,7 @@ datas += copy_metadata("omegaconf")
 datas += copy_metadata("safetensors")
 datas += copy_metadata("Pillow")
 datas += copy_metadata("sentencepiece")
+datas += copy_metadata("pyyaml")
 datas += collect_data_files("tokenizers")
 datas += collect_data_files("tiktoken")
 datas += collect_data_files("accelerate")
@@ -34,13 +42,15 @@ datas += collect_data_files("gradio")
 datas += collect_data_files("gradio_client")
 datas += collect_data_files("iree")
 datas += collect_data_files("google_cloud_storage")
-datas += collect_data_files("shark")
+datas += collect_data_files("shark", include_py_files=True)
+datas += collect_data_files("timm", include_py_files=True)
 datas += collect_data_files("tkinter")
 datas += collect_data_files("webview")
 datas += collect_data_files("sentencepiece")
 datas += collect_data_files("jsonschema")
 datas += collect_data_files("jsonschema_specifications")
 datas += collect_data_files("cpuinfo")
+datas += collect_data_files("langchain")
 datas += [
    ("src/utils/resources/prompts.json", "resources"),
    ("src/utils/resources/model_db.json", "resources"),
@@ -48,6 +58,14 @@ datas += [
    ("src/utils/resources/base_model.json", "resources"),
    ("web/ui/css/*", "ui/css"),
    ("web/ui/logos/*", "logos"),
+    (
+        "../language_models/src/pipelines/minigpt4_utils/configs/*",
+        "minigpt4_utils/configs",
+    ),
+    (
+        "../language_models/src/pipelines/minigpt4_utils/prompts/*",
+        "minigpt4_utils/prompts",
+    ),
 ]


--- a/apps/stable_diffusion/src/utils/init.py
+++ b/apps/stable_diffusion/src/utils/init.py
@@ -28,6 +28,7 @@ from apps.stable_diffusion.src.utils.utils import (
    fetch_and_update_base_model_id,
    get_path_to_diffusers_checkpoint,
    sanitize_seed,
+    parse_seed_input,
    batch_seeds,
    get_path_stem,
    get_extended_name,
--- a/apps/stable_diffusion/src/utils/profiler.py
+++ b/apps/stable_diffusion/src/utils/profiler.py
@@ -3,7 +3,9 @@ from apps.stable_diffusion.src.utils.stable_args import args

 # Helper function to profile the vulkan device.
 def start_profiling(file_path="foo.rdc", profiling_mode="queue"):
-    if args.vulkan_debug_utils and "vulkan" in args.device:
+    from shark.parser import shark_args
+
+    if shark_args.vulkan_debug_utils and "vulkan" in args.device:
        import iree

        print(f"Profiling and saving to {file_path}.")
--- a/apps/stable_diffusion/src/utils/stable_args.py
+++ b/apps/stable_diffusion/src/utils/stable_args.py
@@ -66,9 +66,9 @@ p.add_argument(

 p.add_argument(
    "--seed",
-    type=int,
+    type=str,
    default=-1,
-    help="The seed to use. -1 for a random one.",
+    help="The seed or list of seeds to use. -1 for a random one.",
 )

 p.add_argument(
@@ -400,6 +400,13 @@ p.add_argument(
    help="Load and unload models for low VRAM.",
 )

+p.add_argument(
+    "--hf_auth_token",
+    type=str,
+    default=None,
+    help="Specify your own huggingface authentication tokens for models like Llama2.",
+)
+
 ##############################################################################
 # IREE - Vulkan supported flags
 ##############################################################################
@@ -418,27 +425,6 @@ p.add_argument(
    help="Specify target triple for metal.",
 )

-p.add_argument(
-    "--vulkan_debug_utils",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="Profiles vulkan device and collects the .rdc info.",
-)
-
-p.add_argument(
-    "--vulkan_large_heap_block_size",
-    default="2073741824",
-    help="Flag for setting VMA preferredLargeHeapBlockSize for "
-    "vulkan device, default is 4G.",
-)
-
-p.add_argument(
-    "--vulkan_validation_layers",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="Flag for disabling vulkan validation layers when benchmarking.",
-)
-
 ##############################################################################
 # Misc. Debug and Optimization flags
 ##############################################################################
@@ -533,6 +519,12 @@ p.add_argument(
    "in shark importer. Does nothing if import_mlir is false (the default).",
 )

+p.add_argument(
+    "--iree_constant_folding",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Controls constant folding in iree-compile for all SD models.",
+)

 ##############################################################################
 # Web UI flags
@@ -648,6 +640,16 @@ p.add_argument(
    help="Op to be optimized, options are matmul, bmm, conv and all.",
 )

+##############################################################################
+# DocuChat Flags
+##############################################################################
+
+p.add_argument(
+    "--run_docuchat_web",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Specifies whether the docuchat's web version is running or not.",
+)

 args, unknown = p.parse_known_args()
 if args.import_debug:
--- a/apps/stable_diffusion/src/utils/utils.py
+++ b/apps/stable_diffusion/src/utils/utils.py
@@ -22,6 +22,7 @@ from shark.shark_importer import import_with_fx
 from shark.iree_utils.vulkan_utils import (
    set_iree_vulkan_runtime_flags,
    get_vulkan_target_triple,
+    get_iree_vulkan_runtime_flags,
 )
 from shark.iree_utils.metal_utils import get_metal_target_triple
 from shark.iree_utils.gpu_utils import get_cuda_sm_cc
@@ -183,10 +184,7 @@ def compile_through_fx(


 def set_iree_runtime_flags():
-    vulkan_runtime_flags = [
-        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
-        f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
-    ]
+    vulkan_runtime_flags = get_iree_vulkan_runtime_flags()
    if args.enable_rgp:
        vulkan_runtime_flags += [
            f"--enable_rgp=true",
@@ -461,7 +459,12 @@ def get_available_devices():
                device_name = (
                    cpu_name if device["name"] == "default" else device["name"]
                )
-                device_list.append(f"{device_name} => {driver_name}://{i}")
+                if "local" in driver_name:
+                    device_list.append(
+                        f"{device_name} => {driver_name.replace('local', 'cpu')}"
+                    )
+                else:
+                    device_list.append(f"{device_name} => {driver_name}://{i}")
        return device_list

    set_iree_runtime_flags()
@@ -497,6 +500,12 @@ def get_opt_flags(model, precision="fp16"):
            f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
        )

+    if args.iree_constant_folding == False:
+        iree_flags.append("--iree-opt-const-expr-hoisting=False")
+        iree_flags.append(
+            "--iree-codegen-linalg-max-constant-fold-elements=9223372036854775807"
+        )
+
    # Disable bindings fusion to work with moltenVK.
    if sys.platform == "darwin":
        iree_flags.append("-iree-stream-fuse-binding=false")
@@ -727,7 +736,8 @@ def fetch_and_update_base_model_id(model_to_run, base_model=""):

 # Generate and return a new seed if the provided one is not in the
 # supported range (including -1)
-def sanitize_seed(seed):
+def sanitize_seed(seed: int | str):
+    seed = int(seed)
    uint32_info = np.iinfo(np.uint32)
    uint32_min, uint32_max = uint32_info.min, uint32_info.max
    if seed < uint32_min or seed >= uint32_max:
@@ -735,20 +745,48 @@ def sanitize_seed(seed):
    return seed


-# Generate a set of seeds, using as the first seed of the set,
-# optionally using it as the rng seed for subsequent seeds in the set
-def batch_seeds(seed, batch_count, repeatable=False):
-    # use the passed seed as the initial seed of the batch
-    seeds = [sanitize_seed(seed)]
+# take a seed expression in an input format and convert it to
+# a list of integers, where possible
+def parse_seed_input(seed_input: str | list | int):
+    """Convert a seed expression into a list of integer seeds.
+
+    Args:
+        seed_input: an int, a list of ints, or a string holding either of
+            those in JSON form (e.g. ``"42"`` or ``"[1, 2, 3]"``).
+
+    Returns:
+        A list of ints; a single seed is wrapped in a one-element list.
+
+    Raises:
+        TypeError: if the input is not an integer or a list of integers.
+            Booleans are rejected in both the scalar and the list form.
+    """
+    if isinstance(seed_input, str):
+        try:
+            seed_input = json.loads(seed_input)
+        except (ValueError, TypeError):
+            # Not valid JSON: fall through to the TypeError below.
+            seed_input = None
+
+    # bool is a subclass of int; exclude it so JSON "true"/"false" is not
+    # silently treated as seed 1/0 (the list branch already does this).
+    if isinstance(seed_input, int) and not isinstance(seed_input, bool):
+        return [seed_input]
+
+    if isinstance(seed_input, list) and all(
+        type(seed) is int for seed in seed_input
+    ):
+        return seed_input
+
+    raise TypeError(
+        "Seed input must be an integer or an array of integers in JSON format"
+    )
+
+
+# Generate a set of seeds from an input expression for batch_count batches,
+# optionally using that input as the rng seed for any randomly generated seeds.
+def batch_seeds(
+    seed_input: str | list | int, batch_count: int, repeatable=False
+):
+    # turn the input into a list if possible
+    seeds = parse_seed_input(seed_input)
+
+    # slice or pad the list to be of batch_count length
+    seeds = seeds[:batch_count] + [-1] * (batch_count - len(seeds))

    if repeatable:
-        # use the initial seed as the rng generator seed
+        # set seed for the rng based on what we have so far
        saved_random_state = random_getstate()
-        seed_random(seed)
+        if all(seed < 0 for seed in seeds):
+            seeds[0] = sanitize_seed(seeds[0])
+        seed_random(str(seeds))

-    # generate the additional seeds
-    for i in range(1, batch_count):
-        seeds.append(sanitize_seed(-1))
+    # generate any seeds that are unspecified
+    seeds = [sanitize_seed(seed) for seed in seeds]

    if repeatable:
        # reset the rng back to normal
--- a/apps/stable_diffusion/studio_bundle.spec
+++ b/apps/stable_diffusion/studio_bundle.spec
@@ -1,5 +1,5 @@
 # -*- mode: python ; coding: utf-8 -*-
-from apps.stable_diffusion.shark_studio_imports import datas, hiddenimports
+from apps.stable_diffusion.shark_studio_imports import pathex, datas, hiddenimports

 binaries = []

@@ -7,7 +7,7 @@ block_cipher = None

 a = Analysis(
    ['web\\index.py'],
-    pathex=['.'],
+    pathex=pathex,
    binaries=binaries,
    datas=datas,
    hiddenimports=hiddenimports,
--- a/apps/stable_diffusion/web/index.py
+++ b/apps/stable_diffusion/web/index.py
@@ -115,6 +115,8 @@ if __name__ == "__main__":
        txt2img_sendto_inpaint,
        txt2img_sendto_outpaint,
        txt2img_sendto_upscaler,
+        # h2ogpt_upload,
+        # h2ogpt_web,
        img2img_web,
        img2img_custom_model,
        img2img_hf_model_id,
@@ -153,6 +155,7 @@ if __name__ == "__main__":
        upscaler_sendto_outpaint,
        lora_train_web,
        model_web,
+        model_config_web,
        hf_models,
        modelmanager_sendto_txt2img,
        modelmanager_sendto_img2img,
@@ -160,6 +163,7 @@ if __name__ == "__main__":
        modelmanager_sendto_outpaint,
        modelmanager_sendto_upscaler,
        stablelm_chat,
+        minigpt4_web,
        outputgallery_web,
        outputgallery_tab_select,
        outputgallery_watch,
@@ -219,14 +223,18 @@ if __name__ == "__main__":
                outpaint_web.render()
            with gr.TabItem(label="Upscaler", id=4):
                upscaler_web.render()
-            with gr.TabItem(label="Model Manager", id=5):
+            with gr.TabItem(label="Model Manager", id=6):
                model_web.render()
-            with gr.TabItem(label="Chat Bot(Experimental)", id=6):
+            with gr.TabItem(label="Chat Bot(Experimental)", id=7):
                stablelm_chat.render()
-            with gr.TabItem(label="LoRA Training(Experimental)", id=7):
+            with gr.TabItem(label="Generate Sharding Config", id=8):
+                model_config_web.render()
+            with gr.TabItem(label="LoRA Training(Experimental)", id=9):
                lora_train_web.render()
+            with gr.TabItem(label="MultiModal (Experimental)", id=10):
+                minigpt4_web.render()
            if args.output_gallery:
-                with gr.TabItem(label="Output Gallery", id=8) as og_tab:
+                with gr.TabItem(label="Output Gallery", id=5) as og_tab:
                    outputgallery_web.render()

                # extra output gallery configuration
@@ -240,6 +248,10 @@ if __name__ == "__main__":
                        upscaler_status,
                    ]
                )
+            # with gr.TabItem(label="DocuChat Upload", id=11):
+            #    h2ogpt_upload.render()
+            # with gr.TabItem(label="DocuChat(Experimental)", id=12):
+            #    h2ogpt_web.render()

        # send to buttons
        register_button_click(
--- a/apps/stable_diffusion/web/ui/init.py
+++ b/apps/stable_diffusion/web/ui/init.py
@@ -78,6 +78,8 @@ from apps.stable_diffusion.web.ui.stablelm_ui import (
    stablelm_chat,
    llm_chat_api,
 )
+from apps.stable_diffusion.web.ui.generate_config import model_config_web
+from apps.stable_diffusion.web.ui.minigpt4_ui import minigpt4_web
 from apps.stable_diffusion.web.ui.outputgallery_ui import (
    outputgallery_web,
    outputgallery_tab_select,
--- a/apps/stable_diffusion/web/ui/css/sd_dark_theme.css
+++ b/apps/stable_diffusion/web/ui/css/sd_dark_theme.css
@@ -117,16 +117,12 @@ body {
    padding: 0 var(--size-4) !important;
 }

-.container {
-    background-color: black !important;
-    padding-top: var(--size-5) !important;
-}
-
 #ui_title {
    padding: var(--size-2) 0 0 var(--size-1);
 }

 #top_logo {
+    color: transparent;
    background-color: transparent;
    border-radius: 0 !important;
    border: 0;
--- a/apps/stable_diffusion/web/ui/generate_config.py
+++ b/apps/stable_diffusion/web/ui/generate_config.py
@@ -0,0 +1,41 @@
+import gradio as gr
+import torch
+from transformers import AutoTokenizer
+from apps.language_models.src.model_wrappers.vicuna_model import CombinedModel
+from shark.shark_generate_model_config import GenerateConfigFile
+
+
+def get_model_config():
+    """Generate the layer-split config for the Vicuna ``CombinedModel``.
+
+    Tokenizes a placeholder prompt to obtain example input ids, then hands
+    the model and that example input to ``GenerateConfigFile``.
+
+    Returns:
+        The result of ``GenerateConfigFile.split_into_layers()``.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(
+        "TheBloke/vicuna-7B-1.1-HF", use_fast=False
+    )
+    # Placeholder prompt of 17 "0" characters; only used to build inputs.
+    placeholder_prompt = "0" * 17
+    token_ids = tokenizer(placeholder_prompt, return_tensors="pt").input_ids
+    token_ids = torch.tensor(token_ids).reshape([1, 19])
+    example_inputs = (token_ids,)
+
+    config_generator = GenerateConfigFile(
+        CombinedModel(), 1, ["gpu_id"], example_inputs
+    )
+    return config_generator.split_into_layers()
+
+
+# UI for generating a model config: a model dropdown and a button that
+# fills a JSON viewer with the result of get_model_config().
+with gr.Blocks() as model_config_web:
+    with gr.Row():
+        # NOTE(review): the dropdown value is not passed to the click
+        # handler below (inputs=[]), so it currently has no effect.
+        hf_models = gr.Dropdown(
+            label="Model List",
+            choices=["Vicuna"],
+            value="Vicuna",
+            visible=True,
+        )
+        get_model_config_btn = gr.Button(value="Get Model Config")
+    # Receives the value returned by get_model_config.
+    json_view = gr.JSON()
+
+    # Clicking the button runs get_model_config with no inputs.
+    get_model_config_btn.click(
+        fn=get_model_config,
+        inputs=[],
+        outputs=[json_view],
+    )
--- a/apps/stable_diffusion/web/ui/h2ogpt.py
+++ b/apps/stable_diffusion/web/ui/h2ogpt.py
@@ -0,0 +1,348 @@
+import gradio as gr
+import torch
+import os
+from pathlib import Path
+from transformers import (
+    AutoModelForCausalLM,
+)
+from apps.stable_diffusion.web.ui.utils import available_devices
+
+from apps.language_models.langchain.enums import (
+    DocumentChoices,
+    LangChainAction,
+)
+import apps.language_models.langchain.gen as gen
+from gpt_langchain import (
+    path_to_docs,
+    create_or_update_db,
+)
+from apps.stable_diffusion.src import args
+
+
+def user(message, history):
+    """Record a new user turn.
+
+    Returns an empty string (to clear the message box) and a new history
+    list with ``[message, ""]`` appended; ``history`` itself is not mutated.
+    """
+    pending_turn = [message, ""]
+    updated_history = history + [pending_turn]
+    return "", updated_history
+
+
+# Module-level state shared across chat() calls.
+# NOTE(review): sharkModel is not referenced in the code visible here.
+sharkModel = 0
+# 0 is the "not loaded yet" sentinel that chat() checks before loading.
+h2ogpt_model = 0
+
+
+# NOTE: Each `model_name` should have its own start message
+# Prepended to the conversation by create_prompt().
+start_message = """
+    SHARK DocuChat
+    Chat with an AI, contextualized with provided files.
+"""
+
+
+def create_prompt(history):
+    """Build the prompt text from the start message and the chat history.
+
+    Every history item contributes its first two elements (user message
+    then reply) with no separator; the result is stripped of surrounding
+    whitespace.
+    """
+    turns = "".join(
+        text for item in history for text in (item[0], item[1])
+    )
+    return (start_message + turns).strip()
+
+
+def _resolve_shark_device(device):
+    """Map a UI device label to the SHARK device name used by DocuChat."""
+    if "cuda" in device:
+        return "cuda"
+    # Labels containing "sync" or "task" both map to the cpu device.
+    if "sync" in device or "task" in device:
+        return "cpu"
+    if "vulkan" in device:
+        return "vulkan"
+    raise ValueError(f"unrecognized device: {device}")
+
+
+def chat(curr_system_message, history, device, precision):
+    """Answer the latest user turn in ``history`` with DocuChat.
+
+    The model and tokenizer are loaded on the first call and cached in
+    module globals; later calls reuse them and ignore ``device`` and
+    ``precision``.
+
+    Args:
+        curr_system_message: unused; the prompt is built from the
+            module-level start message via ``create_prompt``.
+        history: list of ``[user_message, reply]`` pairs; the reply of the
+            last pair is overwritten with the model response.
+        device: device label selected in the UI.
+        precision: precision selected in the UI.
+
+    Returns:
+        ``history`` with the last reply filled in.
+
+    Raises:
+        ValueError: if ``device`` names no supported device on first load.
+    """
+    global h2ogpt_model, h2ogpt_tokenizer, model_state, langchain
+
+    args.run_docuchat_web = True
+
+    if h2ogpt_model == 0:
+        shark_device = _resolve_shark_device(device)
+        # The langchain backend only distinguishes cpu from cuda.
+        device = "cpu" if shark_device == "cpu" else "cuda"
+
+        args.device = shark_device
+        args.precision = precision
+
+        from apps.language_models.langchain.gen import Langchain
+
+        langchain = Langchain(device, precision)
+        h2ogpt_model, h2ogpt_tokenizer, _ = langchain.get_model(
+            # load model in 4bit if device is cuda to save memory
+            load_4bit=device == "cuda",
+            load_gptq="",
+            use_safetensors=False,
+            infer_devices=True,
+            device=device,
+            base_model="h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
+            inference_server="",
+            tokenizer_base_model="h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
+            lora_weights="",
+            gpu_id=0,
+            reward_type=None,
+            local_files_only=False,
+            resume_download=True,
+            use_auth_token=False,
+            trust_remote_code=True,
+            offload_folder=None,
+            compile_model=False,
+            verbose=False,
+        )
+        model_state = dict(
+            model=h2ogpt_model,
+            tokenizer=h2ogpt_tokenizer,
+            device=device,
+            base_model="h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
+            tokenizer_base_model="h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
+            lora_weights="",
+            inference_server="",
+            prompt_type=None,
+            prompt_dict=None,
+        )
+
+    prompt = create_prompt(history)
+    output = langchain.evaluate(
+        model_state=model_state,
+        my_db_state=None,
+        instruction=prompt,
+        iinput="",
+        context="",
+        stream_output=True,
+        prompt_type="prompt_answer",
+        prompt_dict={
+            "promptA": "",
+            "promptB": "",
+            "PreInstruct": "<|prompt|>",
+            "PreInput": None,
+            "PreResponse": "<|answer|>",
+            "terminate_response": [
+                "<|prompt|>",
+                "<|answer|>",
+                "<|endoftext|>",
+            ],
+            "chat_sep": "<|endoftext|>",
+            "chat_turn_sep": "<|endoftext|>",
+            "humanstr": "<|prompt|>",
+            "botstr": "<|answer|>",
+            "generates_leading_space": False,
+        },
+        temperature=0.1,
+        top_p=0.75,
+        top_k=40,
+        num_beams=1,
+        max_new_tokens=256,
+        min_new_tokens=0,
+        early_stopping=False,
+        max_time=180,
+        repetition_penalty=1.07,
+        num_return_sequences=1,
+        do_sample=False,
+        chat=True,
+        instruction_nochat=prompt,
+        iinput_nochat="",
+        langchain_mode="UserData",
+        langchain_action=LangChainAction.QUERY.value,
+        top_k_docs=3,
+        chunk=True,
+        chunk_size=512,
+        document_choice=[DocumentChoices.All_Relevant.name],
+        concurrency_count=1,
+        memory_restriction_level=2,
+        raise_generate_gpu_exceptions=False,
+        chat_context="",
+        use_openai_embedding=False,
+        use_openai_model=False,
+        hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+        db_type="chroma",
+        n_jobs=-1,
+        first_para=False,
+        max_max_time=60 * 2,
+        model_state0=model_state,
+        model_lock=True,
+        # NOTE(review): reads the component's .value attribute rather than
+        # an event input; confirm this reflects edits made in the UI.
+        user_path=userpath_selector.value,
+    )
+    history[-1][1] = output["response"]
+    return history
+
+
+# Textbox holding the directory of user documents; chat() passes its
+# .value to langchain.evaluate as user_path. The default is resolved
+# against the current working directory when this module is imported.
+userpath_selector = gr.Textbox(
+    label="Document Directory",
+    value=str(os.path.abspath("apps/language_models/langchain/user_path/")),
+    interactive=True,
+    container=True,
+)
+
+with gr.Blocks(title="DocuChat") as h2ogpt_web:
+    with gr.Row():
+        supported_devices = available_devices
+        enabled = len(supported_devices) > 0
+        # show cpu-task device first in list for chatbot
+        supported_devices = supported_devices[-1:] + supported_devices[:-1]
+        supported_devices = [x for x in supported_devices if "sync" not in x]
+        print(supported_devices)
+        device = gr.Dropdown(
+            label="Device",
+            value=supported_devices[0]
+            if enabled
+            else "Only CUDA Supported for now",
+            choices=supported_devices,
+            interactive=enabled,
+        )
+        precision = gr.Radio(
+            label="Precision",
+            value="fp16",
+            choices=[
+                "int4",
+                "int8",
+                "fp16",
+                "fp32",
+            ],
+            visible=True,
+        )
+    chatbot = gr.Chatbot(height=500)
+    with gr.Row():
+        with gr.Column():
+            msg = gr.Textbox(
+                label="Chat Message Box",
+                placeholder="Chat Message Box",
+                show_label=False,
+                interactive=enabled,
+                container=False,
+            )
+        with gr.Column():
+            with gr.Row():
+                submit = gr.Button("Submit", interactive=enabled)
+                stop = gr.Button("Stop", interactive=enabled)
+                clear = gr.Button("Clear", interactive=enabled)
+    system_msg = gr.Textbox(
+        start_message, label="System Message", interactive=False, visible=False
+    )
+
+    submit_event = msg.submit(
+        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
+    ).then(
+        fn=chat,
+        inputs=[system_msg, chatbot, device, precision],
+        outputs=[chatbot],
+        queue=True,
+    )
+    submit_click_event = submit.click(
+        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
+    ).then(
+        fn=chat,
+        inputs=[system_msg, chatbot, device, precision],
+        outputs=[chatbot],
+        queue=True,
+    )
+    stop.click(
+        fn=None,
+        inputs=None,
+        outputs=None,
+        cancels=[submit_event, submit_click_event],
+        queue=False,
+    )
+    clear.click(lambda: None, None, [chatbot], queue=False)
+
+
+with gr.Blocks(title="DocuChat Upload") as h2ogpt_upload:
+    import pathlib
+
+    upload_path = None
+    database = None
+    database_directory = os.path.abspath(
+        "apps/language_models/langchain/db_path/"
+    )
+
+    def read_path():
+        """List the files directly inside ``upload_path``.
+
+        Returns:
+            A sorted list of one-element lists, ``[[filename], ...]``, one
+            per regular file (sub-directories are skipped). The result is
+            used as the value of the single-column "Documents" DataFrame.
+        """
+        global upload_path
+        filenames = [
+            [f]
+            for f in os.listdir(upload_path)
+            if os.path.isfile(os.path.join(upload_path, f))
+        ]
+        filenames.sort()
+        return filenames
+
+    def upload_file(f):
+        """Copy uploaded files into ``upload_path`` and refresh the database.
+
+        Args:
+            f: Iterable of uploaded temp-file objects; each must expose the
+                path of its temporary copy as ``.name``.
+
+        Returns:
+            The refreshed document listing from ``read_path()``.
+        """
+        # Local import: only this function needs shutil.
+        import shutil
+
+        for tmpfile in f:
+            # os.path.basename understands the platform's path separator.
+            # Splitting on "/" leaves a backslash-separated path intact, and
+            # joining that onto upload_path can yield the temp file's own
+            # path, which would then be truncated before being read.
+            name = os.path.basename(tmpfile.name)
+            destination = os.path.join(upload_path, name)
+            shutil.copyfile(tmpfile.name, destination)
+        update_or_create_db()
+        return read_path()
+
+    def update_userpath(newpath):
+        """Switch the document directory to ``newpath``.
+
+        Stores ``newpath`` in the module-level ``upload_path`` and creates
+        the directory (including parents) if it does not exist yet.
+
+        Args:
+            newpath: Directory path to use for uploaded documents.
+
+        Returns:
+            The document listing of the new directory from ``read_path()``.
+        """
+        global upload_path
+        upload_path = newpath
+        pathlib.Path(upload_path).mkdir(parents=True, exist_ok=True)
+        return read_path()
+
+    def update_or_create_db():
+        """Rebuild the module-level ``database`` from ``upload_path``.
+
+        Loads the documents under ``upload_path`` with ``path_to_docs``,
+        makes sure ``database_directory`` exists, and stores the result of
+        ``create_or_update_db`` in the global ``database``.
+        """
+        global database
+        global upload_path
+
+        sources = path_to_docs(
+            upload_path,
+            verbose=True,
+            fail_any_exception=False,
+            n_jobs=-1,
+            chunk=True,
+            chunk_size=512,
+            url=None,
+            enable_captions=False,
+            captions_model=None,
+            caption_loader=None,
+            enable_ocr=False,
+        )
+
+        pathlib.Path(database_directory).mkdir(parents=True, exist_ok=True)
+
+        # NOTE(review): the arguments below are positional and their
+        # parameter names are not visible here -- confirm the meaning of
+        # the three booleans against create_or_update_db's signature.
+        database = create_or_update_db(
+            "chroma",
+            database_directory,
+            "UserData",
+            sources,
+            False,
+            True,
+            True,
+            "sentence-transformers/all-MiniLM-L6-v2",
+        )
+
+    def first_run():
+        """Build the document database once, if it has not been built yet.
+
+        Registered as the ``load`` callback of both DocuChat Blocks; does
+        nothing when the global ``database`` is already set.
+        """
+        global database
+        if database is None:
+            update_or_create_db()
+
+    update_userpath(
+        os.path.abspath("apps/language_models/langchain/user_path/")
+    )
+    h2ogpt_upload.load(fn=first_run)
+    h2ogpt_web.load(fn=first_run)
+
+    with gr.Column():
+        text = gr.DataFrame(
+            col_count=(1, "fixed"),
+            type="array",
+            label="Documents",
+            value=read_path(),
+        )
+        with gr.Row():
+            upload = gr.UploadButton(
+                label="Upload documents",
+                file_count="multiple",
+            )
+            upload.upload(fn=upload_file, inputs=upload, outputs=text)
+            userpath_selector.render()
+            userpath_selector.input(
+                fn=update_userpath, inputs=userpath_selector, outputs=text
+            ).then(fn=update_or_create_db)
--- a/apps/stable_diffusion/web/ui/img2img_ui.py
+++ b/apps/stable_diffusion/web/ui/img2img_ui.py
@@ -50,7 +50,7 @@ def img2img_inf(
    steps: int,
    strength: float,
    guidance_scale: float,
-    seed: int,
+    seed: str | int,
    batch_count: int,
    batch_size: int,
    scheduler: str,
@@ -230,10 +230,12 @@ def img2img_inf(
    start_time = time.time()
    global_obj.get_sd_obj().log = ""
    generated_imgs = []
-    seeds = []
-    seeds = utils.batch_seeds(seed, batch_count, repeatable_seeds)
    extra_info = {"STRENGTH": strength}
    text_output = ""
+    try:
+        seeds = utils.batch_seeds(seed, batch_count, repeatable_seeds)
+    except TypeError as error:
+        raise gr.Error(str(error)) from None

    for current_batch in range(batch_count):
        out_imgs = global_obj.get_sd_obj().generate_images(
@@ -617,8 +619,10 @@ with gr.Blocks(title="Image-to-Image") as img2img_web:
                            visible=False,
                        )
                with gr.Row():
-                    seed = gr.Number(
-                        value=args.seed, precision=0, label="Seed"
+                    seed = gr.Textbox(
+                        value=args.seed,
+                        label="Seed",
+                        info="An integer or a JSON list of integers, -1 for random",
                    )
                    device = gr.Dropdown(
                        elem_id="device",
--- a/apps/stable_diffusion/web/ui/inpaint_ui.py
+++ b/apps/stable_diffusion/web/ui/inpaint_ui.py
@@ -49,7 +49,7 @@ def inpaint_inf(
    inpaint_full_res_padding: int,
    steps: int,
    guidance_scale: float,
-    seed: int,
+    seed: str | int,
    batch_count: int,
    batch_size: int,
    scheduler: str,
@@ -181,10 +181,13 @@ def inpaint_inf(
    start_time = time.time()
    global_obj.get_sd_obj().log = ""
    generated_imgs = []
-    seeds = utils.batch_seeds(seed, batch_count, repeatable_seeds)
    image = image_dict["image"]
    mask_image = image_dict["mask"]
    text_output = ""
+    try:
+        seeds = utils.batch_seeds(seed, batch_count, repeatable_seeds)
+    except TypeError as error:
+        raise gr.Error(str(error)) from None

    for current_batch in range(batch_count):
        out_imgs = global_obj.get_sd_obj().generate_images(
@@ -514,8 +517,10 @@ with gr.Blocks(title="Inpainting") as inpaint_web:
                            visible=False,
                        )
                with gr.Row():
-                    seed = gr.Number(
-                        value=args.seed, precision=0, label="Seed"
+                    seed = gr.Textbox(
+                        value=args.seed,
+                        label="Seed",
+                        info="An integer or a JSON list of integers, -1 for random",
                    )
                    device = gr.Dropdown(
                        elem_id="device",
--- a/apps/stable_diffusion/web/ui/lora_train_ui.py
+++ b/apps/stable_diffusion/web/ui/lora_train_ui.py
@@ -3,7 +3,7 @@ import os
 import gradio as gr
 from PIL import Image
 from apps.stable_diffusion.scripts import lora_train
-from apps.stable_diffusion.src import prompt_examples, args
+from apps.stable_diffusion.src import prompt_examples, args, utils
 from apps.stable_diffusion.web.ui.utils import (
    available_devices,
    nodlogo_loc,
@@ -168,7 +168,9 @@ with gr.Blocks(title="Lora Training") as lora_train_web:
                        stop_batch = gr.Button("Stop Batch")
                with gr.Row():
                    seed = gr.Number(
-                        value=args.seed, precision=0, label="Seed"
+                        value=utils.parse_seed_input(args.seed)[0],
+                        precision=0,
+                        label="Seed",
                    )
                    device = gr.Dropdown(
                        elem_id="device",
--- a/apps/stable_diffusion/web/ui/minigpt4_ui.py
+++ b/apps/stable_diffusion/web/ui/minigpt4_ui.py
@@ -0,0 +1,193 @@
+# ========================================
+#             Gradio Setting
+# ========================================
+import gradio as gr
+
+# from apps.language_models.src.pipelines.minigpt4_pipeline import (
+#     # MiniGPT4,
+#     CONV_VISION,
+# )
+from pathlib import Path
+
+chat = None
+
+
+def gradio_reset(chat_state, img_list):
+    """Clear the conversation and put the UI back in its "upload" state.
+
+    Args:
+        chat_state: Current conversation state, or None; when present its
+            ``messages`` list is emptied in place.
+        img_list: Current image list, or None; when present it is replaced
+            by a new empty list.
+
+    Returns:
+        A 6-tuple matching the "Restart" handler's outputs
+        ``[chatbot, image, text_input, upload_button, chat_state, img_list]``.
+    """
+    if chat_state is not None:
+        chat_state.messages = []
+    if img_list is not None:
+        img_list = []
+    return (
+        None,
+        gr.update(value=None, interactive=True),
+        gr.update(
+            placeholder="Please upload your image first", interactive=False
+        ),
+        gr.update(value="Upload & Start Chat", interactive=True),
+        chat_state,
+        img_list,
+    )
+
+
+def upload_img(gr_img, text_input, chat_state, device, precision, _compile):
+    """Build the MiniGPT4 pipeline on first use and start a chat on an image.
+
+    Args:
+        gr_img: Image from the upload widget, or None if nothing was given.
+        text_input: Current value of the chat textbox (not used here).
+        chat_state: Existing conversation state; returned unchanged when
+            there is no image, otherwise replaced by a copy of CONV_VISION.
+        device: Device name; forwarded to MiniGPT4 and used in the .vmfb
+            file names.
+        precision: Requested precision; forwarded to MiniGPT4.
+        _compile: Forwarded to MiniGPT4 as ``_compile``.
+
+    Returns:
+        A 5-tuple matching the click handler's outputs
+        ``[image, text_input, upload_button, chat_state, img_list]``.
+    """
+    global chat
+    # Kept as a function-level import so that importing this UI module does
+    # not import the pipeline, but executed on every call: CONV_VISION is
+    # needed below even when the pipeline object already exists. Importing
+    # it only inside ``if chat is None`` raises NameError on later uploads.
+    from apps.language_models.src.pipelines.minigpt4_pipeline import (
+        MiniGPT4,
+        CONV_VISION,
+    )
+
+    if chat is None:
+        # int4/int8 requests use the fp16 vision model file name.
+        vision_model_precision = precision
+        if precision in ["int4", "int8"]:
+            vision_model_precision = "fp16"
+        vision_model_vmfb_path = Path(
+            f"vision_model_{vision_model_precision}_{device}.vmfb"
+        )
+        qformer_vmfb_path = Path(f"qformer_fp32_{device}.vmfb")
+        chat = MiniGPT4(
+            model_name="MiniGPT4",
+            hf_model_path=None,
+            max_new_tokens=30,
+            device=device,
+            precision=precision,
+            _compile=_compile,
+            vision_model_vmfb_path=vision_model_vmfb_path,
+            qformer_vmfb_path=qformer_vmfb_path,
+        )
+    if gr_img is None:
+        return None, None, gr.update(interactive=True), chat_state, None
+    chat_state = CONV_VISION.copy()
+    img_list = []
+    # The return value of upload_img was never used, so it is not bound.
+    chat.upload_img(gr_img, chat_state, img_list)
+    return (
+        gr.update(interactive=False),
+        gr.update(interactive=True, placeholder="Type and press Enter"),
+        gr.update(value="Start Chatting", interactive=False),
+        chat_state,
+        img_list,
+    )
+
+
+def gradio_ask(user_message, chatbot, chat_state):
+    """Add the user's message to the conversation and the chat history.
+
+    Args:
+        user_message: Text submitted from the chat textbox.
+        chatbot: Chat history as a list of ``[user, bot]`` pairs.
+        chat_state: Conversation state passed on to ``chat.ask``.
+
+    Returns:
+        A 3-tuple for ``[text_input, chatbot, chat_state]``. For an empty
+        message the textbox gets a warning placeholder and nothing else
+        changes; otherwise the textbox is cleared and a
+        ``[user_message, None]`` entry is appended to the history.
+    """
+    if len(user_message) == 0:
+        return (
+            gr.update(
+                interactive=True, placeholder="Input should not be empty!"
+            ),
+            chatbot,
+            chat_state,
+        )
+    chat.ask(user_message, chat_state)
+    chatbot = chatbot + [[user_message, None]]
+    return "", chatbot, chat_state
+
+
+def gradio_answer(chatbot, chat_state, img_list, num_beams, temperature):
+    """Generate the model's reply and write it into the last history entry.
+
+    Args:
+        chatbot: Chat history; the bot slot of its last entry is filled in.
+        chat_state: Conversation state passed to ``chat.answer`` as ``conv``.
+        img_list: Image list passed to ``chat.answer``.
+        num_beams: Value of the beam-search slider.
+        temperature: Value of the temperature slider.
+
+    Returns:
+        ``(chatbot, chat_state, img_list)`` with the updated history.
+    """
+    # Only the first element of chat.answer's result is used as the reply.
+    llm_message = chat.answer(
+        conv=chat_state,
+        img_list=img_list,
+        num_beams=num_beams,
+        temperature=temperature,
+        max_new_tokens=300,
+        max_length=2000,
+    )[0]
+    print(llm_message)
+    print("************")
+    chatbot[-1][1] = llm_message
+    return chatbot, chat_state, img_list
+
+
+title = """<h1 align="center">MultiModal SHARK (experimental)</h1>"""
+description = """<h3>Upload your images and start chatting!</h3>"""
+article = """<p><a href='https://minigpt-4.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a></p><p><a href='https://github.com/Vision-CAIR/MiniGPT-4'><img src='https://img.shields.io/badge/Github-Code-blue'></a></p><p><a href='https://raw.githubusercontent.com/Vision-CAIR/MiniGPT-4/main/MiniGPT_4.pdf'><img src='https://img.shields.io/badge/Paper-PDF-red'></a></p>
+"""
+
+# TODO show examples below
+
+with gr.Blocks() as minigpt4_web:
+    gr.Markdown(title)
+    gr.Markdown(description)
+
+    with gr.Row():
+        with gr.Column(scale=0.5):
+            image = gr.Image(type="pil")
+            upload_button = gr.Button(
+                value="Upload & Start Chat",
+                interactive=True,
+                variant="primary",
+            )
+            clear = gr.Button("Restart")
+
+            # Number of beams passed to the model's answer step.
+            num_beams = gr.Slider(
+                minimum=1,
+                maximum=10,
+                value=1,
+                step=1,
+                interactive=True,
+                label="Beam search numbers",
+            )
+
+            temperature = gr.Slider(
+                minimum=0.1,
+                maximum=2.0,
+                value=1.0,
+                step=0.1,
+                interactive=True,
+                label="Temperature",
+            )
+
+            device = gr.Dropdown(
+                label="Device",
+                value="cuda",
+                # if enabled
+                # else "Only CUDA Supported for now",
+                choices=["cuda"],
+                interactive=False,
+            )
+
+        with gr.Column():
+            chat_state = gr.State()
+            img_list = gr.State()
+            chatbot = gr.Chatbot(label="MiniGPT-4")
+            text_input = gr.Textbox(
+                label="User",
+                placeholder="Please upload your image first",
+                interactive=False,
+            )
+            precision = gr.Radio(
+                label="Precision",
+                value="int8",
+                choices=[
+                    "int8",
+                    "fp16",
+                    "fp32",
+                ],
+                visible=True,
+            )
+            _compile = gr.Checkbox(
+                value=False,
+                label="Compile",
+                interactive=True,
+            )
+
+    upload_button.click(
+        upload_img,
+        [image, text_input, chat_state, device, precision, _compile],
+        [image, text_input, upload_button, chat_state, img_list],
+    )
+
+    text_input.submit(
+        gradio_ask,
+        [text_input, chatbot, chat_state],
+        [text_input, chatbot, chat_state],
+    ).then(
+        gradio_answer,
+        [chatbot, chat_state, img_list, num_beams, temperature],
+        [chatbot, chat_state, img_list],
+    )
+    clear.click(
+        gradio_reset,
+        [chat_state, img_list],
+        [chatbot, image, text_input, upload_button, chat_state, img_list],
+        queue=False,
+    )
--- a/apps/stable_diffusion/web/ui/outpaint_ui.py
+++ b/apps/stable_diffusion/web/ui/outpaint_ui.py
@@ -49,7 +49,7 @@ def outpaint_inf(
    width: int,
    steps: int,
    guidance_scale: float,
-    seed: int,
+    seed: str,
    batch_count: int,
    batch_size: int,
    scheduler: str,
@@ -178,7 +178,10 @@ def outpaint_inf(
    start_time = time.time()
    global_obj.get_sd_obj().log = ""
    generated_imgs = []
-    seeds = utils.batch_seeds(seed, batch_count, repeatable_seeds)
+    try:
+        seeds = utils.batch_seeds(seed, batch_count, repeatable_seeds)
+    except TypeError as error:
+        raise gr.Error(str(error)) from None

    left = True if "left" in directions else False
    right = True if "right" in directions else False
@@ -542,8 +545,10 @@ with gr.Blocks(title="Outpainting") as outpaint_web:
                            visible=False,
                        )
                with gr.Row():
-                    seed = gr.Number(
-                        value=args.seed, precision=0, label="Seed"
+                    seed = gr.Textbox(
+                        value=args.seed,
+                        label="Seed",
+                        info="An integer or a JSON list of integers, -1 for random",
                    )
                    device = gr.Dropdown(
                        elem_id="device",
--- a/apps/stable_diffusion/web/ui/stablelm_ui.py
+++ b/apps/stable_diffusion/web/ui/stablelm_ui.py
@@ -7,6 +7,7 @@ from transformers import (
 )
 from apps.stable_diffusion.web.ui.utils import available_devices
 from datetime import datetime as dt
+import json


 def user(message, history):
@@ -21,14 +22,35 @@ vicuna_model = 0
 past_key_values = None

 model_map = {
+    "llama2_7b": "meta-llama/Llama-2-7b-chat-hf",
+    "llama2_70b": "meta-llama/Llama-2-70b-chat-hf",
    "codegen": "Salesforce/codegen25-7b-multi",
    "vicuna1p3": "lmsys/vicuna-7b-v1.3",
    "vicuna": "TheBloke/vicuna-7B-1.1-HF",
+    "vicuna4": "TheBloke/vicuna-7B-1.1-HF",
    "StableLM": "stabilityai/stablelm-tuned-alpha-3b",
 }

 # NOTE: Each `model_name` should have its own start message
 start_message = {
+    "llama2_7b": (
+        "System: You are a helpful, respectful and honest assistant. Always answer "
+        "as helpfully as possible, while being safe.  Your answers should not "
+        "include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
+        "content. Please ensure that your responses are socially unbiased and positive "
+        "in nature. If a question does not make any sense, or is not factually coherent, "
+        "explain why instead of answering something not correct. If you don't know the "
+        "answer to a question, please don't share false information."
+    ),
+    "llama2_70b": (
+        "System: You are a helpful, respectful and honest assistant. Always answer "
+        "as helpfully as possible, while being safe.  Your answers should not "
+        "include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal "
+        "content. Please ensure that your responses are socially unbiased and positive "
+        "in nature. If a question does not make any sense, or is not factually coherent, "
+        "explain why instead of answering something not correct. If you don't know the "
+        "answer to a question, please don't share false information."
+    ),
    "StableLM": (
        "<|SYSTEM|># StableLM Tuned (Alpha version)"
        "\n- StableLM is a helpful and harmless open-source AI language model "
@@ -45,6 +67,11 @@ start_message = {
        "The assistant gives helpful, detailed, and polite answers to the user's "
        "questions.\n"
    ),
+    "vicuna4": (
+        "A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's "
+        "questions.\n"
+    ),
    "vicuna1p3": (
        "A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's "
@@ -57,7 +84,14 @@ start_message = {
 def create_prompt(model_name, history):
    system_message = start_message[model_name]

-    if model_name in ["StableLM", "vicuna", "vicuna1p3"]:
+    if model_name in [
+        "StableLM",
+        "vicuna",
+        "vicuna4",
+        "vicuna1p3",
+        "llama2_7b",
+        "llama2_70b",
+    ]:
        conversation = "".join(
            [
                "".join(["<|USER|>" + item[0], "<|ASSISTANT|>" + item[1]])
@@ -74,21 +108,46 @@ def create_prompt(model_name, history):
    return msg


-# TODO: Make chat reusable for UI and API
-def chat(curr_system_message, history, model, device, precision):
-    global sharded_model
-    global past_key_values
+def set_vicuna_model(model):
    global vicuna_model
+    vicuna_model = model

+
+# TODO: Make chat reusable for UI and API
+def chat(
+    curr_system_message,
+    history,
+    model,
+    devices,
+    precision,
+    config_file,
+    cli=True,
+):
+    global past_key_values
+
+    global vicuna_model
    model_name, model_path = list(map(str.strip, model.split("=>")))
-    print(f"In chat for {model_name}")

-    if model_name in ["vicuna", "vicuna1p3", "codegen"]:
-        from apps.language_models.scripts.vicuna import (
-            UnshardedVicuna,
-        )
+    if model_name in [
+        "vicuna",
+        "vicuna4",
+        "vicuna1p3",
+        "codegen",
+        "llama2_7b",
+        "llama2_70b",
+    ]:
+        if model_name == "vicuna4":
+            from apps.language_models.scripts.vicuna import (
+                ShardedVicuna as Vicuna,
+            )
+        else:
+            from apps.language_models.scripts.vicuna import (
+                UnshardedVicuna as Vicuna,
+            )
+        from apps.stable_diffusion.src import args

        if vicuna_model == 0:
+            device = devices[0]
            if "cuda" in device:
                device = "cuda"
            elif "sync" in device:
@@ -101,17 +160,42 @@ def chat(curr_system_message, history, model, device, precision):
                print("unrecognized device")

            max_toks = 128 if model_name == "codegen" else 512
-            vicuna_model = UnshardedVicuna(
-                model_name,
-                hf_model_path=model_path,
-                device=device,
-                precision=precision,
-                max_num_tokens=max_toks,
-            )
-        prompt = create_prompt(model_name, history)
-        print("prompt = ", prompt)
+            if model_name == "vicuna4":
+                vicuna_model = Vicuna(
+                    model_name,
+                    hf_model_path=model_path,
+                    device=device,
+                    precision=precision,
+                    max_num_tokens=max_toks,
+                    compressed=True,
+                )
+            else:
+                if len(devices) == 1 and config_file is None:
+                    vicuna_model = Vicuna(
+                        model_name,
+                        hf_model_path=model_path,
+                        hf_auth_token=args.hf_auth_token,
+                        device=device,
+                        precision=precision,
+                        max_num_tokens=max_toks,
+                    )
+                else:
+                    if config_file is not None:
+                        config_file = open(config_file)
+                        config_json = json.load(config_file)
+                        config_file.close()
+                    else:
+                        config_json = None
+                    vicuna_model = Vicuna(
+                        model_name,
+                        device=device,
+                        precision=precision,
+                        config_json=config_json,
+                    )

-        for partial_text in vicuna_model.generate(prompt):
+        prompt = create_prompt(model_name, history)
+
+        for partial_text in vicuna_model.generate(prompt, cli=cli):
            history[-1][1] = partial_text
            yield history

@@ -140,7 +224,7 @@ def chat(curr_system_message, history, model, device, precision):

    partial_text = ""
    for new_text in words_list:
-        # print(new_text)
+        print(new_text)
        partial_text += new_text
        history[-1][1] = partial_text
        # Yield an empty string to clean up the message textbox and the updated
@@ -248,6 +332,13 @@ def llm_chat_api(InputData: dict):
    }


+def view_json_file(file_obj):
+    """Return the text of an uploaded file for display in the JSON viewer.
+
+    Args:
+        file_obj: Uploaded file object; its ``.name`` is opened as a path.
+
+    Returns:
+        The file's full content as a string.
+    """
+    content = ""
+    with open(file_obj.name, "r") as fopen:
+        content = fopen.read()
+    return content
+
+
 with gr.Blocks(title="Chatbot") as stablelm_chat:
    with gr.Row():
        model_choices = list(
@@ -264,13 +355,14 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
        supported_devices = supported_devices[-1:] + supported_devices[:-1]
        supported_devices = [x for x in supported_devices if "sync" not in x]
        print(supported_devices)
-        device = gr.Dropdown(
+        devices = gr.Dropdown(
            label="Device",
            value=supported_devices[0]
            if enabled
            else "Only CUDA Supported for now",
            choices=supported_devices,
            interactive=enabled,
+            multiselect=True,
        )
        precision = gr.Radio(
            label="Precision",
@@ -283,6 +375,14 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
            ],
            visible=True,
        )
+    with gr.Row():
+        with gr.Group():
+            config_file = gr.File(label="Upload sharding configuration")
+            json_view_button = gr.Button("View as JSON")
+        json_view = gr.JSON()
+        json_view_button.click(
+            fn=view_json_file, inputs=[config_file], outputs=[json_view]
+        )
    chatbot = gr.Chatbot(height=500)
    with gr.Row():
        with gr.Column():
@@ -306,7 +406,7 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
    ).then(
        fn=chat,
-        inputs=[system_msg, chatbot, model, device, precision],
+        inputs=[system_msg, chatbot, model, devices, precision, config_file],
        outputs=[chatbot],
        queue=True,
    )
@@ -314,7 +414,7 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False
    ).then(
        fn=chat,
-        inputs=[system_msg, chatbot, model, device, precision],
+        inputs=[system_msg, chatbot, model, devices, precision, config_file],
        outputs=[chatbot],
        queue=True,
    )
--- a/apps/stable_diffusion/web/ui/txt2img_ui.py
+++ b/apps/stable_diffusion/web/ui/txt2img_ui.py
@@ -46,7 +46,7 @@ def txt2img_inf(
    width: int,
    steps: int,
    guidance_scale: float,
-    seed: int,
+    seed: str | int,
    batch_count: int,
    batch_size: int,
    scheduler: str,
@@ -178,8 +178,11 @@ def txt2img_inf(
    start_time = time.time()
    global_obj.get_sd_obj().log = ""
    generated_imgs = []
-    seeds = utils.batch_seeds(seed, batch_count, repeatable_seeds)
    text_output = ""
+    try:
+        seeds = utils.batch_seeds(seed, batch_count, repeatable_seeds)
+    except TypeError as error:
+        raise gr.Error(str(error)) from None

    for current_batch in range(batch_count):
        out_imgs = global_obj.get_sd_obj().generate_images(
@@ -395,7 +398,7 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                            value=args.scheduler,
                            choices=scheduler_list,
                        )
-                        with gr.Group():
+                        with gr.Column():
                            save_metadata_to_png = gr.Checkbox(
                                label="Save prompt information to PNG",
                                value=args.write_metadata_to_png,
@@ -481,8 +484,10 @@ with gr.Blocks(title="Text-to-Image") as txt2img_web:
                            label="Repeatable Seeds",
                        )
                with gr.Row():
-                    seed = gr.Number(
-                        value=args.seed, precision=0, label="Seed"
+                    seed = gr.Textbox(
+                        value=args.seed,
+                        label="Seed",
+                        info="An integer or a JSON list of integers, -1 for random",
                    )
                    device = gr.Dropdown(
                        elem_id="device",
--- a/apps/stable_diffusion/web/ui/upscaler_ui.py
+++ b/apps/stable_diffusion/web/ui/upscaler_ui.py
@@ -42,7 +42,7 @@ def upscaler_inf(
    steps: int,
    noise_level: int,
    guidance_scale: float,
-    seed: int,
+    seed: str,
    batch_count: int,
    batch_size: int,
    scheduler: str,
@@ -177,8 +177,11 @@ def upscaler_inf(
    start_time = time.time()
    global_obj.get_sd_obj().log = ""
    generated_imgs = []
-    seeds = utils.batch_seeds(seed, batch_count, repeatable_seeds)
    extra_info = {"NOISE LEVEL": noise_level}
+    try:
+        seeds = utils.batch_seeds(seed, batch_count, repeatable_seeds)
+    except TypeError as error:
+        raise gr.Error(str(error)) from None

    for current_batch in range(batch_count):
        low_res_img = image
@@ -534,8 +537,10 @@ with gr.Blocks(title="Upscaler") as upscaler_web:
                            visible=False,
                        )
                with gr.Row():
-                    seed = gr.Number(
-                        value=args.seed, precision=0, label="Seed"
+                    seed = gr.Textbox(
+                        value=args.seed,
+                        label="Seed",
+                        info="An integer or a JSON list of integers, -1 for random",
                    )
                    device = gr.Dropdown(
                        elem_id="device",
--- a/build_tools/vicuna_testing.py
+++ b/build_tools/vicuna_testing.py
@@ -0,0 +1,14 @@
+import os
+from sys import executable
+import subprocess
+from apps.language_models.scripts import vicuna
+
+
+def test_loop():
+    """Compile UnshardedVicuna for each precision/device combination.
+
+    Builds one model per (precision, device) pair, calls ``compile()`` on
+    it, and deletes it before moving on to the next pair.
+    """
+    # NOTE(review): nothing in this file calls test_loop(), so running the
+    # file as a script only executes the imports -- confirm this is intended.
+    precisions = ["fp16", "int8", "int4"]
+    devices = ["cpu"]
+    for precision in precisions:
+        for device in devices:
+            model = vicuna.UnshardedVicuna(device=device, precision=precision)
+            model.compile()
+            del model
--- a/process_skipfiles.py
+++ b/process_skipfiles.py
@@ -56,3 +56,14 @@ for line in fileinput.input(path_to_lazy_loader, inplace=True):
        )
    else:
        print(line, end="")
+
+# For getting around timm's packaging.
+# Refer: https://github.com/pyinstaller/pyinstaller/issues/5673#issuecomment-808731505
+path_to_timm_activations = Path(
+    get_python_lib() + "/timm/layers/activations_jit.py"
+)
+for line in fileinput.input(path_to_timm_activations, inplace=True):
+    if "@torch.jit.script" in line:
+        print("@torch.jit._script_if_tracing", end="\n")
+    else:
+        print(line, end="")
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,3 +15,4 @@ build-backend = "setuptools.build_meta"
 line-length = 79
 include = '\.pyi?$'
 exclude = "apps/language_models/scripts/vicuna.py"
+extend-exclude = "apps/language_models/src/pipelines/minigpt4_pipeline.py"
--- a/requirements.txt
+++ b/requirements.txt
@@ -33,10 +33,13 @@ pywebview
 sentencepiece
 py-cpuinfo
 tiktoken # for codegen
+joblib # for langchain
+timm # for MiniGPT4
+langchain

 # Keep PyInstaller at the end. Sometimes Windows Defender flags it but most folks can continue even if it errors
 pefile
 pyinstaller

-# low precision vicuna
-brevitas @ git+https://github.com/Xilinx/brevitas.git@llm
+# vicuna quantization
+brevitas @ git+https://github.com/Xilinx/brevitas.git@dev
--- a/setup.py
+++ b/setup.py
@@ -39,7 +39,7 @@ setup(
    install_requires=[
        "numpy",
        "PyYAML",
-        "torch-mlir==20230620.875",
+        "torch-mlir",
    ]
    + backend_deps,
 )
--- a/setup_venv.ps1
+++ b/setup_venv.ps1
@@ -89,7 +89,7 @@ else {python -m venv .\shark.venv\}
 python -m pip install --upgrade pip
 pip install wheel
 pip install -r requirements.txt
-pip install --pre torch-mlir==20230620.875 torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
+pip install --pre torch-mlir torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu -f https://llvm.github.io/torch-mlir/package-index/
 pip install --upgrade -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html iree-compiler iree-runtime
 Write-Host "Building SHARK..."
 pip install -e . -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SHARK-Runtime/pip-release-links.html
--- a/setup_venv.sh
+++ b/setup_venv.sh
@@ -88,7 +88,7 @@ if [ "$torch_mlir_bin" = true ]; then
    echo "MacOS detected. Installing torch-mlir from .whl, to avoid dependency problems with torch."
    $PYTHON -m pip install --pre --no-cache-dir torch-mlir -f https://llvm.github.io/torch-mlir/package-index/ -f https://download.pytorch.org/whl/nightly/torch/
  else
-    $PYTHON -m pip install --pre torch-mlir==20230620.875 -f https://llvm.github.io/torch-mlir/package-index/
+    $PYTHON -m pip install --pre torch-mlir -f https://llvm.github.io/torch-mlir/package-index/
    if [ $? -eq 0 ];then
      echo "Successfully Installed torch-mlir"
    else
@@ -159,5 +159,3 @@ if [[ -z "${CONDA_PREFIX}" && "$SKIP_VENV" != "1" ]]; then
  echo "${Green}Before running examples activate venv with:"
  echo "  ${Green}source $VENV_DIR/bin/activate"
 fi
-
-$PYTHON -m pip install git+https://github.com/Xilinx/brevitas.git@llm
--- a/shark/backward_makefx.py
+++ b/shark/backward_makefx.py
@@ -15,7 +15,7 @@
 import torch
 from torch._decomp import get_decompositions
 from torch.fx.experimental.proxy_tensor import make_fx
-from torch.nn.utils import _stateless
+from torch.nn.utils import stateless

 from torch import fx
 import tempfile
--- a/shark/examples/shark_inference/upscaler/upscaler_args.py
+++ b/shark/examples/shark_inference/upscaler/upscaler_args.py
@@ -94,18 +94,5 @@ p.add_argument(
    help="Profiles vulkan device and collects the .rdc info",
 )

-p.add_argument(
-    "--vulkan_large_heap_block_size",
-    default="4147483648",
-    help="flag for setting VMA preferredLargeHeapBlockSize for vulkan device, default is 4G",
-)
-
-p.add_argument(
-    "--vulkan_validation_layers",
-    default=False,
-    action=argparse.BooleanOptionalAction,
-    help="flag for disabling vulkan validation layers when benchmarking",
-)
-

 args = p.parse_args()
--- a/shark/examples/shark_inference/upscaler/utils.py
+++ b/shark/examples/shark_inference/upscaler/utils.py
@@ -6,6 +6,7 @@ from shark.shark_importer import import_with_fx
 from shark.iree_utils.vulkan_utils import (
    set_iree_vulkan_runtime_flags,
    get_vulkan_target_triple,
+    get_iree_vulkan_runtime_flags,
 )


@@ -75,10 +76,7 @@ def compile_through_fx(


 def set_iree_runtime_flags():
-    vulkan_runtime_flags = [
-        f"--vulkan_large_heap_block_size={args.vulkan_large_heap_block_size}",
-        f"--vulkan_validation_layers={'true' if args.vulkan_validation_layers else 'false'}",
-    ]
+    vulkan_runtime_flags = get_iree_vulkan_runtime_flags()
    if args.enable_rgp:
        vulkan_runtime_flags += [
            f"--enable_rgp=true",
--- a/shark/examples/shark_training/bert_training.py
+++ b/shark/examples/shark_training/bert_training.py
@@ -1,5 +1,5 @@
 import torch
-from torch.nn.utils import _stateless
+from torch.nn.utils import stateless
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from shark.shark_trainer import SharkTrainer

@@ -33,7 +33,7 @@ inp = (torch.randint(2, (1, 128)),)

 def forward(params, buffers, args):
    params_and_buffers = {**params, **buffers}
-    _stateless.functional_call(
+    stateless.functional_call(
        mod, params_and_buffers, args, {}
    ).sum().backward()
    optim = torch.optim.SGD(get_sorted_params(params), lr=0.01)
@@ -44,5 +44,5 @@ def forward(params, buffers, args):

 shark_module = SharkTrainer(mod, inp)
 shark_module.compile(forward)
-
-print(shark_module.train())
+shark_module.train(num_iters=2)
+print("training done")
--- a/shark/iree_utils/compile_utils.py
+++ b/shark/iree_utils/compile_utils.py
@@ -21,6 +21,7 @@ import numpy as np
 import os
 import re
 import tempfile
+import time
 from pathlib import Path


@@ -377,9 +378,14 @@ def load_vmfb_using_mmap(
        and ".vmfb" in flatbuffer_blob_or_path
    ):
        vmfb_file_path = flatbuffer_blob_or_path
+        print(
+            f"Loading module {flatbuffer_blob_or_path}... ", end="", flush=True
+        )
        mmaped_vmfb = ireert.VmModule.mmap(instance, flatbuffer_blob_or_path)
+        print(f"mmap complete... ", end="", flush=True)
        ctx = ireert.SystemContext(config=config)
        ctx.add_vm_module(mmaped_vmfb)
+        print(f"module initialized. Ready to run!")
        mmaped_vmfb = getattr(ctx.modules, mmaped_vmfb.name)
    else:
        with tempfile.NamedTemporaryFile(delete=False) as tf:
@@ -410,7 +416,6 @@ def get_iree_compiled_module(
    #       we're setting delete=False when creating NamedTemporaryFile. That's why
    #       I'm getting hold of the name of the temporary file in `temp_file_to_unlink`.
    if mmap:
-        print(f"Will load the compiled module as a mmapped temporary file")
        vmfb, config, temp_file_to_unlink = load_vmfb_using_mmap(
            flatbuffer_blob, device, device_idx
        )
@@ -434,7 +439,6 @@ def load_flatbuffer(
 ):
    temp_file_to_unlink = None
    if mmap:
-        print(f"Loading flatbuffer at {flatbuffer_path} as a mmapped file")
        vmfb, config, temp_file_to_unlink = load_vmfb_using_mmap(
            flatbuffer_path, device, device_idx
        )
--- a/shark/iree_utils/vulkan_utils.py
+++ b/shark/iree_utils/vulkan_utils.py
@@ -19,6 +19,7 @@ from shark.iree_utils._common import run_cmd
 import iree.runtime as ireert
 from sys import platform
 from shark.iree_utils.vulkan_target_env_utils import get_vulkan_target_env_flag
+from shark.parser import shark_args


 def get_vulkan_device_name(device_num=0):
@@ -171,6 +172,15 @@ def get_iree_vulkan_args(device_num=0, extra_args=[]):
    return res_vulkan_flag


+def get_iree_vulkan_runtime_flags():
+    vulkan_runtime_flags = [
+        f"--vulkan_large_heap_block_size={shark_args.vulkan_large_heap_block_size}",
+        f"--vulkan_validation_layers={'true' if shark_args.vulkan_validation_layers else 'false'}",
+        f"--vulkan_vma_allocator={'true' if shark_args.vulkan_vma_allocator else 'false'}",
+    ]
+    return vulkan_runtime_flags
+
+
 def set_iree_vulkan_runtime_flags(flags):
    for flag in flags:
        ireert.flags.parse_flags(flag)
--- a/shark/parser.py
+++ b/shark/parser.py
@@ -126,4 +126,32 @@ parser.add_argument(
    help="passthrough flag for the iree flag of the same name. If None, defaults to cpu-count",
 )

+parser.add_argument(
+    "--vulkan_debug_utils",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Profiles vulkan device and collects the .rdc info.",
+)
+
+parser.add_argument(
+    "--vulkan_large_heap_block_size",
+    default="2073741824",
+    help="Flag for setting VMA preferredLargeHeapBlockSize for "
+    "vulkan device, default is ~2G.",
+)
+
+parser.add_argument(
+    "--vulkan_validation_layers",
+    default=False,
+    action=argparse.BooleanOptionalAction,
+    help="Flag for disabling vulkan validation layers when benchmarking.",
+)
+
+parser.add_argument(
+    "--vulkan_vma_allocator",
+    default=True,
+    action=argparse.BooleanOptionalAction,
+    help="Flag for enabling / disabling Vulkan VMA Allocator.",
+)
+
 shark_args, unknown = parser.parse_known_args()
--- a/shark/shark_compile.py
+++ b/shark/shark_compile.py
@@ -2,6 +2,55 @@ import os
 import tempfile
 from shark.shark_inference import SharkInference
 from shark.shark_importer import import_with_fx
+import torch
+import torch_mlir
+from torch_mlir.compiler_utils import run_pipeline_with_repro_report
+from typing import List, Tuple
+from io import BytesIO
+from brevitas_examples.llm.llm_quant.quantize import quantize_model
+from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
+
+
+def brevitas〇matmul_rhs_group_quant〡shape(
+    lhs: List[int],
+    rhs: List[int],
+    rhs_scale: List[int],
+    rhs_zero_point: List[int],
+    rhs_bit_width: int,
+    rhs_group_size: int,
+) -> List[int]:
+    if len(lhs) == 3 and len(rhs) == 2:
+        return [lhs[0], lhs[1], rhs[0]]
+    elif len(lhs) == 2 and len(rhs) == 2:
+        return [lhs[0], rhs[0]]
+    else:
+        raise ValueError("Input shapes not supported.")
+
+
+def brevitas〇matmul_rhs_group_quant〡dtype(
+    lhs_rank_dtype: Tuple[int, int],
+    rhs_rank_dtype: Tuple[int, int],
+    rhs_scale_rank_dtype: Tuple[int, int],
+    rhs_zero_point_rank_dtype: Tuple[int, int],
+    rhs_bit_width: int,
+    rhs_group_size: int,
+) -> int:
+    # output dtype is the dtype of the lhs float input
+    lhs_rank, lhs_dtype = lhs_rank_dtype
+    return lhs_dtype
+
+
+def brevitas〇matmul_rhs_group_quant〡has_value_semantics(
+    lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size
+) -> None:
+    return
+
+
+brevitas_matmul_rhs_group_quant_library = [
+    brevitas〇matmul_rhs_group_quant〡shape,
+    brevitas〇matmul_rhs_group_quant〡dtype,
+    brevitas〇matmul_rhs_group_quant〡has_value_semantics,
+]


 def load_vmfb(extended_model_name, device, mlir_dialect, extra_args=[]):
@@ -39,11 +88,90 @@ def compile_module(
    return shark_module


+def compile_int_precision(
+    model, inputs, precision, device, generate_vmfb, extended_model_name
+):
+    weight_bit_width = 4 if precision == "int4" else 8
+    weight_group_size = 128
+    quantize_model(
+        get_model_impl(model),
+        dtype=torch.float32,
+        weight_quant_type="asym",
+        weight_bit_width=weight_bit_width,
+        weight_param_method="stats",
+        weight_scale_precision="float",
+        weight_quant_granularity="per_group",
+        weight_group_size=weight_group_size,
+        quantize_weight_zero_point=False,
+        input_bit_width=None,
+        input_scale_type="float",
+        input_param_method="stats",
+        input_quant_type="asym",
+        input_quant_granularity="per_tensor",
+        quantize_input_zero_point=False,
+        seqlen=2048,
+    )
+    print("Weight quantization applied.")
+    torchscript_module = import_with_fx(
+        model,
+        inputs,
+        precision=precision,
+        mlir_type="torchscript",
+    )
+    mlir_module = torch_mlir.compile(
+        torchscript_module,
+        inputs,
+        output_type="torch",
+        backend_legal_ops=["brevitas.matmul_rhs_group_quant"],
+        extra_library=brevitas_matmul_rhs_group_quant_library,
+        use_tracing=False,
+        verbose=False,
+    )
+    print(f"[DEBUG] converting torch to linalg")
+    run_pipeline_with_repro_report(
+        mlir_module,
+        "builtin.module(func.func(torch-unpack-torch-tensor),torch-backend-to-linalg-on-tensors-backend-pipeline)",
+        description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
+    )
+    from contextlib import redirect_stdout
+
+    mlir_file_path = os.path.join(
+        os.getcwd(), f"{extended_model_name}_linalg.mlir"
+    )
+    with open(mlir_file_path, "w") as f:
+        with redirect_stdout(f):
+            print(mlir_module.operation.get_asm())
+    mlir_module = str(mlir_module)
+    mlir_module = mlir_module.encode("UTF-8")
+    mlir_module = BytesIO(mlir_module)
+    bytecode = mlir_module.read()
+    print(f"Elided IR written for {extended_model_name}")
+    return bytecode
+    shark_module = SharkInference(
+        mlir_module=bytecode, device=device, mlir_dialect="tm_tensor"
+    )
+    extra_args = [
+        "--iree-hal-dump-executable-sources-to=ies",
+        "--iree-vm-target-truncate-unsupported-floats",
+        "--iree-codegen-check-ir-before-llvm-conversion=false",
+        "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
+    ]
+    return (
+        compile_module(
+            shark_module,
+            extended_model_name=extended_model_name,
+            generate_vmfb=generate_vmfb,
+            extra_args=extra_args,
+        ),
+        bytecode,
+    )
+
+
 def shark_compile_through_fx(
    model,
    inputs,
    extended_model_name,
-    is_f16=False,
+    precision,
    f16_input_mask=None,
    save_dir=tempfile.gettempdir(),
    debug=False,
@@ -52,6 +180,7 @@ def shark_compile_through_fx(
    device=None,
    mlir_dialect="tm_tensor",
 ):
+    is_f16 = precision == "fp16"
    if generate_or_load_vmfb:
        shark_module = load_vmfb(
            extended_model_name=extended_model_name,
@@ -70,18 +199,34 @@ def shark_compile_through_fx(
    if "cuda" in device:
        shark_args.enable_tf32 = True

-    (
-        mlir_module,
-        _,
-    ) = import_with_fx(
-        model=model,
-        inputs=inputs,
-        is_f16=is_f16,
-        f16_input_mask=f16_input_mask,
-        debug=debug,
-        model_name=extended_model_name,
-        save_dir=save_dir,
-    )
+    if precision in ["int4", "int8"]:
+        mlir_module = compile_int_precision(
+            model,
+            inputs,
+            precision,
+            device,
+            generate_or_load_vmfb,
+            extended_model_name,
+        )
+        extra_args = [
+            "--iree-hal-dump-executable-sources-to=ies",
+            "--iree-vm-target-truncate-unsupported-floats",
+            "--iree-codegen-check-ir-before-llvm-conversion=false",
+            "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
+        ]
+    else:
+        (
+            mlir_module,
+            _,
+        ) = import_with_fx(
+            model=model,
+            inputs=inputs,
+            is_f16=is_f16,
+            f16_input_mask=f16_input_mask,
+            debug=debug,
+            model_name=extended_model_name,
+            save_dir=save_dir,
+        )

    shark_module = SharkInference(
        mlir_module,
--- a/shark/shark_downloader.py
+++ b/shark/shark_downloader.py
@@ -111,22 +111,20 @@ os.makedirs(WORKDIR, exist_ok=True)
 def check_dir_exists(model_name, frontend="torch", dynamic=""):
    model_dir = os.path.join(WORKDIR, model_name)

-    # Remove the _tf keyword from end.
-    if frontend in ["tf", "tensorflow"]:
-        model_name = model_name[:-3]
-    elif frontend in ["tflite"]:
-        model_name = model_name[:-7]
-    elif frontend in ["torch", "pytorch"]:
-        model_name = model_name[:-6]
+    # Remove the _tf keyword from end only for non-SD models.
+    if not any(model in model_name for model in ["clip", "unet", "vae"]):
+        if frontend in ["tf", "tensorflow"]:
+            model_name = model_name[:-3]
+        elif frontend in ["tflite"]:
+            model_name = model_name[:-7]
+        elif frontend in ["torch", "pytorch"]:
+            model_name = model_name[:-6]
+
+    model_mlir_file_name = f"{model_name}{dynamic}_{frontend}.mlir"

    if os.path.isdir(model_dir):
        if (
-            os.path.isfile(
-                os.path.join(
-                    model_dir,
-                    model_name + dynamic + "_" + str(frontend) + ".mlir",
-                )
-            )
+            os.path.isfile(os.path.join(model_dir, model_mlir_file_name))
            and os.path.isfile(os.path.join(model_dir, "function_name.npy"))
            and os.path.isfile(os.path.join(model_dir, "inputs.npz"))
            and os.path.isfile(os.path.join(model_dir, "golden_out.npz"))
--- a/shark/shark_generate_model_config.py
+++ b/shark/shark_generate_model_config.py
@@ -1,5 +1,7 @@
 import re
 import json
+import numpy as np
+
 import torch_mlir
 from iree.compiler import compile_str
 from shark.shark_importer import import_with_fx, get_f16_inputs
@@ -11,6 +13,7 @@ class GenerateConfigFile:
        model,
        num_sharding_stages: int,
        sharding_stages_id: list[str],
+        units_in_each_stage: list[int],
        model_input=None,
        config_file_path="model_config.json",
    ):
@@ -22,13 +25,16 @@ class GenerateConfigFile:
        ), "Number of sharding stages should be equal to the list of their ID"
        self.model_input = model_input
        self.config_file_path = config_file_path
+        # (Nithin) this is a quick fix - revisit and rewrite
+        self.units_in_each_stage = np.array(units_in_each_stage)
+        self.track_loop = np.zeros(len(self.sharding_stages_id)).astype(int)

    def split_into_dispatches(
        self,
        backend,
-        fx_tracing_required=True,
+        fx_tracing_required=False,
        f16_model=False,
-        torch_mlir_tracing=False,
+        torch_mlir_tracing=True,
    ):
        graph_for_compilation = self.model
        if fx_tracing_required:
@@ -95,7 +101,17 @@ class GenerateConfigFile:
            if substring_before_final_period in model_dictionary:
                del model_dictionary[substring_before_final_period]

-            layer_dict = {n: "None" for n in self.sharding_stages_id}
+            # layer_dict = {n: "None" for n in self.sharding_stages_id}
+
+            # By default, embed increasing device IDs for each layer
+            increasing_wraparound_idx_list = (
+                self.track_loop % self.units_in_each_stage
+            )
+            layer_dict = {
+                n: int(increasing_wraparound_idx_list[idx])
+                for idx, n in enumerate(self.sharding_stages_id)
+            }
+            self.track_loop += 1
            model_dictionary[name] = layer_dict

        self.generate_json(model_dictionary)
@@ -103,3 +119,29 @@ class GenerateConfigFile:
    def generate_json(self, artifacts):
        with open(self.config_file_path, "w") as outfile:
            json.dump(artifacts, outfile)
+
+
+if __name__ == "__main__":
+    import torch
+    from transformers import AutoTokenizer
+
+    hf_model_path = "TheBloke/vicuna-7B-1.1-HF"
+    tokenizer = AutoTokenizer.from_pretrained(hf_model_path, use_fast=False)
+    compilation_prompt = "".join(["0" for _ in range(17)])
+    compilation_input_ids = tokenizer(
+        compilation_prompt,
+        return_tensors="pt",
+    ).input_ids
+    compilation_input_ids = torch.tensor(compilation_input_ids).reshape(
+        [1, 19]
+    )
+    firstVicunaCompileInput = (compilation_input_ids,)
+    from apps.language_models.src.model_wrappers.vicuna_model import (
+        FirstVicuna,
+        SecondVicuna,
+        CombinedModel,
+    )
+
+    model = CombinedModel()
+    c = GenerateConfigFile(model, 1, ["gpu_id"], firstVicunaCompileInput)
+    c.split_into_dispatches("vulkan")
--- a/shark/shark_importer.py
+++ b/shark/shark_importer.py
@@ -488,7 +488,7 @@ def flatten_training_input(inputs):
    return tuple(flattened_input)


-# TODO: get rid of is_f16 by using precision
+# TODO: Remove is_f16 and update all callers to use precision instead
 # Applies fx conversion to the model and imports the mlir.
 def import_with_fx(
    model,
--- a/tank/all_models.csv
+++ b/tank/all_models.csv
@@ -44,4 +44,3 @@ t5-base,linalg,torch,1e-2,1e-3,default,None,True,True,True,"Inputs for seq2seq m
 t5-base,stablehlo,tf,1e-2,1e-3,default,None,False,False,False,"","macos"
 t5-large,linalg,torch,1e-2,1e-3,default,None,True,True,True,"Inputs for seq2seq models in torch currently unsupported","macos"
 t5-large,stablehlo,tf,1e-2,1e-3,default,None,False,False,False,"","macos"
-stabilityai/stable-diffusion-2-1-base,linalg,torch,1e-3,1e-3,default,None,True,False,False,"","macos"
--- a/tank/examples/opt/opt_perf_comparison.py
+++ b/tank/examples/opt/opt_perf_comparison.py
@@ -11,7 +11,7 @@ from shark_opt_wrapper import OPTForCausalLMModel
 MODEL_NAME = "facebook/opt-1.3b"
 OPT_MODELNAME = "opt-1.3b"
 OPT_FS_NAME = "opt_1-3b"
-MAX_SEQUENCE_LENGTH = 8
+MAX_SEQUENCE_LENGTH = 512
 DEVICE = "cpu"

 PROMPTS = [
@@ -72,16 +72,14 @@ def create_vmfb_module(model_name, tokenizer, device):
        is_benchmark=False,
    )

-    vmfb_name = f"{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch_{DEVICE}"
+    vmfb_name = f"{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch_{DEVICE}_tiled_ukernels"
    shark_module.save_module(module_name=vmfb_name)
    vmfb_path = vmfb_name + ".vmfb"
    return vmfb_path


 def load_shark_model() -> ModelWrapper:
-    vmfb_name = (
-        f"{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch_{DEVICE}.vmfb"
-    )
+    vmfb_name = f"{OPT_FS_NAME}_causallm_{MAX_SEQUENCE_LENGTH}_torch_{DEVICE}_tiled_ukernels.vmfb"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
    if not os.path.isfile(vmfb_name):
        print(f"vmfb not found. compiling and saving to {vmfb_name}")
@@ -91,20 +89,9 @@ def load_shark_model() -> ModelWrapper:
    return ModelWrapper(model=shark_module, tokenizer=tokenizer)


-def run_shark_model(model_wrapper: ModelWrapper, prompt: str):
-    model_inputs = model_wrapper.tokenizer(
-        prompt,
-        padding="max_length",
-        max_length=MAX_SEQUENCE_LENGTH,
-        truncation=True,
-        return_tensors="pt",
-    )
-    inputs = (
-        model_inputs["input_ids"],
-        model_inputs["attention_mask"],
-    )
+def run_shark_model(model_wrapper: ModelWrapper, tokens):
    # Generate logits output of OPT model.
-    return model_wrapper.model("forward", inputs)
+    return model_wrapper.model("forward", tokens)


 def run_shark():
@@ -124,16 +111,14 @@ def load_huggingface_model() -> ModelWrapper:
    )


-def run_huggingface_model(model_wrapper: ModelWrapper, prompt: str):
-    inputs = model_wrapper.tokenizer(prompt, return_tensors="pt")
+def run_huggingface_model(model_wrapper: ModelWrapper, tokens):
    return model_wrapper.model.forward(
-        inputs.input_ids, inputs.attention_mask, return_dict=False
+        tokens.input_ids, tokens.attention_mask, return_dict=False
    )


 def run_huggingface():
    model_wrapper = load_huggingface_model()
-
    prompt = "What is the meaning of life?"
    logits = run_huggingface_model(model_wrapper, prompt)

@@ -150,11 +135,21 @@ def collect_huggingface_logits():
    model_wrapper = load_huggingface_model()
    print("--- Took {} seconds to load Huggingface.".format(time.time() - t0))
    results = []
-    t0 = time.time()
+    tokenized_prompts = []
    for prompt in PROMPTS:
-        print("prompt: {}".format(prompt))
-        logits = run_huggingface_model(model_wrapper, prompt)
-        results.append([prompt, logits[0].tolist()])
+        tokens = model_wrapper.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=MAX_SEQUENCE_LENGTH,
+            truncation=True,
+            return_tensors="pt",
+        )
+        tokenized_prompts.append(tokens)
+    t0 = time.time()
+    for idx, tokens in enumerate(tokenized_prompts):
+        print("prompt: {}".format(PROMPTS[idx]))
+        logits = run_huggingface_model(model_wrapper, tokens)
+        results.append([PROMPTS[idx], logits[0].tolist()])
    print("--- Took {} seconds to run Huggingface.".format(time.time() - t0))
    save_json(results, "/tmp/huggingface.json")

@@ -164,12 +159,26 @@ def collect_shark_logits():
    model_wrapper = load_shark_model()
    print("--- Took {} seconds to load Shark.".format(time.time() - t0))
    results = []
-    t0 = time.time()
+    tokenized_prompts = []
    for prompt in PROMPTS:
-        print("prompt: {}".format(prompt))
-        logits = run_shark_model(model_wrapper, prompt)
+        tokens = model_wrapper.tokenizer(
+            prompt,
+            padding="max_length",
+            truncation=True,
+            max_length=MAX_SEQUENCE_LENGTH,
+            return_tensors="pt",
+        )
+        inputs = (
+            tokens["input_ids"],
+            tokens["attention_mask"],
+        )
+        tokenized_prompts.append(inputs)
+    t0 = time.time()
+    for idx, tokens in enumerate(tokenized_prompts):
+        print("prompt: {}".format(PROMPTS[idx]))
+        logits = run_shark_model(model_wrapper, tokens)
        lst = [e.tolist() for e in logits]
-        results.append([prompt, lst])
+        results.append([PROMPTS[idx], lst])
    print("--- Took {} seconds to run Shark.".format(time.time() - t0))
    save_json(results, "/tmp/shark.json")
Author	SHA1	Message	Date
Stella Laurenzo	9e37e03741	Clearly differentiate phases of loading modules to better understand if things are taking a long time. (#1733 )	2023-08-07 14:03:12 -07:00
Stefan Kapusniak	9b8c4401b5	gpt_langchain.py fixes for pydantic (#1722 )	2023-08-07 00:55:38 -07:00
Ean Garvey	a9f95a218b	Remove SD from all_models.csv (#1706 ) Removes SD from pytests as it has its own test suite.	2023-08-05 15:55:52 -05:00
PhaneeshB	872bd72d0b	fix name check for file existence	2023-08-05 21:33:53 +05:30
Eliasj42	fd1c4db5d0	download all mlirs (#1727 ) Co-authored-by: Elias Joseph <elias@nod-labs.com>	2023-08-04 18:22:06 -05:00
Daniel Garvey	759664bb48	add py files to pyinstaller for shark (#1723 )	2023-08-04 14:10:43 -07:00
Daniel Garvey	14fd0cdd87	add missing subprocess import (#1721 )	2023-08-04 15:15:22 -05:00
Daniel Garvey	a57eccc997	fix lint (#1720 )	2023-08-04 14:54:33 -05:00
Daniel Garvey	a686d7d89f	temporarily disable langchain stuff in webui (#1719 ) its breaking the exe	2023-08-04 12:48:06 -07:00
Eliasj42	ed484b8253	added functionality for int8 vicuna and 4 shards (#1712 ) combined vicuna_4_shards.py and vicuna.py to reduce code duplication Co-authored-by: Elias Joseph <elias@nod-labs.com>	2023-08-04 14:05:05 -05:00
gpetters94	7fe57ebaaf	Add vector database and add support on the web UI (#1699 )	2023-08-04 13:47:19 -04:00
Nithin Meganathan	c287fd2be8	Add GPU IDs in model_config.json by default for manual annotation (#1718 )	2023-08-04 12:46:27 -05:00
Gaurav Shukla	51ec1a1360	[vicuna] Integrate sharded vicuna in web (#1717 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-08-04 11:46:53 -05:00
Gaurav Shukla	bd30044c0b	[Shard] Add sharding generation in shark studio Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-08-04 21:51:14 +05:30
Ean Garvey	c9de2729b2	Add flag for toggling constant folding. (#1714 )	2023-08-04 04:55:52 -07:00
Vivek Khandelwal	a5b13fcc2f	[Langchain] Patch for fixing streaming of tokens (#1709 )	2023-08-03 10:06:49 -07:00
Stefan Kapusniak	6bb329c4af	Unsharded Vicuna: Fix Memory Error compiling mlir for lmsys/vicuna-7b-v1.3 fp16 with 64 GiB (#1702 )	2023-08-01 06:07:56 -07:00
Vivek Khandelwal	98fb6c52df	Expand pipelines to fix streaming of tokens	2023-07-31 22:11:01 +05:30
Stefan Kapusniak	206c1b70f4	UI/Web: Reorder tabs to separate SD and LLM (#1701 ) Shuffle the tabs around so that: * All the SD tabs are together * All the LLM tabs are together * All the experimental tabs are together	2023-07-29 22:25:30 -04:00
PhaneeshB	cdb037ee54	use shark_args for vulkan debug utils flag	2023-07-30 07:54:26 +05:30
PhaneeshB	ce2fd84538	fix cpu device name for SharkStudio	2023-07-30 07:54:26 +05:30
PhaneeshB	4684afad34	update upscaler example	2023-07-28 21:06:28 +05:30
PhaneeshB	8d65456b7a	Move vulkan runtime flags to shark_args	2023-07-28 21:06:28 +05:30
PhaneeshB	d6759a852b	add vulkan vma alloc flag	2023-07-28 21:06:28 +05:30
Daniel Garvey	ab57af43c1	Couple of fixes for vicuna.py (#1696 ) * mega vicuna merge pt 2 * add fallback to ensure compile is called	2023-07-27 15:53:05 -07:00
jinchen62	4d5c55dd9f	Fix vicuna script (#1697 )	2023-07-27 17:24:26 -05:00
Vivek Khandelwal	07399ad65c	[Langchain] Remove unused code (#1698 )	2023-07-27 11:59:54 -05:00
Vivek Khandelwal	776a9c2293	Fix for Langchain (#1694 ) For CPU, remove max time stopping criteria Fix web UI issue	2023-07-26 09:00:23 -07:00
Eliasj42	9d399eb988	fixed bug where device_idx was hardcoded (#1693 ) Co-authored-by: Elias Joseph <elias@nod-labs.com>	2023-07-25 19:00:13 -05:00
Vivek Khandelwal	927b662aa7	Add Langchain SHARK Compilation support for all paths	2023-07-25 22:15:42 +05:30
Abhishek Varma	47f8a79c75	[MiniGPT4] Add MiniGPT4 to SHARK (#1554 ) * [MiniGPT4] Add MiniGPT4 to SHARK -- This is the first installment of MiniGPT4 in SHARK. Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> * Add int8 support for MiniGPT4 -- This commit adds int8 support for MiniGPT4. Signed-off-by: Abhishek Varma <abhishek@nod-lab.com> * Update .spec for MiniGPT4's config files * black format MiniGPT4 --------- Signed-off-by: Abhishek Varma <abhishek@nod-labs.com> Signed-off-by: Abhishek Varma <abhishek@nod-lab.com>	2023-07-25 09:42:27 -07:00
Stefan Kapusniak	289f983f41	SD - Implement seed arrays for batch runs (#1690 ) * SD Scripts and UI tabs that support batch_count can now take a string containing a JSON array, or a list of integers, as their seed input. * Each batch in a run will now take the seed specified at the corresponding array index if one exists. If there is no seed at that index, the seed value will be treated as -1 and a random seed will be assigned at that position. If an integer rather than a list or json array has been, everything works as before. * UI seed input controls are now Textboxes with info lines about the seed formats allowed. * UI error handling updated to be more helpful if the seed input is invalid.	2023-07-24 19:22:34 -07:00
Daniel Garvey	453e46562f	mega vicuna merge pt 2 (#1685 )	2023-07-24 12:42:20 -05:00
Gaurav Shukla	5497af1f56	[config] Add support for uploading sharding config file in chatbot (#1689 ) Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com>	2023-07-24 10:18:03 -07:00
Vivek Khandelwal	f3cb63fc9c	Fix Langchain multiple device issue (#1688 )	2023-07-24 08:03:46 -07:00
Vivek Khandelwal	d7092aafaa	Fix multiple issue for Langchain This commit fixes the following issue for the Langchain: 1.) Web UI not able to fetch results. 2.) For each query model getting reloaded. 3.) SHARK module not using user provided device and precision. 4.) Create a class for main Langchain code. 5.) Misc issues	2023-07-21 21:56:27 +05:30
Vivek Khandelwal	a415f3f70e	Fix Langchain Prompt issue and add web UI support (#1682 )	2023-07-21 06:36:55 -07:00
Vivek Khandelwal	c292e5c9d7	Add Langchain CPU support and update requirements	2023-07-20 18:53:34 +05:30
Vivek Khandelwal	03c4d9e171	Add support for Llama-2-70b for web and cli, and for hf_auth_token	2023-07-20 14:57:48 +05:30
jinchen62	3662224c04	Update brevitas requirement (#1677 ) also clean up useless args Co-authored-by: powderluv <powderluv@users.noreply.github.com>	2023-07-19 22:03:32 -07:00
Vivek Khandelwal	db3f222933	Revert "Add Llama2 70B option in CLI and WebUI (#1673 )" (#1679 ) This reverts commit `41e5088908`.	2023-07-19 22:02:48 -07:00
Stefan Kapusniak	68b3021325	Fixes cosmetic problems with Gradio 3.37.0 (#1676 ) * Fix nod-ai logo having a white border * Fix control labels having a black background * Remove extra lower border below Save Prompt checkboxes in Txt2Img UI	2023-07-19 17:28:53 -07:00
AyaanShah2204	336469154d	added copy-metadata for pyyaml (#1678 )	2023-07-19 17:27:25 -07:00
Abhishek Varma	41e5088908	Add Llama2 70B option in CLI and WebUI (#1673 )	2023-07-19 10:41:42 -07:00
PhaneeshB	0a8f7673f4	Add README for CodeGen server	2023-07-19 23:10:23 +05:30
PhaneeshB	c482ab78da	fix second vic clearing for low mem device	2023-07-19 23:10:23 +05:30
Vivek Khandelwal	4be80f7158	Add support for the Llama-2 model	2023-07-19 20:57:08 +05:30
AyaanShah2204	536aba1424	unpinned torch_mlir (#1668 ) Co-authored-by: powderluv <powderluv@users.noreply.github.com>	2023-07-19 06:28:00 -07:00
Ean Garvey	dd738a0e02	small changes to opt_perf_comparison.py (#1670 ) * Use longer prompts for OPT comparison script * small tweaks	2023-07-19 06:26:50 -07:00
Daniel Garvey	8927cb0a2c	set optional vmfb download (#1667 )	2023-07-18 10:57:28 -07:00
Daniel Garvey	8c317e4809	fix cli for vicuna (#1666 )	2023-07-18 10:03:40 -07:00
Vivek Khandelwal	b0136593df	Add support for different compilation paths for DocuChat (#1665 )	2023-07-18 09:49:44 -07:00
Vivek Khandelwal	11f62d7fac	Minor fixes for MiniLM Training	2023-07-18 17:16:44 +05:30
powderluv	14559dd620	Update DocuChat as experimental (#1660 )	2023-07-17 22:12:05 -07:00
AyaanShah2204	e503a3e8d6	fixed joblib import error (#1659 )	2023-07-17 12:56:10 -07:00
AyaanShah2204	22a4254adf	fixed pyinstaller path for langchain imports (#1658 )	2023-07-17 12:19:21 -07:00
Vivek Khandelwal	ab01f0f048	Add Langchain model in SHARK (#1657 ) * Add H2OGPT * Add UI tab for h2ogpt * Add source files from h2ogpt * Add the rest of the files * Add h2ogpt support * Add SHARK Compilation support for langchain model for cli mode --------- Co-authored-by: George Petterson <gpetters@protonmail.com>	2023-07-17 09:58:15 -07:00