Compare commits

...

3 Commits

Author            SHA1        Message                                                        Date
Ean Garvey        2c2693fb7d  Fix torchvision versioning in Linux importer setup. (#1809)   2023-09-05 12:57:03 -05:00
Vivek Khandelwal  1d31b2b2c6  Fix StableHLO Compilation flag                                 2023-09-05 21:32:33 +05:30
Gaurav Shukla     d2f64eefa3  [chatbot] Remove few outdated models from list (#1814)        2023-09-04 09:26:32 -07:00
4 changed files with 76 additions and 200 deletions

View File

@@ -413,8 +413,7 @@ class VicunaBase(SharkLLMBase):
_past_key_values = torch.tensor(output[1:])
_token = torch.argmax(_logits[:, -1, :], dim=1)
skip_sp_tok = True if self.model_name == "codegen" else False
_detok = self.tokenizer.decode(_token, skip_special_tokens=skip_sp_tok)
_detok = self.tokenizer.decode(_token, skip_special_tokens=False)
ret_dict = {
"token": _token,
"detok": _detok,
@@ -465,17 +464,11 @@ class ShardedVicuna(VicunaBase):
kwargs = {
"use_auth_token": "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"
}
if self.model_name == "codegen":
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
trust_remote_code=True,
)
else:
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
use_fast=False,
**kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
use_fast=False,
**kwargs,
)
return tokenizer
def get_src_model(self):
@@ -1284,17 +1277,11 @@ class UnshardedVicuna(VicunaBase):
def get_tokenizer(self):
kwargs = {"use_auth_token": self.hf_auth_token}
if self.model_name == "codegen":
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
trust_remote_code=True,
)
else:
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
use_fast=False,
**kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
use_fast=False,
**kwargs,
)
return tokenizer
def get_src_model(self):
@@ -1448,10 +1435,7 @@ class UnshardedVicuna(VicunaBase):
print("[DEBUG] generating mlir on device")
# Select a compilation prompt such that the resulting input_ids
# from the model's tokenizer has shape [1, 19]
if self.model_name == "codegen":
compilation_prompt = "def hello_world():\n print('Hello World')\n print('Hello World')"
else:
compilation_prompt = "".join(["0" for _ in range(17)])
compilation_prompt = "".join(["0" for _ in range(17)])
first_model_path = f"first_{self.model_name}_{self.precision}.mlir"
if Path(first_model_path).exists():
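
The [1, 19] requirement mentioned in the comment above can be checked directly against the tokenizer. A minimal sketch, assuming a LLaMA-family checkpoint such as lmsys/vicuna-7b-v1.3 (the exact model name is an assumption for illustration, not taken from this diff):

from transformers import AutoTokenizer

# Hypothetical check, not part of this PR: the printed shape should match the
# [1, 19] requirement stated in the comment above.
tok = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.3", use_fast=False)
compilation_prompt = "".join(["0" for _ in range(17)])
print(tok(compilation_prompt, return_tensors="pt").input_ids.shape)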
@@ -1683,9 +1667,8 @@ class UnshardedVicuna(VicunaBase):
if type(res_tokens[i]) != int:
res_tokens[i] = int(res_tokens[i][0])
skip_sp_tok = True if self.model_name == "codegen" else False
res_str = self.tokenizer.decode(
res_tokens, skip_special_tokens=skip_sp_tok
res_tokens, skip_special_tokens=False
)
return res_str
@@ -1728,7 +1711,7 @@ class UnshardedVicuna(VicunaBase):
pkv = generated_token_op["past_key_values"]
detok = generated_token_op["detok"]
if token == 2 and self.model_name != "codegen":
if token == 2:
break
res_tokens.append(token)
if detok == "<0x0A>":
@@ -1776,33 +1759,11 @@ start_message = {
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
),
"StableLM": (
"<|SYSTEM|># StableLM Tuned (Alpha version)"
"\n- StableLM is a helpful and harmless open-source AI language model "
"developed by StabilityAI."
"\n- StableLM is excited to be able to help the user, but will refuse "
"to do anything that could be considered harmful to the user."
"\n- StableLM is more than just an information source, StableLM is also "
"able to write poetry, short stories, and make jokes."
"\n- StableLM will refuse to participate in anything that "
"could harm a human."
),
"vicuna": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna4": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna1p3": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"codegen": "",
}
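
With codegen removed, the decode and stopping logic above no longer branches on the model name: the tokenizer is always loaded with use_fast=False plus the auth kwargs, decoding always keeps special tokens, and generation stops unconditionally on token id 2. A minimal illustration (the checkpoint name is an example of a LLaMA-family tokenizer, not something this PR pins):

from transformers import AutoTokenizer

# Illustrative only: token id 2 is the EOS marker for LLaMA-family tokenizers,
# which is why `if token == 2: break` no longer needs a codegen exception, and
# decoding with skip_special_tokens=False keeps markers like "</s>" visible.
tok = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.3", use_fast=False)
print(tok.eos_token_id)                             # 2
print(tok.decode([2], skip_special_tokens=False))   # "</s>"
print(tok.decode([2], skip_special_tokens=True))    # "" (marker dropped)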

View File

@@ -26,11 +26,7 @@ model_map = {
"llama2_7b": "meta-llama/Llama-2-7b-chat-hf",
"llama2_13b": "meta-llama/Llama-2-13b-chat-hf",
"llama2_70b": "meta-llama/Llama-2-70b-chat-hf",
"codegen": "Salesforce/codegen25-7b-multi",
"vicuna1p3": "lmsys/vicuna-7b-v1.3",
"vicuna": "TheBloke/vicuna-7B-1.1-HF",
"vicuna4": "TheBloke/vicuna-7B-1.1-HF",
"StableLM": "stabilityai/stablelm-tuned-alpha-3b",
}
# NOTE: Each `model_name` should have its own start message
@@ -62,33 +58,11 @@ start_message = {
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
),
"StableLM": (
"<|SYSTEM|># StableLM Tuned (Alpha version)"
"\n- StableLM is a helpful and harmless open-source AI language model "
"developed by StabilityAI."
"\n- StableLM is excited to be able to help the user, but will refuse "
"to do anything that could be considered harmful to the user."
"\n- StableLM is more than just an information source, StableLM is also "
"able to write poetry, short stories, and make jokes."
"\n- StableLM will refuse to participate in anything that "
"could harm a human."
),
"vicuna": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna4": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna1p3": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"codegen": "",
}
@@ -96,10 +70,7 @@ def create_prompt(model_name, history):
system_message = start_message[model_name]
if model_name in [
"StableLM",
"vicuna",
"vicuna4",
"vicuna1p3",
"llama2_7b",
"llama2_13b",
"llama2_70b",
@@ -183,123 +154,68 @@ def chat(
else:
print("unrecognized device")
from apps.language_models.scripts.vicuna import ShardedVicuna
from apps.language_models.scripts.vicuna import UnshardedVicuna
from apps.stable_diffusion.src import args
new_model_vmfb_key = f"{model_name}#{model_path}#{device}#{precision}"
if model_name in [
"vicuna",
"vicuna4",
"vicuna1p3",
"codegen",
"llama2_7b",
"llama2_13b",
"llama2_70b",
]:
from apps.language_models.scripts.vicuna import ShardedVicuna
from apps.language_models.scripts.vicuna import UnshardedVicuna
from apps.stable_diffusion.src import args
if new_model_vmfb_key != model_vmfb_key:
model_vmfb_key = new_model_vmfb_key
max_toks = 128 if model_name == "codegen" else 512
# get iree flags that need to be overridden, from commandline args
_extra_args = []
# vulkan target triple
if args.iree_vulkan_target_triple != "":
_extra_args.append(
f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
)
if model_name == "vicuna4":
vicuna_model = ShardedVicuna(
model_name,
hf_model_path=model_path,
device=device,
precision=precision,
max_num_tokens=max_toks,
compressed=True,
extra_args_cmd=_extra_args,
)
else:
# if config_file is None:
vicuna_model = UnshardedVicuna(
model_name,
hf_model_path=model_path,
hf_auth_token=args.hf_auth_token,
device=device,
precision=precision,
max_num_tokens=max_toks,
download_vmfb=download_vmfb,
load_mlir_from_shark_tank=True,
extra_args_cmd=_extra_args,
)
# else:
# if config_file is not None:
# config_file = open(config_file)
# config_json = json.load(config_file)
# config_file.close()
# else:
# config_json = get_default_config()
# vicuna_model = ShardedVicuna(
# model_name,
# device=device,
# precision=precision,
# config_json=config_json,
# )
prompt = create_prompt(model_name, history)
partial_text = ""
count = 0
start_time = time.time()
for text, msg in progress.tqdm(
vicuna_model.generate(prompt, cli=cli),
desc="generating response",
):
count += 1
if "formatted" in msg:
history[-1][1] = text
end_time = time.time()
tokens_per_sec = count / (end_time - start_time)
yield history, str(
format(tokens_per_sec, ".2f")
) + " tokens/sec"
else:
partial_text += text + " "
history[-1][1] = partial_text
yield history, ""
return history, ""
# else Model is StableLM
global sharkModel
from apps.language_models.src.pipelines.stablelm_pipeline import (
SharkStableLM,
)
if new_model_vmfb_key != model_vmfb_key:
model_vmfb_key = new_model_vmfb_key
# max_new_tokens=512
shark_slm = SharkStableLM(
model_name
) # pass elements from UI as required
max_toks = 128 if model_name == "codegen" else 512
# get iree flags that need to be overridden, from commandline args
_extra_args = []
# vulkan target triple
if args.iree_vulkan_target_triple != "":
_extra_args.append(
f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
)
if model_name == "vicuna4":
vicuna_model = ShardedVicuna(
model_name,
hf_model_path=model_path,
device=device,
precision=precision,
max_num_tokens=max_toks,
compressed=True,
extra_args_cmd=_extra_args,
)
else:
# if config_file is None:
vicuna_model = UnshardedVicuna(
model_name,
hf_model_path=model_path,
hf_auth_token=args.hf_auth_token,
device=device,
precision=precision,
max_num_tokens=max_toks,
download_vmfb=download_vmfb,
load_mlir_from_shark_tank=True,
extra_args_cmd=_extra_args,
)
# Construct the input message string for the model by concatenating the
# current system message and conversation history
if len(curr_system_message.split()) > 160:
print("clearing context")
prompt = create_prompt(model_name, history)
generate_kwargs = dict(prompt=prompt)
words_list = shark_slm.generate(**generate_kwargs)
partial_text = ""
for new_text in words_list:
partial_text += new_text
history[-1][1] = partial_text
# Yield an empty string to clean up the message textbox and the updated
# conversation history
yield history
return words_list
count = 0
start_time = time.time()
for text, msg in progress.tqdm(
vicuna_model.generate(prompt, cli=cli),
desc="generating response",
):
count += 1
if "formatted" in msg:
history[-1][1] = text
end_time = time.time()
tokens_per_sec = count / (end_time - start_time)
yield history, str(format(tokens_per_sec, ".2f")) + " tokens/sec"
else:
partial_text += text + " "
history[-1][1] = partial_text
yield history, ""
return history, ""
def llm_chat_api(InputData: dict):
@@ -417,7 +333,7 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
)
model = gr.Dropdown(
label="Select Model",
value=model_choices[4],
value=model_choices[1],
choices=model_choices,
)
supported_devices = available_devices
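
For reference, the tokens/sec figure yielded by the streaming loop above is just the number of streamed chunks divided by elapsed wall-clock time. A standalone sketch of that pattern (function and variable names here are illustrative, not from the PR):

import time

def stream_with_rate(token_stream):
    # Count streamed chunks and report a running tokens/sec figure, the same
    # arithmetic used for the "tokens/sec" string yielded in the loop above.
    count, start = 0, time.time()
    for text in token_stream:
        count += 1
        rate = count / max(time.time() - start, 1e-9)
        yield text, f"{rate:.2f} tokens/sec"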

View File

@@ -130,14 +130,13 @@ fi
$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/cpu/
if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
T_VER=$($PYTHON -m pip show torch | grep Version)
TORCH_VERSION=${T_VER:9:17}
T_VER_MIN=${T_VER:14:12}
TV_VER=$($PYTHON -m pip show torchvision | grep Version)
TV_VERSION=${TV_VER:9:18}
$PYTHON -m pip uninstall -y torch torchvision
$PYTHON -m pip install -U --pre --no-warn-conflicts triton
$PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu118/torch-${TORCH_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu118/torchvision-${TV_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl
TV_VER_MAJ=${TV_VER:9:6}
$PYTHON -m pip uninstall -y torchvision
$PYTHON -m pip install torchvision==${TV_VER_MAJ}${T_VER_MIN} --no-deps -f https://download.pytorch.org/whl/nightly/cpu/torchvision/
if [ $? -eq 0 ];then
echo "Successfully Installed torch + cu118."
else
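
The substring offsets above are easier to follow with a concrete `pip show` line. A minimal bash sketch of the slicing, assuming the usual "Version: X.Y.Z.devYYYYMMDD" nightly format (the version strings below are placeholders):

# Placeholder version strings for illustration; real values come from
# `pip show torch` / `pip show torchvision` as in the script above.
T_VER="Version: 2.1.0.dev20230905"
TV_VER="Version: 0.16.0.dev20230905"

T_VER_MIN=${T_VER:14:12}    # ".dev20230905" -- torch's nightly date suffix
TV_VER_MAJ=${TV_VER:9:6}    # "0.16.0"       -- torchvision major.minor.patch

# The script then pins torchvision to the same nightly date as the installed torch:
echo "torchvision==${TV_VER_MAJ}${T_VER_MIN}"   # torchvision==0.16.0.dev20230905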

View File

@@ -84,7 +84,7 @@ def get_iree_frontend_args(frontend):
elif frontend in ["tensorflow", "tf", "mhlo", "stablehlo"]:
return [
"--iree-llvmcpu-target-cpu-features=host",
"--iree-flow-demote-i64-to-i32",
"--iree-input-demote-i64-to-i32",
]
else:
# Frontend not found.
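
The "Fix StableHLO Compilation flag" commit swaps the old --iree-flow-demote-i64-to-i32 spelling for --iree-input-demote-i64-to-i32, matching the flag name expected by the IREE versions this frontend targets. A rough usage sketch of passing these frontend flags through; the input_type/extra_args usage is an assumption about the iree.compiler Python API, and "llvm-cpu" is a placeholder backend:

import iree.compiler as ireec

def compile_stablehlo(module_str: str) -> bytes:
    # Sketch only: forward the flags returned by get_iree_frontend_args("stablehlo")
    # to the IREE compiler as extra command-line arguments.
    return ireec.compile_str(
        module_str,
        input_type="stablehlo",
        target_backends=["llvm-cpu"],
        extra_args=[
            "--iree-llvmcpu-target-cpu-features=host",
            "--iree-input-demote-i64-to-i32",
        ],
    )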