Compare commits

...

3 Commits

Author            SHA1        Message                                                        Date
Ean Garvey        2c2693fb7d  Fix torchvision versioning in Linux importer setup. (#1809)   2023-09-05 12:57:03 -05:00
Vivek Khandelwal  1d31b2b2c6  Fix StableHLO Compilation flag                                 2023-09-05 21:32:33 +05:30
Gaurav Shukla     d2f64eefa3  [chatbot] Remove few outdated models from list (#1814)        2023-09-04 09:26:32 -07:00
4 changed files with 76 additions and 200 deletions

View File

@@ -413,8 +413,7 @@ class VicunaBase(SharkLLMBase):
_past_key_values = torch.tensor(output[1:])
_token = torch.argmax(_logits[:, -1, :], dim=1)
skip_sp_tok = True if self.model_name == "codegen" else False
_detok = self.tokenizer.decode(_token, skip_special_tokens=skip_sp_tok)
_detok = self.tokenizer.decode(_token, skip_special_tokens=False)
ret_dict = {
"token": _token,
"detok": _detok,
@@ -465,17 +464,11 @@ class ShardedVicuna(VicunaBase):
kwargs = {
"use_auth_token": "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"
}
if self.model_name == "codegen":
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
trust_remote_code=True,
)
else:
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
use_fast=False,
**kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
use_fast=False,
**kwargs,
)
return tokenizer
def get_src_model(self):
@@ -1284,17 +1277,11 @@ class UnshardedVicuna(VicunaBase):
def get_tokenizer(self):
kwargs = {"use_auth_token": self.hf_auth_token}
if self.model_name == "codegen":
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
trust_remote_code=True,
)
else:
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
use_fast=False,
**kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
self.hf_model_path,
use_fast=False,
**kwargs,
)
return tokenizer
def get_src_model(self):
@@ -1448,10 +1435,7 @@ class UnshardedVicuna(VicunaBase):
print("[DEBUG] generating mlir on device")
# Select a compilation prompt such that the resulting input_ids
# from the model's tokenizer has shape [1, 19]
if self.model_name == "codegen":
compilation_prompt = "def hello_world():\n print('Hello World')\n print('Hello World')"
else:
compilation_prompt = "".join(["0" for _ in range(17)])
compilation_prompt = "".join(["0" for _ in range(17)])
first_model_path = f"first_{self.model_name}_{self.precision}.mlir"
if Path(first_model_path).exists():
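
The [1, 19] requirement mentioned in the comment above can be checked directly against the tokenizer. A minimal sketch, assuming a LLaMA-family checkpoint such as lmsys/vicuna-7b-v1.3 (the exact model name is an assumption for illustration, not taken from this diff):

from transformers import AutoTokenizer

# Hypothetical check, not part of this PR: the printed shape should match the
# [1, 19] requirement stated in the comment above.
tok = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.3", use_fast=False)
compilation_prompt = "".join(["0" for _ in range(17)])
print(tok(compilation_prompt, return_tensors="pt").input_ids.shape)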
@@ -1683,9 +1667,8 @@ class UnshardedVicuna(VicunaBase):
if type(res_tokens[i]) != int:
res_tokens[i] = int(res_tokens[i][0])
skip_sp_tok = True if self.model_name == "codegen" else False
res_str = self.tokenizer.decode(
res_tokens, skip_special_tokens=skip_sp_tok
res_tokens, skip_special_tokens=False
)
return res_str
@@ -1728,7 +1711,7 @@ class UnshardedVicuna(VicunaBase):
pkv = generated_token_op["past_key_values"]
detok = generated_token_op["detok"]
if token == 2 and self.model_name != "codegen":
if token == 2:
break
res_tokens.append(token)
if detok == "<0x0A>":
@@ -1776,33 +1759,11 @@ start_message = {
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
),
"StableLM": (
"<|SYSTEM|># StableLM Tuned (Alpha version)"
"\n- StableLM is a helpful and harmless open-source AI language model "
"developed by StabilityAI."
"\n- StableLM is excited to be able to help the user, but will refuse "
"to do anything that could be considered harmful to the user."
"\n- StableLM is more than just an information source, StableLM is also "
"able to write poetry, short stories, and make jokes."
"\n- StableLM will refuse to participate in anything that "
"could harm a human."
),
"vicuna": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna4": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna1p3": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"codegen": "",
}
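
With codegen removed, the decode and stopping logic above no longer branches on the model name: the tokenizer is always loaded with use_fast=False plus the auth kwargs, decoding always keeps special tokens, and generation stops unconditionally on token id 2. A minimal illustration (the checkpoint name is an example of a LLaMA-family tokenizer, not something this PR pins):

from transformers import AutoTokenizer

# Illustrative only: token id 2 is the EOS marker for LLaMA-family tokenizers,
# which is why `if token == 2: break` no longer needs a codegen exception, and
# decoding with skip_special_tokens=False keeps markers like "</s>" visible.
tok = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.3", use_fast=False)
print(tok.eos_token_id)                             # 2
print(tok.decode([2], skip_special_tokens=False))   # "</s>"
print(tok.decode([2], skip_special_tokens=True))    # "" (marker dropped)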

View File

@@ -26,11 +26,7 @@ model_map = {
"llama2_7b": "meta-llama/Llama-2-7b-chat-hf",
"llama2_13b": "meta-llama/Llama-2-13b-chat-hf",
"llama2_70b": "meta-llama/Llama-2-70b-chat-hf",
"codegen": "Salesforce/codegen25-7b-multi",
"vicuna1p3": "lmsys/vicuna-7b-v1.3",
"vicuna": "TheBloke/vicuna-7B-1.1-HF",
"vicuna4": "TheBloke/vicuna-7B-1.1-HF",
"StableLM": "stabilityai/stablelm-tuned-alpha-3b",
}
# NOTE: Each `model_name` should have its own start message
@@ -62,33 +58,11 @@ start_message = {
"explain why instead of answering something not correct. If you don't know the "
"answer to a question, please don't share false information."
),
"StableLM": (
"<|SYSTEM|># StableLM Tuned (Alpha version)"
"\n- StableLM is a helpful and harmless open-source AI language model "
"developed by StabilityAI."
"\n- StableLM is excited to be able to help the user, but will refuse "
"to do anything that could be considered harmful to the user."
"\n- StableLM is more than just an information source, StableLM is also "
"able to write poetry, short stories, and make jokes."
"\n- StableLM will refuse to participate in anything that "
"could harm a human."
),
"vicuna": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna4": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"vicuna1p3": (
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's "
"questions.\n"
),
"codegen": "",
}
@@ -96,10 +70,7 @@ def create_prompt(model_name, history):
system_message = start_message[model_name]
if model_name in [
"StableLM",
"vicuna",
"vicuna4",
"vicuna1p3",
"llama2_7b",
"llama2_13b",
"llama2_70b",
@@ -183,123 +154,68 @@ def chat(
else:
print("unrecognized device")
from apps.language_models.scripts.vicuna import ShardedVicuna
from apps.language_models.scripts.vicuna import UnshardedVicuna
from apps.stable_diffusion.src import args
new_model_vmfb_key = f"{model_name}#{model_path}#{device}#{precision}"
if model_name in [
"vicuna",
"vicuna4",
"vicuna1p3",
"codegen",
"llama2_7b",
"llama2_13b",
"llama2_70b",
]:
from apps.language_models.scripts.vicuna import ShardedVicuna
from apps.language_models.scripts.vicuna import UnshardedVicuna
from apps.stable_diffusion.src import args
if new_model_vmfb_key != model_vmfb_key:
model_vmfb_key = new_model_vmfb_key
max_toks = 128 if model_name == "codegen" else 512
# get iree flags that need to be overridden, from commandline args
_extra_args = []
# vulkan target triple
if args.iree_vulkan_target_triple != "":
_extra_args.append(
f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
)
if model_name == "vicuna4":
vicuna_model = ShardedVicuna(
model_name,
hf_model_path=model_path,
device=device,
precision=precision,
max_num_tokens=max_toks,
compressed=True,
extra_args_cmd=_extra_args,
)
else:
# if config_file is None:
vicuna_model = UnshardedVicuna(
model_name,
hf_model_path=model_path,
hf_auth_token=args.hf_auth_token,
device=device,
precision=precision,
max_num_tokens=max_toks,
download_vmfb=download_vmfb,
load_mlir_from_shark_tank=True,
extra_args_cmd=_extra_args,
)
# else:
# if config_file is not None:
# config_file = open(config_file)
# config_json = json.load(config_file)
# config_file.close()
# else:
# config_json = get_default_config()
# vicuna_model = ShardedVicuna(
# model_name,
# device=device,
# precision=precision,
# config_json=config_json,
# )
prompt = create_prompt(model_name, history)
partial_text = ""
count = 0
start_time = time.time()
for text, msg in progress.tqdm(
vicuna_model.generate(prompt, cli=cli),
desc="generating response",
):
count += 1
if "formatted" in msg:
history[-1][1] = text
end_time = time.time()
tokens_per_sec = count / (end_time - start_time)
yield history, str(
format(tokens_per_sec, ".2f")
) + " tokens/sec"
else:
partial_text += text + " "
history[-1][1] = partial_text
yield history, ""
return history, ""
# else Model is StableLM
global sharkModel
from apps.language_models.src.pipelines.stablelm_pipeline import (
SharkStableLM,
)
if new_model_vmfb_key != model_vmfb_key:
model_vmfb_key = new_model_vmfb_key
# max_new_tokens=512
shark_slm = SharkStableLM(
model_name
) # pass elements from UI as required
max_toks = 128 if model_name == "codegen" else 512
# get iree flags that need to be overridden, from commandline args
_extra_args = []
# vulkan target triple
if args.iree_vulkan_target_triple != "":
_extra_args.append(
f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
)
if model_name == "vicuna4":
vicuna_model = ShardedVicuna(
model_name,
hf_model_path=model_path,
device=device,
precision=precision,
max_num_tokens=max_toks,
compressed=True,
extra_args_cmd=_extra_args,
)
else:
# if config_file is None:
vicuna_model = UnshardedVicuna(
model_name,
hf_model_path=model_path,
hf_auth_token=args.hf_auth_token,
device=device,
precision=precision,
max_num_tokens=max_toks,
download_vmfb=download_vmfb,
load_mlir_from_shark_tank=True,
extra_args_cmd=_extra_args,
)
# Construct the input message string for the model by concatenating the
# current system message and conversation history
if len(curr_system_message.split()) > 160:
print("clearing context")
prompt = create_prompt(model_name, history)
generate_kwargs = dict(prompt=prompt)
words_list = shark_slm.generate(**generate_kwargs)
partial_text = ""
for new_text in words_list:
partial_text += new_text
history[-1][1] = partial_text
# Yield an empty string to clean up the message textbox and the updated
# conversation history
yield history
return words_list
count = 0
start_time = time.time()
for text, msg in progress.tqdm(
vicuna_model.generate(prompt, cli=cli),
desc="generating response",
):
count += 1
if "formatted" in msg:
history[-1][1] = text
end_time = time.time()
tokens_per_sec = count / (end_time - start_time)
yield history, str(format(tokens_per_sec, ".2f")) + " tokens/sec"
else:
partial_text += text + " "
history[-1][1] = partial_text
yield history, ""
return history, ""
def llm_chat_api(InputData: dict):
@@ -417,7 +333,7 @@ with gr.Blocks(title="Chatbot") as stablelm_chat:
)
model = gr.Dropdown(
label="Select Model",
value=model_choices[4],
value=model_choices[1],
choices=model_choices,
)
supported_devices = available_devices
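
For reference, the tokens/sec figure yielded by the streaming loop above is just the number of streamed chunks divided by elapsed wall-clock time. A standalone sketch of that pattern (function and variable names here are illustrative, not from the PR):

import time

def stream_with_rate(token_stream):
    # Count streamed chunks and report a running tokens/sec figure, the same
    # arithmetic used for the "tokens/sec" string yielded in the loop above.
    count, start = 0, time.time()
    for text in token_stream:
        count += 1
        rate = count / max(time.time() - start, 1e-9)
        yield text, f"{rate:.2f} tokens/sec"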

View File

@@ -130,14 +130,13 @@ fi
$PYTHON -m pip install --no-warn-conflicts -e . -f https://llvm.github.io/torch-mlir/package-index/ -f ${RUNTIME} -f https://download.pytorch.org/whl/nightly/cpu/
if [[ $(uname -s) = 'Linux' && ! -z "${BENCHMARK}" ]]; then
if [[ $(uname -s) = 'Linux' && ! -z "${IMPORTER}" ]]; then
T_VER=$($PYTHON -m pip show torch | grep Version)
TORCH_VERSION=${T_VER:9:17}
T_VER_MIN=${T_VER:14:12}
TV_VER=$($PYTHON -m pip show torchvision | grep Version)
TV_VERSION=${TV_VER:9:18}
$PYTHON -m pip uninstall -y torch torchvision
$PYTHON -m pip install -U --pre --no-warn-conflicts triton
$PYTHON -m pip install --no-deps https://download.pytorch.org/whl/nightly/cu118/torch-${TORCH_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl https://download.pytorch.org/whl/nightly/cu118/torchvision-${TV_VERSION}%2Bcu118-cp311-cp311-linux_x86_64.whl
TV_VER_MAJ=${TV_VER:9:6}
$PYTHON -m pip uninstall -y torchvision
$PYTHON -m pip install torchvision==${TV_VER_MAJ}${T_VER_MIN} --no-deps -f https://download.pytorch.org/whl/nightly/cpu/torchvision/
if [ $? -eq 0 ];then
echo "Successfully Installed torch + cu118."
else
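
The substring offsets above are easier to follow with a concrete `pip show` line. A minimal bash sketch of the slicing, assuming the usual "Version: X.Y.Z.devYYYYMMDD" nightly format (the version strings below are placeholders):

# Placeholder version strings for illustration; real values come from
# `pip show torch` / `pip show torchvision` as in the script above.
T_VER="Version: 2.1.0.dev20230905"
TV_VER="Version: 0.16.0.dev20230905"

T_VER_MIN=${T_VER:14:12}    # ".dev20230905" -- torch's nightly date suffix
TV_VER_MAJ=${TV_VER:9:6}    # "0.16.0"       -- torchvision major.minor.patch

# The script then pins torchvision to the same nightly date as the installed torch:
echo "torchvision==${TV_VER_MAJ}${T_VER_MIN}"   # torchvision==0.16.0.dev20230905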

View File

@@ -84,7 +84,7 @@ def get_iree_frontend_args(frontend):
elif frontend in ["tensorflow", "tf", "mhlo", "stablehlo"]:
return [
"--iree-llvmcpu-target-cpu-features=host",
"--iree-flow-demote-i64-to-i32",
"--iree-input-demote-i64-to-i32",
]
else:
# Frontend not found.
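
The "Fix StableHLO Compilation flag" commit swaps the old --iree-flow-demote-i64-to-i32 spelling for --iree-input-demote-i64-to-i32, matching the flag name expected by the IREE versions this frontend targets. A rough usage sketch of passing these frontend flags through; the input_type/extra_args usage is an assumption about the iree.compiler Python API, and "llvm-cpu" is a placeholder backend:

import iree.compiler as ireec

def compile_stablehlo(module_str: str) -> bytes:
    # Sketch only: forward the flags returned by get_iree_frontend_args("stablehlo")
    # to the IREE compiler as extra command-line arguments.
    return ireec.compile_str(
        module_str,
        input_type="stablehlo",
        target_backends=["llvm-cpu"],
        extra_args=[
            "--iree-llvmcpu-target-cpu-features=host",
            "--iree-input-demote-i64-to-i32",
        ],
    )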