Compare commits

11 Commits

Author SHA1 Message Date
Nelson Sharpe
6773278ec2 Fix checkpoint_path unexpected argument (#1832) 2023-09-24 14:17:52 -07:00
Abhishek Varma
9a0efffcca [Llama2] Fix wrong Vulkan device ID + Add Vulkan compile flags
-- This commit fixes the wrong Vulkan device being selected at
   runtime.
-- It also adds a couple of IREE compilation flags to target a
   specific Vulkan device.
-- It also changes the Vulkan device listing to be more in tune with
   the lowering control flow.

Signed-off-by: Abhishek Varma <abhishek@nod-labs.com>
2023-09-22 22:24:18 +05:30
gpetters94
61c6f153d9 Switch to keras-nightly to fix a Linux issue (#1835) 2023-09-21 12:33:45 -04:00
Phaneesh Barwaria
effd42e8f5 pin gradio to v3.44.3 2023-09-21 17:33:43 +05:30
Sungsoon Cho
b5fbb1a8a0 Rename the func arg save_json to avoid name collision. (#1837)
* Rename the func arg save_json to avoid name collision.

* black formatted.
2023-09-19 17:29:27 -05:00
Quinn Dawkins
ded74d09cd [vicuna.py] Keep past key values on device (#1836)
The past key values are only used within the models themselves and can
be kept on device. For Vulkan int4, this gives 44 tok/s (for the first
prompt) and settles at around 26 tok/s on a 7900 XTX.
2023-09-19 18:17:41 -04:00
Boian Petkantchin
79267931c1 Add argument --additional_compile_args (#1119)
This allows passing more arguments to the IREE compiler.
Example:
python my-app.py --additional_compile_args="--mlir-pretty-debuginfo --mlir-timing"

Co-authored-by: Boian Petkantchin <boian@nod-labs.com>
2023-09-19 11:26:03 -05:00
zjgarvey
9eceba69b7 local_tank_cache included into clear_all (#1833) 2023-09-18 00:27:23 -05:00
Ean Garvey
ca609afb6a Update README.md (#1830) 2023-09-14 10:33:57 -05:00
Gaurav Shukla
11bdce9790 [flags] Fix vulkan runtime flags as vma is dropped from iree (#1831) 2023-09-14 08:58:59 -05:00
Ean Garvey
684943a4a6 (SD) Fix tokenizers imports in pyinstaller builds. (#1828)
* Fix tokenizers metadata.

* (SD) Disable VAE lowering configs (rdna3) and add versioned tunings.

* Update sd_annotation.py

* (SD) Add cv2 to spec.

* Update stencil pipeline with the new img2img arg.
2023-09-12 12:23:48 -05:00
11 changed files with 150 additions and 42 deletions

View File

@@ -10,7 +10,7 @@ High Performance Machine Learning Distribution
<summary>Prerequisites - Drivers </summary>
#### Install your Windows hardware drivers
* [AMD RDNA Users] Download the latest driver [here](https://www.amd.com/en/support/kb/release-notes/rn-rad-win-23-2-1).
* [AMD RDNA Users] Download the latest driver (23.2.1 is the oldest supported) [here](https://www.amd.com/en/support).
* [macOS Users] Download and install the 1.3.216 Vulkan SDK from [here](https://sdk.lunarg.com/sdk/download/1.3.216.0/mac/vulkansdk-macos-1.3.216.0.dmg). Newer versions of the SDK will not work.
* [Nvidia Users] Download and install the latest CUDA / Vulkan drivers from [here](https://developer.nvidia.com/cuda-downloads)

View File

@@ -382,8 +382,7 @@ class VicunaBase(SharkLLMBase):
if sharded:
output = self.shark_model.forward(input_ids, is_first=is_first)
else:
output = self.shark_model("first_vicuna_forward", (input_ids,))
out_tensor = torch.tensor(output[1:])
output = self.shark_model("first_vicuna_forward", (input_ids,), send_to_host=False)
else:
token = params["token"]
@@ -402,7 +401,7 @@ class VicunaBase(SharkLLMBase):
token = token.to(torch.int64).reshape([1, 1])
second_input = (token,) + tuple(past_key_values)
output = self.shark_model(
"second_vicuna_forward", second_input
"second_vicuna_forward", second_input, send_to_host=False
)
if sharded:
@@ -410,8 +409,8 @@ class VicunaBase(SharkLLMBase):
_past_key_values = output["past_key_values"]
_token = int(torch.argmax(_logits[:, -1, :], dim=1)[0])
else:
_logits = torch.tensor(output[0])
_past_key_values = torch.tensor(output[1:])
_logits = torch.tensor(output[0].to_host())
_past_key_values = output[1:]
_token = torch.argmax(_logits[:, -1, :], dim=1)
_detok = self.tokenizer.decode(_token, skip_special_tokens=False)
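
The pattern above, as a standalone sketch: with send_to_host=False the runtime returns device-resident arrays, so only the logits take a host round-trip each step. The model handle and output layout here are assumptions for illustration, not the repo's exact API:

import torch

def decode_step(model, token, past_key_values):
    # Outputs stay on device; the KV cache never crosses to the host.
    output = model(
        "second_vicuna_forward",
        (token,) + tuple(past_key_values),
        send_to_host=False,
    )
    # Only the logits are copied back, to pick the next token.
    logits = torch.tensor(output[0].to_host())
    next_token = torch.argmax(logits[:, -1, :], dim=1)
    # The device arrays feed straight into the next step.
    return next_token, output[1:]
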
@@ -1230,6 +1229,7 @@ class UnshardedVicuna(VicunaBase):
download_vmfb=False,
cache_vicunas=False,
extra_args_cmd=[],
device_id=None,
debug=False,
) -> None:
super().__init__(
@@ -1248,6 +1248,7 @@ class UnshardedVicuna(VicunaBase):
print(f"[DEBUG] hf model name: {self.hf_model_path}")
self.max_sequence_length = 256
self.device = device
self.device_id = device_id
self.precision = precision
self.download_vmfb = download_vmfb
self.vicuna_vmfb_path = vicuna_vmfb_path
@@ -1410,7 +1411,7 @@ class UnshardedVicuna(VicunaBase):
single_file=True,
)
self.shark_model = get_vmfb_from_path(
self.vicuna_vmfb_path, self.device, "tm_tensor"
self.vicuna_vmfb_path, self.device, "tm_tensor", self.device_id
)
if self.shark_model is not None:
print(f"[DEBUG] vmfb found at {self.vicuna_vmfb_path.absolute()}")
@@ -1658,6 +1659,7 @@ class UnshardedVicuna(VicunaBase):
mlir_module=combined_module,
device=self.device,
mlir_dialect="tm_tensor",
device_idx=self.device_id
)
path = shark_module.save_module(
self.vicuna_vmfb_path.parent.absolute(),
@@ -1809,11 +1811,37 @@ if __name__ == "__main__":
args, unknown = parser.parse_known_args()
_extra_args = []
# vulkan target triple
if args.iree_vulkan_target_triple != "":
device_id = None
# Process vulkan target triple.
# TODO: This feature should just be in a common utils for other LLMs and in general
# any model run via SHARK for Vulkan backend.
vulkan_target_triple = args.iree_vulkan_target_triple
if vulkan_target_triple != "":
_extra_args.append(
f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
)
# Step 1. Fetch the device ID.
from shark.iree_utils.vulkan_utils import (
get_all_vulkan_devices,
get_vulkan_target_triple
)
vulkaninfo_list = get_all_vulkan_devices()
id = 0
for device in vulkaninfo_list:
target_triple = get_vulkan_target_triple(vulkaninfo_list[id])
if target_triple == vulkan_target_triple:
device_id = id
break
id += 1
assert device_id, f"no vulkan hardware for target-triple '{vulkan_target_triple}' exists"
# Step 2. Add a few flags targetting specific hardwares.
if "rdna" in vulkan_target_triple:
flags_to_add = [
"--iree-spirv-index-bits=64",
]
_extra_args = _extra_args + flags_to_add
vic = None
if not args.sharded:
@@ -1839,6 +1867,7 @@ if __name__ == "__main__":
download_vmfb=args.download_vmfb,
cache_vicunas=args.cache_vicunas,
extra_args_cmd=_extra_args,
device_id=device_id
)
else:
if args.config is not None:
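
The triple-to-index lookup added in this hunk, as a compact sketch. Note that the committed assert device_id would also fire for a valid index of 0; checking against None, as below, avoids that (the triple string is illustrative):

from shark.iree_utils.vulkan_utils import (
    get_all_vulkan_devices,
    get_vulkan_target_triple,
)

def find_device_id(wanted_triple):
    # Return the index of the first Vulkan device whose target
    # triple matches the one requested on the command line.
    for idx, device in enumerate(get_all_vulkan_devices()):
        if get_vulkan_target_triple(device) == wanted_triple:
            return idx
    return None

device_id = find_device_id("rdna3-7900-windows")
assert device_id is not None, "no vulkan hardware for that target-triple"
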

View File

@@ -8,7 +8,7 @@ from shark.shark_downloader import download_public_file
# expects a Path / str as arg
# returns None if path not found or SharkInference module
def get_vmfb_from_path(vmfb_path, device, mlir_dialect):
def get_vmfb_from_path(vmfb_path, device, mlir_dialect, device_id=None):
if not isinstance(vmfb_path, Path):
vmfb_path = Path(vmfb_path)
@@ -20,7 +20,7 @@ def get_vmfb_from_path(vmfb_path, device, mlir_dialect):
print("Loading vmfb from: ", vmfb_path)
print("Device from get_vmfb_from_path - ", device)
shark_module = SharkInference(
None, device=device, mlir_dialect=mlir_dialect
None, device=device, mlir_dialect=mlir_dialect, device_idx=device_id
)
shark_module.load_module(vmfb_path)
print("Successfully loaded vmfb")
@@ -28,7 +28,13 @@ def get_vmfb_from_path(vmfb_path, device, mlir_dialect):
def get_vmfb_from_config(
shark_container, model, precision, device, vmfb_path, padding=None
shark_container,
model,
precision,
device,
vmfb_path,
padding=None,
device_id=None,
):
vmfb_url = (
f"gs://shark_tank/{shark_container}/{model}_{precision}_{device}"
@@ -37,4 +43,6 @@ def get_vmfb_from_config(
vmfb_url = vmfb_url + f"_{padding}"
vmfb_url = vmfb_url + ".vmfb"
download_public_file(vmfb_url, vmfb_path.absolute(), single_file=True)
return get_vmfb_from_path(vmfb_path, device, "tm_tensor")
return get_vmfb_from_path(
vmfb_path, device, "tm_tensor", device_id=device_id
)
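
Example call with the new parameter (path and index illustrative); defaulting device_id to None keeps existing call sites working unchanged:

shark_module = get_vmfb_from_path(
    "first_vicuna.vmfb", "vulkan", "tm_tensor", device_id=1
)
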

View File

@@ -470,7 +470,21 @@ def get_available_devices():
set_iree_runtime_flags()
available_devices = []
vulkan_devices = get_devices_by_name("vulkan")
from shark.iree_utils._common import run_cmd
from shark.iree_utils.vulkan_utils import (
get_all_vulkan_devices,
)
vulkaninfo_list = get_all_vulkan_devices()
vulkan_devices = []
id = 0
for device in vulkaninfo_list:
vulkan_devices.append(
f"{device.split('=')[1].strip()} => vulkan://{id}"
)
id += 1
if id != 0:
print(f"vulkan devices are available.")
available_devices.extend(vulkan_devices)
metal_devices = get_devices_by_name("metal")
available_devices.extend(metal_devices)
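
What the listing loop above produces, on a sample vulkaninfo entry (device name illustrative). This is the same "name => vulkan://N" string that chat() splits back apart below:

vulkaninfo_list = ["deviceName = AMD Radeon RX 7900 XTX"]
vulkan_devices = [
    f"{entry.split('=')[1].strip()} => vulkan://{idx}"
    for idx, entry in enumerate(vulkaninfo_list)
]
# ['AMD Radeon RX 7900 XTX => vulkan://0']
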
@@ -577,7 +591,7 @@ def preprocessCKPT(custom_weights, is_inpaint=False):
)
num_in_channels = 9 if is_inpaint else 4
pipe = download_from_original_stable_diffusion_ckpt(
checkpoint_path=custom_weights,
checkpoint_path_or_dict=custom_weights,
extract_ema=extract_ema,
from_safetensors=from_safetensors,
num_in_channels=num_in_channels,
@@ -827,6 +841,8 @@ def clear_all():
elif os.name == "unix":
shutil.rmtree(os.path.join(home, ".cache/AMD/VkCache"))
shutil.rmtree(os.path.join(home, ".local/shark_tank"))
if args.local_tank_cache != "":
shutil.rmtree(args.local_tank_cache)
def get_generated_imgs_path() -> Path:

View File

@@ -143,6 +143,7 @@ def chat(
global model_vmfb_key
global vicuna_model
device_id = None
model_name, model_path = list(map(str.strip, model.split("=>")))
if "cuda" in device:
device = "cuda"
@@ -151,6 +152,7 @@ def chat(
elif "task" in device:
device = "cpu-task"
elif "vulkan" in device:
device_id = int(device.split("://")[1])
device = "vulkan"
elif "rocm" in device:
device = "rocm"
@@ -169,10 +171,45 @@ def chat(
# get iree flags that need to be overridden, from commandline args
_extra_args = []
# vulkan target triple
if args.iree_vulkan_target_triple != "":
vulkan_target_triple = args.iree_vulkan_target_triple
from shark.iree_utils.vulkan_utils import (
get_all_vulkan_devices,
get_vulkan_target_triple,
)
if device == "vulkan":
vulkaninfo_list = get_all_vulkan_devices()
if vulkan_target_triple == "":
# We already have the device_id extracted via WebUI, so we directly use
# that to find the target triple.
vulkan_target_triple = get_vulkan_target_triple(
vulkaninfo_list[device_id]
)
_extra_args.append(
f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
f"-iree-vulkan-target-triple={vulkan_target_triple}"
)
if "rdna" in vulkan_target_triple:
flags_to_add = [
"--iree-spirv-index-bits=64",
]
_extra_args = _extra_args + flags_to_add
if device_id is None:
id = 0
for device in vulkaninfo_list:
target_triple = get_vulkan_target_triple(
vulkaninfo_list[id]
)
if target_triple == vulkan_target_triple:
device_id = id
break
id += 1
assert (
device_id
), f"no vulkan hardware for target-triple '{vulkan_target_triple}' exists"
print(f"Will use target triple : {vulkan_target_triple}")
if model_name == "vicuna4":
vicuna_model = ShardedVicuna(
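
The WebUI path runs the lookup in the opposite direction from vicuna.py: the selected device string already carries the index, and the target triple is recovered from it. A condensed sketch (device string illustrative):

device = "vulkan://0"
device_id = int(device.split("://")[1])  # -> 0
# With the index known, the triple comes from the matching device entry:
# get_vulkan_target_triple(get_all_vulkan_devices()[device_id])
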
@@ -196,6 +233,7 @@ def chat(
download_vmfb=download_vmfb,
load_mlir_from_shark_tank=True,
extra_args_cmd=_extra_args,
device_id=device_id,
)
prompt = create_prompt(model_name, history)
@@ -254,6 +292,7 @@ def llm_chat_api(InputData: dict):
UnshardedVicuna,
)
device_id = None
if vicuna_model == 0:
if "cuda" in device:
device = "cuda"
@@ -262,6 +301,7 @@ def llm_chat_api(InputData: dict):
elif "task" in device:
device = "cpu-task"
elif "vulkan" in device:
device_id = int(device.split("://")[1])
device = "vulkan"
else:
print("unrecognized device")
@@ -274,6 +314,7 @@ def llm_chat_api(InputData: dict):
max_num_tokens=max_toks,
download_vmfb=True,
load_mlir_from_shark_tank=True,
device_id=device_id,
)
# TODO: add role dict for different models

View File

@@ -16,7 +16,7 @@ iree-tools-tf
# TensorFlow and JAX.
gin-config
tf-nightly
keras
keras-nightly
#tf-models-nightly
#tensorflow-text-nightly
transformers

View File

@@ -25,7 +25,7 @@ diffusers
accelerate
scipy
ftfy
gradio
gradio==3.44.3
altair
omegaconf
# 0.3.2 doesn't have binaries for arm64

View File

@@ -300,6 +300,7 @@ def compile_module_to_flatbuffer(
args += get_iree_common_args(debug=debug)
args += get_model_specific_args()
args += extra_args
args += shark_args.additional_compile_args
if frontend in ["tensorflow", "tf"]:
input_type = "auto"

View File

@@ -24,10 +24,16 @@ from shark.parser import shark_args
@functools.cache
def get_vulkan_device_name(device_num=0):
def get_all_vulkan_devices():
vulkaninfo_dump, _ = run_cmd("vulkaninfo")
vulkaninfo_dump = vulkaninfo_dump.split(linesep)
vulkaninfo_list = [s.strip() for s in vulkaninfo_dump if "deviceName" in s]
return vulkaninfo_list
@functools.cache
def get_vulkan_device_name(device_num=0):
vulkaninfo_list = get_all_vulkan_devices()
if len(vulkaninfo_list) == 0:
raise ValueError("No device name found in VulkanInfo!")
if len(vulkaninfo_list) > 1:
@@ -178,9 +184,7 @@ def get_iree_vulkan_args(device_num=0, extra_args=[]):
@functools.cache
def get_iree_vulkan_runtime_flags():
vulkan_runtime_flags = [
f"--vulkan_large_heap_block_size={shark_args.vulkan_large_heap_block_size}",
f"--vulkan_validation_layers={'true' if shark_args.vulkan_validation_layers else 'false'}",
f"--vulkan_vma_allocator={'true' if shark_args.vulkan_vma_allocator else 'false'}",
]
return vulkan_runtime_flags
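
What get_all_vulkan_devices() extracts from a vulkaninfo dump, shown on sample output (the dump lines are illustrative):

from os import linesep

vulkaninfo_dump = linesep.join([
    "VkPhysicalDeviceProperties:",
    "    deviceName = AMD Radeon RX 7900 XTX",
    "    deviceName = NVIDIA GeForce RTX 4090",
])
devices = [s.strip() for s in vulkaninfo_dump.split(linesep) if "deviceName" in s]
# ['deviceName = AMD Radeon RX 7900 XTX', 'deviceName = NVIDIA GeForce RTX 4090']
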

View File

@@ -14,8 +14,21 @@
import argparse
import os
import shlex
import subprocess
class SplitStrToListAction(argparse.Action):
def __init__(self, option_strings, dest, *args, **kwargs):
super(SplitStrToListAction, self).__init__(
option_strings=option_strings, dest=dest, *args, **kwargs
)
def __call__(self, parser, namespace, values, option_string=None):
del parser, option_string
setattr(namespace, self.dest, shlex.split(values[0]))
parser = argparse.ArgumentParser(description="SHARK runner.")
parser.add_argument(
@@ -24,6 +37,13 @@ parser.add_argument(
default="cpu",
help="Device on which shark_runner runs. options are cpu, cuda, and vulkan",
)
parser.add_argument(
"--additional_compile_args",
default=list(),
nargs=1,
action=SplitStrToListAction,
help="Additional arguments to pass to the compiler. These are appended as the last arguments.",
)
parser.add_argument(
"--enable_tf32",
type=bool,
@@ -133,13 +153,6 @@ parser.add_argument(
help="Profiles vulkan device and collects the .rdc info.",
)
parser.add_argument(
"--vulkan_large_heap_block_size",
default="2073741824",
help="Flag for setting VMA preferredLargeHeapBlockSize for "
"vulkan device, default is 4G.",
)
parser.add_argument(
"--vulkan_validation_layers",
default=False,
@@ -147,11 +160,4 @@ parser.add_argument(
help="Flag for disabling vulkan validation layers when benchmarking.",
)
parser.add_argument(
"--vulkan_vma_allocator",
default=False,
action=argparse.BooleanOptionalAction,
help="Flag for enabling / disabling Vulkan VMA Allocator.",
)
shark_args, unknown = parser.parse_known_args()
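
The custom action is needed because nargs=1 hands the quoted value over as a single-element list; shlex.split then breaks it into individual compiler flags while respecting quoting:

import shlex

shlex.split("--mlir-pretty-debuginfo --mlir-timing")
# -> ['--mlir-pretty-debuginfo', '--mlir-timing']
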

View File

@@ -168,7 +168,7 @@ def save_json(data, filename):
def collect_huggingface_logits(
model_name: str, max_seq_len: int, save_json: bool
model_name: str, max_seq_len: int, to_save_json: bool
) -> Tuple[float, float]:
# Load
t0 = time.time()
@@ -194,11 +194,11 @@ def collect_huggingface_logits(
for idx, tokens in enumerate(tokenized_prompts):
print("prompt: {}".format(PROMPTS[idx]))
logits = run_huggingface_model(model_wrapper, tokens)
if save_json:
if to_save_json:
results.append([PROMPTS[idx], logits[0].tolist()])
run_time = time.time() - t0
print("--- Took {} seconds to run Huggingface.".format(run_time))
if save_json:
if to_save_json:
save_json(results, "/tmp/huggingface.json")
run_memory_info = get_memory_info()
return {
@@ -215,7 +215,10 @@ def collect_huggingface_logits(
def collect_shark_logits(
model_name: str, max_seq_len: int, recompile_shark: bool, save_json: bool
model_name: str,
max_seq_len: int,
recompile_shark: bool,
to_save_json: bool,
) -> Tuple[float, float]:
# Load
t0 = time.time()
@@ -246,11 +249,11 @@ def collect_shark_logits(
print("prompt: {}".format(PROMPTS[idx]))
logits = run_shark_model(model_wrapper, tokens)
lst = [e.tolist() for e in logits]
if save_json:
if to_save_json:
results.append([PROMPTS[idx], lst])
run_time = time.time() - t0
print("--- Took {} seconds to run Shark.".format(run_time))
if save_json:
if to_save_json:
save_json(results, "/tmp/shark.json")
platform_postfix = "-compile" if recompile_shark else "-precompiled"
run_memory_info = get_memory_info()
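
The rename matters because a parameter named save_json shadows the module-level save_json function throughout the body, so the call itself would fail. A minimal reproduction:

def save_json(data, filename):
    print(f"writing {filename}")

def collect(save_json: bool):
    if save_json:
        # The parameter shadows the function:
        # TypeError: 'bool' object is not callable
        save_json({}, "/tmp/out.json")

collect(True)
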