Mirror of https://github.com/nod-ai/SHARK-Studio.git
Synced 2026-01-13 07:48:01 -05:00
Compare commits: debug...20230804.8 (2 commits)
| Author | SHA1 | Date |
|---|---|---|
| | cefcc45873 | |
| | e2b4de8c0a | |
@@ -6,8 +6,8 @@ from io import BytesIO
 from pathlib import Path
 from tqdm import tqdm
 from typing import List, Tuple
-
+import subprocess

 import torch
 import torch_mlir
 from torch_mlir import TensorPlaceholder
@@ -27,7 +27,7 @@ from apps.language_models.src.model_wrappers.vicuna_sharded_model import (
     VicunaNorm,
     VicunaNormCompiled,
 )
-from apps.language_models.src.model_wrappers.vicuna4 import(
+from apps.language_models.src.model_wrappers.vicuna4 import (
     LlamaModel,
     EightLayerLayerSV,
     EightLayerLayerFV,
@@ -478,9 +478,8 @@ class ShardedVicuna(VicunaBase):
         self.tokenizer = self.get_tokenizer()
         self.config = config_json
         self.weight_group_size = weight_group_size
-        self.compressed=compressed
+        self.compressed = compressed
         self.shark_model = self.compile(device=device)
-

     def get_tokenizer(self):
         kwargs = {}
@@ -678,18 +677,29 @@ class ShardedVicuna(VicunaBase):
             hidden_states, dynamic_axes=[1]
         )

-        module = torch_mlir.compile(
-            lmh,
-            (hidden_states,),
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-        bytecode_stream = BytesIO()
-        module.operation.write_bytecode(bytecode_stream)
-        bytecode = bytecode_stream.getvalue()
-        f_ = open(mlir_path, "wb")
-        f_.write(bytecode)
-        f_.close()
+        # module = torch_mlir.compile(
+        #     lmh,
+        #     (hidden_states,),
+        #     torch_mlir.OutputType.LINALG_ON_TENSORS,
+        #     use_tracing=False,
+        #     verbose=False,
+        # )
+        # bytecode_stream = BytesIO()
+        # module.operation.write_bytecode(bytecode_stream)
+        # bytecode = bytecode_stream.getvalue()
+        # f_ = open(mlir_path, "wb")
+        # f_.write(bytecode)
+        # f_.close()
+        # command = f"gsutil cp gs://shark_tank/elias/compressed_sv/lmhead.mlir lmhead.mlir"
+        # subprocess.check_call(command.split())
+        filepath = Path("lmhead.mlir")
+        download_public_file(
+            "gs://shark_tank/elias/compressed_sv/lmhead.mlir",
+            filepath.absolute(),
+            single_file=True,
+        )
+        f_ = open(f"lmhead.mlir", "rb")
+        bytecode = f_.read()
+        f_.close()

         shark_module = SharkInference(
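Taken together, this hunk swaps the in-process torch_mlir compile-and-serialize path for fetching a prebuilt lmhead.mlir from the public shark_tank bucket. A minimal sketch of the new fetch-then-load pattern, assuming `download_public_file` is imported from `shark.shark_downloader` (the import itself is not shown in this diff):

```python
from pathlib import Path

# Assumed import path; the diff only shows the call site.
from shark.shark_downloader import download_public_file

filepath = Path("lmhead.mlir")
# Pull the prebuilt MLIR artifact instead of shelling out to gsutil.
download_public_file(
    "gs://shark_tank/elias/compressed_sv/lmhead.mlir",
    filepath.absolute(),
    single_file=True,
)
bytecode = filepath.read_bytes()  # bytes later handed to SharkInference
```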
@@ -721,18 +731,23 @@ class ShardedVicuna(VicunaBase):
             hidden_states, dynamic_axes=[1]
         )

-        module = torch_mlir.compile(
-            fvn,
-            (hidden_states,),
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-        bytecode_stream = BytesIO()
-        module.operation.write_bytecode(bytecode_stream)
-        bytecode = bytecode_stream.getvalue()
-        f_ = open(mlir_path, "wb")
-        f_.write(bytecode)
-        f_.close()
+        # module = torch_mlir.compile(
+        #     fvn,
+        #     (hidden_states,),
+        #     torch_mlir.OutputType.LINALG_ON_TENSORS,
+        #     use_tracing=False,
+        #     verbose=False,
+        # )
+        # command = f"gsutil cp gs://shark_tank/elias/compressed_sv/norm.mlir norm.mlir"
+        # subprocess.check_call(command.split())
+        filepath = Path("norm.mlir")
+        download_public_file(
+            "gs://shark_tank/elias/compressed_sv/norm.mlir",
+            filepath.absolute(),
+            single_file=True,
+        )
+        f_ = open(f"norm.mlir", "rb")
+        bytecode = f_.read()
+        f_.close()

         shark_module = SharkInference(
@@ -763,18 +778,29 @@ class ShardedVicuna(VicunaBase):
         input_ids = torch_mlir.TensorPlaceholder.like(
             input_ids, dynamic_axes=[1]
         )
-        module = torch_mlir.compile(
-            fve,
-            (input_ids,),
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-        bytecode_stream = BytesIO()
-        module.operation.write_bytecode(bytecode_stream)
-        bytecode = bytecode_stream.getvalue()
-        f_ = open(mlir_path, "wb")
-        f_.write(bytecode)
-        f_.close()
+        # module = torch_mlir.compile(
+        #     fve,
+        #     (input_ids,),
+        #     torch_mlir.OutputType.LINALG_ON_TENSORS,
+        #     use_tracing=False,
+        #     verbose=False,
+        # )
+        # bytecode_stream = BytesIO()
+        # module.operation.write_bytecode(bytecode_stream)
+        # bytecode = bytecode_stream.getvalue()
+        # f_ = open(mlir_path, "wb")
+        # f_.write(bytecode)
+        # f_.close()
+        # command = f"gsutil cp gs://shark_tank/elias/compressed_sv/embedding.mlir embedding.mlir"
+        # subprocess.check_call(command.split())
+        filepath = Path("embedding.mlir")
+        download_public_file(
+            "gs://shark_tank/elias/compressed_sv/embedding.mlir",
+            filepath.absolute(),
+            single_file=True,
+        )
+        f_ = open(f"embedding.mlir", "rb")
+        bytecode = f_.read()
+        f_.close()

         shark_module = SharkInference(
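The commented-out block above is the path these three hunks disable: compile the wrapped submodule to linalg-on-tensors with a dynamic sequence axis and serialize it to MLIR bytecode. A self-contained sketch of that pattern, with a toy embedding module standing in for the repo's wrapper and the torch_mlir calls mirroring the disabled code (not the pipeline's actual classes):

```python
from io import BytesIO

import torch
import torch_mlir


class ToyEmbedding(torch.nn.Module):
    """Stand-in for the wrapped embedding module (`fve`) in the pipeline."""

    def __init__(self):
        super().__init__()
        self.embed = torch.nn.Embedding(32000, 64)

    def forward(self, input_ids):
        return self.embed(input_ids)


fve = ToyEmbedding()
input_ids = torch.zeros((1, 16), dtype=torch.int64)
# Mark axis 1 (sequence length) as dynamic, as the diff does.
placeholder = torch_mlir.TensorPlaceholder.like(input_ids, dynamic_axes=[1])

module = torch_mlir.compile(
    fve,
    (placeholder,),
    torch_mlir.OutputType.LINALG_ON_TENSORS,
    use_tracing=False,
    verbose=False,
)
# Serialize the compiled module to MLIR bytecode, as the disabled code did.
bytecode_stream = BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
```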
@@ -978,17 +1004,21 @@ class ShardedVicuna(VicunaBase):
                 f_.close()
                 mlirs.append(bytecode)
             else:
-                command = f"gsutil cp gs://shark_tank/elias/compressed_sv/{idx}_full.mlir {idx}_full.mlir"
-
-                subprocess.check_call(command.split())
-
+                # command = f"gsutil cp gs://shark_tank/elias/compressed_sv/{idx}_full.mlir {idx}_full.mlir"
+                # subprocess.check_call(command.split())
+                filepath = Path(f"{idx}_full.mlir")
+                download_public_file(
+                    f"gs://shark_tank/elias/compressed_sv/{idx}_full.mlir",
+                    filepath.absolute(),
+                    single_file=True,
+                )
                 f_ = open(f"{idx}_full.mlir", "rb")
                 bytecode = f_.read()
                 f_.close()
                 mlirs.append(bytecode)



             if vmfb_path.exists():
                 # print(f"Found layer {idx} vmfb")
                 device_idx = self.get_device_index(
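For the per-layer shards this hunk makes the same switch inside the download branch of the caching loop. A hedged sketch of the resulting flow, again assuming `download_public_file` comes from `shark.shark_downloader` and using two shards for brevity:

```python
from pathlib import Path

from shark.shark_downloader import download_public_file  # assumed import path

mlirs = []
for idx in range(2):  # the real loop walks every sharded layer
    filepath = Path(f"{idx}_full.mlir")
    if not filepath.exists():
        # Replaces the old `gsutil cp ...` + subprocess.check_call pair.
        download_public_file(
            f"gs://shark_tank/elias/compressed_sv/{idx}_full.mlir",
            filepath.absolute(),
            single_file=True,
        )
    mlirs.append(filepath.read_bytes())
```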
@@ -1125,7 +1155,6 @@ class ShardedVicuna(VicunaBase):
         )

         if not compressed:
-
             layers0 = [
                 FirstVicunaLayer(layer) for layer in vicuna_model.model.layers
             ]
@@ -1145,7 +1174,7 @@ class ShardedVicuna(VicunaBase):
             layers0 = [layers00, layers01, layers02, layers03]
             layers1 = [layers10, layers11, layers12, layers13]

-            _, modules = self.compile_to_vmfb_one_model(
+            _, modules = self.compile_to_vmfb_one_model4(
                 placeholder_input0,
                 layers0,
                 placeholder_input1,
@@ -1169,7 +1198,9 @@ class ShardedVicuna(VicunaBase):
         return sharded_model

     def compile(self, device="cpu"):
-        return self.get_sharded_model(device=device, compressed=self.compressed)
+        return self.get_sharded_model(
+            device=device, compressed=self.compressed
+        )

     def generate(self, prompt, cli=False):
         # TODO: refactor for cleaner integration
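Each of the loading blocks above ends by handing the loaded bytecode to SharkInference. A minimal sketch of that hand-off, with the import path, constructor arguments, and compile() call assumed from SHARK's inference API rather than shown in this diff:

```python
from pathlib import Path

from shark.shark_inference import SharkInference  # assumed import path

# Bytecode loaded the same way as in the hunks above.
bytecode = Path("lmhead.mlir").read_bytes()

# Argument names below are assumptions based on SHARK's public API.
shark_module = SharkInference(
    mlir_module=bytecode,
    device="cpu",
    mlir_dialect="linalg",
)
shark_module.compile()  # lowers the module to a vmfb and loads it for execution
```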