Compare commits

...

2 Commits

Author SHA1 Message Date
Elias Joseph
cefcc45873 fixed install method 2023-08-04 20:20:53 -04:00
Elias Joseph
e2b4de8c0a download all mlirs 2023-08-04 19:10:01 -04:00

View File

@@ -6,8 +6,8 @@ from io import BytesIO
from pathlib import Path
from tqdm import tqdm
from typing import List, Tuple
import subprocess
import torch
import torch_mlir
from torch_mlir import TensorPlaceholder
@@ -27,7 +27,7 @@ from apps.language_models.src.model_wrappers.vicuna_sharded_model import (
VicunaNorm,
VicunaNormCompiled,
)
from apps.language_models.src.model_wrappers.vicuna4 import(
from apps.language_models.src.model_wrappers.vicuna4 import (
LlamaModel,
EightLayerLayerSV,
EightLayerLayerFV,
@@ -478,9 +478,8 @@ class ShardedVicuna(VicunaBase):
self.tokenizer = self.get_tokenizer()
self.config = config_json
self.weight_group_size = weight_group_size
self.compressed=compressed
self.compressed = compressed
self.shark_model = self.compile(device=device)
def get_tokenizer(self):
kwargs = {}
@@ -678,18 +677,29 @@ class ShardedVicuna(VicunaBase):
hidden_states, dynamic_axes=[1]
)
module = torch_mlir.compile(
lmh,
(hidden_states,),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
# module = torch_mlir.compile(
# lmh,
# (hidden_states,),
# torch_mlir.OutputType.LINALG_ON_TENSORS,
# use_tracing=False,
# verbose=False,
# )
# bytecode_stream = BytesIO()
# module.operation.write_bytecode(bytecode_stream)
# bytecode = bytecode_stream.getvalue()
# f_ = open(mlir_path, "wb")
# f_.write(bytecode)
# f_.close()
# command = f"gsutil cp gs://shark_tank/elias/compressed_sv/lmhead.mlir lmhead.mlir"
# subprocess.check_call(command.split())
filepath = Path("lmhead.mlir")
download_public_file(
"gs://shark_tank/elias/compressed_sv/lmhead.mlir",
filepath.absolute(),
single_file=True,
)
bytecode_stream = BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(mlir_path, "wb")
f_.write(bytecode)
f_ = open(f"lmhead.mlir", "rb")
bytecode = f_.read()
f_.close()
shark_module = SharkInference(
@@ -721,18 +731,23 @@ class ShardedVicuna(VicunaBase):
hidden_states, dynamic_axes=[1]
)
module = torch_mlir.compile(
fvn,
(hidden_states,),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
# module = torch_mlir.compile(
# fvn,
# (hidden_states,),
# torch_mlir.OutputType.LINALG_ON_TENSORS,
# use_tracing=False,
# verbose=False,
# )
# command = f"gsutil cp gs://shark_tank/elias/compressed_sv/norm.mlir norm.mlir"
# subprocess.check_call(command.split())
filepath = Path("norm.mlir")
download_public_file(
"gs://shark_tank/elias/compressed_sv/norm.mlir",
filepath.absolute(),
single_file=True,
)
bytecode_stream = BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(mlir_path, "wb")
f_.write(bytecode)
f_ = open(f"norm.mlir", "rb")
bytecode = f_.read()
f_.close()
shark_module = SharkInference(
@@ -763,18 +778,29 @@ class ShardedVicuna(VicunaBase):
input_ids = torch_mlir.TensorPlaceholder.like(
input_ids, dynamic_axes=[1]
)
module = torch_mlir.compile(
fve,
(input_ids,),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
# module = torch_mlir.compile(
# fve,
# (input_ids,),
# torch_mlir.OutputType.LINALG_ON_TENSORS,
# use_tracing=False,
# verbose=False,
# )
# bytecode_stream = BytesIO()
# module.operation.write_bytecode(bytecode_stream)
# bytecode = bytecode_stream.getvalue()
# f_ = open(mlir_path, "wb")
# f_.write(bytecode)
# f_.close()
# command = f"gsutil cp gs://shark_tank/elias/compressed_sv/embedding.mlir embedding.mlir"
# subprocess.check_call(command.split())
filepath = Path("embedding.mlir")
download_public_file(
"gs://shark_tank/elias/compressed_sv/embedding.mlir",
filepath.absolute(),
single_file=True,
)
bytecode_stream = BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(mlir_path, "wb")
f_.write(bytecode)
f_ = open(f"embedding.mlir", "rb")
bytecode = f_.read()
f_.close()
shark_module = SharkInference(
@@ -978,17 +1004,21 @@ class ShardedVicuna(VicunaBase):
f_.close()
mlirs.append(bytecode)
else:
command = f"gsutil cp gs://shark_tank/elias/compressed_sv/{idx}_full.mlir {idx}_full.mlir"
# command = f"gsutil cp gs://shark_tank/elias/compressed_sv/{idx}_full.mlir {idx}_full.mlir"
subprocess.check_call(command.split())
# subprocess.check_call(command.split())
filepath = Path(f"{idx}_full.mlir")
download_public_file(
f"gs://shark_tank/elias/compressed_sv/{idx}_full.mlir",
filepath.absolute(),
single_file=True,
)
f_ = open(f"{idx}_full.mlir", "rb")
bytecode = f_.read()
f_.close()
mlirs.append(bytecode)
if vmfb_path.exists():
# print(f"Found layer {idx} vmfb")
device_idx = self.get_device_index(
@@ -1125,7 +1155,6 @@ class ShardedVicuna(VicunaBase):
)
if not compressed:
layers0 = [
FirstVicunaLayer(layer) for layer in vicuna_model.model.layers
]
@@ -1145,7 +1174,7 @@ class ShardedVicuna(VicunaBase):
layers0 = [layers00, layers01, layers02, layers03]
layers1 = [layers10, layers11, layers12, layers13]
_, modules = self.compile_to_vmfb_one_model(
_, modules = self.compile_to_vmfb_one_model4(
placeholder_input0,
layers0,
placeholder_input1,
@@ -1169,7 +1198,9 @@ class ShardedVicuna(VicunaBase):
return sharded_model
def compile(self, device="cpu"):
    """Build the sharded model for ``device``.

    Thin wrapper around ``self.get_sharded_model`` that forwards the
    requested device together with the instance's ``compressed`` setting.

    Args:
        device: Target device identifier passed through unchanged
            (defaults to ``"cpu"``).

    Returns:
        Whatever ``self.get_sharded_model`` returns for these arguments.
    """
    # A single return: the earlier duplicate one-line form of this same
    # call made the wrapped form below unreachable.
    return self.get_sharded_model(
        device=device, compressed=self.compressed
    )
def generate(self, prompt, cli=False):
# TODO: refactor for cleaner integration