Mirror of https://github.com/nod-ai/SHARK-Studio.git
Synced 2026-01-13 07:48:01 -05:00
Compare commits: debug...20230804.8 (2 commits)
| Author | SHA1 | Date |
|---|---|---|
| | cefcc45873 | |
| | e2b4de8c0a | |
@@ -6,8 +6,8 @@ from io import BytesIO
 from pathlib import Path
 from tqdm import tqdm
 from typing import List, Tuple
-
+import subprocess

 import torch
 import torch_mlir
 from torch_mlir import TensorPlaceholder
@@ -27,7 +27,7 @@ from apps.language_models.src.model_wrappers.vicuna_sharded_model import (
     VicunaNorm,
     VicunaNormCompiled,
 )
-from apps.language_models.src.model_wrappers.vicuna4 import(
+from apps.language_models.src.model_wrappers.vicuna4 import (
     LlamaModel,
     EightLayerLayerSV,
     EightLayerLayerFV,
@@ -478,9 +478,8 @@ class ShardedVicuna(VicunaBase):
         self.tokenizer = self.get_tokenizer()
         self.config = config_json
         self.weight_group_size = weight_group_size
-        self.compressed=compressed
+        self.compressed = compressed
         self.shark_model = self.compile(device=device)
-

     def get_tokenizer(self):
         kwargs = {}
@@ -678,18 +677,29 @@ class ShardedVicuna(VicunaBase):
             hidden_states, dynamic_axes=[1]
         )

-        module = torch_mlir.compile(
-            lmh,
-            (hidden_states,),
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-        bytecode_stream = BytesIO()
-        module.operation.write_bytecode(bytecode_stream)
-        bytecode = bytecode_stream.getvalue()
-        f_ = open(mlir_path, "wb")
-        f_.write(bytecode)
-        f_.close()
+        # module = torch_mlir.compile(
+        #     lmh,
+        #     (hidden_states,),
+        #     torch_mlir.OutputType.LINALG_ON_TENSORS,
+        #     use_tracing=False,
+        #     verbose=False,
+        # )
+        # bytecode_stream = BytesIO()
+        # module.operation.write_bytecode(bytecode_stream)
+        # bytecode = bytecode_stream.getvalue()
+        # f_ = open(mlir_path, "wb")
+        # f_.write(bytecode)
+        # f_.close()
+        # command = f"gsutil cp gs://shark_tank/elias/compressed_sv/lmhead.mlir lmhead.mlir"
+        # subprocess.check_call(command.split())
+        filepath = Path("lmhead.mlir")
+        download_public_file(
+            "gs://shark_tank/elias/compressed_sv/lmhead.mlir",
+            filepath.absolute(),
+            single_file=True,
+        )
+        f_ = open(f"lmhead.mlir", "rb")
+        bytecode = f_.read()
+        f_.close()

         shark_module = SharkInference(
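Taken together, this hunk swaps the in-process torch_mlir compile-and-serialize path for fetching a prebuilt lmhead.mlir from the public shark_tank bucket. A minimal sketch of the new fetch-then-load pattern, assuming `download_public_file` is imported from `shark.shark_downloader` (the import itself is not shown in this diff):

```python
from pathlib import Path

# Assumed import path; the diff only shows the call site.
from shark.shark_downloader import download_public_file

filepath = Path("lmhead.mlir")
# Pull the prebuilt MLIR artifact instead of shelling out to gsutil.
download_public_file(
    "gs://shark_tank/elias/compressed_sv/lmhead.mlir",
    filepath.absolute(),
    single_file=True,
)
bytecode = filepath.read_bytes()  # bytes later handed to SharkInference
```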
@@ -721,18 +731,23 @@ class ShardedVicuna(VicunaBase):
             hidden_states, dynamic_axes=[1]
         )

-        module = torch_mlir.compile(
-            fvn,
-            (hidden_states,),
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-        bytecode_stream = BytesIO()
-        module.operation.write_bytecode(bytecode_stream)
-        bytecode = bytecode_stream.getvalue()
-        f_ = open(mlir_path, "wb")
-        f_.write(bytecode)
-        f_.close()
+        # module = torch_mlir.compile(
+        #     fvn,
+        #     (hidden_states,),
+        #     torch_mlir.OutputType.LINALG_ON_TENSORS,
+        #     use_tracing=False,
+        #     verbose=False,
+        # )
+        # command = f"gsutil cp gs://shark_tank/elias/compressed_sv/norm.mlir norm.mlir"
+        # subprocess.check_call(command.split())
+        filepath = Path("norm.mlir")
+        download_public_file(
+            "gs://shark_tank/elias/compressed_sv/norm.mlir",
+            filepath.absolute(),
+            single_file=True,
+        )
+        f_ = open(f"norm.mlir", "rb")
+        bytecode = f_.read()
+        f_.close()

         shark_module = SharkInference(
@@ -763,18 +778,29 @@ class ShardedVicuna(VicunaBase):
         input_ids = torch_mlir.TensorPlaceholder.like(
             input_ids, dynamic_axes=[1]
         )
-        module = torch_mlir.compile(
-            fve,
-            (input_ids,),
-            torch_mlir.OutputType.LINALG_ON_TENSORS,
-            use_tracing=False,
-            verbose=False,
-        )
-        bytecode_stream = BytesIO()
-        module.operation.write_bytecode(bytecode_stream)
-        bytecode = bytecode_stream.getvalue()
-        f_ = open(mlir_path, "wb")
-        f_.write(bytecode)
-        f_.close()
+        # module = torch_mlir.compile(
+        #     fve,
+        #     (input_ids,),
+        #     torch_mlir.OutputType.LINALG_ON_TENSORS,
+        #     use_tracing=False,
+        #     verbose=False,
+        # )
+        # bytecode_stream = BytesIO()
+        # module.operation.write_bytecode(bytecode_stream)
+        # bytecode = bytecode_stream.getvalue()
+        # f_ = open(mlir_path, "wb")
+        # f_.write(bytecode)
+        # f_.close()
+        # command = f"gsutil cp gs://shark_tank/elias/compressed_sv/embedding.mlir embedding.mlir"
+        # subprocess.check_call(command.split())
+        filepath = Path("embedding.mlir")
+        download_public_file(
+            "gs://shark_tank/elias/compressed_sv/embedding.mlir",
+            filepath.absolute(),
+            single_file=True,
+        )
+        f_ = open(f"embedding.mlir", "rb")
+        bytecode = f_.read()
+        f_.close()

         shark_module = SharkInference(
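The commented-out block above is the path these three hunks disable: compile the wrapped submodule to linalg-on-tensors with a dynamic sequence axis and serialize it to MLIR bytecode. A self-contained sketch of that pattern, with a toy embedding module standing in for the repo's wrapper and the torch_mlir calls mirroring the disabled code (not the pipeline's actual classes):

```python
from io import BytesIO

import torch
import torch_mlir


class ToyEmbedding(torch.nn.Module):
    """Stand-in for the wrapped embedding module (`fve`) in the pipeline."""

    def __init__(self):
        super().__init__()
        self.embed = torch.nn.Embedding(32000, 64)

    def forward(self, input_ids):
        return self.embed(input_ids)


fve = ToyEmbedding()
input_ids = torch.zeros((1, 16), dtype=torch.int64)
# Mark axis 1 (sequence length) as dynamic, as the diff does.
placeholder = torch_mlir.TensorPlaceholder.like(input_ids, dynamic_axes=[1])

module = torch_mlir.compile(
    fve,
    (placeholder,),
    torch_mlir.OutputType.LINALG_ON_TENSORS,
    use_tracing=False,
    verbose=False,
)
# Serialize the compiled module to MLIR bytecode, as the disabled code did.
bytecode_stream = BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
```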
@@ -978,17 +1004,21 @@ class ShardedVicuna(VicunaBase):
                 f_.close()
                 mlirs.append(bytecode)
             else:
-                command = f"gsutil cp gs://shark_tank/elias/compressed_sv/{idx}_full.mlir {idx}_full.mlir"
-
-                subprocess.check_call(command.split())
-
+                # command = f"gsutil cp gs://shark_tank/elias/compressed_sv/{idx}_full.mlir {idx}_full.mlir"
+                # subprocess.check_call(command.split())
+                filepath = Path(f"{idx}_full.mlir")
+                download_public_file(
+                    f"gs://shark_tank/elias/compressed_sv/{idx}_full.mlir",
+                    filepath.absolute(),
+                    single_file=True,
+                )
                 f_ = open(f"{idx}_full.mlir", "rb")
                 bytecode = f_.read()
                 f_.close()
                 mlirs.append(bytecode)



             if vmfb_path.exists():
                 # print(f"Found layer {idx} vmfb")
                 device_idx = self.get_device_index(
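For the per-layer shards this hunk makes the same switch inside the download branch of the caching loop. A hedged sketch of the resulting flow, again assuming `download_public_file` comes from `shark.shark_downloader` and using two shards for brevity:

```python
from pathlib import Path

from shark.shark_downloader import download_public_file  # assumed import path

mlirs = []
for idx in range(2):  # the real loop walks every sharded layer
    filepath = Path(f"{idx}_full.mlir")
    if not filepath.exists():
        # Replaces the old `gsutil cp ...` + subprocess.check_call pair.
        download_public_file(
            f"gs://shark_tank/elias/compressed_sv/{idx}_full.mlir",
            filepath.absolute(),
            single_file=True,
        )
    mlirs.append(filepath.read_bytes())
```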
@@ -1125,7 +1155,6 @@ class ShardedVicuna(VicunaBase):
         )

         if not compressed:
-
             layers0 = [
                 FirstVicunaLayer(layer) for layer in vicuna_model.model.layers
             ]
@@ -1145,7 +1174,7 @@ class ShardedVicuna(VicunaBase):
             layers0 = [layers00, layers01, layers02, layers03]
             layers1 = [layers10, layers11, layers12, layers13]

-            _, modules = self.compile_to_vmfb_one_model(
+            _, modules = self.compile_to_vmfb_one_model4(
                 placeholder_input0,
                 layers0,
                 placeholder_input1,
@@ -1169,7 +1198,9 @@ class ShardedVicuna(VicunaBase):
         return sharded_model

     def compile(self, device="cpu"):
-        return self.get_sharded_model(device=device, compressed=self.compressed)
+        return self.get_sharded_model(
+            device=device, compressed=self.compressed
+        )

     def generate(self, prompt, cli=False):
         # TODO: refactor for cleaner integration
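Each of the loading blocks above ends by handing the loaded bytecode to SharkInference. A minimal sketch of that hand-off, with the import path, constructor arguments, and compile() call assumed from SHARK's inference API rather than shown in this diff:

```python
from pathlib import Path

from shark.shark_inference import SharkInference  # assumed import path

# Bytecode loaded the same way as in the hunks above.
bytecode = Path("lmhead.mlir").read_bytes()

# Argument names below are assumptions based on SHARK's public API.
shark_module = SharkInference(
    mlir_module=bytecode,
    device="cpu",
    mlir_dialect="linalg",
)
shark_module.compile()  # lowers the module to a vmfb and loads it for execution
```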