Add StableLM model (#1331)

2026-04-03 03:00:17 -04:00 · 2023-04-21 22:21:02 +05:30
parent 0b01bbe479
commit d2f7e03b7e
1 changed files with 207 additions and 0 deletions
--- a/apps/language_models/scripts/stablelm.py
+++ b/apps/language_models/scripts/stablelm.py
@@ -0,0 +1,207 @@
+import torch
+import shark
+from shark.shark_importer import import_with_fx
+from shark.shark_inference import SharkInference
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    StoppingCriteria,
+    StoppingCriteriaList,
+)
+import torch_mlir
+from apps.stable_diffusion.src.utils import (
+    base_models,
+    get_opt_flags,
+    get_vmfb_path_name,
+)
+from apps.stable_diffusion.src.models.model_wrappers import replace_shape_str
+import os
+from io import BytesIO
+
+# Tokenizer for the "stabilityai/stablelm-tuned-alpha-7b" checkpoint.
+# Loaded at module level, so this runs as soon as the script is executed.
+tokenizer = AutoTokenizer.from_pretrained(
+    "stabilityai/stablelm-tuned-alpha-7b"
+)
+
+
+class StopOnTokens(StoppingCriteria):
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
+    ) -> bool:
+        stop_ids = [50278, 50279, 50277, 1, 0]
+        for stop_id in stop_ids:
+            if input_ids[0][-1] == stop_id:
+                return True
+        return False
+
+
+# System preamble placed in front of every conversation sent to the model.
+system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
+- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
+- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
+- StableLM will refuse to participate in anything that could harm a human.
+"""
+
+# Single hard-coded sample turn: the system preamble, then one user message
+# wrapped in the <|USER|> / <|ASSISTANT|> markers.
+prompt = f"{system_prompt}<|USER|>What's your mood today?<|ASSISTANT|>"
+
+# Tokenize the sample prompt into PyTorch tensors; the "input_ids" and
+# "attention_mask" entries are the sample inputs used for tracing,
+# compilation and both inference runs below.
+inputs = tokenizer(prompt, return_tensors="pt")
+
+
+class SLM(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.model = AutoModelForCausalLM.from_pretrained(
+            "stabilityai/stablelm-tuned-alpha-7b"
+        )
+
+    def forward(self, input_ids, attention_mask):
+        return self.model(input_ids, attention_mask)[0]
+
+
+# Instantiate the wrapper; SLM.__init__ loads the pretrained model.
+slm_model = SLM()
+
+# Eager PyTorch result for the sample prompt. It is printed at the end of
+# the script next to the SHARK result.
+res_pytorch = slm_model(inputs["input_ids"], inputs["attention_mask"])
+
+import torch
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch._decomp import get_decompositions
+from typing import List
+
+# Trace the wrapper into an FX graph with make_fx, using the tokenized sample
+# prompt as example inputs and decomposing the aten ops listed below.
+# NOTE(review): several entries are backward or image ops (e.g.
+# embedding_dense_backward, upsample_bilinear2d.vec); presumably the list was
+# carried over from another script -- confirm which ones this model needs.
+fx_g = make_fx(
+    slm_model,
+    decomposition_table=get_decompositions(
+        [
+            torch.ops.aten.embedding_dense_backward,
+            torch.ops.aten.native_layer_norm_backward,
+            torch.ops.aten.slice_backward,
+            torch.ops.aten.select_backward,
+            torch.ops.aten.norm.ScalarOpt_dim,
+            torch.ops.aten.native_group_norm,
+            torch.ops.aten.upsample_bilinear2d.vec,
+            torch.ops.aten.split.Tensor,
+            torch.ops.aten.split_with_sizes,
+        ]
+    ),
+)(inputs["input_ids"], inputs["attention_mask"])
+
+
+def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
+    removed_indexes = []
+    for node in fx_g.graph.nodes:
+        if node.op == "output":
+            assert (
+                len(node.args) == 1
+            ), "Output node must have a single argument"
+            node_arg = node.args[0]
+            if isinstance(node_arg, (list, tuple)):
+                node_arg = list(node_arg)
+                node_args_len = len(node_arg)
+                for i in range(node_args_len):
+                    curr_index = node_args_len - (i + 1)
+                    if node_arg[curr_index] is None:
+                        removed_indexes.append(curr_index)
+                        node_arg.pop(curr_index)
+                node.args = (tuple(node_arg),)
+                break
+
+    if len(removed_indexes) > 0:
+        fx_g.graph.lint()
+        fx_g.graph.eliminate_dead_code()
+        fx_g.recompile()
+    removed_indexes.sort()
+    return removed_indexes
+
+
+def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
+    """
+    Replace tuple with tuple element in functions that return one-element tuples.
+    Returns true if an unwrapping took place, and false otherwise.
+    """
+    unwrapped_tuple = False
+    for node in fx_g.graph.nodes:
+        if node.op == "output":
+            assert (
+                len(node.args) == 1
+            ), "Output node must have a single argument"
+            node_arg = node.args[0]
+            if isinstance(node_arg, tuple):
+                if len(node_arg) == 1:
+                    node.args = (node_arg[0],)
+                    unwrapped_tuple = True
+                    break
+
+    if unwrapped_tuple:
+        fx_g.graph.lint()
+        fx_g.recompile()
+    return unwrapped_tuple
+
+
+def transform_fx(fx_g):
+    for node in fx_g.graph.nodes:
+        if node.op == "call_function":
+            if node.target in [
+                torch.ops.aten.empty,
+            ]:
+                # aten.empty should be filled with zeros.
+                if node.target in [torch.ops.aten.empty]:
+                    with fx_g.graph.inserting_after(node):
+                        new_node = fx_g.graph.call_function(
+                            torch.ops.aten.zero_,
+                            args=(node,),
+                        )
+                        node.append(new_node)
+                        node.replace_all_uses_with(new_node)
+                        new_node.args = (node,)
+
+    fx_g.graph.lint()
+
+
+# Post-process the traced graph: zero-fill aten.empty results, then clean up
+# the output node (drop None entries, unwrap a one-element tuple).
+transform_fx(fx_g)
+fx_g.recompile()
+# The two return values below are not used anywhere else in this script.
+removed_none_indexes = _remove_nones(fx_g)
+was_unwrapped = _unwrap_single_tuple_return(fx_g)
+
+# Replace the graph's code generator with a default CodeGen and regenerate
+# the module's code.
+# NOTE(review): presumably this drops the pytree-aware codegen left by
+# make_fx so the module can be scripted -- confirm.
+fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
+fx_g.recompile()
+
+
+def strip_overloads(gm):
+    """
+    Modifies the target of graph nodes in :attr:`gm` to strip overloads.
+    Args:
+        gm(fx.GraphModule): The input Fx graph module to be modified
+    """
+    for node in gm.graph.nodes:
+        if isinstance(node.target, torch._ops.OpOverload):
+            node.target = node.target.overloadpacket
+    gm.recompile()
+
+
+# Strip overload suffixes from the graph's op targets, then script the module
+# with TorchScript.
+strip_overloads(fx_g)
+
+ts_g = torch.jit.script(fx_g)
+
+# Lower the scripted module through torch-mlir, using the sample inputs as
+# example arguments and requesting linalg-on-tensors output.
+module = torch_mlir.compile(
+    ts_g,
+    [inputs["input_ids"], inputs["attention_mask"]],
+    torch_mlir.OutputType.LINALG_ON_TENSORS,
+    use_tracing=False,
+    verbose=False,
+)
+
+# Serialize the MLIR module to in-memory bytecode for SHARK.
+bytecode_stream = BytesIO()
+module.operation.write_bytecode(bytecode_stream)
+bytecode = bytecode_stream.getvalue()
+
+# The device is hard-coded to "cuda".
+# NOTE(review): the module was emitted as LINALG_ON_TENSORS but is passed
+# with mlir_dialect="tm_tensor" -- confirm this is the intended dialect tag.
+shark_module = SharkInference(
+    mlir_module=bytecode, device="cuda", mlir_dialect="tm_tensor"
+)
+shark_module.compile()
+
+# Run the compiled "forward" function on the same sample inputs.
+result_shark = shark_module(
+    "forward", [inputs["input_ids"], inputs["attention_mask"]]
+)
+
+# Both results are only printed; no numerical comparison is performed.
+print("Result PyTorch")
+print(res_pytorch)
+print("Result SHARK")
+print(result_shark)