mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-10 07:28:15 -05:00)
fix llm app with rangeify (#12334)
* fix llm app with rangeify
* add gpt2 contiguous also
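Both model changes in the diff below apply the same pattern: the transformer block's return value is marked .contiguous(), so the block output is realized into its own buffer instead of being fused into whatever consumes it next. As a rough, hypothetical sketch of that pattern (not code from this commit; the toy block, weights, and shapes are invented, and only tinygrad's public Tensor API is assumed):

# hypothetical sketch: a .contiguous() cut point at a block boundary (tinygrad Tensor API)
from tinygrad import Tensor

w1, w2 = Tensor.randn(64, 256), Tensor.randn(256, 64)

def block(x: Tensor) -> Tensor:
  h = x + (x @ w1).relu() @ w2   # toy residual block, stands in for a TransformerBlock
  return h.contiguous()          # the block output becomes its own realized buffer

x = Tensor.randn(4, 64)
out = block(block(x))            # without .contiguous(), both blocks could fuse together
out.realize()

The two .contiguous() additions in the hunks below place the same cut point at the end of each TransformerBlock.__call__.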
@@ -64,7 +64,7 @@ class TransformerBlock:

   def __call__(self, x:Tensor, start_pos:Variable, mask:Optional[Tensor]):
     h = x + self.attn(self.ln_1(x), start_pos, mask).float()
-    return (h + self.mlp(self.ln_2(h)))
+    return (h + self.mlp(self.ln_2(h))).contiguous()

 class Transformer:
   def __init__(self, dim, n_heads, n_layers, norm_eps, vocab_size, max_seq_len=1024):

@@ -118,7 +118,7 @@ class TransformerBlock:
     return h + self.ffn_down(gated)

   def __call__(self, x: Tensor, start_pos: int|UOp):
-    return self._feed_forward(self._attention(x, start_pos))
+    return self._feed_forward(self._attention(x, start_pos)).contiguous()

 class Transformer:
   def __init__(self, *, num_blocks, dim, hidden_dim, n_heads, n_kv_heads, norm_eps, vocab_size, max_context):

@@ -156,6 +156,8 @@ class Transformer:
       n_heads=kv[f'{arch}.attention.head_count'], n_kv_heads=kv[f'{arch}.attention.head_count_kv'],
       norm_eps=kv[f'{arch}.attention.layer_norm_rms_epsilon'], vocab_size=len(kv['tokenizer.ggml.tokens']), max_context=max_context)
     nn.state.load_state_dict(model, state_dict, verbose=False, consume=True, realize=False) # NOTE: rope_freqs.weight (32,) is unused
+    # NOTE: without this contiguous, it unpacks the weights from the model every time. we shouldn't need this, but for now it's faster
+    for s in nn.state.get_parameters(model): s.replace(s.contiguous())
     return model, kv

   def generate(self, tokens:list[int], start_pos=0):
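The two added lines in the last hunk use the same operation at load time: every parameter is replaced with a contiguous copy, so whatever work is attached to the loaded tensors (e.g. unpacking quantized GGUF weights) is paid once up front instead of inside every forward pass. A hedged sketch of that pattern on a throwaway module (the Toy class is invented; only tinygrad's nn.Linear, nn.state.get_parameters, Tensor.replace, and Tensor.contiguous are assumed):

# hypothetical sketch: realize all parameters once at load time (tinygrad nn/Tensor APIs)
from tinygrad import Tensor, nn

class Toy:
  def __init__(self): self.l1, self.l2 = nn.Linear(16, 32), nn.Linear(32, 16)
  def __call__(self, x: Tensor) -> Tensor: return self.l2(self.l1(x).relu())

model = Toy()
# swap each parameter for a contiguous version so the weights become standalone buffers
# and are not recomputed/unpacked inside the kernels that later consume them
for p in nn.state.get_parameters(model): p.replace(p.contiguous())
out = model(Tensor.randn(1, 16)).realize()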