fix llm app with rangeify (#12334)

* fix llm app with rangeify

* add gpt2 contiguous also
George Hotz, 2025-09-29 20:42:44 +10:00 (committed by GitHub)
parent 7ae6898e31, commit b899392f30
2 changed files with 4 additions and 2 deletions
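Both hunks below (one in each of the two changed files) apply the same fix: appending .contiguous() to the transformer block's output. Roughly, .contiguous() in tinygrad cuts the lazy graph at that point and realizes the result into its own buffer, so downstream kernels read memory instead of re-deriving the whole block; presumably this is what the rangeify scheduler needs here. A minimal standalone sketch of the effect (not from this commit):

  from tinygrad import Tensor

  x = Tensor.randn(4, 64)
  h = x + x.relu()         # lazy: nothing has been computed yet
  h = h.contiguous()       # buffer boundary: h is computed and stored once
  y = h * 2 + h.exp()      # both uses of h read the realized buffer
  print(y.numpy().shape)   # (4, 64)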


@@ -64,7 +64,7 @@ class TransformerBlock:
   def __call__(self, x:Tensor, start_pos:Variable, mask:Optional[Tensor]):
     h = x + self.attn(self.ln_1(x), start_pos, mask).float()
-    return (h + self.mlp(self.ln_2(h)))
+    return (h + self.mlp(self.ln_2(h))).contiguous()
 
 class Transformer:
   def __init__(self, dim, n_heads, n_layers, norm_eps, vocab_size, max_seq_len=1024):


@@ -118,7 +118,7 @@ class TransformerBlock:
     return h + self.ffn_down(gated)
 
   def __call__(self, x: Tensor, start_pos: int|UOp):
-    return self._feed_forward(self._attention(x, start_pos))
+    return self._feed_forward(self._attention(x, start_pos)).contiguous()
 
 class Transformer:
   def __init__(self, *, num_blocks, dim, hidden_dim, n_heads, n_kv_heads, norm_eps, vocab_size, max_context):
@@ -156,6 +156,8 @@ class Transformer:
                         n_heads=kv[f'{arch}.attention.head_count'], n_kv_heads=kv[f'{arch}.attention.head_count_kv'],
                         norm_eps=kv[f'{arch}.attention.layer_norm_rms_epsilon'], vocab_size=len(kv['tokenizer.ggml.tokens']), max_context=max_context)
     nn.state.load_state_dict(model, state_dict, verbose=False, consume=True, realize=False) # NOTE: rope_freqs.weight (32,) is unused
+    # NOTE: without this contiguous, it unpacks the weights from the model every time. we shouldn't need this, but for now it's faster
+    for s in nn.state.get_parameters(model): s.replace(s.contiguous())
     return model, kv
 
   def generate(self, tokens:list[int], start_pos=0):
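The NOTE in the last hunk is the same trick applied to the weights: with realize=False, GGUF-packed parameters stay as lazy unpack expressions, so every forward pass would redo the unpack; replacing each parameter with a contiguous copy materializes the unpacked weights once. A hedged standalone sketch of the pattern (the Block class here is hypothetical):

  from tinygrad import Tensor, nn

  class Block:
    def __init__(self): self.w = Tensor.randn(16, 16)  # stand-in for a lazily unpacked weight

  model = Block()
  # replace each parameter with a realized, memory-owning copy so any lazy
  # transform feeding it (e.g. dequantization) runs once, not on every step
  for p in nn.state.get_parameters(model): p.replace(p.contiguous())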