mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-10 07:28:15 -05:00)
fix llm app with rangeify (#12334)
* fix llm app with rangeify
* add gpt2 contiguous also
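Both model changes in the diff below apply the same pattern: the transformer block's return value is marked .contiguous(), so the block output is realized into its own buffer instead of being fused into whatever consumes it next. As a rough, hypothetical sketch of that pattern (not code from this commit; the toy block, weights, and shapes are invented, and only tinygrad's public Tensor API is assumed):

# hypothetical sketch: a .contiguous() cut point at a block boundary (tinygrad Tensor API)
from tinygrad import Tensor

w1, w2 = Tensor.randn(64, 256), Tensor.randn(256, 64)

def block(x: Tensor) -> Tensor:
  h = x + (x @ w1).relu() @ w2   # toy residual block, stands in for a TransformerBlock
  return h.contiguous()          # the block output becomes its own realized buffer

x = Tensor.randn(4, 64)
out = block(block(x))            # without .contiguous(), both blocks could fuse together
out.realize()

The two .contiguous() additions in the hunks below place the same cut point at the end of each TransformerBlock.__call__.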
@@ -64,7 +64,7 @@ class TransformerBlock:

   def __call__(self, x:Tensor, start_pos:Variable, mask:Optional[Tensor]):
     h = x + self.attn(self.ln_1(x), start_pos, mask).float()
-    return (h + self.mlp(self.ln_2(h)))
+    return (h + self.mlp(self.ln_2(h))).contiguous()

 class Transformer:
   def __init__(self, dim, n_heads, n_layers, norm_eps, vocab_size, max_seq_len=1024):

@@ -118,7 +118,7 @@ class TransformerBlock:
     return h + self.ffn_down(gated)

   def __call__(self, x: Tensor, start_pos: int|UOp):
-    return self._feed_forward(self._attention(x, start_pos))
+    return self._feed_forward(self._attention(x, start_pos)).contiguous()

 class Transformer:
   def __init__(self, *, num_blocks, dim, hidden_dim, n_heads, n_kv_heads, norm_eps, vocab_size, max_context):

@@ -156,6 +156,8 @@ class Transformer:
       n_heads=kv[f'{arch}.attention.head_count'], n_kv_heads=kv[f'{arch}.attention.head_count_kv'],
       norm_eps=kv[f'{arch}.attention.layer_norm_rms_epsilon'], vocab_size=len(kv['tokenizer.ggml.tokens']), max_context=max_context)
     nn.state.load_state_dict(model, state_dict, verbose=False, consume=True, realize=False) # NOTE: rope_freqs.weight (32,) is unused
+    # NOTE: without this contiguous, it unpacks the weights from the model every time. we shouldn't need this, but for now it's faster
+    for s in nn.state.get_parameters(model): s.replace(s.contiguous())
     return model, kv

   def generate(self, tokens:list[int], start_pos=0):
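The two added lines in the last hunk use the same operation at load time: every parameter is replaced with a contiguous copy, so whatever work is attached to the loaded tensors (e.g. unpacking quantized GGUF weights) is paid once up front instead of inside every forward pass. A hedged sketch of that pattern on a throwaway module (the Toy class is invented; only tinygrad's nn.Linear, nn.state.get_parameters, Tensor.replace, and Tensor.contiguous are assumed):

# hypothetical sketch: realize all parameters once at load time (tinygrad nn/Tensor APIs)
from tinygrad import Tensor, nn

class Toy:
  def __init__(self): self.l1, self.l2 = nn.Linear(16, 32), nn.Linear(32, 16)
  def __call__(self, x: Tensor) -> Tensor: return self.l2(self.l1(x).relu())

model = Toy()
# swap each parameter for a contiguous version so the weights become standalone buffers
# and are not recomputed/unpacked inside the kernels that later consume them
for p in nn.state.get_parameters(model): p.replace(p.contiguous())
out = model(Tensor.randn(1, 16)).realize()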