From 47faa2d7b476a8d1f04c99f60dd963cf95850198 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Wed, 4 Mar 2026 19:07:03 +0800 Subject: [PATCH] hotfix: llm kv cache uses clone instead of realize to avoid many realizes --- tinygrad/apps/llm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tinygrad/apps/llm.py b/tinygrad/apps/llm.py index 2df90b040f..cf773546d6 100644 --- a/tinygrad/apps/llm.py +++ b/tinygrad/apps/llm.py @@ -165,7 +165,8 @@ class TransformerBlock: def __call__(self, x: Tensor, start_pos: int|UOp): if not hasattr(self, "cache_kv"): # TODO: how is the dtype of this determined? - self.cache_kv = Tensor.zeros(2, x.shape[0], self.n_kv_heads, self.max_context, self.head_dim, device=x.device).contiguous().realize() + # NOTE: clone is used to promise the creation of a specific buffer + self.cache_kv = Tensor.zeros(2, x.shape[0], self.n_kv_heads, self.max_context, self.head_dim, device=x.device).clone() return self._feed_forward(self._attention(x, start_pos)).contiguous() class Transformer: