From 47faa2d7b476a8d1f04c99f60dd963cf95850198 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Wed, 4 Mar 2026 19:07:03 +0800 Subject: [PATCH] hotfix: llm kv cache uses clone instead of realize to avoid many realizes --- tinygrad/apps/llm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tinygrad/apps/llm.py b/tinygrad/apps/llm.py index 2df90b040f..cf773546d6 100644 --- a/tinygrad/apps/llm.py +++ b/tinygrad/apps/llm.py @@ -165,7 +165,8 @@ class TransformerBlock: def __call__(self, x: Tensor, start_pos: int|UOp): if not hasattr(self, "cache_kv"): # TODO: how is the dtype of this determined? - self.cache_kv = Tensor.zeros(2, x.shape[0], self.n_kv_heads, self.max_context, self.head_dim, device=x.device).contiguous().realize() + # NOTE: clone is used to promise the creation of a specific buffer + self.cache_kv = Tensor.zeros(2, x.shape[0], self.n_kv_heads, self.max_context, self.head_dim, device=x.device).clone() return self._feed_forward(self._attention(x, start_pos)).contiguous() class Transformer: