From c70e8af0680f41b29b3892921b3bc28dcdc6a02a Mon Sep 17 00:00:00 2001
From: Christopher Milan
Date: Mon, 2 Mar 2026 19:00:05 -0800
Subject: [PATCH] move IMAGE FLOAT16 logic to allocations (#15095)

* FLOAT16 logic in allocations

* cleanup

* separate that

* only apply when IMAGE == 1

* test passing now

* create image buffers earlier
---
 test/backend/test_schedule.py  |  4 ++--
 tinygrad/codegen/__init__.py   |  6 +++---
 tinygrad/engine/allocations.py | 11 +++++++++--
 tinygrad/tensor.py             |  3 +--
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/test/backend/test_schedule.py b/test/backend/test_schedule.py
index 6ad348fe66..35cb6ab95e 100644
--- a/test/backend/test_schedule.py
+++ b/test/backend/test_schedule.py
@@ -796,7 +796,6 @@ class TestSchedule(unittest.TestCase):
     self.assertIsInstance(out.uop.base.realized.dtype, ImageDType)
 
   @unittest.skipIf(Device.DEFAULT != "CL", "image only supported on CL")
-  @unittest.expectedFailure
   def test_image_dot_f16_fusion(self):
     with Context(FLOAT16=1):
       def cnt():
@@ -809,7 +808,8 @@ class TestSchedule(unittest.TestCase):
       with Context(IMAGE=1): cnt1 = cnt()
       with Context(IMAGE=2): cnt2 = cnt()
-      self.assertEqual(cnt1, cnt2)
+      self.assertEqual(cnt1, 5)
+      self.assertEqual(cnt2, 5)
 
   @unittest.skipIf(Device.DEFAULT != "CL", "image only supported on CL")
   @unittest.expectedFailure
diff --git a/tinygrad/codegen/__init__.py b/tinygrad/codegen/__init__.py
index d1496b793e..78166a2d46 100644
--- a/tinygrad/codegen/__init__.py
+++ b/tinygrad/codegen/__init__.py
@@ -41,15 +41,15 @@ def full_rewrite_to_sink(sink:UOp, ren:Renderer|None=None, optimize:bool=True) -> UOp:
   # split ranges
   sink = graph_rewrite(sink, pm_split_ranges+pm_flatten_range, ctx={}, name="split ranges")
 
+  # create image buffers
+  if IMAGE == 1 and ren.device in {"QCOM", "CL"}: sink = graph_rewrite(sink, pm_make_images, name="create image buffers", bottom_up=True)
+
   # symbolic (NOTE: this is a requirement for pm_simplify_ranges to be correct)
   sink = graph_rewrite(sink, sym+pm_flatten_range, name="initial symbolic")
 
   # optimize (schedule) the AST
   sink = graph_rewrite(sink, pm_simplify_ranges, name="simplify ranges")
 
-  # create image buffers
-  if IMAGE == 1 and ren.device in {"QCOM", "CL"}: sink = graph_rewrite(sink, pm_make_images, name="create image buffers", bottom_up=True)
-
   # do postrange optimization, BEAM or hand_coded_optimizations
   sink = apply_opts(sink, ren)
diff --git a/tinygrad/engine/allocations.py b/tinygrad/engine/allocations.py
index e19d788d07..22e12416d8 100644
--- a/tinygrad/engine/allocations.py
+++ b/tinygrad/engine/allocations.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass, field
 from tinygrad.uop.ops import UOp, UPat, PatternMatcher, Ops, GroupOp, graph_rewrite, identity_element, track_rewrites
-from tinygrad.dtype import ImageDType
-from tinygrad.helpers import prod, DEBUG, argsort, VIZ, pluralize
+from tinygrad.dtype import dtypes, ImageDType
+from tinygrad.helpers import prod, DEBUG, argsort, VIZ, pluralize, IMAGE, FLOAT16
 
 @dataclass
 class AllocCtx:
@@ -95,6 +95,11 @@ def contiguous_mops_to_view(c:UOp):
   # NOTE: this contiguous is removed because this BUFFER_VIEW/RESHAPE has_buffer_identity
   return UOp(Ops.BUFFER_VIEW, src.dtype, (buf,), (size, offset)).reshape(src.shape).contiguous(tag=c.tag)
 
+def make_float16(assign:UOp, buf:UOp, val:UOp):
+  if IMAGE != 1 or not FLOAT16: return None
+  new_buf = buf.replace(dtype=dtypes.half, src=(buf.src[0].replace(dtype=dtypes.half), *buf.src[1:]) if buf.op is Ops.RESHAPE else buf.src)
+  return assign.replace(dtype=dtypes.half, src=(new_buf, val.cast(dtypes.half))).cast(dtypes.float)
+
 pm_early_transform_tensor_graph = PatternMatcher([
   # CONTIGUOUS(MOPS(BUFFER/BUFFER_VIEW)) → CONTIGUOUS(BUFFER_VIEW) when movement ops collapse to contiguous range
   (UPat(Ops.CONTIGUOUS, src=(UPat(GroupOp.Movement),), name="c"), contiguous_mops_to_view),
@@ -122,6 +127,8 @@ pm_early_transform_tensor_graph = PatternMatcher([
   (UPat(GroupOp.All-{Ops.SINK}, name="x"), lambda x: x.const_like(0).rtag(x.tag) if x._shape is not None and x.size == 0 else None),
   # early fixup const copy (TODO: is this wrong if there's a pad?)
   (UPat(Ops.COPY, src=(UPat.var("s"), UPat()), name="c"), lambda c,s: c.const_like(ss.arg) if (ss:=s.base).op is Ops.CONST else None),
+  # IMAGE FLOAT16: use the texture sampler to store as half and automatically cast float load/store
+  (UPat(Ops.ASSIGN, dtypes.float, src=(UPat.var("buf"), UPat(GroupOp.All-{Ops.COPY}, name="val")), name="assign"), make_float16),
 ])
 
 def untag_and_append(ctx:AllocCtx, x:UOp):
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index fe9f1a8e93..e9d43736bd 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -3643,8 +3643,7 @@ class Tensor(OpMixin):
     # contiguous creates the image, and early realize static weights (TODO: test for the static weight)
     if IMAGE >= 2: x,w = x.cast(base_image_type((bs*iy, ix*groups*cin//4, 4))), w.cast(base_image_type((cout//4, H*W*cin, 4)))
-    if IMAGE == 1 and FLOAT16: x, w = x.cast(dtypes.half).contiguous().cast(dtypes.float), w.cast(dtypes.half).contiguous().cast(dtypes.float)
-    else: x, w = x.contiguous(), w.contiguous()
+    x, w = x.contiguous(), w.contiguous()
 
     if IMAGE == 1 and added_weight: w, H = w[:, :-added_weight, ...], H - added_weight
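
Note on the rewrite (not part of the patch): make_float16 moves the half-storage
trick out of Tensor.conv2d and into the tensor-graph rewrite in allocations. A
float ASSIGN into an image buffer becomes ASSIGN(buf<half>, CAST(val, half))
wrapped in a CAST back to float, so downstream consumers still see float while
the texture sampler stores half. A minimal sketch of how to observe it, assuming
a CL or QCOM backend (per the test's skipIf); the shapes below are illustrative
and not the ones test_image_dot_f16_fusion uses:

  # sketch: count kernels for a matmul under IMAGE=1 + FLOAT16=1
  from tinygrad import Tensor, Context, GlobalCounters

  with Context(IMAGE=1, FLOAT16=1):
    GlobalCounters.reset()
    x = Tensor.empty(4, 16)   # illustrative shapes
    w = Tensor.empty(16, 16)
    (x @ w).realize()
    # after this patch, IMAGE=1 and IMAGE=2 schedule the same way; the
    # updated test asserts a count of 5 for its own shapes
    print(GlobalCounters.kernel_count)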