From 6193e279d48ec4f584a3f37f95997bd34a9ebbdd Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Wed, 15 Jan 2025 05:45:03 -0500
Subject: [PATCH 01/16] isolate simple failing test for subbuffer on CONST [pr]
 (#8630)

* simple failing test for subbuffer on CONST [pr]

* add view_supported_devices check
---
 test/test_tensor_uop.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/test/test_tensor_uop.py b/test/test_tensor_uop.py
index a3b01604e5..03e1b71649 100644
--- a/test/test_tensor_uop.py
+++ b/test/test_tensor_uop.py
@@ -3,7 +3,7 @@ import numpy as np
 import unittest
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.engine.realize import run_schedule
-from tinygrad.ops import Ops, UOp, UPat
+from tinygrad.ops import Ops, UOp, UPat, view_supported_devices
 
 class TestTensorUOp(unittest.TestCase):
   def test_fromcpu_shape_tracker(self):
@@ -84,6 +84,15 @@ class TestTensorUOp(unittest.TestCase):
     sched = empty.schedule()
     self.assertEqual(len(sched), 0)
 
+  @unittest.skipIf(Device.DEFAULT in view_supported_devices, "BUFFER_VIEW cannot exist on a CONST")
+  def test_contiguous_folded_alu(self):
+    a = Tensor.empty(8, 8)
+    # NOTE: the buffer for mul_0 late folds to just a CONST
+    mul_0 = a*0
+    out = mul_0.shrink(((4, 8), (0, 8))).contiguous()
+    out.realize()
+    self.assertEqual(out.tolist(), Tensor.zeros(4, 8).tolist())
+
 reduce_kernel = UPat(Ops.SINK, src=(UPat(Ops.STORE, src=(UPat(), UPat(), UPat(Ops.REDUCE_AXIS)))))
 class TestReduceOp(unittest.TestCase):
   def test_no_split_reduce_kernel(self):

From bae20e50433b5d278d73053bf0d819810fd3c290 Mon Sep 17 00:00:00 2001
From: ignaciosica <mignacio.sica@gmail.com>
Date: Wed, 15 Jan 2025 14:31:48 -0300
Subject: [PATCH 02/16] Generic PTX wmma rendering [pr] (#8632)

* make wmma rendering dtype size generic

* use var instead of calculating multiple times

* compact rendering
---
 tinygrad/renderer/ptx.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tinygrad/renderer/ptx.py b/tinygrad/renderer/ptx.py
index c7bb49fdfb..7e91b99071 100644
--- a/tinygrad/renderer/ptx.py
+++ b/tinygrad/renderer/ptx.py
@@ -63,8 +63,8 @@ def render_wmma(ctx: "PTXRenderer", x: UOp):
   dt_map = { dtypes.half: "f16" }
   _i = 0
   for vv in x.src[:2]:
-    for i in range(0, len(ctx.r[vv]), 2):
-      yield f"mov.b32 {ctx.wmma_r[_i]}, {{{', '.join(ctx.r[vv][i:i+2])}}};"
+    for i in range(0, len(ctx.r[vv]), (elems_per_reg := 4//dtype_in.itemsize)):
+      yield f"mov.b32 {ctx.wmma_r[_i]}, " + (f"{{{', '.join(ctx.r[vv][i:i+elems_per_reg])}}}" if elems_per_reg > 1 else ctx.r[vv][i]) + ";"
       _i += 1
   yield f'mma.sync.aligned.m{M}n{N}k{K}.row.col.f32.{dt_map[dtype_in]}.{dt_map[dtype_in]}.f32{" "*12}' +\
   f'{{{", ".join(ctx.r[x])}}}, {{{", ".join(ctx.wmma_r[:n_operands[0]])}}}, {{{", ".join(ctx.wmma_r[-n_operands[1]:])}}}, ' + \
@@ -184,7 +184,7 @@ class PTXRenderer(Renderer):
         r[u] = [ssa('val', dtype=self.types[u.dtype.scalar()]) for _ in range(u.dtype.count)] if u.dtype.count > 1 else ssa('val', u)
       elif u.op is Ops.DEFINE_GLOBAL: bufs.append((f"data{u.arg}", u.dtype))
       elif u.op is Ops.WMMA:
-        self.wmma_r = [ssa("wmma", dtype="b32") for vv in u.src[:2] for i in range(0, len(r[vv]), 2)]
+        self.wmma_r = [ssa("wmma", dtype="b32") for vv in u.src[:2] for i in range(0, len(r[vv]), 4//u.arg[2].itemsize)]
         r[u] = [ssa("wmma", dtype=self.types[u.dtype.scalar()]) for _ in range(u.dtype.count)]
       prefix, dtype = {Ops.CAST: ("cast", None), Ops.BITCAST: ("cast", None), Ops.ENDRANGE: ("pred", "pred"), Ops.RANGE: ("ridx", None),
         Ops.DEFINE_ACC: ("acc", None), Ops.DEFINE_VAR: ("dat", None), Ops.CONST: ("const", None), Ops.DEFINE_LOCAL:("local",self.types[dtypes.ulong]),

From a1f70ce7d0cd29deb6bbb73a2586042642e38f03 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Wed, 15 Jan 2025 12:34:15 -0500
Subject: [PATCH 03/16] only use BUFFER_VIEW in disk [pr] (#8629)

* only use BUFFER_VIEW in disk [pr]

* delete can_view

* BUFFER_VIEW op on DISK

* remove that allow_buffer_view=False

* notes

* bitcast is a low-level op too

* this passes on AMD and LLVM
---
 test/test_schedule.py   |  5 +++--
 test/test_setitem.py    | 10 ++--------
 test/test_tensor_uop.py |  3 +--
 tinygrad/ops.py         | 13 +++++++------
 tinygrad/tensor.py      |  2 +-
 5 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/test/test_schedule.py b/test/test_schedule.py
index 2d531f0170..a0cd1b3272 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -13,7 +13,7 @@ from tinygrad.device import is_dtype_supported
 from tinygrad.dtype import DType, ImageDType
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
-from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic_simple, merge_views
+from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, symbolic_simple, merge_views
 from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, getenv, SPLIT_REDUCEOP, unwrap, prod, Context
 from tinygrad.codegen.kernel import verify_ast
 from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars, view_right, view_left, remove_movement_ops
@@ -1630,7 +1630,8 @@ class TestIndexing(unittest.TestCase):
     a[0] = 6
     np.testing.assert_equal(a.numpy(), [6., 2., 3., 4.])
 
-  @unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view")
+  #@unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view")
+  @unittest.skip("BUFFER_VIEW no longer supported on non-disk devices")
   def test_arange_view_op(self):
     a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).contiguous()
     sched = self.check_schedule(a, 1)
diff --git a/test/test_setitem.py b/test/test_setitem.py
index 84ccb8bc1f..f1bb595ef2 100644
--- a/test/test_setitem.py
+++ b/test/test_setitem.py
@@ -1,6 +1,5 @@
 import unittest
-from tinygrad import Device, Tensor, TinyJit, Variable, dtypes
-from tinygrad.helpers import CI
+from tinygrad import Tensor, TinyJit, Variable, dtypes
 import numpy as np
 
 class TestSetitem(unittest.TestCase):
@@ -139,12 +138,7 @@ class TestSetitem(unittest.TestCase):
   def test_setitem_overlapping_inplace1(self):
     t = Tensor([[3.0], [2.0], [1.0]]).contiguous()
     t[1:] = t[:-1]
-    if (Device.DEFAULT == "LLVM") or (CI and Device.DEFAULT == "AMD"):
-      # TODO: FIXME
-      with self.assertRaises(AssertionError):
-        self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
-    else:
-      self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
+    self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
 
   def test_setitem_overlapping_inplace2(self):
     t = Tensor([[3.0], [2.0], [1.0]]).contiguous()
diff --git a/test/test_tensor_uop.py b/test/test_tensor_uop.py
index 03e1b71649..b2d4acad97 100644
--- a/test/test_tensor_uop.py
+++ b/test/test_tensor_uop.py
@@ -3,7 +3,7 @@ import numpy as np
 import unittest
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.engine.realize import run_schedule
-from tinygrad.ops import Ops, UOp, UPat, view_supported_devices
+from tinygrad.ops import Ops, UOp, UPat
 
 class TestTensorUOp(unittest.TestCase):
   def test_fromcpu_shape_tracker(self):
@@ -84,7 +84,6 @@ class TestTensorUOp(unittest.TestCase):
     sched = empty.schedule()
     self.assertEqual(len(sched), 0)
 
-  @unittest.skipIf(Device.DEFAULT in view_supported_devices, "BUFFER_VIEW cannot exist on a CONST")
   def test_contiguous_folded_alu(self):
     a = Tensor.empty(8, 8)
     # NOTE: the buffer for mul_0 late folds to just a CONST
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index 2c23541a64..fabd94bdfb 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -367,7 +367,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
       raise RuntimeError(f"unsupported size in bitcast {dtype}")
     # shape changing bitcast can use a subbuffer on DISK
     # TODO: this should be moved to realize.py
-    if self.can_view() and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
+    if self._device is not None and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
     return UOp(Ops.BITCAST, dtype, (self,))
   def gep(self, i:Union[tuple[int, ...], int]):
     if isinstance(i, int):
@@ -420,9 +420,13 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     if DEBUG >= 3: print(f"split {divisor}: {self.shape} -> {splitted.shape} -> {new_shape}")
     return splitted._reduce_op(op, axis)._reduce_op(op, (len(new_shape),)).reshape(new_shape)  # reduce original axes, then split
   def assign(self, x:UOp): return UOp(Ops.ASSIGN, self.dtype, (self,x))
-  def contiguous(self, allow_buffer_view=True):
+  def contiguous(self):
+    # TODO: BUFFER_VIEW op should be deleted and subbuffer should be moved to realize.py
+    # NOTE: DISK uses subbuffer because DISK does not render kernels
+    if self.device.startswith("DISK"): return self.alu(Ops.BUFFER_VIEW)
+    # otherwise it's normal CONTIGUOUS
     if not unwrap(self.st).contiguous or self.size != self.base.size or self.base.op is Ops.CONST:
-      return self.alu(Ops.BUFFER_VIEW if allow_buffer_view and self.can_view() else Ops.CONTIGUOUS)
+      return self.alu(Ops.CONTIGUOUS)
     forced_realize.add(self.base)
     return self
 
@@ -451,9 +455,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     return UOp(Ops.COPY, self.base.dtype, (UOp(Ops.DEVICE, arg=device), self.base), clone).view(unwrap(self.st))
   def clone(self) -> UOp: return self.copy_to_device(self.device, clone=True)
   def is_unrealized_unmasked_const(self): return self.base.op is Ops.CONST and all(v.mask is None for v in unwrap(self.st).views)
-  def can_view(self):
-    return (self.st is not None and self._device is not None and self.st.consecutive and self.base.op is not Ops.CONST and
-            not isinstance(self.dtype, ImageDType) and self.device.split(":")[0] in view_supported_devices)
   @property
   def lbs(self): return [self]
   @property
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index e3d87f51eb..4acc8192af 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -413,7 +413,7 @@ class Tensor(SimpleMathTrait):
       lbs = [cast(UOp, t.lazydata) for t in self.split(sizes, axis)]
     sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(lbs, devices)]
     # NOTE: this contiguous is making it impossible for the scheduler to do late const folding
-    mlb = MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
+    mlb = MultiLazyBuffer([lb.contiguous() for lb in sharded_lbs], axis)
     return Tensor(mlb, device=devices, requires_grad=self.requires_grad)
 
   def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None):

From 7ecced7f6d9c103f0aa32d74dbf146e4eac47f9d Mon Sep 17 00:00:00 2001
From: uuuvn <83587632+uuuvn@users.noreply.github.com>
Date: Wed, 15 Jan 2025 19:47:08 +0200
Subject: [PATCH 04/16] LLVM JIT prereqs (#8634)

* LLVM JIT prereqs

This commit moves jit loading, disassembling and CPUProgram logic from
`ops_clang.py` to `elf.py`, `helpers.py` and `device.py` respectively

I don't quite like the `helpers.py` destination for capstone_flatdump
but this is where cpu_objdump is so presumably this is how it's supposed
to be

* Types
---
 tinygrad/device.py              | 40 +++++++++++++++++++++--
 tinygrad/helpers.py             |  9 +++++
 tinygrad/runtime/ops_clang.py   | 58 ++++-----------------------------
 tinygrad/runtime/support/elf.py |  9 ++++-
 4 files changed, 61 insertions(+), 55 deletions(-)

diff --git a/tinygrad/device.py b/tinygrad/device.py
index a323d06aed..530770f0eb 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -2,8 +2,10 @@ from __future__ import annotations
 from dataclasses import dataclass, replace
 from collections import defaultdict
 from typing import Optional, Any, Iterator, Generator
-import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, contextlib, sys, re, atexit, pickle, decimal, time
-from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp
+import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, ctypes.util, platform, contextlib, sys, re, atexit, pickle, decimal, time
+from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
+from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp, mv_address, \
+                             cpu_time_execution
 from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes
 from tinygrad.renderer import Renderer
 
@@ -213,6 +215,40 @@ class _MallocAllocator(LRUAllocator):
 
 MallocAllocator = _MallocAllocator()
 
+# NOTE: MAP_JIT is added to mmap module in python 3.13
+MAP_JIT = 0x0800
+
+# CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
+class CPUProgram:
+  helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'gcc_s'))
+
+  def __init__(self, name:str, lib:bytes):
+    # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
+    # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
+    self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
+
+    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
+    self.mem.write(lib)
+    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
+
+    # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
+    # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
+    # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
+    # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
+    CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
+
+    self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
+
+  def __call__(self, *bufs, vals=(), wait=False):
+    args = list(bufs) + list(vals)
+    # NOTE: replace this by --target={host's triple}-elf in clang args once we only support macos sequoia and later.
+    # Apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64
+    # https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
+    # This hack is required because clang/llvm bug doesn't allow us to just use {host's triple}+'-elf' (relocation failures)
+    # The bug was fixed in https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a but was only backported to xcode 16+
+    if platform.machine() == "arm64" and OSX: args = args[:8] + [ctypes.c_int64(a) if isinstance(a, int) else a for a in args[8:]]
+    return cpu_time_execution(lambda: self.fxn(*args), enable=wait)
+
 # **************** for Compiled Devices ****************
 
 class CompileError(Exception): pass
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index e394557d96..4346255983 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -267,6 +267,15 @@ def cpu_objdump(lib, objdump_tool='objdump'):
     pathlib.Path(f.name).write_bytes(lib)
     print(subprocess.check_output([objdump_tool, '-d', f.name]).decode('utf-8'))
 
+def capstone_flatdump(lib: bytes):
+  import capstone
+  match platform.machine():
+    case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
+    case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
+    case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
+  for instr in cs.disasm(lib, 0):
+    print(f"{instr.address:#08x}: {instr.mnemonic}\t{instr.op_str}")
+
 # *** ctypes helpers
 
 # TODO: make this work with read only memoryviews (if possible)
diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py
index 56f080681b..689c04c460 100644
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -1,13 +1,9 @@
-import ctypes, ctypes.util, struct, platform, tempfile, pathlib, subprocess
-from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
-from tinygrad.helpers import OSX, mv_address, cpu_time_execution, cpu_objdump
-from tinygrad.device import Compiled, Compiler, MallocAllocator
-from tinygrad.runtime.support.elf import elf_loader, relocate
+import platform, tempfile, pathlib, subprocess
+from tinygrad.helpers import cpu_objdump, capstone_flatdump
+from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
+from tinygrad.runtime.support.elf import jit_loader
 from tinygrad.renderer.cstyle import ClangRenderer
 
-# NOTE: MAP_JIT is added to mmap module in python 3.13
-MAP_JIT = 0x0800
-
 # Used by ops_dsp.py
 class ClangCompiler(Compiler):
   def __init__(self, cachekey="compile_clang", args:list[str]|None=None, objdump_tool='objdump'):
@@ -33,51 +29,9 @@ class ClangJITCompiler(Compiler):
     args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
     arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else []
     obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
-    image, _, relocs = elf_loader(obj)
-    # This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved.
-    for ploc,tgt,r_type,r_addend in relocs:
-      image[ploc:ploc+4] = struct.pack("<I", relocate(struct.unpack("<I", image[ploc:ploc+4])[0], ploc, tgt+r_addend, r_type))
-    return bytes(image)
+    return jit_loader(obj)
 
-  def disassemble(self, lib):
-    import capstone
-    match platform.machine():
-      case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
-      case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
-      case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
-    for instr in cs.disasm(lib, 0):
-      print(f"{instr.address:#08x}: {instr.mnemonic}\t{instr.op_str}")
-
-# CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
-class CPUProgram:
-  helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'gcc_s'))
-
-  def __init__(self, name:str, lib:bytes):
-    # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
-    # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
-    self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
-
-    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
-    self.mem.write(lib)
-    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
-
-    # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
-    # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
-    # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
-    # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
-    CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
-
-    self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
-
-  def __call__(self, *bufs, vals=(), wait=False):
-    args = list(bufs) + list(vals)
-    # NOTE: replace this by --target={host's triple}-elf in clang args once we only support macos sequoia and later.
-    # Apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64
-    # https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
-    # This hack is required because clang/llvm bug doesn't allow us to just use {host's triple}+'-elf' (relocation failures)
-    # The bug was fixed in https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a but was only backported to xcode 16+
-    if platform.machine() == "arm64" and OSX: args = args[:8] + [ctypes.c_int64(a) if isinstance(a, int) else a for a in args[8:]]
-    return cpu_time_execution(lambda: self.fxn(*args), enable=wait)
+  def disassemble(self, lib:bytes): return capstone_flatdump(lib)
 
 class ClangDevice(Compiled):
   def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram)
diff --git a/tinygrad/runtime/support/elf.py b/tinygrad/runtime/support/elf.py
index 0e3391186d..ffaeff921f 100644
--- a/tinygrad/runtime/support/elf.py
+++ b/tinygrad/runtime/support/elf.py
@@ -1,4 +1,4 @@
-import tinygrad.runtime.autogen.libc as libc
+import struct, tinygrad.runtime.autogen.libc as libc
 from dataclasses import dataclass
 from tinygrad.helpers import getbits, i2u
 
@@ -51,3 +51,10 @@ def relocate(instr: int, ploc: int, tgt: int, r_type: int):
     case libc.R_AARCH64_LDST64_ABS_LO12_NC: return instr | (getbits(tgt, 3, 11) << 10)
     case libc.R_AARCH64_LDST128_ABS_LO12_NC: return instr | (getbits(tgt, 4, 11) << 10)
   raise NotImplementedError(f"Encountered unknown relocation type {r_type}")
+
+def jit_loader(obj: bytes) -> bytes:
+  image, _, relocs = elf_loader(obj)
+  # This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved.
+  for ploc,tgt,r_type,r_addend in relocs:
+    image[ploc:ploc+4] = struct.pack("<I", relocate(struct.unpack("<I", image[ploc:ploc+4])[0], ploc, tgt+r_addend, r_type))
+  return bytes(image)

From b3efeeb71779ff66292552f5d7586d643b3eb8bc Mon Sep 17 00:00:00 2001
From: nimlgen <138685161+nimlgen@users.noreply.github.com>
Date: Thu, 16 Jan 2025 00:22:35 +0300
Subject: [PATCH 05/16] docs: start am docs (#8638)

* docs: init am docs

* missing
---
 docs/developer/am.md | 39 +++++++++++++++++++++++++++++++++++++++
 mkdocs.yml           |  1 +
 2 files changed, 40 insertions(+)
 create mode 100644 docs/developer/am.md

diff --git a/docs/developer/am.md b/docs/developer/am.md
new file mode 100644
index 0000000000..9435bc9537
--- /dev/null
+++ b/docs/developer/am.md
@@ -0,0 +1,39 @@
+# AM Driver
+
+AM driver is a userspace driver targeting AMD's 7900XTX. You only need tinygrad to send compute tasks to your GPU!
+
+## How to run?
+Make sure that amdgpu module is unloaded and just run tinygrad with `AMD=1`!
+
+Optional requirements:
+
+* System without IOMMU for P2P / SDMA support
+* vfio-pci module for IRQ handling
+
+## Environment Variables
+
+| Variable | Possible Value(s) | Description |
+|----------|------------------|-------------|
+| AM_RESET | [1] | Performs a full GPU reset (reloading all firmware and IP blocks) |
+| AM_DEBUG | [0-4] | Sets the level of additional debugging information |
+
+## AM Driver Details
+
+### Compute & SDMA Queues
+
+AM binds compute queues directly to MEC (bypassing MES). Tinygrad uses only one compute queue, which is bound at `pipe=0 queue=0`. Similarly, the single SDMA queue is bound at `engine=0 queue=0`.
+
+### Boot
+
+The GPU being passed can be in one of several states:
+1. Not initialized
+2. Initialized by AMDGPU
+3. Initialized by AM
+
+The first and second states require a full GPU setup since their states are unknown. The second state also requires a mode1 reset to reinitialize all components.
+
+The third state can be set up partially to optimize boot time. In this case, only the GFX and SDMA IPs need to be initialized. To enable this, AM uses a separate boot memory that is guaranteed not to be overwritten. This physical memory is utilized for all blocks that are initialized only during the initial AM boot. To determine if the GPU is in the third state, AM uses `regSCRATCH_REG7` as a flag.
+
+### VM Management
+
+Each AM device sets up only a single `VMID=0` and one page directory. The page directory used is 3-level and thus supports up to 512TB of virtual addresses. All AM devices are located in one virtual address space.
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index 871ca8a148..291998dac5 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -27,6 +27,7 @@ nav:
     - Runtime:
       - developer/runtime.md
       - HCQ: developer/hcq.md
+      - AM Driver: developer/am.md
   - tinybox: tinybox.md
 #- tinygrad: reference/
 

From d5c90da2866f2417c99f1c5bc432e9280fcd7a01 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Wed, 15 Jan 2025 20:14:28 -0500
Subject: [PATCH 06/16] move subbuffer to a rewrite rule in the scheduler 
 (#8639)

* delete buffer_view from tensor

* add to the scheduler

* move buffer_view to the scheduler

* gradient doesn't care.

* for/with
---
 tinygrad/engine/schedule.py | 23 ++++++++---------------
 tinygrad/gradient.py        |  4 +---
 tinygrad/ops.py             |  9 +--------
 3 files changed, 10 insertions(+), 26 deletions(-)

diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 7dc36febb8..eec52ae4b6 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -51,17 +51,6 @@ tensor_uop_spec = PatternMatcher([
   # ASSIGN changes the value of a realized buffer
   (UPat(Ops.ASSIGN, name="assign", src=(UPat.var("target"), UPat.var("new_val"))),
    lambda assign,target,new_val: (target.op is Ops.BUFFER or target.is_realized) and (assign.dtype == target.dtype == new_val.dtype)),
-
-  # TODO: BUFFER_VIEW is overloaded, it should be removed.
-  # BUFFER_VIEW shares the device buffer with its source, it uses a subbuffer of the underlying source buffer
-
-  (UPat(Ops.BUFFER_VIEW, name="root", src=(UPat.var("x"),)), lambda root,x:
-   # BUFFER_VIEW can replace contiguous, keeping dtype the same
-   (root.dtype == x.dtype) or
-   # it can also replace bitcast, this changes the dtype, but the itemsize stays the same
-   (root.dtype != x.dtype and root.dtype.itemsize == x.dtype.itemsize) or
-   # it can also represent shape changing bitcast (only on DISK)
-   (root.dtype != x.dtype and root.dtype.itemsize != x.dtype.itemsize and x.device.startswith("DISK"))),
 ])
 
 # **** ScheduleItem return type
@@ -455,6 +444,12 @@ def fold_img_cast(ctx:ScheduleContext, xb:UOp, view:UOp, b:UOp, to_cast:UOp, **k
 def sink_outputs(ctx:ScheduleContext, sink:UOp) -> None:
   for x in sink.src: realize(ctx, x.buf_uop, x)
 
+def create_subbuffer(base:UOp, b:UOp, root:UOp, x:UOp):
+  if not root.device.startswith("DISK"): return None
+  if x.op is not Ops.VIEW: x = x.src[-1] # TODO: remove this once forced_realize is gone
+  buffers[b] = x.buf_uop.buffer.view(b.size, b.dtype, unwrap(x.st).views[0].offset*x.dtype.itemsize)
+  return base.replace(src=(b, root.replace(op=Ops.BUFFER_VIEW)))
+
 do_realize = PatternMatcher([
   # always realize sinked ops
   (UPat(Ops.SINK, name="sink"), sink_outputs),
@@ -467,6 +462,8 @@ do_realize = PatternMatcher([
   # realize before COPY or BUFFER_VIEW
   (UPat(Ops.COPY, src=(UPat(), UPat.any(UPatScheduled(), UPatScheduled().view()),)), realize),
   (UPat(Ops.BUFFER_VIEW, src=(UPat.any(UPatScheduled(), UPatScheduled().view()),)), realize),
+  # substitute BITCAST/CONTIGUOUS with BUFFER_VIEW on DISK
+  (UPatScheduled((Ops.BITCAST, Ops.CONTIGUOUS), name="root", src=(UPat.var("x"),)), create_subbuffer),
 ])
 
 # **** rewrite VIEW into LOAD/STORE/VALID or fuse the underlying UOp
@@ -502,10 +499,6 @@ def append_uop(ctx:ScheduleContext, view:UOp, buf_uop:UOp) -> None:
   if (op:=uval(view)).op is Ops.ASSIGN: ctx.assigns.add(buf_uop)
   for x in op.base.src:
     if is_scheduled(x.base): ctx.children.setdefault(x.base.buf_uop, {})[buf_uop] = None
-  # BUFFER_VIEW overrides the underlying buffer
-  # TODO: this should be a shrink on the buffer
-  if op.op is Ops.BUFFER_VIEW:
-    buffers[buf_uop] = (x:=op.src[0]).buf_uop.buffer.view(view.size, view.dtype, unwrap(x.st).views[0].offset*x.dtype.itemsize)
   buf_uop.buffer.ref(1)
 create_ctx = PatternMatcher([(UPat(Ops.VIEW, name="view", src=(UPat(Ops.BUFFER, name="buf_uop"), UPat())), append_uop)])
 
diff --git a/tinygrad/gradient.py b/tinygrad/gradient.py
index a2fa71a98d..756a9c9785 100644
--- a/tinygrad/gradient.py
+++ b/tinygrad/gradient.py
@@ -37,9 +37,7 @@ pm_gradient = PatternMatcher([
   (UPat(Ops.EXPAND, name="ret"), lambda ctx, ret:
     (ctx.cast(sum_acc_dtype(ctx.dtype)).r(Ops.ADD, tuple(i for i,(si,so) in enumerate(zip(ret.src[0].shape, ret.arg)) if si!=so)).cast(ctx.dtype),)),
 
-  # there's no gradient for...is this ASSIGN?
-  (UPat(Ops.VIEW, src=(UPat(Ops.BUFFER), UPat(Ops.BUFFER_VIEW))), lambda: (None, None)),
-  # also no gradient for bitcast
+  # there's no gradient for bitcast
   (UPat(Ops.BITCAST), lambda ctx: (None,)),
 ])
 
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index fabd94bdfb..7725a1f506 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -291,7 +291,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     if self.op in GroupOp.Buffer: return vsrc[0] if len(vsrc:=[x.st for x in self.src if x.op is Ops.VIEW]) != 0 else None
     if not (src_sts := [x.st for x in self.src if x.st is not None]): return None
     assert all_same([x.shape for x in src_sts]), f"UOp sources must have the same shape {self} {[x.shape for x in src_sts]}"
-    if self.op is Ops.BUFFER_VIEW:
+    if self.op in {Ops.BITCAST, Ops.BUFFER_VIEW}:
       shape = src_sts[0].shape
       if self.dtype.itemsize != (input_sz:=self.src[0].dtype.itemsize): shape = shape[:-1]+((shape[-1]*input_sz) // self.dtype.itemsize,)
     # only reduce ops are allowed to change shape, everything else derives shape from sources
@@ -365,9 +365,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
   def bitcast(self, dtype:DType):
     if self.st is not None and self.shape and ((self.shape[-1]*self.dtype.itemsize)%dtype.itemsize != 0):
       raise RuntimeError(f"unsupported size in bitcast {dtype}")
-    # shape changing bitcast can use a subbuffer on DISK
-    # TODO: this should be moved to realize.py
-    if self._device is not None and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
     return UOp(Ops.BITCAST, dtype, (self,))
   def gep(self, i:Union[tuple[int, ...], int]):
     if isinstance(i, int):
@@ -421,10 +418,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     return splitted._reduce_op(op, axis)._reduce_op(op, (len(new_shape),)).reshape(new_shape)  # reduce original axes, then split
   def assign(self, x:UOp): return UOp(Ops.ASSIGN, self.dtype, (self,x))
   def contiguous(self):
-    # TODO: BUFFER_VIEW op should be deleted and subbuffer should be moved to realize.py
-    # NOTE: DISK uses subbuffer because DISK does not render kernels
-    if self.device.startswith("DISK"): return self.alu(Ops.BUFFER_VIEW)
-    # otherwise it's normal CONTIGUOUS
     if not unwrap(self.st).contiguous or self.size != self.base.size or self.base.op is Ops.CONST:
       return self.alu(Ops.CONTIGUOUS)
     forced_realize.add(self.base)

From 82ef956cb8fdd7233c78f26927b416c7b03798ee Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Thu, 16 Jan 2025 03:29:07 -0500
Subject: [PATCH 07/16] Revert "move subbuffer to a rewrite rule in the
 scheduler  (#8639)" (#8641)

This reverts commit d5c90da2866f2417c99f1c5bc432e9280fcd7a01.
---
 tinygrad/engine/schedule.py | 23 +++++++++++++++--------
 tinygrad/gradient.py        |  4 +++-
 tinygrad/ops.py             |  9 ++++++++-
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index eec52ae4b6..7dc36febb8 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -51,6 +51,17 @@ tensor_uop_spec = PatternMatcher([
   # ASSIGN changes the value of a realized buffer
   (UPat(Ops.ASSIGN, name="assign", src=(UPat.var("target"), UPat.var("new_val"))),
    lambda assign,target,new_val: (target.op is Ops.BUFFER or target.is_realized) and (assign.dtype == target.dtype == new_val.dtype)),
+
+  # TODO: BUFFER_VIEW is overloaded, it should be removed.
+  # BUFFER_VIEW shares the device buffer with its source, it uses a subbuffer of the underlying source buffer
+
+  (UPat(Ops.BUFFER_VIEW, name="root", src=(UPat.var("x"),)), lambda root,x:
+   # BUFFER_VIEW can replace contiguous, keeping dtype the same
+   (root.dtype == x.dtype) or
+   # it can also replace bitcast, this changes the dtype, but the itemsize stays the same
+   (root.dtype != x.dtype and root.dtype.itemsize == x.dtype.itemsize) or
+   # it can also represent shape changing bitcast (only on DISK)
+   (root.dtype != x.dtype and root.dtype.itemsize != x.dtype.itemsize and x.device.startswith("DISK"))),
 ])
 
 # **** ScheduleItem return type
@@ -444,12 +455,6 @@ def fold_img_cast(ctx:ScheduleContext, xb:UOp, view:UOp, b:UOp, to_cast:UOp, **k
 def sink_outputs(ctx:ScheduleContext, sink:UOp) -> None:
   for x in sink.src: realize(ctx, x.buf_uop, x)
 
-def create_subbuffer(base:UOp, b:UOp, root:UOp, x:UOp):
-  if not root.device.startswith("DISK"): return None
-  if x.op is not Ops.VIEW: x = x.src[-1] # TODO: remove this once forced_realize is gone
-  buffers[b] = x.buf_uop.buffer.view(b.size, b.dtype, unwrap(x.st).views[0].offset*x.dtype.itemsize)
-  return base.replace(src=(b, root.replace(op=Ops.BUFFER_VIEW)))
-
 do_realize = PatternMatcher([
   # always realize sinked ops
   (UPat(Ops.SINK, name="sink"), sink_outputs),
@@ -462,8 +467,6 @@ do_realize = PatternMatcher([
   # realize before COPY or BUFFER_VIEW
   (UPat(Ops.COPY, src=(UPat(), UPat.any(UPatScheduled(), UPatScheduled().view()),)), realize),
   (UPat(Ops.BUFFER_VIEW, src=(UPat.any(UPatScheduled(), UPatScheduled().view()),)), realize),
-  # substitute BITCAST/CONTIGUOUS with BUFFER_VIEW on DISK
-  (UPatScheduled((Ops.BITCAST, Ops.CONTIGUOUS), name="root", src=(UPat.var("x"),)), create_subbuffer),
 ])
 
 # **** rewrite VIEW into LOAD/STORE/VALID or fuse the underlying UOp
@@ -499,6 +502,10 @@ def append_uop(ctx:ScheduleContext, view:UOp, buf_uop:UOp) -> None:
   if (op:=uval(view)).op is Ops.ASSIGN: ctx.assigns.add(buf_uop)
   for x in op.base.src:
     if is_scheduled(x.base): ctx.children.setdefault(x.base.buf_uop, {})[buf_uop] = None
+  # BUFFER_VIEW overrides the underlying buffer
+  # TODO: this should be a shrink on the buffer
+  if op.op is Ops.BUFFER_VIEW:
+    buffers[buf_uop] = (x:=op.src[0]).buf_uop.buffer.view(view.size, view.dtype, unwrap(x.st).views[0].offset*x.dtype.itemsize)
   buf_uop.buffer.ref(1)
 create_ctx = PatternMatcher([(UPat(Ops.VIEW, name="view", src=(UPat(Ops.BUFFER, name="buf_uop"), UPat())), append_uop)])
 
diff --git a/tinygrad/gradient.py b/tinygrad/gradient.py
index 756a9c9785..a2fa71a98d 100644
--- a/tinygrad/gradient.py
+++ b/tinygrad/gradient.py
@@ -37,7 +37,9 @@ pm_gradient = PatternMatcher([
   (UPat(Ops.EXPAND, name="ret"), lambda ctx, ret:
     (ctx.cast(sum_acc_dtype(ctx.dtype)).r(Ops.ADD, tuple(i for i,(si,so) in enumerate(zip(ret.src[0].shape, ret.arg)) if si!=so)).cast(ctx.dtype),)),
 
-  # there's no gradient for bitcast
+  # there's no gradient for...is this ASSIGN?
+  (UPat(Ops.VIEW, src=(UPat(Ops.BUFFER), UPat(Ops.BUFFER_VIEW))), lambda: (None, None)),
+  # also no gradient for bitcast
   (UPat(Ops.BITCAST), lambda ctx: (None,)),
 ])
 
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index 7725a1f506..fabd94bdfb 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -291,7 +291,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     if self.op in GroupOp.Buffer: return vsrc[0] if len(vsrc:=[x.st for x in self.src if x.op is Ops.VIEW]) != 0 else None
     if not (src_sts := [x.st for x in self.src if x.st is not None]): return None
     assert all_same([x.shape for x in src_sts]), f"UOp sources must have the same shape {self} {[x.shape for x in src_sts]}"
-    if self.op in {Ops.BITCAST, Ops.BUFFER_VIEW}:
+    if self.op is Ops.BUFFER_VIEW:
       shape = src_sts[0].shape
       if self.dtype.itemsize != (input_sz:=self.src[0].dtype.itemsize): shape = shape[:-1]+((shape[-1]*input_sz) // self.dtype.itemsize,)
     # only reduce ops are allowed to change shape, everything else derives shape from sources
@@ -365,6 +365,9 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
   def bitcast(self, dtype:DType):
     if self.st is not None and self.shape and ((self.shape[-1]*self.dtype.itemsize)%dtype.itemsize != 0):
       raise RuntimeError(f"unsupported size in bitcast {dtype}")
+    # shape changing bitcast can use a subbuffer on DISK
+    # TODO: this should be moved to realize.py
+    if self._device is not None and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
     return UOp(Ops.BITCAST, dtype, (self,))
   def gep(self, i:Union[tuple[int, ...], int]):
     if isinstance(i, int):
@@ -418,6 +421,10 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     return splitted._reduce_op(op, axis)._reduce_op(op, (len(new_shape),)).reshape(new_shape)  # reduce original axes, then split
   def assign(self, x:UOp): return UOp(Ops.ASSIGN, self.dtype, (self,x))
   def contiguous(self):
+    # TODO: BUFFER_VIEW op should be deleted and subbuffer should be moved to realize.py
+    # NOTE: DISK uses subbuffer because DISK does not render kernels
+    if self.device.startswith("DISK"): return self.alu(Ops.BUFFER_VIEW)
+    # otherwise it's normal CONTIGUOUS
     if not unwrap(self.st).contiguous or self.size != self.base.size or self.base.op is Ops.CONST:
       return self.alu(Ops.CONTIGUOUS)
     forced_realize.add(self.base)

From 611208cd8a61f5cc4b184982b9a829c015d11d76 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Thu, 16 Jan 2025 04:30:11 -0500
Subject: [PATCH 08/16] =?UTF-8?q?Revert=20"Revert=20"move=20subbuffer=20to?=
 =?UTF-8?q?=20a=20rewrite=20rule=20in=20the=20scheduler=20=20(#8639)"=20(?=
 =?UTF-8?q?=E2=80=A6"=20(#8643)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 82ef956cb8fdd7233c78f26927b416c7b03798ee.
---
 tinygrad/engine/schedule.py | 23 ++++++++---------------
 tinygrad/gradient.py        |  4 +---
 tinygrad/ops.py             |  9 +--------
 3 files changed, 10 insertions(+), 26 deletions(-)

diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 7dc36febb8..eec52ae4b6 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -51,17 +51,6 @@ tensor_uop_spec = PatternMatcher([
   # ASSIGN changes the value of a realized buffer
   (UPat(Ops.ASSIGN, name="assign", src=(UPat.var("target"), UPat.var("new_val"))),
    lambda assign,target,new_val: (target.op is Ops.BUFFER or target.is_realized) and (assign.dtype == target.dtype == new_val.dtype)),
-
-  # TODO: BUFFER_VIEW is overloaded, it should be removed.
-  # BUFFER_VIEW shares the device buffer with its source, it uses a subbuffer of the underlying source buffer
-
-  (UPat(Ops.BUFFER_VIEW, name="root", src=(UPat.var("x"),)), lambda root,x:
-   # BUFFER_VIEW can replace contiguous, keeping dtype the same
-   (root.dtype == x.dtype) or
-   # it can also replace bitcast, this changes the dtype, but the itemsize stays the same
-   (root.dtype != x.dtype and root.dtype.itemsize == x.dtype.itemsize) or
-   # it can also represent shape changing bitcast (only on DISK)
-   (root.dtype != x.dtype and root.dtype.itemsize != x.dtype.itemsize and x.device.startswith("DISK"))),
 ])
 
 # **** ScheduleItem return type
@@ -455,6 +444,12 @@ def fold_img_cast(ctx:ScheduleContext, xb:UOp, view:UOp, b:UOp, to_cast:UOp, **k
 def sink_outputs(ctx:ScheduleContext, sink:UOp) -> None:
   for x in sink.src: realize(ctx, x.buf_uop, x)
 
+def create_subbuffer(base:UOp, b:UOp, root:UOp, x:UOp):
+  if not root.device.startswith("DISK"): return None
+  if x.op is not Ops.VIEW: x = x.src[-1] # TODO: remove this once forced_realize is gone
+  buffers[b] = x.buf_uop.buffer.view(b.size, b.dtype, unwrap(x.st).views[0].offset*x.dtype.itemsize)
+  return base.replace(src=(b, root.replace(op=Ops.BUFFER_VIEW)))
+
 do_realize = PatternMatcher([
   # always realize sinked ops
   (UPat(Ops.SINK, name="sink"), sink_outputs),
@@ -467,6 +462,8 @@ do_realize = PatternMatcher([
   # realize before COPY or BUFFER_VIEW
   (UPat(Ops.COPY, src=(UPat(), UPat.any(UPatScheduled(), UPatScheduled().view()),)), realize),
   (UPat(Ops.BUFFER_VIEW, src=(UPat.any(UPatScheduled(), UPatScheduled().view()),)), realize),
+  # substitute BITCAST/CONTIGUOUS with BUFFER_VIEW on DISK
+  (UPatScheduled((Ops.BITCAST, Ops.CONTIGUOUS), name="root", src=(UPat.var("x"),)), create_subbuffer),
 ])
 
 # **** rewrite VIEW into LOAD/STORE/VALID or fuse the underlying UOp
@@ -502,10 +499,6 @@ def append_uop(ctx:ScheduleContext, view:UOp, buf_uop:UOp) -> None:
   if (op:=uval(view)).op is Ops.ASSIGN: ctx.assigns.add(buf_uop)
   for x in op.base.src:
     if is_scheduled(x.base): ctx.children.setdefault(x.base.buf_uop, {})[buf_uop] = None
-  # BUFFER_VIEW overrides the underlying buffer
-  # TODO: this should be a shrink on the buffer
-  if op.op is Ops.BUFFER_VIEW:
-    buffers[buf_uop] = (x:=op.src[0]).buf_uop.buffer.view(view.size, view.dtype, unwrap(x.st).views[0].offset*x.dtype.itemsize)
   buf_uop.buffer.ref(1)
 create_ctx = PatternMatcher([(UPat(Ops.VIEW, name="view", src=(UPat(Ops.BUFFER, name="buf_uop"), UPat())), append_uop)])
 
diff --git a/tinygrad/gradient.py b/tinygrad/gradient.py
index a2fa71a98d..756a9c9785 100644
--- a/tinygrad/gradient.py
+++ b/tinygrad/gradient.py
@@ -37,9 +37,7 @@ pm_gradient = PatternMatcher([
   (UPat(Ops.EXPAND, name="ret"), lambda ctx, ret:
     (ctx.cast(sum_acc_dtype(ctx.dtype)).r(Ops.ADD, tuple(i for i,(si,so) in enumerate(zip(ret.src[0].shape, ret.arg)) if si!=so)).cast(ctx.dtype),)),
 
-  # there's no gradient for...is this ASSIGN?
-  (UPat(Ops.VIEW, src=(UPat(Ops.BUFFER), UPat(Ops.BUFFER_VIEW))), lambda: (None, None)),
-  # also no gradient for bitcast
+  # there's no gradient for bitcast
   (UPat(Ops.BITCAST), lambda ctx: (None,)),
 ])
 
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index fabd94bdfb..7725a1f506 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -291,7 +291,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     if self.op in GroupOp.Buffer: return vsrc[0] if len(vsrc:=[x.st for x in self.src if x.op is Ops.VIEW]) != 0 else None
     if not (src_sts := [x.st for x in self.src if x.st is not None]): return None
     assert all_same([x.shape for x in src_sts]), f"UOp sources must have the same shape {self} {[x.shape for x in src_sts]}"
-    if self.op is Ops.BUFFER_VIEW:
+    if self.op in {Ops.BITCAST, Ops.BUFFER_VIEW}:
       shape = src_sts[0].shape
       if self.dtype.itemsize != (input_sz:=self.src[0].dtype.itemsize): shape = shape[:-1]+((shape[-1]*input_sz) // self.dtype.itemsize,)
     # only reduce ops are allowed to change shape, everything else derives shape from sources
@@ -365,9 +365,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
   def bitcast(self, dtype:DType):
     if self.st is not None and self.shape and ((self.shape[-1]*self.dtype.itemsize)%dtype.itemsize != 0):
       raise RuntimeError(f"unsupported size in bitcast {dtype}")
-    # shape changing bitcast can use a subbuffer on DISK
-    # TODO: this should be moved to realize.py
-    if self._device is not None and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
     return UOp(Ops.BITCAST, dtype, (self,))
   def gep(self, i:Union[tuple[int, ...], int]):
     if isinstance(i, int):
@@ -421,10 +418,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     return splitted._reduce_op(op, axis)._reduce_op(op, (len(new_shape),)).reshape(new_shape)  # reduce original axes, then split
   def assign(self, x:UOp): return UOp(Ops.ASSIGN, self.dtype, (self,x))
   def contiguous(self):
-    # TODO: BUFFER_VIEW op should be deleted and subbuffer should be moved to realize.py
-    # NOTE: DISK uses subbuffer because DISK does not render kernels
-    if self.device.startswith("DISK"): return self.alu(Ops.BUFFER_VIEW)
-    # otherwise it's normal CONTIGUOUS
     if not unwrap(self.st).contiguous or self.size != self.base.size or self.base.op is Ops.CONST:
       return self.alu(Ops.CONTIGUOUS)
     forced_realize.add(self.base)

From 00e5979897d6fcbdadcc5f785d41958d2e7aee34 Mon Sep 17 00:00:00 2001
From: uuuvn <83587632+uuuvn@users.noreply.github.com>
Date: Thu, 16 Jan 2025 11:56:52 +0200
Subject: [PATCH 09/16] Use full soname for libgcc_s in CPUProgram (#8642)

Number after .so is abi version, it is always 1 for libgcc_s.
Most linux systems set default library versions via symlinks that are
simply followed to get actual elf, however conda does it via linker
scripts which ctypes doesn't follow (below contents of libgcc_s.so):
```
/* GNU ld script
   Use the shared library, but some functions are only in
   the static library.  */
GROUP ( libgcc_s.so.1 -lgcc )
```
ctypes.util.find_library thinks that this is the actual elf and
ctypes.CDLL just loads this text file as a shared library. The result
is:
```
  File "/home/me/src/tinygrad/tinygrad/device.py", line 223, in CPUProgram
    helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'gcc_s'))
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/me/miniforge3/envs/tinygrad/lib/python3.12/ctypes/__init__.py", line 379, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: /home/me/miniforge3/envs/tinygrad/lib/libgcc_s.so: invalid ELF header
```
---
 tinygrad/device.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tinygrad/device.py b/tinygrad/device.py
index 530770f0eb..d4782120c1 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -220,7 +220,7 @@ MAP_JIT = 0x0800
 
 # CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
 class CPUProgram:
-  helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'gcc_s'))
+  helper_handle = ctypes.CDLL(ctypes.util.find_library('System') if OSX else 'libgcc_s.so.1')
 
   def __init__(self, name:str, lib:bytes):
     # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/

From 81a84aa85ae539583e0d403b32b8b2f35284f1b1 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Thu, 16 Jan 2025 05:27:47 -0500
Subject: [PATCH 10/16] remove is_unrealized_unmasked_const [pr] (#8644)

---
 test/test_schedule.py | 10 ----------
 tinygrad/ops.py       |  1 -
 tinygrad/tensor.py    |  5 +++--
 3 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/test/test_schedule.py b/test/test_schedule.py
index a0cd1b3272..bff3ba4aad 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -2105,16 +2105,6 @@ class TestConst(unittest.TestCase):
     print(a.lazydata)
     self.assertTrue(tensor_const_pm.rewrite(a.lazydata))
 
-  def test_uop_methods(self):
-    a = Tensor(1)
-    self.assertTrue(a.lazydata.is_unrealized_unmasked_const())
-
-    a = Tensor.ones((4, 4))
-    self.assertTrue(a.lazydata.is_unrealized_unmasked_const())
-
-    a = Tensor.ones((4, 4)).pad((1, 1),)
-    self.assertFalse(a.lazydata.is_unrealized_unmasked_const())
-
   def test_const_schedule(self):
     a = Tensor.ones((4, 4))
     sched = a.schedule()
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index 7725a1f506..e4052bb747 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -447,7 +447,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     # COPY is COPY(DEVICE, copyin.base) -> VIEW(copyin.st)
     return UOp(Ops.COPY, self.base.dtype, (UOp(Ops.DEVICE, arg=device), self.base), clone).view(unwrap(self.st))
   def clone(self) -> UOp: return self.copy_to_device(self.device, clone=True)
-  def is_unrealized_unmasked_const(self): return self.base.op is Ops.CONST and all(v.mask is None for v in unwrap(self.st).views)
   @property
   def lbs(self): return [self]
   @property
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 4acc8192af..1df22659ce 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -3130,9 +3130,10 @@ class Tensor(SimpleMathTrait):
     # broadcast
     return x._broadcast_to(out_shape:=_broadcast_shape(x.shape, y.shape)), y._broadcast_to(out_shape)
 
+  # TODO: tensor should stop checking if things are const
   def _to_const_val(self, x:Union[Tensor, ConstType]) -> Union[Tensor, ConstType]:
-    return x.lazydata.const_arg if isinstance(x, Tensor) and isinstance(x.lazydata, UOp) and x.lazydata.is_unrealized_unmasked_const() \
-      and not x.requires_grad and self._broadcasted(x)[0].shape == self.shape else x
+    return x.lazydata.const_arg if isinstance(x, Tensor) and isinstance(x.lazydata, UOp) and x.lazydata.base.op is Ops.CONST \
+      and unwrap(x.lazydata.st).views[0].mask is None and not x.requires_grad and self._broadcasted(x)[0].shape == self.shape else x
 
   def add(self, x:Union[Tensor, ConstType], reverse=False) -> Tensor:
     """

From f671da675569f65d88719d45d183c6cf4fb8103c Mon Sep 17 00:00:00 2001
From: nimlgen <138685161+nimlgen@users.noreply.github.com>
Date: Thu, 16 Jan 2025 14:47:36 +0300
Subject: [PATCH 11/16] ci: add AM start time to benchmark (#8637)

* ci: add AM start time to benchmark

* am: unlock it

* add AMD

* revert this
---
 .github/workflows/benchmark.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index cccf231dee..dcac986702 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -373,6 +373,10 @@ jobs:
     #  run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py
     - name: Remove amdgpu
       run: sleep 5 && sudo rmmod amdgpu # sleep a bit to let the driver unload the prev pid.
+    - name: Test AM cold start time
+      run: time AMD=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus
+    - name: Test AM warm start time
+      run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
     - name: Run Stable Diffusion
       run: AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
     - name: Run SDXL

From f91ca508cf88b09c616473561f68d2d46fbfcef9 Mon Sep 17 00:00:00 2001
From: nimlgen <138685161+nimlgen@users.noreply.github.com>
Date: Thu, 16 Jan 2025 15:22:27 +0300
Subject: [PATCH 12/16] am: bind for sdma (#8633)

* am: bind for sdma

* fix
---
 tinygrad/runtime/ops_amd.py | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index c7e3935cbf..23d9e9a1bf 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -197,24 +197,44 @@ class AMDCopyQueue(HWQueue):
            *data64_le(signal.timestamp_addr))
     return self
 
+  def bind(self, dev:AMDDevice):
+    if not dev.driverless: return
+
+    self.binded_device = dev
+    self.hw_page = dev.allocator.alloc((qsz:=round_up(len(self._q), 8)) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
+    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
+    for i, value in enumerate(self._q): hw_view[i] = value
+
+    self.indirect_cmd = [amd_gpu.SDMA_OP_INDIRECT | amd_gpu.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz, *data64_le(0)]
+    self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]
+
   def _submit(self, dev:AMDDevice):
     if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
 
+    if self.binded_device == dev:
+      # An IB packet must end on a 8 DW boundary.
+      add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8
+      cmds, cmd_sizes = ([0] * add) + self.indirect_cmd, [len(self.indirect_cmd) + add]
+
+      if len(cmds) * 4 >= (dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes):
+        cmds, cmd_sizes = [0, 0] + self.indirect_cmd, [8]
+    else: cmds, cmd_sizes = self._q, self.internal_cmd_sizes
+
     tail_blit_dword = 0
-    for cmdsz in self.internal_cmd_sizes:
+    for cmdsz in cmd_sizes:
       if (tail_blit_dword + cmdsz) * 4 >= dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes: break
       tail_blit_dword += cmdsz
 
     start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
-    dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', self._q[:tail_blit_dword])
+    dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
     dev.sdma_queue.put_value += tail_blit_dword * 4
 
-    if (rem_packet_cnt := len(self._q) - tail_blit_dword) > 0:
+    if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0:
       zero_fill = dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes
       ctypes.memset(mv_address(dev.sdma_queue.ring) + (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes), 0, zero_fill)
       dev.sdma_queue.put_value += zero_fill
 
-      dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', self._q[tail_blit_dword:])
+      dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
       dev.sdma_queue.put_value += rem_packet_cnt * 4
 
     dev.sdma_queue.write_ptr[0] = dev.sdma_queue.put_value

From 0289fbb1c2338cf404a1da2ea01c6f6903f4a333 Mon Sep 17 00:00:00 2001
From: eliotgolding <177857289+eliotgolding@users.noreply.github.com>
Date: Thu, 16 Jan 2025 21:27:39 +0000
Subject: [PATCH 13/16] limit real_size to the size of first View of
 ShapeTracker (#8628)

* fix real_size

* add fuzzer; typing

* spacing

---------

Co-authored-by: chenyu <chenyu@fastmail.com>
---
 test/external/fuzz_shapetracker_size.py | 13 +++++++++++++
 test/unit/test_shapetracker.py          | 24 ++++++++++++++++++++++++
 tinygrad/shape/shapetracker.py          |  6 ++----
 3 files changed, 39 insertions(+), 4 deletions(-)
 create mode 100644 test/external/fuzz_shapetracker_size.py

diff --git a/test/external/fuzz_shapetracker_size.py b/test/external/fuzz_shapetracker_size.py
new file mode 100644
index 0000000000..dc76f3aecd
--- /dev/null
+++ b/test/external/fuzz_shapetracker_size.py
@@ -0,0 +1,13 @@
+from tinygrad.shape.shapetracker import ShapeTracker
+from test.external.fuzz_shapetracker import shapetracker_ops as st_ops
+from test.unit.test_shapetracker_math import MultiShapeTracker
+from tinygrad.helpers import getenv
+import random
+
+random.seed(getenv("SEED", 42))
+for i in range(getenv("CNT", 2000)):
+  if getenv("DEBUG", 0) >= 1: print()
+  N = random.randint(1, 10000)
+  mst = MultiShapeTracker([ShapeTracker.from_shape((N,))])  # st_ops don't mutate regular shapetrackers for some reason
+  for j in range(20): random.choice(st_ops)(mst)
+  assert mst.sts[0].real_size() <= N, f"{N=}, real_size={mst.sts[0].real_size()}, st={mst.sts[0]}"
diff --git a/test/unit/test_shapetracker.py b/test/unit/test_shapetracker.py
index b2acc24563..b2bada4135 100644
--- a/test/unit/test_shapetracker.py
+++ b/test/unit/test_shapetracker.py
@@ -833,6 +833,30 @@ class TestShapeTrackerSize(unittest.TestCase):
                                                                                strides=(0, 128, 0, 4096, 1), offset=0, mask=None, contiguous=False)))
     self.assertEqual(st.real_size(), 8389632)
 
+  def test_pad_size_simple(self):
+    st = ShapeTracker.from_shape((10,)).pad(((2,4),))
+    self.assertEqual(st.real_size(), 10)
+
+  def test_pad_size_multiview(self):
+    st = ShapeTracker.from_shape((10,10)).pad(((2,4), (3,1))).reshape((16*14,)).stride((17,))
+    self.assertEqual(st.real_size(), 100)
+
+  # TODO improve real_size accuracy in cases like this?
+  @unittest.expectedFailure
+  def test_stride_size(self):
+    st1 = ShapeTracker.from_shape((10,10)).pad(((2,4), (3,1))).reshape((16*14,)).stride((17,))
+    st2 = ShapeTracker.from_shape((10,10)).stride((2,1)).reshape((5*10,)).stride((17,))
+    self.assertEqual(st1.real_size(), 78)
+    self.assertEqual(st2.real_size(), 65)
+
+  def test_stride_size_bounds(self):
+    # lower bound checks that real_size doesn't give false positive for fitting in a buffer
+    # upper bound checks that real_size doesn't exceed N when movementops were applied to from_shape((N,))
+    st1 = ShapeTracker.from_shape((10,10)).pad(((2,4), (3,1))).reshape((16*14,)).stride((17,))
+    st2 = ShapeTracker.from_shape((10,10)).stride((2,1)).reshape((5*10,)).stride((17,))
+    self.assertTrue(78 <= st1.real_size() <= 100)
+    self.assertTrue(65 <= st2.real_size() <= 100)
+
 class TestConsecutive(unittest.TestCase):
   @classmethod
   def setUpClass(self):
diff --git a/tinygrad/shape/shapetracker.py b/tinygrad/shape/shapetracker.py
index a2ed622816..8585ca6ffc 100644
--- a/tinygrad/shape/shapetracker.py
+++ b/tinygrad/shape/shapetracker.py
@@ -72,12 +72,10 @@ class ShapeTracker:
   def to_indexed_uops(self, _idxs:Optional[list[UOp]|tuple[UOp, ...]]=None) -> tuple[UOp, UOp]:
     return views_to_indexed_uops(self.views, tuple(_idxs) if _idxs is not None else None)
 
+  # upper bound on buffer size required to fit this shapetracker
   def real_size(self) -> int:
     if 0 in self.shape: return 0
-    idx, valid = self.to_indexed_uops()
-    if not valid.vmax: return 0
-    assert idx.vmax < 1e12, f"real_size broken for {self}"
-    return int(idx.vmax+1)
+    return int((v.shrink(v.mask) if (v:=self.views[0]).mask else v).to_indexed_uops()[0].vmax + 1)
 
   def vars(self) -> set[Variable]: return set().union(*[v.vars() for v in self.views])
 

From 4f0d1b475951757dc99cba4bc824003ba41110cb Mon Sep 17 00:00:00 2001
From: Mike Ashcroft <MashDevel@users.noreply.github.com>
Date: Thu, 16 Jan 2025 21:24:56 -0500
Subject: [PATCH 14/16] Disable graphs by default if using an intel macbook
 (#8648) (#8649)

---
 tinygrad/helpers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 4346255983..d47f62d7d7 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -101,7 +101,8 @@ class ContextVar:
   def __gt__(self, x): return self.value > x
   def __lt__(self, x): return self.value < x
 
-DEBUG, IMAGE, BEAM, NOOPT, JIT = ContextVar("DEBUG", 0), ContextVar("IMAGE", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0), ContextVar("JIT", 1)
+DEBUG, IMAGE, BEAM, NOOPT = ContextVar("DEBUG", 0), ContextVar("IMAGE", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
+JIT = ContextVar("JIT", 2 if platform.system() == 'Darwin' and ('Intel' in platform.processor() or 'i386' in platform.processor()) else 1)
 WINO, CAPTURING, TRACEMETA = ContextVar("WINO", 0), ContextVar("CAPTURING", 1), ContextVar("TRACEMETA", 1)
 USE_TC, TC_OPT, AMX, TRANSCENDENTAL = ContextVar("TC", 1), ContextVar("TC_OPT", 0), ContextVar("AMX", 0), ContextVar("TRANSCENDENTAL", 1)
 FUSE_ARANGE, FUSE_CONV_BW = ContextVar("FUSE_ARANGE", 0), ContextVar("FUSE_CONV_BW", 0)

From 2b7db9b45db109c53500c7110cbed32aa8c223fa Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Fri, 17 Jan 2025 03:04:18 -0500
Subject: [PATCH 15/16] delete unused cast/bitcast lines from ops.py [pr]
 (#8651)

* move cast and bitcast out

* more deletion of bitcast arg

* fix test_bitcast_fuses

* update tests

* work
---
 test/test_multitensor.py      |  4 ++++
 test/test_schedule.py         |  4 ++--
 test/unit/test_disk_tensor.py |  4 ++--
 tinygrad/engine/schedule.py   |  2 +-
 tinygrad/multi.py             |  2 +-
 tinygrad/ops.py               | 10 ++--------
 tinygrad/tensor.py            |  4 ++--
 7 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index c8325137d7..52a7e11ed2 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -1041,5 +1041,9 @@ class TestTensorOps(unittest.TestCase):
   def test_interpolate(self):
     helper_test_shard_op([(4,16,16),(4,24,24)], lambda x: Tensor.interpolate(x, (19,19)))
 
+  @unittest.expectedFailure # 'MultiLazyBuffer' object has no attribute 'bitcast'
+  def test_bitcast(self):
+    helper_test_shard_op([(256,), (256,)], lambda x: x.bitcast(dtypes.int))
+
 if __name__ == '__main__':
   unittest.main()
diff --git a/test/test_schedule.py b/test/test_schedule.py
index bff3ba4aad..d342ba26e6 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -1344,8 +1344,8 @@ class TestSchedule(unittest.TestCase):
 
   def test_bitcast_fuses(self):
     x = cast(UOp, Tensor.empty(1, dtype=dtypes.float32).realize().lazydata)
-    a = x.alu(Ops.EXP2).cast(dtypes.int32, True)
-    b = x.cast(dtypes.int32, True)
+    a = x.alu(Ops.EXP2).bitcast(dtypes.int32)
+    b = x.bitcast(dtypes.int32)
     b = a.alu(Ops.ADD, b)
     check_schedule(b, 1) # this should fuse when it makes sense
 
diff --git a/test/unit/test_disk_tensor.py b/test/unit/test_disk_tensor.py
index 917a497e9d..7078a994f3 100644
--- a/test/unit/test_disk_tensor.py
+++ b/test/unit/test_disk_tensor.py
@@ -68,8 +68,8 @@ class TestRawDiskBuffer(unittest.TestCase):
     _test_bitcasted(t, dtypes.float32, 3.1415927)
     _test_bitcasted(t, dtypes.uint32, 0x40490FDB)
     # doesn't suport normal cast
-    with self.assertRaises(RuntimeError):
-      Tensor.empty((4,), dtype=dtypes.int16, device=f"disk:{tmp}").cast(dtypes.float16)
+    with self.assertRaises(NotImplementedError):
+      Tensor.empty((4,), dtype=dtypes.int16, device=f"disk:{tmp}").cast(dtypes.float16).realize()
 
     # Those two should be moved to test_dtype.py:test_shape_change_bitcast after bitcast works on non-disk
     with self.assertRaises(RuntimeError):
diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index eec52ae4b6..d879daff54 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -13,7 +13,7 @@ from tinygrad.device import Buffer
 # creation can recurse a lot
 sys.setrecursionlimit(10000)
 
-# **** big graph spec
+# **** Tensor UOp spec
 
 tensor_uop_spec = PatternMatcher([
   (UPat(Ops.DEVICE, dtypes.void, (), name="device"), lambda device: isinstance(device.arg, str)),
diff --git a/tinygrad/multi.py b/tinygrad/multi.py
index 0dbb3520cc..35e5e34dcb 100644
--- a/tinygrad/multi.py
+++ b/tinygrad/multi.py
@@ -76,7 +76,7 @@ class MultiLazyBuffer(MathTrait):
   # passthroughs
   @property
   def is_realized(self) -> bool: return all(lb.base.realized is not None for lb in self.real_lbs)
-  def cast(self, dtype:DType, bitcast:bool=False): return MultiLazyBuffer([x.cast(dtype, bitcast) for x in self.lbs], self.axis, self.real)
+  def cast(self, dtype:DType): return MultiLazyBuffer([x.cast(dtype) for x in self.lbs], self.axis, self.real)
   def const_like(self, b) -> MultiLazyBuffer: return MultiLazyBuffer([x.const_like(b) for x in self.lbs], self.axis, self.real)
   def assign(self, x:MultiLazyBuffer): return MultiLazyBuffer([s.assign(d) for s,d in zip(self.lbs, x.lbs)], self.axis, self.real)
   def contiguous(self): return MultiLazyBuffer([x.contiguous() for x in self.lbs], self.axis, self.real)
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index e4052bb747..62d884bd98 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -358,14 +358,8 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
     assert self.dtype.count == 1
     if count == 1: return self
     return UOp(Ops.VECTORIZE, self.dtype.vec(count), (self,)*count)
-  def cast(self, dtype:DType, bitcast=False):
-    if bitcast: return self.bitcast(dtype)
-    if self._device is not None and self._device.startswith("DISK"): raise RuntimeError("CAST isn't supported on DISK")
-    return UOp(Ops.CAST, dtype, (self,))
-  def bitcast(self, dtype:DType):
-    if self.st is not None and self.shape and ((self.shape[-1]*self.dtype.itemsize)%dtype.itemsize != 0):
-      raise RuntimeError(f"unsupported size in bitcast {dtype}")
-    return UOp(Ops.BITCAST, dtype, (self,))
+  def cast(self, dtype:DType): return UOp(Ops.CAST, dtype, (self,))
+  def bitcast(self, dtype:DType): return UOp(Ops.BITCAST, dtype, (self,))
   def gep(self, i:Union[tuple[int, ...], int]):
     if isinstance(i, int):
       # NOTE: these are just shortcuts to not have to create and fold later
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 1df22659ce..e0a5dc0867 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -3813,8 +3813,8 @@ class Tensor(SimpleMathTrait):
     """
     if self.requires_grad: raise RuntimeError("can't backprop through bitcast")
     dt = to_dtype(dtype)
-    if (not isinstance(self.device, str) or not self.device.startswith("DISK")) and (ns:=dt.itemsize) != (os:=self.dtype.itemsize):
-      if (self.shape[-1]*os) % ns != 0: raise RuntimeError("unsupported size in bitcast")
+    if (ns:=dt.itemsize) != (os:=self.dtype.itemsize) and (self.shape[-1]*os) % ns != 0: raise RuntimeError("unsupported size in bitcast")
+    if (not isinstance(self.device, str) or not self.device.startswith("DISK")) and ns != os:
       new_uint, old_uint = to_dtype(f"uint{8*ns}"), to_dtype(f"uint{8*os}")
       tmp = self.bitcast(old_uint)
       if ns > os: return functools.reduce(Tensor.add, (tmp[..., i::ns//os].cast(new_uint) << 8*i*os for i in range(ns//os))).bitcast(dtype)

From 23f0ff0ed80e99ea330b978f86960e95a9c6d7ba Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Fri, 17 Jan 2025 03:17:19 -0500
Subject: [PATCH 16/16] add bitcast to multi [pr] (#8652)

---
 test/test_multitensor.py | 1 -
 tinygrad/multi.py        | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index 52a7e11ed2..8e96e2ba59 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -1041,7 +1041,6 @@ class TestTensorOps(unittest.TestCase):
   def test_interpolate(self):
     helper_test_shard_op([(4,16,16),(4,24,24)], lambda x: Tensor.interpolate(x, (19,19)))
 
-  @unittest.expectedFailure # 'MultiLazyBuffer' object has no attribute 'bitcast'
   def test_bitcast(self):
     helper_test_shard_op([(256,), (256,)], lambda x: x.bitcast(dtypes.int))
 
diff --git a/tinygrad/multi.py b/tinygrad/multi.py
index 35e5e34dcb..e6e165bb25 100644
--- a/tinygrad/multi.py
+++ b/tinygrad/multi.py
@@ -77,6 +77,7 @@ class MultiLazyBuffer(MathTrait):
   @property
   def is_realized(self) -> bool: return all(lb.base.realized is not None for lb in self.real_lbs)
   def cast(self, dtype:DType): return MultiLazyBuffer([x.cast(dtype) for x in self.lbs], self.axis, self.real)
+  def bitcast(self, dtype:DType): return MultiLazyBuffer([x.bitcast(dtype) for x in self.lbs], self.axis, self.real)
   def const_like(self, b) -> MultiLazyBuffer: return MultiLazyBuffer([x.const_like(b) for x in self.lbs], self.axis, self.real)
   def assign(self, x:MultiLazyBuffer): return MultiLazyBuffer([s.assign(d) for s,d in zip(self.lbs, x.lbs)], self.axis, self.real)
   def contiguous(self): return MultiLazyBuffer([x.contiguous() for x in self.lbs], self.axis, self.real)