Merge branch 'master' into retinanet_mlperf

Francis Lata
2025-01-17 07:51:08 -08:00
22 changed files with 216 additions and 134 deletions


@@ -373,6 +373,10 @@ jobs:
# run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py
- name: Remove amdgpu
run: sleep 5 && sudo rmmod amdgpu # sleep a bit to let the driver unload the prev pid.
- name: Test AM cold start time
run: time AMD=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus
- name: Test AM warm start time
run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
- name: Run Stable Diffusion
run: AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
- name: Run SDXL

docs/developer/am.md Normal file

@@ -0,0 +1,39 @@
# AM Driver
The AM driver is a userspace driver targeting AMD's 7900XTX. With it, tinygrad alone is enough to send compute tasks to your GPU!
## How to run?
Make sure the amdgpu kernel module is unloaded, then just run tinygrad with `AMD=1`! A minimal run is sketched below the optional requirements.
Optional requirements:
* System without IOMMU for P2P / SDMA support
* vfio-pci module for IRQ handling
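
A smoke-test sketch (it assumes a supported 7900XTX and that amdgpu is already unloaded; the env var must be set before tinygrad is imported):
```python
# smoke-test sketch: select the AM backend before importing tinygrad
import os
os.environ["AMD"] = "1"

from tinygrad import Tensor
print((Tensor([1.0, 2.0]) + 1).tolist())  # expect [2.0, 3.0]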
## Environment Variables
| Variable | Possible Value(s) | Description |
|----------|------------------|-------------|
| AM_RESET | [1] | Performs a full GPU reset (reloading all firmware and IP blocks) |
| AM_DEBUG | [0-4] | Sets the level of additional debugging information |
## AM Driver Details
### Compute & SDMA Queues
AM binds compute queues directly to MEC (bypassing MES). Tinygrad uses only one compute queue, which is bound at `pipe=0 queue=0`. Similarly, the single SDMA queue is bound at `engine=0 queue=0`.
### Boot
The GPU being passed to AM can be in one of three states:
1. Not initialized
2. Initialized by AMDGPU
3. Initialized by AM
The first and second states require a full GPU setup, since the hardware state is unknown to AM. The second state additionally requires a mode1 reset to reinitialize all components.
The third state can be set up partially to optimize boot time. In this case, only the GFX and SDMA IPs need to be initialized. To enable this, AM uses a separate boot memory that is guaranteed not to be overwritten. This physical memory is utilized for all blocks that are initialized only during the initial AM boot. To determine if the GPU is in the third state, AM uses `regSCRATCH_REG7` as a flag.
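Sketched as pseudocode (only `regSCRATCH_REG7` is AM's actual mechanism here; `AM_MAGIC` and the helper names are illustrative stand-ins):
```python
# illustrative boot-path sketch, not AM's actual code
def boot(dev, am_reset=False):
    if am_reset or dev.rreg("regSCRATCH_REG7") != AM_MAGIC:
        # states 1 and 2: hardware state unknown, do the full setup
        dev.mode1_reset()                      # strictly needed only for state 2
        dev.init_ip_blocks(ALL_IPS)            # firmware + IP blocks, using AM's boot memory
        dev.wreg("regSCRATCH_REG7", AM_MAGIC)  # mark the GPU as AM-initialized
    else:
        # state 3: partial setup, only GFX and SDMA come up again
        dev.init_ip_blocks(["GFX", "SDMA"])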
### VM Management
Each AM device sets up only a single `VMID=0` and one page directory. The page directory used is 3-level and thus supports up to 512TB of virtual addresses. All AM devices are located in one virtual address space.


@@ -27,6 +27,7 @@ nav:
- Runtime:
- developer/runtime.md
- HCQ: developer/hcq.md
- AM Driver: developer/am.md
- tinybox: tinybox.md
#- tinygrad: reference/

test/external/fuzz_shapetracker_size.py vendored Normal file

@@ -0,0 +1,13 @@
from tinygrad.shape.shapetracker import ShapeTracker
from test.external.fuzz_shapetracker import shapetracker_ops as st_ops
from test.unit.test_shapetracker_math import MultiShapeTracker
from tinygrad.helpers import getenv
import random
random.seed(getenv("SEED", 42))
for i in range(getenv("CNT", 2000)):
if getenv("DEBUG", 0) >= 1: print()
N = random.randint(1, 10000)
mst = MultiShapeTracker([ShapeTracker.from_shape((N,))]) # st_ops don't mutate regular shapetrackers for some reason
for j in range(20): random.choice(st_ops)(mst)
assert mst.sts[0].real_size() <= N, f"{N=}, real_size={mst.sts[0].real_size()}, st={mst.sts[0]}"
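
All knobs of the new fuzzer are environment variables (`SEED`, `CNT`, `DEBUG`); one way to drive it, as a sketch:
```python
# equivalent to: SEED=123 CNT=500 DEBUG=1 python3 test/external/fuzz_shapetracker_size.py
import os, subprocess
subprocess.run(["python3", "test/external/fuzz_shapetracker_size.py"],
               env={**os.environ, "SEED": "123", "CNT": "500", "DEBUG": "1"}, check=True)
```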


@@ -1041,5 +1041,8 @@ class TestTensorOps(unittest.TestCase):
def test_interpolate(self):
helper_test_shard_op([(4,16,16),(4,24,24)], lambda x: Tensor.interpolate(x, (19,19)))
def test_bitcast(self):
helper_test_shard_op([(256,), (256,)], lambda x: x.bitcast(dtypes.int))
if __name__ == '__main__':
unittest.main()


@@ -13,7 +13,7 @@ from tinygrad.device import is_dtype_supported
from tinygrad.dtype import DType, ImageDType
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic_simple, merge_views
from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, symbolic_simple, merge_views
from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, getenv, SPLIT_REDUCEOP, unwrap, prod, Context
from tinygrad.codegen.kernel import verify_ast
from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars, view_right, view_left, remove_movement_ops
@@ -1344,8 +1344,8 @@ class TestSchedule(unittest.TestCase):
def test_bitcast_fuses(self):
x = cast(UOp, Tensor.empty(1, dtype=dtypes.float32).realize().lazydata)
a = x.alu(Ops.EXP2).cast(dtypes.int32, True)
b = x.cast(dtypes.int32, True)
a = x.alu(Ops.EXP2).bitcast(dtypes.int32)
b = x.bitcast(dtypes.int32)
b = a.alu(Ops.ADD, b)
check_schedule(b, 1) # this should fuse when it makes sense
@@ -1630,7 +1630,8 @@ class TestIndexing(unittest.TestCase):
a[0] = 6
np.testing.assert_equal(a.numpy(), [6., 2., 3., 4.])
@unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view")
#@unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view")
@unittest.skip("BUFFER_VIEW no longer supported on non-disk devices")
def test_arange_view_op(self):
a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).contiguous()
sched = self.check_schedule(a, 1)
@@ -2104,16 +2105,6 @@ class TestConst(unittest.TestCase):
print(a.lazydata)
self.assertTrue(tensor_const_pm.rewrite(a.lazydata))
def test_uop_methods(self):
a = Tensor(1)
self.assertTrue(a.lazydata.is_unrealized_unmasked_const())
a = Tensor.ones((4, 4))
self.assertTrue(a.lazydata.is_unrealized_unmasked_const())
a = Tensor.ones((4, 4)).pad((1, 1),)
self.assertFalse(a.lazydata.is_unrealized_unmasked_const())
def test_const_schedule(self):
a = Tensor.ones((4, 4))
sched = a.schedule()


@@ -1,6 +1,5 @@
import unittest
from tinygrad import Device, Tensor, TinyJit, Variable, dtypes
from tinygrad.helpers import CI
from tinygrad import Tensor, TinyJit, Variable, dtypes
import numpy as np
class TestSetitem(unittest.TestCase):
@@ -139,12 +138,7 @@ class TestSetitem(unittest.TestCase):
def test_setitem_overlapping_inplace1(self):
t = Tensor([[3.0], [2.0], [1.0]]).contiguous()
t[1:] = t[:-1]
if (Device.DEFAULT == "LLVM") or (CI and Device.DEFAULT == "AMD"):
# TODO: FIXME
with self.assertRaises(AssertionError):
self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
else:
self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
def test_setitem_overlapping_inplace2(self):
t = Tensor([[3.0], [2.0], [1.0]]).contiguous()


@@ -84,6 +84,14 @@ class TestTensorUOp(unittest.TestCase):
sched = empty.schedule()
self.assertEqual(len(sched), 0)
def test_contiguous_folded_alu(self):
a = Tensor.empty(8, 8)
# NOTE: the buffer for mul_0 late folds to just a CONST
mul_0 = a*0
out = mul_0.shrink(((4, 8), (0, 8))).contiguous()
out.realize()
self.assertEqual(out.tolist(), Tensor.zeros(4, 8).tolist())
reduce_kernel = UPat(Ops.SINK, src=(UPat(Ops.STORE, src=(UPat(), UPat(), UPat(Ops.REDUCE_AXIS)))))
class TestReduceOp(unittest.TestCase):
def test_no_split_reduce_kernel(self):


@@ -68,8 +68,8 @@ class TestRawDiskBuffer(unittest.TestCase):
_test_bitcasted(t, dtypes.float32, 3.1415927)
_test_bitcasted(t, dtypes.uint32, 0x40490FDB)
# doesn't support normal cast
with self.assertRaises(RuntimeError):
Tensor.empty((4,), dtype=dtypes.int16, device=f"disk:{tmp}").cast(dtypes.float16)
with self.assertRaises(NotImplementedError):
Tensor.empty((4,), dtype=dtypes.int16, device=f"disk:{tmp}").cast(dtypes.float16).realize()
# Those two should be moved to test_dtype.py:test_shape_change_bitcast after bitcast works on non-disk
with self.assertRaises(RuntimeError):


@@ -833,6 +833,30 @@ class TestShapeTrackerSize(unittest.TestCase):
strides=(0, 128, 0, 4096, 1), offset=0, mask=None, contiguous=False)))
self.assertEqual(st.real_size(), 8389632)
def test_pad_size_simple(self):
st = ShapeTracker.from_shape((10,)).pad(((2,4),))
self.assertEqual(st.real_size(), 10)
def test_pad_size_multiview(self):
st = ShapeTracker.from_shape((10,10)).pad(((2,4), (3,1))).reshape((16*14,)).stride((17,))
self.assertEqual(st.real_size(), 100)
# TODO improve real_size accuracy in cases like this?
@unittest.expectedFailure
def test_stride_size(self):
st1 = ShapeTracker.from_shape((10,10)).pad(((2,4), (3,1))).reshape((16*14,)).stride((17,))
st2 = ShapeTracker.from_shape((10,10)).stride((2,1)).reshape((5*10,)).stride((17,))
self.assertEqual(st1.real_size(), 78)
self.assertEqual(st2.real_size(), 65)
def test_stride_size_bounds(self):
# lower bound checks that real_size doesn't give false positive for fitting in a buffer
# upper bound checks that real_size doesn't exceed N when movementops were applied to from_shape((N,))
st1 = ShapeTracker.from_shape((10,10)).pad(((2,4), (3,1))).reshape((16*14,)).stride((17,))
st2 = ShapeTracker.from_shape((10,10)).stride((2,1)).reshape((5*10,)).stride((17,))
self.assertTrue(78 <= st1.real_size() <= 100)
self.assertTrue(65 <= st2.real_size() <= 100)
class TestConsecutive(unittest.TestCase):
@classmethod
def setUpClass(self):


@@ -2,8 +2,10 @@ from __future__ import annotations
from dataclasses import dataclass, replace
from collections import defaultdict
from typing import Optional, Any, Iterator, Generator
import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, contextlib, sys, re, atexit, pickle, decimal, time
from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp
import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, ctypes.util, platform, contextlib, sys, re, atexit, pickle, decimal, time
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp, mv_address, \
cpu_time_execution
from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes
from tinygrad.renderer import Renderer
@@ -213,6 +215,40 @@ class _MallocAllocator(LRUAllocator):
MallocAllocator = _MallocAllocator()
# NOTE: MAP_JIT is added to mmap module in python 3.13
MAP_JIT = 0x0800
# CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
class CPUProgram:
helper_handle = ctypes.CDLL(ctypes.util.find_library('System') if OSX else 'libgcc_s.so.1')
def __init__(self, name:str, lib:bytes):
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
self.mem.write(lib)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
def __call__(self, *bufs, vals=(), wait=False):
args = list(bufs) + list(vals)
# NOTE: replace this by --target={host's triple}-elf in clang args once we only support macos sequoia and later.
# Apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64
# https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
# This hack is required because clang/llvm bug doesn't allow us to just use {host's triple}+'-elf' (relocation failures)
# The bug was fixed in https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a but was only backported to xcode 16+
if platform.machine() == "arm64" and OSX: args = args[:8] + [ctypes.c_int64(a) if isinstance(a, int) else a for a in args[8:]]
return cpu_time_execution(lambda: self.fxn(*args), enable=wait)
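
End to end, the moved class is used like any other Compiled-device program; a rough sketch (the kernel source and buffer handling here are illustrative):
```python
# sketch: compile a flat binary with ClangJITCompiler, then jump into it
from tinygrad.runtime.ops_clang import ClangJITCompiler
from tinygrad.device import CPUProgram, MallocAllocator

lib = ClangJITCompiler().compile("void add_one(float* a) { a[0] += 1.0f; }")
buf = MallocAllocator.alloc(4)      # one float32
prog = CPUProgram("add_one", lib)
elapsed = prog(buf, wait=True)      # wait=True returns wall time via cpu_time_execution
```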
# **************** for Compiled Devices ****************
class CompileError(Exception): pass


@@ -13,7 +13,7 @@ from tinygrad.device import Buffer
# creation can recurse a lot
sys.setrecursionlimit(10000)
# **** big graph spec
# **** Tensor UOp spec
tensor_uop_spec = PatternMatcher([
(UPat(Ops.DEVICE, dtypes.void, (), name="device"), lambda device: isinstance(device.arg, str)),
@@ -51,17 +51,6 @@ tensor_uop_spec = PatternMatcher([
# ASSIGN changes the value of a realized buffer
(UPat(Ops.ASSIGN, name="assign", src=(UPat.var("target"), UPat.var("new_val"))),
lambda assign,target,new_val: (target.op is Ops.BUFFER or target.is_realized) and (assign.dtype == target.dtype == new_val.dtype)),
# TODO: BUFFER_VIEW is overloaded, it should be removed.
# BUFFER_VIEW shares the device buffer with its source, it uses a subbuffer of the underlying source buffer
(UPat(Ops.BUFFER_VIEW, name="root", src=(UPat.var("x"),)), lambda root,x:
# BUFFER_VIEW can replace contiguous, keeping dtype the same
(root.dtype == x.dtype) or
# it can also replace bitcast, this changes the dtype, but the itemsize stays the same
(root.dtype != x.dtype and root.dtype.itemsize == x.dtype.itemsize) or
# it can also represent shape changing bitcast (only on DISK)
(root.dtype != x.dtype and root.dtype.itemsize != x.dtype.itemsize and x.device.startswith("DISK"))),
])
# **** ScheduleItem return type
@@ -455,6 +444,12 @@ def fold_img_cast(ctx:ScheduleContext, xb:UOp, view:UOp, b:UOp, to_cast:UOp, **k
def sink_outputs(ctx:ScheduleContext, sink:UOp) -> None:
for x in sink.src: realize(ctx, x.buf_uop, x)
def create_subbuffer(base:UOp, b:UOp, root:UOp, x:UOp):
if not root.device.startswith("DISK"): return None
if x.op is not Ops.VIEW: x = x.src[-1] # TODO: remove this once forced_realize is gone
buffers[b] = x.buf_uop.buffer.view(b.size, b.dtype, unwrap(x.st).views[0].offset*x.dtype.itemsize)
return base.replace(src=(b, root.replace(op=Ops.BUFFER_VIEW)))
do_realize = PatternMatcher([
# always realize sinked ops
(UPat(Ops.SINK, name="sink"), sink_outputs),
@@ -467,6 +462,8 @@ do_realize = PatternMatcher([
# realize before COPY or BUFFER_VIEW
(UPat(Ops.COPY, src=(UPat(), UPat.any(UPatScheduled(), UPatScheduled().view()),)), realize),
(UPat(Ops.BUFFER_VIEW, src=(UPat.any(UPatScheduled(), UPatScheduled().view()),)), realize),
# substitute BITCAST/CONTIGUOUS with BUFFER_VIEW on DISK
(UPatScheduled((Ops.BITCAST, Ops.CONTIGUOUS), name="root", src=(UPat.var("x"),)), create_subbuffer),
])
# **** rewrite VIEW into LOAD/STORE/VALID or fuse the underlying UOp
@@ -502,10 +499,6 @@ def append_uop(ctx:ScheduleContext, view:UOp, buf_uop:UOp) -> None:
if (op:=uval(view)).op is Ops.ASSIGN: ctx.assigns.add(buf_uop)
for x in op.base.src:
if is_scheduled(x.base): ctx.children.setdefault(x.base.buf_uop, {})[buf_uop] = None
# BUFFER_VIEW overrides the underlying buffer
# TODO: this should be a shrink on the buffer
if op.op is Ops.BUFFER_VIEW:
buffers[buf_uop] = (x:=op.src[0]).buf_uop.buffer.view(view.size, view.dtype, unwrap(x.st).views[0].offset*x.dtype.itemsize)
buf_uop.buffer.ref(1)
create_ctx = PatternMatcher([(UPat(Ops.VIEW, name="view", src=(UPat(Ops.BUFFER, name="buf_uop"), UPat())), append_uop)])
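
The effect of centralizing this in `create_subbuffer`: BUFFER_VIEW is now produced only for DISK tensors, where a bitcast or contiguous can alias the source buffer instead of scheduling a kernel. A hedged sketch of what now takes this path (assuming `/tmp/x` exists and holds at least 8 bytes):
```python
# on DISK, a bitcast realizes as an Ops.BUFFER_VIEW subbuffer, not a compute kernel
from tinygrad import Tensor, dtypes
t = Tensor.empty((4,), dtype=dtypes.int16, device="disk:/tmp/x")
u = t.bitcast(dtypes.uint8)   # shape-changing bitcast, shares the underlying buffer
```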


@@ -37,9 +37,7 @@ pm_gradient = PatternMatcher([
(UPat(Ops.EXPAND, name="ret"), lambda ctx, ret:
(ctx.cast(sum_acc_dtype(ctx.dtype)).r(Ops.ADD, tuple(i for i,(si,so) in enumerate(zip(ret.src[0].shape, ret.arg)) if si!=so)).cast(ctx.dtype),)),
# there's no gradient for...is this ASSIGN?
(UPat(Ops.VIEW, src=(UPat(Ops.BUFFER), UPat(Ops.BUFFER_VIEW))), lambda: (None, None)),
# also no gradient for bitcast
# there's no gradient for bitcast
(UPat(Ops.BITCAST), lambda ctx: (None,)),
])


@@ -101,7 +101,8 @@ class ContextVar:
def __gt__(self, x): return self.value > x
def __lt__(self, x): return self.value < x
DEBUG, IMAGE, BEAM, NOOPT, JIT = ContextVar("DEBUG", 0), ContextVar("IMAGE", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0), ContextVar("JIT", 1)
DEBUG, IMAGE, BEAM, NOOPT = ContextVar("DEBUG", 0), ContextVar("IMAGE", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
JIT = ContextVar("JIT", 2 if platform.system() == 'Darwin' and ('Intel' in platform.processor() or 'i386' in platform.processor()) else 1)
WINO, CAPTURING, TRACEMETA = ContextVar("WINO", 0), ContextVar("CAPTURING", 1), ContextVar("TRACEMETA", 1)
USE_TC, TC_OPT, AMX, TRANSCENDENTAL = ContextVar("TC", 1), ContextVar("TC_OPT", 0), ContextVar("AMX", 0), ContextVar("TRANSCENDENTAL", 1)
FUSE_ARANGE, FUSE_CONV_BW = ContextVar("FUSE_ARANGE", 0), ContextVar("FUSE_CONV_BW", 0)
@@ -267,6 +268,15 @@ def cpu_objdump(lib, objdump_tool='objdump'):
pathlib.Path(f.name).write_bytes(lib)
print(subprocess.check_output([objdump_tool, '-d', f.name]).decode('utf-8'))
def capstone_flatdump(lib: bytes):
import capstone
match platform.machine():
case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
for instr in cs.disasm(lib, 0):
print(f"{instr.address:#08x}: {instr.mnemonic}\t{instr.op_str}")
# *** ctypes helpers
# TODO: make this work with read only memoryviews (if possible)


@@ -76,7 +76,8 @@ class MultiLazyBuffer(MathTrait):
# passthroughs
@property
def is_realized(self) -> bool: return all(lb.base.realized is not None for lb in self.real_lbs)
def cast(self, dtype:DType, bitcast:bool=False): return MultiLazyBuffer([x.cast(dtype, bitcast) for x in self.lbs], self.axis, self.real)
def cast(self, dtype:DType): return MultiLazyBuffer([x.cast(dtype) for x in self.lbs], self.axis, self.real)
def bitcast(self, dtype:DType): return MultiLazyBuffer([x.bitcast(dtype) for x in self.lbs], self.axis, self.real)
def const_like(self, b) -> MultiLazyBuffer: return MultiLazyBuffer([x.const_like(b) for x in self.lbs], self.axis, self.real)
def assign(self, x:MultiLazyBuffer): return MultiLazyBuffer([s.assign(d) for s,d in zip(self.lbs, x.lbs)], self.axis, self.real)
def contiguous(self): return MultiLazyBuffer([x.contiguous() for x in self.lbs], self.axis, self.real)


@@ -291,7 +291,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
if self.op in GroupOp.Buffer: return vsrc[0] if len(vsrc:=[x.st for x in self.src if x.op is Ops.VIEW]) != 0 else None
if not (src_sts := [x.st for x in self.src if x.st is not None]): return None
assert all_same([x.shape for x in src_sts]), f"UOp sources must have the same shape {self} {[x.shape for x in src_sts]}"
if self.op is Ops.BUFFER_VIEW:
if self.op in {Ops.BITCAST, Ops.BUFFER_VIEW}:
shape = src_sts[0].shape
if self.dtype.itemsize != (input_sz:=self.src[0].dtype.itemsize): shape = shape[:-1]+((shape[-1]*input_sz) // self.dtype.itemsize,)
# only reduce ops are allowed to change shape, everything else derives shape from sources
@@ -358,17 +358,8 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
assert self.dtype.count == 1
if count == 1: return self
return UOp(Ops.VECTORIZE, self.dtype.vec(count), (self,)*count)
def cast(self, dtype:DType, bitcast=False):
if bitcast: return self.bitcast(dtype)
if self._device is not None and self._device.startswith("DISK"): raise RuntimeError("CAST isn't supported on DISK")
return UOp(Ops.CAST, dtype, (self,))
def bitcast(self, dtype:DType):
if self.st is not None and self.shape and ((self.shape[-1]*self.dtype.itemsize)%dtype.itemsize != 0):
raise RuntimeError(f"unsupported size in bitcast {dtype}")
# shape changing bitcast can use a subbuffer on DISK
# TODO: this should be moved to realize.py
if self.can_view() and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
return UOp(Ops.BITCAST, dtype, (self,))
def cast(self, dtype:DType): return UOp(Ops.CAST, dtype, (self,))
def bitcast(self, dtype:DType): return UOp(Ops.BITCAST, dtype, (self,))
def gep(self, i:Union[tuple[int, ...], int]):
if isinstance(i, int):
# NOTE: these are just shortcuts to not have to create and fold later
@@ -420,9 +411,9 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
if DEBUG >= 3: print(f"split {divisor}: {self.shape} -> {splitted.shape} -> {new_shape}")
return splitted._reduce_op(op, axis)._reduce_op(op, (len(new_shape),)).reshape(new_shape) # reduce original axes, then split
def assign(self, x:UOp): return UOp(Ops.ASSIGN, self.dtype, (self,x))
def contiguous(self, allow_buffer_view=True):
def contiguous(self):
if not unwrap(self.st).contiguous or self.size != self.base.size or self.base.op is Ops.CONST:
return self.alu(Ops.BUFFER_VIEW if allow_buffer_view and self.can_view() else Ops.CONTIGUOUS)
return self.alu(Ops.CONTIGUOUS)
forced_realize.add(self.base)
return self
@@ -450,10 +441,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
# COPY is COPY(DEVICE, copyin.base) -> VIEW(copyin.st)
return UOp(Ops.COPY, self.base.dtype, (UOp(Ops.DEVICE, arg=device), self.base), clone).view(unwrap(self.st))
def clone(self) -> UOp: return self.copy_to_device(self.device, clone=True)
def is_unrealized_unmasked_const(self): return self.base.op is Ops.CONST and all(v.mask is None for v in unwrap(self.st).views)
def can_view(self):
return (self.st is not None and self._device is not None and self.st.consecutive and self.base.op is not Ops.CONST and
not isinstance(self.dtype, ImageDType) and self.device.split(":")[0] in view_supported_devices)
@property
def lbs(self): return [self]
@property


@@ -63,8 +63,8 @@ def render_wmma(ctx: "PTXRenderer", x: UOp):
dt_map = { dtypes.half: "f16" }
_i = 0
for vv in x.src[:2]:
for i in range(0, len(ctx.r[vv]), 2):
yield f"mov.b32 {ctx.wmma_r[_i]}, {{{', '.join(ctx.r[vv][i:i+2])}}};"
for i in range(0, len(ctx.r[vv]), (elems_per_reg := 4//dtype_in.itemsize)):
yield f"mov.b32 {ctx.wmma_r[_i]}, " + (f"{{{', '.join(ctx.r[vv][i:i+elems_per_reg])}}}" if elems_per_reg > 1 else ctx.r[vv][i]) + ";"
_i += 1
yield f'mma.sync.aligned.m{M}n{N}k{K}.row.col.f32.{dt_map[dtype_in]}.{dt_map[dtype_in]}.f32{" "*12}' +\
f'{{{", ".join(ctx.r[x])}}}, {{{", ".join(ctx.wmma_r[:n_operands[0]])}}}, {{{", ".join(ctx.wmma_r[-n_operands[1]:])}}}, ' + \
@@ -184,7 +184,7 @@ class PTXRenderer(Renderer):
r[u] = [ssa('val', dtype=self.types[u.dtype.scalar()]) for _ in range(u.dtype.count)] if u.dtype.count > 1 else ssa('val', u)
elif u.op is Ops.DEFINE_GLOBAL: bufs.append((f"data{u.arg}", u.dtype))
elif u.op is Ops.WMMA:
self.wmma_r = [ssa("wmma", dtype="b32") for vv in u.src[:2] for i in range(0, len(r[vv]), 2)]
self.wmma_r = [ssa("wmma", dtype="b32") for vv in u.src[:2] for i in range(0, len(r[vv]), 4//u.arg[2].itemsize)]
r[u] = [ssa("wmma", dtype=self.types[u.dtype.scalar()]) for _ in range(u.dtype.count)]
prefix, dtype = {Ops.CAST: ("cast", None), Ops.BITCAST: ("cast", None), Ops.ENDRANGE: ("pred", "pred"), Ops.RANGE: ("ridx", None),
Ops.DEFINE_ACC: ("acc", None), Ops.DEFINE_VAR: ("dat", None), Ops.CONST: ("const", None), Ops.DEFINE_LOCAL:("local",self.types[dtypes.ulong]),
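
The arithmetic behind both hunks: one 32-bit `.b32` register holds `4 // itemsize` scalar inputs, so half inputs keep the old brace-packed pairs while 4-byte inputs map one register per element. A quick check of that claim:
```python
# elements packed per .b32 WMMA input register, per the change above
for name, itemsize in [("half", 2), ("float/tf32", 4)]:
    print(name, 4 // itemsize)  # half -> 2 (mov.b32 {lo, hi}), float -> 1 (bare mov.b32)
```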


@@ -197,24 +197,44 @@ class AMDCopyQueue(HWQueue):
*data64_le(signal.timestamp_addr))
return self
def bind(self, dev:AMDDevice):
if not dev.driverless: return
self.binded_device = dev
self.hw_page = dev.allocator.alloc((qsz:=round_up(len(self._q), 8)) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
for i, value in enumerate(self._q): hw_view[i] = value
self.indirect_cmd = [amd_gpu.SDMA_OP_INDIRECT | amd_gpu.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz, *data64_le(0)]
self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]
def _submit(self, dev:AMDDevice):
if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
if self.binded_device == dev:
# An IB packet must end on an 8 DW boundary.
add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8
cmds, cmd_sizes = ([0] * add) + self.indirect_cmd, [len(self.indirect_cmd) + add]
if len(cmds) * 4 >= (dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes):
cmds, cmd_sizes = [0, 0] + self.indirect_cmd, [8]
else: cmds, cmd_sizes = self._q, self.internal_cmd_sizes
tail_blit_dword = 0
for cmdsz in self.internal_cmd_sizes:
for cmdsz in cmd_sizes:
if (tail_blit_dword + cmdsz) * 4 >= dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes: break
tail_blit_dword += cmdsz
start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', self._q[:tail_blit_dword])
dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
dev.sdma_queue.put_value += tail_blit_dword * 4
if (rem_packet_cnt := len(self._q) - tail_blit_dword) > 0:
if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0:
zero_fill = dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes
ctypes.memset(mv_address(dev.sdma_queue.ring) + (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes), 0, zero_fill)
dev.sdma_queue.put_value += zero_fill
dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', self._q[tail_blit_dword:])
dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
dev.sdma_queue.put_value += rem_packet_cnt * 4
dev.sdma_queue.write_ptr[0] = dev.sdma_queue.put_value
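
A quick sanity check of the new padding arithmetic: the indirect command built in `bind` is 6 DWs (a header, two base-address DWs, a size DW, and two trailing zero DWs), and `add` is chosen so the packet ends on an 8-DW boundary for any 4-byte-aligned write pointer:
```python
# mirror of the `add` expression in _submit, checked over a full 32-byte window
for put_value in range(0, 32, 4):
    start_dw = (put_value % 32) // 4
    add = (8 - (start_dw + 6 % 8)) % 8
    assert (start_dw + add + 6) % 8 == 0  # padded IB packet ends on an 8 DW boundary
```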


@@ -1,13 +1,9 @@
import ctypes, ctypes.util, struct, platform, tempfile, pathlib, subprocess
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
from tinygrad.helpers import OSX, mv_address, cpu_time_execution, cpu_objdump
from tinygrad.device import Compiled, Compiler, MallocAllocator
from tinygrad.runtime.support.elf import elf_loader, relocate
import platform, tempfile, pathlib, subprocess
from tinygrad.helpers import cpu_objdump, capstone_flatdump
from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
from tinygrad.runtime.support.elf import jit_loader
from tinygrad.renderer.cstyle import ClangRenderer
# NOTE: MAP_JIT is added to mmap module in python 3.13
MAP_JIT = 0x0800
# Used by ops_dsp.py
class ClangCompiler(Compiler):
def __init__(self, cachekey="compile_clang", args:list[str]|None=None, objdump_tool='objdump'):
@@ -33,51 +29,9 @@ class ClangJITCompiler(Compiler):
args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else []
obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
image, _, relocs = elf_loader(obj)
# This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved.
for ploc,tgt,r_type,r_addend in relocs:
image[ploc:ploc+4] = struct.pack("<I", relocate(struct.unpack("<I", image[ploc:ploc+4])[0], ploc, tgt+r_addend, r_type))
return bytes(image)
return jit_loader(obj)
def disassemble(self, lib):
import capstone
match platform.machine():
case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
for instr in cs.disasm(lib, 0):
print(f"{instr.address:#08x}: {instr.mnemonic}\t{instr.op_str}")
# CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
class CPUProgram:
helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'gcc_s'))
def __init__(self, name:str, lib:bytes):
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
self.mem.write(lib)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
def __call__(self, *bufs, vals=(), wait=False):
args = list(bufs) + list(vals)
# NOTE: replace this by --target={host's triple}-elf in clang args once we only support macos sequoia and later.
# Apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64
# https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
# This hack is required because clang/llvm bug doesn't allow us to just use {host's triple}+'-elf' (relocation failures)
# The bug was fixed in https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a but was only backported to xcode 16+
if platform.machine() == "arm64" and OSX: args = args[:8] + [ctypes.c_int64(a) if isinstance(a, int) else a for a in args[8:]]
return cpu_time_execution(lambda: self.fxn(*args), enable=wait)
def disassemble(self, lib:bytes): return capstone_flatdump(lib)
class ClangDevice(Compiled):
def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram)


@@ -1,4 +1,4 @@
import tinygrad.runtime.autogen.libc as libc
import struct, tinygrad.runtime.autogen.libc as libc
from dataclasses import dataclass
from tinygrad.helpers import getbits, i2u
@@ -51,3 +51,10 @@ def relocate(instr: int, ploc: int, tgt: int, r_type: int):
case libc.R_AARCH64_LDST64_ABS_LO12_NC: return instr | (getbits(tgt, 3, 11) << 10)
case libc.R_AARCH64_LDST128_ABS_LO12_NC: return instr | (getbits(tgt, 4, 11) << 10)
raise NotImplementedError(f"Encountered unknown relocation type {r_type}")
def jit_loader(obj: bytes) -> bytes:
image, _, relocs = elf_loader(obj)
# This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved.
for ploc,tgt,r_type,r_addend in relocs:
image[ploc:ploc+4] = struct.pack("<I", relocate(struct.unpack("<I", image[ploc:ploc+4])[0], ploc, tgt+r_addend, r_type))
return bytes(image)


@@ -72,12 +72,10 @@ class ShapeTracker:
def to_indexed_uops(self, _idxs:Optional[list[UOp]|tuple[UOp, ...]]=None) -> tuple[UOp, UOp]:
return views_to_indexed_uops(self.views, tuple(_idxs) if _idxs is not None else None)
# upper bound on buffer size required to fit this shapetracker
def real_size(self) -> int:
if 0 in self.shape: return 0
idx, valid = self.to_indexed_uops()
if not valid.vmax: return 0
assert idx.vmax < 1e12, f"real_size broken for {self}"
return int(idx.vmax+1)
return int((v.shrink(v.mask) if (v:=self.views[0]).mask else v).to_indexed_uops()[0].vmax + 1)
def vars(self) -> set[Variable]: return set().union(*[v.vars() for v in self.views])
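
The rewrite bounds the buffer by shrinking the first view down to its mask before taking `idx.vmax`, so masked-out padding no longer inflates the answer; mirroring `test_pad_size_simple`:
```python
from tinygrad.shape.shapetracker import ShapeTracker
st = ShapeTracker.from_shape((10,)).pad(((2, 4),))  # 16 elements visible, 10 backed
assert st.real_size() == 10                         # the pad mask is shrunk away first
```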


@@ -413,7 +413,7 @@ class Tensor(SimpleMathTrait):
lbs = [cast(UOp, t.lazydata) for t in self.split(sizes, axis)]
sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(lbs, devices)]
# NOTE: this contiguous is making it impossible for the scheduler to do late const folding
mlb = MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
mlb = MultiLazyBuffer([lb.contiguous() for lb in sharded_lbs], axis)
return Tensor(mlb, device=devices, requires_grad=self.requires_grad)
def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None):
@@ -3130,9 +3130,10 @@ class Tensor(SimpleMathTrait):
# broadcast
return x._broadcast_to(out_shape:=_broadcast_shape(x.shape, y.shape)), y._broadcast_to(out_shape)
# TODO: tensor should stop checking if things are const
def _to_const_val(self, x:Union[Tensor, ConstType]) -> Union[Tensor, ConstType]:
return x.lazydata.const_arg if isinstance(x, Tensor) and isinstance(x.lazydata, UOp) and x.lazydata.is_unrealized_unmasked_const() \
and not x.requires_grad and self._broadcasted(x)[0].shape == self.shape else x
return x.lazydata.const_arg if isinstance(x, Tensor) and isinstance(x.lazydata, UOp) and x.lazydata.base.op is Ops.CONST \
and unwrap(x.lazydata.st).views[0].mask is None and not x.requires_grad and self._broadcasted(x)[0].shape == self.shape else x
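
The inlined condition preserves the distinction the deleted `is_unrealized_unmasked_const` test drew; illustrated:
```python
from tinygrad import Tensor
a = Tensor.ones((4, 4))              # base is an unmasked CONST: folds to the scalar 1.0
b = Tensor.ones((4, 4)).pad((1, 1))  # pad adds a mask: no longer treated as a const
```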
def add(self, x:Union[Tensor, ConstType], reverse=False) -> Tensor:
"""
@@ -3812,8 +3813,8 @@ class Tensor(SimpleMathTrait):
"""
if self.requires_grad: raise RuntimeError("can't backprop through bitcast")
dt = to_dtype(dtype)
if (not isinstance(self.device, str) or not self.device.startswith("DISK")) and (ns:=dt.itemsize) != (os:=self.dtype.itemsize):
if (self.shape[-1]*os) % ns != 0: raise RuntimeError("unsupported size in bitcast")
if (ns:=dt.itemsize) != (os:=self.dtype.itemsize) and (self.shape[-1]*os) % ns != 0: raise RuntimeError("unsupported size in bitcast")
if (not isinstance(self.device, str) or not self.device.startswith("DISK")) and ns != os:
new_uint, old_uint = to_dtype(f"uint{8*ns}"), to_dtype(f"uint{8*os}")
tmp = self.bitcast(old_uint)
if ns > os: return functools.reduce(Tensor.add, (tmp[..., i::ns//os].cast(new_uint) << 8*i*os for i in range(ns//os))).bitcast(dtype)
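
A worked instance of the reordered checks: the size rule now applies on every device, and off-DISK the shape-changing path goes through uint bitcasts, e.g. float32 to int16 doubles the last axis:
```python
from tinygrad import Tensor, dtypes
t = Tensor.arange(4, dtype=dtypes.float32)
print(t.bitcast(dtypes.int16).shape)  # (8,): last axis scales by itemsize ratio 4/2
```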