Merge branch 'master' into retinanet_mlperf

Francis Lata
2025-01-17 07:51:08 -08:00
22 changed files with 216 additions and 134 deletions


@@ -373,6 +373,10 @@ jobs:
# run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py
- name: Remove amdgpu
run: sleep 5 && sudo rmmod amdgpu # sleep a bit to let the driver unload the prev pid.
- name: Test AM cold start time
run: time AMD=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus
- name: Test AM warm start time
run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
- name: Run Stable Diffusion
run: AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt
- name: Run SDXL

docs/developer/am.md Normal file

@@ -0,0 +1,39 @@
# AM Driver
The AM driver is a userspace driver targeting AMD's 7900XTX. With it, tinygrad alone is enough to send compute tasks to your GPU!
## How to run?
Make sure the amdgpu kernel module is unloaded, then just run tinygrad with `AMD=1`! A minimal run is sketched below the optional requirements.
Optional requirements:
* System without IOMMU for P2P / SDMA support
* vfio-pci module for IRQ handling
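
A smoke-test sketch (it assumes a supported 7900XTX and that amdgpu is already unloaded; the env var must be set before tinygrad is imported):
```python
# smoke-test sketch: select the AM backend before importing tinygrad
import os
os.environ["AMD"] = "1"

from tinygrad import Tensor
print((Tensor([1.0, 2.0]) + 1).tolist())  # expect [2.0, 3.0]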
## Environment Variables
| Variable | Possible Value(s) | Description |
|----------|------------------|-------------|
| AM_RESET | [1] | Performs a full GPU reset (reloading all firmware and IP blocks) |
| AM_DEBUG | [0-4] | Sets the level of additional debugging information |
## AM Driver Details
### Compute & SDMA Queues
AM binds compute queues directly to MEC (bypassing MES). Tinygrad uses only one compute queue, which is bound at `pipe=0 queue=0`. Similarly, the single SDMA queue is bound at `engine=0 queue=0`.
### Boot
The GPU being passed to AM can be in one of three states:
1. Not initialized
2. Initialized by AMDGPU
3. Initialized by AM
The first and second states require a full GPU setup, since the hardware state is unknown to AM. The second state additionally requires a mode1 reset to reinitialize all components.
The third state can be set up partially to optimize boot time. In this case, only the GFX and SDMA IPs need to be initialized. To enable this, AM uses a separate boot memory that is guaranteed not to be overwritten. This physical memory is utilized for all blocks that are initialized only during the initial AM boot. To determine if the GPU is in the third state, AM uses `regSCRATCH_REG7` as a flag.
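Sketched as pseudocode (only `regSCRATCH_REG7` is AM's actual mechanism here; `AM_MAGIC` and the helper names are illustrative stand-ins):
```python
# illustrative boot-path sketch, not AM's actual code
def boot(dev, am_reset=False):
    if am_reset or dev.rreg("regSCRATCH_REG7") != AM_MAGIC:
        # states 1 and 2: hardware state unknown, do the full setup
        dev.mode1_reset()                      # strictly needed only for state 2
        dev.init_ip_blocks(ALL_IPS)            # firmware + IP blocks, using AM's boot memory
        dev.wreg("regSCRATCH_REG7", AM_MAGIC)  # mark the GPU as AM-initialized
    else:
        # state 3: partial setup, only GFX and SDMA come up again
        dev.init_ip_blocks(["GFX", "SDMA"])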
### VM Management
Each AM device sets up only a single `VMID=0` and one page directory. The page directory used is 3-level and thus supports up to 512TB of virtual addresses. All AM devices are located in one virtual address space.


@@ -27,6 +27,7 @@ nav:
- Runtime:
- developer/runtime.md
- HCQ: developer/hcq.md
- AM Driver: developer/am.md
- tinybox: tinybox.md
#- tinygrad: reference/

test/external/fuzz_shapetracker_size.py vendored Normal file

@@ -0,0 +1,13 @@
from tinygrad.shape.shapetracker import ShapeTracker
from test.external.fuzz_shapetracker import shapetracker_ops as st_ops
from test.unit.test_shapetracker_math import MultiShapeTracker
from tinygrad.helpers import getenv
import random
random.seed(getenv("SEED", 42))
for i in range(getenv("CNT", 2000)):
if getenv("DEBUG", 0) >= 1: print()
N = random.randint(1, 10000)
mst = MultiShapeTracker([ShapeTracker.from_shape((N,))]) # st_ops don't mutate regular shapetrackers for some reason
for j in range(20): random.choice(st_ops)(mst)
assert mst.sts[0].real_size() <= N, f"{N=}, real_size={mst.sts[0].real_size()}, st={mst.sts[0]}"
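
All knobs of the new fuzzer are environment variables (`SEED`, `CNT`, `DEBUG`); one way to drive it, as a sketch:
```python
# equivalent to: SEED=123 CNT=500 DEBUG=1 python3 test/external/fuzz_shapetracker_size.py
import os, subprocess
subprocess.run(["python3", "test/external/fuzz_shapetracker_size.py"],
               env={**os.environ, "SEED": "123", "CNT": "500", "DEBUG": "1"}, check=True)
```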


@@ -1041,5 +1041,8 @@ class TestTensorOps(unittest.TestCase):
def test_interpolate(self):
helper_test_shard_op([(4,16,16),(4,24,24)], lambda x: Tensor.interpolate(x, (19,19)))
def test_bitcast(self):
helper_test_shard_op([(256,), (256,)], lambda x: x.bitcast(dtypes.int))
if __name__ == '__main__':
unittest.main()


@@ -13,7 +13,7 @@ from tinygrad.device import is_dtype_supported
from tinygrad.dtype import DType, ImageDType
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, view_supported_devices, symbolic_simple, merge_views
from tinygrad.ops import PatternMatcher, UOp, Ops, UPat, graph_rewrite, track_rewrites, symbolic_simple, merge_views
from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, getenv, SPLIT_REDUCEOP, unwrap, prod, Context
from tinygrad.codegen.kernel import verify_ast
from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars, view_right, view_left, remove_movement_ops
@@ -1344,8 +1344,8 @@ class TestSchedule(unittest.TestCase):
def test_bitcast_fuses(self):
x = cast(UOp, Tensor.empty(1, dtype=dtypes.float32).realize().lazydata)
a = x.alu(Ops.EXP2).cast(dtypes.int32, True)
b = x.cast(dtypes.int32, True)
a = x.alu(Ops.EXP2).bitcast(dtypes.int32)
b = x.bitcast(dtypes.int32)
b = a.alu(Ops.ADD, b)
check_schedule(b, 1) # this should fuse when it makes sense
@@ -1630,7 +1630,8 @@ class TestIndexing(unittest.TestCase):
a[0] = 6
np.testing.assert_equal(a.numpy(), [6., 2., 3., 4.])
@unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view")
#@unittest.skipUnless(Device.DEFAULT in view_supported_devices, "need view")
@unittest.skip("BUFFER_VIEW no longer supported on non-disk devices")
def test_arange_view_op(self):
a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).contiguous()
sched = self.check_schedule(a, 1)
@@ -2104,16 +2105,6 @@ class TestConst(unittest.TestCase):
print(a.lazydata)
self.assertTrue(tensor_const_pm.rewrite(a.lazydata))
def test_uop_methods(self):
a = Tensor(1)
self.assertTrue(a.lazydata.is_unrealized_unmasked_const())
a = Tensor.ones((4, 4))
self.assertTrue(a.lazydata.is_unrealized_unmasked_const())
a = Tensor.ones((4, 4)).pad((1, 1),)
self.assertFalse(a.lazydata.is_unrealized_unmasked_const())
def test_const_schedule(self):
a = Tensor.ones((4, 4))
sched = a.schedule()


@@ -1,6 +1,5 @@
import unittest
from tinygrad import Device, Tensor, TinyJit, Variable, dtypes
from tinygrad.helpers import CI
from tinygrad import Tensor, TinyJit, Variable, dtypes
import numpy as np
class TestSetitem(unittest.TestCase):
@@ -139,12 +138,7 @@ class TestSetitem(unittest.TestCase):
def test_setitem_overlapping_inplace1(self):
t = Tensor([[3.0], [2.0], [1.0]]).contiguous()
t[1:] = t[:-1]
if (Device.DEFAULT == "LLVM") or (CI and Device.DEFAULT == "AMD"):
# TODO: FIXME
with self.assertRaises(AssertionError):
self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
else:
self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
self.assertEqual(t.tolist(), [[3.0], [3.0], [2.0]])
def test_setitem_overlapping_inplace2(self):
t = Tensor([[3.0], [2.0], [1.0]]).contiguous()


@@ -84,6 +84,14 @@ class TestTensorUOp(unittest.TestCase):
sched = empty.schedule()
self.assertEqual(len(sched), 0)
def test_contiguous_folded_alu(self):
a = Tensor.empty(8, 8)
# NOTE: the buffer for mul_0 late folds to just a CONST
mul_0 = a*0
out = mul_0.shrink(((4, 8), (0, 8))).contiguous()
out.realize()
self.assertEqual(out.tolist(), Tensor.zeros(4, 8).tolist())
reduce_kernel = UPat(Ops.SINK, src=(UPat(Ops.STORE, src=(UPat(), UPat(), UPat(Ops.REDUCE_AXIS)))))
class TestReduceOp(unittest.TestCase):
def test_no_split_reduce_kernel(self):


@@ -68,8 +68,8 @@ class TestRawDiskBuffer(unittest.TestCase):
_test_bitcasted(t, dtypes.float32, 3.1415927)
_test_bitcasted(t, dtypes.uint32, 0x40490FDB)
# doesn't support normal cast
with self.assertRaises(RuntimeError):
Tensor.empty((4,), dtype=dtypes.int16, device=f"disk:{tmp}").cast(dtypes.float16)
with self.assertRaises(NotImplementedError):
Tensor.empty((4,), dtype=dtypes.int16, device=f"disk:{tmp}").cast(dtypes.float16).realize()
# Those two should be moved to test_dtype.py:test_shape_change_bitcast after bitcast works on non-disk
with self.assertRaises(RuntimeError):


@@ -833,6 +833,30 @@ class TestShapeTrackerSize(unittest.TestCase):
strides=(0, 128, 0, 4096, 1), offset=0, mask=None, contiguous=False)))
self.assertEqual(st.real_size(), 8389632)
def test_pad_size_simple(self):
st = ShapeTracker.from_shape((10,)).pad(((2,4),))
self.assertEqual(st.real_size(), 10)
def test_pad_size_multiview(self):
st = ShapeTracker.from_shape((10,10)).pad(((2,4), (3,1))).reshape((16*14,)).stride((17,))
self.assertEqual(st.real_size(), 100)
# TODO improve real_size accuracy in cases like this?
@unittest.expectedFailure
def test_stride_size(self):
st1 = ShapeTracker.from_shape((10,10)).pad(((2,4), (3,1))).reshape((16*14,)).stride((17,))
st2 = ShapeTracker.from_shape((10,10)).stride((2,1)).reshape((5*10,)).stride((17,))
self.assertEqual(st1.real_size(), 78)
self.assertEqual(st2.real_size(), 65)
def test_stride_size_bounds(self):
# lower bound checks that real_size doesn't give false positive for fitting in a buffer
# upper bound checks that real_size doesn't exceed N when movementops were applied to from_shape((N,))
st1 = ShapeTracker.from_shape((10,10)).pad(((2,4), (3,1))).reshape((16*14,)).stride((17,))
st2 = ShapeTracker.from_shape((10,10)).stride((2,1)).reshape((5*10,)).stride((17,))
self.assertTrue(78 <= st1.real_size() <= 100)
self.assertTrue(65 <= st2.real_size() <= 100)
class TestConsecutive(unittest.TestCase):
@classmethod
def setUpClass(self):


@@ -2,8 +2,10 @@ from __future__ import annotations
from dataclasses import dataclass, replace
from collections import defaultdict
from typing import Optional, Any, Iterator, Generator
import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, contextlib, sys, re, atexit, pickle, decimal, time
from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp
import multiprocessing, importlib, inspect, functools, pathlib, os, ctypes, ctypes.util, platform, contextlib, sys, re, atexit, pickle, decimal, time
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
from tinygrad.helpers import CI, OSX, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, PROFILE, temp, mv_address, \
cpu_time_execution
from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes
from tinygrad.renderer import Renderer
@@ -213,6 +215,40 @@ class _MallocAllocator(LRUAllocator):
MallocAllocator = _MallocAllocator()
# NOTE: MAP_JIT is added to mmap module in python 3.13
MAP_JIT = 0x0800
# CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
class CPUProgram:
helper_handle = ctypes.CDLL(ctypes.util.find_library('System') if OSX else 'libgcc_s.so.1')
def __init__(self, name:str, lib:bytes):
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
self.mem.write(lib)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
def __call__(self, *bufs, vals=(), wait=False):
args = list(bufs) + list(vals)
# NOTE: replace this by --target={host's triple}-elf in clang args once we only support macos sequoia and later.
# Apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64
# https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
# This hack is required because clang/llvm bug doesn't allow us to just use {host's triple}+'-elf' (relocation failures)
# The bug was fixed in https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a but was only backported to xcode 16+
if platform.machine() == "arm64" and OSX: args = args[:8] + [ctypes.c_int64(a) if isinstance(a, int) else a for a in args[8:]]
return cpu_time_execution(lambda: self.fxn(*args), enable=wait)
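
End to end, the moved class is used like any other Compiled-device program; a rough sketch (the kernel source and buffer handling here are illustrative):
```python
# sketch: compile a flat binary with ClangJITCompiler, then jump into it
from tinygrad.runtime.ops_clang import ClangJITCompiler
from tinygrad.device import CPUProgram, MallocAllocator

lib = ClangJITCompiler().compile("void add_one(float* a) { a[0] += 1.0f; }")
buf = MallocAllocator.alloc(4)      # one float32
prog = CPUProgram("add_one", lib)
elapsed = prog(buf, wait=True)      # wait=True returns wall time via cpu_time_execution
```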
# **************** for Compiled Devices ****************
class CompileError(Exception): pass


@@ -13,7 +13,7 @@ from tinygrad.device import Buffer
# creation can recurse a lot
sys.setrecursionlimit(10000)
# **** big graph spec
# **** Tensor UOp spec
tensor_uop_spec = PatternMatcher([
(UPat(Ops.DEVICE, dtypes.void, (), name="device"), lambda device: isinstance(device.arg, str)),
@@ -51,17 +51,6 @@ tensor_uop_spec = PatternMatcher([
# ASSIGN changes the value of a realized buffer
(UPat(Ops.ASSIGN, name="assign", src=(UPat.var("target"), UPat.var("new_val"))),
lambda assign,target,new_val: (target.op is Ops.BUFFER or target.is_realized) and (assign.dtype == target.dtype == new_val.dtype)),
# TODO: BUFFER_VIEW is overloaded, it should be removed.
# BUFFER_VIEW shares the device buffer with its source, it uses a subbuffer of the underlying source buffer
(UPat(Ops.BUFFER_VIEW, name="root", src=(UPat.var("x"),)), lambda root,x:
# BUFFER_VIEW can replace contiguous, keeping dtype the same
(root.dtype == x.dtype) or
# it can also replace bitcast, this changes the dtype, but the itemsize stays the same
(root.dtype != x.dtype and root.dtype.itemsize == x.dtype.itemsize) or
# it can also represent shape changing bitcast (only on DISK)
(root.dtype != x.dtype and root.dtype.itemsize != x.dtype.itemsize and x.device.startswith("DISK"))),
])
# **** ScheduleItem return type
@@ -455,6 +444,12 @@ def fold_img_cast(ctx:ScheduleContext, xb:UOp, view:UOp, b:UOp, to_cast:UOp, **k
def sink_outputs(ctx:ScheduleContext, sink:UOp) -> None:
for x in sink.src: realize(ctx, x.buf_uop, x)
def create_subbuffer(base:UOp, b:UOp, root:UOp, x:UOp):
if not root.device.startswith("DISK"): return None
if x.op is not Ops.VIEW: x = x.src[-1] # TODO: remove this once forced_realize is gone
buffers[b] = x.buf_uop.buffer.view(b.size, b.dtype, unwrap(x.st).views[0].offset*x.dtype.itemsize)
return base.replace(src=(b, root.replace(op=Ops.BUFFER_VIEW)))
do_realize = PatternMatcher([
# always realize sinked ops
(UPat(Ops.SINK, name="sink"), sink_outputs),
@@ -467,6 +462,8 @@ do_realize = PatternMatcher([
# realize before COPY or BUFFER_VIEW
(UPat(Ops.COPY, src=(UPat(), UPat.any(UPatScheduled(), UPatScheduled().view()),)), realize),
(UPat(Ops.BUFFER_VIEW, src=(UPat.any(UPatScheduled(), UPatScheduled().view()),)), realize),
# substitute BITCAST/CONTIGUOUS with BUFFER_VIEW on DISK
(UPatScheduled((Ops.BITCAST, Ops.CONTIGUOUS), name="root", src=(UPat.var("x"),)), create_subbuffer),
])
# **** rewrite VIEW into LOAD/STORE/VALID or fuse the underlying UOp
@@ -502,10 +499,6 @@ def append_uop(ctx:ScheduleContext, view:UOp, buf_uop:UOp) -> None:
if (op:=uval(view)).op is Ops.ASSIGN: ctx.assigns.add(buf_uop)
for x in op.base.src:
if is_scheduled(x.base): ctx.children.setdefault(x.base.buf_uop, {})[buf_uop] = None
# BUFFER_VIEW overrides the underlying buffer
# TODO: this should be a shrink on the buffer
if op.op is Ops.BUFFER_VIEW:
buffers[buf_uop] = (x:=op.src[0]).buf_uop.buffer.view(view.size, view.dtype, unwrap(x.st).views[0].offset*x.dtype.itemsize)
buf_uop.buffer.ref(1)
create_ctx = PatternMatcher([(UPat(Ops.VIEW, name="view", src=(UPat(Ops.BUFFER, name="buf_uop"), UPat())), append_uop)])
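
The effect of centralizing this in `create_subbuffer`: BUFFER_VIEW is now produced only for DISK tensors, where a bitcast or contiguous can alias the source buffer instead of scheduling a kernel. A hedged sketch of what now takes this path (assuming `/tmp/x` exists and holds at least 8 bytes):
```python
# on DISK, a bitcast realizes as an Ops.BUFFER_VIEW subbuffer, not a compute kernel
from tinygrad import Tensor, dtypes
t = Tensor.empty((4,), dtype=dtypes.int16, device="disk:/tmp/x")
u = t.bitcast(dtypes.uint8)   # shape-changing bitcast, shares the underlying buffer
```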


@@ -37,9 +37,7 @@ pm_gradient = PatternMatcher([
(UPat(Ops.EXPAND, name="ret"), lambda ctx, ret:
(ctx.cast(sum_acc_dtype(ctx.dtype)).r(Ops.ADD, tuple(i for i,(si,so) in enumerate(zip(ret.src[0].shape, ret.arg)) if si!=so)).cast(ctx.dtype),)),
# there's no gradient for...is this ASSIGN?
(UPat(Ops.VIEW, src=(UPat(Ops.BUFFER), UPat(Ops.BUFFER_VIEW))), lambda: (None, None)),
# also no gradient for bitcast
# there's no gradient for bitcast
(UPat(Ops.BITCAST), lambda ctx: (None,)),
])


@@ -101,7 +101,8 @@ class ContextVar:
def __gt__(self, x): return self.value > x
def __lt__(self, x): return self.value < x
DEBUG, IMAGE, BEAM, NOOPT, JIT = ContextVar("DEBUG", 0), ContextVar("IMAGE", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0), ContextVar("JIT", 1)
DEBUG, IMAGE, BEAM, NOOPT = ContextVar("DEBUG", 0), ContextVar("IMAGE", 0), ContextVar("BEAM", 0), ContextVar("NOOPT", 0)
JIT = ContextVar("JIT", 2 if platform.system() == 'Darwin' and ('Intel' in platform.processor() or 'i386' in platform.processor()) else 1)
WINO, CAPTURING, TRACEMETA = ContextVar("WINO", 0), ContextVar("CAPTURING", 1), ContextVar("TRACEMETA", 1)
USE_TC, TC_OPT, AMX, TRANSCENDENTAL = ContextVar("TC", 1), ContextVar("TC_OPT", 0), ContextVar("AMX", 0), ContextVar("TRANSCENDENTAL", 1)
FUSE_ARANGE, FUSE_CONV_BW = ContextVar("FUSE_ARANGE", 0), ContextVar("FUSE_CONV_BW", 0)
@@ -267,6 +268,15 @@ def cpu_objdump(lib, objdump_tool='objdump'):
pathlib.Path(f.name).write_bytes(lib)
print(subprocess.check_output([objdump_tool, '-d', f.name]).decode('utf-8'))
def capstone_flatdump(lib: bytes):
import capstone
match platform.machine():
case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
for instr in cs.disasm(lib, 0):
print(f"{instr.address:#08x}: {instr.mnemonic}\t{instr.op_str}")
# *** ctypes helpers
# TODO: make this work with read only memoryviews (if possible)


@@ -76,7 +76,8 @@ class MultiLazyBuffer(MathTrait):
# passthroughs
@property
def is_realized(self) -> bool: return all(lb.base.realized is not None for lb in self.real_lbs)
def cast(self, dtype:DType, bitcast:bool=False): return MultiLazyBuffer([x.cast(dtype, bitcast) for x in self.lbs], self.axis, self.real)
def cast(self, dtype:DType): return MultiLazyBuffer([x.cast(dtype) for x in self.lbs], self.axis, self.real)
def bitcast(self, dtype:DType): return MultiLazyBuffer([x.bitcast(dtype) for x in self.lbs], self.axis, self.real)
def const_like(self, b) -> MultiLazyBuffer: return MultiLazyBuffer([x.const_like(b) for x in self.lbs], self.axis, self.real)
def assign(self, x:MultiLazyBuffer): return MultiLazyBuffer([s.assign(d) for s,d in zip(self.lbs, x.lbs)], self.axis, self.real)
def contiguous(self): return MultiLazyBuffer([x.contiguous() for x in self.lbs], self.axis, self.real)


@@ -291,7 +291,7 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
if self.op in GroupOp.Buffer: return vsrc[0] if len(vsrc:=[x.st for x in self.src if x.op is Ops.VIEW]) != 0 else None
if not (src_sts := [x.st for x in self.src if x.st is not None]): return None
assert all_same([x.shape for x in src_sts]), f"UOp sources must have the same shape {self} {[x.shape for x in src_sts]}"
if self.op is Ops.BUFFER_VIEW:
if self.op in {Ops.BITCAST, Ops.BUFFER_VIEW}:
shape = src_sts[0].shape
if self.dtype.itemsize != (input_sz:=self.src[0].dtype.itemsize): shape = shape[:-1]+((shape[-1]*input_sz) // self.dtype.itemsize,)
# only reduce ops are allowed to change shape, everything else derives shape from sources
@@ -358,17 +358,8 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
assert self.dtype.count == 1
if count == 1: return self
return UOp(Ops.VECTORIZE, self.dtype.vec(count), (self,)*count)
def cast(self, dtype:DType, bitcast=False):
if bitcast: return self.bitcast(dtype)
if self._device is not None and self._device.startswith("DISK"): raise RuntimeError("CAST isn't supported on DISK")
return UOp(Ops.CAST, dtype, (self,))
def bitcast(self, dtype:DType):
if self.st is not None and self.shape and ((self.shape[-1]*self.dtype.itemsize)%dtype.itemsize != 0):
raise RuntimeError(f"unsupported size in bitcast {dtype}")
# shape changing bitcast can use a subbuffer on DISK
# TODO: this should be moved to realize.py
if self.can_view() and self.device.startswith("DISK"): return UOp(Ops.BUFFER_VIEW, dtype, (self,))
return UOp(Ops.BITCAST, dtype, (self,))
def cast(self, dtype:DType): return UOp(Ops.CAST, dtype, (self,))
def bitcast(self, dtype:DType): return UOp(Ops.BITCAST, dtype, (self,))
def gep(self, i:Union[tuple[int, ...], int]):
if isinstance(i, int):
# NOTE: these are just shortcuts to not have to create and fold later
@@ -420,9 +411,9 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
if DEBUG >= 3: print(f"split {divisor}: {self.shape} -> {splitted.shape} -> {new_shape}")
return splitted._reduce_op(op, axis)._reduce_op(op, (len(new_shape),)).reshape(new_shape) # reduce original axes, then split
def assign(self, x:UOp): return UOp(Ops.ASSIGN, self.dtype, (self,x))
def contiguous(self, allow_buffer_view=True):
def contiguous(self):
if not unwrap(self.st).contiguous or self.size != self.base.size or self.base.op is Ops.CONST:
return self.alu(Ops.BUFFER_VIEW if allow_buffer_view and self.can_view() else Ops.CONTIGUOUS)
return self.alu(Ops.CONTIGUOUS)
forced_realize.add(self.base)
return self
@@ -450,10 +441,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
# COPY is COPY(DEVICE, copyin.base) -> VIEW(copyin.st)
return UOp(Ops.COPY, self.base.dtype, (UOp(Ops.DEVICE, arg=device), self.base), clone).view(unwrap(self.st))
def clone(self) -> UOp: return self.copy_to_device(self.device, clone=True)
def is_unrealized_unmasked_const(self): return self.base.op is Ops.CONST and all(v.mask is None for v in unwrap(self.st).views)
def can_view(self):
return (self.st is not None and self._device is not None and self.st.consecutive and self.base.op is not Ops.CONST and
not isinstance(self.dtype, ImageDType) and self.device.split(":")[0] in view_supported_devices)
@property
def lbs(self): return [self]
@property


@@ -63,8 +63,8 @@ def render_wmma(ctx: "PTXRenderer", x: UOp):
dt_map = { dtypes.half: "f16" }
_i = 0
for vv in x.src[:2]:
for i in range(0, len(ctx.r[vv]), 2):
yield f"mov.b32 {ctx.wmma_r[_i]}, {{{', '.join(ctx.r[vv][i:i+2])}}};"
for i in range(0, len(ctx.r[vv]), (elems_per_reg := 4//dtype_in.itemsize)):
yield f"mov.b32 {ctx.wmma_r[_i]}, " + (f"{{{', '.join(ctx.r[vv][i:i+elems_per_reg])}}}" if elems_per_reg > 1 else ctx.r[vv][i]) + ";"
_i += 1
yield f'mma.sync.aligned.m{M}n{N}k{K}.row.col.f32.{dt_map[dtype_in]}.{dt_map[dtype_in]}.f32{" "*12}' +\
f'{{{", ".join(ctx.r[x])}}}, {{{", ".join(ctx.wmma_r[:n_operands[0]])}}}, {{{", ".join(ctx.wmma_r[-n_operands[1]:])}}}, ' + \
@@ -184,7 +184,7 @@ class PTXRenderer(Renderer):
r[u] = [ssa('val', dtype=self.types[u.dtype.scalar()]) for _ in range(u.dtype.count)] if u.dtype.count > 1 else ssa('val', u)
elif u.op is Ops.DEFINE_GLOBAL: bufs.append((f"data{u.arg}", u.dtype))
elif u.op is Ops.WMMA:
self.wmma_r = [ssa("wmma", dtype="b32") for vv in u.src[:2] for i in range(0, len(r[vv]), 2)]
self.wmma_r = [ssa("wmma", dtype="b32") for vv in u.src[:2] for i in range(0, len(r[vv]), 4//u.arg[2].itemsize)]
r[u] = [ssa("wmma", dtype=self.types[u.dtype.scalar()]) for _ in range(u.dtype.count)]
prefix, dtype = {Ops.CAST: ("cast", None), Ops.BITCAST: ("cast", None), Ops.ENDRANGE: ("pred", "pred"), Ops.RANGE: ("ridx", None),
Ops.DEFINE_ACC: ("acc", None), Ops.DEFINE_VAR: ("dat", None), Ops.CONST: ("const", None), Ops.DEFINE_LOCAL:("local",self.types[dtypes.ulong]),
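
The arithmetic behind both hunks: one 32-bit `.b32` register holds `4 // itemsize` scalar inputs, so half inputs keep the old brace-packed pairs while 4-byte inputs map one register per element. A quick check of that claim:
```python
# elements packed per .b32 WMMA input register, per the change above
for name, itemsize in [("half", 2), ("float/tf32", 4)]:
    print(name, 4 // itemsize)  # half -> 2 (mov.b32 {lo, hi}), float -> 1 (bare mov.b32)
```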


@@ -197,24 +197,44 @@ class AMDCopyQueue(HWQueue):
*data64_le(signal.timestamp_addr))
return self
def bind(self, dev:AMDDevice):
if not dev.driverless: return
self.binded_device = dev
self.hw_page = dev.allocator.alloc((qsz:=round_up(len(self._q), 8)) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
for i, value in enumerate(self._q): hw_view[i] = value
self.indirect_cmd = [amd_gpu.SDMA_OP_INDIRECT | amd_gpu.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz, *data64_le(0)]
self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]
def _submit(self, dev:AMDDevice):
if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr[0] > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
if self.binded_device == dev:
# An IB packet must end on an 8 DW boundary.
add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8
cmds, cmd_sizes = ([0] * add) + self.indirect_cmd, [len(self.indirect_cmd) + add]
if len(cmds) * 4 >= (dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes):
cmds, cmd_sizes = [0, 0] + self.indirect_cmd, [8]
else: cmds, cmd_sizes = self._q, self.internal_cmd_sizes
tail_blit_dword = 0
for cmdsz in self.internal_cmd_sizes:
for cmdsz in cmd_sizes:
if (tail_blit_dword + cmdsz) * 4 >= dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes: break
tail_blit_dword += cmdsz
start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', self._q[:tail_blit_dword])
dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
dev.sdma_queue.put_value += tail_blit_dword * 4
if (rem_packet_cnt := len(self._q) - tail_blit_dword) > 0:
if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0:
zero_fill = dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes
ctypes.memset(mv_address(dev.sdma_queue.ring) + (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes), 0, zero_fill)
dev.sdma_queue.put_value += zero_fill
dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', self._q[tail_blit_dword:])
dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
dev.sdma_queue.put_value += rem_packet_cnt * 4
dev.sdma_queue.write_ptr[0] = dev.sdma_queue.put_value
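
A quick sanity check of the new padding arithmetic: the indirect command built in `bind` is 6 DWs (a header, two base-address DWs, a size DW, and two trailing zero DWs), and `add` is chosen so the packet ends on an 8-DW boundary for any 4-byte-aligned write pointer:
```python
# mirror of the `add` expression in _submit, checked over a full 32-byte window
for put_value in range(0, 32, 4):
    start_dw = (put_value % 32) // 4
    add = (8 - (start_dw + 6 % 8)) % 8
    assert (start_dw + add + 6) % 8 == 0  # padded IB packet ends on an 8 DW boundary
```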


@@ -1,13 +1,9 @@
import ctypes, ctypes.util, struct, platform, tempfile, pathlib, subprocess
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
from tinygrad.helpers import OSX, mv_address, cpu_time_execution, cpu_objdump
from tinygrad.device import Compiled, Compiler, MallocAllocator
from tinygrad.runtime.support.elf import elf_loader, relocate
import platform, tempfile, pathlib, subprocess
from tinygrad.helpers import cpu_objdump, capstone_flatdump
from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
from tinygrad.runtime.support.elf import jit_loader
from tinygrad.renderer.cstyle import ClangRenderer
# NOTE: MAP_JIT is added to mmap module in python 3.13
MAP_JIT = 0x0800
# Used by ops_dsp.py
class ClangCompiler(Compiler):
def __init__(self, cachekey="compile_clang", args:list[str]|None=None, objdump_tool='objdump'):
@@ -33,51 +29,9 @@ class ClangJITCompiler(Compiler):
args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else []
obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
image, _, relocs = elf_loader(obj)
# This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved.
for ploc,tgt,r_type,r_addend in relocs:
image[ploc:ploc+4] = struct.pack("<I", relocate(struct.unpack("<I", image[ploc:ploc+4])[0], ploc, tgt+r_addend, r_type))
return bytes(image)
return jit_loader(obj)
def disassemble(self, lib):
import capstone
match platform.machine():
case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
for instr in cs.disasm(lib, 0):
print(f"{instr.address:#08x}: {instr.mnemonic}\t{instr.op_str}")
# CPUProgram is a jit/shellcode program that can be just mmapped and jumped to
class CPUProgram:
helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'gcc_s'))
def __init__(self, name:str, lib:bytes):
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
self.mem.write(lib)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
def __call__(self, *bufs, vals=(), wait=False):
args = list(bufs) + list(vals)
# NOTE: replace this by --target={host's triple}-elf in clang args once we only support macos sequoia and later.
# Apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64
# https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
# This hack is required because clang/llvm bug doesn't allow us to just use {host's triple}+'-elf' (relocation failures)
# The bug was fixed in https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a but was only backported to xcode 16+
if platform.machine() == "arm64" and OSX: args = args[:8] + [ctypes.c_int64(a) if isinstance(a, int) else a for a in args[8:]]
return cpu_time_execution(lambda: self.fxn(*args), enable=wait)
def disassemble(self, lib:bytes): return capstone_flatdump(lib)
class ClangDevice(Compiled):
def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram)


@@ -1,4 +1,4 @@
import tinygrad.runtime.autogen.libc as libc
import struct, tinygrad.runtime.autogen.libc as libc
from dataclasses import dataclass
from tinygrad.helpers import getbits, i2u
@@ -51,3 +51,10 @@ def relocate(instr: int, ploc: int, tgt: int, r_type: int):
case libc.R_AARCH64_LDST64_ABS_LO12_NC: return instr | (getbits(tgt, 3, 11) << 10)
case libc.R_AARCH64_LDST128_ABS_LO12_NC: return instr | (getbits(tgt, 4, 11) << 10)
raise NotImplementedError(f"Encountered unknown relocation type {r_type}")
def jit_loader(obj: bytes) -> bytes:
image, _, relocs = elf_loader(obj)
# This is needed because we have an object file, not a .so that has all internal references (like loads of constants from .rodata) resolved.
for ploc,tgt,r_type,r_addend in relocs:
image[ploc:ploc+4] = struct.pack("<I", relocate(struct.unpack("<I", image[ploc:ploc+4])[0], ploc, tgt+r_addend, r_type))
return bytes(image)


@@ -72,12 +72,10 @@ class ShapeTracker:
def to_indexed_uops(self, _idxs:Optional[list[UOp]|tuple[UOp, ...]]=None) -> tuple[UOp, UOp]:
return views_to_indexed_uops(self.views, tuple(_idxs) if _idxs is not None else None)
# upper bound on buffer size required to fit this shapetracker
def real_size(self) -> int:
if 0 in self.shape: return 0
idx, valid = self.to_indexed_uops()
if not valid.vmax: return 0
assert idx.vmax < 1e12, f"real_size broken for {self}"
return int(idx.vmax+1)
return int((v.shrink(v.mask) if (v:=self.views[0]).mask else v).to_indexed_uops()[0].vmax + 1)
def vars(self) -> set[Variable]: return set().union(*[v.vars() for v in self.views])
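
The rewrite bounds the buffer by shrinking the first view down to its mask before taking `idx.vmax`, so masked-out padding no longer inflates the answer; mirroring `test_pad_size_simple`:
```python
from tinygrad.shape.shapetracker import ShapeTracker
st = ShapeTracker.from_shape((10,)).pad(((2, 4),))  # 16 elements visible, 10 backed
assert st.real_size() == 10                         # the pad mask is shrunk away first
```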


@@ -413,7 +413,7 @@ class Tensor(SimpleMathTrait):
lbs = [cast(UOp, t.lazydata) for t in self.split(sizes, axis)]
sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(lbs, devices)]
# NOTE: this contiguous is making it impossible for the scheduler to do late const folding
mlb = MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
mlb = MultiLazyBuffer([lb.contiguous() for lb in sharded_lbs], axis)
return Tensor(mlb, device=devices, requires_grad=self.requires_grad)
def shard_(self, devices:tuple[str, ...], axis:Optional[int]=None):
@@ -3130,9 +3130,10 @@ class Tensor(SimpleMathTrait):
# broadcast
return x._broadcast_to(out_shape:=_broadcast_shape(x.shape, y.shape)), y._broadcast_to(out_shape)
# TODO: tensor should stop checking if things are const
def _to_const_val(self, x:Union[Tensor, ConstType]) -> Union[Tensor, ConstType]:
return x.lazydata.const_arg if isinstance(x, Tensor) and isinstance(x.lazydata, UOp) and x.lazydata.is_unrealized_unmasked_const() \
and not x.requires_grad and self._broadcasted(x)[0].shape == self.shape else x
return x.lazydata.const_arg if isinstance(x, Tensor) and isinstance(x.lazydata, UOp) and x.lazydata.base.op is Ops.CONST \
and unwrap(x.lazydata.st).views[0].mask is None and not x.requires_grad and self._broadcasted(x)[0].shape == self.shape else x
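
The inlined condition preserves the distinction the deleted `is_unrealized_unmasked_const` test drew; illustrated:
```python
from tinygrad import Tensor
a = Tensor.ones((4, 4))              # base is an unmasked CONST: folds to the scalar 1.0
b = Tensor.ones((4, 4)).pad((1, 1))  # pad adds a mask: no longer treated as a const
```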
def add(self, x:Union[Tensor, ConstType], reverse=False) -> Tensor:
"""
@@ -3812,8 +3813,8 @@ class Tensor(SimpleMathTrait):
"""
if self.requires_grad: raise RuntimeError("can't backprop through bitcast")
dt = to_dtype(dtype)
if (not isinstance(self.device, str) or not self.device.startswith("DISK")) and (ns:=dt.itemsize) != (os:=self.dtype.itemsize):
if (self.shape[-1]*os) % ns != 0: raise RuntimeError("unsupported size in bitcast")
if (ns:=dt.itemsize) != (os:=self.dtype.itemsize) and (self.shape[-1]*os) % ns != 0: raise RuntimeError("unsupported size in bitcast")
if (not isinstance(self.device, str) or not self.device.startswith("DISK")) and ns != os:
new_uint, old_uint = to_dtype(f"uint{8*ns}"), to_dtype(f"uint{8*os}")
tmp = self.bitcast(old_uint)
if ns > os: return functools.reduce(Tensor.add, (tmp[..., i::ns//os].cast(new_uint) << 8*i*os for i in range(ns//os))).bitcast(dtype)
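
A worked instance of the reordered checks: the size rule now applies on every device, and off-DISK the shape-changing path goes through uint bitcasts, e.g. float32 to int16 doubles the last axis:
```python
from tinygrad import Tensor, dtypes
t = Tensor.arange(4, dtype=dtypes.float32)
print(t.bitcast(dtypes.int16).shape)  # (8,): last axis scales by itemsize ratio 4/2
```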