Mirror of https://github.com/tinygrad/tinygrad.git, synced 2026-01-09 23:18:04 -05:00
split tc test (#12003)
* split tc test
* split hand coded opts
* remove some skipped tests
* skips on emulated
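A minimal sketch of what the split gives you: each extracted module is a plain unittest file that can be loaded and run on its own (module path taken from this commit; assumes the tinygrad repo root is on sys.path):

import unittest
# load the relocated tensor-core tests by dotted module name and run them
suite = unittest.defaultTestLoader.loadTestsFromName("test.opt.test_tensor_cores")
unittest.TextTestRunner(verbosity=2).run(suite)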
.github/workflows/benchmark.yml (vendored): 20 lines changed
@@ -65,12 +65,12 @@ jobs:
   - name: Test speed vs torch
     run: BIG=2 MPS=1 python3.11 test/speed/external_test_speed_v_torch.py | tee torch_speed.txt
   - name: Test tensor cores
-    run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
+    run: METAL=1 python3.11 test/opt/test_tensor_cores.py
   - name: Test AMX tensor cores
     run: |
-      DEBUG=2 CPU=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
+      DEBUG=2 CPU=1 AMX=1 python3.11 test/opt/test_tensor_cores.py
+      DEBUG=2 LLVM=1 AMX=1 python3.11 test/opt/test_tensor_cores.py
       DEBUG=2 CPU=1 AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
-      DEBUG=2 LLVM=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
       DEBUG=2 LLVM=1 AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
   - name: Run Tensor Core GEMM (float)
     run: DEBUG=2 SHOULD_USE_TC=1 python3.11 extra/gemm/simple_matmul.py | tee matmul.txt
@@ -196,8 +196,8 @@ jobs:
     run: NV=1 python test/external/external_benchmark_multitensor_allreduce.py
   - name: Test tensor cores
     run: |
-      NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
-      PTX=1 ALLOW_TF32=1 NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
+      NV=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
+      PTX=1 ALLOW_TF32=1 NV=1 python3 test/opt/test_tensor_cores.py
   - name: Run Tensor Core GEMM (CUDA)
     run: |
       CUDA=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
@@ -396,8 +396,8 @@ jobs:
     run: AMD=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
   - name: Test tensor cores
     run: |
-      AMD=1 AMD_LLVM=0 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
-      AMD=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
+      AMD=1 AMD_LLVM=0 python3 test/opt/test_tensor_cores.py
+      AMD=1 AMD_LLVM=1 python3 test/opt/test_tensor_cores.py
       AMD=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
   - name: Run Tensor Core GEMM (AMD)
     run: AMD=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee matmul_amd.txt
@@ -681,8 +681,8 @@ jobs:
   # Fails on 9070
   # - name: Test tensor cores
   #   run: |
-  #     AMD=1 AMD_LLVM=0 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
-  #     AMD=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
+  #     AMD=1 AMD_LLVM=0 python3 test/test_linearizer.py test/opt/test_tensor_cores.py
+  #     AMD=1 AMD_LLVM=1 python3 test/test_linearizer.py test/opt/test_tensor_cores.py
   #     AMD=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
   - name: Run Tensor Core GEMM (AMD)
     run: AMD=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee am_matmul_amd.txt
@@ -748,7 +748,7 @@ jobs:
   - name: Test driver start time
     run: time DEBUG=3 NV=1 python3 test/test_tiny.py TestTiny.test_plus
   - name: Test tensor cores
-    run: NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
+    run: NV=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
   - name: Test DISK copy time
     run: NV=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
   - name: Test CPU copy time
.github/workflows/test.yml (vendored): 24 lines changed
@@ -242,8 +242,7 @@ jobs:
   - name: Test emulated METAL tensor cores
     run: |
       DEBUG=2 EMULATE=METAL FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_big_gemm
-      DEBUG=2 EMULATE=METAL FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores
-      DEBUG=2 EMULATE=METAL FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
+      DEBUG=2 EMULATE=METAL FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
   - name: Test emulated AMX tensor cores
     run: DEBUG=2 AMX=1 EMULATE=AMX FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
   - name: Test emulated AMD tensor cores
@@ -252,37 +251,28 @@ jobs:
       DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
       DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=1 ATOL=1e-3 python3 ./extra/gemm/simple_matmul.py
       DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=1 ATOL=1e-3 python3 ./extra/gemm/simple_matmul.py
-      DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores
-      DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
+      DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
   - name: Test emulated AMD MFMA tensor cores
     run: |
       DEBUG=2 EMULATE=AMD_MFMA FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
-      DEBUG=2 EMULATE=AMD_MFMA FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores
-      DEBUG=2 EMULATE=AMD_MFMA FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
+      DEBUG=2 EMULATE=AMD_MFMA FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
   - name: Test emulated AMD RDNA4 tensor cores
     run: |
      DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
      DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
      DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=1 ATOL=1e-3 python3 ./extra/gemm/simple_matmul.py
      DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=1 ATOL=1e-3 python3 ./extra/gemm/simple_matmul.py
-      DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores
-      DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
+      DEBUG=2 EMULATE=AMD_RDNA4 FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
   - name: Test emulated CUDA tensor cores
     run: |
       DEBUG=2 EMULATE=CUDA FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
       DEBUG=2 EMULATE=CUDA ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
       DEBUG=2 EMULATE=CUDA_SM75 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
-      PYTHONPATH="." DEBUG=2 EMULATE=CUDA ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores
-      PYTHONPATH="." DEBUG=2 EMULATE=CUDA ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
+      PYTHONPATH="." DEBUG=2 EMULATE=CUDA ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
   - name: Test emulated INTEL OpenCL tensor cores
     run: DEBUG=2 EMULATE=INTEL FORWARD_ONLY=1 PYTHON=1 HALF=1 N=64 python3 ./extra/gemm/simple_matmul.py
-  - name: Full test tensor cores
-    run: |
-      DEBUG=2 EMULATE=METAL FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
-      DEBUG=2 EMULATE=AMD FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
-      DEBUG=2 EMULATE=CUDA ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
-      DEBUG=2 EMULATE=INTEL FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
-      DEBUG=2 AMX=1 EMULATE=AMX FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
+  - name: Test emulated AMX tensor cores
+    run: DEBUG=2 AMX=1 EMULATE=AMX FORWARD_ONLY=1 PYTHON=1 python3 test/opt/test_tensor_cores.py
   - name: Test device flop counts
     run: |
       DEBUG=2 EMULATE=METAL PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf
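For reference, a minimal sketch of how the workflow steps above drive the emulated backends: PYTHON=1 selects the pure-Python emulated device and EMULATE picks the tensor-core flavor it models (env names and the test path are taken from the workflow lines above):

import os, subprocess
# run the relocated tensor-core tests under the emulated METAL backend
env = dict(os.environ, DEBUG="2", EMULATE="METAL", FORWARD_ONLY="1", PYTHON="1")
subprocess.run(["python3", "test/opt/test_tensor_cores.py"], env=env, check=True)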
@@ -176,54 +176,5 @@ class TestFloat4(unittest.TestCase):
       count = TestFloat4.count_half4(program.uops)
       assert count == expected, f"{count=}, {expected=}"
 
-  @unittest.skip("this doesn't happen anymore")
-  def test_float4_acc(self):
-    # from float32 stable diffusion red tinybox
-    ast = UOp(Ops.SINK, dtypes.void, arg=None, src=(
-      UOp(Ops.STORE, dtypes.void, arg=None, src=(
-        UOp(Ops.VIEW, dtypes.float.ptr(33554432), arg=ShapeTracker(views=(View(shape=(1, 1, 128, 512, 512, 1, 1, 1), strides=(0, 0, 262144, 512, 1, 0, 0, 0), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501
-          UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(33554432), arg=0, src=()),)),
-        UOp(Ops.ADD, dtypes.float, arg=None, src=(
-          UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (5, 6, 7)), src=(
-            UOp(Ops.MUL, dtypes.float, arg=None, src=(
-              UOp(Ops.LOAD, dtypes.float, arg=None, src=(
-                UOp(Ops.VIEW, dtypes.float.ptr(67108864), arg=ShapeTracker(views=(View(shape=(1, 1, 1, 256, 4, 514, 4, 514), strides=(0, 0, 0, 262144, 0, 512, 0, 1), offset=-513, mask=((0, 1), (0, 1), (0, 1), (0, 256), (0, 4), (1, 513), (0, 4), (1, 513)), contiguous=False), View(shape=(1, 1, 128, 512, 512, 256, 3, 3), strides=(0, 0, 0, 2056, 1, 4227136, 1058840, 515), offset=0, mask=None, contiguous=False))), src=( # noqa: E501
-                  UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(67108864), arg=1, src=()),)),)),
-              UOp(Ops.LOAD, dtypes.float, arg=None, src=(
-                UOp(Ops.VIEW, dtypes.float.ptr(294912), arg=ShapeTracker(views=(View(shape=(1, 1, 128, 512, 512, 256, 3, 3), strides=(0, 0, 2304, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501
-                  UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(294912), arg=2, src=()),)),)),)),)),
-          UOp(Ops.LOAD, dtypes.float, arg=None, src=(
-            UOp(Ops.VIEW, dtypes.float.ptr(128), arg=ShapeTracker(views=(View(shape=(1, 1, 128, 512, 512, 1, 1, 1), strides=(0, 0, 1, 0, 0, 0, 0, 0), offset=0, mask=None, contiguous=False),)), src=( # noqa: E501
-              UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(128), arg=3, src=()),)),)),)),)),))
-
-    for expected, opts in [
-      (1, [Opt(op=OptOps.UPCAST, axis=2, arg=4)]),
-      (4, [Opt(op=OptOps.UPCAST, axis=2, arg=4), Opt(op=OptOps.UPCAST, axis=0, arg=4)]),
-    ]:
-      program = get_program(ast, Device[Device.DEFAULT].renderer, opts=opts)
-      count = len([uop for uop in program.uops if uop.op is Ops.DEFINE_REG and uop.dtype == dtypes.float.vec(4)])
-      assert count == expected, f"{count=}, {expected=}"
-
-  @unittest.skip("this doesn't happen anymore")
-  def test_float2_acc(self):
-    # from resnet
-    ast = UOp(Ops.SINK, dtypes.void, arg=None, src=(
-      UOp(Ops.STORE, dtypes.void, arg=None, src=(
-        UOp(Ops.VIEW, dtypes.half.ptr(212926464), arg=ShapeTracker(views=(View(shape=(1, 256, 1, 64, 1, 114, 1, 114), strides=(0, 831744, 0, 12996, 0, 114, 0, 1), offset=0, mask=None, contiguous=True),)), src=( # noqa: E501
-          UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(212926464), arg=0, src=()),)),
-        UOp(Ops.CAST, dtypes.half, arg=None, src=(
-          UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (4, 6)), src=(
-            UOp(Ops.CAST, dtypes.float, arg=None, src=(
-              UOp(Ops.LOAD, dtypes.half, arg=None, src=(
-                UOp(Ops.VIEW, dtypes.half.ptr(462422016), arg=ShapeTracker(views=(View(shape=(256, 64, 3, 56, 2, 3, 56, 2), strides=(1806336, 28224, 3, 504, 0, 1, 9, 0), offset=0, mask=((0, 256), (0, 64), (0, 3), (0, 56), (0, 1), (0, 3), (0, 56), (0, 1)), contiguous=False), View(shape=(256, 64, 3, 115, 3, 115), strides=(7225344, 112896, 37632, 336, 112, 1), offset=0, mask=((0, 256), (0, 64), (0, 3), (0, 112), (0, 3), (0, 112)), contiguous=False), View(shape=(256, 64, 456, 456), strides=(7617600, 119025, 345, 1), offset=0, mask=((0, 256), (0, 64), (0, 345), (0, 345)), contiguous=False), View(shape=(1, 256, 1, 64, 4, 114, 4, 114), strides=(0, 13307904, 0, 207936, 51984, 456, 114, 1), offset=0, mask=None, contiguous=True))), src=( # noqa: E501
-                  UOp(Ops.DEFINE_GLOBAL, dtypes.half.ptr(462422016), arg=1, src=()),)),)),)),)),)),)),))
-    for expected, opts in [
-      (16, [Opt(op=OptOps.LOCAL, axis=1, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=2, arg=2), Opt(op=OptOps.LOCAL, axis=2, arg=3), Opt(op=OptOps.UPCAST, axis=3, arg=4)]), # noqa: E501
-      (4, [Opt(op=OptOps.LOCAL, axis=1, arg=16), Opt(op=OptOps.UPCAST, axis=1, arg=0), Opt(op=OptOps.UPCAST, axis=2, arg=2)]),
-    ]:
-      program = get_program(ast, Device[Device.DEFAULT].renderer, opts=opts)
-      count = len([uop for uop in program.uops if uop.op is Ops.DEFINE_REG and uop.dtype == dtypes.float.vec(2)])
-      assert count == expected, f"{count=}, {expected=}"
 
 if __name__ == '__main__':
   unittest.main()
test/opt/test_hand_coded_opts.py (new file): 89 lines
@@ -0,0 +1,89 @@
+import unittest
+from tinygrad import Tensor, Device
+from tinygrad.helpers import Context, prod
+from tinygrad.uop.ops import AxisType
+from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
+
+# TODO: remove this
+from tinygrad.codegen.opt.kernel import Kernel
+from test.test_linearizer import push_views, helper_linearizer_opt
+
+class TestHandCodedOpts(unittest.TestCase):
+  def test_masked_upcast(self):
+    layer_1 = Tensor.cat(*[Tensor.empty(5) for _ in range(4)])
+    layer_2 = Tensor.cat(layer_1.unsqueeze(0), Tensor.empty(6, 20))
+
+    s = layer_2.schedule()[-1]
+    k = Kernel(push_views(s.ast))
+    k.apply_opts(hand_coded_optimizations(k))
+    assert len(k.bufs) == 6 # make sure all ops are done in one kernel
+    # masked upcast should upcast masked axis of size 7
+    # masked upcast should not upcast large (20) last axis
+    # float4/other hcopt shouldn't upcast last axis, since we already have 7 upcast, and the last axis is not very contiguous
+    assert k.upcasted == 1 and k.full_shape[-1] == 7
+
+  @unittest.skipIf(Device.DEFAULT in {"METAL", "WEBGPU"}, "METAL/WEBGPU split this kernel since it has 37 buffers")
+  def test_masked_upcast_wino(self):
+    monster = Tensor.stack(*[Tensor.stack(*[Tensor.empty(16) for _ in range(6)]) for _ in range(6)])
+
+    s = monster.schedule()[-1]
+    k = Kernel(push_views(s.ast))
+    k.apply_opts(hand_coded_optimizations(k))
+    assert len(k.bufs) == 37 # make sure all ops are done in one kernel
+    # should upcast the two Tensor.stacks
+    assert k.upcasted >= 2 and k.full_shape[k.shape_len-k.upcasted:k.shape_len].count(6) == 2
+
+  def test_masked_upcast_wino_full(self):
+    with Context(WINO=1):
+      x,w = Tensor.rand(1,4,8,8, requires_grad=True).realize(), Tensor.rand(4,4,3,3, requires_grad=True).realize()
+      out = Tensor.conv2d(x,w, padding=1)
+      out.mean().backward()
+
+      upcasts = []
+      wino_schedule = out.schedule()
+      # collect upcasts of tile transform kernels
+      for i, si in enumerate(wino_schedule):
+        k = Kernel(push_views(si.ast))
+        k.apply_opts(hand_coded_optimizations(k))
+        if k.reduceop is not None: continue # not a tile transform kernel (there is a gemm reduce kernel)
+        if len(k.bufs) < 22: continue # not a tile transform kernel (there's a permute kernel at the end)
+        upcasts.append(tuple(k.full_shape[k.shape_len - k.upcasted:k.shape_len]))
+      assert len(upcasts) == 3 # 3 transformation matrices
+      assert len(wino_schedule) <= 4 # 4 kernels
+      # this test case's inputs are too small, so one of the 4-stacks became a local, which is fine i guess
+      assert upcasts.count((6, 6)) == 2 #and upcasts.count((4, 4)) == 1
+
+      backward_schedule = Tensor.schedule(x.grad, w.grad)
+      for si in backward_schedule:
+        k = Kernel(push_views(si.ast))
+        k.apply_opts(hand_coded_optimizations(k))
+        if len(k.bufs) < 20: continue # not a tile transform kernel
+        # heuristic number to make sure that at least some upcasts but not too many upcasts are being done
+        assert 6 <= prod(k.full_shape[k.shape_len - k.upcasted:k.shape_len]) <= 216
+      assert len(backward_schedule) <= 13 # just the current number, but it could be better
+
+  def test_masked_upcast_many(self):
+    layer_1 = Tensor.cat(Tensor.rand(3, 4), Tensor.rand(4, 4))
+    layer_2 = Tensor.cat(layer_1.unsqueeze(0), Tensor.rand(6, 7, 4))
+    layer_3 = Tensor.cat(layer_2.unsqueeze(0), Tensor.rand(6, 7, 7, 4))
+
+    k = helper_linearizer_opt(layer_3)[-1]
+    assert len(k.bufs) == 5 # make sure all ops are done in one kernel
+    # check that we don't do too many upcasts
+    assert prod(k.full_shape[k.shape_len-k.upcasted:k.shape_len]) <= 49
+
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
+  def test_matvec(self):
+    N = 128
+    a = Tensor.rand(1, N).realize()
+    b = Tensor.rand(N, N).realize()
+    c = a @ b
+
+    k = helper_linearizer_opt(c)[-1]
+
+    assert k.group_for_reduces == 1
+    assert k.axis_types.count(AxisType.LOCAL) == 1
+    assert k.upcasted == 1
+
+if __name__ == '__main__':
+  unittest.main()
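The tests above all share one flow: build a Kernel from the last scheduled AST, apply the heuristic opts, then assert on the kernel's shape bookkeeping. A condensed sketch of that flow using the same imports as the file (the 16x16 matmul is an arbitrary stand-in, not from the tests):

from tinygrad import Tensor
from tinygrad.codegen.opt.kernel import Kernel
from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
from test.test_linearizer import push_views

si = (Tensor.empty(16, 16) @ Tensor.empty(16, 16)).schedule()[-1]  # last kernel of the schedule
k = Kernel(push_views(si.ast))
k.apply_opts(hand_coded_optimizations(k))  # the same call every test above makes
print(k.applied_opts, k.upcasted, k.full_shape)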
@@ -322,5 +322,19 @@ class TestKernelOpts(unittest.TestCase):
     ]
     helper_linearizer_opt(r, [x[0] for x in opts_shapes], color_sizes=[x[1] for x in opts_shapes])
 
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
+  def test_arange_opts(self):
+    a = Tensor.arange(128)
+    helper_linearizer_opt(a, [
+      [Opt(OptOps.GROUP, 0, 32)],
+      [Opt(OptOps.GROUPTOP, 0, 32)],
+      [Opt(op=OptOps.LOCAL, axis=0, arg=8)],
+      [Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0)],
+      [Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8)],
+      [Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UNROLL, axis=1, arg=4)], # noqa: E501
+    ])
+
 if __name__ == '__main__':
   unittest.main()
test/opt/test_tensor_cores.py (new file): 188 lines
@@ -0,0 +1,188 @@
+import numpy as np
+import unittest
+from dataclasses import replace
+
+from tinygrad import Device, Tensor, dtypes
+from tinygrad.tensor import _to_np_dtype
+from tinygrad.uop.ops import Ops
+from tinygrad.dtype import DType
+from tinygrad.device import is_dtype_supported
+from tinygrad.helpers import AMX, CI, AMD_LLVM
+from tinygrad.engine.realize import CompiledRunner, get_program
+from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
+
+# TODO: write a clean version of this
+from test.test_linearizer import helper_realized_ast, helper_linearizer_opt
+
+def helper_tc_ensure_uops_and_opts_count(N: int, M:int, K:int, dtype_in:DType, dtype_out:DType, axis:int=0, tc_select:int=-1, tc_opt:int=0,
+                                         ensure_triggered:bool=True):
+  a, b = Tensor.rand(M, K, dtype=dtype_in), Tensor.rand(K, N, dtype=dtype_in)
+  r = a.matmul(b, dtype=dtype_out)
+  sched = r.schedule()
+  realized_ast = sched[-1].ast
+  opts_to_apply = [Opt(OptOps.TC, axis, (tc_select, tc_opt, 1))]
+
+  if ensure_triggered:
+    program = get_program(realized_ast, Device[Device.DEFAULT].renderer, opts=opts_to_apply)
+    wmmas = len([uop for uop in program.uops if uop.op is Ops.WMMA])
+    tcs = len([x for x in program.applied_opts if x.op is OptOps.TC])
+    assert wmmas > 0, "tensor core not triggered"
+    assert tcs == 1, "tensor core opt not included"
+  else:
+    try:
+      program = get_program(realized_ast, Device[Device.DEFAULT].renderer, opts=opts_to_apply)
+      assert False, "OptOps.TC triggered, expected KernelOptError"
+    except KernelOptError: pass
+
+def helper_tc_allclose(N:int, M:int, K:int, dtype_in:DType, dtype_out:DType, axis:int=0, tc_select:int=-1, tc_opt:int=0, use_tensor_cores:int=1):
+  a, b = Tensor.rand(M, K, dtype=dtype_in), Tensor.rand(K, N, dtype=dtype_in)
+  np_a, np_b = a.numpy(), b.numpy()
+  r = a.matmul(b, dtype=dtype_out)
+  if dtype_in == dtypes.bfloat16: r = r.float()
+  realized_ast, bufs = helper_realized_ast(r)
+  opts = [Opt(op=OptOps.TC, axis=axis, arg=(tc_select, tc_opt, use_tensor_cores))]
+  prg = CompiledRunner(replace(get_program(realized_ast, opts=opts), device=Device.DEFAULT))
+  if use_tensor_cores == 1: assert len([uop for uop in prg.p.uops if uop.op is Ops.WMMA]) > 0, "wmma not triggered"
+  assert len([x for x in prg.p.uops[-1].arg.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
+  prg.exec(bufs)
+  if dtype_in == dtypes.half: tc_atol, tc_rtol = 1e-2, 1e-3
+  elif dtype_in == dtypes.bfloat16: tc_atol, tc_rtol = 1e-2, 1e-2
+  else: tc_atol, tc_rtol = 5e-3, 1e-4
+  c = bufs[0].numpy().reshape((M,N))
+  np.testing.assert_allclose(c, np_a @ np_b, atol=tc_atol, rtol=tc_rtol)
+
+class TestTensorCores(unittest.TestCase):
+  # TODO: don't skip bf16 for real device (METAL, AMD)
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
+  def test_tensor_cores(self):
+    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
+      if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
+      # for AMX, tc.dims[2] == 1 so reduceop is None thus tensor_cores are not triggered
+      helper_tc_allclose(tc.dims[0], tc.dims[1], 2 if AMX else tc.dims[2], tc.dtype_in, tc.dtype_out, axis=0, tc_opt=0)
+
+  @unittest.skipIf(Device.DEFAULT == "PYTHON", "not generated on EMULATED device")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
+  def test_tensor_cores_codegen(self):
+    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
+      if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
+      n, m, k = tc.dims[0], tc.dims[1], 2 if AMX else tc.dims[2]
+      a, b = Tensor.rand(m, k, dtype=tc.dtype_in), Tensor.rand(k, n, dtype=tc.dtype_in)
+      r = a.matmul(b, dtype=tc.dtype_out)
+      prg = get_program(r.schedule()[-1].ast, opts=[Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))])
+      if Device.DEFAULT == "LLVM":
+        assert "0x201000" in prg.src
+      elif Device.DEFAULT == "AMD" and AMD_LLVM:
+        assert "@llvm.amdgcn.wmma" in prg.src
+      elif Device[Device.DEFAULT].renderer.suffix == "PTX":
+        assert "mma.sync.aligned" in prg.src
+      else:
+        assert "__WMMA_" in prg.src
+
+  @unittest.skipIf((Device.DEFAULT == "AMD") or (Device.DEFAULT == "PYTHON" and Device.default.renderer.device == "AMD"), "broken for AMD")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
+  def test_tensor_cores_padded(self):
+    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
+      if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
+      helper_tc_allclose(tc.dims[0]+(pad:=1), tc.dims[1]+pad, tc.dims[2]+pad, tc.dtype_in, tc.dtype_out, tc_opt=2)
+
+  # AMD compiler bug: AMD miscompiles non-zero padded tc kernels with -O3, producing wrong results, nans or hang (see #9606)
+  # Internal bug: zero-stride dimensions combined with a mask may produce wrong index/valid for pad == 1 on AMD
+  @unittest.skipUnless((Device.DEFAULT == "AMD") or (Device.DEFAULT == "PYTHON" and Device.default.renderer.device == "AMD"), "test for AMD's tc")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
+  @unittest.skip("warp elements not duplicated properly across lanes")
+  def test_tensor_cores_padded_amd(self):
+    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
+      if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
+      helper_tc_allclose(tc.dims[0]+(pad:=1), tc.dims[1]+pad, tc.dims[2]+pad, tc.dtype_in, tc.dtype_out, tc_opt=2)
+
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
+  def test_tensor_cores_padded_uops(self):
+    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
+      pad = 1
+
+      # check that TC is triggered for TC_OPT=2
+      helper_tc_ensure_uops_and_opts_count(tc.dims[0]+pad, tc.dims[1]+pad, tc.dims[2]+pad,
+                                           tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=True)
+
+      # check that TC is not triggered for TC_OPT<2
+      helper_tc_ensure_uops_and_opts_count(tc.dims[0]+pad, tc.dims[1]+pad, tc.dims[2]+pad,
+                                           tc.dtype_in, tc.dtype_out, tc_opt=1, ensure_triggered=False)
+      helper_tc_ensure_uops_and_opts_count(tc.dims[0]+pad, tc.dims[1]+pad, tc.dims[2]+pad,
+                                           tc.dtype_in, tc.dtype_out, tc_opt=0, ensure_triggered=False)
+
+      # check excessive padding doesn't trigger padded TC in TC_OPT=2
+      helper_tc_ensure_uops_and_opts_count(tc.dims[0]//4, tc.dims[1], tc.dims[2], tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)
+      helper_tc_ensure_uops_and_opts_count(tc.dims[0], tc.dims[1]//4, tc.dims[2], tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)
+      if not AMX: # AMX tc.dims[2] == 1
+        helper_tc_ensure_uops_and_opts_count(tc.dims[0], tc.dims[1], tc.dims[2]//4, tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)
+
+  @unittest.skipIf(Device.DEFAULT == "PYTHON", "not generated on EMULATED device")
+  @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI is really slow here")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
+  def test_tensor_cores_multi_reduce(self):
+    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
+      if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
+      if tc.dtype_in is dtypes.bfloat16: continue # <-- broken with numpy
+      # this will be a M=G16, N=G32, M=G16, M=G16, K=R16, K=R16, K=R16 with 9 choices of TC MNK axes
+      golden_result = None
+      for axis in range(9):
+        a = Tensor.rand(16, 16, 29, 29, dtype=tc.dtype_in).realize()
+        b = Tensor.rand(32, 16, 16, 16, dtype=tc.dtype_in).realize()
+        c = a.conv2d(b, padding=1, dtype=tc.dtype_out)
+        realized_ast, real_bufs = helper_realized_ast(c)
+
+        program = get_program(realized_ast, Device[Device.DEFAULT].renderer, opts=[Opt(OptOps.TC, axis, (-1, 2, 1))])
+        assert len([uop for uop in program.uops if uop.op is Ops.WMMA]) > 0, "tensor core not triggered"
+        assert len([x for x in program.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
+
+        prg = CompiledRunner(program)
+        # TODO: support this even if numpy doesn't
+        if _to_np_dtype(real_bufs[0].dtype) is None: continue
+        real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=_to_np_dtype(real_bufs[0].dtype)).data) # Zero to check that all values are filled
+        prg.exec(real_bufs)
+        result = np.frombuffer(real_bufs[0].as_buffer(), _to_np_dtype(real_bufs[0].dtype))
+
+        # ensure the results for each choice of axis matches
+        if golden_result is None: golden_result = np.frombuffer(real_bufs[0].as_buffer(), _to_np_dtype(real_bufs[0].dtype))
+        np.testing.assert_allclose(result, golden_result, atol=0.1, rtol=0.2)
+
+  @unittest.skipIf(Device.DEFAULT == "PYTHON", "slow on EMULATED device")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
+  def test_tensor_cores_unroll_phi(self):
+    tc = Device[Device.DEFAULT].renderer.tensor_cores[0]
+    x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in)
+    r = x.matmul(y, dtype=tc.dtype_out)
+    k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1]
+    for u in get_program(k.ast, k.opts, k.applied_opts).uops:
+      if u.op is Ops.WMMA:
+        assert u.src[-1].src[0].op != Ops.STORE
+
+  @unittest.skipIf(Device.DEFAULT == "PYTHON", "slow on EMULATED device")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
+  @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"}, "CPU does not support using a different type for accumulation")
+  def test_tensor_cores_unroll_casted_phi(self):
+    tc = [tc for tc in Device[Device.DEFAULT].renderer.tensor_cores if tc.dtype_in != tc.dtype_out][0]
+    x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in)
+    r = x.matmul(y, dtype=tc.dtype_out)
+    k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1]
+    for u in get_program(k.ast, k.opts, k.applied_opts).uops:
+      if u.op is Ops.WMMA:
+        #assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2]))
+        assert u.src[-1].src[0].op != Ops.STORE
+
+  @unittest.skipIf(Device.DEFAULT == "PYTHON", "slow on EMULATED device")
+  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
+  @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"}, "CPU does not support using a different type for accumulation")
+  def test_tensor_cores_unroll_casted_phi_with_children(self):
+    # all STORE children are outside the loop
+    tc = [tc for tc in Device[Device.DEFAULT].renderer.tensor_cores if tc.dtype_in != tc.dtype_out][0]
+    x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in)
+    r = x.matmul(y, dtype=tc.dtype_out).relu()
+    k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1]
+    for u in get_program(k.ast, k.opts, k.applied_opts).uops:
+      if u.op is Ops.WMMA:
+        #assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2]))
+        assert u.src[-1].src[0].op != Ops.STORE
+
+if __name__ == '__main__':
+  unittest.main()
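Both helpers above reduce to the same pattern: schedule a matmul, lower it with a single Opt(OptOps.TC, ...), and inspect the resulting uops. A condensed sketch using the same calls as the file (needs a device whose renderer lists tensor cores; for AMX the K-dim caveat noted in test_tensor_cores applies):

from tinygrad import Tensor, Device
from tinygrad.uop.ops import Ops
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad.engine.realize import get_program

tc = Device[Device.DEFAULT].renderer.tensor_cores[0]
# helper convention above: N=dims[0], M=dims[1], K=dims[2]; a is (M, K), b is (K, N)
a = Tensor.rand(tc.dims[1], tc.dims[2], dtype=tc.dtype_in)
b = Tensor.rand(tc.dims[2], tc.dims[0], dtype=tc.dtype_in)
ast = a.matmul(b, dtype=tc.dtype_out).schedule()[-1].ast
program = get_program(ast, Device[Device.DEFAULT].renderer, opts=[Opt(OptOps.TC, 0, (-1, 2, 1))])
assert any(u.op is Ops.WMMA for u in program.uops)  # a tensor core kernel was generated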
@@ -2,19 +2,22 @@ import numpy as np
 import unittest
 from dataclasses import replace
 
-from tinygrad.codegen.opt.kernel import Opt, OptOps, KernelOptError, Kernel, AxisType
+from tinygrad.codegen.opt import Opt, OptOps
 from tinygrad.codegen.gpudims import get_grouped_dims
-from tinygrad.uop.ops import UOp, Ops, GroupOp, KernelInfo
+from tinygrad.uop.ops import UOp, Ops, GroupOp
 from tinygrad.device import Device, Buffer, is_dtype_supported
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
 from tinygrad.tensor import Tensor, _to_np_dtype
 from tinygrad.engine.realize import run_schedule, lower_schedule, CompiledRunner, get_program
-from tinygrad.codegen.opt.heuristic import hand_coded_optimizations
-from tinygrad.helpers import prod, Context, getenv, CI, flatten, dedup, AMX, AMD_LLVM, TC_SELECT, TC_OPT
+from tinygrad.helpers import Context, getenv, flatten, dedup, TC_SELECT, TC_OPT
 from tinygrad.dtype import DType, dtypes, PtrDType, AddrSpace
 from tinygrad.codegen import apply_rewrites, rewrites_for_views
 
+# TODO: remove this
+from tinygrad.codegen.opt.kernel import Kernel
+
 def push_views(ast): return apply_rewrites(ast, rewrites_for_views)
 
 def helper_realized_ast(r:Tensor|list[Tensor]) -> tuple[UOp, list[Buffer]]:
@@ -27,44 +30,6 @@ def helper_realized_ast(r:Tensor|list[Tensor]) -> tuple[UOp, list[Buffer]]:
   bufs = [Buffer((x).device, x.size, x.dtype).allocate() if i < len(s[-1].ast.src) else x for i,x in enumerate(s[-1].bufs)]
   return push_views(s[-1].ast), bufs
 
-def helper_tc_allclose(N:int, M:int, K:int, dtype_in:DType, dtype_out:DType, axis:int=0, tc_select:int=-1, tc_opt:int=0, use_tensor_cores:int=1):
-  a, b = Tensor.rand(M, K, dtype=dtype_in), Tensor.rand(K, N, dtype=dtype_in)
-  np_a, np_b = a.numpy(), b.numpy()
-  r = a.matmul(b, dtype=dtype_out)
-  if dtype_in == dtypes.bfloat16: r = r.float()
-  realized_ast, bufs = helper_realized_ast(r)
-  opts = [Opt(op=OptOps.TC, axis=axis, arg=(tc_select, tc_opt, use_tensor_cores))]
-  prg = CompiledRunner(replace(get_program(realized_ast, opts=opts), device=Device.DEFAULT))
-  if use_tensor_cores == 1: assert len([uop for uop in prg.p.uops if uop.op is Ops.WMMA]) > 0, "wmma not triggered"
-  assert len([x for x in prg.p.uops[-1].arg.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
-  prg.exec(bufs)
-  if dtype_in == dtypes.half: tc_atol, tc_rtol = 1e-2, 1e-3
-  elif dtype_in == dtypes.bfloat16: tc_atol, tc_rtol = 1e-2, 1e-2
-  else: tc_atol, tc_rtol = 5e-3, 1e-4
-  c = bufs[0].numpy().reshape((M,N))
-  np.testing.assert_allclose(c, np_a @ np_b, atol=tc_atol, rtol=tc_rtol)
-
-def helper_tc_ensure_uops_and_opts_count(N: int, M:int, K:int, dtype_in:DType, dtype_out:DType, axis:int=0, tc_select:int=-1, tc_opt:int=0,
-                                         ensure_triggered:bool=True):
-  a, b = Tensor.rand(M, K, dtype=dtype_in), Tensor.rand(K, N, dtype=dtype_in)
-  r = a.matmul(b, dtype=dtype_out)
-  sched = r.schedule()
-  realized_ast = sched[-1].ast
-  opts_to_apply = [Opt(OptOps.TC, axis, (tc_select, tc_opt, 1))]
-  realized_ast = realized_ast.replace(arg=KernelInfo(opts_to_apply=tuple(opts_to_apply)))
-
-  if ensure_triggered:
-    program = get_program(realized_ast, Device[Device.DEFAULT].renderer)
-    wmmas = len([uop for uop in program.uops if uop.op is Ops.WMMA])
-    tcs = len([x for x in program.applied_opts if x.op is OptOps.TC])
-    assert wmmas > 0, "tensor core not triggered"
-    assert tcs == 1, "tensor core opt not included"
-  else:
-    try:
-      program = get_program(realized_ast, Device[Device.DEFAULT].renderer)
-      assert False, "OptOps.TC triggered, expected KernelOptError"
-    except KernelOptError: pass
-
 class TestLinearizer(unittest.TestCase):
   def test_arg_dedup(self):
     # NOTE: this realize exists because Tensor.numpy calls .contiguous() internally
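Note the API shift the move carries with it: the deleted helper attached opts to the AST through KernelInfo, while the relocated copy in test/opt/test_tensor_cores.py passes them straight to get_program. A sketch of the new calling convention (the 8x8 matmul and the UPCAST opt are illustrative stand-ins, not from the diff):

from tinygrad import Tensor, Device
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad.engine.realize import get_program

ast = (Tensor.rand(8, 8) @ Tensor.rand(8, 8)).schedule()[-1].ast
# old style (removed above): get_program(ast.replace(arg=KernelInfo(opts_to_apply=opts)), renderer)
# new style: the opts go directly to get_program
program = get_program(ast, Device[Device.DEFAULT].renderer, opts=(Opt(OptOps.UPCAST, 0, 4),))
print(program.applied_opts)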
@@ -116,16 +81,6 @@ class TestLinearizer(unittest.TestCase):
       if skip and i in skip: continue
       assert ranges[i-1] != u, f"multireduce nested the ranges! {ranges[i-1], {u}}"
 
-  @unittest.skip("broken. should not depends on push_views and implementation details of getitem")
-  @unittest.skipIf(CI and Device.DEFAULT in {"PTX", "AMD", "NV"}, "very slow")
-  def test_indexing_multireduce(self):
-    dataset = Tensor.rand(16384, 256).realize()
-    idxs = Tensor([0,3,5,6]).realize()
-    with Context(FUSE_ARANGE=1):
-      sink = dataset[idxs].contiguous().kernelize().uop.base.src[1].arg.ast
-    real_index = dataset.numpy()[idxs.numpy()].reshape(4, 256, 1, 1)
-    helper_linearizer_ast(push_views(sink), [dataset, idxs], wanna_output=[real_index])
-
   def test_two_nested_range(self):
     a = Tensor.randn(2, ).realize()
     out = a.reshape(2, 1).expand(2, 3).sum()
@@ -133,8 +88,6 @@ class TestLinearizer(unittest.TestCase):
     uops = get_program(lin.get_optimized_ast(), lin.opts).uops
     ranges = [i for i,u in enumerate(uops) if u.op is Ops.RANGE]
     assert len(ranges) == 1 # NOTE: it collapses now
-    # RANGE -> LOAD -> RANGE -> STORE
-    #assert any(x.op is Ops.LOAD for x in uops[ranges[0]:ranges[1]])
 
   def test_three_nested_range(self):
     a = Tensor.randn(2, ).realize()
@@ -143,10 +96,6 @@ class TestLinearizer(unittest.TestCase):
     uops = get_program(lin.get_optimized_ast(), lin.opts).uops
     ranges = [i for i,u in enumerate(uops) if u.op is Ops.RANGE]
     assert len(ranges) == 1 # NOTE: it collapses now
-    # RANGE -> RANGE -> LOAD -> RANGE -> STORE
-    # NOTE: nothing should toposort between the first two ranges
-    #assert ranges[0]+1 == ranges[1]
-    #assert any(x.op is Ops.LOAD for x in uops[ranges[1]:ranges[2]])
 
   def test_two_nested_range_alt_indexing(self):
     a = Tensor([2, 2]).realize()
@@ -177,38 +126,6 @@ class TestLinearizer(unittest.TestCase):
     uops = get_program(lin.get_optimized_ast(), lin.opts).uops
     ranges = [i for i,u in enumerate(uops) if u.op is Ops.RANGE]
     assert len(ranges) == 1 # NOTE: it collapses now
-    #if getenv("PTX"):
-      # LOAD -> RANGE -> CAST -> ALU -> ALU -> LOAD -> ALU -> RANGE -> ALU -> STORE
-    #  assert uops[ranges[0]-2].op is Ops.LOAD
-    #  assert ranges[1] == ranges[0]+6
-    #  assert [x.op for x in uops[ranges[1]-2:ranges[1]]] == [Ops.LOAD, Ops.ALU]
-      # LOAD -> RANGE -> LOAD -> ALU -> RANGE -> STORE
-    #else:
-    #  assert uops[ranges[0]-2].op is Ops.LOAD
-    #  assert ranges[1] == ranges[0]+3
-    #  assert [x.op for x in uops[ranges[1]-2:ranges[1]]] == [Ops.LOAD, Ops.ALU]
-
-  @unittest.skip("fragile crap")
-  def test_range_outer_op_after_phi(self):
-    a = Tensor.randn(4, 1).realize()
-    out = a.sum() * a.sum()
-    lin = helper_linearizer_opt(out, wanna_output=[a.numpy().sum()*a.numpy().sum()])[0]
-    uops = get_program(lin.get_optimized_ast(), lin.opts).uops
-    # RANGE -> LOAD -> STORE -> ALU
-    end = max(i for i,u in enumerate(uops) if u.op is Ops.ENDRANGE)
-    # the INDEX can be first
-    assert uops[end+1].op in GroupOp.ALU or uops[end+2].op in GroupOp.ALU
-
-  @unittest.skip("fragile crap")
-  def test_range_outer_op_after_phi_nested_range(self):
-    a = Tensor.randn(2, ).realize()
-    out = a.reshape(2, 1).expand(2, 3).sum() + a.reshape(2, 1).expand(2, 3).sum()
-    lin = helper_linearizer_opt(out, wanna_output=[(np.broadcast_to(a.numpy().reshape(2, 1), (2, 3))).sum()*2])[0]
-    uops = get_program(lin.get_optimized_ast(), lin.opts).uops
-    # RANGE -> LOAD -> STORE -> ALU
-    end = max(i for i,u in enumerate(uops) if u.op is Ops.ENDRANGE)
-    # the INDEX can be first
-    assert uops[end+1].op in GroupOp.ALU or uops[end+2].op in GroupOp.ALU
-
   def test_load_dedup(self):
     # for different leaves in the AST, the same loads may occur.
@@ -312,142 +229,6 @@ class TestLinearizer(unittest.TestCase):
     d, w = Tensor.rand(4, 8, 8, 8, dtype=tensor_dtype), Tensor.rand(8, 8, 2, 2, dtype=tensor_dtype)
     helper_arg_acc_dtype(d.conv2d(w, dtype=acc_dtype), expected_dtype)
 
-  # TODO: don't skip bf16 for real device (METAL, AMD)
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
-  def test_tensor_cores(self):
-    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
-      if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
-      # for AMX, tc.dims[2] == 1 so reduceop is None thus tensor_cores are not triggered
-      helper_tc_allclose(tc.dims[0], tc.dims[1], 2 if AMX else tc.dims[2], tc.dtype_in, tc.dtype_out, axis=0, tc_opt=0)
-
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
-  def test_tensor_cores_codegen(self):
-    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
-      if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
-      n, m, k = tc.dims[0], tc.dims[1], 2 if AMX else tc.dims[2]
-      a, b = Tensor.rand(m, k, dtype=tc.dtype_in), Tensor.rand(k, n, dtype=tc.dtype_in)
-      r = a.matmul(b, dtype=tc.dtype_out)
-      prg = get_program(r.schedule()[-1].ast, opts=[Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))])
-      if Device.DEFAULT == "LLVM":
-        assert "0x201000" in prg.src
-      elif Device.DEFAULT == "AMD" and AMD_LLVM:
-        assert "@llvm.amdgcn.wmma" in prg.src
-      elif Device[Device.DEFAULT].renderer.suffix == "PTX":
-        assert "mma.sync.aligned" in prg.src
-      else:
-        assert "__WMMA_" in prg.src
-
-  @unittest.skipIf((Device.DEFAULT == "AMD") or (Device.DEFAULT == "PYTHON" and Device.default.renderer.device == "AMD"), "broken for AMD")
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
-  def test_tensor_cores_padded(self):
-    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
-      if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
-      helper_tc_allclose(tc.dims[0]+(pad:=1), tc.dims[1]+pad, tc.dims[2]+pad, tc.dtype_in, tc.dtype_out, tc_opt=2)
-
-  # AMD compiler bug: AMD miscompiles non-zero padded tc kernels with -O3, producing wrong results, nans or hang (see #9606)
-  # Internal bug: zero-stride dimensions combined with a mask may produce wrong index/valid for pad == 1 on AMD
-  @unittest.skipUnless((Device.DEFAULT == "AMD") or (Device.DEFAULT == "PYTHON" and Device.default.renderer.device == "AMD"), "test for AMD's tc")
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
-  @unittest.skip("warp elements not duplicated properly across lanes")
-  def test_tensor_cores_padded_amd(self):
-    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
-      if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
-      helper_tc_allclose(tc.dims[0]+(pad:=1), tc.dims[1]+pad, tc.dims[2]+pad, tc.dtype_in, tc.dtype_out, tc_opt=2)
-
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
-  def test_tensor_cores_padded_uops(self):
-    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
-      pad = 1
-
-      # check that TC is triggered for TC_OPT=2
-      helper_tc_ensure_uops_and_opts_count(tc.dims[0]+pad, tc.dims[1]+pad, tc.dims[2]+pad,
-                                           tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=True)
-
-      # check that TC is not triggered for TC_OPT<2
-      helper_tc_ensure_uops_and_opts_count(tc.dims[0]+pad, tc.dims[1]+pad, tc.dims[2]+pad,
-                                           tc.dtype_in, tc.dtype_out, tc_opt=1, ensure_triggered=False)
-      helper_tc_ensure_uops_and_opts_count(tc.dims[0]+pad, tc.dims[1]+pad, tc.dims[2]+pad,
-                                           tc.dtype_in, tc.dtype_out, tc_opt=0, ensure_triggered=False)
-
-      # check excessive padding doesn't trigger padded TC in TC_OPT=2
-      helper_tc_ensure_uops_and_opts_count(tc.dims[0]//4, tc.dims[1], tc.dims[2], tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)
-      helper_tc_ensure_uops_and_opts_count(tc.dims[0], tc.dims[1]//4, tc.dims[2], tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)
-      if not AMX: # AMX tc.dims[2] == 1
-        helper_tc_ensure_uops_and_opts_count(tc.dims[0], tc.dims[1], tc.dims[2]//4, tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)
-
-  @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI is really slow here")
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
-  def test_tensor_cores_multi_reduce(self):
-    for tc in Device[Device.DEFAULT].renderer.tensor_cores:
-      if not is_dtype_supported(tc.dtype_in) or not is_dtype_supported(tc.dtype_out): continue
-      if tc.dtype_in is dtypes.bfloat16: continue # <-- broken with numpy
-      # this will be a M=G16, N=G32, M=G16, M=G16, K=R16, K=R16, K=R16 with 9 choices of TC MNK axes
-      golden_result = None
-      for axis in range(9):
-        a = Tensor.rand(16, 16, 29, 29, dtype=tc.dtype_in).realize()
-        b = Tensor.rand(32, 16, 16, 16, dtype=tc.dtype_in).realize()
-        c = a.conv2d(b, padding=1, dtype=tc.dtype_out)
-        realized_ast, real_bufs = helper_realized_ast(c)
-
-        opts_to_apply = [Opt(OptOps.TC, axis, (-1, 2, 1))]
-        realized_ast = realized_ast.replace(arg=KernelInfo(opts_to_apply=tuple(opts_to_apply)))
-        program = get_program(realized_ast, Device[Device.DEFAULT].renderer)
-        assert len([uop for uop in program.uops if uop.op is Ops.WMMA]) > 0, "tensor core not triggered"
-        assert len([x for x in program.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
-
-        prg = CompiledRunner(program)
-        # TODO: support this even if numpy doesn't
-        if _to_np_dtype(real_bufs[0].dtype) is None: continue
-        real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=_to_np_dtype(real_bufs[0].dtype)).data) # Zero to check that all values are filled
-        prg.exec(real_bufs)
-        result = np.frombuffer(real_bufs[0].as_buffer(), _to_np_dtype(real_bufs[0].dtype))
-
-        # ensure the results for each choice of axis matches
-        if golden_result is None: golden_result = np.frombuffer(real_bufs[0].as_buffer(), _to_np_dtype(real_bufs[0].dtype))
-        np.testing.assert_allclose(result, golden_result, atol=0.1, rtol=0.2)
-
-      # check that get_kernel_actions produces all 9 options
-      from tinygrad.codegen.opt.search import get_kernel_actions
-      tc_actions = [k for i, k in get_kernel_actions(Kernel(realized_ast), False).items() if k.applied_opts[0].op == OptOps.TC]
-
-      available_tc = len([x for x in Device[Device.DEFAULT].renderer.tensor_cores if x.dtype_in == tc.dtype_in and x.dtype_out == tc.dtype_out])
-      assert len(tc_actions) == 9 * available_tc, f"should contain 9 possible TC actions for every available TC, got {len(tc_actions)}"
-
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
-  def test_tensor_cores_unroll_phi(self):
-    tc = Device[Device.DEFAULT].renderer.tensor_cores[0]
-    x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in)
-    r = x.matmul(y, dtype=tc.dtype_out)
-    k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1]
-    for u in get_program(k.ast, k.opts, k.applied_opts).uops:
-      if u.op is Ops.WMMA:
-        assert u.src[-1].src[0].op != Ops.STORE
-
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
-  @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"}, "CPU does not support using a different type for accumulation")
-  def test_tensor_cores_unroll_casted_phi(self):
-    tc = [tc for tc in Device[Device.DEFAULT].renderer.tensor_cores if tc.dtype_in != tc.dtype_out][0]
-    x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in)
-    r = x.matmul(y, dtype=tc.dtype_out)
-    k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1]
-    for u in get_program(k.ast, k.opts, k.applied_opts).uops:
-      if u.op is Ops.WMMA:
-        #assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2]))
-        assert u.src[-1].src[0].op != Ops.STORE
-
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
-  @unittest.skipIf(Device.DEFAULT in {"CPU", "LLVM"}, "CPU does not support using a different type for accumulation")
-  def test_tensor_cores_unroll_casted_phi_with_children(self):
-    # all STORE children are outside the loop
-    tc = [tc for tc in Device[Device.DEFAULT].renderer.tensor_cores if tc.dtype_in != tc.dtype_out][0]
-    x, y = Tensor.rand(128, 128, dtype=tc.dtype_in), Tensor.rand(128, 128, dtype=tc.dtype_in)
-    r = x.matmul(y, dtype=tc.dtype_out).relu()
-    k = helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 0, 4)]], apply_tc=True, atol=3e-2, rtol=1e-3)[-1]
-    for u in get_program(k.ast, k.opts, k.applied_opts).uops:
-      if u.op is Ops.WMMA:
-        #assert u.src[-1].dtype == dtypes.float.vec(prod(tc.thread_local_sizes[2]))
-        assert u.src[-1].src[0].op != Ops.STORE
-
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
   def test_simple_unroll_no_between_phi_dependencies(self):
     x, y = Tensor.rand(128, 128), Tensor.rand(128, 128)
@@ -577,9 +358,7 @@ class TestLinearizer(unittest.TestCase):
     sched_copy = sched[:]
     run_schedule(sched)
     np.testing.assert_equal(a.flatten().numpy(), [1.,1.,1.,1.,2.,2.,2.,2.,1.,1.,1.,1.,1.,1.,1.,1.])
-    realized_ast = sched_copy[-1].ast
-    realized_ast = realized_ast.replace(arg=KernelInfo(opts_to_apply=tuple()))
-    program = get_program(realized_ast, Device[Device.DEFAULT].renderer)
+    program = get_program(sched_copy[-1].ast, Device[Device.DEFAULT].renderer, opts=())
     assert not any(u.op == Ops.WHERE for u in program.uops), "found where where where should be folded"
 
   def test_phi_simplification(self):
@@ -624,20 +403,6 @@ class TestLinearizer(unittest.TestCase):
     for val in store_vals:
       assert val.dtype == dtypes.float.vec(4) # and val.op is not Ops.VECTORIZE
 
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
-  def test_arange_opts(self):
-    a = Tensor.arange(128)
-    helper_linearizer_opt(a, [
-      [Opt(OptOps.GROUP, 0, 32)],
-      [Opt(OptOps.GROUPTOP, 0, 32)],
-      [Opt(op=OptOps.LOCAL, axis=0, arg=8)],
-      [Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0)],
-      [Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8)],
-      [Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UNROLL, axis=1, arg=4)], # noqa: E501
-    ])
-
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
   def test_grouped_store_values(self):
     x = Tensor.randn((4,3,6,6)).realize()
@@ -723,82 +488,7 @@ class TestLinearizer(unittest.TestCase):
     out = [u for u in get_program(k.ast, k.opts, k.applied_opts).uops if u.op is Ops.STORE][0]
     assert out.src[1].op is Ops.VECTORIZE and out.src[1].dtype.count != 1
 
-class TestHandCodedOpts(unittest.TestCase):
-  def test_masked_upcast(self):
-    layer_1 = Tensor.cat(*[Tensor.empty(5) for _ in range(4)])
-    layer_2 = Tensor.cat(layer_1.unsqueeze(0), Tensor.empty(6, 20))
-
-    s = layer_2.schedule()[-1]
-    k = Kernel(push_views(s.ast))
-    k.apply_opts(hand_coded_optimizations(k))
-    assert len(k.bufs) == 6 # make sure all ops are done in one kernel
-    # masked upcast should upcast masked axis of size 7
-    # masked upcast should not upcast large (20) last axis
-    # float4/other hcopt shouldn't upcast last axis, since we already have 7 upcast, and the last axis is not very contiguous
-    assert k.upcasted == 1 and k.full_shape[-1] == 7
-
-  @unittest.skipIf(Device.DEFAULT in {"METAL", "WEBGPU"}, "METAL/WEBGPU split this kernel since it has 37 buffers")
-  def test_masked_upcast_wino(self):
-    monster = Tensor.stack(*[Tensor.stack(*[Tensor.empty(16) for _ in range(6)]) for _ in range(6)])
-
-    s = monster.schedule()[-1]
-    k = Kernel(push_views(s.ast))
-    k.apply_opts(hand_coded_optimizations(k))
-    assert len(k.bufs) == 37 # make sure all ops are done in one kernel
-    # should upcast the two Tensor.stacks
-    assert k.upcasted >= 2 and k.full_shape[k.shape_len-k.upcasted:k.shape_len].count(6) == 2
-
-  def test_masked_upcast_wino_full(self):
-    with Context(WINO=1):
-      x,w = Tensor.rand(1,4,8,8, requires_grad=True).realize(), Tensor.rand(4,4,3,3, requires_grad=True).realize()
-      out = Tensor.conv2d(x,w, padding=1)
-      out.mean().backward()
-
-      upcasts = []
-      wino_schedule = out.schedule()
-      # collect upcasts of tile transform kernels
-      for i, si in enumerate(wino_schedule):
-        k = Kernel(push_views(si.ast))
-        k.apply_opts(hand_coded_optimizations(k))
-        if k.reduceop is not None: continue # not a tile transform kernel (there is a gemm reduce kernel)
-        if len(k.bufs) < 22: continue # not a tile transform kernel (there's a permute kernel at the end)
-        upcasts.append(tuple(k.full_shape[k.shape_len - k.upcasted:k.shape_len]))
-      assert len(upcasts) == 3 # 3 transformation matrices
-      assert len(wino_schedule) <= 4 # 4 kernels
-      # this test case's inputs are too small, so one of the 4-stacks became a local, which is fine i guess
-      assert upcasts.count((6, 6)) == 2 #and upcasts.count((4, 4)) == 1
-
-      backward_schedule = Tensor.schedule(x.grad, w.grad)
-      for si in backward_schedule:
-        k = Kernel(push_views(si.ast))
-        k.apply_opts(hand_coded_optimizations(k))
-        if len(k.bufs) < 20: continue # not a tile transform kernel
-        # heuristic number to make sure that at least some upcasts but not too many upcasts are being done
-        assert 6 <= prod(k.full_shape[k.shape_len - k.upcasted:k.shape_len]) <= 216
-      assert len(backward_schedule) <= 13 # just the current number, but it could be better
-
-  def test_masked_upcast_many(self):
-    layer_1 = Tensor.cat(Tensor.rand(3, 4), Tensor.rand(4, 4))
-    layer_2 = Tensor.cat(layer_1.unsqueeze(0), Tensor.rand(6, 7, 4))
-    layer_3 = Tensor.cat(layer_2.unsqueeze(0), Tensor.rand(6, 7, 7, 4))
-
-    k = helper_linearizer_opt(layer_3)[-1]
-    assert len(k.bufs) == 5 # make sure all ops are done in one kernel
-    # check that we don't do too many upcasts
-    assert prod(k.full_shape[k.shape_len-k.upcasted:k.shape_len]) <= 49
-
-  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
-  def test_matvec(self):
-    N = 128
-    a = Tensor.rand(1, N).realize()
-    b = Tensor.rand(N, N).realize()
-    c = a @ b
-
-    k = helper_linearizer_opt(c)[-1]
-
-    assert k.group_for_reduces == 1
-    assert k.axis_types.count(AxisType.LOCAL) == 1
-    assert k.upcasted == 1
-
 # *** helpers ***
 
 def helper_linearizer_ast(ast:UOp, inputs:list[Tensor], *args, **kwargs):
   assert isinstance(ast, UOp), "ast must be UOp"