mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-07 03:00:26 -04:00
wmma: add reduce axis choice to TC action space (#4328)
* wmma: add reduce axis choice to TC action space * add test for TC multi-reduce axis choice
This commit is contained in:
@@ -15,7 +15,15 @@ from tinygrad.helpers import prod, Context, getenv
|
||||
from tinygrad.dtype import DType, dtypes
|
||||
from tinygrad.codegen.uops import UOpGraph
|
||||
|
||||
def helper_tc_allclose(m:int, k:int, n:int, dtype_in:DType, dtype_out:DType, tc_opt:int):
|
||||
def helper_realized_ast(r:Tensor):
|
||||
s = create_schedule([r.lazydata])
|
||||
run_schedule(s[:-1]) # run all kernels except the last one
|
||||
# now all input LazyBuffers buffers in s[-1] should be realized
|
||||
# allocate an output buffer
|
||||
output_buffer = Buffer((out:=s[-1].outputs[0]).device, out.size, out.dtype).allocate()
|
||||
return s[-1].ast[0], [output_buffer] + list(s[-1].inputs)
|
||||
|
||||
def helper_tc_allclose(m:int, k:int, n:int, dtype_in:DType, dtype_out:DType, axis:int=0, tc_opt:int=0):
|
||||
a, b = Tensor.rand(m, k, dtype=dtype_in), Tensor.rand(k, n, dtype=dtype_in)
|
||||
np_a, np_b = a.numpy(), b.numpy()
|
||||
r = a.matmul(b, acc_dtype=dtype_out)
|
||||
@@ -24,7 +32,7 @@ def helper_tc_allclose(m:int, k:int, n:int, dtype_in:DType, dtype_out:DType, tc_
|
||||
run_schedule(sched)
|
||||
out = r.numpy()
|
||||
k = Linearizer(realized_ast)
|
||||
k.apply_tensor_cores(1, tc_opt=tc_opt)
|
||||
k.apply_tensor_cores(1, axis=axis, tc_opt=tc_opt)
|
||||
k.linearize()
|
||||
assert len([uop for uop in k.uops if uop.uop is UOps.WMMA]) > 0, "tensor core not triggered"
|
||||
assert len([x for x in k.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
|
||||
@@ -34,13 +42,13 @@ def helper_tc_allclose(m:int, k:int, n:int, dtype_in:DType, dtype_out:DType, tc_
|
||||
else: tc_atol, tc_rtol = 5e-3, 1e-4
|
||||
np.testing.assert_allclose(np_c, out, atol=tc_atol, rtol=tc_rtol)
|
||||
|
||||
def helper_tc_ensure_uops_and_opts_count(m:int, k:int, n:int, dtype_in:DType, dtype_out:DType, tc_opt:int, ensure_triggered:bool=True):
|
||||
def helper_tc_ensure_uops_and_opts_count(m:int, k:int, n:int, dtype_in:DType, dtype_out:DType, axis:int=0, tc_opt:int=0, ensure_triggered:bool=True):
|
||||
a, b = Tensor.rand(m, k, dtype=dtype_in), Tensor.rand(k, n, dtype=dtype_in)
|
||||
r = a.matmul(b, acc_dtype=dtype_out)
|
||||
sched = create_schedule([r.lazydata])
|
||||
realized_ast = sched[-1].ast[0]
|
||||
k = Linearizer(realized_ast)
|
||||
k.apply_tensor_cores(1, tc_opt=tc_opt)
|
||||
k.apply_tensor_cores(1, axis=axis, tc_opt=tc_opt)
|
||||
k.linearize()
|
||||
wmmas = len([uop for uop in k.uops if uop.uop is UOps.WMMA])
|
||||
tcs = len([x for x in k.applied_opts if x.op is OptOps.TC])
|
||||
@@ -214,7 +222,7 @@ class TestLinearizer(unittest.TestCase):
|
||||
self.skipTest("device doesn't have tensor cores")
|
||||
for tc in tensor_cores[Device[Device.DEFAULT].compiler.compiler_opts.device]:
|
||||
if getenv("EMULATE_CUDA") and (tc.dtype_in == dtypes.bfloat16 or tc.dtype_out == dtypes.bfloat16): continue
|
||||
helper_tc_allclose(tc.dims[1], tc.dims[2], tc.dims[0], tc.dtype_in, tc.dtype_out, tc_opt=0)
|
||||
helper_tc_allclose(tc.dims[1], tc.dims[2], tc.dims[0], tc.dtype_in, tc.dtype_out, axis=0, tc_opt=0)
|
||||
|
||||
def test_tensor_cores_padded(self):
|
||||
if not Device[Device.DEFAULT].compiler.compiler_opts.has_tensor_cores:
|
||||
@@ -224,7 +232,8 @@ class TestLinearizer(unittest.TestCase):
|
||||
pad = 1
|
||||
|
||||
# check that TC is triggered for TC_OPT=2
|
||||
helper_tc_ensure_uops_and_opts_count(tc.dims[0]+pad, tc.dims[2]+pad, tc.dims[1]+pad,tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=True)
|
||||
helper_tc_ensure_uops_and_opts_count(tc.dims[0]+pad, tc.dims[2]+pad, tc.dims[1]+pad,
|
||||
tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=True)
|
||||
|
||||
# check that TC is not triggered for TC_OPT<2
|
||||
helper_tc_ensure_uops_and_opts_count(tc.dims[0]+pad, tc.dims[2]+pad, tc.dims[1]+pad,
|
||||
@@ -240,6 +249,33 @@ class TestLinearizer(unittest.TestCase):
|
||||
# check correctness
|
||||
helper_tc_allclose(tc.dims[1]+pad, tc.dims[2]+pad, tc.dims[0]+pad, tc.dtype_in, tc.dtype_out, tc_opt=2)
|
||||
|
||||
def test_tensor_cores_multi_reduce(self):
|
||||
if not Device[Device.DEFAULT].compiler.compiler_opts.has_tensor_cores:
|
||||
self.skipTest("device doesn't have tensor cores")
|
||||
for tc in tensor_cores[Device[Device.DEFAULT].compiler.compiler_opts.device]:
|
||||
if getenv("EMULATE_CUDA") and (tc.dtype_in == dtypes.bfloat16 or tc.dtype_out == dtypes.bfloat16): continue
|
||||
# this will be a M=G16, N=G32, M=G16, M=G16, K=R16, K=R16, K=R16 with 9 choices of TC MNK axes
|
||||
golden_result = None
|
||||
for axis in range(9):
|
||||
a = Tensor.rand(16, 16, 29, 29, dtype=tc.dtype_in).realize()
|
||||
b = Tensor.rand(32, 16, 16, 16, dtype=tc.dtype_in).realize()
|
||||
c = a.conv2d(b, padding=1, acc_dtype=tc.dtype_out)
|
||||
realized_ast, real_bufs = helper_realized_ast(c)
|
||||
|
||||
k = Linearizer(realized_ast)
|
||||
k.apply_tensor_cores(1, axis=axis, tc_opt=2)
|
||||
k.linearize()
|
||||
assert len([uop for uop in k.uops if uop.uop is UOps.WMMA]) > 0, "tensor core not triggered"
|
||||
assert len([x for x in k.applied_opts if x.op is OptOps.TC]) == 1, "tensor core opt not included"
|
||||
|
||||
prg = Device[Device.DEFAULT].to_program(k)
|
||||
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
|
||||
prg.exec(real_bufs)
|
||||
result = np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np)
|
||||
|
||||
if golden_result is None: golden_result = np.frombuffer(real_bufs[0].as_buffer(), real_bufs[0].dtype.np)
|
||||
np.testing.assert_allclose(result, golden_result, atol=0.1, rtol=0.15)
|
||||
|
||||
def test_limit_dims_to_max_5d_global(self):
|
||||
t = Tensor.empty(3, 4, 5, 6, 7).pad(((1, 1), (1, 1), (1, 1), (1, 1), (1, 1))) + 1
|
||||
sched = [si for si in create_schedule([t.lazydata]) if si.ast[0].op not in LoadOps]
|
||||
@@ -315,14 +351,6 @@ class TestLinearizer(unittest.TestCase):
|
||||
helper(Tensor.arange(256), max_ops=2)
|
||||
helper(Tensor.arange(255), max_ops=0)
|
||||
|
||||
def helper_realized_ast(r:Tensor):
|
||||
s = create_schedule([r.lazydata])
|
||||
run_schedule(s[:-1]) # run all kernels except the last one
|
||||
# now all input LazyBuffers buffers in s[-1] should be realized
|
||||
# allocate an output buffer
|
||||
output_buffer = Buffer((out:=s[-1].outputs[0]).device, out.size, out.dtype).allocate()
|
||||
return s[-1].ast[0], [output_buffer] + list(s[-1].inputs)
|
||||
|
||||
@unittest.skipUnless(Device[Device.DEFAULT].compiler.compiler_opts.supports_float4, "need backends that support float4")
|
||||
class TestFloat4(unittest.TestCase):
|
||||
@staticmethod
|
||||
@@ -554,7 +582,7 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False, atol=1e-4, rtol=1e-
|
||||
def check_opt(opts, create_k, to_prg, expected_color_size):
|
||||
k = create_k()
|
||||
if apply_tc:
|
||||
assert k.apply_tensor_cores(1, opts), "no tensor core triggered"
|
||||
assert k.apply_tensor_cores(1, extra_opts=opts), "no tensor core triggered"
|
||||
else:
|
||||
for opt in opts:
|
||||
k.apply_opt(opt)
|
||||
|
||||
Reference in New Issue
Block a user