From c50e374bb62cba4985ecb4d3e214cec28cd95f3a Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Fri, 26 Jul 2024 15:10:10 -0700 Subject: [PATCH] multiple locals + get_kernel_modifier + fix valid (#5739) * multiple locals + get_kernel_modifier + fix valid * fix test pattern matcher --- test/test_pattern_matcher.py | 5 +++++ tinygrad/codegen/lowerer.py | 7 ++++--- tinygrad/codegen/uopgraph.py | 5 ++++- tinygrad/ops.py | 2 +- tinygrad/renderer/cstyle.py | 5 +++++ 5 files changed, 19 insertions(+), 5 deletions(-) diff --git a/test/test_pattern_matcher.py b/test/test_pattern_matcher.py index 53b4ce82b9..216cf38c4e 100644 --- a/test/test_pattern_matcher.py +++ b/test/test_pattern_matcher.py @@ -166,6 +166,11 @@ class TestPatternMatcher(TestUOps): dtypes._float2 = dtypes.float.vec(2) dtypes._float4 = dtypes.float.vec(4) dtypes._float8 = dtypes.float.vec(8) + dtypes._float16 = dtypes.float.vec(16) + dtypes._half2 = dtypes.half.vec(2) + dtypes._half4 = dtypes.half.vec(4) + dtypes._half8 = dtypes.half.vec(8) + dtypes._half16 = dtypes.half.vec(16) upat = UPat(UOps.CONST, name="x", dtype=dtypes.float) assert str(upat) == str(eval(str(upat))) evpat:UPat = eval(repr(UPat(src = [UPat(name='a'), UPat(name='b')]))) diff --git a/tinygrad/codegen/lowerer.py b/tinygrad/codegen/lowerer.py index c454af7c6d..d5671f0cee 100644 --- a/tinygrad/codegen/lowerer.py +++ b/tinygrad/codegen/lowerer.py @@ -149,8 +149,9 @@ class IndependentLowerer: if x.op is BufferOps.CONST: dtype = x.arg.dtype.base if isinstance(x.arg.dtype, ImageDType) else x.arg.dtype return valid.alu(TernaryOps.WHERE, UOp.const(dtype, x.arg.val), UOp.const(dtype, 0)) - if x.arg.idx == -1: - buf = UOp(UOps.DEFINE_LOCAL, PtrDType(x.arg.dtype.base if isinstance(x.arg.dtype, ImageDType) else x.arg.dtype), (), ("temp", x.arg.st.size)) + if x.arg.idx < 0: + buf = UOp(UOps.DEFINE_LOCAL, PtrDType(x.arg.dtype.base if isinstance(x.arg.dtype, ImageDType) else x.arg.dtype), + arg=(f"temp{-x.arg.idx}", x.arg.st.real_size())) else: buf = UOp(UOps.DEFINE_GLOBAL, x.arg.dtype if isinstance(x.arg.dtype, ImageDType) else PtrDType(x.arg.dtype), (), (x.arg.idx, x.arg.idx < self.output_count)) @@ -159,9 +160,9 @@ class IndependentLowerer: return UOp(UOps.LOAD, x.arg.dtype.scalar(), (buf, idx) + ((valid, UOp.const(x.arg.dtype.scalar(), 0)) if has_valid else ()) + barrier) # NOTE: only store the local reduceop in the first thread if x.arg.idx != -1: - has_valid = True for oidx, ridx in zip(self.idxs, self.ridxs): if oidx != ridx: valid = valid * oidx.eq(0) + has_valid = valid.op is not UOps.CONST or valid.arg is not True return UOp(UOps.STORE, None, (buf, idx, self.to_uop(x.src[0])) + ((valid,) if has_valid else ())) in_uops = tuple(self.to_uop(y) for y in x.src) diff --git a/tinygrad/codegen/uopgraph.py b/tinygrad/codegen/uopgraph.py index b061950305..8095a23918 100644 --- a/tinygrad/codegen/uopgraph.py +++ b/tinygrad/codegen/uopgraph.py @@ -154,8 +154,11 @@ constant_folder = PatternMatcher([ lambda x: UOp(x.op, dtypes.int32, x.src, x.arg)), # VECTORIZE/GEP (NOp(UOps.GEP, src=(NOp(UOps.VECTORIZE).name("cast"),)).name("gep"), lambda gep, cast: cast.src[gep.arg]), + # NOTE: this has to be two rules since the dtypes must be the same *[(NOp(UOps.VECTORIZE, dtypes.float.vec(i), tuple(NOp(UOps.GEP, dtypes.float, - src=(NOp.var('x', dtype=dtypes.float.vec(i)),), arg=j) for j in range(i))), lambda x: x) for i in [2, 4, 8]], + src=(NOp.var('x', dtype=dtypes.float.vec(i)),), arg=j) for j in range(i))), lambda x: x) for i in [2, 4, 8, 16]], + *[(NOp(UOps.VECTORIZE, dtypes.half.vec(i), tuple(NOp(UOps.GEP, dtypes.half, + src=(NOp.var('x', dtype=dtypes.half.vec(i)),), arg=j) for j in range(i))), lambda x: x) for i in [2, 4, 8, 16]], # tensor core with a 0 input is acc (NOp(UOps.WMMA, src=(NOp.const(None, 0.0), NOp.var(), NOp.var('acc'))), lambda acc: acc), (NOp(UOps.WMMA, src=(NOp.var(), NOp.const(None, 0.0), NOp.var('acc'))), lambda acc: acc), diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 8d5014945f..ba81402b77 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -147,7 +147,7 @@ def verify_lazyop(ast:LazyOp) -> Dict[LazyOp, ShapeTracker]: def assert_valid(op:LazyOp, st:ShapeTracker): if op in sts: return # restore globals from the two stage reduce - if op.op is BufferOps.LOAD and op.arg.idx == -1: + if op.op is BufferOps.LOAD and op.arg.idx < 0: assert_valid(local_reduce:=op.src[0].src[0], op.arg.st) return sts.setdefault(op, sts[local_reduce]) for x in op.src: assert_valid(x, st) diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index ad36e2951b..8cfb4d7386 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -300,6 +300,11 @@ return c;}}""") return super().render_kernel(function_name, kernel, bufs, uops, prefix=prefix) + def get_kernel_modifier(self, uops:UOpGraph) -> str: + maxThreadsPerBlock = prod(u.arg[1] for u in uops if u.op is UOps.SPECIAL and u.arg[0][0] == "l") + # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html + return f"__launch_bounds__({maxThreadsPerBlock}) " + code_for_op_hip = { UnaryOps.SQRT: lambda x,dtype: f"__ocml_sqrt_f{ {dtypes.half:16, dtypes.double:64}.get(dtype, 32)}({x})", UnaryOps.SIN: lambda x,dtype: f"__ocml_sin_f{ {dtypes.half:16, dtypes.double:64}.get(dtype, 32)}({x})", UnaryOps.LOG2: lambda x,dtype: f"__ocml_log2_f{ {dtypes.half:16, dtypes.double:64}.get(dtype, 32)}({x})",