From 381f3e92da81c54ea2f2e208df249c9d741b30e1 Mon Sep 17 00:00:00 2001
From: George Hotz
Date: Sat, 28 Jan 2023 14:10:27 -0800
Subject: [PATCH] fix prints, add third conv

---
 test/external_test_gpu_ast.py | 10 ++++++++++
 tinygrad/llops/ops_gpu.py     |  4 ++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/test/external_test_gpu_ast.py b/test/external_test_gpu_ast.py
index 6d3c52d44a..2ca4fdd9fa 100644
--- a/test/external_test_gpu_ast.py
+++ b/test/external_test_gpu_ast.py
@@ -55,6 +55,16 @@ class TestAST(unittest.TestCase):
     ast = LazyOp(MovementOps.RESHAPE, (op8,), (64, 1024, 4))
     compile_and_test_ast(ast)
 
+  def test_third_op_conv(self):
+    buf0 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 4096, 32, 0, 0, 0, 0, 4, 1), 0)]), hostbuf=GPUBuffer(shape=(64, 1024, 4), force_create=True))
+    buf1 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 8, 4), views=[View((1, 64, 128, 4, 4, 1, 1, 8, 4), (0, 0, 0, 128, 4, 0, 0, 16, 1), 0)]), hostbuf=GPUBuffer(shape=(4, 32, 4), force_create=True))
+    op0 = LazyOp(BinaryOps.MUL, (buf0,buf1,), None)
+    op1 = LazyOp(ReduceOps.SUM, (op0,), (1, 64, 128, 4, 4, 1, 1, 1, 1))
+    buf2 = GPUBuffer(shape=ShapeTracker(shape=(1, 64, 128, 4, 4, 1, 1, 1, 1), views=[View((1, 64, 128, 4, 4, 1, 1, 1, 1), (0, 0, 0, 4, 1, 1, 1, 1, 1), 0)]), hostbuf=GPUBuffer(shape=(16,), force_create=True))
+    op2 = LazyOp(BinaryOps.ADD, (op1,buf2,), None)
+    ast = LazyOp(MovementOps.RESHAPE, (op2,), (64, 512, 4))
+    compile_and_test_ast(ast)
+
   # VALIDHACKS=1 IMAGE=2 DEBUG=4 PYTHONPATH="." GPU=1 OPT=2 python3 test/external_test_gpu_ast.py TestAST.test_reduce_op
   # 164 time 27.75 ms running re_S128_4 with [128] None count 4 runtime 1016.06 us 2.07 GFLOPS () -> (128, 1)
   # 169 time 22.51 ms running matmul with [4, 16, 128] [4, 16, 16] count 5 runtime 110.08 us 19.06 GFLOPS ('-DMATMUL',) -> (128, 1)
diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py
index 71e15bf19b..78364d2742 100644
--- a/tinygrad/llops/ops_gpu.py
+++ b/tinygrad/llops/ops_gpu.py
@@ -193,7 +193,7 @@ class CLASTKernel(ASTKernel):
     if self.first_reduce == 2 and isinstance(self.bufs[0]._buf, CLImage):
       base_shape = self.bufs[0]._base_shape
       if all([(base_shape[0]*base_shape[1])%st.shape[0] == 0 and st.shape[0]//base_shape[0] != 0 for st in self.sts]):
-        if DEBUG >= 3: print("split opencl", base_shape, self.shapes[0])
+        if DEBUG >= 3: print("split opencl", base_shape, self.sts[0].shape)
         self.reshape_and_permute(lambda x: [base_shape[0], x[0]//base_shape[0]]+list(x[1:]), None)
         self.simplify_ones()
 
@@ -305,7 +305,7 @@ class CLASTKernel(ASTKernel):
   def print(self):
     super().print()
     for i in range(len(self.bufs)):
-      print(self.buftokens[i], self.bufs[i] in self.earlybufs, self.shapes[i], self.strides[i])
+      print(self.buftokens[i], self.bufs[i] in self.earlybufs, self.sts[i])
     print(self.fxn.prg)
 
 class GPUBuffer(ExplicitExecAST):