diff --git a/test/test_ops.py b/test/test_ops.py
index 8b2df0ef94..cf17732a2a 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -37,30 +37,22 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn, atol=1e-7, grad_atol=1e-7, gpu
   print("testing %30r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms" % (shps, torch_fp, tinygrad_fp, torch_fbp-torch_fp, tinygrad_fbp-tinygrad_fp))
 
 class TestOps(unittest.TestCase):
+  gpu = False
   def test_add(self):
-    helper_test_op([(45,65), (45,65)], lambda x,y: x+y, Tensor.add)
-  @unittest.skipUnless(GPU, "Requires GPU")
-  def test_add_gpu(self):
-    helper_test_op([(45,65), (45,65)], lambda x,y: x+y, Tensor.add, gpu=True)
+    helper_test_op([(45,65), (45,65)], lambda x,y: x+y, Tensor.add, gpu=self.gpu)
   def test_sub(self):
-    helper_test_op([(45,65), (45,65)], lambda x,y: x-y, Tensor.sub)
+    helper_test_op([(45,65), (45,65)], lambda x,y: x-y, Tensor.sub, gpu=self.gpu)
   def test_mul(self):
-    helper_test_op([(45,65), (45,65)], lambda x,y: x*y, Tensor.mul)
-  @unittest.skipUnless(GPU, "Requires GPU")
-  def test_mul_gpu(self):
-    helper_test_op([(45,65), (45,65)], lambda x,y: x*y, Tensor.mul, gpu=True)
+    helper_test_op([(45,65), (45,65)], lambda x,y: x*y, Tensor.mul, gpu=self.gpu)
   def test_div(self):
     # TODO: why does this need more tolerance?
-    helper_test_op([(45,65), (45,65)], lambda x,y: x/y, Tensor.div, atol=1e-3, grad_atol=1e-3)
+    helper_test_op([(45,65), (45,65)], lambda x,y: x/y, Tensor.div, atol=1e-3, grad_atol=1e-3, gpu=self.gpu)
   def test_pow(self):
-    helper_test_op([(45,65), (45,65)], lambda x,y: x**y, Tensor.pow)
+    helper_test_op([(45,65), (45,65)], lambda x,y: x**y, Tensor.pow, gpu=self.gpu)
   def test_sqrt(self):
-    helper_test_op([(45,65)], lambda x: x.sqrt(), Tensor.sqrt)
+    helper_test_op([(45,65)], lambda x: x.sqrt(), Tensor.sqrt, gpu=self.gpu)
   def test_dot(self):
-    helper_test_op([(45,65), (65,100)], lambda x,y: x.matmul(y), Tensor.dot, atol=1e-5)
-  @unittest.skipUnless(GPU, "Requires GPU")
-  def test_dot_gpu(self):
-    helper_test_op([(3,4), (4,5)], lambda x,y: x.matmul(y), Tensor.dot, atol=1e-5, gpu=True)
+    helper_test_op([(45,65), (65,100)], lambda x,y: x.matmul(y), Tensor.dot, atol=1e-5, gpu=self.gpu)
 
   def test_conv2d(self):
     for bs in [1,8]:
@@ -70,7 +62,7 @@ class TestOps(unittest.TestCase):
             for W in [2,3,5]:
               helper_test_op([(bs,cin,11,28), (6,cin//groups,H,W)],
                 lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(),
-                lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), atol=2e-5, grad_atol=2e-6)
+                lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu)
 
   def test_strided_conv2d(self):
     bs = 4
@@ -78,22 +70,26 @@ class TestOps(unittest.TestCase):
     H,W = 3,3
     helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
       lambda x,w: torch.nn.functional.conv2d(x,w,stride=2).relu(),
-      lambda x,w: Tensor.conv2d(x,w,stride=2).relu(), atol=2e-5, grad_atol=2e-6)
+      lambda x,w: Tensor.conv2d(x,w,stride=2).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu)
     helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
       lambda x,w: torch.nn.functional.conv2d(x,w,stride=(2,1)).relu(),
-      lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), atol=2e-5, grad_atol=2e-6)
+      lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu)
 
   def test_maxpool2x2(self):
-    helper_test_op([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, (2,2)), Tensor.max_pool2d)
+    helper_test_op([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, (2,2)), Tensor.max_pool2d, gpu=self.gpu)
 
   def test_maxpool_sizes(self):
     for sz in [(2,2), (3,3), (3,2), (5,5), (5,1)]:
       helper_test_op([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=sz),
-        lambda x: Tensor.max_pool2d(x, kernel_size=sz))
+        lambda x: Tensor.max_pool2d(x, kernel_size=sz), gpu=self.gpu)
 
   def test_avgpool2x2(self):
-    helper_test_op([(32,2,111,28)], lambda x: torch.nn.functional.avg_pool2d(x, (2,2)), Tensor.avg_pool2d)
+    helper_test_op([(32,2,111,28)], lambda x: torch.nn.functional.avg_pool2d(x, (2,2)), Tensor.avg_pool2d, gpu=self.gpu)
+
+if GPU:
+  class TestOpsGPU(TestOps):
+    gpu = True
 
 
 if __name__ == '__main__':
   unittest.main(verbosity=2)
diff --git a/tinygrad/opsgpu.py b/tinygrad/opsgpu.py
index 16b9ba3c21..7d26ad5a50 100644
--- a/tinygrad/opsgpu.py
+++ b/tinygrad/opsgpu.py
@@ -16,7 +16,7 @@ def buffer_like(ctx, x):
 def clbuild(cl_ctx, prg):
   return cl.Program(cl_ctx, prg).build()
 
-def in_place_op(ctx, code, x, y):
+def binary_op(ctx, code, x, y):
   ret = buffer_like(ctx, x)
   prg = clbuild(ctx.cl_ctx, """
   __kernel void add(
@@ -29,10 +29,23 @@ def in_place_op(ctx, code, x, y):
   prg.add(ctx.cl_queue, [np.prod(ret.shape)], None, x, y, ret)
   return ret
 
+def unary_op(ctx, code, x):
+  ret = buffer_like(ctx, x)
+  prg = clbuild(ctx.cl_ctx, """
+  __kernel void relu(
+    __global const float *a_g, __global float *res_g)
+  {
+    int gid = get_global_id(0);
+    """+code+"""
+  }
+  """)
+  prg.relu(ctx.cl_queue, [np.prod(ret.shape)], None, x, ret)
+  return ret
+
 class Add(Function):
   @staticmethod
   def forward(ctx, x, y):
-    return in_place_op(ctx, 'res_g[gid] = a_g[gid] + b_g[gid];', x, y)
+    return binary_op(ctx, 'res_g[gid] = a_g[gid] + b_g[gid];', x, y)
 
   @staticmethod
   def backward(ctx, grad_output):
@@ -42,7 +55,7 @@ register('add', Add, gpu=True)
 class Sub(Function):
   @staticmethod
   def forward(ctx, x, y):
-    return in_place_op(ctx, 'res_g[gid] = a_g[gid] - b_g[gid];', x, y)
+    return binary_op(ctx, 'res_g[gid] = a_g[gid] - b_g[gid];', x, y)
 
   @staticmethod
   def backward(ctx, grad_output):
@@ -57,9 +70,9 @@ class Mul(Function):
   @staticmethod
   def forward(ctx, x, y):
     # HACK
     if y.shape == (1,):
-      return in_place_op(ctx, 'res_g[gid] = a_g[gid] * b_g[0];', x, y)
+      return binary_op(ctx, 'res_g[gid] = a_g[gid] * b_g[0];', x, y)
     elif x.shape == y.shape:
-      return in_place_op(ctx, 'res_g[gid] = a_g[gid] * b_g[gid];', x, y)
+      return binary_op(ctx, 'res_g[gid] = a_g[gid] * b_g[gid];', x, y)
     else:
       raise Exception("mismatched shapes %r %r" % (x.shape, y.shape))
@@ -68,8 +81,8 @@ class Mul(Function):
   @staticmethod
   def backward(ctx, grad_output):
     x,y = ctx.saved_tensors
-    return in_place_op(ctx, 'res_g[gid] = a_g[gid] * b_g[gid];', y, grad_output),\
-           in_place_op(ctx, 'res_g[gid] = a_g[gid] * b_g[gid];', x, grad_output)
+    return binary_op(ctx, 'res_g[gid] = a_g[gid] * b_g[gid];', y, grad_output),\
+           binary_op(ctx, 'res_g[gid] = a_g[gid] * b_g[gid];', x, grad_output)
 register('mul', Mul, gpu=True)
 
 class Sum(Function):
@@ -168,17 +181,7 @@ register('matmul', Dot, gpu=True)
 class ReLU(Function):
   @staticmethod
   def forward(ctx, x):
-    ret = buffer_like(ctx, x)
-    prg = clbuild(ctx.cl_ctx, """
-    __kernel void relu(
-      __global const float *a_g, __global float *res_g)
-    {
-      int gid = get_global_id(0);
-      res_g[gid] = max(a_g[gid], (float)0.);
-    }
-    """)
-    prg.relu(ctx.cl_queue, [np.prod(ret.shape)], None, x, ret)
-    return ret
+    return unary_op(ctx, 'res_g[gid] = max(a_g[gid], (float)0.);', x)
 
   @staticmethod
   def backward(ctx, grad_output):
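
The test_ops.py half of this diff replaces the hand-written *_gpu test duplicates with a single gpu class attribute plus a TestOpsGPU subclass that only exists when a GPU is present, so unittest discovery re-runs every inherited test with gpu=True. A minimal standalone sketch of that pattern, using only the standard library; HAS_GPU and run_op are illustrative stand-ins for the diff's GPU flag and helper_test_op:

# Sketch of the subclass-parametrization pattern from the diff above.
# HAS_GPU and run_op are hypothetical stand-ins, not tinygrad names.
import unittest

HAS_GPU = False  # set True where an accelerator is actually available

def run_op(a, b, gpu=False):
  # stand-in for helper_test_op: both backends should produce the same result
  return a + b

class TestOps(unittest.TestCase):
  gpu = False  # class attribute consulted by every test via self.gpu
  def test_add(self):
    self.assertEqual(run_op(2, 3, gpu=self.gpu), 5)

# Defining the subclass only when a GPU exists means CPU-only machines never
# even discover the GPU variants -- no per-test skip decorators needed.
if HAS_GPU:
  class TestOpsGPU(TestOps):
    gpu = True

if __name__ == '__main__':
  unittest.main(verbosity=2)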
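The opsgpu.py half generalizes one OpenCL kernel skeleton into binary_op/unary_op by splicing a one-line expression string into a fixed program. A hedged, self-contained pyopencl sketch of that templating idea (not tinygrad code; buffer handling is simplified and all names here are illustrative):

# Kernel templating: a fixed skeleton plus a spliced-in expression string.
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

def unary_op(code, x):
  # x: float32 numpy array; `code` writes res_g[gid] in terms of a_g[gid]
  a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
  res_g = cl.Buffer(ctx, mf.WRITE_ONLY, x.nbytes)
  prg = cl.Program(ctx, """
  __kernel void unop(__global const float *a_g, __global float *res_g) {
    int gid = get_global_id(0);
    """+code+"""
  }
  """).build()
  prg.unop(queue, (x.size,), None, a_g, res_g)
  ret = np.empty_like(x)
  cl.enqueue_copy(queue, ret, res_g)
  return ret

x = np.array([-1., 2., -3.], dtype=np.float32)
print(unary_op('res_g[gid] = max(a_g[gid], (float)0.);', x))  # ReLU: [0. 2. 0.]

Since ReLU clamps from below, the expression passed for it must be max(a_g[gid], (float)0.); min would instead zero out the positive half.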