From dffde3de5aa91aefcca87a9f8094f0ed0fa1e2fb Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sun, 26 Jun 2022 17:59:25 -0700 Subject: [PATCH] support both asymmetric and negative padding --- test/test_ops.py | 21 +++++++++++++++++++++ tinygrad/helpers.py | 11 ++++++----- tinygrad/llops/ops_cpu.py | 3 ++- tinygrad/llops/ops_gpu.py | 2 +- tinygrad/llops/ops_torch.py | 6 ++++-- tinygrad/mlops.py | 6 +++--- 6 files changed, 37 insertions(+), 12 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 615594c00a..3c9e9a410f 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -266,6 +266,27 @@ class TestOps(unittest.TestCase): lambda x,w: torch.nn.functional.conv2d(x,w,stride=stride).relu(), lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), atol=1e-4) + def test_negative_padding_conv2d(self): + n,k = 10, 3 + helper_test_op([(1,1,n,n), (1,1,k,k)], + lambda x,w: torch.nn.functional.conv2d(x[:, :, 1:-1, 1:-1],w).relu(), + lambda x,w: Tensor.conv2d(x,w,padding=-1).relu(), atol=1e-4) + helper_test_op([(1,1,n,n), (1,1,k,k)], + lambda x,w: torch.nn.functional.conv2d(x[:, :, 1:, 1:],w).relu(), + lambda x,w: Tensor.conv2d(x,w,padding=(-1,0,-1,0)).relu(), atol=1e-4) + + def test_asymmetric_padding_conv2d(self): + for p in [(0,1,0,1), (2,1,2,1), (2,0,2,1)]: + with self.subTest(padding := p): + for n in [3,4]: + for k in [2]: + helper_test_op([(1,1,n,n), (1,1,k,k)], + lambda x,w: torch.nn.functional.conv2d(torch.nn.functional.pad(x, p),w).relu(), + lambda x,w: Tensor.conv2d(x,w,padding=p).relu(), atol=1e-4) + helper_test_op([(1,1,n,n), (1,1,k,k)], + lambda x,w: torch.nn.functional.conv2d(torch.nn.functional.pad(x, p),w).relu(), + lambda x,w: Tensor.conv2d(x,w,padding=p).relu(), atol=1e-4) + def test_padded_conv2d(self): bs = 4 cin = 3 diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index c025f208a9..df2bc65dda 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -5,18 +5,19 @@ def prod(x): return math.prod(x) def reduce_shape(shape, axis): return [1 if i in axis else shape[i] for i in range(len(shape))] -ConvArgs = namedtuple('ConvArgs', ['H', 'W', 'groups', 'rcout', 'cin', 'oy', 'ox', 'iy', 'ix', 'ys', 'xs', 'bs', 'cout', 'py', 'px', 'dy', 'dx', 'out_shape']) +ConvArgs = namedtuple('ConvArgs', ['H', 'W', 'groups', 'rcout', 'cin', 'oy', 'ox', 'iy', 'ix', 'ys', 'xs', 'bs', 'cout', 'py', 'py_', 'px', 'px_', 'dy', 'dx', 'out_shape']) def get_conv_args(x_shape, w_shape, stride=1, groups=1, padding=0, dilation=1): # TODO: https://docs.nvidia.com/deeplearning/performance/dl-performance-convolutional/index.html#tensor-layout cout,cin,H,W = w_shape ys,xs = (stride, stride) if isinstance(stride, int) else stride - py,px = (padding, padding) if isinstance(padding, int) else padding + if not isinstance(padding, int) and len(padding) == 4: px,px_,py,py_ = padding + else: py,px = (padding, padding) if isinstance(padding, int) else padding; py_, px_ = py, px dy,dx = (dilation, dilation) if isinstance(dilation, int) else dilation bs,cin_,iy,ix = x_shape # TODO: should be easy to support asymmetric padding by changing output size # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html describes these sizes well - oy = (iy + 2*py - dy * (H-1) - 1)//ys + 1 - ox = (ix + 2*px - dx * (W-1) - 1)//xs + 1 + oy = (iy + py + py_ - dy * (H-1) - 1)//ys + 1 + ox = (ix + px + px_ - dx * (W-1) - 1)//xs + 1 if cin*groups != cin_: raise Exception(f"Input Tensor shape {x_shape} does not match the shape of the weights {w_shape}. ({cin*groups} vs. {cin_})") assert cout % groups == 0 - return ConvArgs(H, W, groups, cout//groups, cin, oy, ox, iy, ix, ys, xs, bs, cout, py, px, dy, dx, (bs, cout, oy, ox)) + return ConvArgs(H, W, groups, cout//groups, cin, oy, ox, iy, ix, ys, xs, bs, cout, py, py_, px, px_, dy, dx, (bs, cout, oy, ox)) diff --git a/tinygrad/llops/ops_cpu.py b/tinygrad/llops/ops_cpu.py index e807a4c750..f2d83f5743 100644 --- a/tinygrad/llops/ops_cpu.py +++ b/tinygrad/llops/ops_cpu.py @@ -61,7 +61,8 @@ class CPUBuffer(np.ndarray): def processing_op(x,op,w,C): assert op == ProcessingOps.CONV, f"{op} isn't supported" - if C.px > 0 or C.py > 0: x = np.pad(x, [(0,0), (0,0), (C.py, C.py), (C.px, C.px)]) + if C.px != 0 or C.py != 0 or C.px_ != 0 or C.py_ != 0: + x = x.movement_op(MovementOps.SLICE, ((0, x.shape[0]), (0, x.shape[1]), (-C.py, x.shape[2]+C.py_), (-C.px, x.shape[3]+C.px_))) gx = x.reshape(C.bs,C.groups,C.cin,x.shape[2],x.shape[3]) tx = np.lib.stride_tricks.as_strided(gx, shape=(C.bs, C.groups, C.cin, C.oy, C.ox, C.H, C.W), diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py index 7c4f1b0a29..27e6b8588f 100644 --- a/tinygrad/llops/ops_gpu.py +++ b/tinygrad/llops/ops_gpu.py @@ -126,7 +126,7 @@ class GPUBuffer: if C is not None: ints = ''.join(f"int {x} = {getattr(C, x)};" for x in ["H", "W", "ys", "xs", "dx", "dy", "px", "py", "groups", "rcout", "cin"]) params = [(f"int {x}", getattr(C, x)) for x in ["oy", "ox", "iy", "ix"]] - if C.px == 0 and C.py == 0: options.append("-DALLVALID") + if C.px == 0 and C.py == 0 and C.px_ == 0 and C.py_ == 0: options.append("-DALLVALID") if C.oy == 1 and C.ox == 1: options.append("-DONEBYONE") global_size = [C.bs*C.cout, C.oy, C.ox] assert bufs[0][0] == "input" and bufs[1][0] == "weight" diff --git a/tinygrad/llops/ops_torch.py b/tinygrad/llops/ops_torch.py index 422e31f55b..21571cc3ce 100644 --- a/tinygrad/llops/ops_torch.py +++ b/tinygrad/llops/ops_torch.py @@ -1,7 +1,7 @@ import torch import numpy as np from tinygrad.llops.ops_cpu import CPUBuffer -from tinygrad.ops import ProcessingOps +from tinygrad.ops import MovementOps, ProcessingOps device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") class TorchBuffer(torch.Tensor): @@ -23,4 +23,6 @@ class TorchBuffer(torch.Tensor): def processing_op(x,op,w,C): assert op == ProcessingOps.CONV, f"{op} isn't supported" - return torch.conv2d(x, w, stride=(C.ys, C.xs), groups=C.groups, dilation=(C.dy, C.dx), padding=(C.py, C.px)) + if C.px != C.px_ or C.py != C.py_: padding, x = 0, x.movement_op(MovementOps.SLICE, ((0, x.shape[0]), (0, x.shape[1]), (-C.py, x.shape[2]+C.py_), (-C.px, x.shape[3]+C.px_))) + else: padding = (C.py, C.px) + return torch.conv2d(x, w, stride=(C.ys, C.xs), groups=C.groups, dilation=(C.dy, C.dx), padding=padding) diff --git a/tinygrad/mlops.py b/tinygrad/mlops.py index aa077a042e..bc2b12fb49 100644 --- a/tinygrad/mlops.py +++ b/tinygrad/mlops.py @@ -203,8 +203,8 @@ class Conv2D(Function): wt = ctx.movement_op(MovementOps.FLIP, wt, (3, 4)) wt = ctx.movement_op(MovementOps.PERMUTE, wt, (0, 2, 1, 3, 4)) wt = ctx.movement_op(MovementOps.RESHAPE, wt, (C.groups*C.cin, C.rcout, C.H, C.W)) - Cdx = get_conv_args(xt.shape, wt.shape, dilation=(C.dy, C.dx), padding=((C.H-1)*C.dy-C.py,(C.W-1)*C.dx-C.px), groups=C.groups) - # TODO: this shape can be wrong. support asymmetric padding to remove the slice + Cdx = get_conv_args(xt.shape, wt.shape, dilation=(C.dy, C.dx), padding=((C.W-1)*C.dx-C.px, (C.W-1)*C.dx-C.px_, (C.H-1)*C.dy-C.py, (C.H-1)*C.dy-C.py_), groups=C.groups) + # TODO: this shape can be wrong strided. support asymmetric padding to remove the slice dx = ctx._conv(xt, wt, Cdx) dx = ctx.movement_op(MovementOps.SLICE, dx, [(0,s) for s in x.shape]) @@ -215,7 +215,7 @@ class Conv2D(Function): xdw = ctx.movement_op(MovementOps.RESHAPE, xdw, (C.cin, C.groups*C.bs, C.iy, C.ix)) grad_output_dw = ctx.movement_op(MovementOps.PERMUTE, grad_output, (1,0,2,3)) grad_output_dw = ctx.movement_op(MovementOps.RESHAPE, grad_output_dw, (C.cout, C.bs, C.oy, C.ox)) - Cdw = get_conv_args(xdw.shape, grad_output_dw.shape, padding=(C.py, C.px), stride=(C.dy, C.dx), dilation=(C.ys, C.xs), groups=C.groups) + Cdw = get_conv_args(xdw.shape, grad_output_dw.shape, padding=(C.px, C.px_, C.py, C.py_), stride=(C.dy, C.dx), dilation=(C.ys, C.xs), groups=C.groups) grad_weight = ctx._conv(xdw, grad_output_dw, Cdw) grad_weight = ctx.movement_op(MovementOps.PERMUTE, grad_weight, (1,0,2,3)) # TODO: remove this slice using asymmetric padding