diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index 47e224b198..f81e3b1383 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -250,37 +250,26 @@ class LazyBuffer:
     x = self
     if IMAGE >= 1:
+      x = x.movement_op(MovementOps.RESHAPE, (C.bs, C.groups, C.cin, C.iy, C.ix))
       w = w.movement_op(MovementOps.RESHAPE, (C.groups, C.rcout, C.cin, C.H, C.W))
-
-      if C.bs > 1 and C.py > 0:
-        # explicitly add y-padding for batched inputs
-        # N C H W
-        xs = [(0, 0) for _ in x.shape]
-        xs[2] = (C.py, C.py)
-        x = x.movement_op(MovementOps.PAD, xs)
-        C = C._replace(iy=C.iy + C.py*2, py=0)
+      added_output_channels = 0

       # hack for non multiples of 4 on C.cin
       if C.cin % 4 != 0 and not (C.cin == 1 and C.groups%4 == 0):
         to_add = 4 - (C.cin % 4)
         w = w.movement_op(MovementOps.PAD, [(0, to_add) if i == 2 else (0, 0) for i in range(len(w.shape))])
-
-        x = x.movement_op(MovementOps.RESHAPE, (C.bs, C.groups, C.cin, C.iy, C.ix))
         x = x.movement_op(MovementOps.PAD, [(0, to_add) if i == 2 else (0, 0) for i in range(len(x.shape))])
         C = C._replace(cin = C.cin + to_add)
-        x = x.movement_op(MovementOps.RESHAPE, (C.bs, C.groups*C.cin, C.iy, C.ix))

       # hack for non multiples of 4 on C.rcout
       if C.rcout % 4 != 0 and not (C.rcout == 1 and C.groups%4 == 0):
         added_output_channels = 4 - (C.rcout % 4)
         w = w.movement_op(MovementOps.PAD, [(0, added_output_channels) if i == 1 else (0, 0) for i in range(len(w.shape))])
         C = C._replace(rcout = C.rcout + added_output_channels, cout = C.groups * (C.rcout + added_output_channels))
-      else:
-        added_output_channels = 0

       # packed
       assert (C.groups*C.cin) % 4 == 0
-      x = x.movement_op(MovementOps.PERMUTE, (0,2,3,1))
+      x = x.movement_op(MovementOps.PERMUTE, (0,3,4,1,2))
       x = x.movement_op(MovementOps.RESHAPE, (C.bs*C.iy, C.ix*C.groups*C.cin//4, 4))

       assert C.cout % 4 == 0
@@ -348,9 +337,10 @@ class LazyBuffer:
       ret = ret.movement_op(MovementOps.PERMUTE, (0,3,1,2))
       return ret

-    # TODO: fixup C?
-    if NOCONV or not getattr(x.dbuffer, "SUPPORTS_PADDING", False):
+    # add padding if the backend can't handle it
+    if NOCONV or (not getattr(x.dbuffer, "SUPPORTS_PADDING", False) and not (getattr(x.dbuffer, "SUPPORTS_SIMPLE_PADDING", False) and C.px == C.px_ and C.py == C.py_ and C.px >= 0 and C.py >= 0)):
       x = x.slice(((0, x.shape[0]), (0, x.shape[1]), (-C.py, x.shape[2]+C.py_), (-C.px, x.shape[3]+C.px_)))
+      C = C._replace(px=0, px_=0, py=0, py_=0)

     if NOCONV or not getattr(x.dbuffer, "processing_op", False):
       # universal conv, just mul and reduce
diff --git a/tinygrad/llops/ops_cpu.py b/tinygrad/llops/ops_cpu.py
index 08ba212d61..a90faf3328 100644
--- a/tinygrad/llops/ops_cpu.py
+++ b/tinygrad/llops/ops_cpu.py
@@ -39,6 +39,7 @@ class CPUBuffer(np.ndarray, GenericExecAST):

   def processing_op(x,op,w,C):
     assert op == ProcessingOps.CONV, f"{op} isn't supported"
+    assert C.px == 0 and C.px_ == 0 and C.py == 0 and C.py_ == 0, "padding in conv is not supported"
     tx = x.movement_op(MovementOps.STRIDED, (
       (C.bs, C.groups*C.cin*x.shape[2]*x.shape[3]), (C.groups, C.cin*x.shape[2]*x.shape[3]),
       (C.oy, C.sy*x.shape[3]), (C.ox, C.sx), (C.cin, x.shape[2]*x.shape[3]), (C.H, C.dy*x.shape[3]), (C.W, C.dx)))
diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py
index bb75ad16bc..93049416f6 100644
--- a/tinygrad/llops/ops_gpu.py
+++ b/tinygrad/llops/ops_gpu.py
@@ -397,10 +397,6 @@ class GPUBuffer(ExplicitExecAST):
   @property
   def cl(self):
     if self._buf is None:
-      possible_split_shape = [x for x in self._base_shape if x != 1]
-      # TODO: this is broken, and a hack. I suspect the issue is unaligned float4 accesses, would be caught by the Image valid thing if it worked.
-      if IMAGE >= 3 and len(possible_split_shape) == 1 and possible_split_shape[0] % 4 == 0 and self._backing is None and possible_split_shape[0] != 6140:
-        self._base_shape = (1, possible_split_shape[0]//4, 4)
       self._buf = CLImage(self._base_shape) if (len(self._base_shape) == 3 and self._base_shape[2] == 4 and IMAGE >= 2) else CLBuffer(4*prod(self._base_shape))
       if self._backing is not None:
         CL().enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
diff --git a/tinygrad/llops/ops_torch.py b/tinygrad/llops/ops_torch.py
index 4040d29628..0bd6e2bfd9 100644
--- a/tinygrad/llops/ops_torch.py
+++ b/tinygrad/llops/ops_torch.py
@@ -13,6 +13,8 @@ class TorchBuffer(torch.Tensor, GenericExecAST):
   unary_op, binary_op, reduce_op, movement_op = CPUBuffer.unary_op, CPUBuffer.binary_op, CPUBuffer.reduce_op, CPUBuffer.movement_op

+  SUPPORTS_SIMPLE_PADDING = True
+
   def processing_op(x,op,w,C):
     assert op == ProcessingOps.CONV, f"{op} isn't supported"
-    return torch.conv2d(x, w, stride=(C.sy, C.sx), groups=C.groups, dilation=(C.dy, C.dx))
+    return torch.conv2d(x, w, stride=(C.sy, C.sx), groups=C.groups, dilation=(C.dy, C.dx), padding=(C.py, C.px))
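Note on the new padding dispatch in the lazy.py hunk: the rewritten condition is a three-way choice between leaving padding in the conv args (SUPPORTS_PADDING), leaving it only when it is symmetric and non-negative (SUPPORTS_SIMPLE_PADDING, what torch.conv2d's padding= argument can express), and otherwise folding it into the input via slice() and zeroing it in C. Below is a minimal, hypothetical Python sketch of that decision; the needs_explicit_pad helper and the SimpleNamespace backends are illustrative stand-ins, not tinygrad's actual API.

# Sketch only (not part of the diff): the padding dispatch added in lazy.py.
from types import SimpleNamespace

def needs_explicit_pad(dbuffer, px, px_, py, py_, NOCONV=False):
  if NOCONV:
    return True
  # full padding support: the backend consumes px/py/px_/py_ directly
  if getattr(dbuffer, "SUPPORTS_PADDING", False):
    return False
  # "simple" padding support: only symmetric, non-negative padding,
  # matching what torch.conv2d's padding=(py, px) argument accepts
  if getattr(dbuffer, "SUPPORTS_SIMPLE_PADDING", False) and px == px_ and py == py_ and px >= 0 and py >= 0:
    return False
  # otherwise: pre-pad the input with slice(), then zero the padding in C
  return True

torch_like = SimpleNamespace(SUPPORTS_SIMPLE_PADDING=True)
cpu_like   = SimpleNamespace()  # the CPU backend now asserts padding is zero
assert not needs_explicit_pad(torch_like, 1, 1, 1, 1)  # handled by padding=(1,1)
assert needs_explicit_pad(torch_like, 0, 1, 0, 1)      # asymmetric: pre-pad input
assert needs_explicit_pad(cpu_like, 1, 1, 1, 1)        # pre-pad before the STRIDED conv

Under this reading, the new asserts in ops_cpu.py and ops_torch.py are the backend-side halves of the same contract: lazy.py guarantees C.px/C.py are zero (CPU) or symmetric and non-negative (torch) by the time processing_op runs.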