diff --git a/docs/abstractions.py b/docs/abstractions.py
index 2b6dbd1115..6d66104b48 100644
--- a/docs/abstractions.py
+++ b/docs/abstractions.py
@@ -98,8 +98,8 @@ class LazyOp:
   src: Tuple[Union[LazyOp, LazyBuffer], ...]  # the sources
   arg: Optional[Any] = None                   # and an optional static argument
 
-# there's currently 28 Ops you have to implement for an accelerator.
-class UnaryOps(Enum): NOOP = auto(); EXP2 = auto(); LOG2 = auto(); CAST = auto(); SIN = auto(); SQRT = auto()
+# there's currently 27 Ops you have to implement for an accelerator.
+class UnaryOps(Enum): EXP2 = auto(); LOG2 = auto(); CAST = auto(); SIN = auto(); SQRT = auto()
 class BinaryOps(Enum): ADD = auto(); SUB = auto(); MUL = auto(); DIV = auto(); CMPLT = auto(); MAX = auto()
 class ReduceOps(Enum): SUM = auto(); MAX = auto()
 class MovementOps(Enum): RESHAPE = auto(); PERMUTE = auto(); EXPAND = auto(); PAD = auto(); SHRINK = auto(); STRIDE = auto()
diff --git a/extra/optimization/helpers.py b/extra/optimization/helpers.py
index d596b0ae68..06bdcfb094 100644
--- a/extra/optimization/helpers.py
+++ b/extra/optimization/helpers.py
@@ -8,6 +8,8 @@ inf, nan = float('inf'), float('nan')
 
 # HACK: it used to be called MEM
 setattr(BufferOps, "MEM", BufferOps.LOAD)
+# HACK: no more NOOP
+setattr(UnaryOps, "NOOP", UnaryOps.NEG)
 
 # kernel unpacker
 from tinygrad.codegen.linearizer import Linearizer
diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py
index 181ac43527..40f520f8aa 100644
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -487,7 +487,6 @@ class Linearizer(Kernel):
 
   def ast_parse(self, x:LazyOp, acc: List[UOp], offs:Optional[List[int]], loaded_buffers:Dict[Union[MemBuffer, ConstBuffer, LocalBuffer], List[UOp]], do_reduce=False, loop_ctx=tuple()) -> List[UOp]:
     if x.op in BufferOps: return loaded_buffers[x.arg]
-    if x.op == UnaryOps.NOOP: return self.ast_parse(cast(LazyOp, x.src[0]), acc, offs, loaded_buffers)
     if x.op == UnaryOps.CAST: return [self.uop(UOps.CAST, x.arg[0], (u,), x.arg) if not isinstance(x.arg[0], ImageDType) else u for u in self.ast_parse(cast(LazyOp, x.src[0]), acc, offs, loaded_buffers)]
     if x.op in ReduceOps and not do_reduce:
       assert offs is None, "not available if we aren't doing reduce"
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index 77da55446b..e7fd20307f 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -70,7 +70,7 @@ def _replace_bufferops(op:LazyOp) -> Tuple[LazyOp, List[LazyBuffer]]:
       replacements[x] = LazyOp(BufferOps.CONST, (), ConstBuffer(float(x.base.op.arg), x.dtype, st))
     else:
       raise NotImplementedError(f"not handled {x}")
-  return (op.src[0] if op.op == MovementOps.RESHAPE else op).map_buffers(replacements), base_bufs
+  return (op.src[0] if op.op in {MovementOps.RESHAPE, LoadOps.CONTIGUOUS} else op).map_buffers(replacements), base_bufs
 
 # **** lazy operations ****
 
@@ -151,9 +151,7 @@ class LazyBuffer:
     seen.add(self)
     if self.base is not self: return self.base.schedule(seen)
 
-    # rewrite unbased CONTIGUOUS into UnaryOps.NOOP
-    op = self.op if self.op.op != LoadOps.CONTIGUOUS else LazyOp(UnaryOps.NOOP, self.op.src)
-
+    op = self.op
     if self.optype is BinaryOps: op = _ast_binaryops(op, self.shape)
     elif self.optype is ReduceOps: op = _ast_reduceops(op)
@@ -163,7 +161,6 @@ class LazyBuffer:
     var_vals = merge_dicts([self.st.var_vals] + [buf.st.var_vals for buf in op.buffers])
 
-    # run the ast and log the op
     op, base_bufs = _replace_bufferops(op)
 
     # add the store
@@ -204,7 +201,6 @@ class LazyBuffer:
       # this will turn into nothing, it's based and a copy
       # TODO: based lazybuffers shouldn't take dtype or var_vals, same issue in movementops
       return create_lazybuffer(self.device, ShapeTracker.from_shape(tuple(self.shape)), LoadOps, LazyOp(LoadOps.CONTIGUOUS, (self,), None), self.dtype, base=self.base)
-    # real contiguous, this will turn into a UnaryOps.NOOP
     return LazyBuffer.loadop(LoadOps.CONTIGUOUS, self.shape, self.dtype, self.device, src=self)
 
   @staticmethod
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index cdb4090063..887a6fa058 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -10,7 +10,7 @@ from dataclasses import dataclass
 # the Enum class doesn't work with mypy, this is static. sorry it's ugly
 # NOTE: MOD, CMPLT don't have to be implemented on vectors, just scalars
 # NOTE: rdna3 only has RECIP and not DIV. DIV and POW are on the chopping block
-class UnaryOps(Enum): NOOP = auto(); EXP2 = auto(); LOG2 = auto(); CAST = auto(); SIN = auto(); SQRT = auto(); RECIP = auto(); NEG = auto() # noqa: E702
+class UnaryOps(Enum): EXP2 = auto(); LOG2 = auto(); CAST = auto(); SIN = auto(); SQRT = auto(); RECIP = auto(); NEG = auto() # noqa: E702
 class BinaryOps(Enum): ADD = auto(); SUB = auto(); MUL = auto(); DIV = auto(); MAX = auto(); MOD = auto(); CMPLT = auto() # noqa: E702
 class TernaryOps(Enum): MULACC = auto(); WHERE = auto() # noqa: E702
 class ReduceOps(Enum): SUM = auto(); MAX = auto() # noqa: E702
diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
index 91253d9e63..5bc645c24b 100644
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -26,7 +26,7 @@ def einsum_mulacc(einsum, get_strides, expand):
 numpy_fxn_for_op: Dict[Op, Callable] = {
   BufferOps.CONST: lambda val, dtype: np.array(val, dtype=dtype.np),
-  UnaryOps.NOOP: lambda x: np.require(x, requirements='C'), UnaryOps.EXP2: np.exp2, UnaryOps.LOG2: np.log2, UnaryOps.SIN: np.sin,
+  UnaryOps.EXP2: np.exp2, UnaryOps.LOG2: np.log2, UnaryOps.SIN: np.sin,
   UnaryOps.CAST: lambda x,y: x.view(y[0].np) if y[1] else x.astype(y[0].np, copy=False), UnaryOps.NEG: lambda x: np.logical_not(x) if x.dtype == np.bool_ else np.negative(x),
   BinaryOps.MAX: np.maximum, BinaryOps.CMPLT: lambda x,y: (x
diff --git a/tinygrad/runtime/ops_disk.py b/tinygrad/runtime/ops_disk.py
--- a/tinygrad/runtime/ops_disk.py
+++ b/tinygrad/runtime/ops_disk.py
   memoryview: return memoryview(self.ud.mem).cast("B")[self.offset:self.offset+self.size*self.dtype.itemsize]
 
-disk_fxn_for_op: Dict[Op, Callable] = { UnaryOps.NOOP: lambda x: x, UnaryOps.CAST: DiskBuffer.cast, MovementOps.AS_STRIDED: DiskBuffer.as_strided }
+disk_fxn_for_op: Dict[Op, Callable] = { UnaryOps.CAST: DiskBuffer.cast, MovementOps.AS_STRIDED: DiskBuffer.as_strided }
 
 MAP_LOCKED, MAP_POPULATE = 0x2000, 0x008000
 class DiskAllocator(Allocator):
diff --git a/tinygrad/runtime/ops_torch.py b/tinygrad/runtime/ops_torch.py
index c661f6abb5..c4be5bcfbc 100644
--- a/tinygrad/runtime/ops_torch.py
+++ b/tinygrad/runtime/ops_torch.py
@@ -26,7 +26,7 @@ torch_fxn_for_op: Dict[Op, Callable] = {
   # TODO: torch.tensor should work here. it doesn't due to "overflow" in uint8
   #BufferOps.CONST: lambda val, dtype: torch.tensor(val, device=device, dtype=inverse_type_map[dtype]),
   BufferOps.CONST: lambda val, dtype: torch.from_numpy(np.array(val, dtype=dtype.np)).to(device),
-  UnaryOps.NOOP: lambda x: x.contiguous(), UnaryOps.SQRT: lambda x: x.sqrt(), UnaryOps.EXP2: lambda x: x.exp2(), UnaryOps.LOG2: lambda x: x.log2(), UnaryOps.SIN: torch.sin,
+  UnaryOps.SQRT: lambda x: x.sqrt(), UnaryOps.EXP2: lambda x: x.exp2(), UnaryOps.LOG2: lambda x: x.log2(), UnaryOps.SIN: torch.sin,
   UnaryOps.CAST: lambda x,y: (x.view if y[1] else x.type)(next(k for k,v in type_map.items() if v==y[0])), UnaryOps.NEG: lambda x: torch.logical_not(x) if x.dtype is torch.bool else torch.neg(x),
   BinaryOps.MAX: torch.maximum, BinaryOps.CMPLT: lambda x,y: (x