diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py
index 3cc08a012b..26725cd77e 100644
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -195,7 +195,7 @@ class Linearizer(Kernel):
       if isinstance(buf, MemBuffer):
         self.buf_uops[i] = self.uops.add(UOps.DEFINE_GLOBAL,
                                          buf.dtype if isinstance(buf.dtype, ImageDType) else PtrDType(buf.dtype), (),
-                                         (buf.idx, f"data{buf.idx}", i == 0))
+                                         (buf.idx, f"data{buf.idx}", buf.idx == 0))
     # add var vals
     for i,var in enumerate(self.ast.vars()):
       assert var.expr is not None
@@ -327,7 +327,8 @@ class Linearizer(Kernel):
       assert not locals_to_store, "storing locals isn't supported here"
 
       # load earlybufs
-      loaded_buffers.update({b:self.global_load(self.bufs.index(self.local_alias[i]) if i in self.local_alias else i, global_idxs+local_idxs+reduce_idxs+full_upcast_idxs) for i,b in enumerate(self.bufs[1:], start=1) if b in self.earlybufs})  # noqa: E501
+      loaded_buffers.update({b:self.global_load(self.bufs.index(self.local_alias[i]) if i in self.local_alias else i,
+        global_idxs+local_idxs+reduce_idxs+full_upcast_idxs) for i,b in enumerate(self.bufs) if b in self.earlybufs})
 
       # run early AST (with reduce)
       self.ast_parse(self.reduceop, acc, self.acc_offsets(self.full_buf_index), loaded_buffers, do_reduce=True, loop_ctx=loop_ctx)
@@ -382,7 +383,8 @@ class Linearizer(Kernel):
       local_idxs = local_idxs[:self.local_dims] + [NumNode(0) for i in range(self.group_for_reduces)]
 
     # load latebufs
-    loaded_buffers.update({b:self.global_load(i, global_idxs+local_idxs+fake_reduce_idxs+upcast_idxs) for i,b in enumerate(self.bufs) if b not in self.earlybufs and i != 0 and b.__class__ is not LocalBuffer})  # noqa: E501
+    loaded_buffers.update({b:self.global_load(i, global_idxs+local_idxs+fake_reduce_idxs+upcast_idxs) \
+      for i,b in enumerate(self.bufs) if b not in self.earlybufs and b.__class__ is not LocalBuffer})
 
     # run late AST (without the store)
     val = self.ast_parse(self.ast.src[0], acc, None, loaded_buffers)
diff --git a/tinygrad/codegen/uops.py b/tinygrad/codegen/uops.py
index 3d0dd3e280..dbdd091c41 100644
--- a/tinygrad/codegen/uops.py
+++ b/tinygrad/codegen/uops.py
@@ -19,7 +19,7 @@ class UOps(Enum):
 @dataclass(eq=False)
 class UOp:
   uop: UOps
-  dtype: Optional[DType]
+  dtype: Optional[DType] = None
   vin: Tuple[UOp, ...] = tuple()
   arg: Any = None
   def __repr__(self):
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index 1c340bed8a..0084817777 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -50,8 +50,9 @@ class LazyBuffer:
 
   @staticmethod
   def loadop(op, shape:Tuple[sint,...], dtype:DType, device:str, arg=None,
-             src:Optional[LazyBuffer]=None, enable_cache=False, _buf:Optional[Buffer]=None) -> LazyBuffer:
-    ret = create_lazybuffer(device, ShapeTracker.from_shape(shape), dtype, op, arg, (src,) if src is not None else (), enable_cache=enable_cache)
+             src:Tuple[LazyBuffer, ...]=(), enable_cache=False, _buf:Optional[Buffer]=None) -> LazyBuffer:
+    assert isinstance(src, tuple)
+    ret = create_lazybuffer(device, ShapeTracker.from_shape(shape), dtype, op, arg, src, enable_cache=enable_cache)
     if _buf is not None: ret.realized = _buf
     return ret
 
@@ -82,8 +83,8 @@ class LazyBuffer:
     if self.device.startswith("EXT") or self.device.startswith("DISK"):
       # DISK/EXT don't sync
       return create_lazybuffer(device, ShapeTracker.from_shape(self.shape), self.dtype, LoadOps.COPY, None, (self,), enable_cache=False)
-    sync = LazyBuffer.loadop(LoadOps.SYNC, (sync_size,), dtypes.uint32, self.device, src=self, enable_cache=True)
-    wait = LazyBuffer.loadop(LoadOps.WAIT, (0,), dtypes.uint32, device, src=sync, enable_cache=True)
+    sync = LazyBuffer.loadop(LoadOps.SYNC, (sync_size,), dtypes.uint32, self.device, src=(self,), enable_cache=True)
+    wait = LazyBuffer.loadop(LoadOps.WAIT, (0,), dtypes.uint32, device, src=(sync,), enable_cache=True)
     return create_lazybuffer(device, ShapeTracker.from_shape(self.shape), self.dtype, LoadOps.COPY, None, (self, wait), enable_cache=False)
 
   def copy_to_device(self, device:str) -> LazyBuffer:
diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index c5e58941b9..f863a48391 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -118,9 +118,8 @@ def uops_to_cstyle(lang:CStyleLanguage, function_name:str, uops:UOpGraph) -> str
       kk("}")
     elif uop is UOps.STORE:
       assert vin[0].dtype is not None and vin[2].dtype is not None
-      if len(vin) > 3: kk(f"if ({r[vin[3]]}) {{")
-      kk(lang.render_store(r[vin[0]], vin[0].dtype, r[vin[2]], vin[2].dtype, strip_parens(r[vin[1]]), vin[0].uop is UOps.DEFINE_LOCAL))
-      if len(vin) > 3: kk("}")
+      rendered_store = lang.render_store(r[vin[0]], vin[0].dtype, r[vin[2]], vin[2].dtype, strip_parens(r[vin[1]]), vin[0].uop is UOps.DEFINE_LOCAL)
+      kk(f"if ({r[vin[3]]}) {{ {rendered_store} }}" if len(vin) > 3 else rendered_store)
     else:
       assert dtype is not None, f"None dtype for uop {uop}"
       if uop is UOps.LOOP:
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index d39ef74185..c84b29541a 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -37,7 +37,7 @@ class Function:
 
 import tinygrad.mlops as mlops
 
-def _loadop(op, shape:Tuple[sint,...], dtype:DType, device:Union[str, Tuple[str, ...]], arg=None, src:Optional[LazyBuffer]=None):
+def _loadop(op, shape:Tuple[sint,...], dtype:DType, device:Union[str, Tuple[str, ...]], arg=None, src:Tuple[LazyBuffer, ...]=()):
   if isinstance(device, str): return LazyBuffer.loadop(op, shape, dtype, device, arg, src)
   return MultiLazyBuffer([LazyBuffer.loadop(op, shape, dtype, d, arg, src) for d in device], None)
 