From 2977fb17f6875a1c11885a5f00dd02a2d3d982c5 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Mon, 26 Jun 2023 15:41:23 -0700
Subject: [PATCH] various touchups (#1058)

* op isn't optional

* barrier + named local buffers

* end global and local loop together to avoid useless if statement

* better comments
---
 test/test_speed_v_torch.py     |  3 ---
 tinygrad/codegen/cstyle.py     |  5 +++--
 tinygrad/codegen/linearizer.py | 18 ++++++++++--------
 tinygrad/lazy.py               |  5 ++---
 tinygrad/ops.py                |  2 +-
 5 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/test/test_speed_v_torch.py b/test/test_speed_v_torch.py
index 4792c0af51..33ff921706 100644
--- a/test/test_speed_v_torch.py
+++ b/test/test_speed_v_torch.py
@@ -119,9 +119,6 @@ class TestBigSpeed(unittest.TestCase):
   def test_exp(self):
     def f(a, b): return a.exp()
     helper_test_generic_square('exp', 16384, f, f, onearg=True)
-  def test_gemm_1024(self):
-    def f(a, b): return a @ b
-    helper_test_generic_square('gemm', 1024, f, f)
   def test_gemm_2048(self):
     def f(a, b): return a @ b
     helper_test_generic_square('gemm', 2048, f, f)
diff --git a/tinygrad/codegen/cstyle.py b/tinygrad/codegen/cstyle.py
index 12d3dff9e4..963c260239 100644
--- a/tinygrad/codegen/cstyle.py
+++ b/tinygrad/codegen/cstyle.py
@@ -63,7 +63,7 @@ def uops_to_cstyle(uops:List[UOp], bufs:List[Union[LocalBuffer,LazyBuffer]], lan
   local_size = []
   pend_close = None
 
-  bufnames = ["temp" if isinstance(b, LocalBuffer) else f"data{i}" for i,b in enumerate(bufs)]
+  bufnames = [b.name if isinstance(b, LocalBuffer) else f"data{i}" for i,b in enumerate(bufs)]
 
   depth = 0
   def kk(s): kernel.append("  "*depth+s)
@@ -88,10 +88,11 @@ def uops_to_cstyle(uops:List[UOp], bufs:List[Union[LocalBuffer,LazyBuffer]], lan
       else:
         kk(f"for (int {var.expr} = {var.min}; {var.expr} <= {var.max}; ++{var.expr}) {{")
         depth += 1
+    elif uop == UOps.BARRIER:
+      kk(lang.barrier)
     elif uop == UOps.ENDLOOP:
       if args[1] == "local" and len(lang.lid):
         # TODO: this is a bit of a hack. the local loop isn't real on the GPU
-        kk(lang.barrier)
         kk(f"if ({Variable.sum(args[0]).render(render_cl)} == 0) {{")
         pend_close = "}"*(len(args[0])+1) + f" /* {args[1]} */"
       else:
diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py
index 0283cecc40..082deb1924 100644
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -10,10 +10,11 @@ from tinygrad.ops import MovementOps, ReduceOps, BinaryOps, FusedOps
 from tinygrad.shape.shapetracker import ShapeTracker, strides_for_shape
 from tinygrad.shape.symbolic import Variable
 
-class UOps(Enum): LOOP = auto(); DEFINE_LOCAL = auto(); LOAD = auto(); ALU = auto(); CONST = auto(); ENDLOOP = auto(); STORE = auto(); CAST = auto(); \
+class UOps(Enum): LOOP = auto(); DEFINE_LOCAL = auto(); LOAD = auto(); ALU = auto(); CONST = auto(); ENDLOOP = auto(); STORE = auto(); CAST = auto(); BARRIER = auto(); \
                   SPECIAL = auto(); DEFINE_REGISTER = auto(); LABEL = auto(); COND_BRANCH = auto() # noqa: E702
 
 class LocalBuffer(NamedTuple):
+  name: str
   dtype: DType = dtypes.float32
   realized: None = None
 
@@ -223,7 +224,7 @@ class Linearizer:
 
     # add a local buffer for multistage reduce
     if len(self.group_for_reduce):
-      self.bufs.append(LocalBuffer())
+      self.bufs.append(LocalBuffer("temp"))
       # TODO: the strides of this can be controlled
       self.sts.append(ShapeTracker(tuple([1] * self.first_reduce + self.group_for_reduce + [1] * (self.shape_len - self.upcasted - len(self.group_for_reduce) - self.first_reduce) + [x[0] for x in self.upcasted_axis(0)])))
       self.uop(UOps.DEFINE_LOCAL, None, [], ("temp", self.sts[-1].size()))
@@ -280,7 +281,8 @@ class Linearizer:
     if self.group_for_reduce:
       fake_global_idxs = [x*0 for x in global_idxs]
       self.global_store(-1, fake_global_idxs+local_idxs+fake_reduce_idxs, acc, ssa)  # store accumulators
-      self.uop(UOps.ENDLOOP, None, [], (local_idxs, "local"))  # this is a barrier on GPUs
+      self.uop(UOps.BARRIER, None, [], ())
+      self.uop(UOps.ENDLOOP, None, [], (local_idxs, "local"))
 
       # local indexs are over, 0 them out
       local_idxs = [x*0 for x in local_idxs]
@@ -320,11 +322,11 @@ class Linearizer:
       self.global_store(0, global_idxs+local_idxs+fake_reduce_idxs, val, ssa)
 
     if not self.group_for_reduce:
-      # end the local loop
-      self.uop(UOps.ENDLOOP, None, [], (local_idxs, "local"))
-
-      # end the global loop
-      self.uop(UOps.ENDLOOP, None, [], (global_idxs, "global"))
+      # end the global+local loop
+      self.uop(UOps.ENDLOOP, None, [], (global_idxs+local_idxs, "global+local"))
+    else:
+      # end the global loop
+      self.uop(UOps.ENDLOOP, None, [], (global_idxs, "global"))
 
   _OT = TypeVar("_OT")
   def uop(self, uop:UOps, out:_OT, vin:List[Token], arg:Any=None) -> _OT:
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index d191927a37..b24cef4eea 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -90,9 +90,8 @@ class LazyBuffer:
     # TODO: does children have to be a ref count instead of a set? can a Buffer be a double child?
     self.children: LightWeakSet = LightWeakSet()
     # NOTE: op should be read only after construction of LazyBuffer
-    if op:
-      self.op: LazyOp = op
-      for x in op.buffers: x.children.add(self)
+    self.op: LazyOp = op
+    for x in op.buffers: x.children.add(self)
     if not LAZY: self.realize()
 
     # log phantom ops to the graph
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index 4f035ba199..3f755e0180 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -33,7 +33,7 @@ class LazyOp:
     self.op = op
     self.src = src
     self.arg = arg
-    # TODO: this hasattr is required because the key function maps the buffers to ints
+    # TODO: this hasattr is required because the linearizer's key function maps the buffers to ints
     self.buffers = functools.reduce(lambda x,s: (x+s.buffers) if hasattr(s, 'buffers') else x, src, tuple())
 
   def __repr__(self): return f"LazyOp(op={self.op}, src={self.src}, arg={self.arg})"
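
A quick illustration of the buffer-naming and barrier change above: LocalBuffer now carries its own name, so the cstyle renderer reads b.name instead of hardcoding "temp", and the barrier is emitted as a standalone BARRIER uop right before the local ENDLOOP. The snippet below is a minimal, self-contained sketch of that naming logic; FakeLazyBuffer and the trimmed-down LocalBuffer are stand-ins invented here for illustration, not the real tinygrad classes.

# Minimal sketch (not tinygrad itself): a local buffer carries a name, and the
# renderer-side bufnames logic picks it up instead of a hardcoded "temp".
from typing import NamedTuple, Union, List

class LocalBuffer(NamedTuple):   # trimmed stand-in; the real one also has dtype/realized
  name: str

class FakeLazyBuffer:            # hypothetical stand-in for a real LazyBuffer
  pass

bufs: List[Union[LocalBuffer, FakeLazyBuffer]] = [FakeLazyBuffer(), LocalBuffer("temp"), FakeLazyBuffer()]

# mirrors the patched line in uops_to_cstyle: named local buffers, data{i} for the rest
bufnames = [b.name if isinstance(b, LocalBuffer) else f"data{i}" for i, b in enumerate(bufs)]
print(bufnames)  # ['data0', 'temp', 'data2']

# on the linearizer side, a grouped reduce now emits uop(UOps.BARRIER, ...) explicitly
# before uop(UOps.ENDLOOP, ...), rather than hiding the barrier inside the local ENDLOOP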