From 039f0d372f5ecf7beaf4c235819399d4b6109f3d Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Thu, 15 Jun 2023 16:24:45 -0700
Subject: [PATCH] delete ltypes (#984)

* delete ltypes

* only upcast float types

* test dtype on mac passes

* ugh, these upcasts
---
 test/test_dtype.py             | 11 ++++----
 tinygrad/codegen/cstyle.py     | 23 ++++++++--------
 tinygrad/codegen/linearizer.py | 50 ++++++++++++++++------------------
 tinygrad/helpers.py            |  8 +++++-
 tinygrad/lazy.py               |  1 +
 5 files changed, 49 insertions(+), 44 deletions(-)

diff --git a/test/test_dtype.py b/test/test_dtype.py
index 8fb1202c52..b0fc00ff26 100644
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@@ -3,7 +3,7 @@ import numpy as np
 from tinygrad.helpers import getenv, DType, DEBUG
 from tinygrad.lazy import Device
 from tinygrad.tensor import Tensor, dtypes
-
+from extra.utils import OSX
 
 def _test_to_np(a:Tensor, np_dtype, target):
   print(a)
@@ -31,7 +31,7 @@ def _test_matmul_upcast(a:Tensor, b:Tensor, target_dtype:DType, target): _test_o
 @unittest.skipIf(getenv("CI", "") != "" and Device.DEFAULT in ["LLVM"], "float16 broken in some CI backends")
 class TestHalfDtype(unittest.TestCase):
   def test_half_to_np(self): _test_to_np(Tensor([1,2,3,4], dtype=dtypes.float16), np.float16, [1,2,3,4])
-  
+
   def test_half_to_float(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.float32, [1,2,3,4])
   def test_half_to_int8(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.int8, [1,2,3,4])
   def test_half_to_uint8(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.uint8, [1,2,3,4])
@@ -40,11 +40,11 @@ class TestHalfDtype(unittest.TestCase):
   def test_float_to_half(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.float16, [1,2,3,4])
   def test_int8_to_half(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.int8), dtypes.float16, [1,2,3,4])
   def test_uint8_to_half(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.uint8), dtypes.float16, [1,2,3,4])
-  
+
   def test_half_add(self): _test_add(Tensor([1,2,3,4], dtype=dtypes.float16), Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.float16, [2,4,6,8])
   def test_half_mul(self): _test_mul(Tensor([1,2,3,4], dtype=dtypes.float16), Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.float16, [1,4,9,16])
   def test_half_matmul(self): _test_matmul(Tensor([[1,2],[3,4]], dtype=dtypes.float16), Tensor.eye(2, dtype=dtypes.float16), dtypes.float16, [[1,2],[3,4]])
-  
+
   def test_half_add_upcast_float(self): _test_add_upcast(Tensor([1,2,3,4], dtype=dtypes.float16), Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.float32, [2,4,6,8])
   def test_int8_add_upcast_half(self): _test_add_upcast(Tensor([1,2,3,4], dtype=dtypes.int8), Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.float16, [2,4,6,8])
   def test_int8_mul_upcast_half(self): _test_mul_upcast(Tensor([1,2,3,4], dtype=dtypes.int8), Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.float16, [1,4,9,16])
@@ -52,7 +52,8 @@ class TestHalfDtype(unittest.TestCase):
   def test_half_matmul_upcast_float(self): _test_matmul_upcast(Tensor([[1,2],[3,4]], dtype=dtypes.float16), Tensor.eye(2, dtype=dtypes.float32), dtypes.float32, [[1,2],[3,4]])
   def test_int8_matmul_upcast_half(self): _test_matmul_upcast(Tensor([[1,2],[3,4]], dtype=dtypes.int8), Tensor.eye(2, dtype=dtypes.float16), dtypes.float16, [[1,2],[3,4]])
 
-class TestFloatDtype(unittest.TestCase):
+@unittest.skipIf(OSX and Device.DEFAULT in ["GPU", "METAL"], "GPU on Mac doesn't support float64")
+class TestFloat64Dtype(unittest.TestCase):
   def test_float64_to_np(self): _test_to_np(Tensor([1,2,3,4], dtype=dtypes.float64), np.float64, [1,2,3,4])
   def test_float64_add(self): _test_add(Tensor([1,2,3,4], dtype=dtypes.float64), Tensor([1,2,3,4], dtype=dtypes.float64), dtypes.float64, [2,4,6,8])
   def test_float64_mul(self): _test_mul(Tensor([1,2,3,4], dtype=dtypes.float64), Tensor([1,2,3,4], dtype=dtypes.float64), dtypes.float64, [1,4,9,16])
diff --git a/tinygrad/codegen/cstyle.py b/tinygrad/codegen/cstyle.py
index d0d3a3f141..f16802f0f8 100644
--- a/tinygrad/codegen/cstyle.py
+++ b/tinygrad/codegen/cstyle.py
@@ -1,6 +1,6 @@
 from typing import Final, Dict, Callable, ClassVar, List, Optional, NamedTuple, DefaultDict, Tuple, Set, Union
 import math, collections
-from tinygrad.codegen.linearizer import Linearizer, UOps, UOp, LocalBuffer, LocalTypes
+from tinygrad.codegen.linearizer import Linearizer, UOps, UOp, LocalBuffer
 from tinygrad.ops import ASTRunner, Op, UnaryOps, BinaryOps, FusedOps
 from tinygrad.helpers import partition, ImageDType, DEBUG, dtypes, colored
 from tinygrad.runtime.lib import RawConst
@@ -105,7 +105,7 @@ def uops_to_cstyle(uops:List[UOp], bufs:List[Union[LocalBuffer,LazyBuffer]], lan
       assert newvar is not None
       if args == -math.inf:
         kk(f"{newvar.render(True)} = -INFINITY;")
-      elif newvar.ltype == LocalTypes.float4:
+      elif newvar.dtype == dtypes._float4:
         kk(f"{newvar.render(True)} = {{ {args}f, {args}f, {args}f, {args}f }};")
       else:
         kk(f"{newvar.render(True)} = {args}f;")
@@ -118,42 +118,42 @@ def uops_to_cstyle(uops:List[UOp], bufs:List[Union[LocalBuffer,LazyBuffer]], lan
     elif uop == UOps.LOAD and newvar is not None:
       # TODO: merge with CONST?
       if bufs[args.i] is not None and isinstance(bufs[args.i].realized, RawConst):
-        assert newvar.ltype == LocalTypes.float, "const can't be float4"
+        assert newvar.dtype == dtypes.float, "const can't be float4"
         x = bufs[args.i].realized._buf
         if math.isnan(x): val = "NAN"
         elif math.isinf(x): val = ("-" if x < 0 else "") + "INFINITY"
         else: val = f"{x}" + ("f" if not dtypes.is_int(bufs[args.i].dtype) else "")
       elif isinstance(bufs[args.i].dtype, ImageDType):
-        assert newvar.ltype == LocalTypes.float4, "image must be float4"
+        assert newvar.dtype == dtypes._float4, "image must be float4"
         prekernel.add("const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n")
         idx, idy = to_image_idx(bufs[args.i].dtype.shape, args.idx, args.valid)
         val = f"read_imagef({bufnames[args.i]}, smp, (int2)({idx.render(render_cl)}, {idy.render(render_cl)}))"
       else:
         if lang.uses_vload and bufs[args.i].dtype == dtypes.float16:
-          if newvar.ltype == LocalTypes.float4:
+          if newvar.dtype == dtypes._float4:
             val = f"vload_half4({(args.idx//4).render(render_cl)}, {bufnames[args.i]})"
           else:
             val = f"vload_half({args.idx.render(render_cl)}, {bufnames[args.i]})"
         else:
-          if newvar.ltype == LocalTypes.float4:
-            val = f"({newvar.ltype.name})((({lang.smem_prefix if isinstance(bufs[args.i], LocalBuffer) else lang.buffer_prefix}{bufs[args.i].dtype.name}4*){bufnames[args.i]})[{(args.idx//4).render(render_cl)}])"
+          if newvar.dtype == dtypes._float4:
+            val = f"({newvar.dtype.name})((({lang.smem_prefix if isinstance(bufs[args.i], LocalBuffer) else lang.buffer_prefix}{bufs[args.i].dtype.name}4*){bufnames[args.i]})[{(args.idx//4).render(render_cl)}])"
          else:
            val = f"{bufnames[args.i]}[{args.idx.render(render_cl)}]"
       # NOTE: if min and max are both 0, it should be a CONST in the Linearizer
       if args.valid.min == 1: kk(f"{newvar.render(True)} = {val};")
       else:
-        casts = {LocalTypes.float4: ("", f"{lang.float4}(0.0f, 0.0f, 0.0f, 0.0f)"), LocalTypes.half: ("(half)", "(half)(0.0f)"), LocalTypes.float: ("(float)", "0.0f")}[newvar.ltype]
+        casts = {dtypes._float4: ("", f"{lang.float4}(0.0f, 0.0f, 0.0f, 0.0f)"), dtypes.half: ("(half)", "(half)(0.0f)"), dtypes.float: ("(float)", "0.0f")}[newvar.dtype]
         kk(f"{newvar.render(True)} = ({args.valid.render(render_cl)}) ? {casts[0]}({val}) : {casts[1]};")
-    elif uop == UOps.STORE and (vin[0].ltype == LocalTypes.float or (vin[0].ltype == LocalTypes.float4 and vin[0].offset is not None)):
+    elif uop == UOps.STORE and (vin[0].dtype == dtypes.float or (vin[0].dtype == dtypes._float4 and vin[0].offset is not None)):
       assert not isinstance(bufs[args.i].dtype, ImageDType), "image store must be float4"
       assert args.valid.min == 1, "store must be valid"
       if lang.uses_vload and bufs[args.i].dtype == dtypes.float16:
         kk(f"vstore_half({vin[0].render()}, {args.idx.render(render_cl)}, {bufnames[args.i]});")
       else:
         kk(f"{bufnames[args.i]}[{args.idx.render(render_cl)}] = {vin[0].render()};")
-    elif uop == UOps.CAST and newvar is not None and newvar.ltype == LocalTypes.float4:
+    elif uop == UOps.CAST and newvar is not None and newvar.dtype == dtypes._float4:
       kk(f"{newvar.render(True)} = {lang.float4}({','.join([x.render() for x in vin])});")
-    elif uop == UOps.STORE and len(vin) != 0 and vin[0].ltype == LocalTypes.float4 and vin[0].offset is None:
+    elif uop == UOps.STORE and len(vin) != 0 and vin[0].dtype == dtypes._float4 and vin[0].offset is None:
       assert args.valid.min == 1, "store must be valid"
       if isinstance(bufs[args[0]].dtype, ImageDType):
         idx, idy = to_image_idx(bufs[args.i].dtype.shape, args[1], args[2])
@@ -172,7 +172,6 @@ def uops_to_cstyle(uops:List[UOp], bufs:List[Union[LocalBuffer,LazyBuffer]], lan
     [', '.join([f'{t} {bufnames[i]}' for i,t in buftypes] + lang.extra_args)] +
     [") {\n"] + list(prekernel) + ['\n'.join(kernel), "\n}"])
-  if lang.half_prekernel: prg =''.join([f"{lang.half_prekernel}", "\n", prg])
   if lang.double_prekernel: prg = ''.join([f"{lang.double_prekernel}", "\n", prg])
   return prg, global_size, local_size
diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py
index 7c47b39b3b..ed52fa9f24 100644
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -18,21 +18,18 @@ class LocalBuffer(NamedTuple):
   dtype: DType = dtypes.float32
   realized: None = None
 
-# NOTE: half and half4 are not actually used yet
-class LocalTypes(Enum): float = auto(); float4 = auto(); half = auto(); half4 = auto(); simdgroup_float8x8 = auto() # noqa: E702
-
 class Token(NamedTuple):
   name: str
-  ltype: LocalTypes
+  dtype: DType
   offset: Optional[int] = None
   def render(self, with_type=False):
     if with_type:
       assert self.offset is None
-      return f"{self.ltype.name} {self.name}"
+      return f"{self.dtype.name} {self.name}"
     if self.offset is None: return self.name
-    assert self.ltype == LocalTypes.float4
+    assert self.dtype == dtypes._float4
     return self.name+"."+"xyzw"[int(self.offset)]
-  def __repr__(self): return f"<{self.name}>" if self.offset is None and self.ltype == LocalTypes.float else f"<{self.name}:{self.ltype.name}:{self.offset}>"
+  def __repr__(self): return f"<{self.name}>" if self.offset is None and self.dtype == dtypes.float32 else f"<{self.name}:{self.dtype.name}:{self.offset}>"
 
 # TODO: the next three functions are poorly written
 def get_grouped_float4_idxs(acc:List[Token]) -> Optional[List[int]]:
@@ -40,13 +37,13 @@ def get_grouped_float4_idxs(acc:List[Token]) -> Optional[List[int]]:
   for i,a in enumerate(acc):
     if idxs is None: break
     if i in idxs: continue
-    if a.ltype == LocalTypes.float4 and a.offset == 0:
+    if a.dtype.sz > 1 and a.offset == 0:
       idxs.append(i)
       friends: List[int] = []
       for j,b in enumerate(acc):
         if len(friends) == 3: break
         if j in idxs: continue
-        if a.name == b.name and b.ltype == LocalTypes.float4 and b.offset == len(friends)+1:
+        if a.name == b.name and b.dtype.sz > 1 and b.offset == len(friends)+1:
           friends.append(j)
       if len(friends) == 3: idxs += friends
       else: idxs = None
@@ -56,8 +53,8 @@ def get_grouped_float4_idxs(acc:List[Token]) -> Optional[List[int]]:
 
 def to_float4(x:List[Token]) -> Optional[Token]:
   if all_same(x): return x[0]
-  if all_same([y.name for y in x]) and all([y.ltype == LocalTypes.float4 and y.offset == i for i,y in enumerate(x)]):
-    return Token(x[0].name, LocalTypes.float4)
+  if all_same([y.name for y in x]) and all([y.dtype == dtypes._float4 and y.offset == i for i,y in enumerate(x)]):
+    return Token(x[0].name, dtypes._float4)
   return None
 
 def get_grouped_maybe_float4(*values:List[Token], grouping_allowed=True):
@@ -170,10 +167,10 @@ class Linearizer:
     return store_offset_float4
 
   def global_load(self, i, idxs:List[Variable], const=None) -> List[Token]:
-    load_offset: Dict[Tuple[int, ...], Any] = {uidxs:(LocalTypes.float,uidxs)+self.sts[i].expr_idxs(idxs+[Variable.num(x) for x in uidxs[::-1]]) for uidxs in self.shape_offsets(i)}
+    load_offset: Dict[Tuple[int, ...], Any] = {uidxs:(dtypes.float,uidxs)+self.sts[i].expr_idxs(idxs+[Variable.num(x) for x in uidxs[::-1]]) for uidxs in self.shape_offsets(i)}
 
     # float4 grouping (optional)
-    should_upcast = self.supports_float4 and len(self.float4_axis(i)) == 1
+    should_upcast = self.supports_float4 and (self.bufs[i].dtype in [dtypes.float32, dtypes.float16] or isinstance(self.bufs[i].dtype, ImageDType)) and len(self.float4_axis(i)) == 1
     if should_upcast:
       load_offset_new = {}
       for k,out_tokens in self._group_float4(i, load_offset).items():
@@ -183,7 +180,7 @@ class Linearizer:
           # idxs not in order, valids don't match, or idx doesn't evenly divide 4. use normal float
           for x in out_tokens: load_offset_new[x[1]] = x
         else:
-          load_offset_new[k] = (LocalTypes.float4, [x[1] for x in out_tokens], out_tokens[0][2], out_tokens[0][3])
+          load_offset_new[k] = (dtypes._float4, [x[1] for x in out_tokens], out_tokens[0][2], out_tokens[0][3])
       load_offset = load_offset_new
 
     # do loads
@@ -193,9 +190,9 @@ class Linearizer:
      key = f"{localtype}{idx.render()}{valid.render()}"
      if key not in cache:
        cache[key] = self.uop(UOps.LOAD, Token(f"val{mnum(i)}_{len(cache)}", localtype), [], MemOp(i, idx, valid)) if const is None else self.uop(UOps.CONST, Token(f"acc{mnum(i)}_{len(cache)}", localtype), [], const)
-     if localtype == LocalTypes.float4:
+     if localtype == dtypes._float4:
        for j,uidx in enumerate(uidx_list):
-         loaded[uidx] = Token(cache[key].name, LocalTypes.float4, j)
+         loaded[uidx] = Token(cache[key].name, dtypes._float4, j)
      else:
        loaded[uidxs] = cache[key]
    return [loaded[uidxs] for uidxs in self.shape_offsets(i)]
@@ -204,14 +201,15 @@ class Linearizer:
     store_offset: Dict[Tuple[int, ...], Token] = dict(zip(self.shape_offsets(i), store))
 
     # float4 grouping (optional)
-    should_upcast = self.supports_float4 and (self.bufs[i].dtype not in (dtypes.float16, dtypes.int8, dtypes.uint8)) and len(self.float4_axis(i)) == 1
+    # TODO: why does this not work for float16?
+    should_upcast = self.supports_float4 and (self.bufs[i].dtype == dtypes.float32 or isinstance(self.bufs[i].dtype, ImageDType)) and len(self.float4_axis(i)) == 1
     if should_upcast:
       store_offset_new = {}
       for k,out_tokens in self._group_float4(i, store_offset).items():
         if all_same([x.name for x in out_tokens]) and tuple(range(4)) == tuple(x.offset for x in out_tokens):
-          store_offset_new[k] = Token(out_tokens[0].name, LocalTypes.float4)
+          store_offset_new[k] = Token(out_tokens[0].name, dtypes._float4)
         else:
-          store_offset_new[k] = self.uop(UOps.CAST, ssa("alu", LocalTypes.float4), out_tokens)
+          store_offset_new[k] = self.uop(UOps.CAST, ssa("alu", dtypes._float4), out_tokens)
       store_offset = store_offset_new
 
     # do stores
@@ -242,7 +240,7 @@ class Linearizer:
 
     # ssa
     _ssa:DefaultDict[str,int] = defaultdict(int)
-    def ssa(name, ltype=LocalTypes.float) -> Token:
+    def ssa(name, ltype=dtypes.float) -> Token:
       _ssa[name] += 1
       return Token(f"{name}{_ssa[name]-1}", ltype)
@@ -345,12 +343,12 @@ class Linearizer:
       if isinstance(x.op, (ReduceOps, FusedOps)):
         ret = [(idx, self.uop(UOps.ALU, val[0], list(val), {ReduceOps.SUM:BinaryOps.ADD, ReduceOps.MAX:BinaryOps.MAX, FusedOps.MULACC:FusedOps.MULACC}[x.op])) for idx, val in get_grouped_maybe_float4(acc, *values, grouping_allowed=self.supports_float4_alu)]
       else:
-        ret = [(idx, self.uop(UOps.ALU, ssa('alu', LocalTypes.float4) if any(x.ltype == LocalTypes.float4 and x.offset is None for x in val) else ssa('alu'), list(val), x.op)) for idx, val in get_grouped_maybe_float4(*values, grouping_allowed=self.supports_float4_alu and x.op!=BinaryOps.CMPEQ)]
+        ret = [(idx, self.uop(UOps.ALU, ssa('alu', dtypes._float4) if any(x.dtype == dtypes._float4 and x.offset is None for x in val) else ssa('alu'), list(val), x.op)) for idx, val in get_grouped_maybe_float4(*values, grouping_allowed=self.supports_float4_alu and x.op!=BinaryOps.CMPEQ)]
       ordered_ret: List[Optional[Token]] = [None]*len(values[0])
       # scatter
       for i,j in ret:
         for o,k in enumerate(i):
-          ordered_ret[k] = Token(j.name, j.ltype, o) if j.ltype == LocalTypes.float4 else j
+          ordered_ret[k] = Token(j.name, j.dtype, o) if j.dtype == dtypes._float4 else j
       assert all(isinstance(x, Token) for x in ordered_ret), "some tokens didn't get scattered?"
       return cast(List[Token], ordered_ret)
@@ -537,7 +535,7 @@ class Linearizer:
 
     # if nothing at all is upcasted and it's easy to, do an upcast
     # TODO: this is breaking the tests
-    #for splits in [4]:
-    #  if self.upcasted == 0 and len(self.full_unupcasted_shape) > 0 and self.full_unupcasted_shape[-1] % splits == 0:
-    #    self.shift_to(len(self.full_unupcasted_shape)-1, splits, insert_before=len(self.full_unupcasted_shape))
-    #    self.upcast()
+    for splits in [4]:
+      if self.upcasted == 0 and len(self.full_unupcasted_shape) > 0 and self.full_unupcasted_shape[-1] % splits == 0:
+        self.shift_to(len(self.full_unupcasted_shape)-1, splits, insert_before=len(self.full_unupcasted_shape))
+        self.upcast()
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 82521ca2b7..4505ee0a16 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -56,7 +56,8 @@ class DType(NamedTuple):
   priority: int  # this determines when things get upcasted
   itemsize: int
   name: str
-  np: type  # TODO: someday this will be removed with the "remove numpy" project
+  np: Optional[type]  # TODO: someday this will be removed with the "remove numpy" project
+  sz: int = 1
   def __repr__(self): return f"dtypes.{self.name}"
 
 # dependent typing?
@@ -80,7 +81,9 @@ class dtypes:
   def from_np(x) -> DType: return asdict(dtypes())[np.dtype(x).name]
   bool: Final[DType] = DType(0, 1, "bool", bool)
   float16: Final[DType] = DType(0, 2, "half", np.float16)
+  half = float16
   float32: Final[DType] = DType(4, 4, "float", np.float32)
+  float = float32
   float64: Final[DType] = DType(5, 8, "double", np.float64)
   int8: Final[DType] = DType(0, 1, "char", np.int8)
   int32: Final[DType] = DType(1, 4, "int", np.int32)
@@ -89,6 +92,9 @@ class dtypes:
   uint32: Final[DType] = DType(1, 4, "uint", np.uint32)
   uint64: Final[DType] = DType(2, 8, "uint64", np.uint64)
 
+  # NOTE: these are internal dtypes, should probably check for that
+  _half4: Final[DType] = DType(0, 2*4, "half4", None, 4)
+  _float4: Final[DType] = DType(4, 4*4, "float4", None, 4)
 
 class GlobalCounters:
   global_ops: ClassVar[int] = 0
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index ef53218fb6..6ba2b976bb 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -136,6 +136,7 @@ class LazyBuffer:
       self.realized = Device[self.device].buffer(prod(self.shape), self.dtype, **self._device_extra_args())
     elif self.op.op == LoadOps.RAND:
       rng = np.random.default_rng(self.op.arg)
+      assert self.dtype.np is not None, "internal dtypes don't work with LoadOps.RAND"
      self.realized = Device[self.device].buffer.fromCPU(rng.random(size=self.shape, dtype=self.dtype.np), **self._device_extra_args())
     elif self.op.op == LoadOps.CONST:
       if hasattr(Device[self.device].codegen, 'supports_constant_folding'):
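
For context, here is a minimal, self-contained sketch of the shape this patch moves to: Token now carries a DType instead of the deleted LocalTypes enum. The DType fields (including the new sz vector width and np becoming Optional[type]), the internal float4 dtype, and Token.render follow the diff above; the stand-in np value (plain Python float instead of np.float32) and the example prints are illustrative assumptions, not part of tinygrad or of this patch.

# sketch: illustrates the post-patch Token/DType relationship (assumptions noted above)
from typing import NamedTuple, Optional

class DType(NamedTuple):
  priority: int       # determines when things get upcasted
  itemsize: int
  name: str
  np: Optional[type]  # None for internal dtypes such as float4
  sz: int = 1         # vector width introduced by this patch; 1 for scalars

float32 = DType(4, 4, "float", float)       # stand-in for dtypes.float32 (np.float32 in tinygrad)
_float4 = DType(4, 4*4, "float4", None, 4)  # internal vector dtype, np is None

class Token(NamedTuple):
  name: str
  dtype: DType                  # was `ltype: LocalTypes` before this patch
  offset: Optional[int] = None
  def render(self, with_type=False):
    if with_type:
      assert self.offset is None
      return f"{self.dtype.name} {self.name}"
    if self.offset is None: return self.name
    assert self.dtype == _float4
    return self.name + "." + "xyzw"[int(self.offset)]

print(Token("acc0", float32).render(True))  # -> float acc0
print(Token("val0", _float4, 2).render())   # -> val0.z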