From 039f0d372f5ecf7beaf4c235819399d4b6109f3d Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Thu, 15 Jun 2023 16:24:45 -0700
Subject: [PATCH] delete ltypes (#984)

* delete ltypes

* only upcast float types

* test dtype on mac passes

* ugh, these upcasts
---
 test/test_dtype.py             | 11 ++++----
 tinygrad/codegen/cstyle.py     | 23 ++++++++--------
 tinygrad/codegen/linearizer.py | 50 ++++++++++++++++------------------
 tinygrad/helpers.py            |  8 +++++-
 tinygrad/lazy.py               |  1 +
 5 files changed, 49 insertions(+), 44 deletions(-)

diff --git a/test/test_dtype.py b/test/test_dtype.py
index 8fb1202c52..b0fc00ff26 100644
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@@ -3,7 +3,7 @@ import numpy as np
 from tinygrad.helpers import getenv, DType, DEBUG
 from tinygrad.lazy import Device
 from tinygrad.tensor import Tensor, dtypes
-
+from extra.utils import OSX
 
 def _test_to_np(a:Tensor, np_dtype, target):
   print(a)
@@ -31,7 +31,7 @@ def _test_matmul_upcast(a:Tensor, b:Tensor, target_dtype:DType, target): _test_o
 @unittest.skipIf(getenv("CI", "") != "" and Device.DEFAULT in ["LLVM"], "float16 broken in some CI backends")
 class TestHalfDtype(unittest.TestCase):
   def test_half_to_np(self): _test_to_np(Tensor([1,2,3,4], dtype=dtypes.float16), np.float16, [1,2,3,4])
-  
+
   def test_half_to_float(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.float32, [1,2,3,4])
   def test_half_to_int8(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.int8, [1,2,3,4])
   def test_half_to_uint8(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.uint8, [1,2,3,4])
@@ -40,11 +40,11 @@ class TestHalfDtype(unittest.TestCase):
   def test_float_to_half(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.float16, [1,2,3,4])
   def test_int8_to_half(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.int8), dtypes.float16, [1,2,3,4])
   def test_uint8_to_half(self): _test_cast(Tensor([1,2,3,4], dtype=dtypes.uint8), dtypes.float16, [1,2,3,4])
-  
+
   def test_half_add(self): _test_add(Tensor([1,2,3,4], dtype=dtypes.float16), Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.float16, [2,4,6,8])
   def test_half_mul(self): _test_mul(Tensor([1,2,3,4], dtype=dtypes.float16), Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.float16, [1,4,9,16])
   def test_half_matmul(self): _test_matmul(Tensor([[1,2],[3,4]], dtype=dtypes.float16), Tensor.eye(2, dtype=dtypes.float16), dtypes.float16, [[1,2],[3,4]])
-  
+
   def test_half_add_upcast_float(self): _test_add_upcast(Tensor([1,2,3,4], dtype=dtypes.float16), Tensor([1,2,3,4], dtype=dtypes.float32), dtypes.float32, [2,4,6,8])
   def test_int8_add_upcast_half(self): _test_add_upcast(Tensor([1,2,3,4], dtype=dtypes.int8), Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.float16, [2,4,6,8])
   def test_int8_mul_upcast_half(self): _test_mul_upcast(Tensor([1,2,3,4], dtype=dtypes.int8), Tensor([1,2,3,4], dtype=dtypes.float16), dtypes.float16, [1,4,9,16])
@@ -52,7 +52,8 @@ class TestHalfDtype(unittest.TestCase):
   def test_half_matmul_upcast_float(self): _test_matmul_upcast(Tensor([[1,2],[3,4]], dtype=dtypes.float16), Tensor.eye(2, dtype=dtypes.float32), dtypes.float32, [[1,2],[3,4]])
   def test_int8_matmul_upcast_half(self): _test_matmul_upcast(Tensor([[1,2],[3,4]], dtype=dtypes.int8), Tensor.eye(2, dtype=dtypes.float16), dtypes.float16, [[1,2],[3,4]])
 
-class TestFloatDtype(unittest.TestCase):
+@unittest.skipIf(OSX and Device.DEFAULT in ["GPU", "METAL"], "GPU on Mac doesn't support float64")
+class TestFloat64Dtype(unittest.TestCase):
   def test_float64_to_np(self): _test_to_np(Tensor([1,2,3,4], dtype=dtypes.float64), np.float64, [1,2,3,4])
   def test_float64_add(self): _test_add(Tensor([1,2,3,4], dtype=dtypes.float64), Tensor([1,2,3,4], dtype=dtypes.float64), dtypes.float64, [2,4,6,8])
   def test_float64_mul(self): _test_mul(Tensor([1,2,3,4], dtype=dtypes.float64), Tensor([1,2,3,4], dtype=dtypes.float64), dtypes.float64, [1,4,9,16])
diff --git a/tinygrad/codegen/cstyle.py b/tinygrad/codegen/cstyle.py
index d0d3a3f141..f16802f0f8 100644
--- a/tinygrad/codegen/cstyle.py
+++ b/tinygrad/codegen/cstyle.py
@@ -1,6 +1,6 @@
 from typing import Final, Dict, Callable, ClassVar, List, Optional, NamedTuple, DefaultDict, Tuple, Set, Union
 import math, collections
-from tinygrad.codegen.linearizer import Linearizer, UOps, UOp, LocalBuffer, LocalTypes
+from tinygrad.codegen.linearizer import Linearizer, UOps, UOp, LocalBuffer
 from tinygrad.ops import ASTRunner, Op, UnaryOps, BinaryOps, FusedOps
 from tinygrad.helpers import partition, ImageDType, DEBUG, dtypes, colored
 from tinygrad.runtime.lib import RawConst
@@ -105,7 +105,7 @@ def uops_to_cstyle(uops:List[UOp], bufs:List[Union[LocalBuffer,LazyBuffer]], lan
       assert newvar is not None
       if args == -math.inf:
         kk(f"{newvar.render(True)} = -INFINITY;")
-      elif newvar.ltype == LocalTypes.float4:
+      elif newvar.dtype == dtypes._float4:
         kk(f"{newvar.render(True)} = {{ {args}f, {args}f, {args}f, {args}f }};")
       else:
         kk(f"{newvar.render(True)} = {args}f;")
@@ -118,42 +118,42 @@ def uops_to_cstyle(uops:List[UOp], bufs:List[Union[LocalBuffer,LazyBuffer]], lan
     elif uop == UOps.LOAD and newvar is not None:
       # TODO: merge with CONST?
       if bufs[args.i] is not None and isinstance(bufs[args.i].realized, RawConst):
-        assert newvar.ltype == LocalTypes.float, "const can't be float4"
+        assert newvar.dtype == dtypes.float, "const can't be float4"
         x = bufs[args.i].realized._buf
         if math.isnan(x): val = "NAN"
         elif math.isinf(x): val = ("-" if x < 0 else "") + "INFINITY"
         else: val = f"{x}" + ("f" if not dtypes.is_int(bufs[args.i].dtype) else "")
       elif isinstance(bufs[args.i].dtype, ImageDType):
-        assert newvar.ltype == LocalTypes.float4, "image must be float4"
+        assert newvar.dtype == dtypes._float4, "image must be float4"
         prekernel.add("const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n")
         idx, idy = to_image_idx(bufs[args.i].dtype.shape, args.idx, args.valid)
         val = f"read_imagef({bufnames[args.i]}, smp, (int2)({idx.render(render_cl)}, {idy.render(render_cl)}))"
       else:
         if lang.uses_vload and bufs[args.i].dtype == dtypes.float16:
-          if newvar.ltype == LocalTypes.float4:
+          if newvar.dtype == dtypes._float4:
             val = f"vload_half4({(args.idx//4).render(render_cl)}, {bufnames[args.i]})"
           else:
             val = f"vload_half({args.idx.render(render_cl)}, {bufnames[args.i]})"
         else:
-          if newvar.ltype == LocalTypes.float4:
-            val = f"({newvar.ltype.name})((({lang.smem_prefix if isinstance(bufs[args.i], LocalBuffer) else lang.buffer_prefix}{bufs[args.i].dtype.name}4*){bufnames[args.i]})[{(args.idx//4).render(render_cl)}])"
+          if newvar.dtype == dtypes._float4:
+            val = f"({newvar.dtype.name})((({lang.smem_prefix if isinstance(bufs[args.i], LocalBuffer) else lang.buffer_prefix}{bufs[args.i].dtype.name}4*){bufnames[args.i]})[{(args.idx//4).render(render_cl)}])"
          else:
            val = f"{bufnames[args.i]}[{args.idx.render(render_cl)}]"
       # NOTE: if min and max are both 0, it should be a CONST in the Linearizer
       if args.valid.min == 1: kk(f"{newvar.render(True)} = {val};")
       else:
-        casts = {LocalTypes.float4: ("", f"{lang.float4}(0.0f, 0.0f, 0.0f, 0.0f)"), LocalTypes.half: ("(half)", "(half)(0.0f)"), LocalTypes.float: ("(float)", "0.0f")}[newvar.ltype]
+        casts = {dtypes._float4: ("", f"{lang.float4}(0.0f, 0.0f, 0.0f, 0.0f)"), dtypes.half: ("(half)", "(half)(0.0f)"), dtypes.float: ("(float)", "0.0f")}[newvar.dtype]
         kk(f"{newvar.render(True)} = ({args.valid.render(render_cl)}) ? {casts[0]}({val}) : {casts[1]};")
-    elif uop == UOps.STORE and (vin[0].ltype == LocalTypes.float or (vin[0].ltype == LocalTypes.float4 and vin[0].offset is not None)):
+    elif uop == UOps.STORE and (vin[0].dtype == dtypes.float or (vin[0].dtype == dtypes._float4 and vin[0].offset is not None)):
       assert not isinstance(bufs[args.i].dtype, ImageDType), "image store must be float4"
       assert args.valid.min == 1, "store must be valid"
       if lang.uses_vload and bufs[args.i].dtype == dtypes.float16:
         kk(f"vstore_half({vin[0].render()}, {args.idx.render(render_cl)}, {bufnames[args.i]});")
       else:
         kk(f"{bufnames[args.i]}[{args.idx.render(render_cl)}] = {vin[0].render()};")
-    elif uop == UOps.CAST and newvar is not None and newvar.ltype == LocalTypes.float4:
+    elif uop == UOps.CAST and newvar is not None and newvar.dtype == dtypes._float4:
       kk(f"{newvar.render(True)} = {lang.float4}({','.join([x.render() for x in vin])});")
-    elif uop == UOps.STORE and len(vin) != 0 and vin[0].ltype == LocalTypes.float4 and vin[0].offset is None:
+    elif uop == UOps.STORE and len(vin) != 0 and vin[0].dtype == dtypes._float4 and vin[0].offset is None:
       assert args.valid.min == 1, "store must be valid"
       if isinstance(bufs[args[0]].dtype, ImageDType):
         idx, idy = to_image_idx(bufs[args.i].dtype.shape, args[1], args[2])
@@ -172,7 +172,6 @@ def uops_to_cstyle(uops:List[UOp], bufs:List[Union[LocalBuffer,LazyBuffer]], lan
     [', '.join([f'{t} {bufnames[i]}' for i,t in buftypes] + lang.extra_args)] +
     [") {\n"] + list(prekernel) + ['\n'.join(kernel), "\n}"])
-  if lang.half_prekernel: prg =''.join([f"{lang.half_prekernel}", "\n", prg])
   if lang.double_prekernel: prg = ''.join([f"{lang.double_prekernel}", "\n", prg])
   return prg, global_size, local_size
diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py
index 7c47b39b3b..ed52fa9f24 100644
--- a/tinygrad/codegen/linearizer.py
+++ b/tinygrad/codegen/linearizer.py
@@ -18,21 +18,18 @@ class LocalBuffer(NamedTuple):
   dtype: DType = dtypes.float32
   realized: None = None
 
-# NOTE: half and half4 are not actually used yet
-class LocalTypes(Enum): float = auto(); float4 = auto(); half = auto(); half4 = auto(); simdgroup_float8x8 = auto() # noqa: E702
-
 class Token(NamedTuple):
   name: str
-  ltype: LocalTypes
+  dtype: DType
   offset: Optional[int] = None
   def render(self, with_type=False):
     if with_type:
       assert self.offset is None
-      return f"{self.ltype.name} {self.name}"
+      return f"{self.dtype.name} {self.name}"
     if self.offset is None: return self.name
-    assert self.ltype == LocalTypes.float4
+    assert self.dtype == dtypes._float4
     return self.name+"."+"xyzw"[int(self.offset)]
-  def __repr__(self): return f"<{self.name}>" if self.offset is None and self.ltype == LocalTypes.float else f"<{self.name}:{self.ltype.name}:{self.offset}>"
+  def __repr__(self): return f"<{self.name}>" if self.offset is None and self.dtype == dtypes.float32 else f"<{self.name}:{self.dtype.name}:{self.offset}>"
 
 # TODO: the next three functions are poorly written
 def get_grouped_float4_idxs(acc:List[Token]) -> Optional[List[int]]:
@@ -40,13 +37,13 @@ def get_grouped_float4_idxs(acc:List[Token]) -> Optional[List[int]]:
   for i,a in enumerate(acc):
     if idxs is None: break
     if i in idxs: continue
-    if a.ltype == LocalTypes.float4 and a.offset == 0:
+    if a.dtype.sz > 1 and a.offset == 0:
       idxs.append(i)
       friends: List[int] = []
       for j,b in enumerate(acc):
         if len(friends) == 3: break
         if j in idxs: continue
-        if a.name == b.name and b.ltype == LocalTypes.float4 and b.offset == len(friends)+1:
+        if a.name == b.name and b.dtype.sz > 1 and b.offset == len(friends)+1:
           friends.append(j)
       if len(friends) == 3: idxs += friends
       else: idxs = None
@@ -56,8 +53,8 @@ def get_grouped_float4_idxs(acc:List[Token]) -> Optional[List[int]]:
 
 def to_float4(x:List[Token]) -> Optional[Token]:
   if all_same(x): return x[0]
-  if all_same([y.name for y in x]) and all([y.ltype == LocalTypes.float4 and y.offset == i for i,y in enumerate(x)]):
-    return Token(x[0].name, LocalTypes.float4)
+  if all_same([y.name for y in x]) and all([y.dtype == dtypes._float4 and y.offset == i for i,y in enumerate(x)]):
+    return Token(x[0].name, dtypes._float4)
   return None
 
 def get_grouped_maybe_float4(*values:List[Token], grouping_allowed=True):
@@ -170,10 +167,10 @@ class Linearizer:
     return store_offset_float4
 
   def global_load(self, i, idxs:List[Variable], const=None) -> List[Token]:
-    load_offset: Dict[Tuple[int, ...], Any] = {uidxs:(LocalTypes.float,uidxs)+self.sts[i].expr_idxs(idxs+[Variable.num(x) for x in uidxs[::-1]]) for uidxs in self.shape_offsets(i)}
+    load_offset: Dict[Tuple[int, ...], Any] = {uidxs:(dtypes.float,uidxs)+self.sts[i].expr_idxs(idxs+[Variable.num(x) for x in uidxs[::-1]]) for uidxs in self.shape_offsets(i)}
 
     # float4 grouping (optional)
-    should_upcast = self.supports_float4 and len(self.float4_axis(i)) == 1
+    should_upcast = self.supports_float4 and (self.bufs[i].dtype in [dtypes.float32, dtypes.float16] or isinstance(self.bufs[i].dtype, ImageDType)) and len(self.float4_axis(i)) == 1
     if should_upcast:
       load_offset_new = {}
       for k,out_tokens in self._group_float4(i, load_offset).items():
@@ -183,7 +180,7 @@ class Linearizer:
           # idxs not in order, valids don't match, or idx doesn't evenly divide 4. use normal float
           for x in out_tokens: load_offset_new[x[1]] = x
         else:
-          load_offset_new[k] = (LocalTypes.float4, [x[1] for x in out_tokens], out_tokens[0][2], out_tokens[0][3])
+          load_offset_new[k] = (dtypes._float4, [x[1] for x in out_tokens], out_tokens[0][2], out_tokens[0][3])
       load_offset = load_offset_new
 
     # do loads
@@ -193,9 +190,9 @@ class Linearizer:
      key = f"{localtype}{idx.render()}{valid.render()}"
      if key not in cache:
        cache[key] = self.uop(UOps.LOAD, Token(f"val{mnum(i)}_{len(cache)}", localtype), [], MemOp(i, idx, valid)) if const is None else self.uop(UOps.CONST, Token(f"acc{mnum(i)}_{len(cache)}", localtype), [], const)
-     if localtype == LocalTypes.float4:
+     if localtype == dtypes._float4:
        for j,uidx in enumerate(uidx_list):
-         loaded[uidx] = Token(cache[key].name, LocalTypes.float4, j)
+         loaded[uidx] = Token(cache[key].name, dtypes._float4, j)
      else:
        loaded[uidxs] = cache[key]
    return [loaded[uidxs] for uidxs in self.shape_offsets(i)]
@@ -204,14 +201,15 @@ class Linearizer:
     store_offset: Dict[Tuple[int, ...], Token] = dict(zip(self.shape_offsets(i), store))
 
     # float4 grouping (optional)
-    should_upcast = self.supports_float4 and (self.bufs[i].dtype not in (dtypes.float16, dtypes.int8, dtypes.uint8)) and len(self.float4_axis(i)) == 1
+    # TODO: why does this not work for float16?
+    should_upcast = self.supports_float4 and (self.bufs[i].dtype == dtypes.float32 or isinstance(self.bufs[i].dtype, ImageDType)) and len(self.float4_axis(i)) == 1
     if should_upcast:
       store_offset_new = {}
       for k,out_tokens in self._group_float4(i, store_offset).items():
         if all_same([x.name for x in out_tokens]) and tuple(range(4)) == tuple(x.offset for x in out_tokens):
-          store_offset_new[k] = Token(out_tokens[0].name, LocalTypes.float4)
+          store_offset_new[k] = Token(out_tokens[0].name, dtypes._float4)
         else:
-          store_offset_new[k] = self.uop(UOps.CAST, ssa("alu", LocalTypes.float4), out_tokens)
+          store_offset_new[k] = self.uop(UOps.CAST, ssa("alu", dtypes._float4), out_tokens)
       store_offset = store_offset_new
 
     # do stores
@@ -242,7 +240,7 @@ class Linearizer:
 
     # ssa
     _ssa:DefaultDict[str,int] = defaultdict(int)
-    def ssa(name, ltype=LocalTypes.float) -> Token:
+    def ssa(name, ltype=dtypes.float) -> Token:
       _ssa[name] += 1
       return Token(f"{name}{_ssa[name]-1}", ltype)
@@ -345,12 +343,12 @@ class Linearizer:
       if isinstance(x.op, (ReduceOps, FusedOps)):
         ret = [(idx, self.uop(UOps.ALU, val[0], list(val), {ReduceOps.SUM:BinaryOps.ADD, ReduceOps.MAX:BinaryOps.MAX, FusedOps.MULACC:FusedOps.MULACC}[x.op])) for idx, val in get_grouped_maybe_float4(acc, *values, grouping_allowed=self.supports_float4_alu)]
       else:
-        ret = [(idx, self.uop(UOps.ALU, ssa('alu', LocalTypes.float4) if any(x.ltype == LocalTypes.float4 and x.offset is None for x in val) else ssa('alu'), list(val), x.op)) for idx, val in get_grouped_maybe_float4(*values, grouping_allowed=self.supports_float4_alu and x.op!=BinaryOps.CMPEQ)]
+        ret = [(idx, self.uop(UOps.ALU, ssa('alu', dtypes._float4) if any(x.dtype == dtypes._float4 and x.offset is None for x in val) else ssa('alu'), list(val), x.op)) for idx, val in get_grouped_maybe_float4(*values, grouping_allowed=self.supports_float4_alu and x.op!=BinaryOps.CMPEQ)]
       ordered_ret: List[Optional[Token]] = [None]*len(values[0])
       # scatter
       for i,j in ret:
         for o,k in enumerate(i):
-          ordered_ret[k] = Token(j.name, j.ltype, o) if j.ltype == LocalTypes.float4 else j
+          ordered_ret[k] = Token(j.name, j.dtype, o) if j.dtype == dtypes._float4 else j
       assert all(isinstance(x, Token) for x in ordered_ret), "some tokens didn't get scattered?"
       return cast(List[Token], ordered_ret)
@@ -537,7 +535,7 @@ class Linearizer:
 
     # if nothing at all is upcasted and it's easy to, do an upcast
     # TODO: this is breaking the tests
-    #for splits in [4]:
-    #  if self.upcasted == 0 and len(self.full_unupcasted_shape) > 0 and self.full_unupcasted_shape[-1] % splits == 0:
-    #    self.shift_to(len(self.full_unupcasted_shape)-1, splits, insert_before=len(self.full_unupcasted_shape))
-    #    self.upcast()
+    for splits in [4]:
+      if self.upcasted == 0 and len(self.full_unupcasted_shape) > 0 and self.full_unupcasted_shape[-1] % splits == 0:
+        self.shift_to(len(self.full_unupcasted_shape)-1, splits, insert_before=len(self.full_unupcasted_shape))
+        self.upcast()
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 82521ca2b7..4505ee0a16 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -56,7 +56,8 @@ class DType(NamedTuple):
   priority: int  # this determines when things get upcasted
   itemsize: int
   name: str
-  np: type  # TODO: someday this will be removed with the "remove numpy" project
+  np: Optional[type]  # TODO: someday this will be removed with the "remove numpy" project
+  sz: int = 1
   def __repr__(self): return f"dtypes.{self.name}"
 
 # dependent typing?
@@ -80,7 +81,9 @@ class dtypes:
   def from_np(x) -> DType: return asdict(dtypes())[np.dtype(x).name]
   bool: Final[DType] = DType(0, 1, "bool", bool)
   float16: Final[DType] = DType(0, 2, "half", np.float16)
+  half = float16
   float32: Final[DType] = DType(4, 4, "float", np.float32)
+  float = float32
   float64: Final[DType] = DType(5, 8, "double", np.float64)
   int8: Final[DType] = DType(0, 1, "char", np.int8)
   int32: Final[DType] = DType(1, 4, "int", np.int32)
@@ -89,6 +92,9 @@ class dtypes:
   uint32: Final[DType] = DType(1, 4, "uint", np.uint32)
   uint64: Final[DType] = DType(2, 8, "uint64", np.uint64)
 
+  # NOTE: these are internal dtypes, should probably check for that
+  _half4: Final[DType] = DType(0, 2*4, "half4", None, 4)
+  _float4: Final[DType] = DType(4, 4*4, "float4", None, 4)
 
 class GlobalCounters:
   global_ops: ClassVar[int] = 0
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index ef53218fb6..6ba2b976bb 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -136,6 +136,7 @@ class LazyBuffer:
       self.realized = Device[self.device].buffer(prod(self.shape), self.dtype, **self._device_extra_args())
     elif self.op.op == LoadOps.RAND:
       rng = np.random.default_rng(self.op.arg)
+      assert self.dtype.np is not None, "internal dtypes don't work with LoadOps.RAND"
      self.realized = Device[self.device].buffer.fromCPU(rng.random(size=self.shape, dtype=self.dtype.np), **self._device_extra_args())
     elif self.op.op == LoadOps.CONST:
       if hasattr(Device[self.device].codegen, 'supports_constant_folding'):
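
For context, here is a minimal, self-contained sketch of the shape this patch moves to: Token now carries a DType instead of the deleted LocalTypes enum. The DType fields (including the new sz vector width and np becoming Optional[type]), the internal float4 dtype, and Token.render follow the diff above; the stand-in np value (plain Python float instead of np.float32) and the example prints are illustrative assumptions, not part of tinygrad or of this patch.

# sketch: illustrates the post-patch Token/DType relationship (assumptions noted above)
from typing import NamedTuple, Optional

class DType(NamedTuple):
  priority: int       # determines when things get upcasted
  itemsize: int
  name: str
  np: Optional[type]  # None for internal dtypes such as float4
  sz: int = 1         # vector width introduced by this patch; 1 for scalars

float32 = DType(4, 4, "float", float)       # stand-in for dtypes.float32 (np.float32 in tinygrad)
_float4 = DType(4, 4*4, "float4", None, 4)  # internal vector dtype, np is None

class Token(NamedTuple):
  name: str
  dtype: DType                  # was `ltype: LocalTypes` before this patch
  offset: Optional[int] = None
  def render(self, with_type=False):
    if with_type:
      assert self.offset is None
      return f"{self.dtype.name} {self.name}"
    if self.offset is None: return self.name
    assert self.dtype == _float4
    return self.name + "." + "xyzw"[int(self.offset)]

print(Token("acc0", float32).render(True))  # -> float acc0
print(Token("val0", _float4, 2).render())   # -> val0.z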