diff --git a/test/external/external_test_speed_llama.py b/test/external/external_test_speed_llama.py
index 31647b3332..15c896206e 100644
--- a/test/external/external_test_speed_llama.py
+++ b/test/external/external_test_speed_llama.py
@@ -46,7 +46,7 @@ class TestLLaMASpeed(unittest.TestCase):
     # test no compiler use for this
     Device[Device.DEFAULT].compiler = None
     run_llama("methodcache", False)
-    with Profiling(sort='time', frac=0.1, fn="/tmp/llama.prof"):
+    with Profiling(sort='time', frac=0.1, fn="/tmp/llama.prof", ts=5):
       run_llama("profile", False)
 
     Device[Device.DEFAULT].runtime = backup_program
diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index 2ec42dfd7e..c020159474 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -14,7 +14,7 @@ N = 128
 
 # shard_x is "data parallel"
 # shard_w is "model parallel"
-@unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA"}, "no GPU CI")
+@unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL"}, "no GPU CI")
 class TestMultiTensor(unittest.TestCase):
   def test_shard(self):
     X = Tensor.ones(256).contiguous().realize()
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 5cfd82ffcc..67b0372b74 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -90,16 +90,24 @@ class Timing(contextlib.ContextDecorator):
     self.et = time.perf_counter_ns() - self.st
     if self.enabled: print(f"{self.prefix}{self.et*1e-6:6.2f} ms"+(self.on_exit(self.et) if self.on_exit else ""))
 
+def _format_fcn(fcn): return f"{fcn[0]}:{fcn[2]}" if fcn[2] != "" else f"{fcn[0]}:{fcn[1]}"
 class Profiling(contextlib.ContextDecorator):
-  def __init__(self, enabled=True, sort='cumtime', frac=0.2, fn=None): self.enabled, self.sort, self.frac, self.fn = enabled, sort, frac, fn
+  def __init__(self, enabled=True, sort='cumtime', frac=0.2, fn=None, ts=1):
+    self.enabled, self.sort, self.frac, self.fn, self.time_scale = enabled, sort, frac, fn, 1e3/ts
   def __enter__(self):
-    self.pr = cProfile.Profile(timer=lambda: int(time.time()*1e9), timeunit=1e-6)
+    self.pr = cProfile.Profile()
     if self.enabled: self.pr.enable()
   def __exit__(self, *exc):
     if self.enabled:
       self.pr.disable()
       if self.fn: self.pr.dump_stats(self.fn)
-      pstats.Stats(self.pr).strip_dirs().sort_stats(self.sort).print_stats(self.frac)
+      stats = pstats.Stats(self.pr).strip_dirs().sort_stats(self.sort)
+      for fcn in stats.fcn_list[0:int(len(stats.fcn_list)*self.frac)]: # type: ignore[attr-defined]
+        (_primitive_calls, num_calls, tottime, cumtime, callers) = stats.stats[fcn] # type: ignore[attr-defined]
+        scallers = sorted(callers.items(), key=lambda x: -x[1][2])
+        print(f"n:{num_calls:8d} tm:{tottime*self.time_scale:7.2f}ms tot:{cumtime*self.time_scale:7.2f}ms",
+              colored(_format_fcn(fcn), "yellow") + " "*(50-len(_format_fcn(fcn))),
+              colored(f"<- {(scallers[0][1][2]/tottime)*100:3.0f}% {_format_fcn(scallers[0][0])}", "BLACK") if len(scallers) else '')
 
 # *** universal database cache ***
 
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index 0d07bc539a..bbf7c93560 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -18,17 +18,13 @@ sys.setrecursionlimit(10000)
 
 lazycache: Dict[Any, ReferenceType[LazyBuffer]] = {}
 def create_lazybuffer(device:str, st:ShapeTracker, dtype:DType, op:Optional[Op]=None, arg:Any=None, srcs:Tuple[LazyBuffer, ...]=(),
-                      base:Optional[LazyBuffer]=None):
-  if 0 in st.shape: st, op, arg, srcs = ShapeTracker.from_shape(st.shape), LoadOps.CONST, 0, ()
+                      base:Optional[LazyBuffer]=None, enable_cache=bool(getenv("LAZYCACHE", 1))):
+  if 0 in st.shape: st, op, arg, srcs, base = ShapeTracker.from_shape(st.shape), LoadOps.CONST, 0, (), None
 
-  cache_key = (device, st, dtype, op, arg, tuple(ref(x) for x in srcs), ref(base) if base else None)
+  cache_key = (device, st, dtype, op, arg, tuple(ref(x) for x in srcs)) if base is None else (st, ref(base))
   if (rret := lazycache.get(cache_key, None)): return cast(LazyBuffer, rret())  # NOTE: this should always be a live reference
 
-  ret = LazyBuffer(device, st, dtype, op, arg, srcs, base=base, cache_key=cache_key)
-  # TODO: remove LoadOps.CONST here while keeping a pretty graph and working fusions
-  # TODO: might be possible to remove LoadOps.COPY
-  if op not in {LoadOps.EMPTY, LoadOps.CUSTOM, LoadOps.CONST, LoadOps.COPY} and getenv("LAZYCACHE", 1): lazycache[cache_key] = ref(ret)
-  return ret
+  return LazyBuffer(device, st, dtype, op, arg, srcs, base=base, cache_key=cache_key if enable_cache else None)
 
 class LazyBuffer:
   def __init__(self, device:str, st:ShapeTracker, dtype:DType,
@@ -47,6 +43,7 @@ class LazyBuffer:
       # properties on view
       assert base.base == base, "base must be a base itself"
       self._base = base
+    if cache_key is not None: lazycache[cache_key] = ref(self)
 
   def __del__(self): lazycache.pop(self.cache_key, None)
 
@@ -59,7 +56,7 @@ class LazyBuffer:
 
   @staticmethod
   def loadop(op, shape:Tuple[sint,...], dtype:DType, device:str, arg=None, src:Optional[LazyBuffer]=None) -> LazyBuffer:
-    return create_lazybuffer(device, ShapeTracker.from_shape(shape), dtype, op, arg, (src,) if src is not None else ())
+    return create_lazybuffer(device, ShapeTracker.from_shape(shape), dtype, op, arg, (src,) if src is not None else (), enable_cache=False)
 
   def const(self, val:Union[float, int]) -> LazyBuffer:
     return LazyBuffer.loadop(LoadOps.CONST, tuple(), self.dtype, self.device, arg=val).reshape((1,)*len(self.shape)).expand(self.shape)
@@ -102,10 +99,10 @@ class LazyBuffer:
 
     # if it's a shrink, do the shrink before the copy with CONTIGUOUS
    if prod(self.st.shape) < prod(self.base.st.shape):
-      return create_lazybuffer(device, ShapeTracker.from_shape(self.shape), self.dtype, LoadOps.COPY, srcs=(self.contiguous(),))
+      return LazyBuffer.loadop(LoadOps.COPY, self.shape, self.dtype, device, src=self.contiguous())
 
     # copy the base and apply the shapetracker on the new device
-    return create_lazybuffer(device, self.base.st, self.dtype, LoadOps.COPY, srcs=(self.base,))._view(self.st)
+    return LazyBuffer.loadop(LoadOps.COPY, self.base.shape, self.dtype, device, src=self.base)._view(self.st)
 
   def e(self, op:Union[LoadOps, UnaryOps, BinaryOps, TernaryOps], *in_srcs:LazyBuffer, arg:Optional[Any]=None) -> LazyBuffer:
     srcs: List[LazyBuffer] = []
diff --git a/tinygrad/shape/shapetracker.py b/tinygrad/shape/shapetracker.py
index bef67b619c..5a57e8092c 100644
--- a/tinygrad/shape/shapetracker.py
+++ b/tinygrad/shape/shapetracker.py
@@ -4,7 +4,7 @@ import functools, itertools, operator
 from dataclasses import dataclass
 from typing import Tuple, List, Optional, Dict, Set, cast, Union, Iterable
 from tinygrad.ops import MovementOps
-from tinygrad.helpers import prod, DEBUG, merge_dicts, getenv
+from tinygrad.helpers import prod, merge_dicts, getenv
 from tinygrad.shape.symbolic import Variable, MulNode, Node, SumNode, NumNode, sint
 from tinygrad.shape.view import View, _merge_dims
 
@@ -42,6 +42,10 @@ def merge_views(vm2:View, vm1:View) -> Optional[View]:
   if None in (strides := ShapeTracker((vm2, vm1)).real_strides()): return None
   return View.create(vm1.shape, cast(Tuple[sint, ...], strides), vm2.offset, vm1.mask)
 
+def simplify(views:Tuple[View, ...]) -> Tuple[View, ...]:
+  if len(views) >= 2 and (new_view := merge_views(views[-2], views[-1])) is not None: return simplify(views[:-2] + (new_view,))
+  return views
+
 @functools.lru_cache(maxsize=None)
 def idxs_to_idx(shape:Tuple[int, ...], idxs:Tuple[Node, ...]) -> Node:
   assert len(idxs) == len(shape), "need an idx for all dimensions"
@@ -56,9 +60,9 @@ class ShapeTracker:
   views: Tuple[View, ...]
 
   def __add__(self, st:ShapeTracker) -> ShapeTracker:
-    base = ShapeTracker(self.views)
-    for v in st.views: base = ShapeTracker(base.views + (v,)).simplify() # one view at a time = better simplification
-    return base
+    new_views = self.views
+    for v in st.views: new_views = simplify(new_views + (v,)) # one view at a time = better simplification
+    return ShapeTracker(new_views)
 
   def invert(self, out_shape:Tuple[sint, ...]) -> Optional[ShapeTracker]:
     ret = tuple(v.invert(s) for v,s in zip(self.views[::-1], [x.shape for x in self.views[::-1][1:]]+[out_shape]))
@@ -74,7 +78,7 @@ class ShapeTracker:
   def shape(self) -> Tuple[sint, ...]: return self.views[-1].shape
 
   @property
-  def size(self) -> int: return prod([x.max if isinstance(x, Node) else x for x in self.views[-1].shape])
+  def size(self) -> int: return self.views[-1].size()
 
   def real_size(self) -> int:
     if 0 in self.shape: return 0
@@ -153,11 +157,7 @@ class ShapeTracker:
     _, valid = self.expr_idxs()
     return f'idx{axis}' in [v.expr for v in valid.vars()]
 
-  def simplify(self) -> ShapeTracker:
-    if len(self.views) >= 2 and (new_view := merge_views(self.views[-2], self.views[-1])) is not None:
-      if DEBUG >= 5: print(f"st simplify : {self.views[-2]} + {self.views[-1]} = {new_view}")
-      return ShapeTracker(self.views[:-2] + (new_view,)).simplify()
-    return self
+  def simplify(self) -> ShapeTracker: return ShapeTracker(simplify(self.views))
 
   # *** under this line are the movement ops ***
 
diff --git a/tinygrad/shape/view.py b/tinygrad/shape/view.py
index f78dc209fa..4631923189 100644
--- a/tinygrad/shape/view.py
+++ b/tinygrad/shape/view.py
@@ -75,6 +75,9 @@ class View:
   mask:Optional[Tuple[Tuple[sint, sint], ...]]
   contiguous:bool
 
+  @functools.lru_cache(maxsize=None) # pylint: disable=method-cache-max-size-none
+  def size(self) -> int: return prod([x.max if isinstance(x, Node) else x for x in self.shape])
+
   @staticmethod
   @functools.lru_cache(maxsize=None)
   def create(shape:Tuple[sint, ...], strides:Optional[Tuple[sint, ...]]=None, offset:sint=0, mask:Optional[Tuple[Tuple[sint, sint], ...]]=None):
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 1c2f412722..16f4e273f6 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -293,10 +293,11 @@ class Tensor:
 
   def reshape(self, shape, *args) -> Tensor:
     new_shape = argfix(shape, *args)
-    return mlops.Reshape.apply(self, shape=tuple([-prod(self.shape) // prod(new_shape) if s == -1 else (s if s is not None else self.shape[i]) for i,s in enumerate(new_shape)])) # noqa: E501
+    new_shape = tuple([-prod(self.shape) // prod(new_shape) if s == -1 else (s if s is not None else self.shape[i]) for i,s in enumerate(new_shape)])
+    return mlops.Reshape.apply(self, shape=new_shape) if new_shape != self.shape else self
   def expand(self, shape, *args) -> Tensor:
-    if shape == self.shape: return self
-    return mlops.Expand.apply(self, shape=tuple([x if x != -1 else s for s,x in zip(self.shape, argfix(shape, *args))]))
+    if (new_shape := argfix(shape, *args)) == self.shape: return self
+    return mlops.Expand.apply(self, shape=tuple([x if x != -1 else s for s,x in zip(self.shape, new_shape)]))
   def permute(self, order, *args) -> Tensor: return mlops.Permute.apply(self, order=argfix(order, *args))
   def flip(self, axis, *args) -> Tensor: return mlops.Flip.apply(self, axis=[x if x >= 0 else x+len(self.shape) for x in argfix(axis, *args)])
   def shrink(self, arg:Tuple[Optional[Tuple[sint, sint]], ...]) -> Tensor: