diff --git a/examples/llm.c/export.py b/examples/llm.c/export.py
index 06e82e263b..4312997d57 100755
--- a/examples/llm.c/export.py
+++ b/examples/llm.c/export.py
@@ -6,7 +6,8 @@ Device.DEFAULT = "CLANG"
 from train_gpt2 import GPT, GPTConfig
 from tinygrad.helpers import dedup, to_function_name, flatten, getenv, GRAPH, GlobalCounters, ansilen, to_function_name
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.engine.realize import memory_planner, run_schedule
+from tinygrad.engine.realize import run_schedule
+from tinygrad.engine.memory import memory_planner
 from tinygrad.ops import BufferOps, LoadOps

 TIMING = getenv("TIMING")
diff --git a/openpilot/compile2.py b/openpilot/compile2.py
index c650f293c2..835b6f5e8a 100644
--- a/openpilot/compile2.py
+++ b/openpilot/compile2.py
@@ -18,7 +18,8 @@ from tinygrad.buffer import Buffer
 from tinygrad.dtype import ImageDType
 from tinygrad.device import CompiledRunner
 from tinygrad.helpers import partition, Context, fetch, getenv, DEBUG
-from tinygrad.engine.realize import run_schedule, memory_planner, lower_schedule, ExecItem
+from tinygrad.engine.realize import run_schedule, lower_schedule, ExecItem
+from tinygrad.engine.memory import memory_planner
 from tinygrad.engine.schedule import create_schedule
 from tinygrad.ops import LoadOps, ScheduleItem
 Device.DEFAULT = "GPU"
diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index 317a5f5685..3a60f0f30a 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -1,12 +1,12 @@
 import unittest, functools, random
 from typing import List
 from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes
-from tinygrad.device import BufferCopy, CompiledRunner
+from tinygrad.device import CompiledRunner
 from tinygrad.ops import LoadOps, ReduceOps
 from tinygrad.helpers import CI, prod, Context
 from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.engine.realize import lower_schedule
+from tinygrad.engine.realize import lower_schedule, BufferCopy
 from tinygrad.features.multi import all_reduce, MultiLazyBuffer
 from random import randint
 import numpy as np
diff --git a/tinygrad/buffer.py b/tinygrad/buffer.py
index b8b06c3ab0..802c9b6974 100644
--- a/tinygrad/buffer.py
+++ b/tinygrad/buffer.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
-from typing import Any, Optional
+from typing import Any, Optional, Dict, Tuple
+import ctypes
+from collections import defaultdict
 from dataclasses import dataclass
-from tinygrad.helpers import GlobalCounters, flat_mv
+from tinygrad.helpers import GlobalCounters, flat_mv, from_mv, getenv
 from tinygrad.dtype import DType, ImageDType

 @dataclass(frozen=True, eq=True)
@@ -90,3 +92,40 @@ class Buffer:
     assert offset < self.nbytes, "offset must be less than nbytes"
     if self._base is not None: return Buffer(self.device, size, dtype, base=self._base, offset=self.offset+offset)
     return Buffer(self.device, size, dtype, base=self, offset=offset)
+
+# TODO: size, dest, src are the same type. can we enforce this?
+class Allocator:
+  def alloc(self, size:int, options:Optional[BufferOptions]=None):
+    assert not isinstance(size, int) or size > 0, f"alloc size must be positve, getting {size}"
+    return self._alloc(size, options if options is not None else BufferOptions())
+  def _alloc(self, size:int, options:BufferOptions): raise NotImplementedError("need alloc")
+  def free(self, opaque, size:int, options:Optional[BufferOptions]=None):
+    self._free(opaque, options if options is not None else BufferOptions())
+  def _free(self, opaque, options:BufferOptions): pass # if opaque is a Python object, you don't need a free
+  def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin")
+  def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout")
+
+class LRUAllocator(Allocator): # pylint: disable=abstract-method
+  def __init__(self): self.cache: Dict[Tuple[int, Optional[BufferOptions]], Any] = defaultdict(list)
+  def alloc(self, size:int, options:Optional[BufferOptions]=None):
+    if len(c := self.cache[(size, options)]): return c.pop()
+    try: return super().alloc(size, options)
+    except (RuntimeError, MemoryError):
+      self.free_cache()
+      return super().alloc(size, options)
+  def free_cache(self):
+    for (sz,options),opaques in self.cache.items():
+      for opaque in opaques: super().free(opaque, sz, options)
+      opaques.clear()
+  def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None):
+    if getenv("LRU", 1) and (options is None or not options.nolru): self.cache[(size, options)].append(opaque)
+    else: super().free(opaque, size, options)
+
+class _MallocAllocator(LRUAllocator):
+  def _alloc(self, size:int, options:BufferOptions): return (ctypes.c_uint8 * size)()
+  def as_buffer(self, src) -> memoryview: return flat_mv(memoryview(src))
+  def copyin(self, dest, src:memoryview): ctypes.memmove(dest, from_mv(src), len(src))
+  def copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src, len(dest))
+  def offset(self, buf, size:int, offset:int): return from_mv(self.as_buffer(buf)[offset:offset+size])
+
+MallocAllocator = _MallocAllocator()
diff --git a/tinygrad/device.py b/tinygrad/device.py
index 139ba37bb5..b879017a66 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -1,13 +1,12 @@
 from __future__ import annotations
 import multiprocessing
-from collections import defaultdict
 from dataclasses import dataclass, replace
-from typing import TYPE_CHECKING, Any, List, Optional, Dict, Tuple, ClassVar
-import importlib, inspect, functools, pathlib, time, ctypes, os
-from tinygrad.helpers import prod, getenv, colored, all_int, to_function_name, from_mv, flat_mv, diskcache_get, diskcache_put, DEBUG, BEAM, NOOPT
+from typing import TYPE_CHECKING, List, Optional, Dict, Tuple, ClassVar
+import importlib, inspect, functools, pathlib, os
+from tinygrad.helpers import prod, getenv, all_int, to_function_name, diskcache_get, diskcache_put, DEBUG, BEAM, NOOPT
 from tinygrad.shape.symbolic import Variable, sym_infer, sint
 from tinygrad.ops import LazyOp, get_lazyop_info
-from tinygrad.buffer import Buffer, BufferOptions
+from tinygrad.buffer import Buffer, Allocator
 from tinygrad.codegen.uops import UOpGraph

 if TYPE_CHECKING:
@@ -53,74 +52,6 @@ class Runner:
   def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False) -> Optional[float]:
     raise NotImplementedError("override this")

-# **************** Buffer / Allocator ****************
-
-class BufferCopy(Runner):
-  def __init__(self, total_sz, dest_device, src_device):
-    if total_sz >= 1e6: name = f"{type(self).__name__[6:].lower()} {total_sz/1e6:7.2f}M, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
-    else: name = f"{type(self).__name__[6:].lower()} {total_sz:8d}, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
-    super().__init__(colored(name, "yellow"), dest_device, 0, total_sz)
-  def copy(self, dest, src):
-    if src.device.startswith("DISK") and hasattr(dest.allocator, 'copy_from_fd') and src.nbytes >= 4096 and hasattr(src.allocator.device, 'fd'):
-      dest.allocator.copy_from_fd(dest._buf, src.allocator.device.fd, src._buf.offset, src.nbytes)
-    elif src.device.startswith("DISK") and hasattr(dest.allocator, 'as_buffer'):
-      # fast(ish) path, uses readinto in diskbuffers
-      src.allocator.copyout(dest.allocator.as_buffer(dest._buf), src._buf)
-    else:
-      dest.copyin(src.as_buffer(allow_zero_copy=True)) # may allocate a CPU buffer depending on allow_zero_copy
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
-    dest, src = rawbufs[0:2]
-    assert dest.size == src.size and dest.dtype == src.dtype, f"buffer copy mismatch, {dest.size} != {src.size}, {dest.dtype} != {src.dtype}"
-    st = time.perf_counter()
-    self.copy(dest, src)
-    if wait:
-      Device[dest.device].synchronize()
-      return time.perf_counter() - st
-
-class BufferXfer(BufferCopy):
-  def copy(self, dest, src):
-    if hasattr(dest.allocator.device, "track_cross_buffer") and hasattr(src.allocator, "track_cross_device"):
-      dest.allocator.device.track_cross_buffer.append(src)
-      src.allocator.track_cross_device.add(dest.allocator.device)
-    dest.allocator.transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.device, dest_dev=dest.allocator.device)
-
-# TODO: size, dest, src are the same type. can we enforce this?
-class Allocator:
-  def alloc(self, size:int, options:Optional[BufferOptions]=None):
-    assert not isinstance(size, int) or size > 0, f"alloc size must be positve, getting {size}"
-    return self._alloc(size, options if options is not None else BufferOptions())
-  def _alloc(self, size:int, options:BufferOptions): raise NotImplementedError("need alloc")
-  def free(self, opaque, size:int, options:Optional[BufferOptions]=None):
-    self._free(opaque, options if options is not None else BufferOptions())
-  def _free(self, opaque, options:BufferOptions): pass # if opaque is a Python object, you don't need a free
-  def copyin(self, dest, src:memoryview): raise NotImplementedError("need copyin")
-  def copyout(self, dest:memoryview, src): raise NotImplementedError("need copyout")
-
-class LRUAllocator(Allocator): # pylint: disable=abstract-method
-  def __init__(self): self.cache: Dict[Tuple[int, Optional[BufferOptions]], Any] = defaultdict(list)
-  def alloc(self, size:int, options:Optional[BufferOptions]=None):
-    if len(c := self.cache[(size, options)]): return c.pop()
-    try: return super().alloc(size, options)
-    except (RuntimeError, MemoryError):
-      self.free_cache()
-      return super().alloc(size, options)
-  def free_cache(self):
-    for (sz,options),opaques in self.cache.items():
-      for opaque in opaques: super().free(opaque, sz, options)
-      opaques.clear()
-  def free(self, opaque:Any, size:int, options:Optional[BufferOptions]=None):
-    if getenv("LRU", 1) and (options is None or not options.nolru): self.cache[(size, options)].append(opaque)
-    else: super().free(opaque, size, options)
-
-class _MallocAllocator(LRUAllocator):
-  def _alloc(self, size:int, options:BufferOptions): return (ctypes.c_uint8 * size)()
-  def as_buffer(self, src) -> memoryview: return flat_mv(memoryview(src))
-  def copyin(self, dest, src:memoryview): ctypes.memmove(dest, from_mv(src), len(src))
-  def copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src, len(dest))
-  def offset(self, buf, size:int, offset:int): return from_mv(self.as_buffer(buf)[offset:offset+size])
-
-MallocAllocator = _MallocAllocator()
-
 # **************** for Compiled Devices ****************

 @dataclass(frozen=True)
@@ -259,4 +190,3 @@ class Compiled:
     else: method_cache[ckey] = method_cache[bkey] = ret = self.to_runner(self.get_linearizer(*ast))
     return ret

-
diff --git a/tinygrad/engine/jit.py b/tinygrad/engine/jit.py
index 48c0a10cae..c4c7eac2d0 100644
--- a/tinygrad/engine/jit.py
+++ b/tinygrad/engine/jit.py
@@ -4,11 +4,12 @@ import functools, itertools, collections
 from tinygrad.tensor import Tensor
 from tinygrad.lazy import LazyBuffer
 from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, GRAPH, BEAM, getenv, all_int, GraphException, colored, JIT
-from tinygrad.device import Buffer, CompiledRunner, BufferXfer, Compiled, Device, Runner
+from tinygrad.device import Buffer, CompiledRunner, Compiled, Device, Runner
 from tinygrad.dtype import DType
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.symbolic import Variable, sint
-from tinygrad.engine.realize import ExecItem, capturing, _internal_memory_planner, EmptyOp, ViewOp
+from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer
+from tinygrad.engine.memory import _internal_memory_planner
 from tinygrad.nn.state import get_parameters
 from weakref import WeakKeyDictionary

diff --git a/tinygrad/engine/memory.py b/tinygrad/engine/memory.py
new file mode 100644
index 0000000000..9ebcf0961e
--- /dev/null
+++ b/tinygrad/engine/memory.py
@@ -0,0 +1,43 @@
+from typing import List, Dict, DefaultDict, Tuple, Union
+from collections import defaultdict
+from tinygrad.dtype import DType
+from tinygrad.buffer import Buffer
+from tinygrad.helpers import getenv, DEBUG, dedup
+from tinygrad.ops import ScheduleItem
+
+def _internal_memory_planner(buffers:List[Union[List[Buffer], Tuple[Buffer, ...]]], debug_prefix="") -> Dict[Buffer, Buffer]:
+  if getenv("NO_MEMORY_PLANNER"): return {}
+  last_appearance = {}
+  for i,u in enumerate(buffers):
+    for buf in u: last_appearance[buf] = i
+
+  # LRU algorithm
+  assigned: Dict[Buffer, Buffer] = {}
+  local_cache: DefaultDict[Tuple[str, int, DType], List[Buffer]] = defaultdict(list)
+
+  def handle_buffer(buf):
+    key = (buf.device, buf.size, buf.dtype)
+    if buf not in assigned:
+      if len(ll:=local_cache[key]): assigned[buf] = ll.pop()
+      else: assigned[buf] = Buffer(*key)
+    if i == last_appearance[buf]:
+      if assigned[buf] not in local_cache[key]: local_cache[key].append(assigned[buf])
+
+  for i,u in enumerate(buffers):
+    for buf in u:
+      # all unallocated unparented buffers are fair game to replace
+      if buf.is_allocated() or buf.lb_refcount > 0: continue
+      # handle view buffers
+      if buf._base is not None:
+        assigned[buf] = Buffer(buf.device, buf.size, buf.dtype, base=assigned.get(buf._base, buf._base), offset=buf.offset)
+      else:
+        handle_buffer(buf)
+
+  if DEBUG >= 1 and len(ak:=dedup(assigned.keys())) != len(av:=dedup(assigned.values())):
+    print(debug_prefix+f"memory reduced from {sum([x.nbytes for x in ak])/1e6:.2f} MB -> {sum([x.nbytes for x in av])/1e6:.2f} MB,",
+          f"{len(ak)} -> {len(av)} bufs")
+  return assigned
+
+def memory_planner(schedule:List[ScheduleItem]) -> List[ScheduleItem]:
+  assigned = _internal_memory_planner([si.bufs for si in schedule])
+  return [ScheduleItem(si.ast, tuple(assigned.get(x, x) for x in si.bufs)) for si in schedule]
diff --git a/tinygrad/engine/realize.py b/tinygrad/engine/realize.py
index 2659c3d736..5688891c48 100644
--- a/tinygrad/engine/realize.py
+++ b/tinygrad/engine/realize.py
@@ -1,13 +1,60 @@
-from typing import List, Dict, Optional, cast, Generator, DefaultDict, Tuple, Union
-from collections import defaultdict
+from typing import List, Dict, Optional, cast, Generator, Tuple
+import time
 from dataclasses import dataclass
-from tinygrad.dtype import DType
-from tinygrad.helpers import colored, getenv, dedup, DEBUG, GlobalCounters, ansilen
-from tinygrad.ops import ScheduleItem, BufferOps, LoadOps, copy_ast
-from tinygrad.device import Runner, Device, BufferCopy, BufferXfer
+from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen
+from tinygrad.ops import ScheduleItem, BufferOps, LoadOps, copy_ast, LazyOp
+from tinygrad.device import Runner, Device
 from tinygrad.buffer import Buffer
 from tinygrad.shape.symbolic import Variable, sym_infer

+# **************** Runners ****************
+
+class CustomOp(Runner):
+  def __init__(self, fxn):
+    self.fxn = fxn
+    super().__init__(self.fxn.__name__, "CUSTOM", 0, 0)
+  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): self.fxn(*rawbufs)
+
+class EmptyOp(Runner):
+  def __init__(self, buf:Buffer): super().__init__(colored(f"empty {buf.size:10d} {buf.dtype}", "yellow"), buf.device)
+  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): pass
+
+class ViewOp(Runner):
+  def __init__(self, buf:Buffer): super().__init__(colored(f"view {buf.nbytes:8d} @ {buf.offset:<10d}", "yellow"), buf.device)
+  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
+    assert rawbufs[0]._base is not None and rawbufs[0]._base == rawbufs[1].base, f"must be base {rawbufs}"
+
+class BufferCopy(Runner):
+  def __init__(self, total_sz, dest_device, src_device):
+    if total_sz >= 1e6: name = f"{type(self).__name__[6:].lower()} {total_sz/1e6:7.2f}M, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
+    else: name = f"{type(self).__name__[6:].lower()} {total_sz:8d}, {dest_device[:7]:>7s} <- {src_device[:7]:7s}"
+    super().__init__(colored(name, "yellow"), dest_device, 0, total_sz)
+  def copy(self, dest, src):
+    if src.device.startswith("DISK") and hasattr(dest.allocator, 'copy_from_fd') and src.nbytes >= 4096 and hasattr(src.allocator.device, 'fd'):
+      dest.allocator.copy_from_fd(dest._buf, src.allocator.device.fd, src._buf.offset, src.nbytes)
+    elif src.device.startswith("DISK") and hasattr(dest.allocator, 'as_buffer'):
+      # fast(ish) path, uses readinto in diskbuffers
+      src.allocator.copyout(dest.allocator.as_buffer(dest._buf), src._buf)
+    else:
+      dest.copyin(src.as_buffer(allow_zero_copy=True)) # may allocate a CPU buffer depending on allow_zero_copy
+  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
+    dest, src = rawbufs[0:2]
+    assert dest.size == src.size and dest.dtype == src.dtype, f"buffer copy mismatch, {dest.size} != {src.size}, {dest.dtype} != {src.dtype}"
+    st = time.perf_counter()
+    self.copy(dest, src)
+    if wait:
+      Device[dest.device].synchronize()
+      return time.perf_counter() - st
+
+class BufferXfer(BufferCopy):
+  def copy(self, dest, src):
+    if hasattr(dest.allocator.device, "track_cross_buffer") and hasattr(src.allocator, "track_cross_device"):
+      dest.allocator.device.track_cross_buffer.append(src)
+      src.allocator.track_cross_device.add(dest.allocator.device)
+    dest.allocator.transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.device, dest_dev=dest.allocator.device)
+
+# **************** lowering functions ****************
+
 @dataclass(frozen=True)
 class ExecItem:
   prg: Runner
@@ -27,34 +74,20 @@ class ExecItem:
       self.prg.first_run = False
     return et

-class CustomOp(Runner):
-  def __init__(self, fxn):
-    self.fxn = fxn
-    super().__init__(self.fxn.__name__, "CUSTOM", 0, 0)
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): self.fxn(*rawbufs)
-
-class EmptyOp(Runner):
-  def __init__(self, buf:Buffer): super().__init__(colored(f"empty {buf.size:10d} {buf.dtype}", "yellow"), buf.device)
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False): pass
-
-class ViewOp(Runner):
-  def __init__(self, buf:Buffer): super().__init__(colored(f"view {buf.nbytes:8d} @ {buf.offset:<10d}", "yellow"), buf.device)
-  def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
-    assert rawbufs[0]._base is not None and rawbufs[0]._base == rawbufs[1].base, f"must be base {rawbufs}"
-
-def lower_runner(runner:Runner, bufs) -> ExecItem:
+def lower_runner(dname:str, ast:Tuple[LazyOp, ...], bufs) -> ExecItem:
+  runner = Device[dname].get_runner(*ast)
   # TODO: globals isn't on the stupid diskrunner, remove the need for it
   return ExecItem(runner, [bufs[x[0]] for x in runner.p.globals] if hasattr(runner, 'p') else bufs)

 def lower_schedule_item(si:ScheduleItem) -> ExecItem:
   assert len(set(x.device for x in si.bufs)) == 1 or si.ast[0].op is LoadOps.COPY
-  if si.ast[0].op is BufferOps.STORE: return lower_runner(Device[si.outputs[0].device].get_runner(*si.ast), si.bufs)
+  if si.ast[0].op is BufferOps.STORE: return lower_runner(si.outputs[0].device, si.ast, si.bufs)
   assert len(si.ast) == 1 and len(si.outputs) == 1, "only ASTRunner supports multioutput"
   out, ast = si.outputs[0], si.ast[0]
   if ast.op is LoadOps.COPY:
     kernel_type = BufferCopy
     if hasattr(Device[out.device].allocator, 'transfer') and out.device.split(":")[0] == si.inputs[0].device.split(":")[0]:
-      if getenv("USE_COPY_KERNEL"): return lower_runner(Device[out.device].get_runner(copy_ast(ast.arg)), si.bufs)
+      if getenv("USE_COPY_KERNEL"): return lower_runner(out.device, (copy_ast(ast.arg),), si.bufs)
       kernel_type = BufferXfer
     return ExecItem(kernel_type(ast.arg, out.device, si.inputs[0].device), list(si.bufs))
   if ast.op is LoadOps.CUSTOM: return ExecItem(CustomOp(ast.arg), list(si.bufs))
@@ -65,45 +98,10 @@ def lower_schedule_item(si:ScheduleItem) -> ExecItem:
 def lower_schedule(schedule:List[ScheduleItem]) -> Generator[ExecItem, None, None]:
   while len(schedule): yield lower_schedule_item(schedule.pop(0))

+# **************** main run function ****************
+
 capturing: List = [] # put classes with an add method in here

-def _internal_memory_planner(buffers:List[Union[List[Buffer], Tuple[Buffer, ...]]], debug_prefix="") -> Dict[Buffer, Buffer]:
-  if getenv("NO_MEMORY_PLANNER"): return {}
-  last_appearance = {}
-  for i,u in enumerate(buffers):
-    for buf in u: last_appearance[buf] = i
-
-  # LRU algorithm
-  assigned: Dict[Buffer, Buffer] = {}
-  local_cache: DefaultDict[Tuple[str, int, DType], List[Buffer]] = defaultdict(list)
-
-  def handle_buffer(buf):
-    key = (buf.device, buf.size, buf.dtype)
-    if buf not in assigned:
-      if len(ll:=local_cache[key]): assigned[buf] = ll.pop()
-      else: assigned[buf] = Buffer(*key)
-    if i == last_appearance[buf]:
-      if assigned[buf] not in local_cache[key]: local_cache[key].append(assigned[buf])
-
-  for i,u in enumerate(buffers):
-    for buf in u:
-      # all unallocated unparented buffers are fair game to replace
-      if buf.is_allocated() or buf.lb_refcount > 0: continue
-      # handle view buffers
-      if buf._base is not None:
-        assigned[buf] = Buffer(buf.device, buf.size, buf.dtype, base=assigned.get(buf._base, buf._base), offset=buf.offset)
-      else:
-        handle_buffer(buf)
-
-  if DEBUG >= 1 and len(ak:=dedup(assigned.keys())) != len(av:=dedup(assigned.values())):
-    print(debug_prefix+f"memory reduced from {sum([x.nbytes for x in ak])/1e6:.2f} MB -> {sum([x.nbytes for x in av])/1e6:.2f} MB,",
-          f"{len(ak)} -> {len(av)} bufs")
-  return assigned
-
-def memory_planner(schedule:List[ScheduleItem]) -> List[ScheduleItem]:
-  assigned = _internal_memory_planner([si.bufs for si in schedule])
-  return [ScheduleItem(si.ast, tuple(assigned.get(x, x) for x in si.bufs)) for si in schedule]
-
 def run_schedule(schedule:List[ScheduleItem], var_vals:Optional[Dict[Variable, int]]=None):
   for ei in lower_schedule(schedule):
     if len(capturing): capturing[0].add(ei)
diff --git a/tinygrad/runtime/graph/cuda.py b/tinygrad/runtime/graph/cuda.py
index 3ca47f8322..3f2ad0f4cc 100644
--- a/tinygrad/runtime/graph/cuda.py
+++ b/tinygrad/runtime/graph/cuda.py
@@ -2,10 +2,10 @@ import ctypes
 from typing import Any, Optional, Tuple, Dict, List, cast
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import init_c_var, GraphException
-from tinygrad.device import CompiledRunner, Buffer, BufferXfer, Device
+from tinygrad.device import CompiledRunner, Buffer, Device
 from tinygrad.runtime.ops_cuda import CUDADevice, check, encode_args, cu_time_execution
 from tinygrad.shape.symbolic import Variable
-from tinygrad.engine.realize import ExecItem
+from tinygrad.engine.realize import ExecItem, BufferXfer
 from tinygrad.engine.jit import MultiGraphRunner

 class CUDAGraph(MultiGraphRunner):
diff --git a/tinygrad/runtime/graph/hsa.py b/tinygrad/runtime/graph/hsa.py
index ba345fede4..5da2992ba4 100644
--- a/tinygrad/runtime/graph/hsa.py
+++ b/tinygrad/runtime/graph/hsa.py
@@ -2,10 +2,10 @@ import ctypes, collections, time, itertools
 from typing import List, Any, Dict, cast, Optional, Tuple
 from tinygrad.helpers import GraphException, init_c_var, round_up
 from tinygrad.buffer import Buffer, BufferOptions
-from tinygrad.device import Compiled, CompiledRunner, BufferXfer, Device
+from tinygrad.device import Compiled, CompiledRunner, Device
 from tinygrad.shape.symbolic import Variable
 from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler
-from tinygrad.engine.realize import ExecItem
+from tinygrad.engine.realize import ExecItem, BufferXfer
 from tinygrad.engine.jit import MultiGraphRunner
 import tinygrad.runtime.autogen.hsa as hsa
 from tinygrad.runtime.driver.hsa import check, AQLQueue, AQL_PACKET_SIZE, EMPTY_SIGNAL
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index 6bd2db48f0..77949e93b3 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 from typing import Tuple, List, Any, cast
 import os, fcntl, ctypes, functools, re, pathlib, mmap, struct, errno, subprocess, time
-from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
-from tinygrad.buffer import BufferOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import BufferOptions, LRUAllocator
 from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, DEBUG
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.runtime.driver.hip_comgr import compile_hip
diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py
index 2434f7576c..1f6b3f88f6 100644
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -1,5 +1,6 @@
 import ctypes, subprocess, pathlib, tempfile
-from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import MallocAllocator
 from tinygrad.helpers import cpu_time_execution
 from tinygrad.renderer.cstyle import ClangRenderer

diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index 56c15ff938..5a4bf6bc6d 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -5,8 +5,8 @@ from dataclasses import replace
 from typing import Tuple, Optional, List
 import tinygrad.runtime.autogen.cuda as cuda
 from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, Compiler, CompilerOptions
-from tinygrad.buffer import BufferOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import BufferOptions, LRUAllocator, MallocAllocator
 from tinygrad.renderer.cstyle import CUDARenderer
 from tinygrad.renderer.assembly import PTXRenderer
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
diff --git a/tinygrad/runtime/ops_disk.py b/tinygrad/runtime/ops_disk.py
index 99899e8eea..7493fd4996 100644
--- a/tinygrad/runtime/ops_disk.py
+++ b/tinygrad/runtime/ops_disk.py
@@ -2,7 +2,8 @@ from __future__ import annotations
 import os, mmap, _posixshmem, io
 from typing import Optional
 from tinygrad.helpers import OSX
-from tinygrad.device import Compiled, Allocator
+from tinygrad.device import Compiled
+from tinygrad.buffer import Allocator

 class DiskBuffer:
   def __init__(self, device:DiskDevice, size:int, offset=0):
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index 3fc78121bd..8de6c98f58 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -4,8 +4,8 @@ import ctypes, functools, hashlib
 import tinygrad.runtime.autogen.opencl as cl
 from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
 from tinygrad.renderer.cstyle import OpenCLRenderer
-from tinygrad.buffer import BufferOptions
-from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
+from tinygrad.buffer import BufferOptions, LRUAllocator
+from tinygrad.device import Compiled, Compiler, CompilerOptions

 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0
diff --git a/tinygrad/runtime/ops_hsa.py b/tinygrad/runtime/ops_hsa.py
index 60e97c6e0a..ed22db4129 100644
--- a/tinygrad/runtime/ops_hsa.py
+++ b/tinygrad/runtime/ops_hsa.py
@@ -3,8 +3,8 @@ import ctypes, functools, subprocess, io, atexit, collections, json
 from typing import Tuple, TypeVar, List, Dict, Any
 import tinygrad.runtime.autogen.hsa as hsa
 from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t, getenv
-from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
-from tinygrad.buffer import BufferOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import BufferOptions, LRUAllocator
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.runtime.driver.hsa import check, scan_agents, find_memory_pool, AQLQueue
 from tinygrad.runtime.driver.hip_comgr import compile_hip
diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py
index 32d5db0d41..f52cf05287 100644
--- a/tinygrad/runtime/ops_llvm.py
+++ b/tinygrad/runtime/ops_llvm.py
@@ -1,7 +1,8 @@
 from __future__ import annotations
 import ctypes, functools
 from typing import Tuple
-from tinygrad.device import Compiled, MallocAllocator, Compiler, CompilerOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import MallocAllocator
 from tinygrad.helpers import DEBUG, cpu_time_execution
 from tinygrad.renderer.llvmir import uops_to_llvm_ir
 import llvmlite.binding as llvm
diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py
index 13b79cbcd8..c35c40ccd0 100644
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@@ -3,7 +3,8 @@ import os, subprocess, pathlib, ctypes, tempfile, functools
 import Metal, libdispatch
 from typing import List, Set, Any, Tuple, Optional
 from tinygrad.helpers import prod, getenv, DEBUG, unwrap2
-from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import LRUAllocator
 from tinygrad.renderer.cstyle import MetalRenderer

 def wait_check(cbuf: Any):
diff --git a/tinygrad/runtime/ops_npy.py b/tinygrad/runtime/ops_npy.py
index c8121b9a09..3470d0edd2 100644
--- a/tinygrad/runtime/ops_npy.py
+++ b/tinygrad/runtime/ops_npy.py
@@ -1,6 +1,7 @@
 import numpy as np
 from tinygrad.helpers import flat_mv
-from tinygrad.device import Compiled, Allocator
+from tinygrad.device import Compiled
+from tinygrad.buffer import Allocator

 class NpyAllocator(Allocator):
   def copyout(self, dest:memoryview, src:np.ndarray): dest[:] = flat_mv(np.require(src, requirements='C').data)
diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index 71894bb1d0..0954d2f336 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -2,7 +2,8 @@ from __future__ import annotations
 import os, ctypes, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time
 from typing import Tuple, List, Any, cast
 from dataclasses import replace
-from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import LRUAllocator, BufferOptions
 from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod
 from tinygrad.renderer.cstyle import CUDARenderer
 from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes
diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py
index 2d7fe1b22d..393cb8e528 100644
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -5,7 +5,8 @@ from typing import Tuple, List, Optional, Any, Dict
 import pickle, base64, itertools, time, struct
 from tinygrad.dtype import DType, dtypes, ImageDType
 from tinygrad.helpers import all_same, getenv, flatten
-from tinygrad.device import Compiled, Allocator, Compiler, CompilerOptions
+from tinygrad.device import Compiled, Compiler, CompilerOptions
+from tinygrad.buffer import Allocator
 from tinygrad.codegen.uops import UOpGraph, UOps
 from tinygrad.ops import BinaryOps, TernaryOps, exec_alu

diff --git a/tinygrad/runtime/ops_rhip.py b/tinygrad/runtime/ops_rhip.py
index 7bf252ba59..074509344f 100644
--- a/tinygrad/runtime/ops_rhip.py
+++ b/tinygrad/runtime/ops_rhip.py
@@ -1,5 +1,6 @@
 import ctypes
-from tinygrad.device import Compiled, MallocAllocator
+from tinygrad.device import Compiled
+from tinygrad.buffer import MallocAllocator
 from tinygrad.runtime.ops_hsa import HSACompiler

 rhip = ctypes.CDLL("/usr/local/lib/libremu.so")
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 3a062e3942..b64792e918 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -15,7 +15,8 @@ from tinygrad.ops import LoadOps, ScheduleItem
 from tinygrad.buffer import Buffer, BufferOptions
 from tinygrad.device import Device
 from tinygrad.shape.symbolic import sint, Variable, MulNode, Node
-from tinygrad.engine.realize import run_schedule, memory_planner
+from tinygrad.engine.realize import run_schedule
+from tinygrad.engine.memory import memory_planner
 from tinygrad.engine.schedule import create_schedule_with_vars

 # **** start with two base classes, Tensor and Function ****
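
Usage sketch (appended for illustration, not part of the patch): with the new module layout, the schedule -> memory_planner -> run_schedule flow used by examples/llm.c/export.py and tinygrad/tensor.py looks roughly like the Python below. The tensor expression and the exact create_schedule call are assumptions for the example, not code taken from this change.

# memory_planner now comes from tinygrad.engine.memory instead of tinygrad.engine.realize
from tinygrad import Tensor
from tinygrad.engine.schedule import create_schedule
from tinygrad.engine.memory import memory_planner
from tinygrad.engine.realize import run_schedule

out = Tensor.ones(16) + Tensor.ones(16)
schedule = create_schedule([out.lazydata])  # build the ScheduleItems for the pending ops (call form assumed)
schedule = memory_planner(schedule)         # rewrite si.bufs so dead, unparented intermediates can be reused
run_schedule(schedule)                      # lower each ScheduleItem to an ExecItem and execute it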