mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-24 22:38:16 -05:00
add return type for HCQCompatAllocator _alloc (#5267)
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
191463a919
commit
d3e4e21759
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
import multiprocessing
|
||||
from dataclasses import dataclass
|
||||
from collections import defaultdict
|
||||
from typing import List, Optional, Dict, Tuple, Any, cast
|
||||
from typing import List, Optional, Dict, Tuple, Any, cast, Protocol
|
||||
import importlib, inspect, functools, pathlib, os, ctypes, atexit, time, contextlib
|
||||
from tinygrad.helpers import getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, ProfileLogger, PROFILE
|
||||
from tinygrad.dtype import DType, ImageDType
|
||||
@@ -255,6 +255,9 @@ class HCQCompatCompiled(Compiled):
|
||||
self._set_signal(self.timeline_signal, 0)
|
||||
cast(HCQCompatAllocator, self.allocator).b_timeline = [0] * len(cast(HCQCompatAllocator, self.allocator).b)
|
||||
|
||||
# used for copying and transferring; allocator implementations must have this set up.
|
||||
class HCQCompatAllocRes(Protocol): va_addr: int; base: int; size: int; length: int # noqa: E702
|
||||
|
||||
class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
|
||||
def __init__(self, device, batch_size=(2 << 20), batch_cnt=32):
|
||||
self.device = device
|
||||
@@ -262,7 +265,9 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
|
||||
self.b_timeline, self.b_next = [0] * len(self.b), 0
|
||||
super().__init__()
|
||||
|
||||
def copyin(self, dest, src: memoryview):
|
||||
def _alloc(self, size:int, options:BufferOptions) -> HCQCompatAllocRes: raise NotImplementedError("need hcq compat alloc")
|
||||
|
||||
def copyin(self, dest: HCQCompatAllocRes, src: memoryview):
|
||||
with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"CPU -> {self.device.dname}", enabled=PROFILE):
|
||||
for i in range(0, src.nbytes, self.b[0].size):
|
||||
self.b_next = (self.b_next + 1) % len(self.b)
|
||||
@@ -274,7 +279,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
|
||||
self.b_timeline[self.b_next] = self.device.timeline_value
|
||||
self.device.timeline_value += 1
|
||||
|
||||
def copy_from_disk(self, dest, src, size):
|
||||
def copy_from_disk(self, dest: HCQCompatAllocRes, src, size):
|
||||
def _get_temp_buf():
|
||||
# Check if the next buffer is safe to be used (its signal has passed) and reserve it.
|
||||
if self.b_timeline[(self.b_next + 1) % len(self.b)] <= self.device._read_signal(self.device.timeline_signal):
|
||||
@@ -290,7 +295,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
|
||||
self.b_timeline[batch_info[1]] = self.device.timeline_value
|
||||
self.device.timeline_value += 1
|
||||
|
||||
def copyout(self, dest:memoryview, src):
|
||||
def copyout(self, dest:memoryview, src: HCQCompatAllocRes):
|
||||
self.device.synchronize()
|
||||
|
||||
with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE):
|
||||
@@ -303,7 +308,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
|
||||
|
||||
ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
|
||||
|
||||
def transfer(self, dest, src, sz: int, src_dev, dest_dev):
|
||||
def transfer(self, dest: HCQCompatAllocRes, src: HCQCompatAllocRes, sz: int, src_dev, dest_dev):
|
||||
src_dev._gpu_map(dest)
|
||||
|
||||
with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE):
|
||||
@@ -319,4 +324,5 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
|
||||
.signal(dest_dev.timeline_signal, dest_dev.timeline_value).submit(dest_dev)
|
||||
dest_dev.timeline_value += 1
|
||||
|
||||
def offset(self, buf, size:int, offset:int): return type(buf)(base=buf.base + offset, va_addr=buf.va_addr + offset, length=size, size=size)
|
||||
def offset(self, buf, size:int, offset:int) -> HCQCompatAllocRes:
|
||||
return type(buf)(base=buf.base + offset, va_addr=buf.va_addr + offset, length=size, size=size)
|
||||
|
||||
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
from typing import Tuple, List, Any
|
||||
import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time, array
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
|
||||
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions
|
||||
from tinygrad.helpers import getenv, init_c_struct_t, to_mv, round_up, DEBUG, PROFILE, mv_address
|
||||
from tinygrad.renderer.cstyle import AMDRenderer
|
||||
from tinygrad.runtime.driver.hip_comgr import compile_hip
|
||||
@@ -385,7 +385,7 @@ class AMDProgram:
|
||||
class AMDAllocator(HCQCompatAllocator):
|
||||
def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)
|
||||
|
||||
def _alloc(self, size:int, options:BufferOptions):
|
||||
def _alloc(self, size:int, options:BufferOptions) -> HCQCompatAllocRes:
|
||||
try:
|
||||
if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
|
||||
return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
|
||||
|
||||
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
|
||||
from typing import Tuple, List, Any
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
|
||||
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions
|
||||
from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod, PROFILE
|
||||
from tinygrad.renderer.cstyle import NVRenderer
|
||||
from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler, PTXCompiler, PTX
|
||||
@@ -348,7 +348,7 @@ class NVProgram:
|
||||
class NVAllocator(HCQCompatAllocator):
|
||||
def __init__(self, device:NVDevice): super().__init__(device)
|
||||
|
||||
def _alloc(self, size:int, options:BufferOptions):
|
||||
def _alloc(self, size:int, options:BufferOptions) -> HCQCompatAllocRes:
|
||||
if options.host: return self.device._gpu_host_alloc(size)
|
||||
return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user