add return type for HCQCompatAllocator _alloc (#5267)

Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
Vyacheslav Pachkov
2024-07-03 10:25:44 +03:00
committed by GitHub
parent 191463a919
commit d3e4e21759
3 changed files with 16 additions and 10 deletions
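The key mechanic in this change is typing.Protocol: HCQCompatAllocRes is a structural type, so any allocation record that exposes the four fields satisfies it without inheriting from it. A minimal sketch of that behavior (FakeGpuBuffer is illustrative, not a tinygrad type):

from dataclasses import dataclass
from typing import Protocol

class HCQCompatAllocRes(Protocol):  # same shape as the class added in device.py below
  va_addr: int; base: int; size: int; length: int  # noqa: E702

@dataclass
class FakeGpuBuffer:  # hypothetical backend allocation record
  va_addr: int
  base: int
  size: int
  length: int

def describe(buf: HCQCompatAllocRes) -> str:
  # accepts FakeGpuBuffer (or any conforming record) purely by attribute shape
  return f"va={buf.va_addr:#x} size={buf.size}"

print(describe(FakeGpuBuffer(va_addr=0x1000, base=0x1000, size=4096, length=4096)))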

tinygrad/device.py

@@ -2,7 +2,7 @@ from __future__ import annotations
 import multiprocessing
 from dataclasses import dataclass
 from collections import defaultdict
-from typing import List, Optional, Dict, Tuple, Any, cast
+from typing import List, Optional, Dict, Tuple, Any, cast, Protocol
 import importlib, inspect, functools, pathlib, os, ctypes, atexit, time, contextlib
 from tinygrad.helpers import getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, from_mv, ProfileLogger, PROFILE
 from tinygrad.dtype import DType, ImageDType
@@ -255,6 +255,9 @@ class HCQCompatCompiled(Compiled):
     self._set_signal(self.timeline_signal, 0)
     cast(HCQCompatAllocator, self.allocator).b_timeline = [0] * len(cast(HCQCompatAllocator, self.allocator).b)
 
+# used for copying and transfering, allocator implementations must have this set up.
+class HCQCompatAllocRes(Protocol): va_addr: int; base: int; size: int; length: int # noqa: E702
+
 class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
   def __init__(self, device, batch_size=(2 << 20), batch_cnt=32):
     self.device = device
@@ -262,7 +265,9 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
     self.b_timeline, self.b_next = [0] * len(self.b), 0
     super().__init__()
 
-  def copyin(self, dest, src: memoryview):
+  def _alloc(self, size:int, options:BufferOptions) -> HCQCompatAllocRes: raise NotImplementedError("need hcq compat alloc")
+
+  def copyin(self, dest: HCQCompatAllocRes, src: memoryview):
     with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"CPU -> {self.device.dname}", enabled=PROFILE):
       for i in range(0, src.nbytes, self.b[0].size):
         self.b_next = (self.b_next + 1) % len(self.b)
@@ -274,7 +279,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
         self.b_timeline[self.b_next] = self.device.timeline_value
         self.device.timeline_value += 1
 
-  def copy_from_disk(self, dest, src, size):
+  def copy_from_disk(self, dest: HCQCompatAllocRes, src, size):
     def _get_temp_buf():
       # Check if the next buffer is safe to be used (its signal has passed) and reserve it.
       if self.b_timeline[(self.b_next + 1) % len(self.b)] <= self.device._read_signal(self.device.timeline_signal):
@@ -290,7 +295,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
         self.b_timeline[batch_info[1]] = self.device.timeline_value
         self.device.timeline_value += 1
 
-  def copyout(self, dest:memoryview, src):
+  def copyout(self, dest:memoryview, src: HCQCompatAllocRes):
     self.device.synchronize()
     with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE):
@@ -303,7 +308,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
         ctypes.memmove(from_mv(dest[i:]), self.b[0].va_addr, lsize)
 
-  def transfer(self, dest, src, sz: int, src_dev, dest_dev):
+  def transfer(self, dest: HCQCompatAllocRes, src: HCQCompatAllocRes, sz: int, src_dev, dest_dev):
     src_dev._gpu_map(dest)
     with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE):
@@ -319,4 +324,5 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
         .signal(dest_dev.timeline_signal, dest_dev.timeline_value).submit(dest_dev)
     dest_dev.timeline_value += 1
 
-  def offset(self, buf, size:int, offset:int): return type(buf)(base=buf.base + offset, va_addr=buf.va_addr + offset, length=size, size=size)
+  def offset(self, buf, size:int, offset:int) -> HCQCompatAllocRes:
+    return type(buf)(base=buf.base + offset, va_addr=buf.va_addr + offset, length=size, size=size)
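The offset() change above also makes an implicit contract visible: whatever concrete record a backend returns from _alloc must be re-constructible by keyword with exactly these four fields, because offset() rebuilds it via type(buf)(...). A hedged sketch of that round trip, using a stand-in dataclass rather than a real backend's type:

from dataclasses import dataclass

@dataclass
class FakeGpuBuffer:  # stand-in for a backend _alloc result; dataclasses are keyword-constructible
  va_addr: int; base: int; size: int; length: int  # noqa: E702

def offset(buf, size: int, offset: int):
  # mirrors the diff: rebuild the same record type, shifted within the parent allocation
  return type(buf)(base=buf.base + offset, va_addr=buf.va_addr + offset, length=size, size=size)

parent = FakeGpuBuffer(va_addr=0x1000, base=0x1000, size=4096, length=4096)
view = offset(parent, size=1024, offset=256)
assert view.va_addr == 0x1100 and view.size == view.length == 1024  # a 1 KiB window, 256 bytes in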

tinygrad/runtime/ops_amd.py

@@ -2,7 +2,7 @@ from __future__ import annotations
 from typing import Tuple, List, Any
 import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time, array
 from dataclasses import dataclass
-from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
+from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions
 from tinygrad.helpers import getenv, init_c_struct_t, to_mv, round_up, DEBUG, PROFILE, mv_address
 from tinygrad.renderer.cstyle import AMDRenderer
 from tinygrad.runtime.driver.hip_comgr import compile_hip
@@ -385,7 +385,7 @@ class AMDProgram:
 class AMDAllocator(HCQCompatAllocator):
   def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)
 
-  def _alloc(self, size:int, options:BufferOptions):
+  def _alloc(self, size:int, options:BufferOptions) -> HCQCompatAllocRes:
     try:
       if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
       return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)

tinygrad/runtime/ops_nv.py

@@ -2,7 +2,7 @@ from __future__ import annotations
 import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
 from typing import Tuple, List, Any
 from dataclasses import dataclass
-from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, Compiler, CompileError, BufferOptions
+from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions
 from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod, PROFILE
 from tinygrad.renderer.cstyle import NVRenderer
 from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler, PTXCompiler, PTX
@@ -348,7 +348,7 @@ class NVProgram:
 class NVAllocator(HCQCompatAllocator):
   def __init__(self, device:NVDevice): super().__init__(device)
 
-  def _alloc(self, size:int, options:BufferOptions):
+  def _alloc(self, size:int, options:BufferOptions) -> HCQCompatAllocRes:
     if options.host: return self.device._gpu_host_alloc(size)
     return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20)))
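With the backends' _alloc annotated, a static checker can now reject an allocation record that is missing one of the protocol's fields. A sketch of the failure mode (BadAlloc is hypothetical, and the comment paraphrases the kind of error mypy reports rather than quoting it):

from dataclasses import dataclass
from typing import Protocol

class HCQCompatAllocRes(Protocol):
  va_addr: int; base: int; size: int; length: int  # noqa: E702

@dataclass
class BadAlloc:  # missing base and length
  va_addr: int
  size: int

def _alloc(size: int) -> HCQCompatAllocRes:
  # mypy: incompatible return value type (got "BadAlloc", expected "HCQCompatAllocRes")
  return BadAlloc(va_addr=0x1000, size=size)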