mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-27 15:58:10 -05:00
use hcq_profile in nv/amd program (#5344)
This commit is contained in:
@@ -187,12 +187,17 @@ class Compiled:
|
||||
# **************** for HCQ Compatible Devices ****************
|
||||
|
||||
@contextlib.contextmanager
|
||||
def hcq_profile(dev, queue_type, enabled, desc):
|
||||
def hcq_profile(dev, enabled, desc, queue_type=None, queue=None):
|
||||
st, en = (dev._alloc_signal(), dev._alloc_signal()) if enabled else (None, None)
|
||||
if enabled: queue_type().timestamp(st).submit(dev)
|
||||
|
||||
if enabled and queue is not None: queue.timestamp(st)
|
||||
elif enabled: queue_type().timestamp(st).submit(dev)
|
||||
|
||||
try: yield (st, en)
|
||||
finally:
|
||||
if enabled: queue_type().timestamp(en).submit(dev)
|
||||
if enabled and queue is not None: queue.timestamp(en)
|
||||
elif enabled: queue_type().timestamp(en).submit(dev)
|
||||
|
||||
if enabled and PROFILE: dev.sig_prof_records.append((st, en, desc, queue_type is dev.hw_copy_queue_t))
|
||||
|
||||
class HCQCompatCompiled(Compiled):
|
||||
@@ -268,7 +273,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
|
||||
def _alloc(self, size:int, options:BufferOptions) -> HCQCompatAllocRes: raise NotImplementedError("need hcq compat alloc")
|
||||
|
||||
def copyin(self, dest: HCQCompatAllocRes, src: memoryview):
|
||||
with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"CPU -> {self.device.dname}", enabled=PROFILE):
|
||||
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"CPU -> {self.device.dname}", enabled=PROFILE):
|
||||
for i in range(0, src.nbytes, self.b[0].size):
|
||||
self.b_next = (self.b_next + 1) % len(self.b)
|
||||
self.device._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next])
|
||||
@@ -287,7 +292,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
|
||||
return (self.b[self.b_next].va_addr, self.b_next)
|
||||
return None
|
||||
|
||||
with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"DISK -> {self.device.dname}", enabled=PROFILE):
|
||||
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"DISK -> {self.device.dname}", enabled=PROFILE):
|
||||
for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size):
|
||||
self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
|
||||
.copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \
|
||||
@@ -298,7 +303,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
|
||||
def copyout(self, dest:memoryview, src: HCQCompatAllocRes):
|
||||
self.device.synchronize()
|
||||
|
||||
with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE):
|
||||
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE):
|
||||
for i in range(0, dest.nbytes, self.b[0].size):
|
||||
self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
|
||||
.copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
|
||||
@@ -311,7 +316,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
|
||||
def transfer(self, dest: HCQCompatAllocRes, src: HCQCompatAllocRes, sz: int, src_dev, dest_dev):
|
||||
src_dev._gpu_map(dest)
|
||||
|
||||
with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE):
|
||||
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE):
|
||||
src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
|
||||
.wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
|
||||
.copy(dest.va_addr, src.va_addr, sz) \
|
||||
|
||||
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
from typing import Tuple, List, Any
|
||||
import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time, array
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions
|
||||
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions, hcq_profile
|
||||
from tinygrad.helpers import getenv, init_c_struct_t, to_mv, round_up, DEBUG, PROFILE, mv_address
|
||||
from tinygrad.renderer.cstyle import AMDRenderer
|
||||
from tinygrad.runtime.driver.hip_comgr import compile_hip
|
||||
@@ -365,20 +365,19 @@ class AMDProgram:
|
||||
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
|
||||
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
|
||||
|
||||
sig_st, sig_en = (self.device._alloc_signal(), self.device._alloc_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
|
||||
q = HWPM4Queue().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
|
||||
|
||||
with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
|
||||
q.exec(self, self.device.kernargs_ptr, global_size, local_size)
|
||||
|
||||
q = HWPM4Queue()
|
||||
q.wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
|
||||
if wait or PROFILE: q.timestamp(sig_st)
|
||||
q.exec(self, self.device.kernargs_ptr, global_size, local_size)
|
||||
if wait or PROFILE: q.timestamp(sig_en)
|
||||
q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
|
||||
|
||||
self.device.timeline_value += 1
|
||||
self.device.kernargs_ptr += self.kernargs_alloc_size
|
||||
|
||||
if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
|
||||
if wait:
|
||||
self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
|
||||
if not PROFILE: self.device.signals_pool += [sig_st, sig_en]
|
||||
return (sig_en.start_ts - sig_st.start_ts) / 1e8
|
||||
|
||||
class AMDAllocator(HCQCompatAllocator):
|
||||
@@ -499,8 +498,6 @@ class AMDDevice(HCQCompatCompiled):
|
||||
self._gpu_map(AMDDevice.event_page)
|
||||
sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
|
||||
|
||||
self.time_event_st, self.time_event_en = AMDDevice._alloc_signal(), AMDDevice._alloc_signal()
|
||||
|
||||
self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
self.kernargs_ptr = self.kernargs.va_addr
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||
import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
|
||||
from typing import Tuple, List, Any
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions
|
||||
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions, hcq_profile
|
||||
from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod, PROFILE
|
||||
from tinygrad.renderer.cstyle import NVRenderer
|
||||
from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler, PTXCompiler, PTX
|
||||
@@ -341,21 +341,20 @@ class NVProgram:
|
||||
if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
|
||||
kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.va_addr)] + list(vals)
|
||||
|
||||
sig_st, sig_en = (self.device._alloc_signal(), self.device._alloc_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
|
||||
q = HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
|
||||
.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
|
||||
|
||||
with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
|
||||
q.exec(self, self.device.kernargs_ptr, global_size, local_size)
|
||||
|
||||
q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
|
||||
|
||||
queue = HWComputeQueue()
|
||||
queue.wait(self.device.timeline_signal, self.device.timeline_value - 1)
|
||||
if wait or PROFILE: queue.timestamp(sig_st)
|
||||
queue.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
|
||||
queue.exec(self, self.device.kernargs_ptr, global_size, local_size)
|
||||
if wait or PROFILE: queue.timestamp(sig_en)
|
||||
queue.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
|
||||
self.device.timeline_value += 1
|
||||
self.device.kernargs_ptr += self.kernargs_alloc_size
|
||||
|
||||
if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
|
||||
if wait:
|
||||
self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
|
||||
if not PROFILE: self.device.signals_pool += [sig_st, sig_en]
|
||||
return (sig_en[1] - sig_st[1]) / 1e9
|
||||
|
||||
class NVAllocator(HCQCompatAllocator):
|
||||
@@ -544,8 +543,6 @@ class NVDevice(HCQCompatCompiled):
|
||||
|
||||
rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)
|
||||
|
||||
self.time_event_st, self.time_event_en = NVDevice._alloc_signal(), NVDevice._alloc_signal()
|
||||
|
||||
self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
|
||||
self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
|
||||
self.cmdq_wptr: int = 0 # in bytes
|
||||
|
||||
Reference in New Issue
Block a user