use hcq_profile in nv/amd program (#5344)

This commit is contained in:
nimlgen
2024-07-09 15:56:06 +03:00
committed by GitHub
parent bee96a19ff
commit e815c57039
3 changed files with 28 additions and 29 deletions

View File

@@ -187,12 +187,17 @@ class Compiled:
# **************** for HCQ Compatible Devices ****************
# Context manager that brackets the wrapped work with two timestamp signals and
# yields them to the caller as the tuple (st, en).
# NOTE(review): this span is a rendered diff hunk with no +/- markers and with
# indentation stripped. The lines that mention `queue` look like the post-change
# version and the bare `queue_type()` lines like the pre-change version -- confirm
# against the repository before treating either as current.
@contextlib.contextmanager
def hcq_profile(dev, queue_type, enabled, desc):
def hcq_profile(dev, enabled, desc, queue_type=None, queue=None):
# Signals are only allocated when `enabled` is truthy; otherwise (None, None) is yielded.
st, en = (dev._alloc_signal(), dev._alloc_signal()) if enabled else (None, None)
if enabled: queue_type().timestamp(st).submit(dev)
# With a caller-supplied `queue` the start timestamp is only recorded into that queue
# (nothing is submitted here). Without one, a fresh `queue_type()` queue is built and
# submitted to `dev` immediately.
# NOTE: if `enabled` is truthy and neither `queue` nor `queue_type` is given,
# `queue_type()` is called on None.
if enabled and queue is not None: queue.timestamp(st)
elif enabled: queue_type().timestamp(st).submit(dev)
try: yield (st, en)
finally:
# The end timestamp is issued in `finally`, so it is emitted even if the wrapped body raises.
if enabled: queue_type().timestamp(en).submit(dev)
if enabled and queue is not None: queue.timestamp(en)
elif enabled: queue_type().timestamp(en).submit(dev)
# Records are only kept when PROFILE is set. The last tuple element is True only when
# `queue_type` is the device's copy queue type, so it is False when just `queue` is passed.
if enabled and PROFILE: dev.sig_prof_records.append((st, en, desc, queue_type is dev.hw_copy_queue_t))
class HCQCompatCompiled(Compiled):
@@ -268,7 +273,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
def _alloc(self, size:int, options:BufferOptions) -> HCQCompatAllocRes:
  """Always raise NotImplementedError; no allocation is performed at this level."""
  raise NotImplementedError("need hcq compat alloc")
def copyin(self, dest: HCQCompatAllocRes, src: memoryview):
with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"CPU -> {self.device.dname}", enabled=PROFILE):
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"CPU -> {self.device.dname}", enabled=PROFILE):
for i in range(0, src.nbytes, self.b[0].size):
self.b_next = (self.b_next + 1) % len(self.b)
self.device._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next])
@@ -287,7 +292,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
return (self.b[self.b_next].va_addr, self.b_next)
return None
with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"DISK -> {self.device.dname}", enabled=PROFILE):
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"DISK -> {self.device.dname}", enabled=PROFILE):
for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size):
self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
.copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \
@@ -298,7 +303,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
def copyout(self, dest:memoryview, src: HCQCompatAllocRes):
self.device.synchronize()
with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE):
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE):
for i in range(0, dest.nbytes, self.b[0].size):
self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
.copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
@@ -311,7 +316,7 @@ class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
def transfer(self, dest: HCQCompatAllocRes, src: HCQCompatAllocRes, sz: int, src_dev, dest_dev):
src_dev._gpu_map(dest)
with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE):
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE):
src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
.wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
.copy(dest.va_addr, src.va_addr, sz) \

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
from typing import Tuple, List, Any
import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time, array
from dataclasses import dataclass
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions, hcq_profile
from tinygrad.helpers import getenv, init_c_struct_t, to_mv, round_up, DEBUG, PROFILE, mv_address
from tinygrad.renderer.cstyle import AMDRenderer
from tinygrad.runtime.driver.hip_comgr import compile_hip
@@ -365,20 +365,19 @@ class AMDProgram:
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
sig_st, sig_en = (self.device._alloc_signal(), self.device._alloc_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
q = HWPM4Queue().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
q.exec(self, self.device.kernargs_ptr, global_size, local_size)
q = HWPM4Queue()
q.wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
if wait or PROFILE: q.timestamp(sig_st)
q.exec(self, self.device.kernargs_ptr, global_size, local_size)
if wait or PROFILE: q.timestamp(sig_en)
q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
self.device.timeline_value += 1
self.device.kernargs_ptr += self.kernargs_alloc_size
if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
if wait:
self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
if not PROFILE: self.device.signals_pool += [sig_st, sig_en]
return (sig_en.start_ts - sig_st.start_ts) / 1e8
class AMDAllocator(HCQCompatAllocator):
@@ -499,8 +498,6 @@ class AMDDevice(HCQCompatCompiled):
self._gpu_map(AMDDevice.event_page)
sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
self.time_event_st, self.time_event_en = AMDDevice._alloc_signal(), AMDDevice._alloc_signal()
self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.kernargs_ptr = self.kernargs.va_addr

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
from typing import Tuple, List, Any
from dataclasses import dataclass
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions, hcq_profile
from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod, PROFILE
from tinygrad.renderer.cstyle import NVRenderer
from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler, PTXCompiler, PTX
@@ -341,21 +341,20 @@ class NVProgram:
if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.va_addr)] + list(vals)
sig_st, sig_en = (self.device._alloc_signal(), self.device._alloc_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
q = HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
q.exec(self, self.device.kernargs_ptr, global_size, local_size)
q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
queue = HWComputeQueue()
queue.wait(self.device.timeline_signal, self.device.timeline_value - 1)
if wait or PROFILE: queue.timestamp(sig_st)
queue.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
queue.exec(self, self.device.kernargs_ptr, global_size, local_size)
if wait or PROFILE: queue.timestamp(sig_en)
queue.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
self.device.timeline_value += 1
self.device.kernargs_ptr += self.kernargs_alloc_size
if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
if wait:
self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
if not PROFILE: self.device.signals_pool += [sig_st, sig_en]
return (sig_en[1] - sig_st[1]) / 1e9
class NVAllocator(HCQCompatAllocator):
@@ -544,8 +543,6 @@ class NVDevice(HCQCompatCompiled):
rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)
self.time_event_st, self.time_event_en = NVDevice._alloc_signal(), NVDevice._alloc_signal()
self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
self.cmdq_wptr: int = 0 # in bytes