mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-07 22:23:55 -05:00
amd cleanup sdma (#4796)
* amd cleanup sdma * faster enqueue for sdma * typo * remove commented lines * fix overrun check * flushhdp better command
This commit is contained in:
@@ -129,6 +129,7 @@ generate_amd() {
|
||||
-o $BASE/amd_gpu.py
|
||||
|
||||
sed 's/^\(.*\)\(\s*\/\*\)\(.*\)$/\1 #\2\3/; s/^\(\s*\*\)\(.*\)$/#\1\2/' extra/hip_gpu_driver/nvd.h >> $BASE/amd_gpu.py # comments
|
||||
sed 's/^\(.*\)\(\s*\/\*\)\(.*\)$/\1 #\2\3/; s/^\(\s*\*\)\(.*\)$/#\1\2/' extra/hip_gpu_driver/sdma_v6_0_0_pkt_open.h >> $BASE/amd_gpu.py # comments
|
||||
sed -i 's/#\s*define\s*\([^ \t]*\)(\([^)]*\))\s*\(.*\)/def \1(\2): return \3/' $BASE/amd_gpu.py # #define name(x) (smth) -> def name(x): return (smth)
|
||||
sed -i '/#\s*define\s\+\([^ \t]\+\)\s\+\([^ ]\+\)/s//\1 = \2/' $BASE/amd_gpu.py # #define name val -> name = val
|
||||
|
||||
|
||||
5664
extra/hip_gpu_driver/sdma_v6_0_0_pkt_open.h
Normal file
5664
extra/hip_gpu_driver/sdma_v6_0_0_pkt_open.h
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
from typing import Tuple, List, Any, cast
|
||||
import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time
|
||||
import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time, array
|
||||
from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator
|
||||
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, DEBUG
|
||||
from tinygrad.renderer.cstyle import AMDRenderer
|
||||
@@ -50,29 +50,6 @@ def ioctls_from_header():
|
||||
return type("KIO", (object, ), fxns)
|
||||
kio = ioctls_from_header()
|
||||
|
||||
def create_sdma_packets():
  """Build flat ctypes structs for every SDMA_PKT_*_TAG type in amd_gpu.

  The generated headers describe each packet as a struct of per-dword unions;
  this flattens those unions into a single bitfield struct so packets can be
  constructed with plain keyword arguments (exposed as attributes of the
  returned SDMA_PKTS namespace object, keyed by lowercased packet name).
  """
  # TODO: clean up this, if we want to keep it
  structs = {}
  pkt_types = [(nm, s) for nm, s in amd_gpu.__dict__.items() if nm.startswith("struct_SDMA_PKT_") and nm.endswith("_TAG")]
  for name, pkt in pkt_types:
    seen = set()
    fields = []
    for fld in pkt._fields_:
      if not fld[0].endswith("_UNION"):
        fields.append(fld)
        continue
      # each *_UNION wraps an anonymous struct '_0' holding the real bitfields
      assert fld[1]._fields_[0][0] == '_0'
      for union_field in fld[1]._fields_[0][1]._fields_:
        fname = union_field[0]
        # disambiguate duplicate member names by prefixing the union's name
        if fname in seen: fname = fld[0] + fname
        seen.add(fname)
        # merge together 64-bit fields, otherwise just append them
        if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"):
          fields[-1] = (fname[:-6], ctypes.c_ulong, 64)
        else:
          fields.append((fname, *union_field[1:]))
    new_name = name[16:-4].lower()  # strip "struct_SDMA_PKT_" prefix and "_TAG" suffix
    structs[new_name] = init_c_struct_t(tuple(fields))
    # flattening must not change the packet's wire size
    assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
  return type("SDMA_PKTS", (object, ), structs)
sdma_pkts = create_sdma_packets()
|
||||
|
||||
class AMDCompiler(Compiler):
|
||||
def __init__(self, arch:str):
|
||||
self.arch = arch
|
||||
@@ -100,6 +77,7 @@ FORCE_START_AT_000 = 1 << 2
|
||||
CS_W32_EN = 1 << 15
|
||||
|
||||
def gfxreg(reg):
  """Rebase a register offset for a PACKET3_SET_SH_REG write (0x1260 adjustment)."""
  return (reg - amd_gpu.PACKET3_SET_SH_REG_START) + 0x00001260
|
||||
def data64_le(data):
  """Split a 64-bit value into its (low, high) 32-bit halves, low dword first."""
  lo, hi = data & 0xFFFFFFFF, data >> 32
  return (lo, hi)
|
||||
|
||||
class HWPM4Queue:
|
||||
def __init__(self):
  # queued PM4 command dwords, flushed to the ring on submit
  self.q = []
|
||||
@@ -191,61 +169,80 @@ class HWPM4Queue:
|
||||
device.pm4_doorbell[0] = wptr + len(self.q)
|
||||
return self
|
||||
|
||||
# prebuilt sdma packets
sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0)
# full invalidate + writeback of the GL caches (used before a copy reads memory)
sdma_cache_inv = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
                               GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1,
                               GCR_CONTROL_GL2_RANGE=0)
# writeback-only variant (used after a copy has written memory)
sdma_cache_wb = sdma_pkts.gcr(op=amd_gpu.SDMA_OP_GCR, sub_op=amd_gpu.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
                              GCR_CONTROL_GL2_RANGE=0)

# max bytes a single SDMA linear-copy packet can move; larger copies are chunked
SDMA_MAX_COPY_SIZE = 0x400000
|
||||
class HWCopyQueue:
|
||||
def __init__(self):
  # The diff span defined __init__ twice (old: self.q = []); keep only the
  # superseding version. q holds raw command dwords; cmd_sizes records the
  # dword length of each queued command so submit can split at ring wrap
  # without cutting a command in half.
  self.q, self.cmd_sizes = [], []
|
||||
|
||||
def _q(self, arr):
  """Append one command's dwords to the queue, recording its length."""
  self.q.extend(arr)
  self.cmd_sizes.append(len(arr))
|
||||
|
||||
def copy(self, dest, src, copy_size):
  """Queue an SDMA copy of copy_size bytes from src to dest (GPU virtual addresses)."""
  # HDP flush
  self._q([amd_gpu.SDMA_OP_POLL_REGMEM, 0, 0x80000000, 0, 0, 0])

  # Invalidate cache inv
  self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLM_INV | amd_gpu.SDMA_GCR_GLK_INV | amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GLV_INV | \
           amd_gpu.SDMA_GCR_GL1_INV | amd_gpu.SDMA_GCR_GL2_WB | amd_gpu.SDMA_GCR_GL2_INV, 0, 0])

  # one linear-copy packet per SDMA_MAX_COPY_SIZE chunk
  copied = 0
  while copied < copy_size:
    step = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
    self._q([amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
             amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step - 1), 0, *data64_le(src + copied), *data64_le(dest + copied)])
    copied += step

  # Invalidate cache wb
  self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GL2_WB, 0, 0])

  return self
|
||||
|
||||
def signal(self, signal: hsa.amd_signal_t, value=0):
  """Queue a fence write of `value` into the signal; if the signal has an event
  mailbox attached, also write its event id there and raise a trap interrupt."""
  sig_addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
  self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(sig_addr), value])

  if signal.event_mailbox_ptr != 0:
    self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.event_mailbox_ptr), signal.event_id])
    self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal.event_id)])

  return self
|
||||
|
||||
def wait(self, signal: hsa.amd_signal_t, value=0):
  """Queue a poll that stalls the SDMA engine until the signal's value compares
  GEQ against `value` (retrying at the configured interval)."""
  header = amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
           amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1)
  self._q([header, *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value, 0xffffffff,
           amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])

  return self
|
||||
|
||||
def submit(self, device:AMDDevice):
  """Blit the queued command dwords into the device's SDMA ring and ring the doorbell.

  NOTE(review): the diff span contained BOTH the removed per-struct
  ctypes.memmove loop and the new array-blit path; running both would enqueue
  every command twice and double-advance the doorbell. Only the new path is kept.
  """
  read_ptr = device.sdma_read_pointer[0]
  if (device.sdma_doorbell_value-read_ptr) > device.sdma_ring.size: raise RuntimeError("SDMA queue overrun")

  # dword-typed view over the whole ring buffer
  sdma_buffer_view = to_mv(device.sdma_ring.va_addr, device.sdma_ring.size).cast("I")

  # count how many whole commands fit before the ring wraps (never split a command)
  tail_blit_dword = 0
  for cmdsz in self.cmd_sizes:
    if (tail_blit_dword + cmdsz) * 4 >= device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size: break
    tail_blit_dword += cmdsz

  # blit the commands that fit at the current tail position
  start_idx = (device.sdma_doorbell_value % device.sdma_ring.size) // 4
  sdma_buffer_view[start_idx : start_idx + tail_blit_dword] = array.array('I', self.q[:tail_blit_dword])
  device.sdma_doorbell_value += tail_blit_dword * 4

  # on wrap: zero-fill the remainder of the ring, then blit the rest at the start
  if (rem_packet_cnt := len(self.q) - tail_blit_dword) > 0:
    zero_fill = device.sdma_ring.size - device.sdma_doorbell_value % device.sdma_ring.size
    ctypes.memset(device.sdma_ring.va_addr + (device.sdma_doorbell_value % device.sdma_ring.size), 0, zero_fill)
    device.sdma_doorbell_value += zero_fill

    sdma_buffer_view[0:rem_packet_cnt] = array.array('I', self.q[tail_blit_dword:])
    device.sdma_doorbell_value += rem_packet_cnt * 4

  # publish the new write pointer and notify the engine
  device.sdma_write_pointer[0] = device.sdma_doorbell_value
  device.sdma_doorbell[0] = device.sdma_doorbell_value
  return self
|
||||
|
||||
def timestamp(self, addr):
  """Queue a global-timestamp capture packet that writes the timestamp to addr."""
  # NOTE(review): appends a ctypes packet struct (pre-cleanup style) rather than raw dwords
  pkt = sdma_pkts.timestamp(op=amd_gpu.SDMA_OP_TIMESTAMP, sub_op=amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL, addr=addr)
  self.q.append(pkt)
  return self
|
||||
|
||||
def copy(self, dest, src, copy_size):
  """Queue a copy using prebuilt ctypes packet structs: HDP flush, cache
  invalidate, chunked linear copies, then cache writeback."""
  self.q.append(sdma_flush_hdp_pkt)  # TODO: do I need this?
  self.q.append(sdma_cache_inv)
  copied = 0
  while copied < copy_size:
    step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
    self.q.append(sdma_pkts.copy_linear(op=amd_gpu.SDMA_OP_COPY, sub_op=amd_gpu.SDMA_SUBOP_COPY_LINEAR,
                                        count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied))
    copied += step_copy_size
  self.q.append(sdma_cache_wb)
  return self
|
||||
|
||||
def signal(self, signal:hsa.amd_signal_t, value=0):
  """Queue a fence struct-packet writing `value` to the signal; if an event
  mailbox is attached, also write the event id there and queue a trap."""
  fence = sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, data=value)
  self.q.append(fence)
  if signal.event_mailbox_ptr != 0:
    self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=signal.event_mailbox_ptr, data=signal.event_id))
    self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=signal.event_id))
  return self
|
||||
|
||||
def wait(self, signal:hsa.amd_signal_t, value=0):
  """Queue a poll-regmem struct-packet that stalls until the signal compares GEQ `value`."""
  poll = sdma_pkts.poll_regmem(op=amd_gpu.SDMA_OP_POLL_REGMEM, mem_poll=1, func=WAIT_REG_MEM_FUNCTION_GEQ,
                               addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET,
                               value=value, mask=0xffffffff, interval=0x04, retry_count=0xfff)
  self.q.append(poll)
  return self
|
||||
|
||||
# ELF section header constants: SHT_PROGBITS section type, SHF_ALLOC section flag
SHT_PROGBITS, SHF_ALLOC = 0x1, 0x2
|
||||
class AMDProgram:
|
||||
def __init__(self, device:AMDDevice, name:str, lib:bytes):
|
||||
|
||||
Reference in New Issue
Block a user