mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 06:58:11 -05:00
amd mockgpu graph support (#10385)
For testing remote graph functionality (prompted by #10371) in CI.
This commit is contained in:
2
.github/workflows/test.yml
vendored
2
.github/workflows/test.yml
vendored
@@ -616,7 +616,7 @@ jobs:
|
||||
if: matrix.backend=='amdllvm'
|
||||
run: PYTHONPATH="." python test/test_amd_llvm.py
|
||||
- name: Run pytest (amd)
|
||||
run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/test_hcq.py test/external/external_test_am.py --durations=20
|
||||
run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py test/external/external_test_am.py --durations=20
|
||||
- name: Run TRANSCENDENTAL math
|
||||
run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
|
||||
- name: Run process replay tests
|
||||
|
||||
@@ -39,8 +39,9 @@ class AMDDriver(VirtDriver):
|
||||
def __init__(self, gpus=6):
|
||||
super().__init__()
|
||||
|
||||
# NOTE: gpu ids start from one (id 0 is skipped in KFDIface._is_usable_gpu)
|
||||
self.tracked_files += [VirtFile('/dev/kfd', functools.partial(KFDFileDesc, driver=self))] + \
|
||||
[VirtFile('/sys/devices/virtual/kfd/kfd/topology/nodes', functools.partial(DirFileDesc, child_names=[str(i) for i in range(gpus)]))]
|
||||
[VirtFile('/sys/devices/virtual/kfd/kfd/topology/nodes', functools.partial(DirFileDesc, child_names=[str(i+1) for i in range(gpus)]))]
|
||||
|
||||
self.gpus = {}
|
||||
self.next_fd = (1 << 30)
|
||||
@@ -52,7 +53,7 @@ class AMDDriver(VirtDriver):
|
||||
self.next_doorbell = collections.defaultdict(int)
|
||||
self.mmu_event_ids = []
|
||||
|
||||
for i in range(gpus): self._prepare_gpu(i)
|
||||
for i in range(gpus): self._prepare_gpu(i+1)
|
||||
|
||||
def _alloc_fd(self):
|
||||
my_fd = self.next_fd
|
||||
@@ -167,6 +168,4 @@ class AMDDriver(VirtDriver):
|
||||
any_progress = False
|
||||
for gpu in self.gpus.values():
|
||||
for q in gpu.queues:
|
||||
if (prev_rptr:=q.rptr[0]) != q.wptr[0]:
|
||||
q.execute()
|
||||
any_progress |= (prev_rptr != q.rptr[0])
|
||||
if q.executing: any_progress |= q.execute() > 0
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import ctypes, time
|
||||
from test.mockgpu.gpu import VirtGPU
|
||||
from tinygrad.helpers import to_mv, init_c_struct_t, mv_address
|
||||
from tinygrad.helpers import to_mv, init_c_struct_t
|
||||
import tinygrad.runtime.autogen.amd_gpu as amd_gpu
|
||||
|
||||
SDMA_MAX_COPY_SIZE = 0x400000
|
||||
@@ -59,12 +59,16 @@ sdma_pkts = create_sdma_packets()
|
||||
class AMDQueue:
|
||||
def __init__(self, base, size, rptr, wptr):
|
||||
self.queue, self.size = to_mv(base, size).cast("I"), size
|
||||
self.rptr = to_mv(rptr, 8).cast("Q")
|
||||
self.wptr = to_mv(wptr, 8).cast("Q")
|
||||
self.rptr = to_mv(rptr, 8).cast("Q") if isinstance(rptr, int) else rptr
|
||||
self.wptr = to_mv(wptr, 8).cast("Q") if isinstance(wptr, int) else wptr
|
||||
|
||||
@property
|
||||
def executing(self): return self.rptr[0] < self.wptr[0]
|
||||
|
||||
class PM4Executor(AMDQueue):
|
||||
def __init__(self, gpu, base, size, rptr, wptr):
|
||||
self.gpu = gpu
|
||||
self.ib_executor: PM4Executor|None = None
|
||||
super().__init__(base, size, rptr, wptr)
|
||||
|
||||
def _next_dword(self):
|
||||
@@ -72,9 +76,17 @@ class PM4Executor(AMDQueue):
|
||||
self.rptr[0] += 1
|
||||
return x
|
||||
|
||||
@property
|
||||
def executing(self): return self.rptr[0] < self.wptr[0] or self.ib_executor is not None
|
||||
|
||||
def execute(self):
|
||||
while self.rptr[0] < self.wptr[0]:
|
||||
cont = True
|
||||
prev_rptr, executed_in_ib, cont = self.rptr[0], 0, True
|
||||
while self.executing and cont:
|
||||
if self.ib_executor is not None:
|
||||
executed_in_ib += self.ib_executor.execute()
|
||||
if self.ib_executor.executing: break
|
||||
self.ib_executor = None
|
||||
continue # this continue is needed if PACKET3_INDIRECT_BUFFER is the last packet and rptr == wptr
|
||||
header = self._next_dword()
|
||||
packet_type = header >> 30
|
||||
op = (header >> 8) & 0xFF
|
||||
@@ -88,7 +100,7 @@ class PM4Executor(AMDQueue):
|
||||
elif op == amd_gpu.PACKET3_INDIRECT_BUFFER: self._exec_indirect_buffer(n)
|
||||
elif op == amd_gpu.PACKET3_EVENT_WRITE: self._exec_event_write(n)
|
||||
else: raise RuntimeError(f"PM4: Unknown opcode: {op}")
|
||||
if not cont: return
|
||||
return (self.rptr[0] - prev_rptr) + executed_in_ib
|
||||
|
||||
def _exec_acquire_mem(self, n):
|
||||
assert n == 6
|
||||
@@ -171,8 +183,7 @@ class PM4Executor(AMDQueue):
|
||||
wptr = memoryview(bytearray(8)).cast('Q')
|
||||
rptr[0] = 0
|
||||
wptr[0] = buf_sz
|
||||
PM4Executor(self.gpu, (addr_hi << 32) | addr_lo, buf_sz * 4, mv_address(rptr), mv_address(wptr)).execute()
|
||||
assert rptr[0] == wptr[0], "not everything executed in amdgpu"
|
||||
self.ib_executor = PM4Executor(self.gpu, (addr_hi << 32) | addr_lo, buf_sz * 4, rptr, wptr)
|
||||
|
||||
def _exec_event_write(self, n):
|
||||
assert n == 0
|
||||
@@ -184,8 +195,8 @@ class SDMAExecutor(AMDQueue):
|
||||
super().__init__(base, size, rptr, wptr)
|
||||
|
||||
def execute(self):
|
||||
while self.rptr[0] < self.wptr[0]:
|
||||
cont = True
|
||||
prev_rptr, cont = self.rptr[0], True
|
||||
while self.executing and cont:
|
||||
header = self.queue[(self.rptr[0] // 4) % (self.size // 4)]
|
||||
op = (header >> 0) & 0xff
|
||||
if op == 0: self.rptr[0] += 4
|
||||
@@ -196,7 +207,7 @@ class SDMAExecutor(AMDQueue):
|
||||
elif op == amd_gpu.SDMA_OP_COPY: self._execute_copy()
|
||||
elif op == amd_gpu.SDMA_OP_TIMESTAMP: self._execute_timestamp()
|
||||
else: raise RuntimeError(f"Unknown SDMA op {op}")
|
||||
if not cont: return
|
||||
return self.rptr[0] - prev_rptr
|
||||
|
||||
def _execute_fence(self):
|
||||
struct = sdma_pkts.fence.from_address(self.base + self.rptr[0] % self.size)
|
||||
|
||||
Reference in New Issue
Block a user