amd mockgpu graph support (#10385)

For testing remote graph functionality (prompted by #10371) in CI.
2026-01-09 06:58:11 -05:00 · 2025-05-18 21:43:16 +05:00
parent a3308e145d
commit 27c12be471
3 changed files with 27 additions and 17 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -616,7 +616,7 @@ jobs:
        if: matrix.backend=='amdllvm'
        run: PYTHONPATH="." python test/test_amd_llvm.py
      - name: Run pytest (amd)
-        run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/test_hcq.py test/external/external_test_am.py --durations=20
+        run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py test/external/external_test_am.py --durations=20
      - name: Run TRANSCENDENTAL math
        run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
      - name: Run process replay tests
--- a/test/mockgpu/amd/amddriver.py
+++ b/test/mockgpu/amd/amddriver.py
@@ -39,8 +39,9 @@ class AMDDriver(VirtDriver):
  def __init__(self, gpus=6):
    super().__init__()

+    # NOTE: gpu ids start from one (id 0 is skipped in KFDIface._is_usable_gpu)
    self.tracked_files += [VirtFile('/dev/kfd', functools.partial(KFDFileDesc, driver=self))] + \
-      [VirtFile('/sys/devices/virtual/kfd/kfd/topology/nodes', functools.partial(DirFileDesc, child_names=[str(i) for i in range(gpus)]))]
+      [VirtFile('/sys/devices/virtual/kfd/kfd/topology/nodes', functools.partial(DirFileDesc, child_names=[str(i+1) for i in range(gpus)]))]

    self.gpus = {}
    self.next_fd = (1 << 30)
@@ -52,7 +53,7 @@ class AMDDriver(VirtDriver):
    self.next_doorbell = collections.defaultdict(int)
    self.mmu_event_ids = []

-    for i in range(gpus): self._prepare_gpu(i)
+    for i in range(gpus): self._prepare_gpu(i+1)

  def _alloc_fd(self):
    my_fd = self.next_fd
@@ -167,6 +168,4 @@ class AMDDriver(VirtDriver):
      any_progress = False
      for gpu in self.gpus.values():
        for q in gpu.queues:
-          if (prev_rptr:=q.rptr[0]) != q.wptr[0]:
-            q.execute()
-            any_progress |= (prev_rptr != q.rptr[0])
+          if q.executing: any_progress |= q.execute() > 0
--- a/test/mockgpu/amd/amdgpu.py
+++ b/test/mockgpu/amd/amdgpu.py
@@ -1,6 +1,6 @@
 import ctypes, time
 from test.mockgpu.gpu import VirtGPU
-from tinygrad.helpers import to_mv, init_c_struct_t, mv_address
+from tinygrad.helpers import to_mv, init_c_struct_t
 import tinygrad.runtime.autogen.amd_gpu as amd_gpu

 SDMA_MAX_COPY_SIZE = 0x400000
@@ -59,12 +59,16 @@ sdma_pkts = create_sdma_packets()
 class AMDQueue:
  def __init__(self, base, size, rptr, wptr):
    self.queue, self.size = to_mv(base, size).cast("I"), size
-    self.rptr = to_mv(rptr, 8).cast("Q")
-    self.wptr = to_mv(wptr, 8).cast("Q")
+    self.rptr = to_mv(rptr, 8).cast("Q") if isinstance(rptr, int) else rptr
+    self.wptr = to_mv(wptr, 8).cast("Q") if isinstance(wptr, int) else wptr
+
+  @property
+  def executing(self): return self.rptr[0] < self.wptr[0]

 class PM4Executor(AMDQueue):
  def __init__(self, gpu, base, size, rptr, wptr):
    self.gpu = gpu
+    self.ib_executor: PM4Executor|None = None
    super().__init__(base, size, rptr, wptr)

  def _next_dword(self):
@@ -72,9 +76,17 @@ class PM4Executor(AMDQueue):
    self.rptr[0] += 1
    return x

+  @property
+  def executing(self): return self.rptr[0] < self.wptr[0] or self.ib_executor is not None
+
  def execute(self):
-    while self.rptr[0] < self.wptr[0]:
-      cont = True
+    prev_rptr, executed_in_ib, cont = self.rptr[0], 0, True
+    while self.executing and cont:
+      if self.ib_executor is not None:
+        executed_in_ib += self.ib_executor.execute()
+        if self.ib_executor.executing: break
+        self.ib_executor = None
+        continue # this continue is needed if PACKET3_INDIRECT_BUFFER is the last packet and rptr == wptr
      header = self._next_dword()
      packet_type = header >> 30
      op = (header >> 8) & 0xFF
@@ -88,7 +100,7 @@ class PM4Executor(AMDQueue):
      elif op == amd_gpu.PACKET3_INDIRECT_BUFFER: self._exec_indirect_buffer(n)
      elif op == amd_gpu.PACKET3_EVENT_WRITE: self._exec_event_write(n)
      else: raise RuntimeError(f"PM4: Unknown opcode: {op}")
-      if not cont: return
+    return (self.rptr[0] - prev_rptr) + executed_in_ib

  def _exec_acquire_mem(self, n):
    assert n == 6
@@ -171,8 +183,7 @@ class PM4Executor(AMDQueue):
    wptr = memoryview(bytearray(8)).cast('Q')
    rptr[0] = 0
    wptr[0] = buf_sz
-    PM4Executor(self.gpu, (addr_hi << 32) | addr_lo, buf_sz * 4, mv_address(rptr), mv_address(wptr)).execute()
-    assert rptr[0] == wptr[0], "not everything executed in amdgpu"
+    self.ib_executor = PM4Executor(self.gpu, (addr_hi << 32) | addr_lo, buf_sz * 4, rptr, wptr)

  def _exec_event_write(self, n):
    assert n == 0
@@ -184,8 +195,8 @@ class SDMAExecutor(AMDQueue):
    super().__init__(base, size, rptr, wptr)

  def execute(self):
-    while self.rptr[0] < self.wptr[0]:
-      cont = True
+    prev_rptr, cont = self.rptr[0], True
+    while self.executing and cont:
      header = self.queue[(self.rptr[0] // 4) % (self.size // 4)]
      op = (header >> 0) & 0xff
      if op == 0: self.rptr[0] += 4
@@ -196,7 +207,7 @@ class SDMAExecutor(AMDQueue):
      elif op == amd_gpu.SDMA_OP_COPY: self._execute_copy()
      elif op == amd_gpu.SDMA_OP_TIMESTAMP: self._execute_timestamp()
      else: raise RuntimeError(f"Unknown SDMA op {op}")
-      if not cont: return
+    return self.rptr[0] - prev_rptr

  def _execute_fence(self):
    struct = sdma_pkts.fence.from_address(self.base + self.rptr[0] % self.size)