From c6769badc24e27cbd466b856c6d02f3f8a154485 Mon Sep 17 00:00:00 2001
From: nimlgen <138685161+nimlgen@users.noreply.github.com>
Date: Mon, 29 Dec 2025 13:18:37 +0300
Subject: [PATCH] mockgpu: async support (#13868)

* mockgpu: async support

* cpu
---
 test/device/test_hcq.py       |  2 +-
 test/mockgpu/amd/amddriver.py | 21 +++++++++++-----
 test/mockgpu/nv/nvdriver.py   | 45 +++++++++++++++++++++++++----------
 3 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/test/device/test_hcq.py b/test/device/test_hcq.py
index 2261bdad63..3eeb47dbb7 100644
--- a/test/device/test_hcq.py
+++ b/test/device/test_hcq.py
@@ -76,7 +76,7 @@ class TestHCQ(unittest.TestCase):
     TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
     TestHCQ.d0.timeline_value += 1

-  @unittest.skipIf(MOCKGPU or Device.DEFAULT in {"CPU"}, "Can't handle async update on MOCKGPU for now")
+  @unittest.skipIf(Device.DEFAULT in {"CPU"}, "Can't handle async update on CPU device")
   def test_wait_late_set(self):
     for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]:
       if queue_type is None: continue
diff --git a/test/mockgpu/amd/amddriver.py b/test/mockgpu/amd/amddriver.py
index 317f57a75f..29038a097a 100644
--- a/test/mockgpu/amd/amddriver.py
+++ b/test/mockgpu/amd/amddriver.py
@@ -52,6 +52,7 @@ class AMDDriver(VirtDriver):
     self.doorbells = {}
     self.next_doorbell = collections.defaultdict(int)
     self.mmu_event_ids = []
+    self._executing = False # re-entrancy guard for _emulate_execute

     for i in range(gpus): self._prepare_gpu(i+1)

@@ -125,6 +126,9 @@ class AMDDriver(VirtDriver):
       if struct.gpu_id not in self.gpus: return -1
       struct.handle = self._alloc_handle()
       self.object_by_handle[struct.handle] = copy.deepcopy(struct) # save memory struct to know what mem it is
+      # Track signal memory (uncached + coherent) - progress queues when written to
+      if struct.flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED:
+        self.track_address(struct.va_addr, struct.va_addr + struct.size, lambda mv,off: None, lambda mv, off: self._emulate_execute())
     elif nr == kfd_ioctls.AMDKFD_IOC_FREE_MEMORY_OF_GPU:
       self.object_by_handle.pop(struct.handle)
     elif nr == kfd_ioctls.AMDKFD_IOC_MAP_MEMORY_TO_GPU:
@@ -173,9 +177,14 @@ class AMDDriver(VirtDriver):
     return 0

   def _emulate_execute(self):
-    any_progress = True
-    while any_progress:
-      any_progress = False
-      for gpu in self.gpus.values():
-        for q in gpu.queues:
-          if q.executing: any_progress |= q.execute() > 0
+    if self._executing: return # prevent re-entrancy
+    self._executing = True
+    try:
+      any_progress = True
+      while any_progress:
+        any_progress = False
+        for gpu in self.gpus.values():
+          for q in gpu.queues:
+            if q.executing: any_progress |= q.execute() > 0
+    finally:
+      self._executing = False
diff --git a/test/mockgpu/nv/nvdriver.py b/test/mockgpu/nv/nvdriver.py
index f895ca5c67..31e85bfa4a 100644
--- a/test/mockgpu/nv/nvdriver.py
+++ b/test/mockgpu/nv/nvdriver.py
@@ -15,7 +15,7 @@ libc.munmap.restype = ctypes.c_int
 NVSubDevice = collections.namedtuple('NVSubDevice', ['device'])
 NVUserMode = collections.namedtuple('NVUserMode', ['subdevice'])
 NVVASpace = collections.namedtuple('NVVASpace', ['device'])
-NVAllocation = collections.namedtuple('NVAllocation', ['device', 'size'])
+NVAllocation = collections.namedtuple('NVAllocation', ['device', 'size', 'is_signal'])
 NVChannelGroup = collections.namedtuple('NVChannelGroup', ['device'])
 NVContextShare = collections.namedtuple('NVContextShare', ['channel_group'])
 NVGPFIFO = collections.namedtuple('NVGPFIFO', ['device', 'token'])
@@ -41,12 +41,14 @@ class NVDevFileDesc(VirtFileDesc):
     super().__init__(fd)
     self.driver, self.gpu = driver, gpu
     self._mapping_userland = False
+    self._mapping_signal = False

   def ioctl(self, fd, request, argp): return self.driver.dev_ioctl(self.gpu, request, argp)
   def mmap(self, start, sz, prot, flags, fd, offset):
     start = libc.mmap(start, sz, prot, flags|mmap.MAP_ANONYMOUS, -1, 0)
-    if self._mapping_userland:
+    if self._mapping_userland or self._mapping_signal:
       self.driver.track_address(start, start+sz, lambda mv,off: None, lambda mv, off: self.driver._gpu_mmio_write(mv, off, self.gpu))
+    self._mapping_signal = False
     return start

 class NVDriver(VirtDriver):
@@ -65,6 +67,7 @@ class NVDriver(VirtDriver):
     self.object_by_handle = {}
     self.opened_fds = {}
     self.next_doorbell = collections.defaultdict(int)
+    self._executing = False # re-entrancy guard for _gpu_mmio_write

     for i in range(gpus): self._prepare_gpu(i)

@@ -115,7 +118,8 @@ class NVDriver(VirtDriver):
       assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVGPU)
       params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS.from_address(params_ptr)
       struct.hObjectNew = self._alloc_handle()
-      self.object_by_handle[struct.hObjectNew] = NVAllocation(self.object_by_handle[struct.hObjectParent], params.size)
+      is_signal = struct.hClass == nv_gpu.NV1_MEMORY_SYSTEM # signal memory uses NV1_MEMORY_SYSTEM (uncached)
+      self.object_by_handle[struct.hObjectNew] = NVAllocation(self.object_by_handle[struct.hObjectParent], params.size, is_signal)
     elif struct.hClass == nv_gpu.KEPLER_CHANNEL_GROUP_A:
       assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVGPU)
       struct.hObjectNew = self._alloc_handle()
@@ -206,7 +210,6 @@ class NVDriver(VirtDriver):
   def ctl_ioctl(self, req, argp):
     nr = req & 0xff
     if nr == nv_gpu.NV_ESC_RM_ALLOC: return self.rm_alloc(argp)
-    elif nr == nv_gpu.NV_ESC_RM_ALLOC_MEMORY: pass
     elif nr == nv_gpu.NV_ESC_RM_CONTROL: return self.rm_control(argp)
     elif nr == nv_gpu.NV_ESC_RM_MAP_MEMORY:
       st:Any = nv_gpu.nv_ioctl_nvos33_parameters_with_fd.from_address(argp)
@@ -215,6 +218,10 @@ class NVDriver(VirtDriver):
         file = self.opened_fds[st.fd]
         assert isinstance(file, NVDevFileDesc)
         file._mapping_userland = True
+      elif isinstance(obj, NVAllocation) and obj.is_signal:
+        file = self.opened_fds[st.fd]
+        assert isinstance(file, NVDevFileDesc)
+        file._mapping_signal = True
     elif nr == nv_gpu.NV_ESC_RM_FREE:
       st = nv_gpu.NVOS00_PARAMETERS.from_address(argp)
       self.object_by_handle.pop(st.hObjectOld)
@@ -256,12 +263,26 @@ class NVDriver(VirtDriver):
     else: raise RuntimeError(f"Unknown {nr} to nvidia-uvm")
     return 0

-  def dev_ioctl(self, dev, req, argp): return 0
+  def dev_ioctl(self, dev, req, argp):
+    nr = req & 0xff
+    # Handle NV_ESC_RM_ALLOC_MEMORY for host/signal memory
+    if nr == nv_gpu.NV_ESC_RM_ALLOC_MEMORY:
+      st:Any = nv_gpu.nv_ioctl_nvos02_parameters_with_fd.from_address(argp)
+      # Track host memory (signal memory) - progress queues when written to
+      if st.params.hClass == nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR:
+        self.track_address(st.params.pMemory, st.params.pMemory + st.params.limit + 1,
+                           lambda mv,off: None, lambda mv, off: self._gpu_mmio_write(mv, off, None))
+    return 0
   def _gpu_mmio_write(self, mv, off, gpu):
-    any_progress = True
-    while any_progress:
-      any_progress = False
-      for gpu in self.gpus.values():
-        for q in gpu.queues:
-          if q.ctrl.GPGet != q.ctrl.GPPut:
-            any_progress |= q.execute()
+    if self._executing: return # prevent re-entrancy
+    self._executing = True
+    try:
+      any_progress = True
+      while any_progress:
+        any_progress = False
+        for gpu in self.gpus.values():
+          for q in gpu.queues:
+            if q.ctrl.GPGet != q.ctrl.GPPut:
+              any_progress |= q.execute()
+    finally:
+      self._executing = False
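Note: below is a minimal, self-contained sketch (not part of the patch) of the pattern this change applies in both drivers: writes to tracked signal memory pump queue execution, and a re-entrancy guard stops the on-write hook from recursing when execute() itself writes to tracked memory. The Queue and Driver classes here are illustrative stand-ins, not the mockgpu classes.

class Queue:
  def __init__(self, work): self.work = list(work)  # work items are callables
  @property
  def executing(self): return bool(self.work)
  def execute(self):
    self.work.pop(0)()  # running a packet may write a signal, firing the hook again
    return 1

class Driver:
  def __init__(self, queues):
    self.queues = list(queues)
    self._executing = False  # same role as self._executing in the patch

  def on_tracked_write(self):  # stands in for _emulate_execute / _gpu_mmio_write
    if self._executing: return  # a nested call from inside execute() stops here
    self._executing = True
    try:
      any_progress = True
      while any_progress:  # keep pumping until no queue makes progress
        any_progress = False
        for q in self.queues:
          if q.executing: any_progress |= q.execute() > 0
    finally:
      self._executing = False

drv = Driver([])
drv.queues.append(Queue([lambda: drv.on_tracked_write()]))  # packet that "writes" a signal
drv.on_tracked_write()  # drains the queue; the guard absorbs the nested call
assert not drv.queues[0].executing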