mockgpu: async support (#13868)

* mockgpu: async support

* cpu
nimlgen
2025-12-29 13:18:37 +03:00
committed by GitHub
parent fc5278746f
commit c6769badc2
3 changed files with 49 additions and 19 deletions
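Summary of the change: both mock drivers (AMD and NV) now make progress asynchronously. Instead of executing queues only when a submission ioctl or doorbell write arrives, they register write hooks over signal memory and re-run pending queues whenever the host writes a signal value, with a flag guarding against re-entrant pumping; the HCQ test skip is correspondingly narrowed from MOCKGPU-or-CPU to CPU only. Below is a minimal, self-contained sketch of that mechanism; ToyQueue, ToyDriver and pump are hypothetical names for illustration, not tinygrad's API.

# Minimal sketch (hypothetical names): writes into a tracked signal range trigger
# a pump that executes every queue until no queue makes further progress.
class ToyQueue:
  def __init__(self, work): self.work = list(work)
  def execute(self):
    # Retire at most one packet per call; return how many packets retired.
    if not self.work: return 0
    self.work.pop(0)()
    return 1

class ToyDriver:
  def __init__(self): self.queues, self.tracked = [], []
  def track_address(self, start, end, on_read, on_write):
    # Same callback shape as the diff: reads are ignored, writes pump the queues.
    self.tracked.append((start, end, on_read, on_write))
  def write(self, mem, addr, value):
    mem[addr] = value
    for start, end, _, on_write in self.tracked:
      if start <= addr < end: on_write(mem, addr)
  def pump(self, mv=None, off=None):
    any_progress = True
    while any_progress:            # fixed point: stop once nothing retires
      any_progress = False
      for q in self.queues:
        any_progress |= q.execute() > 0

drv, mem = ToyDriver(), bytearray(16)
drv.queues.append(ToyQueue([lambda: print("kernel ran after signal write")]))
drv.track_address(0, 8, lambda mv, off: None, drv.pump)  # bytes 0..7 act as a signal
drv.write(mem, 0, 1)                                      # host sets the signal; queue runs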


@@ -76,7 +76,7 @@ class TestHCQ(unittest.TestCase):
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1
- @unittest.skipIf(MOCKGPU or Device.DEFAULT in {"CPU"}, "Can't handle async update on MOCKGPU for now")
+ @unittest.skipIf(Device.DEFAULT in {"CPU"}, "Can't handle async update on CPU device")
def test_wait_late_set(self):
for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]:
if queue_type is None: continue
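Context for the test change: test_wait_late_set submits a queue that waits on a signal and only sets the signal value afterwards, so it needs the device, or the mock driver, to notice the late write and resume execution on its own. The MOCKGPU half of the skip is dropped because the drivers below now do exactly that; only CPU keeps the skip. A rough stand-in for that ordering, modeled with a threading.Event rather than a real HCQ signal:

# Illustration only: "wait submitted first, value set later".
import threading, time

signal = threading.Event()

def queue_side():
  signal.wait()                          # the queue blocks on a not-yet-set signal
  print("queue resumed after late set")

t = threading.Thread(target=queue_side)
t.start()
time.sleep(0.1)                          # host does other work first
signal.set()                             # ...then sets the signal late
t.join()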


@@ -52,6 +52,7 @@ class AMDDriver(VirtDriver):
self.doorbells = {}
self.next_doorbell = collections.defaultdict(int)
self.mmu_event_ids = []
+ self._executing = False # re-entrancy guard for _emulate_execute
for i in range(gpus): self._prepare_gpu(i+1)
@@ -125,6 +126,9 @@ class AMDDriver(VirtDriver):
if struct.gpu_id not in self.gpus: return -1
struct.handle = self._alloc_handle()
self.object_by_handle[struct.handle] = copy.deepcopy(struct) # save memory struct to know what mem it is
+ # Track signal memory (uncached + coherent) - progress queues when written to
+ if struct.flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED:
+ self.track_address(struct.va_addr, struct.va_addr + struct.size, lambda mv,off: None, lambda mv, off: self._emulate_execute())
elif nr == kfd_ioctls.AMDKFD_IOC_FREE_MEMORY_OF_GPU:
self.object_by_handle.pop(struct.handle)
elif nr == kfd_ioctls.AMDKFD_IOC_MAP_MEMORY_TO_GPU:
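In the AMD mock driver, the discriminator for signal memory is the allocation flag: buffers allocated uncached (and coherent) are the ones HCQ uses for signals, so their whole VA range gets a write hook that calls _emulate_execute. A small sketch of that branch; the flag bit value here is illustrative, the real constant comes from the generated kfd bindings.

FLAGS_UNCACHED = 1 << 5  # illustrative bit, stands in for kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED

def maybe_track_signal(tracked, va_addr, size, flags):
  # Mirrors the branch above: only uncached (signal) allocations get a write hook.
  if flags & FLAGS_UNCACHED:
    tracked.append((va_addr, va_addr + size))
    return True
  return False

tracked = []
print(maybe_track_signal(tracked, 0x10000, 0x1000, FLAGS_UNCACHED))  # True, range recorded
print(maybe_track_signal(tracked, 0x20000, 0x1000, 0))               # False, ordinary VRAM/GTT
print(tracked)                                                       # [(65536, 69632)]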
@@ -173,9 +177,14 @@ class AMDDriver(VirtDriver):
return 0
def _emulate_execute(self):
- any_progress = True
- while any_progress:
- any_progress = False
- for gpu in self.gpus.values():
- for q in gpu.queues:
- if q.executing: any_progress |= q.execute() > 0
+ if self._executing: return # prevent re-entrancy
+ self._executing = True
+ try:
+ any_progress = True
+ while any_progress:
+ any_progress = False
+ for gpu in self.gpus.values():
+ for q in gpu.queues:
+ if q.executing: any_progress |= q.execute() > 0
+ finally:
+ self._executing = False
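The guard matters because _emulate_execute can now be re-entered: it is called from the write hooks above, and executing a queue packet may itself write into tracked signal memory, which would fire the hook again and recurse. The boolean flag plus try/finally keeps a single pump running and always clears the flag, even if execute() raises. A self-contained sketch of the pattern:

# Sketch of the re-entrancy guard: without the flag, pump() would recurse every
# time queue work writes back into tracked signal memory.
class Pump:
  def __init__(self): self._executing, self.depth = False, 0
  def on_signal_write(self): self.pump()   # a write hook re-enters the pump
  def pump(self):
    if self._executing: return             # a pump higher up the stack finishes the work
    self._executing = True
    try:
      self.depth += 1
      assert self.depth == 1               # the guard keeps us at one level
      self.on_signal_write()               # simulate a packet touching a signal
    finally:
      self._executing = False
      self.depth -= 1

Pump().pump()                              # terminates instead of recursing forever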


@@ -15,7 +15,7 @@ libc.munmap.restype = ctypes.c_int
NVSubDevice = collections.namedtuple('NVSubDevice', ['device'])
NVUserMode = collections.namedtuple('NVUserMode', ['subdevice'])
NVVASpace = collections.namedtuple('NVVASpace', ['device'])
- NVAllocation = collections.namedtuple('NVAllocation', ['device', 'size'])
+ NVAllocation = collections.namedtuple('NVAllocation', ['device', 'size', 'is_signal'])
NVChannelGroup = collections.namedtuple('NVChannelGroup', ['device'])
NVContextShare = collections.namedtuple('NVContextShare', ['channel_group'])
NVGPFIFO = collections.namedtuple('NVGPFIFO', ['device', 'token'])
@@ -41,12 +41,14 @@ class NVDevFileDesc(VirtFileDesc):
super().__init__(fd)
self.driver, self.gpu = driver, gpu
self._mapping_userland = False
+ self._mapping_signal = False
def ioctl(self, fd, request, argp): return self.driver.dev_ioctl(self.gpu, request, argp)
def mmap(self, start, sz, prot, flags, fd, offset):
start = libc.mmap(start, sz, prot, flags|mmap.MAP_ANONYMOUS, -1, 0)
- if self._mapping_userland:
+ if self._mapping_userland or self._mapping_signal:
self.driver.track_address(start, start+sz, lambda mv,off: None, lambda mv, off: self.driver._gpu_mmio_write(mv, off, self.gpu))
+ self._mapping_signal = False
return start
class NVDriver(VirtDriver):
@@ -65,6 +67,7 @@ class NVDriver(VirtDriver):
self.object_by_handle = {}
self.opened_fds = {}
self.next_doorbell = collections.defaultdict(int)
+ self._executing = False # re-entrancy guard for _gpu_mmio_write
for i in range(gpus): self._prepare_gpu(i)
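On the NV side the plumbing is split across objects: the device file descriptor grows a one-shot _mapping_signal flag that is set by the map-memory ioctl just before the signal allocation is mmapped (see the RM_MAP_MEMORY hunk below), consumed by that single mmap to install the write hook, and then cleared so later mappings are untouched; NVDriver itself gets the same _executing guard as the AMD driver. A sketch of the one-shot handshake, with hypothetical names:

# Sketch (hypothetical names): an ioctl marks the fd, the very next mmap on that
# fd consumes the mark and records the new range as signal memory.
class ToyFileDesc:
  def __init__(self, tracked):
    self._mapping_signal, self.tracked = False, tracked
  def mark_next_mmap_as_signal(self):   # done by the map-memory ioctl
    self._mapping_signal = True
  def mmap(self, start, size):
    if self._mapping_signal: self.tracked.append((start, start + size))
    self._mapping_signal = False        # one-shot: later mmaps are ordinary
    return start

tracked = []
fd = ToyFileDesc(tracked)
fd.mark_next_mmap_as_signal()
fd.mmap(0x1000, 0x1000)                 # recorded as signal memory
fd.mmap(0x3000, 0x1000)                 # ordinary mapping, not recorded
print(tracked)                          # [(4096, 8192)]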
@@ -115,7 +118,8 @@ class NVDriver(VirtDriver):
assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVGPU)
params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS.from_address(params_ptr)
struct.hObjectNew = self._alloc_handle()
- self.object_by_handle[struct.hObjectNew] = NVAllocation(self.object_by_handle[struct.hObjectParent], params.size)
+ is_signal = struct.hClass == nv_gpu.NV1_MEMORY_SYSTEM # signal memory uses NV1_MEMORY_SYSTEM (uncached)
+ self.object_by_handle[struct.hObjectNew] = NVAllocation(self.object_by_handle[struct.hObjectParent], params.size, is_signal)
elif struct.hClass == nv_gpu.KEPLER_CHANNEL_GROUP_A:
assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVGPU)
struct.hObjectNew = self._alloc_handle()
@@ -206,7 +210,6 @@ class NVDriver(VirtDriver):
def ctl_ioctl(self, req, argp):
nr = req & 0xff
if nr == nv_gpu.NV_ESC_RM_ALLOC: return self.rm_alloc(argp)
- elif nr == nv_gpu.NV_ESC_RM_ALLOC_MEMORY: pass
elif nr == nv_gpu.NV_ESC_RM_CONTROL: return self.rm_control(argp)
elif nr == nv_gpu.NV_ESC_RM_MAP_MEMORY:
st:Any = nv_gpu.nv_ioctl_nvos33_parameters_with_fd.from_address(argp)
@@ -215,6 +218,10 @@ class NVDriver(VirtDriver):
file = self.opened_fds[st.fd]
assert isinstance(file, NVDevFileDesc)
file._mapping_userland = True
+ elif isinstance(obj, NVAllocation) and obj.is_signal:
+ file = self.opened_fds[st.fd]
+ assert isinstance(file, NVDevFileDesc)
+ file._mapping_signal = True
elif nr == nv_gpu.NV_ESC_RM_FREE:
st = nv_gpu.NVOS00_PARAMETERS.from_address(argp)
self.object_by_handle.pop(st.hObjectOld)
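The two hunks above are the NV equivalent of the AMD flag check: rm_alloc records is_signal on the NVAllocation when the allocated class is NV1_MEMORY_SYSTEM (which the diff's comment identifies as the class used for signal memory), and the RM_MAP_MEMORY path looks the handle up again and marks the fd either for userland MMIO, as before, or for a signal allocation. A sketch of that lookup and dispatch; the registry, names and class value are illustrative.

# Sketch of the lookup/dispatch above (illustrative names and values).
import collections

Allocation = collections.namedtuple('Allocation', ['device', 'size', 'is_signal'])
UserMode = collections.namedtuple('UserMode', ['subdevice'])
MEMORY_SYSTEM_CLASS = 0x3e                   # illustrative class id for host memory

def rm_alloc(device, size, hclass):
  # Classification happens once, at allocation time, and rides on the object.
  return Allocation(device, size, is_signal=(hclass == MEMORY_SYSTEM_CLASS))

object_by_handle = {1: UserMode(subdevice=0),
                    2: rm_alloc("gpu0", 4096, MEMORY_SYSTEM_CLASS),
                    3: rm_alloc("gpu0", 4096, hclass=0)}

def should_track_mapping(handle):
  obj = object_by_handle[handle]
  if isinstance(obj, UserMode): return True              # userland MMIO pages
  return isinstance(obj, Allocation) and obj.is_signal   # host signal pages

print([h for h in object_by_handle if should_track_mapping(h)])   # [1, 2]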
@@ -256,12 +263,26 @@ class NVDriver(VirtDriver):
else: raise RuntimeError(f"Unknown {nr} to nvidia-uvm")
return 0
- def dev_ioctl(self, dev, req, argp): return 0
+ def dev_ioctl(self, dev, req, argp):
+ nr = req & 0xff
+ # Handle NV_ESC_RM_ALLOC_MEMORY for host/signal memory
+ if nr == nv_gpu.NV_ESC_RM_ALLOC_MEMORY:
+ st:Any = nv_gpu.nv_ioctl_nvos02_parameters_with_fd.from_address(argp)
+ # Track host memory (signal memory) - progress queues when written to
+ if st.params.hClass == nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR:
+ self.track_address(st.params.pMemory, st.params.pMemory + st.params.limit + 1,
+ lambda mv,off: None, lambda mv, off: self._gpu_mmio_write(mv, off, None))
+ return 0
def _gpu_mmio_write(self, mv, off, gpu):
- any_progress = True
- while any_progress:
- any_progress = False
- for gpu in self.gpus.values():
- for q in gpu.queues:
- if q.ctrl.GPGet != q.ctrl.GPPut:
- any_progress |= q.execute()
+ if self._executing: return # prevent re-entrancy
+ self._executing = True
+ try:
+ any_progress = True
+ while any_progress:
+ any_progress = False
+ for gpu in self.gpus.values():
+ for q in gpu.queues:
+ if q.ctrl.GPGet != q.ctrl.GPPut:
+ any_progress |= q.execute()
+ finally:
+ self._executing = False
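Finally, dev_ioctl stops being a no-op: the NV_ESC_RM_ALLOC_MEMORY handling moves here from the removed pass branch in ctl_ioctl and installs a write hook over OS-descriptor host memory spanning pMemory through pMemory + limit, and _gpu_mmio_write gains the same re-entrancy guard as the AMD path while pumping every GPFIFO whose GPGet pointer lags GPPut. A self-contained sketch of that GPGet/GPPut pump:

# Sketch of the GPFIFO pump: GPPut is how far the host has pushed, GPGet is how
# far the mock GPU has executed; pumping runs until every queue has caught up.
class ToyGPFifo:
  def __init__(self): self.GPGet, self.GPPut = 0, 0
  def push(self, n=1): self.GPPut += n   # host submits n entries
  def execute(self):
    if self.GPGet == self.GPPut: return 0
    self.GPGet += 1                      # retire one entry
    return 1

def mmio_write_pump(queues):
  any_progress = True
  while any_progress:
    any_progress = False
    for q in queues:
      if q.GPGet != q.GPPut:
        any_progress |= q.execute() > 0

fifos = [ToyGPFifo(), ToyGPFifo()]
fifos[0].push(3); fifos[1].push(1)
mmio_write_pump(fifos)
print([(q.GPGet, q.GPPut) for q in fifos])   # [(3, 3), (1, 1)]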