From 798970cfad3fd2e56a9b929cc686302495836a02 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Tue, 12 Mar 2024 19:23:23 +0300 Subject: [PATCH] fix gpu hangs when exiting while aql queues are executing (#3700) --- tinygrad/runtime/ops_hsa.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tinygrad/runtime/ops_hsa.py b/tinygrad/runtime/ops_hsa.py index 361679b1ad..5c76a1c3ed 100644 --- a/tinygrad/runtime/ops_hsa.py +++ b/tinygrad/runtime/ops_hsa.py @@ -155,7 +155,7 @@ class HSADevice(Compiled): def __init__(self, device:str=""): if not HSADevice.agents: check(hsa.hsa_init()) - atexit.register(lambda: hsa.hsa_shut_down()) + atexit.register(hsa_terminate) HSADevice.agents = scan_agents() HSADevice.cpu_agent = HSADevice.agents[hsa.HSA_DEVICE_TYPE_CPU][0] HSADevice.cpu_mempool = find_memory_pool(HSADevice.cpu_agent, segtyp=hsa.HSA_AMD_SEGMENT_GLOBAL, location=hsa.HSA_AMD_MEMORY_POOL_LOCATION_CPU) @@ -222,3 +222,10 @@ class HSADevice(Compiled): self.kernarg_pool_sz: int = sz def flush_hdp(self): self.hdp_flush.HDP_MEM_FLUSH_CNTL[0] = 1 + +def hsa_terminate(): + # The aql queue must be stopped/deleted before hsa shut down; otherwise the gpu hangs. + for dev in HSADevice.devices: + setattr(dev, 'synchronize', lambda: None) # some destructors may still try to sync, but hw_queue is removed, so make it a no-op. + del dev.hw_queue + hsa.hsa_shut_down()