system: fix flock on pcidevs (#13123)

* system: fix locking of hcq devices

* rename and fullrun

* force ok

* fix

* fix
This commit is contained in:
nimlgen
2025-11-06 19:02:13 +08:00
committed by GitHub
parent 3126c89b84
commit 05e2ff4d87
6 changed files with 55 additions and 52 deletions

View File

@@ -1,39 +0,0 @@
import subprocess
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
def run_test(i, full_run=False):
  """Spawn one `TestTiny.test_plus` subprocess and either kill it early or check that it passed.

  Args:
    i: iteration number, used only for the progress message.
    full_run: when false, the child is killed after a random delay of up to 1.2 seconds and
      its output is discarded. When true, the child runs to completion, its stderr is printed,
      and it must contain the unittest success markers.

  Raises:
    AssertionError: on a full run whose stderr lacks "Ran 1 test in" or "OK".
  """
  print(f"\rRunning iteration {i}...", end=" ", flush=True)
  proc = subprocess.Popen(['python3', 'test/test_tiny.py', 'TestTiny.test_plus'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

  if not full_run:
    # interrupt the child at a random point, then reap it so the pipes are drained and closed
    time.sleep(random.uniform(0, 1200) / 1000)
    proc.kill()
    proc.communicate()
    return

  # full run: let the child finish and validate what unittest wrote to stderr
  _, stderr = proc.communicate()
  stderr_text = stderr.decode()
  print(stderr_text)
  assert "Ran 1 test in" in stderr_text and "OK" in stderr_text
# Driver loop: keep a thread pool busy with runs that get killed mid-flight, and on every
# 100th iteration (including the first) drain the pool and do one complete run in this thread.
max_workers = 4
with ThreadPoolExecutor(max_workers=max_workers) as executor:
  futures = []
  for i in range(1000000):
    if i % 100 != 0:
      # ordinary iteration: queue a run that will be killed at a random moment
      futures.append(executor.submit(run_test, i, False))
    else:
      # checkpoint: wait for everything still tracked and report (but do not propagate) failures
      for future in as_completed(futures):
        try:
          future.result()
        except Exception as e:
          print(f"\nError in iteration: {e}")
      futures = []

      # with the pool idle, a full run must pass
      run_test(i, True)

    # drop finished futures once the list grows past twice the pool size
    if len(futures) > max_workers * 2:
      futures = [f for f in futures if not f.done()]

44
test/external/external_fuzz_hcq_mp.py vendored Normal file
View File

@@ -0,0 +1,44 @@
import subprocess
import random
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from tinygrad.helpers import getenv
# checks that HCQ drivers can be killed during operation without causing issues
def run_test(i, full_run=False, force_ok=False):
  """Spawn one `TestTiny.test_plus` subprocess and either kill it early or check its outcome.

  Args:
    i: iteration number, used only for the progress message.
    full_run: when false, the child is killed after a random delay of up to 1.2 seconds and
      its output is discarded. When true, the child runs to completion and its stderr is checked.
    force_ok: only meaningful for a full run. When false, a stderr containing
      "Failed to take lock file" is also accepted; when true, only a passing test is accepted.

  Raises:
    AssertionError: on a full run whose stderr matches none of the accepted outcomes; the
      captured stderr text is used as the assertion message.
  """
  print(f"\rRunning iteration {i}...", end=" ", flush=True)
  cmd = ["python3", "test/test_tiny.py", "TestTiny.test_plus"]
  proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

  if full_run:
    _, stderr = proc.communicate()
    stderr_text = stderr.decode()
    test_passed = "Ran 1 test in" in stderr_text and "OK" in stderr_text
    lock_refused = "Failed to take lock file" in stderr_text
    assert test_passed or (not force_ok and lock_refused), stderr_text
    return

  # interrupt the child at a random point, then reap it so the pipes are drained and closed
  time.sleep(random.uniform(0, 1200) / 1000.0)
  proc.kill()
  proc.communicate()
if __name__ == "__main__":
  # Driver: keep a process pool busy with test runs, and on every 100th iteration (including
  # the first) drain the pool and do one complete run in this process that must pass.
  max_workers = getenv("MAX_WORKERS", 4)
  with ProcessPoolExecutor(max_workers=max_workers) as executor:
    futures = []
    for i in range(1000000):
      if i % 100 != 0:
        # ordinary iteration: FULL_RUN selects whether workers run to completion or get killed
        worker_full_run = bool(getenv("FULL_RUN", 0))
        futures.append(executor.submit(run_test, i, worker_full_run))
      else:
        # checkpoint: wait for everything still tracked and report (but do not propagate) failures
        for pending in as_completed(futures):
          try:
            pending.result()
          except Exception as e:
            print(f"\nError in iteration: {e}")
        futures = []

        # with the pool drained, a full run in the main process must pass (force_ok)
        run_test(i, True, force_ok=True)

      # drop finished futures once the list grows past twice the pool size
      if len(futures) > max_workers * 2:
        futures = [f for f in futures if not f.done()]

View File

@@ -831,7 +831,7 @@ class PCIIface(PCIIfaceBase):
class USBIface(PCIIface):
def __init__(self, dev, dev_id): # pylint: disable=super-init-not-called
self.dev, self.pci_dev = dev, USBPCIDevice(f"usb:{dev_id}", bars=[0, 2, 5])
self.dev, self.pci_dev = dev, USBPCIDevice(dev.__class__.__name__[:2], f"usb:{dev_id}", bars=[0, 2, 5])
self._setup_adev(self.pci_dev, dma_regions=[(0x200000, self.pci_dev.dma_view(0xf000, 0x80000))])
self.pci_dev.usb._pci_cacheable += [(self.pci_dev.bar_info[2].addr, self.pci_dev.bar_info[2].size)] # doorbell region is cacheable

View File

@@ -1,11 +1,11 @@
from __future__ import annotations
import ctypes, collections, dataclasses, functools, os, hashlib, array
import ctypes, collections, dataclasses, functools, hashlib, array
from tinygrad.helpers import mv_address, getenv, DEBUG, fetch
from tinygrad.runtime.autogen.am import am
from tinygrad.runtime.support.hcq import MMIOInterface
from tinygrad.runtime.support.amd import AMDReg, import_module, import_asic_regs
from tinygrad.runtime.support.memory import TLSFAllocator, MemoryManager
from tinygrad.runtime.support.system import System, PCIDevice, PCIDevImplBase
from tinygrad.runtime.support.system import PCIDevice, PCIDevImplBase
from tinygrad.runtime.support.am.ip import AM_SOC, AM_GMC, AM_IH, AM_PSP, AM_SMU, AM_GFX, AM_SDMA
AM_DEBUG = getenv("AM_DEBUG", 0)
@@ -122,8 +122,6 @@ class AMDev(PCIDevImplBase):
self.pci_dev, self.devfmt, self.dma_regions = pci_dev, pci_dev.pcibus, dma_regions
self.vram, self.doorbell64, self.mmio = self.pci_dev.map_bar(0), self.pci_dev.map_bar(2, fmt='Q'), self.pci_dev.map_bar(5, fmt='I')
self.lock_fd = System.flock_acquire(f"am_{self.devfmt}.lock")
self._run_discovery()
self._build_regs()
@@ -190,7 +188,6 @@ class AMDev(PCIDevImplBase):
for ip in [self.sdma, self.gfx]: ip.fini_hw()
self.smu.set_clocks(level=0)
self.ih.interrupt_handler()
os.close(self.lock_fd)
def paddr2mc(self, paddr:int) -> int: return self.gmc.mc_base + paddr

View File

@@ -73,8 +73,6 @@ class NVDev(PCIDevImplBase):
def __init__(self, pci_dev:PCIDevice):
self.pci_dev, self.devfmt, self.mmio = pci_dev, pci_dev.pcibus, pci_dev.map_bar(0, fmt='I')
self.lock_fd = System.flock_acquire(f"nv_{self.devfmt}.lock")
self.smi_dev, self.is_booting = False, True
self._early_init()

View File

@@ -165,7 +165,8 @@ class _System:
System = _System()
class PCIDevice:
def __init__(self, pcibus:str, bars:list[int], resize_bars:list[int]|None=None):
def __init__(self, devpref:str, pcibus:str, bars:list[int], resize_bars:list[int]|None=None):
self.lock_fd = System.flock_acquire(f"{devpref.lower()}_{pcibus.lower()}.lock")
self.pcibus, self.irq_poller = pcibus, None
if FileIOInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
@@ -215,7 +216,8 @@ class PCIDevice:
def reset(self): os.system(f"sudo sh -c 'echo 1 > /sys/bus/pci/devices/{self.pcibus}/reset'")
class APLPCIDevice(PCIDevice):
def __init__(self, pcibus:str, bars:list[int], resize_bars:list[int]|None=None):
def __init__(self, devpref:str, pcibus:str, bars:list[int], resize_bars:list[int]|None=None):
self.lock_fd = System.flock_acquire(f"{devpref.lower()}_{pcibus.lower()}.lock")
self.pcibus, self.bars = pcibus, {b: System.iokit_pci_memmap(b) for b in bars}
self.bar_info = {b:PCIBarInfo(0, self.bars[b].nbytes-1 if b in self.bars else 0) for b in range(6)} # NOTE: fake bar info for nv.
def map_bar(self, bar:int, off:int=0, addr:int=0, size:int|None=None, fmt='B') -> MMIOInterface: return self.bars[bar].view(off, size, fmt)
@@ -224,7 +226,8 @@ class APLPCIDevice(PCIDevice):
def reset(self): System.iokit_pci_rpc(__TinyGPURPCReset:=2)
class USBPCIDevice(PCIDevice):
def __init__(self, pcibus:str, bars:list[int], resize_bars:list[int]|None=None):
def __init__(self, devpref:str, pcibus:str, bars:list[int], resize_bars:list[int]|None=None):
self.lock_fd = System.flock_acquire(f"{devpref.lower()}_{pcibus.lower()}.lock")
self.usb = ASM24Controller()
self.pcibus, self.bar_info = pcibus, System.pci_setup_usb_bars(self.usb, gpu_bus=4, mem_base=0x10000000, pref_mem_base=(32 << 30))
def map_bar(self, bar, off=0, addr=0, size=None, fmt='B'):
@@ -247,7 +250,7 @@ class LNXPCIIfaceBase:
# Acquire va range to avoid collisions.
FileIOInterface.anon_mmap(va_start, va_size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE | MAP_FIXED, 0)
self.pci_dev, self.dev, self.vram_bar = PCIDevice(cls.gpus[dev_id], bars=bars, resize_bars=[vram_bar]), dev, vram_bar
self.pci_dev, self.dev, self.vram_bar = PCIDevice(dev.__class__.__name__[:2], cls.gpus[dev_id], bars=bars, resize_bars=[vram_bar]), dev, vram_bar
self.p2p_base_addr = self.pci_dev.bar_info[vram_bar].addr
def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, force_devmem=False, **kwargs) -> HCQBuffer:
@@ -281,7 +284,7 @@ class LNXPCIIfaceBase:
class APLPCIIfaceBase(LNXPCIIfaceBase):
def __init__(self, dev, dev_id, vendor, devices, bars, vram_bar, va_start, va_size):
self.pci_dev, self.dev, self.vram_bar = APLPCIDevice(pcibus=f'usb4:{dev_id}', bars=bars), dev, vram_bar
self.pci_dev, self.dev, self.vram_bar = APLPCIDevice(dev.__class__.__name__[:2], pcibus=f'usb4:{dev_id}', bars=bars), dev, vram_bar
def map(self, b:HCQBuffer): raise RuntimeError(f"map failed: {b.owner} -> {self.dev}")
PCIIfaceBase:type = APLPCIIfaceBase if OSX else LNXPCIIfaceBase