From c2504357afbc3caf77e06249f190781df70d3e4e Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Mon, 13 Jan 2025 23:53:13 +0300 Subject: [PATCH] am: lock to access dev (#8594) * amm lock to access dev * wording * just works * disbale --- tinygrad/runtime/support/am/amdev.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tinygrad/runtime/support/am/amdev.py b/tinygrad/runtime/support/am/amdev.py index a070ba0b99..aaf4a4ab74 100644 --- a/tinygrad/runtime/support/am/amdev.py +++ b/tinygrad/runtime/support/am/amdev.py @@ -1,6 +1,6 @@ from __future__ import annotations -import ctypes, collections, time, dataclasses, pathlib -from tinygrad.helpers import to_mv, mv_address, getenv, round_up, DEBUG +import ctypes, collections, time, dataclasses, pathlib, fcntl, os, signal +from tinygrad.helpers import to_mv, mv_address, getenv, round_up, DEBUG, temp from tinygrad.runtime.autogen.am import am, mp_11_0, mp_13_0_0, nbio_4_3_0, mmhub_3_0_0, gc_11_0_0, osssys_6_0_0 from tinygrad.runtime.support.allocator import TLSFAllocator from tinygrad.runtime.support.am.ip import AM_SOC21, AM_GMC, AM_IH, AM_PSP, AM_SMU, AM_GFX, AM_SDMA @@ -255,6 +255,15 @@ class AMDev: self.pcidev, self.devfmt = pcidev, devfmt self.vram, self.doorbell64, self.mmio = vram_bar, doorbell_bar, mmio_bar + os.umask(0) # Set umask to 0 to allow creating files with 0666 permissions + + # Avoid O_CREAT because we don’t want to re-create/replace an existing file (triggers extra perms checks) when opening as non-owner. + if os.path.exists(lock_name:=temp(f"am_{self.devfmt}.lock")): self.lock_fd = os.open(lock_name, os.O_RDWR) + else: self.lock_fd = os.open(lock_name, os.O_RDWR | os.O_CREAT, 0o666) + + try: fcntl.flock(self.lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + except OSError: raise RuntimeError(f"Failed to open AM device {self.devfmt}. It's already in use.") + self._run_discovery() self._build_regs() @@ -284,14 +293,17 @@ class AMDev: self.sdma:AM_SDMA = AM_SDMA(self) if self.partial_boot and (self.reg("regCP_MEC_RS64_CNTL").read() & gc_11_0_0.CP_MEC_RS64_CNTL__MEC_HALT_MASK == 0): - print(f"am {self.devfmt}: MEC is active. Someone might be using the GPU? Issue a full reset.") + if DEBUG >= 2: print(f"am {self.devfmt}: MEC is active. Issue a full reset.") self.partial_boot = False if not self.partial_boot: - if self.psp.is_sos_alive(): self.smu.mode1_reset() - for ip in [self.soc21, self.gmc, self.ih, self.psp, self.smu]: - ip.init() - if DEBUG >= 2: print(f"am {self.devfmt}: {ip.__class__.__name__} initialized") + try: # do not interrupt the boot process + signal.signal(signal.SIGINT, signal.SIG_IGN) + if self.psp.is_sos_alive(): self.smu.mode1_reset() + for ip in [self.soc21, self.gmc, self.ih, self.psp, self.smu]: + ip.init() + if DEBUG >= 2: print(f"am {self.devfmt}: {ip.__class__.__name__} initialized") + finally: signal.signal(signal.SIGINT, signal.default_int_handler) # Booting done self.is_booting = False