mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 23:18:04 -05:00
amd: replace AMD_DRIVERLESS with AMD_IFACE (#10116)
* amd: replace AMD_DRIVERLESS with AMD_IFACE * docs * print direct err for amd_iface * print for all
This commit is contained in:
@@ -5,7 +5,7 @@ tinygrad supports various runtimes, enabling your code to scale across a wide ra
|
||||
| Runtime | Description | Requirements |
|
||||
|---------|-------------|--------------|
|
||||
| [NV](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_nv.py) | Provides acceleration for NVIDIA GPUs | Ampere/Ada series GPUs |
|
||||
| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | RDNA2/RDNA3/RDNA4 series GPUs |
|
||||
| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | RDNA2/RDNA3/RDNA4 series GPUs. You can select one of the interfaces for communication by setting `AMD_IFACE=(KFD|PCI)`. See [AMD interfaces](#amd-interfaces) for more details. |
|
||||
| [QCOM](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_qcom.py) | Provides acceleration for QCOM GPUs | 6xx series GPUs |
|
||||
| [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | M1+ Macs; Metal 3.0+ for `bfloat` support |
|
||||
| [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | NVIDIA GPU with CUDA support |
|
||||
@@ -64,3 +64,11 @@ rawbuf_ptr = to_mv(cl_buf_desc_ptr, 0x100).cast('Q')[20] # offset 0xA0 is a raw
|
||||
# create tiny tensor
|
||||
tiny = Tensor.from_blob(rawbuf_ptr, (h*w*4,), dtype=dtypes.imagef((h,w)), device='QCOM')
|
||||
```
|
||||
|
||||
## AMD Interfaces
|
||||
The AMD backend supports several interfaces for communicating with devices:
|
||||
|
||||
* `KFD`: uses the amdgpu driver
|
||||
* `PCI`: uses the [AM driver](developer/am.md)
|
||||
|
||||
You can force an interface by setting `AMD_IFACE` to one of these values. Note that `AMD_IFACE=PCI` may unbind your GPU from the amdgpu driver.
|
||||
|
||||
@@ -297,7 +297,7 @@ class AMDComputeQueue(HWQueue):
|
||||
self.release_mem(signal.value_addr, value, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
|
||||
self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
|
||||
|
||||
if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
|
||||
if (dev:=signal.timeline_for_device) is not None and not dev.is_am():
|
||||
self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
|
||||
self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
|
||||
return self
|
||||
@@ -353,10 +353,10 @@ class AMDCopyQueue(HWQueue):
|
||||
fence_flags = self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3) if self.dev.target >= (10,0,0) else 0
|
||||
self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(signal.value_addr), value)
|
||||
|
||||
if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
|
||||
if (dev:=signal.timeline_for_device) is not None and not dev.is_am():
|
||||
self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
|
||||
self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
|
||||
elif AMDDevice.driverless: self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))
|
||||
elif dev is not None and dev.is_am(): self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))
|
||||
|
||||
return self
|
||||
|
||||
@@ -372,7 +372,7 @@ class AMDCopyQueue(HWQueue):
|
||||
return self
|
||||
|
||||
def bind(self, dev:AMDDevice):
|
||||
if not getenv("AMD_SDMA_BIND", 0) or not dev.driverless: return
|
||||
if not getenv("AMD_SDMA_BIND", 0) or not dev.is_am(): return
|
||||
|
||||
self.binded_device = dev
|
||||
self.hw_page = dev.allocator.alloc((qsz:=round_up(len(self._q), 8)) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
|
||||
@@ -496,7 +496,7 @@ class AMDQueueDesc:
|
||||
if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
|
||||
|
||||
# Flush hdp if queue is in dev mem.
|
||||
if dev.driverless and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp()
|
||||
if dev.is_am() and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp()
|
||||
for doorbell in self.doorbells: doorbell[0] = self.put_value
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -798,11 +798,18 @@ class AMDDevice(HCQCompiled):
|
||||
signal_pages: ClassVar[list[HCQBuffer]] = []
|
||||
signal_pool: ClassVar[list[HCQBuffer]] = []
|
||||
|
||||
driverless:bool = not FileIOInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))
|
||||
def is_am(self) -> bool:
  """Return True when this device's selected interface (`self.dev_iface`) is a `PCIIface`."""
  using_pci_iface = isinstance(self.dev_iface, PCIIface)
  return using_pci_iface
|
||||
|
||||
def _select_iface(self):
  """Construct and return the first usable interface for this AMD device.

  When the `AMD_IFACE` environment variable is non-empty, only the module-level class named
  `f"{AMD_IFACE}Iface"` is tried. Otherwise `KFDIface` and then `PCIIface` are tried in that order.
  Each candidate is constructed as `iface_t(self, self.device_id)`; the first one that does not
  raise is returned.

  Raises:
    RuntimeError: if `AMD_IFACE` names an interface class that does not exist in this module, or
      if every candidate failed to construct (the message lists each candidate's error).
  """
  if (nm:=getenv("AMD_IFACE", "")):
    # Resolve the requested class up front so a typo gives a clear error instead of a bare AttributeError.
    iface_t = getattr(sys.modules[__name__], f"{nm}Iface", None)
    if iface_t is None: raise RuntimeError(f"Unknown AMD_IFACE={nm!r}: no {nm}Iface interface is defined")
    candidates = (iface_t,)
  else: candidates = (KFDIface, PCIIface)

  errs:list[str] = []
  for iface_t in candidates:
    try: return iface_t(self, self.device_id)
    # Any failure is recorded rather than propagated so the next candidate still gets a chance.
    except Exception as e: errs.append(f"\n{iface_t.__name__}: {type(e).__name__}: {e}")
  raise RuntimeError(f"Cannot find a usable interface for AMD:{self.device_id}:{''.join(errs)}")
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
||||
self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id)
|
||||
self.dev_iface = self._select_iface()
|
||||
self.target:tuple[int, ...] = ((trgt:=self.dev_iface.props['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100)
|
||||
self.arch = "gfx%d%x%x" % self.target
|
||||
if self.target < (9,4,2) or self.target >= (13,0,0): raise RuntimeError(f"Unsupported arch: {self.arch}")
|
||||
@@ -859,8 +866,8 @@ class AMDDevice(HCQCompiled):
|
||||
self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
|
||||
if self.sqtt_enabled:
|
||||
if self.arch != 'gfx1100': raise RuntimeError('SQ Thread Tracing is only supported on 7900XTX')
|
||||
if not self.driverless and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000:
|
||||
raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use driverless or add "
|
||||
if not self.is_am() and (ppfeaturemask:=int(FileIOInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16))&0x8000:
|
||||
raise RuntimeError("SQTT can't be enabled because of hardware bug, to workaround either use AMD_IFACE=PCI or add "
|
||||
f"ppfeaturemask={(ppfeaturemask&~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n"
|
||||
"For more information read https://github.com/tinygrad/tinygrad/blob/master/extra/sqtt/README.md")
|
||||
SQTT_BUFFER_SIZE = getenv("SQTT_BUFFER_SIZE", 256) # in mb, per shader engine
|
||||
|
||||
Reference in New Issue
Block a user