From 2d5c24879f8a7a1e8849a04247d3a4402b9484bd Mon Sep 17 00:00:00 2001
From: nimlgen <138685161+nimlgen@users.noreply.github.com>
Date: Thu, 29 Jan 2026 20:06:01 +0300
Subject: [PATCH] nv: pma for 5090 (#14420)

* nv: pma for 5090

* hm

* 4090
---
 extra/nv_gpu_driver/nv_ioctl.py |  2 +-
 tinygrad/runtime/ops_nv.py      | 38 ++++++++++++++++++++++-----------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/extra/nv_gpu_driver/nv_ioctl.py b/extra/nv_gpu_driver/nv_ioctl.py
index d4be7c5589..a0148cde97 100644
--- a/extra/nv_gpu_driver/nv_ioctl.py
+++ b/extra/nv_gpu_driver/nv_ioctl.py
@@ -202,7 +202,7 @@ def ioctl(fd, request, argp):
         if s.hClass == nv_gpu.NV1_MEMORY_SYSTEM: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
         if s.hClass == nv_gpu.GT200_DEBUGGER: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV83DE_ALLOC_PARAMETERS))
         if s.hClass == nv_gpu.MAXWELL_PROFILER_DEVICE: dump_struct(get_struct(s.pAllocParms, nv_gpu.NVB2CC_ALLOC_PARAMETERS))
-        if s.hClass == nv_gpu.AMPERE_CHANNEL_GPFIFO_A:
+        if s.hClass in {nv_gpu.AMPERE_CHANNEL_GPFIFO_A, nv_gpu.BLACKWELL_CHANNEL_GPFIFO_A}:
           sx = get_struct(s.pAllocParms, nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS)
           dump_struct(sx)
           gpus_fifo.append((sx.gpFifoOffset, sx.gpFifoEntries))
diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index 69864f689e..24070492a6 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -739,7 +739,7 @@ class NVDevice(HCQCompiled[NVSignal]):
     raise RuntimeError("\n".join(report))
 
   def _prof_init(self):
-    assert not self.is_nvd() and self.iface.compute_class is nv_gpu.ADA_COMPUTE_A, "not supported for PMA profiling"
+    assert not self.is_nvd()
 
     self.profiler = self.iface.rm_alloc(self.subdevice, nv_gpu.MAXWELL_PROFILER_DEVICE,
       nv_gpu.NVB2CC_ALLOC_PARAMETERS(hClientTarget=self.iface.root, hContextTarget=self.channel_group))
@@ -764,13 +764,15 @@ class NVDevice(HCQCompiled[NVSignal]):
     self._prof_setup_pc_sampling()
 
   def _prof_setup_pc_sampling(self):
-    PMASYS_BASE, PMAGPC_BASE, GR_GPC_BASE, GPC_BASE = 0x24a000, 0x244000, 0x419000, 0x180000
+    is_bw = self.iface.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A
+    PMASYS_BASE, PMAGPC_BASE, GR_GPC_BASE, GPC_BASE = (0x2b1000, 0x2b0000, 0x424000, 0x200000) if is_bw else (0x24a000, 0x244000, 0x419800, 0x180000)
 
     tpc_masks = [m for i in range(self.num_gpcs) if (m:=self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_GR_GET_TPC_MASK,
       nv_gpu.NV2080_CTRL_GR_GET_TPC_MASK_PARAMS(gpcId=i)).tpcMask) > 0]
+    tpc_cnt = [bin(mask).count('1') for mask in tpc_masks]
 
     # enables pma on gpc
-    self.reg_ops(*[(PMAGPC_BASE + gpc * 0x200, 0x100, 0x100) for gpc in range(len(tpc_masks))])
+    if not is_bw: self.reg_ops(*[(PMAGPC_BASE + gpc * 0x200, 0x100, 0x100) for gpc in range(len(tpc_masks))])
 
     # sets streaming bw for each gpc
     hs = nv_gpu.struct_NVB0CC_CTRL_HS_CREDITS_PARAMS(pmaChannelIdx=0, numEntries=len(tpc_masks))
@@ -779,20 +781,30 @@ class NVDevice(HCQCompiled[NVSignal]):
         chipletType=nv_gpu.NVB0CC_CHIPLET_TYPE_GPC, chipletIndex=i, numCredits=bin(mask).count('1'))
     self.iface.rm_control(self.profiler, nv_gpu.NVB0CC_CTRL_CMD_SET_HS_CREDITS, hs)
 
-    self.reg_ops(*[(PMASYS_BASE + 0x65c + off * 4, 0xffffffff) for off in range(self.num_gpcs * 2)])
-    self.reg_ops((PMASYS_BASE + 0x620, 0x2000007))
+    if is_bw:
+      # enables pma on gpcs
+      self.reg_ops(*[op for i in range(3) for op in [(PMASYS_BASE + 0x128 + i*8, 480), (PMASYS_BASE + 0x12c + i*8, 0x80000000)]])
+      self.reg_ops((PMAGPC_BASE + 0xa24, 0x04000001), (PMAGPC_BASE + 0xa10, 0x80000002))
+      self.reg_ops(*[(GPC_BASE + gpc * 0x4000 + 0x200 + tpc * 0x200 + reg, 0)
+                     for gpc in range(len(tpc_masks)) for tpc in range(tpc_cnt[gpc]) for reg in [0x100, 0x108, 0x110, 0x120]])
 
-    # tpc addressing is right aligned
-    tpc_cnt = [bin(mask).count('1') for mask in tpc_masks]
-    def SM_REG(gpc, tpc, sm, reg): return GPC_BASE + gpc * 0x4000 + (self.num_tpc_per_gpc - tpc_cnt[gpc] + tpc) * 0x200 + [0x400, 0x1000][sm] + reg
+      def SM_REG(gpc, tpc, sm, reg): return GPC_BASE + gpc * 0x4000 + 0x800 + (tpc * self.num_sm_per_tpc + sm) * 0x200 + reg
+    else:
+      self.reg_ops(*[(PMASYS_BASE + 0x65c + off * 4, 0xffffffff) for off in range(self.num_gpcs * 2)])
+      self.reg_ops((PMASYS_BASE + 0x620, 0x2000007))
 
-    self.reg_ops(*[op for gpc in range(len(tpc_masks)) for tpc in range(tpc_cnt[gpc]) for sm in range(2) for op in [
-      (SM_REG(gpc, tpc, sm, 0x128), (gpc << 5) | (tpc << 1) | sm), # enumeration. NOTE: different from cuda
-      (SM_REG(gpc, tpc, sm, 0x40), 0x19181716), (SM_REG(gpc, tpc, sm, 0x48), 0x1d1c1b1a), (SM_REG(gpc, tpc, sm, 0x50), 0x1e201f), # unk, counters?
-      (SM_REG(gpc, tpc, sm, 0xec), 0x1), (SM_REG(gpc, tpc, sm, 0x6c), 0x2), (SM_REG(gpc, tpc, sm, 0x9c), 0x5), (SM_REG(gpc, tpc, sm, 0x108), 0x20)]])
+      def SM_REG(gpc, tpc, sm, reg): return GPC_BASE + gpc * 0x4000 + (self.num_tpc_per_gpc - tpc_cnt[gpc] + tpc) * 0x200 + [0x400, 0x1000][sm] + reg
 
     # enable pc sampling for the context
-    self.reg_ops((GR_GPC_BASE + 0xbdc, 0x1), reg_type=1)
+    self.reg_ops((GR_GPC_BASE + 0x304, 0x80808a))
+
+    # sm config and enable
+    self.reg_ops(*[op for gpc in range(len(tpc_masks)) for tpc in range(tpc_cnt[gpc]) for sm in range(self.num_sm_per_tpc) for op in [
+      (SM_REG(gpc, tpc, sm, 0x128), (gpc << 5) | (tpc << 1) | sm), # enumeration. NOTE: different from cuda
+      (SM_REG(gpc, tpc, sm, 0x40), 0x19181716), (SM_REG(gpc, tpc, sm, 0x48), 0x1d1c1b1a), (SM_REG(gpc, tpc, sm, 0x50), 0x1e201f), # unk, counters?
+      (SM_REG(gpc, tpc, sm, 0xec), 0x1), (SM_REG(gpc, tpc, sm, 0x6c), 0x2), (SM_REG(gpc, tpc, sm, 0x9c), 0x5),
+      (SM_REG(gpc, tpc, sm, 0x108), 0xa0 if is_bw else 0x20), *([(SM_REG(gpc, tpc, sm, 0x120), 0x100000)] if is_bw else [])]])
+    self.reg_ops((GR_GPC_BASE + 0x3dc, 0x1), reg_type=1)
 
   def reg_ops(self, *ops, reg_type=0, op=nv_gpu.NV2080_CTRL_GPU_REG_OP_WRITE_32):
     for i in range(0, len(ops), 124):