lds bank count tests from pmc counters (#13667)

* lds bank count tests from pmc counters

* these tests run on the RDNA3 card too

* rename duration to cycles, other rename comment

* add SQ_LDS_IDX_ACTIVE to gfx9 defaults
This commit is contained in:
qazal
2025-12-13 04:39:32 -05:00
committed by GitHub
parent a6dfd8a672
commit 019e71f8ca
4 changed files with 43 additions and 4 deletions

View File

@@ -1,5 +1,4 @@
import ctypes, pathlib, argparse, pickle, re, functools, dataclasses, itertools, threading
from tabulate import tabulate
from typing import Generator
from tinygrad.helpers import temp, unwrap, DEBUG
from tinygrad.device import ProfileEvent, ProfileDeviceEvent, ProfileProgramEvent
@@ -161,6 +160,7 @@ def decode(profile:list[ProfileEvent]) -> _ROCParseCtx:
def print_pmc(events:list[ProfilePMCEvent]) -> None:
from tinygrad.viz.serve import unpack_pmc
from tabulate import tabulate
for e in events:
print("**", e.kern)
data = unpack_pmc(e)

View File

@@ -6,8 +6,11 @@ import unittest
import functools, contextlib
import numpy as np
from tinygrad import Tensor, Context, Device
from tinygrad.uop.ops import UOp, KernelInfo, AxisType
from tinygrad.dtype import dtypes, AddrSpace
from tinygrad.uop.ops import UOp, Ops, KernelInfo, AxisType
from tinygrad.runtime.ops_amd import ProfilePMCEvent
from tinygrad.engine.realize import get_runner
from tinygrad.viz.serve import unpack_pmc
from extra.sqtt.roc import print_pmc
def copy_kernel(B, A, stride=1):
@@ -19,6 +22,16 @@ def copy_kernel(B, A, stride=1):
index = (i * stride) % A.size
return B[index].store(A[index]).sink(arg=KernelInfo(name=f"copy_{A.size}_stride_{stride}", opts_to_apply=()))
def lds_kernel(offset:UOp, size:int, inst:str) -> UOp:
tid = UOp.range(offset.size, 0, AxisType.LOCAL)
dst = UOp.placeholder((size,), dtypes.float32, 1, AddrSpace.REG)
#lds = UOp.placeholder((1024,), dtypes.float32, 2, AddrSpace.LOCAL)
u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_s_waitcnt(0);')
u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_s_barrier();', src=(u,))
u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_sched_barrier(0);', src=(u,))
u = UOp(Ops.CUSTOM, arg=f'asm volatile("{inst} '+'%0, %1" : "=v"({0}) : "v"({1}));', src=(dst, offset[tid], u))
return UOp.sink(u, arg=KernelInfo(name="test_lds", opts_to_apply=()))
dev = Device[Device.DEFAULT]
@contextlib.contextmanager
@@ -45,5 +58,30 @@ class TestPMC(unittest.TestCase):
def test_copy_uncoalesced(self): return self.test_copy(stride=17)
# test with two threads issuing ds_reads at different offsets
def test_ds_read(self, size=1, inst='ds_read_b32'):
test_banks = 256
offsets = [Tensor([0, b*4]) for b in range(1, test_banks)]
with Context(DEBUG=0): Tensor.realize(*offsets)
k = Tensor.custom_kernel(offsets[0], fxn=functools.partial(lds_kernel, size=size, inst=inst))[0]
# sample all kernels
with save_pmc() as pmc_events:
runner = get_runner(Device.DEFAULT, k.schedule()[0].ast)
# TODO: llvm eliminates lds definition from the ELF, is there another way to pin lds size?
runner._prg.group_segment_size = 1024
for offset in offsets: runner([offset.uop.buffer])
# find read offsets that created bank conflicts from the pmc counters
found:list[Tensor] = []
for i,e in enumerate(pmc_events):
pmc = unpack_pmc(e)["rows"]
# SQ on gfx9, renamed to SQC after gfx10
val = next(total for name,total,_all_instances in pmc if name in {"SQ_LDS_BANK_CONFLICT", "SQC_LDS_BANK_CONFLICT"})
if val > 0: found.append(offsets[i])
print("Found bank conflicts at offsets:", [s.numpy() for s in found])
def test_ds_read_b64(self): self.test_ds_read(2, 'ds_read_b64')
def test_ds_read_b128(self): self.test_ds_read(4, 'ds_read_b128')
if __name__ == "__main__":
unittest.main()