diff --git a/extra/sqtt/roc.py b/extra/sqtt/roc.py
index f268272067..f33afee46d 100644
--- a/extra/sqtt/roc.py
+++ b/extra/sqtt/roc.py
@@ -1,5 +1,4 @@
 import ctypes, pathlib, argparse, pickle, re, functools, dataclasses, itertools, threading
-from tabulate import tabulate
 from typing import Generator
 from tinygrad.helpers import temp, unwrap, DEBUG
 from tinygrad.device import ProfileEvent, ProfileDeviceEvent, ProfileProgramEvent
@@ -161,6 +160,7 @@ def decode(profile:list[ProfileEvent]) -> _ROCParseCtx:
 
 def print_pmc(events:list[ProfilePMCEvent]) -> None:
   from tinygrad.viz.serve import unpack_pmc
+  from tabulate import tabulate
   for e in events:
     print("**", e.kern)
     data = unpack_pmc(e)
diff --git a/extra/sqtt/test_pmc.py b/extra/sqtt/test_pmc.py
index ab6f23abcc..0143ff02dc 100644
--- a/extra/sqtt/test_pmc.py
+++ b/extra/sqtt/test_pmc.py
@@ -6,8 +6,11 @@ import unittest
 import functools, contextlib
 import numpy as np
 from tinygrad import Tensor, Context, Device
-from tinygrad.uop.ops import UOp, KernelInfo, AxisType
+from tinygrad.dtype import dtypes, AddrSpace
+from tinygrad.uop.ops import UOp, Ops, KernelInfo, AxisType
 from tinygrad.runtime.ops_amd import ProfilePMCEvent
+from tinygrad.engine.realize import get_runner
+from tinygrad.viz.serve import unpack_pmc
 from extra.sqtt.roc import print_pmc
 
 def copy_kernel(B, A, stride=1):
@@ -19,6 +22,19 @@ def copy_kernel(B, A, stride=1):
   index = (i * stride) % A.size
   return B[index].store(A[index]).sink(arg=KernelInfo(name=f"copy_{A.size}_stride_{stride}", opts_to_apply=()))
 
+def lds_kernel(offset:UOp, size:int, inst:str) -> UOp:
+  # each index of a LOCAL range over offset.size issues one inline-asm `inst`: the "v" input is offset[tid] and the "=v"
+  # output is a float32 REG placeholder of `size` elements ({0}/{1} presumably expand to the first two sources)
+  tid = UOp.range(offset.size, 0, AxisType.LOCAL)
+  dst = UOp.placeholder((size,), dtypes.float32, 1, AddrSpace.REG)
+  #lds = UOp.placeholder((1024,), dtypes.float32, 2, AddrSpace.LOCAL)
+  # s_waitcnt -> s_barrier -> sched_barrier are chained through src and passed as the last source of the asm statement
+  u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_s_waitcnt(0);')
+  u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_s_barrier();', src=(u,))
+  u = UOp(Ops.CUSTOM, arg='__builtin_amdgcn_sched_barrier(0);', src=(u,))
+  u = UOp(Ops.CUSTOM, arg=f'asm volatile("{inst} '+'%0, %1" : "=v"({0}) : "v"({1}));', src=(dst, offset[tid], u))
+  return UOp.sink(u, arg=KernelInfo(name="test_lds", opts_to_apply=()))
+
 dev = Device[Device.DEFAULT]
 
 @contextlib.contextmanager
@@ -45,5 +61,31 @@ class TestPMC(unittest.TestCase):
 
   def test_copy_uncoalesced(self): return self.test_copy(stride=17)
 
+  # test with two threads issuing ds_reads at different offsets
+  def test_ds_read(self, size=1, inst='ds_read_b32'):
+    test_banks = 256
+    offsets = [Tensor([0, b*4]) for b in range(1, test_banks)]
+    with Context(DEBUG=0): Tensor.realize(*offsets)
+    k = Tensor.custom_kernel(offsets[0], fxn=functools.partial(lds_kernel, size=size, inst=inst))[0]
+    # sample all kernels
+    with save_pmc() as pmc_events:
+      runner = get_runner(Device.DEFAULT, k.schedule()[0].ast)
+      # TODO: llvm eliminates lds definition from the ELF, is there another way to pin lds size?
+      runner._prg.group_segment_size = 1024
+      for offset in offsets: runner([offset.uop.buffer])
+    # find read offsets that created bank conflicts from the pmc counters
+    found:list[Tensor] = []
+    for i,e in enumerate(pmc_events):
+      pmc = unpack_pmc(e)["rows"]
+      # SQ on gfx9, renamed to SQC after gfx10
+      val = next((total for name,total,_all_instances in pmc if name in {"SQ_LDS_BANK_CONFLICT", "SQC_LDS_BANK_CONFLICT"}), None)
+      self.assertIsNotNone(val, "SQ_LDS_BANK_CONFLICT/SQC_LDS_BANK_CONFLICT missing from sampled PMC counters, check PMC_COUNTERS")
+      if val > 0: found.append(offsets[i])
+    print("Found bank conflicts at offsets:", [s.numpy() for s in found])
+
+  def test_ds_read_b64(self): self.test_ds_read(2, 'ds_read_b64')
+
+  def test_ds_read_b128(self): self.test_ds_read(4, 'ds_read_b128')
+
 if __name__ == "__main__":
   unittest.main()
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index 791e4a4537..85b8fd5a4b 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -944,7 +944,8 @@ class AMDDevice(HCQCompiled):
       self.pmc_counters = import_pmc(self.target)
 
       # validate counters
-      pmc_default = "TCC_HIT,TCC_MISS,SQ_LDS_BANK_CONFLICT" if self.target[0] == 9 else "GL2C_HIT,GL2C_MISS,SQC_LDS_IDX_ACTIVE,SQC_LDS_BANK_CONFLICT"
+      pmc_default = "TCC_HIT,TCC_MISS,SQ_LDS_IDX_ACTIVE,SQ_LDS_BANK_CONFLICT" if self.target[0] == 9 \
+        else "GL2C_HIT,GL2C_MISS,SQC_LDS_IDX_ACTIVE,SQC_LDS_BANK_CONFLICT"
       for k in (PMC_COUNTERS:=getenv("PMC_COUNTERS", pmc_default).split(",")):
         if k not in self.pmc_counters: raise RuntimeError(f"PMC counter {k} is not supported. Available: {','.join(self.pmc_counters.keys())}")
 
diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py
index f1a24a0802..6018a6a67e 100755
--- a/tinygrad/viz/serve.py
+++ b/tinygrad/viz/serve.py
@@ -398,7 +398,7 @@ def get_render(i:int, j:int, fmt:str) -> dict:
       metadata.append(amd_readelf(compiler.compile(data.src)))
     return {"src":disasm_str, "lang":"amdgpu" if data.device.startswith("AMD") else None, "metadata":metadata}
   if fmt == "sqtt-insts":
-    columns = ["PC", "Instruction", "Hits", "Duration", "Stall", "Type"]
+    columns = ["PC", "Instruction", "Hits", "Cycles", "Stall", "Type"]
     inst_columns = ["N", "Clk", "Idle", "Dur", "Stall"]
     # Idle: The total time gap between the completion of previous instruction and the beginning of the current instruction.
     # The idle time can be caused by: