assembly gemm clean ups, instructions for cli (#13892)

This commit is contained in:
qazal
2025-12-30 16:14:06 +09:00
committed by GitHub
parent d7e1f26e3d
commit b557c46233
3 changed files with 11 additions and 2926 deletions

View File

@@ -208,3 +208,9 @@ Key patterns to watch (from ResNet50 benchmark):
- `vmin==vmax folding`: ~55ms, 0.33% match rate - checks 52K ops but rarely matches
Patterns with 0% match rate are workload-specific overhead. They may be useful in other workloads, so don't remove them without understanding their purpose.
## AMD Performance Counter Profiling
Set VIZ to `-2` to save performance counter traces for the AMD backend.
Use the CLI in `./extra/sqtt/roc.py` to explore the trace.

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,7 @@
# Run assembly on the AMD runtime and check correctness
# VIZ=2 to profile
import pathlib
from tinygrad import Tensor, Device, dtypes
from tinygrad import Tensor, Device, dtypes, Context
from tinygrad.engine.realize import ExecItem, CompiledRunner
from tinygrad.renderer import ProgramSpec
from tinygrad.uop.ops import track_rewrites, UOp
@@ -55,9 +55,10 @@ def get_asm_prg() -> ProgramSpec:
eis.append(ExecItem(ast, [C_asm.uop.buffer, from_torch(B).uop.buffer, from_torch(A).uop.buffer], fixedvars={"SZ":N, "NUM_WG":NUM_WG},
prg=CompiledRunner(get_asm_prg())))
for ei in eis:
et = ei.run(wait=True)
print(f"{(N*N*N*2 / et)*1e-12:.2f} REAL TFLOPS")
with Context(DEBUG=2):
for ei in eis:
et = ei.run(wait=True)
print(f"{(N*N*N*2 / et)*1e-12:.2f} REAL TFLOPS")
# ** correctness