assembly gemm clean ups, instructions for cli (#13892)

This commit is contained in:
qazal
2025-12-30 16:14:06 +09:00
committed by GitHub
parent d7e1f26e3d
commit b557c46233
3 changed files with 11 additions and 2926 deletions

View File

@@ -208,3 +208,9 @@ Key patterns to watch (from ResNet50 benchmark):
- `vmin==vmax folding`: ~55ms, 0.33% match rate - checks 52K ops but rarely matches - `vmin==vmax folding`: ~55ms, 0.33% match rate - checks 52K ops but rarely matches
Patterns with 0% match rate are workload-specific overhead. They may be useful in other workloads, so don't remove them without understanding their purpose. Patterns with 0% match rate are workload-specific overhead. They may be useful in other workloads, so don't remove them without understanding their purpose.
## AMD Performance Counter Profiling
Set VIZ to `-2` to save performance counter traces for the AMD backend.
Use the CLI in `./extra/sqtt/roc.py` to explore the trace.

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,7 @@
# Run assembly on the AMD runtime and check correctness # Run assembly on the AMD runtime and check correctness
# VIZ=2 to profile # VIZ=2 to profile
import pathlib import pathlib
from tinygrad import Tensor, Device, dtypes from tinygrad import Tensor, Device, dtypes, Context
from tinygrad.engine.realize import ExecItem, CompiledRunner from tinygrad.engine.realize import ExecItem, CompiledRunner
from tinygrad.renderer import ProgramSpec from tinygrad.renderer import ProgramSpec
from tinygrad.uop.ops import track_rewrites, UOp from tinygrad.uop.ops import track_rewrites, UOp
@@ -55,9 +55,10 @@ def get_asm_prg() -> ProgramSpec:
eis.append(ExecItem(ast, [C_asm.uop.buffer, from_torch(B).uop.buffer, from_torch(A).uop.buffer], fixedvars={"SZ":N, "NUM_WG":NUM_WG}, eis.append(ExecItem(ast, [C_asm.uop.buffer, from_torch(B).uop.buffer, from_torch(A).uop.buffer], fixedvars={"SZ":N, "NUM_WG":NUM_WG},
prg=CompiledRunner(get_asm_prg()))) prg=CompiledRunner(get_asm_prg())))
for ei in eis: with Context(DEBUG=2):
et = ei.run(wait=True) for ei in eis:
print(f"{(N*N*N*2 / et)*1e-12:.2f} REAL TFLOPS") et = ei.run(wait=True)
print(f"{(N*N*N*2 / et)*1e-12:.2f} REAL TFLOPS")
# ** correctness # ** correctness