mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-05 05:04:27 -05:00
assembly gemm clean ups, instructions for cli (#13892)
This commit is contained in:
@@ -208,3 +208,9 @@ Key patterns to watch (from ResNet50 benchmark):
|
||||
- `vmin==vmax folding`: ~55ms, 0.33% match rate - checks 52K ops but rarely matches
|
||||
|
||||
Patterns with a 0% match rate are workload-specific overhead. They may be useful in other workloads, so don't remove them without understanding their purpose.
|
||||
|
||||
## AMD Performance Counter Profiling
|
||||
|
||||
Set VIZ to `-2` to save performance counter traces for the AMD backend.
|
||||
|
||||
Use the CLI in `./extra/sqtt/roc.py` to explore the trace.
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,7 @@
|
||||
# Run assembly on the AMD runtime and check correctness
|
||||
# VIZ=2 to profile
|
||||
import pathlib
|
||||
from tinygrad import Tensor, Device, dtypes
|
||||
from tinygrad import Tensor, Device, dtypes, Context
|
||||
from tinygrad.engine.realize import ExecItem, CompiledRunner
|
||||
from tinygrad.renderer import ProgramSpec
|
||||
from tinygrad.uop.ops import track_rewrites, UOp
|
||||
@@ -55,9 +55,10 @@ def get_asm_prg() -> ProgramSpec:
|
||||
eis.append(ExecItem(ast, [C_asm.uop.buffer, from_torch(B).uop.buffer, from_torch(A).uop.buffer], fixedvars={"SZ":N, "NUM_WG":NUM_WG},
|
||||
prg=CompiledRunner(get_asm_prg())))
|
||||
|
||||
for ei in eis:
|
||||
et = ei.run(wait=True)
|
||||
print(f"{(N*N*N*2 / et)*1e-12:.2f} REAL TFLOPS")
|
||||
with Context(DEBUG=2):
|
||||
for ei in eis:
|
||||
et = ei.run(wait=True)
|
||||
print(f"{(N*N*N*2 / et)*1e-12:.2f} REAL TFLOPS")
|
||||
|
||||
# ** correctness
|
||||
|
||||
|
||||
Reference in New Issue
Block a user