mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-06 21:53:53 -05:00
assembly gemm clean ups, instructions for cli (#13892)
This commit is contained in:
@@ -208,3 +208,9 @@ Key patterns to watch (from ResNet50 benchmark):
|
|||||||
- `vmin==vmax folding`: ~55ms, 0.33% match rate - checks 52K ops but rarely matches
|
- `vmin==vmax folding`: ~55ms, 0.33% match rate - checks 52K ops but rarely matches
|
||||||
|
|
||||||
Patterns with 0% match rate are workload-specific overhead. They may be useful in other workloads, so don't remove them without understanding their purpose.
|
Patterns with 0% match rate are workload-specific overhead. They may be useful in other workloads, so don't remove them without understanding their purpose.
|
||||||
|
|
||||||
|
## AMD Performance Counter Profiling
|
||||||
|
|
||||||
|
Set VIZ to `-2` to save performance counter traces for the AMD backend.
|
||||||
|
|
||||||
|
Use the CLI in `./extra/sqtt/roc.py` to explore the trace.
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,7 @@
|
|||||||
# Run assembly on the AMD runtime and check correctness
|
# Run assembly on the AMD runtime and check correctness
|
||||||
# VIZ=2 to profile
|
# VIZ=2 to profile
|
||||||
import pathlib
|
import pathlib
|
||||||
from tinygrad import Tensor, Device, dtypes
|
from tinygrad import Tensor, Device, dtypes, Context
|
||||||
from tinygrad.engine.realize import ExecItem, CompiledRunner
|
from tinygrad.engine.realize import ExecItem, CompiledRunner
|
||||||
from tinygrad.renderer import ProgramSpec
|
from tinygrad.renderer import ProgramSpec
|
||||||
from tinygrad.uop.ops import track_rewrites, UOp
|
from tinygrad.uop.ops import track_rewrites, UOp
|
||||||
@@ -55,9 +55,10 @@ def get_asm_prg() -> ProgramSpec:
|
|||||||
eis.append(ExecItem(ast, [C_asm.uop.buffer, from_torch(B).uop.buffer, from_torch(A).uop.buffer], fixedvars={"SZ":N, "NUM_WG":NUM_WG},
|
eis.append(ExecItem(ast, [C_asm.uop.buffer, from_torch(B).uop.buffer, from_torch(A).uop.buffer], fixedvars={"SZ":N, "NUM_WG":NUM_WG},
|
||||||
prg=CompiledRunner(get_asm_prg())))
|
prg=CompiledRunner(get_asm_prg())))
|
||||||
|
|
||||||
for ei in eis:
|
with Context(DEBUG=2):
|
||||||
et = ei.run(wait=True)
|
for ei in eis:
|
||||||
print(f"{(N*N*N*2 / et)*1e-12:.2f} REAL TFLOPS")
|
et = ei.run(wait=True)
|
||||||
|
print(f"{(N*N*N*2 / et)*1e-12:.2f} REAL TFLOPS")
|
||||||
|
|
||||||
# ** correctness
|
# ** correctness
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user