work from benchmarking tinybox red v2 (#13264)

* work from benchmarking tinybox red v2

* gpuburn
George Hotz authored 2025-11-13 16:38:40 -08:00, committed by GitHub
parent 547304c471, commit ba84d415fe
5 changed files with 60 additions and 10 deletions

extra/bandwidth_test.py (new file, 34 lines)

@@ -0,0 +1,34 @@
#!/usr/bin/env python3
from tinygrad import Tensor, Device, GlobalCounters, Context, dtypes
from tinygrad.helpers import getenv, colored
SZ = 8_000_000_000
GPUS = getenv("GPUS", 4) # TODO: expose a way in tinygrad to access this
if __name__ == "__main__":
  # create tensors
  tens = [Tensor.ones(SZ, dtype=dtypes.uint8, device=f"{Device.DEFAULT}:{i}").contiguous() for i in range(GPUS)]
  Tensor.realize(*tens)
  bw = [[0.0]*GPUS for _ in range(GPUS)]
  for i in range(GPUS):
    for j in range(GPUS):
      GlobalCounters.reset()
      with Context(DEBUG=2):
        if i == j:
          # this copy would be optimized out, just add 1
          (tens[i]+1).realize()
        else:
          tens[i].to(f"{Device.DEFAULT}:{j}").realize()
      t = max(GlobalCounters.time_sum_s, 1e-9)
      bw[i][j] = SZ / t / 1e9  # GB/s
  def fmt(x):
    c = "green" if x > 50 else "yellow" if x > 20 else "red"
    return colored(f"{x:6.1f}", c)
  # header
  print(" " * 8 + " ".join(f"{'d'+str(j):>6}" for j in range(GPUS)))
  # rows
  for i in range(GPUS):
    print(f"{'s'+str(i):>6} -> " + " ".join(fmt(x) for x in bw[i]))

(modified file)

@@ -12,7 +12,7 @@ MPS = getenv("MPS", 0)
if getenv("FP16_ACC"): torch.backends.cuda.matmul.allow_fp16_accumulation = True
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-  for N in [256, 512, 1024, 2048, 4096]:
+  for N in [256, 512, 1024, 2048, 4096] + ([6144, 8192] if getenv("BIG") else []):
    FLOPS = N*N*N*2
    b = torch.rand((N,N), dtype=dtype)
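With BIG set, the sweep now extends to N=8192, doubling the previous cap of 4096; since the work per square matmul is FLOPS = N*N*N*2, that pushes a single matmul past a teraflop. A quick check of the counts (illustrative only, not part of this commit):

# illustrative only: FLOPs per square matmul at the old cap and the new BIG sizes
for N in [4096, 6144, 8192]:
  print(f"N={N}: {2*N**3/1e12:.2f} TFLOP per matmul")  # 0.14, 0.46, 1.10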

extra/gpuburn.py (new file, 16 lines)

@@ -0,0 +1,16 @@
from tinygrad import Tensor, Device, TinyJit, dtypes
from tinygrad.helpers import getenv
GPUS = getenv("GPUS", 4) # TODO: expose a way in tinygrad to access this
N = 6144
@TinyJit
def many_matmul(A, B):
  out = A
  for _ in range(8): out = out@B  # chain 8 matmuls so each JIT call does a large chunk of work
  return out
if __name__ == "__main__":
  # shard the (GPUS, N, N) batch across all devices along axis 0
  A = Tensor.ones(GPUS, N, N, dtype=dtypes.half).shard(devices=tuple([f"{Device.DEFAULT}:{i}" for i in range(GPUS)]), axis=0).contiguous()
  B = Tensor.ones(GPUS, N, N, dtype=dtypes.half).shard(devices=tuple([f"{Device.DEFAULT}:{i}" for i in range(GPUS)]), axis=0).contiguous()
  while 1: many_matmul(A, B)  # burn loop: replay the jitted matmul chain forever
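Each many_matmul call chains eight N=6144 half-precision matmuls on every GPU, roughly 3.7 TFLOP of work per device per call, and the infinite loop replays the jitted kernels back to back to keep all GPUs loaded. The estimate (illustrative only, not part of this commit):

# illustrative only: FLOPs issued per many_matmul call, per GPU
N, CHAIN = 6144, 8  # matrix size and number of chained matmuls in gpuburn.py
print(f"{CHAIN * 2 * N**3 / 1e12:.1f} TFLOP per call per GPU")  # 3.7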

(modified file)

@@ -32,7 +32,7 @@ def launchBenchmark(instruction, vgprIndices, dense=True, accum=False, extra="")
  src = src.replace("DIRECTIVE", DIRECTIVE)
  lib = COMPILER.compile(src)
  fxn = AMDProgram(DEV, "matmul", lib)
-  elapsed = fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True)
+  elapsed = min([fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True) for _ in range(2)])
  FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
  print(f"{instruction:<29} : {FLOPs/elapsed/10**12:.2f} T(FL)OPS")

(modified file)

@@ -3,14 +3,14 @@
.p2align 8
.type matmul,@function
matmul:
s_mov_b32 s1, INTERNAL_LOOP
s_mov_b32 s2, 0
inner_loop:
INSTRUCTION
s_sub_u32 s1, s1, 1
s_cmp_lg_i32 s1, s2
s_cbranch_scc1 inner_loop
s_endpgm
.rodata
.p2align 6