work from benchmarking tinybox red v2 (#13264)
* work from benchmarking tinybox red v2
* gpuburn
extra/bandwidth_test.py (new file, 34 additions)
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
from tinygrad import Tensor, Device, GlobalCounters, Context, dtypes
from tinygrad.helpers import getenv, colored

SZ = 8_000_000_000
GPUS = getenv("GPUS", 4) # TODO: expose a way in tinygrad to access this

if __name__ == "__main__":
  # create tensors
  tens = [Tensor.ones(SZ, dtype=dtypes.uint8, device=f"{Device.DEFAULT}:{i}").contiguous() for i in range(GPUS)]
  Tensor.realize(*tens)

  bw = [[0.0]*GPUS for _ in range(GPUS)]
  for i in range(GPUS):
    for j in range(GPUS):
      GlobalCounters.reset()
      with Context(DEBUG=2):
        if i == j:
          # this copy would be optimized out, just add 1
          (tens[i]+1).realize()
        else:
          tens[i].to(f"{Device.DEFAULT}:{j}").realize()
      t = max(GlobalCounters.time_sum_s, 1e-9)
      bw[i][j] = SZ / t / 1e9 # GB/s

  def fmt(x):
    c = "green" if x > 50 else "yellow" if x > 20 else "red"
    return colored(f"{x:6.1f}", c)

  # header
  print(" " * 8 + " ".join(f"{'d'+str(j):>6}" for j in range(GPUS)))
  # rows
  for i in range(GPUS):
    print(f"{'s'+str(i):>6} -> " + " ".join(fmt(x) for x in bw[i]))
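Each cell of the printed matrix times one 8 GB transfer from source device s<i> to destination device d<j> and converts bytes over seconds into GB/s. A minimal sketch of the same measurement for a single pair of devices, using only the API already shown above (the 1 GB size and the assumption of at least two devices are mine, not part of the commit):

from tinygrad import Tensor, Device, GlobalCounters, Context, dtypes

SZ = 1_000_000_000  # 1 GB, smaller than the benchmark's 8 GB, for a quick check
src, dst = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1"  # assumes at least two devices

t = Tensor.ones(SZ, dtype=dtypes.uint8, device=src).contiguous().realize()
GlobalCounters.reset()
with Context(DEBUG=2):
  t.to(dst).realize()  # device-to-device copy, same as the matrix's off-diagonal case
gbps = SZ / max(GlobalCounters.time_sum_s, 1e-9) / 1e9
print(f"{src} -> {dst}: {gbps:.1f} GB/s")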
@@ -12,7 +12,7 @@ MPS = getenv("MPS", 0)
 if getenv("FP16_ACC"): torch.backends.cuda.matmul.allow_fp16_accumulation = True
 
 for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-  for N in [256, 512, 1024, 2048, 4096]:
+  for N in [256, 512, 1024, 2048, 4096] + ([6144, 8192] if getenv("BIG") else []):
     FLOPS = N*N*N*2
 
     b = torch.rand((N,N), dtype=dtype)
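The BIG sizes feed the same throughput formula the file already uses: a square N x N matmul is counted as FLOPS = 2*N^3 operations, and dividing by the measured time gives TFLOPS. A self-contained sketch of that calculation (the warmup count, synchronization, and device selection are my assumptions, not the committed benchmark):

import time, torch

N = 4096
FLOPS = N*N*N*2  # 2*N^3 floating point operations per N x N matmul
dev = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if dev == "cuda" else torch.float32
a = torch.rand((N, N), dtype=dtype, device=dev)
b = torch.rand((N, N), dtype=dtype, device=dev)

for _ in range(3): a @ b  # warmup
if dev == "cuda": torch.cuda.synchronize()
st = time.perf_counter()
c = a @ b
if dev == "cuda": torch.cuda.synchronize()
et = time.perf_counter() - st
print(f"N={N} {dtype}: {FLOPS/et/1e12:.2f} TFLOPS")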
extra/gpuburn.py (new file, 16 additions)
@@ -0,0 +1,16 @@
from tinygrad import Tensor, Device, TinyJit, dtypes
from tinygrad.helpers import getenv

GPUS = getenv("GPUS", 4) # TODO: expose a way in tinygrad to access this
N = 6144

@TinyJit
def many_matmul(A, B):
  out = A
  for _ in range(8): out = out@B
  return out

if __name__ == "__main__":
  A = Tensor.ones(GPUS, N, N, dtype=dtypes.half).shard(devices=tuple([f"{Device.DEFAULT}:{i}" for i in range(GPUS)]), axis=0).contiguous()
  B = Tensor.ones(GPUS, N, N, dtype=dtypes.half).shard(devices=tuple([f"{Device.DEFAULT}:{i}" for i in range(GPUS)]), axis=0).contiguous()
  while 1: many_matmul(A, B)
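gpuburn.py shards the work across all GPUs and chains eight matmuls per call so the TinyJit'd kernel stays compute bound. A single-device sketch of the same idea, useful for checking one GPU at a time (the smaller N, the per-step timing, and the TFLOPS print are my additions, not part of the committed file):

import time
from tinygrad import Tensor, TinyJit, dtypes

N = 4096  # smaller than gpuburn.py's 6144 so one device handles it comfortably

@TinyJit
def burn_step(a, b):
  out = a
  for _ in range(8): out = out @ b  # chained matmuls keep the device busy
  return out

a = Tensor.ones(N, N, dtype=dtypes.half).contiguous().realize()
b = Tensor.ones(N, N, dtype=dtypes.half).contiguous().realize()

while True:
  st = time.perf_counter()
  burn_step(a, b).realize()
  dt = time.perf_counter() - st
  print(f"{8 * 2 * N**3 / dt / 1e12:.2f} TFLOPS")  # eight matmuls at 2*N^3 FLOPs each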
@@ -32,7 +32,7 @@ def launchBenchmark(instruction, vgprIndices, dense=True, accum=False, extra="")
   src = src.replace("DIRECTIVE", DIRECTIVE)
   lib = COMPILER.compile(src)
   fxn = AMDProgram(DEV, "matmul", lib)
-  elapsed = fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True)
+  elapsed = min([fxn(global_size=(NUM_WORKGROUPS,1,1), local_size=(WAVE_SIZE*NUM_WAVES,1,1), wait=True) for _ in range(2)])
   FLOPs = FLOPS_PER_MATMUL * NUM_WAVES * NUM_WORKGROUPS * INTERNAL_LOOP * INSTRUCTIONS_PER_LOOP
   print(f"{instruction:<29} : {FLOPs/elapsed/10**12:.2f} T(FL)OPS")
 
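Taking the minimum over two launches instead of a single launch discards one-off warmup and launch overhead, so the reported T(FL)OPS reflects the kernel's best sustained rate. A generic sketch of that pattern (the helper name and run count are placeholders, not from this file):

import time

def best_of(fn, runs=2):
  # return the minimum wall-clock time over `runs` calls of fn()
  times = []
  for _ in range(runs):
    st = time.perf_counter()
    fn()
    times.append(time.perf_counter() - st)
  return min(times)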
@@ -3,14 +3,14 @@
 .p2align 8
 .type matmul,@function
 matmul:
-  s_mov_b32 s1, INTERNAL_LOOP
-  s_mov_b32 s2, 0
-inner_loop:
-  INSTRUCTION
-  s_sub_u32 s1, s1, 1
-  s_cmp_lg_i32 s1, s2
-  s_cbranch_scc1 inner_loop
-  s_endpgm
+  s_mov_b32 s1, INTERNAL_LOOP
+  s_mov_b32 s2, 0
+inner_loop:
+  INSTRUCTION
+  s_sub_u32 s1, s1, 1
+  s_cmp_lg_i32 s1, s2
+  s_cbranch_scc1 inner_loop
+  s_endpgm
 
 .rodata
 .p2align 6