nv setup constbuf4 (#4511)
* nv correct constbuf 4
* compare results to cuda
* test fixed
* failed kernel
* repro
* revert this change
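In outline, the change turns the script into a correctness check as well as a benchmark: it allocates fuzz input buffers for the CUDA run, copies the same bytes into the NV buffers, runs both compiled programs, and asserts the outputs agree. A minimal sketch of that pattern, using only calls that appear in the diff below (the function name and argument names are illustrative; cuda_prg/nv_prg stand in for CompiledRunner instances, the buffer lists for bufs_from_lin/get_fuzz_rawbufs results):

import numpy as np

# Sketch: run the same kernel on two backends and compare their outputs.
# copyin/as_buffer and the buffer-0-is-output convention mirror the diff below.
def compare_backends(cuda_prg, cubufs, nv_prg, nvbufs):
  # give NV the exact bytes CUDA saw, so any divergence comes from the kernel
  for nvbuf, cubuf in zip(nvbufs, cubufs): nvbuf.copyin(cubuf.as_buffer())
  cuda_prg(cubufs, {}, wait=True)
  nv_prg(nvbufs, {}, wait=True)
  # read both outputs back through numpy, allowing small floating-point drift
  curesult = np.frombuffer(cubufs[0].as_buffer(), cubufs[0].dtype.np)
  nvresult = np.frombuffer(nvbufs[0].as_buffer(), nvbufs[0].dtype.np)
  np.testing.assert_allclose(curesult, nvresult, rtol=1e-2, atol=1e-2)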
test/external/speed_compare_cuda_nv.py | 37 (vendored)
@@ -1,7 +1,10 @@
-from tinygrad import Device
+from tinygrad import Device, dtypes
 from tinygrad.helpers import getenv, colored
 from extra.optimization.helpers import load_worlds, ast_str_to_lin
+from test.external.fuzz_linearizer import get_fuzz_rawbufs
 from tinygrad.engine.search import bufs_from_lin
+from tinygrad.engine.realize import CompiledRunner
+import numpy as np
 
 # move to helpers?
 def colorize_float(x):
@@ -25,31 +28,47 @@ if __name__ == "__main__":
     # cuda compile
     culin = ast_str_to_lin(ast, opts=cudev.renderer)
     culin.hand_coded_optimizations()
-    cuda_prg = cudev.to_runner(culin)
-    cubufs = bufs_from_lin(culin)
+    has_bf16 = any(b.dtype == dtypes.bfloat16 for b in culin.membufs)
 
-    nvlin = ast_str_to_lin(ast, opts=nvdev.renderer)
+    cuda_prg = CompiledRunner(culin.to_program())
+    cubufs = bufs_from_lin(culin)
+    test_cubufs = get_fuzz_rawbufs(culin) if not has_bf16 else cubufs
+
+    rdr = nvdev.renderer
+    rdr.device = "NV"
+    nvlin = ast_str_to_lin(ast, opts=rdr)
     nvlin.hand_coded_optimizations()
-    nv_prg = nvdev.to_runner(nvlin)
+    nv_prg = CompiledRunner(nvlin.to_program())
     nvbufs = bufs_from_lin(nvlin)
+    test_nvbufs = get_fuzz_rawbufs(nvlin) if not has_bf16 else nvbufs
+    if not has_bf16:
+      for i,rawbuf in enumerate(test_nvbufs): rawbuf.copyin(test_cubufs[i].as_buffer())
 
     # warmup
-    tm_cuda, tm_nv = [], []
-    cuda_prg(cubufs, {}, wait=True)
-    for i in range(5): tm_cuda.append(cuda_prg(cubufs, {}, wait=True))
+    tm_cuda, tm_nv, failed = [], [], False
+    try:
+      cuda_prg(test_cubufs, {}, wait=True)
+      for i in range(5): tm_cuda.append(cuda_prg(cubufs, {}, wait=True))
+    except RuntimeError:
+      print("CUDA FAILED")
+      tm_cuda = [1e9]
+      failed = True
 
-    nv_prg(nvbufs, {}, wait=True)
-    for i in range(5): tm_nv.append(nv_prg(nvbufs, {}, wait=True))
+    try:
+      nv_prg(test_nvbufs, {}, wait=True)
+      for i in range(5): tm_nv.append(nv_prg(nvbufs, {}, wait=True))
+    except RuntimeError:
+      print("NV FAILED")
+      tm_nv = [1e9]
+      failed = True
+
+    if not failed and not has_bf16:
+      curesult = np.frombuffer(test_cubufs[0].as_buffer(), test_cubufs[0].dtype.np)
+      nvresult = np.frombuffer(test_nvbufs[0].as_buffer(), test_nvbufs[0].dtype.np)
+      np.testing.assert_allclose(curesult, nvresult, rtol=1e-2, atol=1e-2)
 
     average_tm_cuda += min(tm_cuda)
     average_tm_nv += min(tm_nv)
     ratio = min(tm_nv)/min(tm_cuda)
     print(f"{average_tm_nv/average_tm_cuda:5.2f}x -- {num:4d} {colorize_float(ratio)} {min(tm_nv)*1e6:7.2f} us", nvlin.name)
-    if ratio > 1.1: print(f"NV slower {ratio}", nvlin.ast, nvlin.applied_opts)
+    if ratio > 1.04: print(f"NV slower {ratio}", nvlin.ast, nvlin.applied_opts)
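For context on the numbers the script prints: each kernel is timed five times per backend, the minimum is kept, and NV is flagged when it is more than 4% slower than CUDA (the threshold this commit tightens from 1.1 to 1.04). A standalone sketch of that aggregation, with illustrative names (time_fn stands in for a timed runner call returning seconds; it is not part of the script):

# Sketch of the min-of-5 timing and ratio reporting used in the script above.
def best_time(time_fn, reps=5):
  time_fn()                                   # warmup run, result discarded
  return min(time_fn() for _ in range(reps))  # min filters launch jitter

def report(num, tm_cuda, tm_nv, name):
  ratio = tm_nv / tm_cuda                     # >1.0 means NV is slower
  print(f"{num:4d} {ratio:5.2f} {tm_nv*1e6:7.2f} us", name)
  if ratio > 1.04: print(f"NV slower {ratio}", name)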