From b05c2828f702efef2944066e971a8f3312eca866 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Thu, 30 Mar 2023 06:08:54 +0400 Subject: [PATCH] better cacheline test --- extra/archprobe.py | 40 ++++++++++++++++++++++++++++++------- tinygrad/runtime/ops_gpu.py | 4 ++-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/extra/archprobe.py b/extra/archprobe.py index 33032ccab3..917f2e149c 100644 --- a/extra/archprobe.py +++ b/extra/archprobe.py @@ -56,7 +56,37 @@ def reg_count(nthread, ngrp, nreg): @register_test def test_reg_count(nthread=1, ngrp=1): base = reg_count(nthread, ngrp, 1) - return [(nreg, (reg_count(nthread, ngrp, nreg)-base)/nreg) for nreg in trange(2, 257)] # archprobe goes to 512 + return [(nreg, (reg_count(nthread, ngrp, nreg)-base)/nreg) for nreg in trange(4, 513, 4)] + +def buf_cacheline_size(stride): + BUF_CACHE_SIZE = 128*1024 + NTHREAD_LOGIC = 256 + PITCH = BUF_CACHE_SIZE * 2 // NTHREAD_LOGIC + BUF_SIZE = PITCH * NTHREAD_LOGIC + + prg = """__kernel void buf_cacheline_size( + __global const float* src, + __global float* dst, + __private const int niter, + __private const int stride, + __private const int pitch + ) { + float c = 0; + for (int i = 0; i < niter; ++i) { + const int zero = i >> 31; + c += src[zero + stride * 0 + pitch * get_global_id(0)]; + c += src[zero + stride * 1 + pitch * get_global_id(0)]; + } + dst[0] = c; + }""" + in_buf = CLBuffer(BUF_SIZE, dtypes.float32) + out_buf = CLBuffer(1, dtypes.float32) + cl = CLProgram("buf_cacheline_size", prg, argdtypes=[None, None, np.int32, np.int32, np.int32]) + return min([cl([NTHREAD_LOGIC, 1, 1], [NTHREAD_LOGIC, 1, 1], in_buf, out_buf, 10, stride, PITCH, wait=True) for _ in range(5)])*1e9 + +@register_test +def test_cacheline_size(): + return [(stride, buf_cacheline_size(stride)) for stride in trange(1,64)] def buf_cache_hierarchy_pchase(ndata, stride=1, NCOMP=1, steps=65536): ndata //= NCOMP*4 # ptr size @@ -78,10 +108,6 @@ def buf_cache_hierarchy_pchase(ndata, stride=1, NCOMP=1, steps=65536): cl = CLProgram("buf_cache_hierarchy_pchase", prg, argdtypes=[None, None, np.int32]) return min([cl([1, 1, 1], [1, 1, 1], in_buf, out_buf, steps, wait=True)/steps for _ in range(5)])*1e9 -@register_test -def test_cacheline_size(): - return [(stride, buf_cache_hierarchy_pchase(65536, stride, steps=65536)) for stride in trange(1,64)] - @register_test def test_memory_latency(): # requires cacheline < 16 @@ -125,7 +151,7 @@ def gflops(niter=4, nroll=4, ngroups=4096): out_buf[get_global_id(0) >> 31] = {'+'.join(f"y.s{'0123456789abcdef'[i]}" for i in range(NCOMP))}; }}""" out_buf = CLBuffer(1, dtypes.float32) - cl = CLProgram("gflops", prg) + cl = CLProgram("gflops", prg, options="-Werror -cl-mad-enable -cl-fast-relaxed-math") FLOPS = NCOMP*2*2 * niter * nroll * ngroups * 32 # NOTE: if nay of the niters form a local group, this is wrong return FLOPS/(min([cl([32, ngroups, 1], [32, 1, 1], out_buf, wait=True) for _ in range(10)])*1e9) @@ -137,7 +163,7 @@ def test_gflops(): if __name__ == "__main__": cache = {} #cache = pickle.load(open("/tmp/cache.pkl", "rb")) - #tests = {"test_gflops": tests["test_gflops"]} + #tests = {"test_cacheline_size": tests["test_cacheline_size"]} plt.figure(figsize=(16, 9)) for i,(k,test) in enumerate(tests.items()): print(f"running {k}") diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py index 360f2a8d7c..6681bc0be5 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -40,10 +40,10 @@ class CLBuffer(RawBufferCopyInOut): cl.enqueue_copy(CL.cl_queue, x, self._buf, is_blocking=True) class CLProgram: - def __init__(self, name:str, prg:str, binary=False, argdtypes=None): + def __init__(self, name:str, prg:str, binary=False, argdtypes=None, options=None): self.name, self.argdtypes, self.clprogram = name, argdtypes, cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]) if binary else cl.Program(CL.cl_ctx, prg) # type: ignore try: - self._clprg = self.clprogram.build() + self._clprg = self.clprogram.build(options=options) except cl.RuntimeError as e: if DEBUG >= 3: print("FAILED TO BUILD", prg) raise e