From b05c2828f702efef2944066e971a8f3312eca866 Mon Sep 17 00:00:00 2001
From: George Hotz <geohot@gmail.com>
Date: Thu, 30 Mar 2023 06:08:54 +0400
Subject: [PATCH] better cacheline test

---
 extra/archprobe.py          | 40 ++++++++++++++++++++++++++++++-------
 tinygrad/runtime/ops_gpu.py |  4 ++--
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/extra/archprobe.py b/extra/archprobe.py
index 33032ccab3..917f2e149c 100644
--- a/extra/archprobe.py
+++ b/extra/archprobe.py
@@ -56,7 +56,37 @@ def reg_count(nthread, ngrp, nreg):
 @register_test
 def test_reg_count(nthread=1, ngrp=1):
   base = reg_count(nthread, ngrp, 1)
-  return [(nreg, (reg_count(nthread, ngrp, nreg)-base)/nreg) for nreg in trange(2, 257)]   # archprobe goes to 512
+  return [(nreg, (reg_count(nthread, ngrp, nreg)-base)/nreg) for nreg in trange(4, 513, 4)]
+
+def buf_cacheline_size(stride):
+  BUF_CACHE_SIZE = 128*1024
+  NTHREAD_LOGIC = 256
+  PITCH = BUF_CACHE_SIZE * 2 // NTHREAD_LOGIC
+  BUF_SIZE = PITCH * NTHREAD_LOGIC
+
+  prg = """__kernel void buf_cacheline_size(
+    __global const float* src,
+    __global float* dst,
+    __private const int niter,
+    __private const int stride,
+    __private const int pitch
+  ) {
+    float c = 0;
+    for (int i = 0; i < niter; ++i) {
+      const int zero = i >> 31;
+      c += src[zero + stride * 0 + pitch * get_global_id(0)];
+      c += src[zero + stride * 1 + pitch * get_global_id(0)];
+    }
+    dst[0] = c;
+  }"""
+  in_buf = CLBuffer(BUF_SIZE, dtypes.float32)
+  out_buf = CLBuffer(1, dtypes.float32)
+  cl = CLProgram("buf_cacheline_size", prg, argdtypes=[None, None, np.int32, np.int32, np.int32])
+  return min([cl([NTHREAD_LOGIC, 1, 1], [NTHREAD_LOGIC, 1, 1], in_buf, out_buf, 10, stride, PITCH, wait=True) for _ in range(5)])*1e9
+
+@register_test
+def test_cacheline_size():
+  return [(stride, buf_cacheline_size(stride)) for stride in trange(1,64)]
 
 def buf_cache_hierarchy_pchase(ndata, stride=1, NCOMP=1, steps=65536):
   ndata //= NCOMP*4  # ptr size
@@ -78,10 +108,6 @@ def buf_cache_hierarchy_pchase(ndata, stride=1, NCOMP=1, steps=65536):
   cl = CLProgram("buf_cache_hierarchy_pchase", prg, argdtypes=[None, None, np.int32])
   return min([cl([1, 1, 1], [1, 1, 1], in_buf, out_buf, steps, wait=True)/steps for _ in range(5)])*1e9
 
-@register_test
-def test_cacheline_size():
-  return [(stride, buf_cache_hierarchy_pchase(65536, stride, steps=65536)) for stride in trange(1,64)]
-
 @register_test
 def test_memory_latency():
   # requires cacheline < 16
@@ -125,7 +151,7 @@ def gflops(niter=4, nroll=4, ngroups=4096):
       out_buf[get_global_id(0) >> 31] = {'+'.join(f"y.s{'0123456789abcdef'[i]}" for i in range(NCOMP))};
   }}"""
   out_buf = CLBuffer(1, dtypes.float32)
-  cl = CLProgram("gflops", prg)
+  cl = CLProgram("gflops", prg, options="-Werror -cl-mad-enable -cl-fast-relaxed-math")
   FLOPS = NCOMP*2*2 * niter * nroll * ngroups * 32
   # NOTE: if nay of the niters form a local group, this is wrong
   return FLOPS/(min([cl([32, ngroups, 1], [32, 1, 1], out_buf, wait=True) for _ in range(10)])*1e9)
@@ -137,7 +163,7 @@ def test_gflops():
 if __name__ == "__main__":
   cache = {}
   #cache = pickle.load(open("/tmp/cache.pkl", "rb"))
-  #tests = {"test_gflops": tests["test_gflops"]}
+  #tests = {"test_cacheline_size": tests["test_cacheline_size"]}
   plt.figure(figsize=(16, 9))
   for i,(k,test) in enumerate(tests.items()):
     print(f"running {k}")
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index 360f2a8d7c..6681bc0be5 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -40,10 +40,10 @@ class CLBuffer(RawBufferCopyInOut):
     cl.enqueue_copy(CL.cl_queue, x, self._buf, is_blocking=True)
 
 class CLProgram:
-  def __init__(self, name:str, prg:str, binary=False, argdtypes=None):
+  def __init__(self, name:str, prg:str, binary=False, argdtypes=None, options=None):
     self.name, self.argdtypes, self.clprogram = name, argdtypes, cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]) if binary else cl.Program(CL.cl_ctx, prg)  # type: ignore
     try:
-      self._clprg = self.clprogram.build()
+      self._clprg = self.clprogram.build(options=options)
     except cl.RuntimeError as e:
       if DEBUG >= 3: print("FAILED TO BUILD", prg)
       raise e