only 62 gflops (#2629)

2026-01-10 07:28:15 -05:00 · 2023-12-05 13:28:24 -08:00
parent 6d58c19736
commit 0be5d16950
4 changed files with 19 additions and 15 deletions
--- a/extra/gemm/gemm.c
+++ b/extra/gemm/gemm.c
@@ -1,4 +1,5 @@
-// clang -O2 -march=native gemm.c -lpthread
+// single:  clang -O2 -march=native gemm.c
+// multi:   clang -O2 -march=native gemm.c -DNTHREADS=32 -lpthread
 #define _GNU_SOURCE

 // https://en.wikichip.org/wiki/amd/microarchitectures/zen_2
@@ -18,11 +19,15 @@

 #ifdef DEBUG
  #define N 8
-#else
-  //#define N 4096
-  // L1 cache is 32 kB
-  #define N 2048 // 2048*2048*4*2 = 32 MB
-  // 8*768*4 = 24 kB
+#endif
+
+#ifndef N
+  // NOTE: if you change this you have to rerun gemm.py
+  #define N 512
+#endif
+
+#ifndef NTHREADS
+  #define NTHREADS 1
 #endif

 // aligned?
@@ -81,7 +86,6 @@ pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
 atomic_int nready = 0;
 atomic_int ndone = 0;

-#define NTHREADS 8
 void *matmul_thread(void *n) {
  int k = (int)(int64_t)n;
  int sy = (N/NTHREADS) * k;
@@ -132,10 +136,10 @@ int main() {
    }
  }

-  for (int i = 0; i < 4000; i++) {
+  for (int i = 0; i < 10; i++) {
    memset(C, 0, N*N*sizeof(float));

-#if NTHREADS != 1 
+#if NTHREADS != 1
    nready = 0;
    ndone = 0;
    pthread_mutex_lock(&lock);
@@ -147,7 +151,7 @@ int main() {
 #endif

    uint64_t start = nanos();
-#if NTHREADS == 1 
+#if NTHREADS == 1
    matmul(0, N);
 #else
    // unlocking mutex starts threads
@@ -156,7 +160,7 @@ int main() {
 #endif
    uint64_t end = nanos();

-#if NTHREADS != 1 
+#if NTHREADS != 1
    for (int j = 0; j < NTHREADS; j++) {
      pthread_join(threads[j], NULL);
    }
--- a/extra/gemm/gemm.py
+++ b/extra/gemm/gemm.py
@@ -4,7 +4,7 @@ import os
 import time
 import numpy as np

-N = 2048
+N = 512
 if __name__ == "__main__":
  # N^2
  A = np.random.randn(N, N).astype(np.float32)
@@ -15,7 +15,7 @@ if __name__ == "__main__":
  flop = 2*N*N*N
  #print(f"{flop / 1e9:.2f} GFLOP")

-  for i in range(4):
+  for i in range(10):
    st = time.monotonic()
    C = A @ B.T
    et = time.monotonic()
--- a/tinygrad/features/search.py
+++ b/tinygrad/features/search.py
@@ -130,7 +130,7 @@ def beam_search(lin:Linearizer, rawbufs, amt:int, allow_test_size=True) -> Linea
        lib, global_size, local_size = proc
        if lib in seen_libs: continue
        seen_libs.add(lib)
-        tms = time_program(Device.DEFAULT, lib, global_size, local_size, var_vals, rawbufs, early_stop=beam[0][1]*3 if len(beam) else None)
+        tms = time_program(Device.DEFAULT, lib, global_size, local_size, var_vals, rawbufs, early_stop=beam[0][1]*3 if len(beam) else 1.0)   # no beam entry to compare against yet: if a run takes > 1 second, only run it one time
        timed_lins.append((acted_lins[i], min(tms)))
        if DEBUG >= 2: print(f"\r{time.perf_counter() - st:7.2f}s: {timed_lins[-1][1]*1e6:12.2f} us       {len(timed_lins):4d}/{len(acted_lins):4d}         {timed_lins[-1][0].colored_shape()}\033[K", end="")

--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -11,7 +11,7 @@ CLANG_PROGRAM_HEADER = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define
 def compile_clang(prg:str, header:str=CLANG_PROGRAM_HEADER) -> bytes:
  # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
  with tempfile.NamedTemporaryFile(delete=True) as output_file:
-    subprocess.check_output(args=('clang -shared -O2 -Wall -Werror -x c -lm -fPIC --rtlib=compiler-rt - -o '+str(output_file.name)).split(), input=(header+prg).encode('utf-8'))
+    subprocess.check_output(args=('clang -shared -march=native -O2 -Wall -Werror -x c -fPIC - -o '+str(output_file.name)).split(), input=(header+prg).encode('utf-8'))
    return pathlib.Path(output_file.name).read_bytes()

 class ClangProgram: