only 62 gflops (#2629)

This commit is contained in:
George Hotz
2023-12-05 13:28:24 -08:00
committed by GitHub
parent 6d58c19736
commit 0be5d16950
4 changed files with 19 additions and 15 deletions

View File

@@ -1,4 +1,5 @@
// clang -O2 -march=native gemm.c -lpthread
// single: clang -O2 -march=native gemm.c
// multi: clang -O2 -march=native gemm.c -DNTHREADS=32 -lpthread
#define _GNU_SOURCE
// https://en.wikichip.org/wiki/amd/microarchitectures/zen_2
@@ -18,11 +19,15 @@
#ifdef DEBUG
#define N 8
#else
//#define N 4096
// L1 cache is 32 kB
#define N 2048 // 2048*2048*4*2 = 32 MB
// 8*768*4 = 24 kB
#endif
#ifndef N
// NOTE: if you change this you have to rerun gemm.py
#define N 512
#endif
#ifndef NTHREADS
#define NTHREADS 1
#endif
// aligned?
@@ -81,7 +86,6 @@ pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
atomic_int nready = 0;
atomic_int ndone = 0;
#define NTHREADS 8
void *matmul_thread(void *n) {
int k = (int)(int64_t)n;
int sy = (N/NTHREADS) * k;
@@ -132,10 +136,10 @@ int main() {
}
}
for (int i = 0; i < 4000; i++) {
for (int i = 0; i < 10; i++) {
memset(C, 0, N*N*sizeof(float));
#if NTHREADS != 1
#if NTHREADS != 1
nready = 0;
ndone = 0;
pthread_mutex_lock(&lock);
@@ -147,7 +151,7 @@ int main() {
#endif
uint64_t start = nanos();
#if NTHREADS == 1
#if NTHREADS == 1
matmul(0, N);
#else
// unlocking mutex starts threads
@@ -156,7 +160,7 @@ int main() {
#endif
uint64_t end = nanos();
#if NTHREADS != 1
#if NTHREADS != 1
for (int j = 0; j < NTHREADS; j++) {
pthread_join(threads[j], NULL);
}

View File

@@ -4,7 +4,7 @@ import os
import time
import numpy as np
N = 2048
N = 512
if __name__ == "__main__":
# N^2
A = np.random.randn(N, N).astype(np.float32)
@@ -15,7 +15,7 @@ if __name__ == "__main__":
flop = 2*N*N*N
#print(f"{flop / 1e9:.2f} GFLOP")
for i in range(4):
for i in range(10):
st = time.monotonic()
C = A @ B.T
et = time.monotonic()

View File

@@ -130,7 +130,7 @@ def beam_search(lin:Linearizer, rawbufs, amt:int, allow_test_size=True) -> Linea
lib, global_size, local_size = proc
if lib in seen_libs: continue
seen_libs.add(lib)
tms = time_program(Device.DEFAULT, lib, global_size, local_size, var_vals, rawbufs, early_stop=beam[0][1]*3 if len(beam) else None)
tms = time_program(Device.DEFAULT, lib, global_size, local_size, var_vals, rawbufs, early_stop=beam[0][1]*3 if len(beam) else 1.0) # > 1 second, run one time
timed_lins.append((acted_lins[i], min(tms)))
if DEBUG >= 2: print(f"\r{time.perf_counter() - st:7.2f}s: {timed_lins[-1][1]*1e6:12.2f} us {len(timed_lins):4d}/{len(acted_lins):4d} {timed_lins[-1][0].colored_shape()}\033[K", end="")

View File

@@ -11,7 +11,7 @@ CLANG_PROGRAM_HEADER = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define
def compile_clang(prg:str, header:str=CLANG_PROGRAM_HEADER) -> bytes:
# TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
with tempfile.NamedTemporaryFile(delete=True) as output_file:
subprocess.check_output(args=('clang -shared -O2 -Wall -Werror -x c -lm -fPIC --rtlib=compiler-rt - -o '+str(output_file.name)).split(), input=(header+prg).encode('utf-8'))
subprocess.check_output(args=('clang -shared -march=native -O2 -Wall -Werror -x c -fPIC - -o '+str(output_file.name)).split(), input=(header+prg).encode('utf-8'))
return pathlib.Path(output_file.name).read_bytes()
class ClangProgram: