mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-10 07:28:15 -05:00
only 62 gflops (#2629)
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
// clang -O2 -march=native gemm.c -lpthread
|
||||
// single: clang -O2 -march=native gemm.c
|
||||
// multi: clang -O2 -march=native gemm.c -DNTHREADS=32 -lpthread
|
||||
#define _GNU_SOURCE
|
||||
|
||||
// https://en.wikichip.org/wiki/amd/microarchitectures/zen_2
|
||||
@@ -18,11 +19,15 @@
|
||||
|
||||
#ifdef DEBUG
|
||||
#define N 8
|
||||
#else
|
||||
//#define N 4096
|
||||
// L1 cache is 32 kB
|
||||
#define N 2048 // 2048*2048*4*2 = 32 MB
|
||||
// 8*768*4 = 24 kB
|
||||
#endif
|
||||
|
||||
#ifndef N
|
||||
// NOTE: if you change this you have to rerun gemm.py
|
||||
#define N 512
|
||||
#endif
|
||||
|
||||
#ifndef NTHREADS
|
||||
#define NTHREADS 1
|
||||
#endif
|
||||
|
||||
// aligned?
|
||||
@@ -81,7 +86,6 @@ pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
atomic_int nready = 0;
|
||||
atomic_int ndone = 0;
|
||||
|
||||
#define NTHREADS 8
|
||||
void *matmul_thread(void *n) {
|
||||
int k = (int)(int64_t)n;
|
||||
int sy = (N/NTHREADS) * k;
|
||||
@@ -132,10 +136,10 @@ int main() {
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < 4000; i++) {
|
||||
for (int i = 0; i < 10; i++) {
|
||||
memset(C, 0, N*N*sizeof(float));
|
||||
|
||||
#if NTHREADS != 1
|
||||
#if NTHREADS != 1
|
||||
nready = 0;
|
||||
ndone = 0;
|
||||
pthread_mutex_lock(&lock);
|
||||
@@ -147,7 +151,7 @@ int main() {
|
||||
#endif
|
||||
|
||||
uint64_t start = nanos();
|
||||
#if NTHREADS == 1
|
||||
#if NTHREADS == 1
|
||||
matmul(0, N);
|
||||
#else
|
||||
// unlocking mutex starts threads
|
||||
@@ -156,7 +160,7 @@ int main() {
|
||||
#endif
|
||||
uint64_t end = nanos();
|
||||
|
||||
#if NTHREADS != 1
|
||||
#if NTHREADS != 1
|
||||
for (int j = 0; j < NTHREADS; j++) {
|
||||
pthread_join(threads[j], NULL);
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ import os
|
||||
import time
|
||||
import numpy as np
|
||||
|
||||
N = 2048
|
||||
N = 512
|
||||
if __name__ == "__main__":
|
||||
# N^2
|
||||
A = np.random.randn(N, N).astype(np.float32)
|
||||
@@ -15,7 +15,7 @@ if __name__ == "__main__":
|
||||
flop = 2*N*N*N
|
||||
#print(f"{flop / 1e9:.2f} GFLOP")
|
||||
|
||||
for i in range(4):
|
||||
for i in range(10):
|
||||
st = time.monotonic()
|
||||
C = A @ B.T
|
||||
et = time.monotonic()
|
||||
|
||||
@@ -130,7 +130,7 @@ def beam_search(lin:Linearizer, rawbufs, amt:int, allow_test_size=True) -> Linea
|
||||
lib, global_size, local_size = proc
|
||||
if lib in seen_libs: continue
|
||||
seen_libs.add(lib)
|
||||
tms = time_program(Device.DEFAULT, lib, global_size, local_size, var_vals, rawbufs, early_stop=beam[0][1]*3 if len(beam) else None)
|
||||
tms = time_program(Device.DEFAULT, lib, global_size, local_size, var_vals, rawbufs, early_stop=beam[0][1]*3 if len(beam) else 1.0) # > 1 second, run one time
|
||||
timed_lins.append((acted_lins[i], min(tms)))
|
||||
if DEBUG >= 2: print(f"\r{time.perf_counter() - st:7.2f}s: {timed_lins[-1][1]*1e6:12.2f} us {len(timed_lins):4d}/{len(acted_lins):4d} {timed_lins[-1][0].colored_shape()}\033[K", end="")
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ CLANG_PROGRAM_HEADER = '#include <math.h>\n#define max(x,y) ((x>y)?x:y)\n#define
|
||||
def compile_clang(prg:str, header:str=CLANG_PROGRAM_HEADER) -> bytes:
|
||||
# TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
|
||||
with tempfile.NamedTemporaryFile(delete=True) as output_file:
|
||||
subprocess.check_output(args=('clang -shared -O2 -Wall -Werror -x c -lm -fPIC --rtlib=compiler-rt - -o '+str(output_file.name)).split(), input=(header+prg).encode('utf-8'))
|
||||
subprocess.check_output(args=('clang -shared -march=native -O2 -Wall -Werror -x c -fPIC - -o '+str(output_file.name)).split(), input=(header+prg).encode('utf-8'))
|
||||
return pathlib.Path(output_file.name).read_bytes()
|
||||
|
||||
class ClangProgram:
|
||||
|
||||
Reference in New Issue
Block a user