diff --git a/accel/MAPPING b/accel/MAPPING index 9966ed3ded..4c5adcbdbb 100644 --- a/accel/MAPPING +++ b/accel/MAPPING @@ -18,9 +18,6 @@ On M1 GPU, theoretical is 2.275 TFLOPS. https://www.notebookcheck.net/Apple-M1-G We observe 2000ms for BS=8 (37 GFLOP). 37/2275 = 11.9 ms. tinygrad is over a factor of 100x off (similar on AMD GPU) -Making matters worse, flop estimates may be off by a factor of 10. - - TPUs: diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py index f6418e4300..29668ed527 100644 --- a/tinygrad/llops/ops_gpu.py +++ b/tinygrad/llops/ops_gpu.py @@ -50,14 +50,14 @@ class CLProgram: self.clprg = self.clprogram.build(options=list(self.options)).__getattr__(self.name) if self.argdtypes is not None: self.clprg.set_scalar_arg_dtypes(self.argdtypes) CLProgram.kernel_cnt += 1 - def __call__(self, *args): + def __call__(self, *args, op_estimate=0): CL.kernel_count += 1 if CL.CACHE is not None: CL.CACHE.append((self, args)) else: e = self.clprg(CL().cl_queue, *args) if DEBUG >= 2: CL.cl_queue.finish() if DEBUG >= 1: - CL.ops_sum += max([x.size//4 for x in args[2:]])*(len(args)-2) - print(f"**CL** {CL.kernel_count:6d} {self.name:20s} args {len(args[2:]):5d} size {prod(args[0]):8d} kernels {str(args[0]):20s} {str(args[1]):14s} GOPs {CL.ops_sum/1e9:7.2f} " + \ + CL.ops_sum += op_estimate + print(f"**CL** {CL.kernel_count:6d} {self.name:20s} args {len(args[2:]):5d} size {prod(args[0]):8d} kernels {str(args[0]):20s} {str(args[1]):14s} OPs {op_estimate/1e6:5.1f}M/{CL.ops_sum/1e9:7.2f}G " + \ ("" if DEBUG <= 1 or CL.CACHE is not None else f"runtime {(e.profile.end - e.profile.start)/1e3:9.2f} us")) if DEBUG >= 4: print(self.prg) @@ -143,5 +143,5 @@ class GPUBuffer: {chr(10).join([f' float {name} = ' + (f'get_{name}({name}_g, idx);' if views[name][1] else f'get_{name}(idx);') for name, _ in bufs if name not in earlybufs])} output[gid] = {code}; }}""", argdtypes=tuple(None if i < 1+len(buf_types) else np.int32 for i in range(1+len(buf_types)))) - conv_prg([prod(ret.shape), 1, 1], None, ret.cl, *[buf.cl for name, buf in bufs if name not in views or views[name][1]]) + conv_prg([prod(ret.shape), 1, 1], None, ret.cl, *[buf.cl for name, buf in bufs if name not in views or views[name][1]], op_estimate=prod(reduce_shape[0])*len(earlybufs) + prod(reduce_shape[1])*len(bufs)) return ret