From 281b0db773c0836167c248bca39f7603d8e34555 Mon Sep 17 00:00:00 2001
From: George Hotz
Date: Thu, 12 Jan 2023 12:26:58 -0800
Subject: [PATCH] three from image

---
 extra/thneed.py      | 1 +
 openpilot/compile.py | 6 +++++-
 tinygrad/lazy.py     | 3 +--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/extra/thneed.py b/extra/thneed.py
index f08fe9c3e3..622bd19ed5 100644
--- a/extra/thneed.py
+++ b/extra/thneed.py
@@ -283,6 +283,7 @@ class Thneed:
       print(f"{i:3d} time {total_runtime/1e6:5.2f} ms running {prg.name:20s} with {str(args[0]):15s} {str(args[1]):15s} count {len(args)-2:2d} runtime {runtime/1e3:7.2f} us {prg.options}")
       total_runtime += runtime
     print(f"total runtime: {total_runtime/1e6:.2f} ms wall time: {et*1000.0:.2f} ms")
+    return et

   def optimize_local_workgroup(self):
     MAX_WORKGROUP = CL.cl_ctx.devices[0].max_work_group_size
diff --git a/openpilot/compile.py b/openpilot/compile.py
index 2e990253c7..065b1c5500 100644
--- a/openpilot/compile.py
+++ b/openpilot/compile.py
@@ -14,6 +14,7 @@ import onnx
import numpy as np

import tinygrad.graph as graph
+from tinygrad.ops import GlobalCounters
from tinygrad.llops.ops_gpu import CL
from extra.utils import fetch
@@ -91,6 +92,7 @@ def compile(dat, output_fn):
  tinygrad_out = next(iter(run_onnx(inputs).values()))

  # note, since CL.CACHE is enabled, it doesn't actually run the kernels
+  start_ops = GlobalCounters.global_ops
  CL.CACHE = []
  if using_graph: graph.GRAPH = True
  CL.kernel_count = -1
@@ -98,6 +100,7 @@ def compile(dat, output_fn):
  graph.GRAPH = False
  print("kernel count:", len(CL.CACHE))
  assert len(CL.CACHE) <= ALLOWED_KERNEL_COUNT or ALLOWED_KERNEL_COUNT == 0, "too many kernels!"
+  used_ops = GlobalCounters.global_ops - start_ops

  from extra.thneed import Thneed
  t = Thneed(CL.CACHE, {k:inputs[k].lazydata.realized.cl for k in inputs.keys()})
@@ -109,7 +112,8 @@ def compile(dat, output_fn):
    t.save(output_fn)

  print(f"buffers to save: {len(t.buffers_to_save)}, outputs: {t.outputs}")
-  t.run()
+  runtime = t.run()
+  print(f"network using {used_ops/1e9:.2f} GOPS with runtime {runtime*1e3:.2f} ms that's {used_ops/runtime*1e-9:.2f} GFLOPS")

  # confirm thneed found the right output
  thneed_out = np.empty((t.outputs[0].size//4,), dtype=np.float32).reshape(tinygrad_out.shape)
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index 1dfd8f199c..5c82f3cef2 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -9,7 +9,6 @@ from tinygrad.graph import log_op

# lazy can recurse a lot
sys.setrecursionlimit(10000)
-sys.tracebacklimit = 20

OPT = int(os.getenv("OPT", "1"))
NOCONV = int(os.getenv("NOCONV", "0"))
@@ -250,7 +249,7 @@ class LazyBuffer:

  def processing_op(self:LazyBuffer, op:ProcessingOps, w:LazyBuffer, C:ConvArgs) -> LazyBuffer:
    x = self
-    if IMAGE == 1:
+    if IMAGE >= 1:
      from accel.opencl.preprocessing import preprocessing_op, postprocessing_op  # type: ignore
      Cold = C
      x,w,C = preprocessing_op(x, w, Cold, False)