From 281b0db773c0836167c248bca39f7603d8e34555 Mon Sep 17 00:00:00 2001
From: George Hotz
Date: Thu, 12 Jan 2023 12:26:58 -0800
Subject: [PATCH] three from image

---
 extra/thneed.py      | 1 +
 openpilot/compile.py | 6 +++++-
 tinygrad/lazy.py     | 3 +--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/extra/thneed.py b/extra/thneed.py
index f08fe9c3e3..622bd19ed5 100644
--- a/extra/thneed.py
+++ b/extra/thneed.py
@@ -283,6 +283,7 @@ class Thneed:
       print(f"{i:3d} time {total_runtime/1e6:5.2f} ms running {prg.name:20s} with {str(args[0]):15s} {str(args[1]):15s} count {len(args)-2:2d} runtime {runtime/1e3:7.2f} us {prg.options}")
       total_runtime += runtime
     print(f"total runtime: {total_runtime/1e6:.2f} ms wall time: {et*1000.0:.2f} ms")
+    return et

   def optimize_local_workgroup(self):
     MAX_WORKGROUP = CL.cl_ctx.devices[0].max_work_group_size
diff --git a/openpilot/compile.py b/openpilot/compile.py
index 2e990253c7..065b1c5500 100644
--- a/openpilot/compile.py
+++ b/openpilot/compile.py
@@ -14,6 +14,7 @@ import onnx
import numpy as np

import tinygrad.graph as graph
+from tinygrad.ops import GlobalCounters
from tinygrad.llops.ops_gpu import CL
from extra.utils import fetch
@@ -91,6 +92,7 @@ def compile(dat, output_fn):
  tinygrad_out = next(iter(run_onnx(inputs).values()))

  # note, since CL.CACHE is enabled, it doesn't actually run the kernels
+  start_ops = GlobalCounters.global_ops
  CL.CACHE = []
  if using_graph: graph.GRAPH = True
  CL.kernel_count = -1
@@ -98,6 +100,7 @@ def compile(dat, output_fn):
  graph.GRAPH = False
  print("kernel count:", len(CL.CACHE))
  assert len(CL.CACHE) <= ALLOWED_KERNEL_COUNT or ALLOWED_KERNEL_COUNT == 0, "too many kernels!"
+  used_ops = GlobalCounters.global_ops - start_ops

  from extra.thneed import Thneed
  t = Thneed(CL.CACHE, {k:inputs[k].lazydata.realized.cl for k in inputs.keys()})
@@ -109,7 +112,8 @@ def compile(dat, output_fn):
    t.save(output_fn)

  print(f"buffers to save: {len(t.buffers_to_save)}, outputs: {t.outputs}")
-  t.run()
+  runtime = t.run()
+  print(f"network using {used_ops/1e9:.2f} GOPS with runtime {runtime*1e3:.2f} ms that's {used_ops/runtime*1e-9:.2f} GFLOPS")

  # confirm thneed found the right output
  thneed_out = np.empty((t.outputs[0].size//4,), dtype=np.float32).reshape(tinygrad_out.shape)
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index 1dfd8f199c..5c82f3cef2 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -9,7 +9,6 @@ from tinygrad.graph import log_op

# lazy can recurse a lot
sys.setrecursionlimit(10000)
-sys.tracebacklimit = 20

OPT = int(os.getenv("OPT", "1"))
NOCONV = int(os.getenv("NOCONV", "0"))
@@ -250,7 +249,7 @@ class LazyBuffer:

  def processing_op(self:LazyBuffer, op:ProcessingOps, w:LazyBuffer, C:ConvArgs) -> LazyBuffer:
    x = self
-    if IMAGE == 1:
+    if IMAGE >= 1:
      from accel.opencl.preprocessing import preprocessing_op, postprocessing_op  # type: ignore
      Cold = C
      x,w,C = preprocessing_op(x, w, Cold, False)