diff --git a/extra/thneed.py b/extra/thneed.py
index b8cd0891f2..3c70faa6e3 100644
--- a/extra/thneed.py
+++ b/extra/thneed.py
@@ -277,15 +277,17 @@ class Thneed:
     print(f"submit in {(mt-st)*1000.0:.2f} ms, total runtime is {et*1000.0:.2f} ms")
 
     if DEBUGCL >= 1:
-      total_runtime = sum([(e.profile.end - e.profile.start) for e in events])/1e9
+      scale_total_runtime = sum([(e.profile.end - e.profile.start) for e in events])/1e9
       # TODO: Mac OS has a scaling issue, this hack fixes it
-      scale = 1 if (et/total_runtime) < 10 else (et/total_runtime)
+      scale = 1 if (et/scale_total_runtime) < 10 else (et/scale_total_runtime)
+      total_runtime = 0
       for i, ((prg, args), e) in enumerate(zip(self.cl_cache, events)):
         runtime = (e.profile.end - e.profile.start)*scale
         print(f"{i:3d} time {total_runtime/1e6:5.2f} ms running {prg.name:20s} with {str(args[0]):15s} {str(args[1]):15s} count {len(args)-2:2d} runtime {runtime/1e3:7.2f} us {(prg.op_estimate)/runtime:9.2f} GFLOPS {prg.options} -> {args[2].shape if hasattr(args[2], 'shape') else args[2].size}")
         if (DEBUGCL >= 2 and int(os.getenv("PRINT_KERNEL", "-1")) == i) or DEBUGCL >= 3:
           print(prg.prg)
-      print(f"total runtime: {total_runtime*1000.0:.2f} ms   wall time: {et*1000.0:.2f} ms")
+        total_runtime += runtime
+      print(f"total runtime: {total_runtime/1e6:.2f} ms   wall time: {et*1000.0:.2f} ms")
     return et
 
   def optimize_local_workgroup(self):