diff --git a/extra/thneed.py b/extra/thneed.py index b8cd0891f2..3c70faa6e3 100644 --- a/extra/thneed.py +++ b/extra/thneed.py @@ -277,15 +277,17 @@ class Thneed: print(f"submit in {(mt-st)*1000.0:.2f} ms, total runtime is {et*1000.0:.2f} ms") if DEBUGCL >= 1: - total_runtime = sum([(e.profile.end - e.profile.start) for e in events])/1e9 + scale_total_runtime = sum([(e.profile.end - e.profile.start) for e in events])/1e9 # TODO: Mac OS has a scaling issue, this hack fixes it - scale = 1 if (et/total_runtime) < 10 else (et/total_runtime) + scale = 1 if (et/scale_total_runtime) < 10 else (et/scale_total_runtime) + total_runtime = 0 for i, ((prg, args), e) in enumerate(zip(self.cl_cache, events)): runtime = (e.profile.end - e.profile.start)*scale print(f"{i:3d} time {total_runtime/1e6:5.2f} ms running {prg.name:20s} with {str(args[0]):15s} {str(args[1]):15s} count {len(args)-2:2d} runtime {runtime/1e3:7.2f} us {(prg.op_estimate)/runtime:9.2f} GFLOPS {prg.options} -> {args[2].shape if hasattr(args[2], 'shape') else args[2].size}") if (DEBUGCL >= 2 and int(os.getenv("PRINT_KERNEL", "-1")) == i) or DEBUGCL >= 3: print(prg.prg) - print(f"total runtime: {total_runtime*1000.0:.2f} ms wall time: {et*1000.0:.2f} ms") + total_runtime += runtime + print(f"total runtime: {total_runtime/1e6:.2f} ms wall time: {et*1000.0:.2f} ms") return et def optimize_local_workgroup(self):