viz: sqtt touchups (#13228)

* viz: sqtt touchups * revert * matches
2026-01-08 22:48:25 -05:00 · 2025-11-12 22:40:37 +08:00
parent 7a6853fa40
commit af17e07251
2 changed files with 18 additions and 5 deletions
--- a/extra/sqtt/roc.py
+++ b/extra/sqtt/roc.py
@@ -35,7 +35,7 @@ class InstInfo:
  hit:int=0
  lat:int=0
  stall:int=0
-  def __str__(self): return f"{self.inst:>20} hits:{self.typ:>6} hits:{self.hit:>6} latency:{self.lat:>6} stall:{self.stall:>6}"
+  def __str__(self): return f"{self.inst:>20} type:{self.typ:>6} hits:{self.hit:>6} latency:{self.lat:>6} stall:{self.stall:>6}"

  def on_ev(self, ev):
    self.hit, self.lat, self.stall = self.hit + 1, self.lat + ev.duration, self.stall + ev.stall
@@ -61,6 +61,8 @@ class WaveExec:
  wave_id:int
  cu:int
  simd:int
+  begin_time:int
+  end_time:int
  insts:list[InstExec]

 class _ROCParseCtx:
@@ -99,7 +101,7 @@ class _ROCParseCtx:

    if ev.instructions_size > 0:
      self.wave_events[key:=PrgExec(unwrap(self.active_kern), ev.wave_id, ev.cu, ev.simd)] = asm
-      self.inst_execs.setdefault(key.name, []).append(WaveExec(ev.wave_id, ev.cu, ev.simd, inst_execs))
+      self.inst_execs.setdefault(key.name, []).append(WaveExec(ev.wave_id, ev.cu, ev.simd, ev.begin_time, ev.end_time, inst_execs))

 def decode(profile:list[ProfileEvent]) -> _ROCParseCtx:
  dev_events:dict[str, ProfileDeviceEvent] = {}
--- a/tinygrad/viz/serve.py
+++ b/tinygrad/viz/serve.py
@@ -217,12 +217,23 @@ def load_sqtt(profile:list[ProfileEvent]) -> None:
    if (r:=ref_map.get(name)): name = ctxs[r]["name"]
    steps.append({"name":name, "depth":0, "query":f"/render?ctx={len(ctxs)}&step={len(steps)}&fmt=counters",
                  "data":{"src":trace.keys[r].ret.src if r else name, "lang":"cpp"}})
+
+    # Idle:     The total time gap between the completion of the previous instruction and the beginning of the current instruction.
+    #           The idle time can be caused by:
+    #             * Arbiter loss
+    #             * Source or destination register dependency
+    #             * Instruction cache miss
+    # Stall:    The total number of cycles the hardware pipe couldn't issue an instruction.
+    # Duration: Total latency in cycles, defined as "Stall time + Issue time" for gfx9 or "Stall time + Execute time" for gfx10+.
    for w in waves:
-      rows = [(e.inst, e.time, e.time-(w.insts[i-1].time if i else 0), e.dur, e.stall, str(e.typ).split("_")[-1]) for i,e in enumerate(w.insts)]
-      summary = [{"label":"Total Cycles", "value":w.insts[-1].time-w.insts[0].time if w.insts else 0}, {"label":"CU", "value":w.cu},
+      rows, prev_instr = [], w.begin_time
+      for i,e in enumerate(w.insts):
+        rows.append((e.inst, e.time, max(0, e.time-prev_instr), e.dur, e.stall, str(e.typ).split("_")[-1]))
+        prev_instr = max(prev_instr, e.time + e.dur)
+      summary = [{"label":"Total Cycles", "value":w.end_time-w.begin_time}, {"label":"CU", "value":w.cu},
                 {"label":"SIMD", "value":w.simd}]
      steps.append({"name":f"Wave {w.wave_id}", "depth":1, "query":f"/render?ctx={len(ctxs)}&step={len(steps)}&fmt=counters",
-                    "data":{"rows":rows, "cols":["Instruction", "Clk", "Wait", "Duration", "Stall", "Type"], "summary":summary}})
+                    "data":{"rows":rows, "cols":["Instruction", "Clk", "Idle", "Duration", "Stall", "Type"], "summary":summary}})
  ctxs.append({"name":"Counters", "steps":steps})

 def get_profile(profile:list[ProfileEvent]) -> bytes|None: