sqtt: add occupancy events to the timeline (#13430)

This commit is contained in:
qazal
2025-11-24 22:28:05 +08:00
committed by GitHub
parent 63a931ff76
commit 2a9bd12700
4 changed files with 59 additions and 26 deletions

View File

@@ -38,11 +38,18 @@ class InstExec:
time:int
@dataclasses.dataclass(frozen=True)
class WaveExec:
# Fields that identify one wave slot; WaveExec and OccEvent both derive from this class.
class WaveSlot:
wave_id:int
cu:int
simd:int
se:int
# formatted "SE:<se> CU:<cu> SIMD:<simd>" string, without the wave id
@property
def simd_loc(self) -> str: return f"SE:{self.se} CU:{self.cu} SIMD:{self.simd}"
# simd_loc followed by " WAVE:<wave_id>"
@property
def wave_loc(self) -> str: return f"{self.simd_loc} WAVE:{self.wave_id}"
@dataclasses.dataclass(frozen=True)
class WaveExec(WaveSlot):
begin_time:int
end_time:int
insts:bytearray
@@ -53,11 +60,17 @@ class WaveExec:
inst_typ = rocprof.enum_rocprofiler_thread_trace_decoder_inst_category_t.get(inst.category)
yield InstExec(inst_typ, inst.pc.address, inst.stall, inst.duration, inst.time)
# One occupancy event for a wave slot: the WaveSlot fields plus a timestamp and a start flag.
@dataclasses.dataclass(frozen=True)
class OccEvent(WaveSlot):
# timestamp taken from the decoder's occupancy record (ev.time)
time:int
# truthy when the event marks a wave starting; a falsy event closes the range opened by the matching start
start:int
class _ROCParseCtx:
def __init__(self, dev_evs:dict[str, ProfileDeviceEvent], sqtt_evs:list[ProfileSQTTEvent], prog_evs:list[ProfileProgramEvent]):
self.dev_evs, self.sqtt_evs, self.prog_evs = dev_evs, iter(sqtt_evs), prog_evs
self.disasms:dict[str, dict[int, tuple[str, int]]] = {}
self.inst_execs:dict[str, list[WaveExec]] = {}
self.occ_events:dict[str, list[OccEvent]] = {}
for prog in prog_evs:
arch = "gfx%d%x%x" % ((trgt:=unwrap(dev_evs[prog.device].props)['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100)
@@ -73,9 +86,12 @@ class _ROCParseCtx:
# Record one decoded occupancy event in self.occ_events, keyed by the currently active kernel.
# wave_id, cu, simd, time and start come from ev; the SE comes from self.active_se.
def on_occupancy_ev(self, ev:rocprof.rocprofiler_thread_trace_decoder_occupancy_t):
if DEBUG >= 5: print(f"OCC {ev.time=} {self.active_se=} {ev.cu=} {ev.simd=} {ev.wave_id=} {ev.start=}")
# OccEvent is built positionally: wave_id, cu, simd, se, time, start
self.occ_events.setdefault(unwrap(self.active_kern), []).append(OccEvent(ev.wave_id, ev.cu, ev.simd, unwrap(self.active_se), ev.time, ev.start))
def on_wave_ev(self, ev:rocprof.rocprofiler_thread_trace_decoder_wave_t):
if DEBUG >= 5: print(f"WAVE {ev.wave_id=} {self.active_se=} {ev.cu=} {ev.simd=} {ev.contexts=} {ev.begin_time=} {ev.end_time=}")
# Skip wave events without instruction timings; occupancy events give the start and duration.
if ev.instructions_size == 0: return
insts_blob = bytearray(sz:=ev.instructions_size * ctypes.sizeof(rocprof.rocprofiler_thread_trace_decoder_inst_t))
ctypes.memmove((ctypes.c_char * sz).from_buffer(insts_blob), ev.instructions_array, sz)

View File

@@ -273,6 +273,7 @@
overflow-y: hidden;
white-space: nowrap;
display: flex;
min-height: 32px;
}
#device-list > div:hover {
background-color: rgba(20, 23, 35, 0.3);

View File

@@ -158,7 +158,7 @@ const formatUnit = (d, unit="") => d3.format(".3~s")(d)+unit;
const colorScheme = {TINY:new Map([["Schedule","#1b5745"],["get_program","#1d2e62"],["compile","#63b0cd"],["DEFAULT","#354f52"]]),
DEFAULT:["#2b2e39", "#2c2f3a", "#31343f", "#323544", "#2d303a", "#2e313c", "#343746", "#353847", "#3c4050", "#404459", "#444862", "#4a4e65"],
BUFFER:["#342483", "#3E2E94", "#4938A4", "#5442B4", "#5E4CC2", "#674FCA"], SE:["#2b2e39"],
BUFFER:["#342483", "#3E2E94", "#4938A4", "#5442B4", "#5E4CC2", "#674FCA"], SE:new Map([["OCC", "#101725"], ["INST", "#0A2042"]]),
CATEGORICAL:["#ff8080", "#F4A261", "#C8F9D4", "#8D99AE", "#F4A261", "#ffffa2", "#ffffc0", "#87CEEB"],}
// Pick the entry of `lst` at index `i`, wrapping around via modulo when `i` reaches the list length.
const cycleColors = (lst, i) => lst[i%lst.length];
@@ -206,7 +206,7 @@ async function renderProfiler(path, unit, opts) {
// layout once!
if (data != null && data.path === path) return updateProgress({ start:false });
// support non realtime x axis units
const formatTime = unit === "realtime" ? formatMicroseconds : (s) => `${s} ${unit}`;
const formatTime = unit === "realtime" ? formatMicroseconds : (s) => formatUnit(s, " "+unit);
const profiler = d3.select("#profiler").html("");
const buf = cache[path] ?? await fetchValue(path);
const view = new DataView(buf);
@@ -236,33 +236,36 @@ async function renderProfiler(path, unit, opts) {
for (let i=0; i<layoutsLen; i++) {
const nameLen = view.getUint8(offset, true); offset += 1;
const k = textDecoder.decode(new Uint8Array(buf, offset, nameLen)); offset += nameLen;
const div = deviceList.append("div").attr("id", k).text(k).style("padding", padding+"px").style("width", opts.width).style("min-height", opts.height);
const div = deviceList.append("div").attr("id", k).text(k).style("padding", padding+"px").style("width", opts.width);
const { y:baseY, height:baseHeight } = rect(div.node());
const colors = colorScheme[k.split(":")[0]] ?? colorScheme.DEFAULT;
const offsetY = baseY-canvasTop+padding/2;
const shapes = [], visible = [];
const eventType = u8(), eventsLen = u32();
if (eventType === EventTypes.EXEC) {
const levelHeight = baseHeight-padding;
const levelHeight = (baseHeight-padding)*(opts.heightScale ?? 1);
const levels = [];
data.tracks.set(k, { shapes, eventType, visible, offsetY, pcolor:"#9ea2ad" });
let colorKey, ref;
for (let j=0; j<eventsLen; j++) {
const e = {name:strings[u32()], ref:optional(u32()), key:optional(u32()), st:u32(), dur:f32(), info:strings[u32()] || null};
// find a free level to put the event
let depth = levels.findIndex(levelEt => e.st >= levelEt);
const et = e.st+Math.trunc(e.dur);
if (depth === -1) {
depth = levels.length;
levels.push(et);
} else levels[depth] = et;
let depth = 0;
if (opts.levelKey != null) { depth = opts.levelKey(e); levels[depth] = 0; }
else {
depth = levels.findIndex(levelEt => e.st >= levelEt);
const et = e.st+Math.trunc(e.dur);
if (depth === -1) {
depth = levels.length;
levels.push(et);
} else levels[depth] = et;
}
if (depth === 0) colorKey = e.name.split(" ")[0];
if (!colorMap.has(colorKey)) {
const color = colors instanceof Map ? (colors.get(colorKey) || colors.get("DEFAULT")) : cycleColors(colors, colorMap.size);
colorMap.set(colorKey, d3.rgb(color));
}
const base = colorMap.get(colorKey), s = Math.min(Math.pow(1/0.7, depth), 240 / Math.max(base.r, base.g, base.b));
const fillColor = d3.rgb(base.r*s, base.g*s, base.b*s).toString();
const fillColor = colorMap.get(colorKey).brighter(0.3*depth).toString();
const label = parseColors(e.name).map(({ color, st }) => ({ color, st, width:ctx.measureText(st).width }));
let shapeRef = e.ref;
if (shapeRef != null) { ref = {ctx:e.ref, step:0}; shapeRef = ref; }
@@ -286,7 +289,7 @@ async function renderProfiler(path, unit, opts) {
ctx:shapeRef?.ctx, step:shapeRef?.step };
if (e.key != null) shapeMap.set(e.key, arg);
// offset y by depth
shapes.push({x:e.st, y:levelHeight*depth, width:e.dur, height:levelHeight, arg, label, fillColor });
shapes.push({x:e.st, y:levelHeight*depth, width:e.dur, height:levelHeight, arg, label:opts.hideLabels ? null : label, fillColor });
}
div.style("height", levelHeight*levels.length+padding+"px").style("pointerEvents", "none");
} else {
@@ -472,7 +475,7 @@ async function renderProfiler(path, unit, opts) {
drawLine(ctx, [x, x], [0, canvas.clientHeight], { color:m.color });
ctx.fillText(m.name, x+2, 1);
}
for (const [p, color] of paths) { ctx.lineWidth = 1.4; ctx.strokeStyle = color; ctx.stroke(p); }
for (const [p, color] of paths) { ctx.strokeStyle = color; ctx.stroke(p); }
}
function resize() {
@@ -707,7 +710,7 @@ async function main() {
if (url.pathname+url.search !== ckey) e.close();
else if (e.readyState === EventSource.OPEN) activeSrc = e;
}
if (ctx.name === "Profiler") return renderProfiler("/get_profile", "realtime", { width:"132px", height:"32px" });
if (ctx.name === "Profiler") return renderProfiler("/get_profile", "realtime", { width:"132px" });
if (workerUrl == null) await initWorker();
if (ckey in cache) {
ret = cache[ckey];
@@ -715,7 +718,11 @@ async function main() {
// ** Disassembly view
if (!ckey.startsWith("/rewrites")) {
if (!(ckey in cache)) cache[ckey] = ret = await fetchValue(ckey);
if (ret instanceof ArrayBuffer) return renderProfiler(ckey, "clk", { height:"16px" }); // cycles on the x axis
// cycles on the x axis
if (ret instanceof ArrayBuffer) {
opts = {heightScale:0.5, hideLabels:true, levelKey:(e) => parseInt(e.name.split(" ")[1].split(":")[1])};
return renderProfiler(ckey, "clk", opts);
}
displaySelection("#custom");
metadata.innerHTML = "";
const root = d3.create("div").classed("raw-text", true).node();

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
import multiprocessing, pickle, difflib, os, threading, json, time, sys, webbrowser, socket, argparse, socketserver, functools, codecs, io, struct
import ctypes, pathlib, traceback
import ctypes, pathlib, traceback, itertools
from contextlib import redirect_stdout, redirect_stderr
from decimal import Decimal
from http.server import BaseHTTPRequestHandler
@@ -225,19 +225,28 @@ def load_sqtt(profile:list[ProfileEvent]) -> None:
if getenv("SQTT_PARSE"):
from extra.sqtt.attempt_sqtt_parse import parse_sqtt_print_packets
for e in sqtt_events: parse_sqtt_print_packets(e.blob)
if not rctx.inst_execs: return err("EMPTY SQTT OUTPUT", f"{len(sqtt_events)} SQTT events recorded, none got decoded")
if not any([rctx.inst_execs, rctx.occ_events]): return err("EMPTY SQTT OUTPUT", f"{len(sqtt_events)} SQTT events recorded, none got decoded")
steps:list[dict] = []
for name,disasm in rctx.disasms.items():
units:dict[str, int] = {}
events:list[ProfileEvent] = []
# wave instruction events
wave_insts:dict[str, dict] = {}
inst_units:dict[str, itertools.count] = {}
for w in rctx.inst_execs.get(name, []):
if (row:=f"SE:{w.se} CU:{w.cu} SIMD:{w.simd} WAVE:{w.wave_id}") not in units: units[row] = 0
units[row] += 1
events.append(ProfileRangeEvent(row, f"N:{units[row]}", Decimal(w.begin_time), Decimal(w.end_time)))
wave_insts[f"{row} N:{units[row]}"] = {"wave":w, "disasm":disasm, "run_number":units[row]}
# gather and sort all wave execs of this kernel
if (u:=w.wave_loc) not in inst_units: inst_units[u] = itertools.count(0)
n = next(inst_units[u])
events.append(ProfileRangeEvent(w.simd_loc, f"INST WAVE:{w.wave_id} N:{n}", Decimal(w.begin_time), Decimal(w.end_time)))
wave_insts[f"{u} N:{n}"] = {"wave":w, "disasm":disasm, "run_number":n}
# occupancy events
units:dict[str, itertools.count] = {}
wave_start:dict[str, int] = {}
for occ in rctx.occ_events[name]:
if (u:=occ.wave_loc) not in units: units[u] = itertools.count(0)
if u in inst_units: continue
if occ.start: wave_start[u] = occ.time
else: events.append(ProfileRangeEvent(occ.simd_loc, f"OCC WAVE:{occ.wave_id} N:{next(units[u])}", Decimal(wave_start.pop(u)),Decimal(occ.time)))
if not events: continue
# gather and sort all sqtt events for this kernel
events = [ProfilePointEvent(unit, "start", unit, ts=Decimal(0)) for unit in units]+events
kernel = trace.keys[r].ret if (r:=ref_map.get(name)) else None
steps.append(create_step(kernel.name if kernel is not None else name, ("/counters", len(ctxs), len(steps)),
@@ -275,7 +284,7 @@ def get_profile(profile:list[ProfileEvent], sort_fn:Callable[[str], Any]|None=No
(v:=dev_events[k]).sort(key=lambda e:e[0])
layout[k] = timeline_layout(v, start_ts, scache)
layout[f"{k} Memory"] = mem_layout(v, start_ts, unwrap(end_ts), peaks, dtype_size, scache)
groups = sorted(layout.items(), key=lambda x: '' if len(ss:=x[0].split(" ")) == 1 else ss[1])
groups = layout.items() if sort_fn is not None else sorted(layout.items(), key=lambda x: '' if len(ss:=x[0].split(" ")) == 1 else ss[1])
ret = [b"".join([struct.pack("<B", len(k)), k.encode(), v]) for k,v in groups if v is not None]
index = json.dumps({"strings":list(scache), "dtypeSize":dtype_size, "markers":[{"ts":int(e.ts-start_ts), **e.arg} for e in markers]}).encode()
return struct.pack("<IQII", unwrap(end_ts)-start_ts, max(peaks,default=0), len(index), len(ret))+index+b"".join(ret)