viz: early convert to cpu time (#11192)

This commit is contained in:
qazal
2025-07-12 17:19:41 +03:00
committed by GitHub
parent 12b04efd69
commit 6a9f059b21

View File

@@ -1,5 +1,6 @@
#!/usr/bin/env python3
import multiprocessing, pickle, difflib, os, threading, json, time, sys, webbrowser, socket, argparse, socketserver, functools, decimal, codecs
import multiprocessing, pickle, difflib, os, threading, json, time, sys, webbrowser, socket, argparse, socketserver, functools, codecs
from decimal import Decimal
from http.server import BaseHTTPRequestHandler
from urllib.parse import parse_qs, urlparse
from typing import Any, TypedDict, Generator
@@ -94,18 +95,19 @@ def get_details(ctx:TrackedGraphRewrite) -> Generator[GraphRewriteDetails, None,
# Profiler API
# per-device timestamp offsets, stored as (compute diff, copy diff); filled from ProfileDeviceEvent entries
device_ts_diffs:dict[str, tuple[Decimal, Decimal]] = {}
def cpu_ts_diff(device:str, thread=0) -> Decimal:
  """Return the offset that callers add to a timestamp recorded on `device`.

  `thread` indexes the stored pair: 0 selects the compute diff, 1 (or True, as callers pass `is_copy`)
  selects the copy diff. A device with no recorded diff gets a zero offset for either slot.
  """
  # the fallback must have both slots, a 1-tuple raises IndexError when thread is 1/True
  return device_ts_diffs.get(device, (Decimal(0), Decimal(0)))[thread]
DevEvent = ProfileRangeEvent|ProfileGraphEntry|ProfilePointEvent
def flatten_events(profile:list[ProfileEvent], devs) -> Generator[tuple[decimal.Decimal, decimal.Decimal|None, DevEvent], None, None]:
def flatten_events(profile:list[ProfileEvent]) -> Generator[tuple[Decimal, Decimal, DevEvent], None, None]:
for e in profile:
if isinstance(e, ProfileRangeEvent): yield (e.st, e.en, e)
if isinstance(e, ProfilePointEvent): yield (e.st, None, e)
if isinstance(e, ProfileGraphEvent):
if isinstance(e, ProfileRangeEvent): yield (e.st+(diff:=cpu_ts_diff(e.device, e.is_copy)), (e.en if e.en is not None else e.st)+diff, e)
elif isinstance(e, ProfilePointEvent): yield (e.st, e.st, e)
elif isinstance(e, ProfileGraphEvent):
cpu_ts = []
for ent in e.ents:
tdiff = devs[ent.device][ent.is_copy]
cpu_ts += [e.sigs[ent.st_id]+tdiff, e.sigs[ent.en_id]+tdiff]
for ent in e.ents: cpu_ts += [e.sigs[ent.st_id]+(diff:=cpu_ts_diff(ent.device, ent.is_copy)), e.sigs[ent.en_id]+diff]
yield (st:=min(cpu_ts)), (et:=max(cpu_ts)), ProfileRangeEvent(f"{e.ents[0].device.split(':')[0]} Graph", f"batched {len(e.ents)}", st, et)
for ent in e.ents: yield (e.sigs[ent.st_id], e.sigs[ent.en_id], ent)
for i,ent in enumerate(e.ents): yield (cpu_ts[i*2], cpu_ts[i*2+1], ent)
# timeline layout stacks events in a contiguous block. When a late starter finishes late, there is whitespace in the higher levels.
def timeline_layout(events:list[tuple[int, int, float, DevEvent]]) -> dict:
@@ -155,17 +157,14 @@ def mem_layout(events:list[tuple[int, int, float, DevEvent]]) -> dict:
def get_profile(profile:list[ProfileEvent]):
# start by getting the time diffs
devs = {e.device:(e.comp_tdiff, e.copy_tdiff if e.copy_tdiff is not None else e.comp_tdiff) for e in profile if isinstance(e,ProfileDeviceEvent)}
for ev in profile:
if isinstance(ev,ProfileDeviceEvent): device_ts_diffs[ev.device] = (ev.comp_tdiff, ev.copy_tdiff if ev.copy_tdiff is not None else ev.comp_tdiff)
# map events per device
dev_events:dict[str, list[tuple[int, int, float, DevEvent]]] = {}
min_ts:int|None = None
max_ts:int|None = None
for ts,en,e in flatten_events(profile, devs):
time_diff = devs[e.device][e.__dict__.get("is_copy",False)] if e.device in devs else decimal.Decimal(0)
# ProfilePointEvent records perf_counter, offset other events by GPU time diff
st = int(ts) if isinstance(e, ProfilePointEvent) else int(ts+time_diff)
et = st if en is None else int(en+time_diff)
dev_events.setdefault(e.device,[]).append((st, et, 0. if en is None else float(en-ts), e))
for ts,en,e in flatten_events(profile):
dev_events.setdefault(e.device,[]).append((st:=int(ts), et:=int(en), float(en-ts), e))
if min_ts is None or st < min_ts: min_ts = st
if max_ts is None or et > max_ts: max_ts = et
# return layout of per device events