assert benchmark times (#12042)

* assert jitted times in openpilot

* better error

* better error

* add ASSERT_MIN_STEP_TIME to more models

* t is step_times

* update benchmark times

* update times
This commit is contained in:
Sieds Lykles
2025-09-09 23:40:02 +02:00
committed by GitHub
parent 58d13a6e3e
commit 5b73076e48
7 changed files with 67 additions and 31 deletions

View File

@@ -181,6 +181,7 @@ class GPT2:
self.tokenizer = tokenizer
def generate(self, prompt:str, max_length:int, temperature:float, timing:bool=False, batch_size:int=1):
step_times = []
prompt_tokens = self.tokenizer.encode(prompt, allowed_special={"<|endoftext|>"})
toks = [prompt_tokens[:] for _ in range(batch_size)]
start_pos = 0
@@ -197,8 +198,13 @@ class GPT2:
else:
tokens = Tensor([x[start_pos:] for x in toks])
tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
step_times.append((GlobalCounters.time_sum_s-st)*1e3)
start_pos = len(toks[0])
for i,t in enumerate(tok): toks[i].append(t)
if (assert_time:=getenv("ASSERT_MIN_STEP_TIME")):
min_time = min(step_times)
assert min_time < assert_time, f"Speed regression, expected min step time of < {assert_time} ms but took: {min_time} ms"
return [self.tokenizer.decode(x) for x in toks]
# **** main code ****

View File

@@ -355,7 +355,7 @@ def train_cifar():
# https://www.anandtech.com/show/16727/nvidia-announces-geforce-rtx-3080-ti-3070-ti-upgraded-cards-coming-in-june
# 136 TFLOPS is the theoretical max with float16 on 3080 Ti
step_times = []
model_ema: Optional[modelEMA] = None
projected_ema_decay_val = hyp['ema']['decay_base'] ** hyp['ema']['every_n_steps']
i = 0
@@ -413,12 +413,17 @@ def train_cifar():
model_ema.update(model, Tensor([projected_ema_decay_val*(i/STEPS)**hyp['ema']['decay_pow']]))
cl = time.monotonic()
step_times.append((cl-st)*1000.0)
device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
# 53 221.74 ms run, 2.22 ms python, 219.52 ms CL, 803.39 loss, 0.000807 LR, 4.66 GB used, 3042.49 GFLOPS, 674.65 GOPS
print(f"{i:3d} {(cl-st)*1000.0:7.2f} ms run, {(et-st)*1000.0:7.2f} ms python, {(cl-et)*1000.0:7.2f} ms {device_str}, {loss_cpu:7.2f} loss, {opt_non_bias.lr.numpy()[0]:.6f} LR, {GlobalCounters.mem_used/1e9:.2f} GB used, {GlobalCounters.global_ops*1e-9/(cl-st):9.2f} GFLOPS, {GlobalCounters.global_ops*1e-9:9.2f} GOPS")
st = cl
i += 1
if (assert_time:=getenv("ASSERT_MIN_STEP_TIME")):
min_time = min(step_times)
assert min_time < assert_time, f"Speed regression, expected min step time of < {assert_time} ms but took: {min_time} ms"
# verify eval acc
if target := getenv("TARGET_EVAL_ACC_PCT", 0.0):
if eval_acc_pct >= target:

View File

@@ -252,6 +252,10 @@ def train_resnet():
print(f"epoch global_ops: {steps_in_train_epoch * GlobalCounters.global_ops:_}, "
f"epoch global_mem: {steps_in_train_epoch * GlobalCounters.global_mem:_}")
# if we are doing beam search, run the first eval too
if (assert_time:=getenv("ASSERT_MIN_STEP_TIME")):
min_time = min(step_times)
assert min_time < assert_time, f"Speed regression, expected min step time of < {assert_time} ms but took: {min_time} ms"
if (TRAIN_BEAM or EVAL_BEAM) and e == start_epoch: break
return
if MLLOGGER and RUNMLPERF:
@@ -344,6 +348,8 @@ def train_resnet():
print(f"saving ckpt to {fn}")
safe_save(get_training_state(model, optimizer_group, scheduler_group), fn)
def train_retinanet():
from contextlib import redirect_stdout
from examples.mlperf.dataloader import batch_load_retinanet

View File

@@ -6,7 +6,7 @@
from tinygrad import Tensor, TinyJit, dtypes, GlobalCounters
from tinygrad.nn import Conv2d, GroupNorm
from tinygrad.nn.state import safe_load, load_state_dict
from tinygrad.helpers import fetch, trange, colored, Timing
from tinygrad.helpers import fetch, trange, colored, Timing, getenv
from extra.models.clip import Embedder, FrozenClosedClipEmbedder, FrozenOpenClipEmbedder
from extra.models.unet import UNetModel, Upsample, Downsample, timestep_embedding
from extra.bench_log import BenchEvent, WallTimeEvent
@@ -14,7 +14,7 @@ from examples.stable_diffusion import ResnetBlock, Mid
import numpy as np
from typing import Dict, List, Callable, Optional, Any, Set, Tuple, Union, Type
import argparse, tempfile
import argparse, tempfile, time
from abc import ABC, abstractmethod
from pathlib import Path
from PIL import Image
@@ -342,11 +342,13 @@ class DPMPP2MSampler:
sigmas = self.discretization(num_steps).to(x.device)
x *= Tensor.sqrt(1.0 + sigmas[0] ** 2.0)
num_sigmas = len(sigmas)
step_times = []
old_denoised = None
for i in trange(num_sigmas - 1):
with Timing("step in ", enabled=timing, on_exit=lambda _: f", using {GlobalCounters.mem_used/1e9:.2f} GB"):
GlobalCounters.reset()
st = time.perf_counter_ns()
with WallTimeEvent(BenchEvent.STEP):
x, old_denoised = self.sampler_step(
old_denoised=old_denoised,
@@ -358,8 +360,13 @@ class DPMPP2MSampler:
c=c,
uc=uc,
)
step_times.append(t:=(time.perf_counter_ns() - st)*1e-6)
x.realize(old_denoised)
if (assert_time:=getenv("ASSERT_MIN_STEP_TIME")):
min_time = min(step_times)
assert min_time < assert_time, f"Speed regression, expected min step time of < {assert_time} ms but took: {min_time} ms"
return x

View File

@@ -2,7 +2,7 @@
# https://github.com/ekagra-ranjan/huggingface-blog/blob/main/stable_diffusion.md
import tempfile
from pathlib import Path
import argparse
import argparse, time
from collections import namedtuple
from typing import Dict, Any
@@ -266,17 +266,23 @@ if __name__ == "__main__":
def run(model, *x): return model(*x).realize()
# this is diffusion
step_times = []
with Context(BEAM=getenv("LATEBEAM")):
for index, timestep in (t:=tqdm(list(enumerate(timesteps))[::-1])):
GlobalCounters.reset()
st = time.perf_counter_ns()
t.set_description("%3d %3d" % (index, timestep))
with Timing("step in ", enabled=args.timing, on_exit=lambda _: f", using {GlobalCounters.mem_used/1e9:.2f} GB"):
with WallTimeEvent(BenchEvent.STEP):
tid = Tensor([index])
latent = run(model, unconditional_context, context, latent, Tensor([timestep]), alphas[tid], alphas_prev[tid], Tensor([args.guidance]))
if args.timing: Device[Device.DEFAULT].synchronize()
step_times.append((time.perf_counter_ns() - st)*1e-6)
del run
if (assert_time:=getenv("ASSERT_MIN_STEP_TIME")):
min_time = min(step_times)
assert min_time < assert_time, f"Speed regression, expected min step time of < {assert_time} ms but took: {min_time} ms"
# upsample latent space to image with autoencoder
x = model.decode(latent)
print(x.shape)