mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-07 03:00:26 -04:00
assert benchmark times (#12042)
* assert jitted times in openpilot
* better error
* better error
* add ASSERT_MIN_STEP_TIME to more models
* t is step_times
* update benchmark times
* update times
This commit is contained in:
@@ -181,6 +181,7 @@ class GPT2:
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
def generate(self, prompt:str, max_length:int, temperature:float, timing:bool=False, batch_size:int=1):
|
||||
step_times = []
|
||||
prompt_tokens = self.tokenizer.encode(prompt, allowed_special={"<|endoftext|>"})
|
||||
toks = [prompt_tokens[:] for _ in range(batch_size)]
|
||||
start_pos = 0
|
||||
@@ -197,8 +198,13 @@ class GPT2:
|
||||
else:
|
||||
tokens = Tensor([x[start_pos:] for x in toks])
|
||||
tok = self.model(tokens, Variable("start_pos", 1 if start_pos else 0, MAX_CONTEXT-1).bind(start_pos), temperature).tolist()
|
||||
step_times.append((GlobalCounters.time_sum_s-st)*1e3)
|
||||
start_pos = len(toks[0])
|
||||
for i,t in enumerate(tok): toks[i].append(t)
|
||||
|
||||
if (assert_time:=getenv("ASSERT_MIN_STEP_TIME")):
|
||||
min_time = min(step_times)
|
||||
assert min_time < assert_time, f"Speed regression, expected min step time of < {assert_time} ms but took: {min_time} ms"
|
||||
return [self.tokenizer.decode(x) for x in toks]
|
||||
|
||||
# **** main code ****
|
||||
|
||||
@@ -355,7 +355,7 @@ def train_cifar():
|
||||
|
||||
# https://www.anandtech.com/show/16727/nvidia-announces-geforce-rtx-3080-ti-3070-ti-upgraded-cards-coming-in-june
|
||||
# 136 TFLOPS is the theoretical max w float16 on 3080 Ti
|
||||
|
||||
step_times = []
|
||||
model_ema: Optional[modelEMA] = None
|
||||
projected_ema_decay_val = hyp['ema']['decay_base'] ** hyp['ema']['every_n_steps']
|
||||
i = 0
|
||||
@@ -413,12 +413,17 @@ def train_cifar():
|
||||
model_ema.update(model, Tensor([projected_ema_decay_val*(i/STEPS)**hyp['ema']['decay_pow']]))
|
||||
|
||||
cl = time.monotonic()
|
||||
step_times.append((cl-st)*1000.0)
|
||||
device_str = loss.device if isinstance(loss.device, str) else f"{loss.device[0]} * {len(loss.device)}"
|
||||
# 53 221.74 ms run, 2.22 ms python, 219.52 ms CL, 803.39 loss, 0.000807 LR, 4.66 GB used, 3042.49 GFLOPS, 674.65 GOPS
|
||||
print(f"{i:3d} {(cl-st)*1000.0:7.2f} ms run, {(et-st)*1000.0:7.2f} ms python, {(cl-et)*1000.0:7.2f} ms {device_str}, {loss_cpu:7.2f} loss, {opt_non_bias.lr.numpy()[0]:.6f} LR, {GlobalCounters.mem_used/1e9:.2f} GB used, {GlobalCounters.global_ops*1e-9/(cl-st):9.2f} GFLOPS, {GlobalCounters.global_ops*1e-9:9.2f} GOPS")
|
||||
st = cl
|
||||
i += 1
|
||||
|
||||
if (assert_time:=getenv("ASSERT_MIN_STEP_TIME")):
|
||||
min_time = min(step_times)
|
||||
assert min_time < assert_time, f"Speed regression, expected min step time of < {assert_time} ms but took: {min_time} ms"
|
||||
|
||||
# verify eval acc
|
||||
if target := getenv("TARGET_EVAL_ACC_PCT", 0.0):
|
||||
if eval_acc_pct >= target:
|
||||
|
||||
@@ -252,6 +252,10 @@ def train_resnet():
|
||||
print(f"epoch global_ops: {steps_in_train_epoch * GlobalCounters.global_ops:_}, "
|
||||
f"epoch global_mem: {steps_in_train_epoch * GlobalCounters.global_mem:_}")
|
||||
# if we are doing beam search, run the first eval too
|
||||
if (assert_time:=getenv("ASSERT_MIN_STEP_TIME")):
|
||||
min_time = min(step_times)
|
||||
assert min_time < assert_time, f"Speed regression, expected min step time of < {assert_time} ms but took: {min_time} ms"
|
||||
|
||||
if (TRAIN_BEAM or EVAL_BEAM) and e == start_epoch: break
|
||||
return
|
||||
if MLLOGGER and RUNMLPERF:
|
||||
@@ -344,6 +348,8 @@ def train_resnet():
|
||||
print(f"saving ckpt to {fn}")
|
||||
safe_save(get_training_state(model, optimizer_group, scheduler_group), fn)
|
||||
|
||||
|
||||
|
||||
def train_retinanet():
|
||||
from contextlib import redirect_stdout
|
||||
from examples.mlperf.dataloader import batch_load_retinanet
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
from tinygrad import Tensor, TinyJit, dtypes, GlobalCounters
|
||||
from tinygrad.nn import Conv2d, GroupNorm
|
||||
from tinygrad.nn.state import safe_load, load_state_dict
|
||||
from tinygrad.helpers import fetch, trange, colored, Timing
|
||||
from tinygrad.helpers import fetch, trange, colored, Timing, getenv
|
||||
from extra.models.clip import Embedder, FrozenClosedClipEmbedder, FrozenOpenClipEmbedder
|
||||
from extra.models.unet import UNetModel, Upsample, Downsample, timestep_embedding
|
||||
from extra.bench_log import BenchEvent, WallTimeEvent
|
||||
@@ -14,7 +14,7 @@ from examples.stable_diffusion import ResnetBlock, Mid
|
||||
import numpy as np
|
||||
|
||||
from typing import Dict, List, Callable, Optional, Any, Set, Tuple, Union, Type
|
||||
import argparse, tempfile
|
||||
import argparse, tempfile, time
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
@@ -342,11 +342,13 @@ class DPMPP2MSampler:
|
||||
sigmas = self.discretization(num_steps).to(x.device)
|
||||
x *= Tensor.sqrt(1.0 + sigmas[0] ** 2.0)
|
||||
num_sigmas = len(sigmas)
|
||||
step_times = []
|
||||
|
||||
old_denoised = None
|
||||
for i in trange(num_sigmas - 1):
|
||||
with Timing("step in ", enabled=timing, on_exit=lambda _: f", using {GlobalCounters.mem_used/1e9:.2f} GB"):
|
||||
GlobalCounters.reset()
|
||||
st = time.perf_counter_ns()
|
||||
with WallTimeEvent(BenchEvent.STEP):
|
||||
x, old_denoised = self.sampler_step(
|
||||
old_denoised=old_denoised,
|
||||
@@ -358,8 +360,13 @@ class DPMPP2MSampler:
|
||||
c=c,
|
||||
uc=uc,
|
||||
)
|
||||
step_times.append(t:=(time.perf_counter_ns() - st)*1e-6)
|
||||
x.realize(old_denoised)
|
||||
|
||||
if (assert_time:=getenv("ASSERT_MIN_STEP_TIME")):
|
||||
min_time = min(step_times)
|
||||
assert min_time < assert_time, f"Speed regression, expected min step time of < {assert_time} ms but took: {min_time} ms"
|
||||
|
||||
return x
|
||||
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# https://github.com/ekagra-ranjan/huggingface-blog/blob/main/stable_diffusion.md
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
import argparse, time
|
||||
from collections import namedtuple
|
||||
from typing import Dict, Any
|
||||
|
||||
@@ -266,17 +266,23 @@ if __name__ == "__main__":
|
||||
def run(model, *x): return model(*x).realize()
|
||||
|
||||
# this is diffusion
|
||||
step_times = []
|
||||
with Context(BEAM=getenv("LATEBEAM")):
|
||||
for index, timestep in (t:=tqdm(list(enumerate(timesteps))[::-1])):
|
||||
GlobalCounters.reset()
|
||||
st = time.perf_counter_ns()
|
||||
t.set_description("%3d %3d" % (index, timestep))
|
||||
with Timing("step in ", enabled=args.timing, on_exit=lambda _: f", using {GlobalCounters.mem_used/1e9:.2f} GB"):
|
||||
with WallTimeEvent(BenchEvent.STEP):
|
||||
tid = Tensor([index])
|
||||
latent = run(model, unconditional_context, context, latent, Tensor([timestep]), alphas[tid], alphas_prev[tid], Tensor([args.guidance]))
|
||||
if args.timing: Device[Device.DEFAULT].synchronize()
|
||||
step_times.append((time.perf_counter_ns() - st)*1e-6)
|
||||
del run
|
||||
|
||||
if (assert_time:=getenv("ASSERT_MIN_STEP_TIME")):
|
||||
min_time = min(step_times)
|
||||
assert min_time < assert_time, f"Speed regression, expected min step time of < {assert_time} ms but took: {min_time} ms"
|
||||
# upsample latent space to image with autoencoder
|
||||
x = model.decode(latent)
|
||||
print(x.shape)
|
||||
|
||||
Reference in New Issue
Block a user