tinygrad/test/external/mlperf_stable_diffusion/external_test_eval.py
Eval for Stable Diffusion mlperf (#12316)

import unittest, os
import numpy as np
from pathlib import Path
from tempfile import TemporaryDirectory
from tinygrad import Device, Tensor
from tinygrad.helpers import getenv, Context
from tinygrad.nn.state import safe_save, torch_load, get_parameters
from examples.mlperf.model_eval import eval_stable_diffusion, vae_decode
from examples.stable_diffusion import AutoencoderKL

def set_eval_params():
  # override these as needed from cli
  for k,v in {"MODEL": "stable_diffusion", "GPUS": "8", "EVAL_SAMPLES": "600", "CONTEXT_BS": "816", "DENOISE_BS": "600", "DECODE_BS": "384",
              "INCEPTION_BS": "560", "CLIP_BS": "240", "DATADIR": "/raid/datasets/stable_diffusion", "CKPTDIR": "/raid/weights/stable_diffusion",
              "AMD_LLVM": "0"}.items():
    os.environ[k] = getenv(k, v)
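
# example invocation (illustrative values; any of the defaults above can be overridden from the shell):
#   GPUS=1 EVAL_SAMPLES=8 CKPTDIR=/path/to/weights python3 test/external/mlperf_stable_diffusion/external_test_eval.py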

class TestEval(unittest.TestCase):
  def test_eval_ckpt(self):
    set_eval_params()
    with TemporaryDirectory(prefix="test-eval") as tmp:
      os.environ["EVAL_CKPT_DIR"] = tmp
      # NOTE: Although this checkpoint holds the original fully trained model from StabilityAI, the mlperf code here uses a different
      # GroupNorm num_groups, so eval results may not match eval results on the original model.
      # The purpose of using this checkpoint is to have reproducible eval outputs.
      # The eval code expects file and weight names in a specific format, as .safetensors (not .ckpt), which is why we resave the checkpoint.
      sd_v2 = torch_load(Path(getenv("CKPTDIR", "")) / "sd" / "512-base-ema.ckpt")["state_dict"]
      sd_v2 = {k.replace("model.diffusion_model.", "", 1): v for k,v in sd_v2.items() if k.startswith("model.diffusion_model.")}
      safe_save(sd_v2, f"{tmp}/0.safetensors")
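      # run the eval on the checkpoint just saved to EVAL_CKPT_DIR; returns CLIP score, FID, and the checkpoint number that was evaluated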
      clip, fid, ckpt = eval_stable_diffusion()
      assert ckpt == 0
      if Device.DEFAULT == "NULL":
        assert clip == 0
        assert fid > 0 and fid < 1000
      else:
        # observed:
        # clip=0.08369670808315277, fid=301.05236173709545 (if SEED=12345, commit=c01b2c93076e80ae6d1ebca64bb8e83a54dadba6)
        # clip=0.08415728807449341, fid=300.3710877072948 (if SEED=12345, commit=179c7fcfe132f1a6344b57c9d8cef4eded586867)
        # clip=0.0828116238117218, fid=301.241909555543 (if SEED=98765, commit=c01b2c93076e80ae6d1ebca64bb8e83a54dadba6)
        np.testing.assert_allclose(fid, 301.147, rtol=0.1, atol=0)
        np.testing.assert_allclose(clip, 0.08325, rtol=0.1, atol=0)

  # only tested on an 8xMI300X system
  @unittest.skipUnless(getenv("HANG_OK"), "expected to hang")
  def test_decoder_beam_hang(self):
    set_eval_params()
    for k,v in {"BEAM": "2", "HCQDEV_WAIT_TIMEOUT_MS": "300000", "BEAM_UOPS_MAX": "8000", "BEAM_UPCAST_MAX": "256", "BEAM_LOCAL_MAX": "1024",
                "BEAM_MIN_PROGRESS": "5", "IGNORE_JIT_FIRST_BEAM": "1"}.items():
      os.environ[k] = getenv(k, v)
    with Context(BEAM=int(os.environ["BEAM"])): # necessary because helpers.py has already set BEAM=0 and cached getenv for "BEAM"
      GPUS = [f"{Device.DEFAULT}:{i}" for i in range(getenv("GPUS", 8))]
      vae = AutoencoderKL()
      for p in get_parameters(vae): p.to_(GPUS).realize()
      x = Tensor.zeros(48,4,64,64).contiguous().to(GPUS).realize()
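      # manually set the multi-device axis to 0 (shard on the batch dim) before calling vae_decode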
      x.uop = x.uop.multi(0)
      for _ in range(2): vae_decode(x, vae)

if __name__=="__main__":
  unittest.main()