From fb89d9a5844865afb996a69f2148936bc28d8da4 Mon Sep 17 00:00:00 2001 From: chenyu Date: Tue, 22 Apr 2025 07:43:51 -0400 Subject: [PATCH] retinanet eval combine output on GPUS[0] (#9966) Eval time drops from 35 sec to 20 sec: it was spending 13 seconds assembling the output tensor on the CPU backend. GPUS[0] seems to have enough memory; otherwise we can lower EVAL_BS. --- examples/mlperf/model_train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py index cc96095c93..85a0a502a0 100644 --- a/examples/mlperf/model_train.py +++ b/examples/mlperf/model_train.py @@ -410,7 +410,8 @@ def train_retinanet(): @TinyJit def _eval_step(model, x, **kwargs): out = model(normalize(x, GPUS), **kwargs) - return out.realize() + # reassemble on GPUS[0] before sending back to CPU for speed + return out.to(GPUS[0]).realize() # ** hyperparameters ** config["seed"] = SEED = getenv("SEED", random.SystemRandom().randint(0, 2**32 - 1))