From fb89d9a5844865afb996a69f2148936bc28d8da4 Mon Sep 17 00:00:00 2001
From: chenyu <chenyu@fastmail.com>
Date: Tue, 22 Apr 2025 07:43:51 -0400
Subject: [PATCH] retinanet eval combine output on GPUS[0] (#9966)

Eval time drops from 35 sec to 20 sec. It was spending 13 seconds assembling the output tensor on the CPU backend. GPUS[0] seems to have enough memory to hold the combined output; otherwise we can lower EVAL_BS.
---
 examples/mlperf/model_train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/mlperf/model_train.py b/examples/mlperf/model_train.py
index cc96095c93..85a0a502a0 100644
--- a/examples/mlperf/model_train.py
+++ b/examples/mlperf/model_train.py
@@ -410,7 +410,8 @@ def train_retinanet():
   @TinyJit
   def _eval_step(model, x, **kwargs):
     out = model(normalize(x, GPUS), **kwargs)
-    return out.realize()
+    # reassemble on GPUS[0] before sending back to CPU for speed
+    return out.to(GPUS[0]).realize()
 
   # ** hyperparameters **
   config["seed"] = SEED = getenv("SEED", random.SystemRandom().randint(0, 2**32 - 1))