Mirror of https://github.com/tinygrad/tinygrad.git
retinanet eval combine output on GPUS[0] (#9966)
eval 35 sec -> 20 sec. It was spending 13 seconds assembling the output tensor on the CPU backend. GPUS[0] seems to have enough memory; otherwise we can lower EVAL_BS.
@@ -410,7 +410,8 @@ def train_retinanet():
   @TinyJit
   def _eval_step(model, x, **kwargs):
     out = model(normalize(x, GPUS), **kwargs)
-    return out.realize()
+    # reassemble on GPUS[0] before sending back to CPU for speed
+    return out.to(GPUS[0]).realize()
 
   # ** hyperparameters **
   config["seed"] = SEED = getenv("SEED", random.SystemRandom().randint(0, 2**32 - 1))
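The general pattern, as a minimal sketch assuming tinygrad's Tensor.shard / Tensor.to API (the GPUS tuple, tensor shape, and variable names below are illustrative, not taken from the training script):

from tinygrad import Tensor, Device

# hypothetical multi-GPU setup; the real script builds GPUS from env vars
GPUS = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))

# stand-in for the sharded model output produced inside _eval_step
out = Tensor.rand(8, 1000).shard(GPUS, axis=0)

# slow path: pulling the sharded tensor straight to the host makes the CPU
# backend reassemble the shards (the ~13 seconds the commit message describes)
# host_out = out.numpy()

# faster path from the commit: gather the shards on GPUS[0] first,
# then do a single device-to-host copy
host_out = out.to(GPUS[0]).realize().numpy()
print(host_out.shape)  # (8, 1000)

The trade-off noted in the commit message: GPUS[0] must hold the full eval batch, so EVAL_BS can be lowered if it runs out of memory.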