From 0599e8618643d9c40c692a1b06fd6cd48a933135 Mon Sep 17 00:00:00 2001
From: chenyu
Date: Wed, 10 Sep 2025 13:56:40 -0400
Subject: [PATCH] replace hardcoded GPU in llama debug msg (#12102)

---
 examples/gpt2.py   | 2 +-
 examples/llama.py  | 2 +-
 examples/llama3.py | 4 ++--
 examples/qwq.py    | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/gpt2.py b/examples/gpt2.py
index de577e911f..6670b4e2bb 100644
--- a/examples/gpt2.py
+++ b/examples/gpt2.py
@@ -189,7 +189,7 @@ class GPT2:
       GlobalCounters.reset()
       if timing: print("")
       st = GlobalCounters.time_sum_s
-      with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
+      with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "")+
                  f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
                  (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=timing):
        with WallTimeEvent(BenchEvent.STEP):
diff --git a/examples/llama.py b/examples/llama.py
index 42f9b6e57b..6739ca4c56 100755
--- a/examples/llama.py
+++ b/examples/llama.py
@@ -478,7 +478,7 @@ After you are done speaking, output [EOS]. You are not Chad.
     with Profiling(enabled=args.profile):
       with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
         with WallTimeEvent(BenchEvent.STEP):
-          with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
+          with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "")+
                      f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
                      (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
            tok_tensor = llama.model(next_tok, start_pos, args.temperature)
diff --git a/examples/llama3.py b/examples/llama3.py
index 9664f491f2..d7c7f2c921 100644
--- a/examples/llama3.py
+++ b/examples/llama3.py
@@ -441,7 +441,7 @@ if __name__ == "__main__":
       with Profiling(enabled=args.profile):
         with Timing("total ", on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
           with WallTimeEvent(BenchEvent.STEP):
-            with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
+            with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "")+
                        f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
                        (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None):
              tok = model(Tensor([[last_tok]], device=device), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P)
@@ -479,7 +479,7 @@ if __name__ == "__main__":
       st = GlobalCounters.time_sum_s
       with Profiling(enabled=args.profile):
         with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
-          with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
+          with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "")+
                      f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
                      (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
diff --git a/examples/qwq.py b/examples/qwq.py
index fad87695bd..b3b03065dd 100644
--- a/examples/qwq.py
+++ b/examples/qwq.py
@@ -8,7 +8,7 @@ from typing import Dict, Union
 from extra.models.llama import Transformer, convert_from_huggingface, fix_bf16
 from examples.llama3 import load
-from tinygrad import nn, Tensor
+from tinygrad import nn, Tensor, Device
 from tinygrad.helpers import fetch, colored, GlobalCounters, Timing, DEBUG
 from tinygrad.nn.state import load_state_dict, get_parameters
@@ -80,7 +80,7 @@ if __name__ == "__main__":
     st = GlobalCounters.time_sum_s
     next_tok = Tensor([toks[start_pos:]]) if tok_tensor is None or (len(toks)-start_pos) > 1 else tok_tensor.reshape(1, 1)
     with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
-      with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "") +
+      with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "") +
                   f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB" +
                   (f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
        tok_tensor = transformer(next_tok, start_pos, args.temperature)
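
For context, a minimal, self-contained sketch (not part of the patch) of the pattern these hunks change: the Timing on_exit callback now reports the active backend via Device.DEFAULT instead of a hardcoded "GPU" string. It uses only the tinygrad helpers already visible in the diff (Timing, GlobalCounters, DEBUG, Device); the "ran in " prefix and the small matmul workload are illustrative, not taken from the examples.

# sketch: report the active tinygrad backend in a debug timing message
from tinygrad import Tensor, Device
from tinygrad.helpers import GlobalCounters, Timing, DEBUG

if __name__ == "__main__":
  GlobalCounters.reset()
  st = GlobalCounters.time_sum_s
  # with DEBUG>=2 the accumulated kernel time is shown, tagged with the default device
  # (e.g. METAL, CUDA, CPU) rather than a hardcoded "GPU"
  with Timing("ran in ", on_exit=(lambda et:
      (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "") +
      f", {GlobalCounters.global_ops*1e-9:.2f} GOPS") if DEBUG else None):
    (Tensor.rand(256, 256) @ Tensor.rand(256, 256)).realize()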