From 897254ad6caa10dc4db4d012f88de47600c4656b Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Tue, 2 Sep 2025 15:22:44 +0300 Subject: [PATCH] ci: add dev<->cpu copy speeds (#11959) --- .github/workflows/benchmark.yml | 8 ++++++++ test/speed/external_test_copy_speed.py | 18 ++++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index d01bb69314..af30b3b19a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -688,6 +688,10 @@ jobs: run: DEBUG=2 AMD=1 python -m pytest -rA test/test_tiny.py - name: Test DISK copy time run: AMD=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py + - name: Test CPU copy time + run: | + AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit + AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit - name: Run full CIFAR training w 1 GPU run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee am_train_cifar_one_gpu.txt # TODO: enable @@ -745,6 +749,10 @@ jobs: run: NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops - name: Test DISK copy time run: NV=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py + - name: Test CPU copy time + run: | + NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit + NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit - name: Test LLAMA-3 run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --benchmark --temperature 0 | tee nv_llama3_beam.txt - name: Run full CIFAR training w 1 GPU diff --git a/test/speed/external_test_copy_speed.py b/test/speed/external_test_copy_speed.py index 391a4da0c6..351c7d993a 100644 --- a/test/speed/external_test_copy_speed.py +++ b/test/speed/external_test_copy_speed.py @@ -1,9 +1,9 @@ import unittest, numpy as np from tinygrad import Tensor, Device, TinyJit -from tinygrad.helpers import Timing, CI, OSX +from tinygrad.helpers import Timing, CI, OSX, getenv import multiprocessing.shared_memory as shared_memory -N = 256 +N = getenv("NSZ", 256) class TestCopySpeed(unittest.TestCase): @classmethod def setUpClass(cls): Device[Device.DEFAULT].synchronize() @@ -54,22 +54,24 @@ class TestCopySpeed(unittest.TestCase): @TinyJit def _do_copy(t): return t.to('CPU').realize() - t = Tensor.randn(N, N, 4).contiguous().realize() + t = Tensor.randn(N, N).contiguous().realize() + Device[Device.DEFAULT].synchronize() for _ in range(5): - with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): + with Timing(f"copy {Device.DEFAULT} -> CPU {t.nbytes()/(1024**2)}M: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): x = _do_copy(t) Device[Device.DEFAULT].synchronize() np.testing.assert_equal(t.numpy(), x.numpy()) - def testCopytoCPUtoDefaultJit(self): + def testCopyCPUtoDefaultJit(self): if Device.DEFAULT == "CPU": return unittest.skip("CPU to CPU copy is a no-op") @TinyJit - def _do_copy(x): return t.to(Device.DEFAULT).realize() + def _do_copy(x): return x.to(Device.DEFAULT).realize() for _ in range(5): - t = Tensor.randn(N, N, 4, device="CPU").contiguous().realize() - with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): + t = Tensor.randn(N, N, device="CPU").contiguous().realize() + Device["CPU"].synchronize() + with Timing(f"copy CPU -> {Device.DEFAULT} {t.nbytes()/(1024**2)}M: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): x = _do_copy(t) Device[Device.DEFAULT].synchronize() np.testing.assert_equal(t.numpy(), x.numpy())