ci: add dev<->cpu copy speeds (#11959)

2026-01-09 15:08:02 -05:00 · 2025-09-02 15:22:44 +03:00
parent 74040663bf
commit 897254ad6c
2 changed files with 18 additions and 8 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -688,6 +688,10 @@ jobs:
      run: DEBUG=2 AMD=1 python -m pytest -rA test/test_tiny.py
    - name: Test DISK copy time
      run: AMD=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
+    - name: Test CPU copy time
+      run: |
+        AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit
+        AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
    - name: Run full CIFAR training w 1 GPU
      run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee am_train_cifar_one_gpu.txt
    # TODO: enable
@@ -745,6 +749,10 @@ jobs:
      run: NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
    - name: Test DISK copy time
      run: NV=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
+    - name: Test CPU copy time
+      run: |
+        NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit
+        NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
    - name: Test LLAMA-3
      run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --benchmark --temperature 0 | tee nv_llama3_beam.txt
    - name: Run full CIFAR training w 1 GPU
--- a/test/speed/external_test_copy_speed.py
+++ b/test/speed/external_test_copy_speed.py
@@ -1,9 +1,9 @@
 import unittest, numpy as np
 from tinygrad import Tensor, Device, TinyJit
-from tinygrad.helpers import Timing, CI, OSX
+from tinygrad.helpers import Timing, CI, OSX, getenv
 import multiprocessing.shared_memory as shared_memory

-N = 256
+N = getenv("NSZ", 256)
 class TestCopySpeed(unittest.TestCase):
  @classmethod
  def setUpClass(cls): Device[Device.DEFAULT].synchronize()
@@ -54,22 +54,24 @@ class TestCopySpeed(unittest.TestCase):
    @TinyJit
    def _do_copy(t): return t.to('CPU').realize()

-    t = Tensor.randn(N, N, 4).contiguous().realize()
+    t = Tensor.randn(N, N).contiguous().realize()
+    Device[Device.DEFAULT].synchronize()
    for _ in range(5):
-      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
+      with Timing(f"copy {Device.DEFAULT} -> CPU {t.nbytes()/(1024**2)}M:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        x = _do_copy(t)
        Device[Device.DEFAULT].synchronize()
      np.testing.assert_equal(t.numpy(), x.numpy())

-  def testCopytoCPUtoDefaultJit(self):
+  def testCopyCPUtoDefaultJit(self):
    if Device.DEFAULT == "CPU": return unittest.skip("CPU to CPU copy is a no-op")

    @TinyJit
-    def _do_copy(x): return t.to(Device.DEFAULT).realize()
+    def _do_copy(x): return x.to(Device.DEFAULT).realize()

    for _ in range(5):
-      t = Tensor.randn(N, N, 4, device="CPU").contiguous().realize()
-      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
+      t = Tensor.randn(N, N, device="CPU").contiguous().realize()
+      Device["CPU"].synchronize()
+      with Timing(f"copy CPU -> {Device.DEFAULT} {t.nbytes()/(1024**2)}M:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        x = _do_copy(t)
        Device[Device.DEFAULT].synchronize()
      np.testing.assert_equal(t.numpy(), x.numpy())