ci: add dev<->cpu copy speeds (#11959)

This commit is contained in:
nimlgen
2025-09-02 15:22:44 +03:00
committed by GitHub
parent 74040663bf
commit 897254ad6c
2 changed files with 18 additions and 8 deletions

View File

@@ -688,6 +688,10 @@ jobs:
run: DEBUG=2 AMD=1 python -m pytest -rA test/test_tiny.py
- name: Test DISK copy time
run: AMD=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
- name: Test CPU copy time
run: |
AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit
AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
- name: Run full CIFAR training w 1 GPU
run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee am_train_cifar_one_gpu.txt
# TODO: enable
@@ -745,6 +749,10 @@ jobs:
run: NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
- name: Test DISK copy time
run: NV=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
- name: Test CPU copy time
run: |
NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit
NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
- name: Test LLAMA-3
run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --benchmark --temperature 0 | tee nv_llama3_beam.txt
- name: Run full CIFAR training w 1 GPU

View File

@@ -1,9 +1,9 @@
import unittest, numpy as np
from tinygrad import Tensor, Device, TinyJit
from tinygrad.helpers import Timing, CI, OSX
from tinygrad.helpers import Timing, CI, OSX, getenv
import multiprocessing.shared_memory as shared_memory
N = 256
N = getenv("NSZ", 256)
class TestCopySpeed(unittest.TestCase):
@classmethod
def setUpClass(cls): Device[Device.DEFAULT].synchronize()
@@ -54,22 +54,24 @@ class TestCopySpeed(unittest.TestCase):
@TinyJit
def _do_copy(t): return t.to('CPU').realize()
t = Tensor.randn(N, N, 4).contiguous().realize()
t = Tensor.randn(N, N).contiguous().realize()
Device[Device.DEFAULT].synchronize()
for _ in range(5):
with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
with Timing(f"copy {Device.DEFAULT} -> CPU {t.nbytes()/(1024**2)}M: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
x = _do_copy(t)
Device[Device.DEFAULT].synchronize()
np.testing.assert_equal(t.numpy(), x.numpy())
def testCopyCPUtoDefaultJit(self):
  """Time a TinyJit-wrapped copy from the "CPU" device to Device.DEFAULT.

  Runs five iterations. Each one builds a fresh (N, N) random tensor on "CPU",
  synchronizes the CPU device so tensor creation stays outside the timed region,
  then times the jitted copy plus a synchronize of the default device. The
  on_exit callback formats bytes-per-nanosecond, labelled GB/s. After each timed
  copy the result is checked element-wise against the source tensor.

  Skipped when Device.DEFAULT is already "CPU".
  """
  # unittest.skip() only constructs a decorator; returning it from a test body
  # ends the test as a pass. skipTest() raises SkipTest, so the runner reports a skip.
  if Device.DEFAULT == "CPU": self.skipTest("CPU to CPU copy is a no-op")
  @TinyJit
  def _do_copy(x): return x.to(Device.DEFAULT).realize()  # copy the argument, not a captured outer tensor
  for _ in range(5):
    t = Tensor.randn(N, N, device="CPU").contiguous().realize()
    Device["CPU"].synchronize()
    with Timing(f"copy CPU -> {Device.DEFAULT} {t.nbytes()/(1024**2)}M: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
      x = _do_copy(t)
      Device[Device.DEFAULT].synchronize()
    np.testing.assert_equal(t.numpy(), x.numpy())