mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 15:08:02 -05:00
ci: add dev<->cpu copy speeds (#11959)
This commit is contained in:
8
.github/workflows/benchmark.yml
vendored
8
.github/workflows/benchmark.yml
vendored
@@ -688,6 +688,10 @@ jobs:
|
||||
run: DEBUG=2 AMD=1 python -m pytest -rA test/test_tiny.py
|
||||
- name: Test DISK copy time
|
||||
run: AMD=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
|
||||
- name: Test CPU copy time
|
||||
run: |
|
||||
AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit
|
||||
AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
|
||||
- name: Run full CIFAR training w 1 GPU
|
||||
run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee am_train_cifar_one_gpu.txt
|
||||
# TODO: enable
|
||||
@@ -745,6 +749,10 @@ jobs:
|
||||
run: NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
|
||||
- name: Test DISK copy time
|
||||
run: NV=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
|
||||
- name: Test CPU copy time
|
||||
run: |
|
||||
NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit
|
||||
NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
|
||||
- name: Test LLAMA-3
|
||||
run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --benchmark --temperature 0 | tee nv_llama3_beam.txt
|
||||
- name: Run full CIFAR training w 1 GPU
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
import unittest, numpy as np
|
||||
from tinygrad import Tensor, Device, TinyJit
|
||||
from tinygrad.helpers import Timing, CI, OSX
|
||||
from tinygrad.helpers import Timing, CI, OSX, getenv
|
||||
import multiprocessing.shared_memory as shared_memory
|
||||
|
||||
N = 256
|
||||
N = getenv("NSZ", 256)
|
||||
class TestCopySpeed(unittest.TestCase):
|
||||
@classmethod
def setUpClass(cls):
  """Synchronize the default device once before the tests in this class run."""
  default_device = Device[Device.DEFAULT]
  default_device.synchronize()
|
||||
@@ -54,22 +54,24 @@ class TestCopySpeed(unittest.TestCase):
|
||||
@TinyJit
|
||||
def _do_copy(t): return t.to('CPU').realize()
|
||||
|
||||
t = Tensor.randn(N, N, 4).contiguous().realize()
|
||||
t = Tensor.randn(N, N).contiguous().realize()
|
||||
Device[Device.DEFAULT].synchronize()
|
||||
for _ in range(5):
|
||||
with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
|
||||
with Timing(f"copy {Device.DEFAULT} -> CPU {t.nbytes()/(1024**2)}M: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
|
||||
x = _do_copy(t)
|
||||
Device[Device.DEFAULT].synchronize()
|
||||
np.testing.assert_equal(t.numpy(), x.numpy())
|
||||
|
||||
def testCopyCPUtoDefaultJit(self):
  """Time a JIT-compiled copy from CPU to the default device and verify the data.

  Each of the five iterations creates a fresh (N, N) random tensor on CPU, times
  the jitted transfer to ``Device.DEFAULT`` together with the device synchronize,
  and asserts that the copied values equal the source values.
  """
  # `return unittest.skip(...)` only builds a decorator and returns it, so the run
  # was reported as a pass; skipTest() raises SkipTest and records a real skip.
  if Device.DEFAULT == "CPU": self.skipTest("CPU to CPU copy is a no-op")

  @TinyJit
  def _do_copy(x): return x.to(Device.DEFAULT).realize()

  for _ in range(5):
    t = Tensor.randn(N, N, device="CPU").contiguous().realize()
    # synchronize the CPU device before the timer starts
    # (presumably so tensor creation is not counted in the copy time)
    Device["CPU"].synchronize()
    with Timing(f"copy CPU -> {Device.DEFAULT} {t.nbytes()/(1024**2)}M: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
      x = _do_copy(t)
      # keep the synchronize inside the timed region so the measurement covers the finished copy
      Device[Device.DEFAULT].synchronize()
    np.testing.assert_equal(t.numpy(), x.numpy())
|
||||
|
||||
Reference in New Issue
Block a user