# tinygrad/test/test_copy_speed.py

import unittest, numpy as np
from tinygrad import Tensor, Device, TinyJit
from tinygrad.helpers import Timing, CI, OSX
import multiprocessing.shared_memory as shared_memory

# side length of the tensors copied by the tests below (N x N, or N x N x 4 in the JIT tests)
N = 256
class TestCopySpeed(unittest.TestCase):
  """Timing tests for tensor copies between shared memory, CPU and the default device.

  Each test prints how long a copy takes to queue and/or to complete, plus the
  throughput computed as `t.nbytes()/ns` and labelled GB/s.
  """
  @classmethod
  def setUpClass(cls): Device[Device.DEFAULT].synchronize()

  def testCopySHMtoDefault(self):
    # create a named shared-memory segment sized for N*N 4-byte elements to back the disk tensor
    s = shared_memory.SharedMemory(name="test_X", create=True, size=N*N*4)
    try:
      s.close()
      if CI and not OSX:
        t = Tensor.empty(N, N, device="disk:/dev/shm/test_X").realize()
      else:
        t = Tensor.empty(N, N, device="disk:shm:test_X").realize()
      for _ in range(3):
        # outer timer covers queue + synchronize, inner timer only the queueing of the copy
        with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
          with Timing("queue: "):
            t.to(Device.DEFAULT).realize()
          Device[Device.DEFAULT].synchronize()
    finally:
      # always remove the segment: if it is left behind after a failure,
      # the next run's create=True raises FileExistsError
      s.unlink()

  def testCopyCPUtoDefault(self):
    # the same realized CPU tensor is copied to the default device three times
    t = Tensor.ones(N, N, device="CPU").contiguous().realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        with Timing("queue: "):
          t.to(Device.DEFAULT).realize()
        Device[Device.DEFAULT].synchronize()

  def testCopyCPUtoDefaultFresh(self):
    # like testCopyCPUtoDefault, but a new source tensor is created for every iteration
    print("fresh copy")
    for _ in range(3):
      t = Tensor.ones(N, N, device="CPU").contiguous().realize()
      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): # noqa: F821
        with Timing("queue: "):
          t.to(Device.DEFAULT).realize()
        Device[Device.DEFAULT].synchronize()
      del t

  def testCopyDefaulttoCPU(self):
    # copy from the default device back to CPU; only the total time is reported
    t = Tensor.ones(N, N).contiguous().realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        t.to('CPU').realize()

  def testCopyDefaulttoCPUJit(self):
    # skipTest raises SkipTest so the runner reports a skip; returning unittest.skip(...)
    # only built a decorator and let the test count as passed
    if Device.DEFAULT == "CPU": self.skipTest("CPU to CPU copy is a no-op")

    @TinyJit
    def _do_copy(t): return t.to('CPU').realize()

    t = Tensor.randn(N, N, 4).contiguous().realize()
    for _ in range(5):
      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        x = _do_copy(t)
        Device[Device.DEFAULT].synchronize()
      # the copy must hold the same values as the source
      np.testing.assert_equal(t.numpy(), x.numpy())

  def testCopytoCPUtoDefaultJit(self):
    # skipTest raises SkipTest so the runner reports a skip; returning unittest.skip(...)
    # only built a decorator and let the test count as passed
    if Device.DEFAULT == "CPU": self.skipTest("CPU to CPU copy is a no-op")

    @TinyJit
    def _do_copy(x): return x.to(Device.DEFAULT).realize()  # copy the argument, not the enclosing scope's `t`

    for _ in range(5):
      # a fresh random source tensor every iteration, passed to the jitted copy as its input
      t = Tensor.randn(N, N, 4, device="CPU").contiguous().realize()
      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        x = _do_copy(t)
        Device[Device.DEFAULT].synchronize()
      # the copy must hold the same values as this iteration's source
      np.testing.assert_equal(t.numpy(), x.numpy())

  @unittest.skipIf(CI, "CI doesn't have 6 GPUs")
  @unittest.skipIf(Device.DEFAULT != "GPU", "only test this on GPU")
  def testCopyCPUto6GPUs(self):
    # imported here so the GPU runtime is only loaded when this test actually runs
    from tinygrad.runtime.ops_gpu import CLDevice
    if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs")
    t = Tensor.ones(N, N, device="CPU").contiguous().realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
      # reports the per-copy rate and the rate for all six copies together
      with Timing("sync:  ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s ({t.nbytes()*6/ns:.2f} GB/s total)"):
        with Timing("queue: "):
          for g in range(6):
            t.to(f"gpu:{g}").realize()
        Device["gpu"].synchronize()

# run the test suite when this file is executed directly as a script
if __name__ == "__main__":
  unittest.main()