Mirror of https://github.com/tinygrad/tinygrad.git, synced 2026-01-09 15:08:02 -05:00
test external_multi_gpu.py (and works in CUDA)
test/external/external_multi_gpu.py (vendored): 54 lines changed
@@ -3,14 +3,17 @@
 # LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
 import numpy as np
 from tinygrad.tensor import Tensor
-from tinygrad.helpers import colored
-from tinygrad.helpers import Timing
-from tinygrad.runtime.ops_gpu import CL
+from tinygrad.helpers import colored, Timing
+from tinygrad.device import Device

 # TODO: support multidevice in cuda
-device = 'gpu'
+d0, d1 = f'{Device.DEFAULT}:0', f'{Device.DEFAULT}:1'
+
+def sync():
+  Device[d0].synchronize()
+  Device[d1].synchronize()

 if __name__ == "__main__":
   print("GPU devices", d0, d1)
   sz = 1024*1024*256  # 1 GB
   #sz = 1024*64

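The heart of the change is visible in this first hunk: the OpenCL-specific CL.synchronize() and the hard-coded device = 'gpu' give way to device names built from Device.DEFAULT, plus a small sync() helper that waits on both devices through the Device registry, which is what lets the same script run under CUDA. A minimal usage sketch of that pattern (not part of the commit; it assumes the default backend exposes at least two devices):

# Sketch: the backend-agnostic multi-device pattern introduced by this commit.
# Assumes Device.DEFAULT resolves to a backend (e.g. "CUDA" or "GPU") with two devices.
from tinygrad.tensor import Tensor
from tinygrad.device import Device

d0, d1 = f'{Device.DEFAULT}:0', f'{Device.DEFAULT}:1'
t0 = Tensor.ones(4, device=d0).realize()   # allocate and realize on device 0
t1 = t0.to(d1).realize()                   # explicit copy onto device 1
Device[d0].synchronize()                   # per-device sync replaces the old global CL.synchronize()
Device[d1].synchronize()
print(t1.numpy())                          # [1. 1. 1. 1.]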
@@ -19,38 +22,36 @@ if __name__ == "__main__":
   c1 = (Tensor.ones(sz, device="cpu")/2).realize()

   with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
-    a0 = c0.to(f'{device}:0').realize()
-    CL.synchronize()
+    a0 = c0.to(d0).realize()
+    sync()
   with Timing("CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
-    b1 = c1.to(f'{device}:1').realize()
-    CL.synchronize()
+    b1 = c1.to(d1).realize()
+    sync()

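The on_exit lambdas turn the measured time into a bandwidth figure. Assuming Timing hands the elapsed time to on_exit in nanoseconds (as tinygrad's Timing helper does), sz*4 is the number of bytes moved (sz float32 elements at 4 bytes each), and bytes per nanosecond is numerically equal to GB/sec. A worked example with a hypothetical elapsed time:

# Worked example of the bandwidth math in the on_exit lambdas above.
# Assumption: Timing passes the elapsed time x to on_exit in nanoseconds.
sz = 1024*1024*256           # float32 element count from the test (the "1 GB" buffer)
bytes_moved = sz * 4         # float32 is 4 bytes each -> about 1.07e9 bytes
elapsed_ns = 100_000_000     # hypothetical 100 ms transfer
print(f", {bytes_moved/elapsed_ns:.2f} GB/sec")   # bytes/ns == GB/sec, prints ", 10.74 GB/sec"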
-  # cross copy. this is going through the CPU
-  with Timing("0 -> CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
-    a1 = a0.to(f'{device}:1').realize()
-    CL.synchronize()
-  with Timing("1 -> CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
-    b0 = b1.to(f'{device}:0').realize()
-    CL.synchronize()
+  # cross copy. this is (sometimes) going through the CPU
+  with Timing("0 -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
+    a1 = a0.to(d1).realize()
+    sync()
+  with Timing("1 -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
+    b0 = b1.to(d0).realize()
+    sync()

   # sum
   with Timing("0+0 -> 0 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
     ab0 = (a0 + b0).realize()
-    CL.synchronize()
+    sync()
   with Timing("1+1 -> 1 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
     ab1 = (a1 + b1).realize()
-    CL.synchronize()
+    sync()

   # cross device sum (does this work?)
-  # is this making a copy first? is that copy through the CPU?
-  # the slowness comes from the *blocking* clprg call, is this pyopencl?
   with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
-    abx0 = (a0 + b1).realize()
-    CL.synchronize()
+    abx0 = (a0 + b1.to(d0)).realize()
+    sync()

   with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
-    abx1 = (b1 + a0).realize()
-    CL.synchronize()
+    abx1 = (b1 + a0.to(d1)).realize()
+    sync()

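The interesting semantic change is here: the old code summed tensors living on different devices and left the transfer implicit, while the new code moves one operand explicitly with .to() before the add, so the arithmetic always happens on a single device. A small sketch of the new pattern (again not part of the commit, with the size shrunk):

# Sketch: cross-device sum with an explicit operand transfer.
# Assumes two devices of the default backend, as in the test above.
from tinygrad.tensor import Tensor
from tinygrad.device import Device

d0, d1 = f'{Device.DEFAULT}:0', f'{Device.DEFAULT}:1'
a0 = Tensor.ones(1024, device=d0).realize()        # lives on device 0
b1 = (Tensor.ones(1024, device=d1)/2).realize()    # lives on device 1
abx0 = (a0 + b1.to(d0)).realize()                  # copy b1 to device 0 first, then add there
Device[d0].synchronize()
print(abx0.numpy()[:4])                            # [1.5 1.5 1.5 1.5]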
   # copy back
   # NOTE: half of this slowness is caused by allocating memory on the CPU
@@ -63,6 +64,11 @@ if __name__ == "__main__":
print("testing")
|
||||
np.testing.assert_allclose(cc0, cc1)
|
||||
|
||||
# same (cross)
|
||||
print("testing (cross)")
|
||||
np.testing.assert_allclose(cc0, abx0.numpy())
|
||||
np.testing.assert_allclose(cc0, abx1.numpy())
|
||||
|
||||
   # devices
   print(ab0)
   print(ab1)
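For reference on the final checks: c0 is all ones and c1 is all 0.5, so every sum, same-device or cross-device, should be an array of 1.5, and np.testing.assert_allclose with its default tolerances should pass exactly (cc0 and cc1 are presumably the same-device results copied back to the host; their definitions fall outside the hunks shown here). A tiny host-only miniature of that check, with illustrative names:

# Miniature of the final verification, run entirely on the host.
import numpy as np

sz = 1024
cc0 = np.ones(sz, dtype=np.float32) + np.ones(sz, dtype=np.float32)/2   # stand-in for the same-device sum
abx = np.full(sz, 1.5, dtype=np.float32)                                # stand-in for a cross-device sum
np.testing.assert_allclose(cc0, abx)   # passes: 1.0 + 0.5 is exactly 1.5 in float32
print("ok")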