multidevice works (#763)

* basic multigpu working * better multigpu test * upper * touchups * cl sync
2026-02-09 06:05:11 -05:00 · 2023-05-04 01:04:58 -07:00
parent 4f6d674ec0
commit f28df9900f
6 changed files with 78 additions and 8 deletions
--- a/test/external/external_multi_gpu.py
+++ b/test/external/external_multi_gpu.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+# cd disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
+# LD_PRELOAD=$PWD/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
+import numpy as np
+from tinygrad.tensor import Tensor
+from tinygrad.helpers import colored
+from extra.helpers import Timing
+from tinygrad.runtime.ops_gpu import CL
+
+# TODO: add multidevice support for cuda; until then this script targets 'gpu' only
+device = 'gpu'
+
+if __name__ == "__main__":
+  # elements per tensor; throughput below counts 4 bytes per element, i.e. 1 GB
+  sz = 1024*1024*256
+  #sz = 1024*64
+  nbytes, dev0, dev1 = sz*4, f'{device}:0', f'{device}:1'
+  def rate(moved):
+    # Timing suffix: `moved` bytes over whatever elapsed value Timing passes to on_exit
+    return lambda elapsed: f", {moved/elapsed:.2f} GB/sec"
+
+  # create both inputs on the CPU (two tensors, hence 2x the bytes)
+  with Timing("CPU creation: ", on_exit=rate(nbytes*2)):
+    c0 = Tensor.ones(sz, device="cpu").realize()
+    c1 = (Tensor.ones(sz, device="cpu")/2).realize()
+
+  # host -> device uploads, one tensor per device
+  with Timing("CPU -> 0: ", on_exit=rate(nbytes)):
+    a0 = c0.to(dev0).realize()
+    CL.synchronize()
+  with Timing("CPU -> 1: ", on_exit=rate(nbytes)):
+    b1 = c1.to(dev1).realize()
+    CL.synchronize()
+
+  # cross copy. this is going through the CPU
+  with Timing("0 -> 1: ", on_exit=rate(nbytes)):
+    a1 = a0.to(dev1).realize()
+    CL.synchronize()
+  with Timing("1 -> 0: ", on_exit=rate(nbytes)):
+    b0 = b1.to(dev0).realize()
+    CL.synchronize()
+
+  # same-device sums: both operands already live on the target device
+  with Timing("0 -> 0 (sum): ", on_exit=rate(nbytes)):
+    ab0 = (a0 + b0).realize()
+    CL.synchronize()
+  with Timing("1 -> 1 (sum): ", on_exit=rate(nbytes)):
+    ab1 = (a1 + b1).realize()
+    CL.synchronize()
+
+  # cross device sums, operands on different devices (open question: does this work?)
+  # unclear whether a copy is made first and whether it goes through the CPU;
+  # the slowness comes from the *blocking* clprg call, possibly due to pyopencl
+  with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=rate(nbytes)):
+    abx0 = (a0 + b1).realize()
+    CL.synchronize()
+  with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=rate(nbytes)):
+    abx1 = (b1 + a0).realize()
+    CL.synchronize()
+
+  # show each result tensor; the numeric equality checks below are left disabled
+  for result in (ab0, ab1, abx0, abx1): print(result)
+  #np.testing.assert_allclose(ab0.numpy(), ab1.numpy())
+  #np.testing.assert_allclose(ab0.numpy(), abx0.numpy())
+  #np.testing.assert_allclose(ab0.numpy(), abx1.numpy())
+