disk_read_speed example

2026-04-29 03:00:14 -04:00 · 2024-01-04 13:58:39 -08:00
parent 8a63f26a0f
commit c2a044ed83
4 changed files with 116 additions and 2 deletions
--- a/extra/disk_read_speed.py
+++ b/extra/disk_read_speed.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+import os, ctypes, ctypes.util, io, mmap
+from tinygrad.helpers import Timing, from_mv
+libc = ctypes.CDLL(ctypes.util.find_library("c"))
+
+#from extra.hip_gpu_driver import hip_ioctl
+
+# sudo su -c "echo 3 > /proc/sys/vm/drop_caches"
+
+# sudo su -c 'echo 8 > /proc/sys/kernel/printk'
+# sudo su -c "echo 'module amdgpu +p' > /sys/kernel/debug/dynamic_debug/control"
+
+libc.memcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t]
+
+libc.read.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_size_t]
+libc.read.restype = ctypes.c_size_t
+
+libc.malloc.argtypes = [ctypes.c_size_t]
+libc.malloc.restype = ctypes.c_void_p
+
+def read_direct(fd, sz):
+  with Timing("mmap: ", lambda x: f", {sz/x:.2f} GB/s"):
+    buf = mmap.mmap(-1, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE)
+  with Timing("read: ", lambda x: f", {sz/x:.2f} GB/s"):
+    ret = libc.read(fd, from_mv(buf), sz)
+  assert ret == sz
+
+def read_mmap(fd, sz):
+  with Timing("mmfd:       ", lambda x: f", {sz/x:.2f} GB/s"):
+    buf = mmap.mmap(fd, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) #|MAP_LOCKED)
+    t = 0
+    for i in range(0, sz, 0x1000): t += buf[i]
+
+def read_to_gpu_mmap(fd, sz, gpubuf):
+  with Timing("gpu copyin: ", lambda x: f", {sz/x:.2f} GB/s"):
+    with Timing("mmfd:       ", lambda x: f", {sz/x:.2f} GB/s"):
+      buf = mmap.mmap(fd, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) #|MAP_LOCKED)
+    dev.allocator._copyin_async(gpubuf, from_mv(buf), sz)
+    dev.synchronize()
+
+def read_to_gpu_single(fd, sz, gpubuf):
+  os.lseek(fd, 0, os.SEEK_SET)
+  with Timing("total: ", lambda x: f", {sz/x:.2f} GB/s"):
+    with Timing("gpu host alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
+      hst = dev.allocator._hostalloc(sz)
+    with Timing("read to host:   ", lambda x: f", {sz/x:.2f} GB/s"):
+      ret = libc.read(fd, hst, sz)
+    with Timing("gpu host copy:  ", lambda x: f", {sz/x:.2f} GB/s"):
+      dev.allocator._copyin_async(gpubuf, hst, sz)
+      dev.synchronize()
+
+def read_to_gpu_pingpong(fd, sz, gpubuf):
+  PIECE = 8
+  psz = sz//PIECE
+  print(f"piece size {psz:#x}")
+  with Timing("gpu host alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
+    hst1 = dev.allocator._hostalloc(psz)
+    hst2 = dev.allocator._hostalloc(psz)
+
+  os.lseek(fd, 0, os.SEEK_SET)
+  with Timing("total: ", lambda x: f", {sz/x:.2f} GB/s"):
+    for i in range(PIECE//2):
+      with Timing("tfer(0):           ", lambda x: f", {psz/x:.2f} GB/s"):
+        ret = libc.read(fd, hst1, psz)
+        dev.synchronize()
+        dev.allocator._copyin_async(gpubuf, hst1, psz)
+      with Timing("tfer(1):           ", lambda x: f", {psz/x:.2f} GB/s"):
+        ret = libc.read(fd, hst2, psz)
+        dev.synchronize()
+        dev.allocator._copyin_async(gpubuf, hst2, psz)
+    dev.synchronize()
+
+MAP_LOCKED = 0x2000
+MAP_HUGETLB = 0x40000
+
+from tinygrad.runtime.ops_hip import HIPDevice
+
+if __name__ == "__main__":
+  # 4GB of random numbers
+  fd = os.open("/home/tiny/tinygrad/weights/rng", os.O_RDWR|os.O_DIRECT)
+  #sz = (os.fstat(fd).st_size) // 4
+  #sz = 128*1024*1024
+  #sz = 256*1024*1024
+  sz = 1024*1024*1024
+  print(f"read {sz} from {fd}")
+
+  dev = HIPDevice()
+  with Timing("gpu alloc:  ", lambda x: f", {sz/x:.2f} GB/s"):
+    gpubuf = dev.allocator._alloc(sz)
+  # warmup
+  dev.allocator._copyin_async(gpubuf, from_mv(bytearray(b"\x00\x00\x00\x00"*0x1000)), 0x4000)
+  print("copying, is warm")
+
+  print("****** read direct")
+  read_direct(fd, sz)
+
+  print("****** read mmap")
+  read_mmap(fd, sz)
+
+  print("****** read to gpu pingpong")
+  read_to_gpu_pingpong(fd, sz, gpubuf)
+
+  print("****** read to gpu single")
+  read_to_gpu_single(fd, sz, gpubuf)
+
+  print("****** read to gpu mmap")
+  read_to_gpu_mmap(fd, sz, gpubuf)
+
+  os._exit(0)
--- a/extra/hip_gpu_driver/hip_ioctl.py
+++ b/extra/hip_gpu_driver/hip_ioctl.py
@@ -63,6 +63,9 @@ def ioctl(fd, request, argp):
    name, stype = nrs[nr]
    s = get_struct(argp, stype)
    print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(s)))
+    if name == "AMDKFD_IOC_SVM":
+      out = ctypes.cast(s.attrs, ctypes.POINTER(kfd_ioctl.struct_kfd_ioctl_svm_attribute))
+      for i in range(s.nattr): print(f"{i}: {kfd_ioctl.kfd_ioctl_svm_attr_type__enumvalues[out[i].type]:40s}: {out[i].value:#x}")
  else:
    print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else "")
  return ret
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -88,7 +88,7 @@ class Timing(contextlib.ContextDecorator):
  def __enter__(self): self.st = time.perf_counter_ns()
  def __exit__(self, *exc):
    self.et = time.perf_counter_ns() - self.st
-    if self.enabled: print(f"{self.prefix}{self.et*1e-6:.2f} ms"+(self.on_exit(self.et) if self.on_exit else ""))
+    if self.enabled: print(f"{self.prefix}{self.et*1e-6:6.2f} ms"+(self.on_exit(self.et) if self.on_exit else ""))

 class Profiling(contextlib.ContextDecorator):
  def __init__(self, enabled=True, sort='cumtime', frac=0.2): self.enabled, self.sort, self.frac = enabled, sort, frac
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -48,9 +48,11 @@ class HIPAllocator(LRUAllocator):
    check(hip.hipSetDevice(self.device.device))
    return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
  def _free(self, opaque:T): check(hip.hipFree(opaque))
+  def _hostalloc(self, size:int): return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), size, 0)))
+  def _copyin_async(self, dest:T, src:T, size:int): check(hip.hipMemcpyAsync(dest, src, size, hip.hipMemcpyHostToDevice, None))
  def copyin(self, dest:T, src: memoryview):
    check(hip.hipSetDevice(self.device.device))
-    host_mem = init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), len(src), 0)))
+    host_mem = self._hostalloc(len(src))
    self.device.pending_copyin.append(host_mem)
    ctypes.memmove(host_mem, from_mv(src), len(src))
    check(hip.hipMemcpyAsync(dest, host_mem, len(src), hip.hipMemcpyHostToDevice, None))