disk_read_speed example

This commit is contained in:
George Hotz
2024-01-04 13:58:39 -08:00
parent 8a63f26a0f
commit c2a044ed83
4 changed files with 116 additions and 2 deletions

109
extra/disk_read_speed.py Normal file
View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
import os, ctypes, ctypes.util, io, mmap
from tinygrad.helpers import Timing, from_mv
libc = ctypes.CDLL(ctypes.util.find_library("c"))
#from extra.hip_gpu_driver import hip_ioctl
# sudo su -c "echo 3 > /proc/sys/vm/drop_caches"
# sudo su -c 'echo 8 > /proc/sys/kernel/printk'
# sudo su -c "echo 'module amdgpu +p' > /sys/kernel/debug/dynamic_debug/control"
libc.memcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t]
libc.read.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_size_t]
libc.read.restype = ctypes.c_size_t
libc.malloc.argtypes = [ctypes.c_size_t]
libc.malloc.restype = ctypes.c_void_p
def read_direct(fd, sz):
with Timing("mmap: ", lambda x: f", {sz/x:.2f} GB/s"):
buf = mmap.mmap(-1, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE)
with Timing("read: ", lambda x: f", {sz/x:.2f} GB/s"):
ret = libc.read(fd, from_mv(buf), sz)
assert ret == sz
def read_mmap(fd, sz):
with Timing("mmfd: ", lambda x: f", {sz/x:.2f} GB/s"):
buf = mmap.mmap(fd, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) #|MAP_LOCKED)
t = 0
for i in range(0, sz, 0x1000): t += buf[i]
def read_to_gpu_mmap(fd, sz, gpubuf):
with Timing("gpu copyin: ", lambda x: f", {sz/x:.2f} GB/s"):
with Timing("mmfd: ", lambda x: f", {sz/x:.2f} GB/s"):
buf = mmap.mmap(fd, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) #|MAP_LOCKED)
dev.allocator._copyin_async(gpubuf, from_mv(buf), sz)
dev.synchronize()
def read_to_gpu_single(fd, sz, gpubuf):
os.lseek(fd, 0, os.SEEK_SET)
with Timing("total: ", lambda x: f", {sz/x:.2f} GB/s"):
with Timing("gpu host alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
hst = dev.allocator._hostalloc(sz)
with Timing("read to host: ", lambda x: f", {sz/x:.2f} GB/s"):
ret = libc.read(fd, hst, sz)
with Timing("gpu host copy: ", lambda x: f", {sz/x:.2f} GB/s"):
dev.allocator._copyin_async(gpubuf, hst, sz)
dev.synchronize()
def read_to_gpu_pingpong(fd, sz, gpubuf):
PIECE = 8
psz = sz//PIECE
print(f"piece size {psz:#x}")
with Timing("gpu host alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
hst1 = dev.allocator._hostalloc(psz)
hst2 = dev.allocator._hostalloc(psz)
os.lseek(fd, 0, os.SEEK_SET)
with Timing("total: ", lambda x: f", {sz/x:.2f} GB/s"):
for i in range(PIECE//2):
with Timing("tfer(0): ", lambda x: f", {psz/x:.2f} GB/s"):
ret = libc.read(fd, hst1, psz)
dev.synchronize()
dev.allocator._copyin_async(gpubuf, hst1, psz)
with Timing("tfer(1): ", lambda x: f", {psz/x:.2f} GB/s"):
ret = libc.read(fd, hst2, psz)
dev.synchronize()
dev.allocator._copyin_async(gpubuf, hst2, psz)
dev.synchronize()
MAP_LOCKED = 0x2000
MAP_HUGETLB = 0x40000
from tinygrad.runtime.ops_hip import HIPDevice
if __name__ == "__main__":
# 4GB of random numbers
fd = os.open("/home/tiny/tinygrad/weights/rng", os.O_RDWR|os.O_DIRECT)
#sz = (os.fstat(fd).st_size) // 4
#sz = 128*1024*1024
#sz = 256*1024*1024
sz = 1024*1024*1024
print(f"read {sz} from {fd}")
dev = HIPDevice()
with Timing("gpu alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
gpubuf = dev.allocator._alloc(sz)
# warmup
dev.allocator._copyin_async(gpubuf, from_mv(bytearray(b"\x00\x00\x00\x00"*0x1000)), 0x4000)
print("copying, is warm")
print("****** read direct")
read_direct(fd, sz)
print("****** read mmap")
read_mmap(fd, sz)
print("****** read to gpu pingpong")
read_to_gpu_pingpong(fd, sz, gpubuf)
print("****** read to gpu single")
read_to_gpu_single(fd, sz, gpubuf)
print("****** read to gpu mmap")
read_to_gpu_mmap(fd, sz, gpubuf)
os._exit(0)

View File

@@ -63,6 +63,9 @@ def ioctl(fd, request, argp):
name, stype = nrs[nr]
s = get_struct(argp, stype)
print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(s)))
if name == "AMDKFD_IOC_SVM":
out = ctypes.cast(s.attrs, ctypes.POINTER(kfd_ioctl.struct_kfd_ioctl_svm_attribute))
for i in range(s.nattr): print(f"{i}: {kfd_ioctl.kfd_ioctl_svm_attr_type__enumvalues[out[i].type]:40s}: {out[i].value:#x}")
else:
print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else "")
return ret

View File

@@ -88,7 +88,7 @@ class Timing(contextlib.ContextDecorator):
def __enter__(self): self.st = time.perf_counter_ns()
def __exit__(self, *exc):
self.et = time.perf_counter_ns() - self.st
if self.enabled: print(f"{self.prefix}{self.et*1e-6:.2f} ms"+(self.on_exit(self.et) if self.on_exit else ""))
if self.enabled: print(f"{self.prefix}{self.et*1e-6:6.2f} ms"+(self.on_exit(self.et) if self.on_exit else ""))
class Profiling(contextlib.ContextDecorator):
def __init__(self, enabled=True, sort='cumtime', frac=0.2): self.enabled, self.sort, self.frac = enabled, sort, frac

View File

@@ -48,9 +48,11 @@ class HIPAllocator(LRUAllocator):
check(hip.hipSetDevice(self.device.device))
return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
def _free(self, opaque:T): check(hip.hipFree(opaque))
def _hostalloc(self, size:int): return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), size, 0)))
def _copyin_async(self, dest:T, src:T, size:int): check(hip.hipMemcpyAsync(dest, src, size, hip.hipMemcpyHostToDevice, None))
def copyin(self, dest:T, src: memoryview):
check(hip.hipSetDevice(self.device.device))
host_mem = init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), len(src), 0)))
host_mem = self._hostalloc(len(src))
self.device.pending_copyin.append(host_mem)
ctypes.memmove(host_mem, from_mv(src), len(src))
check(hip.hipMemcpyAsync(dest, host_mem, len(src), hip.hipMemcpyHostToDevice, None))