From c2a044ed83ae0dd3971263a2f1bbd6e062824a77 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Thu, 4 Jan 2024 13:58:39 -0800 Subject: [PATCH] disk_read_speed example --- extra/disk_read_speed.py | 109 ++++++++++++++++++++++++++++++ extra/hip_gpu_driver/hip_ioctl.py | 3 + tinygrad/helpers.py | 2 +- tinygrad/runtime/ops_hip.py | 4 +- 4 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 extra/disk_read_speed.py diff --git a/extra/disk_read_speed.py b/extra/disk_read_speed.py new file mode 100644 index 0000000000..584fba5a2b --- /dev/null +++ b/extra/disk_read_speed.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +import os, ctypes, ctypes.util, io, mmap +from tinygrad.helpers import Timing, from_mv +libc = ctypes.CDLL(ctypes.util.find_library("c")) + +#from extra.hip_gpu_driver import hip_ioctl + +# sudo su -c "echo 3 > /proc/sys/vm/drop_caches" + +# sudo su -c 'echo 8 > /proc/sys/kernel/printk' +# sudo su -c "echo 'module amdgpu +p' > /sys/kernel/debug/dynamic_debug/control" + +libc.memcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t] + +libc.read.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_size_t] +libc.read.restype = ctypes.c_size_t + +libc.malloc.argtypes = [ctypes.c_size_t] +libc.malloc.restype = ctypes.c_void_p + +def read_direct(fd, sz): + with Timing("mmap: ", lambda x: f", {sz/x:.2f} GB/s"): + buf = mmap.mmap(-1, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) + with Timing("read: ", lambda x: f", {sz/x:.2f} GB/s"): + ret = libc.read(fd, from_mv(buf), sz) + assert ret == sz + +def read_mmap(fd, sz): + with Timing("mmfd: ", lambda x: f", {sz/x:.2f} GB/s"): + buf = mmap.mmap(fd, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) #|MAP_LOCKED) + t = 0 + for i in range(0, sz, 0x1000): t += buf[i] + +def read_to_gpu_mmap(fd, sz, gpubuf): + with Timing("gpu copyin: ", lambda x: f", {sz/x:.2f} GB/s"): + with Timing("mmfd: ", lambda x: f", {sz/x:.2f} GB/s"): + buf = mmap.mmap(fd, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) #|MAP_LOCKED) + dev.allocator._copyin_async(gpubuf, from_mv(buf), sz) + dev.synchronize() + +def read_to_gpu_single(fd, sz, gpubuf): + os.lseek(fd, 0, os.SEEK_SET) + with Timing("total: ", lambda x: f", {sz/x:.2f} GB/s"): + with Timing("gpu host alloc: ", lambda x: f", {sz/x:.2f} GB/s"): + hst = dev.allocator._hostalloc(sz) + with Timing("read to host: ", lambda x: f", {sz/x:.2f} GB/s"): + ret = libc.read(fd, hst, sz) + with Timing("gpu host copy: ", lambda x: f", {sz/x:.2f} GB/s"): + dev.allocator._copyin_async(gpubuf, hst, sz) + dev.synchronize() + +def read_to_gpu_pingpong(fd, sz, gpubuf): + PIECE = 8 + psz = sz//PIECE + print(f"piece size {psz:#x}") + with Timing("gpu host alloc: ", lambda x: f", {sz/x:.2f} GB/s"): + hst1 = dev.allocator._hostalloc(psz) + hst2 = dev.allocator._hostalloc(psz) + + os.lseek(fd, 0, os.SEEK_SET) + with Timing("total: ", lambda x: f", {sz/x:.2f} GB/s"): + for i in range(PIECE//2): + with Timing("tfer(0): ", lambda x: f", {psz/x:.2f} GB/s"): + ret = libc.read(fd, hst1, psz) + dev.synchronize() + dev.allocator._copyin_async(gpubuf, hst1, psz) + with Timing("tfer(1): ", lambda x: f", {psz/x:.2f} GB/s"): + ret = libc.read(fd, hst2, psz) + dev.synchronize() + dev.allocator._copyin_async(gpubuf, hst2, psz) + dev.synchronize() + +MAP_LOCKED = 0x2000 +MAP_HUGETLB = 0x40000 + +from tinygrad.runtime.ops_hip import HIPDevice + +if __name__ == "__main__": + # 4GB of random numbers + fd = os.open("/home/tiny/tinygrad/weights/rng", os.O_RDWR|os.O_DIRECT) + #sz = (os.fstat(fd).st_size) // 4 + #sz = 128*1024*1024 + #sz = 256*1024*1024 + sz = 1024*1024*1024 + print(f"read {sz} from {fd}") + + dev = HIPDevice() + with Timing("gpu alloc: ", lambda x: f", {sz/x:.2f} GB/s"): + gpubuf = dev.allocator._alloc(sz) + # warmup + dev.allocator._copyin_async(gpubuf, from_mv(bytearray(b"\x00\x00\x00\x00"*0x1000)), 0x4000) + print("copying, is warm") + + print("****** read direct") + read_direct(fd, sz) + + print("****** read mmap") + read_mmap(fd, sz) + + print("****** read to gpu pingpong") + read_to_gpu_pingpong(fd, sz, gpubuf) + + print("****** read to gpu single") + read_to_gpu_single(fd, sz, gpubuf) + + print("****** read to gpu mmap") + read_to_gpu_mmap(fd, sz, gpubuf) + + os._exit(0) diff --git a/extra/hip_gpu_driver/hip_ioctl.py b/extra/hip_gpu_driver/hip_ioctl.py index 71fcb92c9f..70365ff30a 100644 --- a/extra/hip_gpu_driver/hip_ioctl.py +++ b/extra/hip_gpu_driver/hip_ioctl.py @@ -63,6 +63,9 @@ def ioctl(fd, request, argp): name, stype = nrs[nr] s = get_struct(argp, stype) print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(s))) + if name == "AMDKFD_IOC_SVM": + out = ctypes.cast(s.attrs, ctypes.POINTER(kfd_ioctl.struct_kfd_ioctl_svm_attribute)) + for i in range(s.nattr): print(f"{i}: {kfd_ioctl.kfd_ioctl_svm_attr_type__enumvalues[out[i].type]:40s}: {out[i].value:#x}") else: print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else "") return ret diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index f8f72670f3..c6aa1854aa 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -88,7 +88,7 @@ class Timing(contextlib.ContextDecorator): def __enter__(self): self.st = time.perf_counter_ns() def __exit__(self, *exc): self.et = time.perf_counter_ns() - self.st - if self.enabled: print(f"{self.prefix}{self.et*1e-6:.2f} ms"+(self.on_exit(self.et) if self.on_exit else "")) + if self.enabled: print(f"{self.prefix}{self.et*1e-6:6.2f} ms"+(self.on_exit(self.et) if self.on_exit else "")) class Profiling(contextlib.ContextDecorator): def __init__(self, enabled=True, sort='cumtime', frac=0.2): self.enabled, self.sort, self.frac = enabled, sort, frac diff --git a/tinygrad/runtime/ops_hip.py b/tinygrad/runtime/ops_hip.py index 1dd0d69e87..93605c2f13 100644 --- a/tinygrad/runtime/ops_hip.py +++ b/tinygrad/runtime/ops_hip.py @@ -48,9 +48,11 @@ class HIPAllocator(LRUAllocator): check(hip.hipSetDevice(self.device.device)) return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size))) def _free(self, opaque:T): check(hip.hipFree(opaque)) + def _hostalloc(self, size:int): return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), size, 0))) + def _copyin_async(self, dest:T, src:T, size:int): check(hip.hipMemcpyAsync(dest, src, size, hip.hipMemcpyHostToDevice, None)) def copyin(self, dest:T, src: memoryview): check(hip.hipSetDevice(self.device.device)) - host_mem = init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipHostMalloc(ctypes.byref(x), len(src), 0))) + host_mem = self._hostalloc(len(src)) self.device.pending_copyin.append(host_mem) ctypes.memmove(host_mem, from_mv(src), len(src)) check(hip.hipMemcpyAsync(dest, host_mem, len(src), hip.hipMemcpyHostToDevice, None))