diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index cdc78974b2..f1efa121af 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -347,7 +347,7 @@ class AMDAllocator(LRUAllocator): def _alloc(self, size:int, options:BufferOptions): try: if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True) - else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access) + return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access) except OSError as e: if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory") from e else: raise diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py index 329e7c2025..2144cccedb 100644 --- a/tinygrad/runtime/ops_cuda.py +++ b/tinygrad/runtime/ops_cuda.py @@ -119,7 +119,7 @@ class CUDAAllocator(LRUAllocator): def _alloc(self, size, options:BufferOptions): check(cuda.cuCtxSetCurrent(self.device.context)) if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01))) - else: return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size))) + return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size))) def _free(self, opaque, options:BufferOptions): if options.host: return check(cuda.cuMemFreeHost(opaque)) else: check(cuda.cuMemFree_v2(opaque)) diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py index 25a4d8b4f2..0a6a9b5c0d 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -66,7 +66,7 @@ class CLAllocator(LRUAllocator): return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE, cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]), options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status) - else: return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status) + return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status) def _free(self, buf:ctypes._CData, options:BufferOptions): check(cl.clReleaseMemObject(buf)) def copyin(self, dest:ctypes._CData, src:memoryview): check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None)) diff --git a/tinygrad/runtime/ops_hsa.py b/tinygrad/runtime/ops_hsa.py index 8ddc7633ad..657f72023e 100644 --- a/tinygrad/runtime/ops_hsa.py +++ b/tinygrad/runtime/ops_hsa.py @@ -109,11 +109,10 @@ class HSAAllocator(LRUAllocator): check(hsa.hsa_amd_memory_pool_allocate(HSADevice.cpu_mempool, size, 0, ctypes.byref(mem := ctypes.c_void_p()))) check(hsa.hsa_amd_agents_allow_access(2, (hsa.hsa_agent_t*2)(HSADevice.cpu_agent, self.device.agent), None, mem)) return mem.value - else: - c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]) - check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p()))) - check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf)) - return buf.value + c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]) + check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p()))) + check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf)) + return buf.value def _free(self, opaque:T, options:BufferOptions): HSADevice.synchronize_system() diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index b749199778..98e527aa59 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -20,12 +20,11 @@ class MetalCompiler(Compiler): # NOTE: if you run llvm-dis on "air" you can see the llvm bytecode air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=src.encode('utf-8')) return subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air) - else: - options = Metal.MTLCompileOptions.new() - options.setFastMathEnabled_(getenv("METAL_FAST_MATH")) - try: library = unwrap2(self.device.device.newLibraryWithSource_options_error_(src, options, None)) - except AssertionError as e: raise CompileError(e) - return library.libraryDataContents().bytes().tobytes() + options = Metal.MTLCompileOptions.new() + options.setFastMathEnabled_(getenv("METAL_FAST_MATH")) + try: library = unwrap2(self.device.device.newLibraryWithSource_options_error_(src, options, None)) + except AssertionError as e: raise CompileError(e) + return library.libraryDataContents().bytes().tobytes() class MetalProgram: def __init__(self, device:MetalDevice, name:str, lib:bytes): diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 9fc3554cb5..dd107ccd98 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -331,7 +331,7 @@ class NVAllocator(LRUAllocator): def _alloc(self, size:int, options:BufferOptions): if options.host: return self.device._gpu_host_alloc(size) - else: return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20))) + return self.device._gpu_alloc(size, map_to_cpu=options.cpu_access, huge_page=(size > (16 << 20))) def _free(self, gpumem, options:BufferOptions): NVDevice.synchronize_system() diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py index 3d42a13e91..05ada20bce 100644 --- a/tinygrad/runtime/ops_python.py +++ b/tinygrad/runtime/ops_python.py @@ -17,7 +17,7 @@ def _load(m, i): def load(inp, j=0): if len(inp) == 4: return [_load(m, x+j) if gate else default for m,x,gate,default in zip(*inp)] - else: return [_load(m, x+j) for m,x in zip(inp[0], inp[1])] + return [_load(m, x+j) for m,x in zip(inp[0], inp[1])] def _store(m, i, v): if i < 0 or i >= len(m): raise IndexError(f"store out of bounds, size is {len(m)}, access is {i}, value is {v}")