CL class, debugging

George Hotz
2022-06-21 20:16:29 -07:00
parent 0b820f7966
commit 9d06a86f7f
3 changed files with 44 additions and 24 deletions

View File

@@ -77,6 +77,8 @@ class LazyBuffer:
   def unary_op(x, op): return elementwise_op(op, (x,))
   def binary_op(x, op, y:LazyBuffer): return elementwise_op(op, (x,y))
+  @functools.lru_cache(maxsize=None)
   def contiguous_op(x): return x if x.st.contiguous else LazyBuffer(x.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (x,)))
+  @functools.lru_cache(maxsize=None)
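The two @functools.lru_cache decorators added above memoize op construction on the lazy graph: asking for the same op on the same LazyBuffer twice now returns the one cached node instead of building a duplicate. A minimal standalone sketch of the mechanism (Node and Buf are hypothetical stand-ins, not tinygrad's classes; lru_cache keys on the argument's hash, which for a plain object is its identity):

import functools

class Node:
  # stands in for a LazyBuffer wrapping a LazyOp
  def __init__(self, src): self.src = src

@functools.lru_cache(maxsize=None)
def contiguous_op(x):
  return Node(x)  # constructed once per distinct x, then served from the cache

class Buf: pass  # hashable by identity, like LazyBuffer

b = Buf()
assert contiguous_op(b) is contiguous_op(b)  # same graph node both times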

View File

@@ -1,10 +1,10 @@
 from __future__ import annotations
-import pyopencl as cl
-from tinygrad.llops.ops_gpu import GPUBuffer, get_cl_ctx, get_cl_queue, CLProgram, code_for_op
+from tinygrad.llops.ops_gpu import GPUBuffer, CL, CLProgram, code_for_op
 from tinygrad.ops import ProcessingOps
 from tinygrad.helpers import prod, ConvArgs
 from typing import List, Tuple, Optional, Dict
 import numpy as np
+import pyopencl as cl
 import pathlib
 def load(x):
@@ -13,6 +13,15 @@ def load(x):
   return ret
 CONV_SRC = load(pathlib.Path(__file__).parent.parent.parent / 'accel/opencl/conv.cl')
+class ECL(CL):
+  @staticmethod
+  def image(shape):
+    if CL.DEBUG >= 2: print(f"cl: create image({shape})")
+    fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT)
+    # HALF_FLOAT breaks tests
+    #fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT)
+    return cl.Image(CL().cl_ctx, cl.mem_flags.READ_WRITE, fmt, shape=shape)
+
 def get_replacements(prg_src:str, opencl_type:List[str]) -> Dict[str, str]:
   middle_code = []
@@ -46,16 +55,16 @@ class OpenCLBuffer(GPUBuffer):
   def fromCPU(x):
     ret = OpenCLBuffer(x.shape)
     # TODO: this is blocking even though we told it not to
-    cl.enqueue_copy(get_cl_queue(), ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
+    CL.enqueue_copy(ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
     return ret
   @property
   def cl(self):
     if self._buf is None:
       if self.st.contiguous:
-        self._buf = cl.Buffer(get_cl_ctx(), cl.mem_flags.READ_WRITE, 4*roundup(prod(self.shape)))
+        self._buf = CL.malloc(4*roundup(prod(self.shape)))
       if self._image is not None:
-        self._buf = cl.Buffer(get_cl_ctx(), cl.mem_flags.READ_WRITE, 4*roundup(prod(self._image.shape)*4))
+        self._buf = CL.malloc(4*roundup(prod(self._image.shape)*4))
         #print(f"converting {self.shape} back to buffer, image shape is {self._image.shape}")
         CLProgram("from_image", """
           __kernel void from_image(
@@ -78,10 +87,7 @@ class OpenCLBuffer(GPUBuffer):
   def image(self):
     if self._image is None:
       assert self.shape[2] == 4 and len(self.shape) == 3
-      fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT)
-      # HALF_FLOAT breaks tests
-      #fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT)
-      self._image = cl.Image(get_cl_ctx(), cl.mem_flags.READ_WRITE, fmt, shape=(self.shape[1], self.shape[0]))
+      self._image = ECL.image(shape=(self.shape[1], self.shape[0]))
     if self._buf is not None:
       assert prod(self.shape) == prod(self._image.shape)*4
       #print(f"converting {self.shape} to image with shape {self._image.shape}")

View File

@@ -1,4 +1,5 @@
 from __future__ import annotations
+import os
 import functools
 import numpy as np
 import pyopencl as cl
@@ -7,28 +8,40 @@ from tinygrad.helpers import prod, ConvArgs
 from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, MovementOps, ProcessingOps
 from tinygrad.shapetracker import ShapeTracker, View, strides_for_shape
-cl_ctx, cl_queue = None, None
-def get_cl_ctx(): return cl_ctx
-def get_cl_queue(): return cl_queue
-def require_init_gpu():
-  global cl_ctx, cl_queue
-  if cl_ctx is None:
+class CL:
+  DEBUG = int(os.getenv("DEBUGCL", "0"))
+  def __init__(self):
+    if getattr(CL, "cl_queue", None) is not None: return
     devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.GPU)
     if len(devices) == 0: # settle for CPU
       devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.CPU)
-    cl_ctx = cl.Context(devices=devices)
-    cl_queue = cl.CommandQueue(cl_ctx) # this is an in-order command queue
+    CL.cl_ctx = cl.Context(devices=devices)
+    CL.cl_queue = cl.CommandQueue(self.cl_ctx) # this is an in-order command queue
+  @staticmethod
+  def enqueue_copy(a, b, is_blocking=False):
+    if CL.DEBUG: print(f"cl: copy into {type(a)} sz {a.size} block {is_blocking}")
+    cl.enqueue_copy(CL().cl_queue, a, b, is_blocking=is_blocking)
+  @staticmethod
+  def malloc(sz):
+    if CL.DEBUG >= 2: print(f"cl: malloc({sz})")
+    return cl.Buffer(CL().cl_ctx, cl.mem_flags.READ_WRITE, sz)
 @functools.lru_cache(maxsize=None)
 class CLProgram:
   def __init__(self, name, prg, options=tuple(), argdtypes=None):
     self.name = name
-    self.built = cl.Program(cl_ctx, prg).build(options=options)
+    if CL.DEBUG >= 2: print(f"cl: building {self.name:20s} with {options}")
+    self.built = cl.Program(CL().cl_ctx, prg).build(options=options)
     self.clprg = self.built.__getattr__(name)
     if argdtypes is not None: self.clprg.set_scalar_arg_dtypes(argdtypes)
   def __call__(self, *args):
-    #print(f"running {self.name} with {args[0]} count {len(args)-2}")
-    self.clprg(cl_queue, *args)
+    if CL.DEBUG: print(f"cl: running {self.name:20s} with {str(args[0]):15s} {str(args[1]):15s} count {len(args)-2:2d}")
+    self.clprg(CL().cl_queue, *args)
 # **** end CL wrappers ****
 code_for_op = {
   UnaryOps.NOOP: "(A)", UnaryOps.RELU: "max(A, (float)0.)", UnaryOps.EXP: "exp(A)", UnaryOps.LOG: "log(A)", UnaryOps.NEG: "(-(A))", UnaryOps.SIGN: "sign(A)",
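The CL class above replaces the module-level globals with a lazily initialized class-attribute singleton: the first CL() creates the context and queue, every later CL() returns immediately, so CL().cl_queue is cheap to reach from anywhere. A standalone sketch of the pattern (CLLike and its resource attribute are hypothetical names, standing in for CL and its context/queue):

class CLLike:
  def __init__(self):
    if getattr(CLLike, "resource", None) is not None: return  # already initialized
    CLLike.resource = object()  # stands in for cl.Context / cl.CommandQueue

assert CLLike().resource is CLLike().resource  # one shared resource, created once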
@@ -37,14 +50,13 @@ code_for_op = {
 class GPUBuffer:
   def __init__(self, shape, hostbuf:Optional[GPUBuffer]=None):
-    require_init_gpu()
     self.st = ShapeTracker(shape)
     self.shape = self.st.shape
     self._buf = hostbuf._buf if hostbuf is not None else None
   @property
   def cl(self):
-    if self._buf is None: self._buf = cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE, 4*prod(self.shape))
+    if self._buf is None: self._buf = CL.malloc(4*prod(self.shape))
     return self._buf
   def __repr__(self):
@@ -54,12 +66,12 @@ class GPUBuffer:
   def fromCPU(x):
     ret = GPUBuffer(x.shape)
     # TODO: this is blocking even though we told it not to
-    cl.enqueue_copy(cl_queue, ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
+    CL.enqueue_copy(ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
     return ret
   def toCPU(self):
     data = np.empty(self.shape, dtype=np.float32)
-    cl.enqueue_copy(cl_queue, data, self.contiguous_op().cl, is_blocking=True)
+    CL.enqueue_copy(data, self.contiguous_op().cl, is_blocking=True)
     return data
   def contiguous_view(x, name:str) -> str:
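Taken together, the debugging half of this commit gates its tracing on the DEBUGCL environment variable: level 1 logs copies and kernel launches, level 2 additionally logs mallocs, image creation, and program builds. A sketch of how the new wrappers surface, assuming this repo at this commit plus an OpenCL runtime; run it with DEBUGCL=1 set in the environment to see the traces:

import numpy as np
from tinygrad.llops.ops_gpu import GPUBuffer

buf = GPUBuffer.fromCPU(np.ones((4, 4), dtype=np.float32))  # traced via CL.enqueue_copy
print(buf.toCPU())  # blocking CL.enqueue_copy back into a numpy array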