add Buffer abstraction for each device

George Hotz
2021-11-17 12:46:07 -08:00
parent c162e748f5
commit e8ca3ad053
4 changed files with 98 additions and 100 deletions

tinygrad/ops_cpu.py

@@ -1,6 +1,27 @@
 import numpy as np
 from .tensor import Function
+
+class CPUBuffer(np.ndarray):
+  def log(x):
+    return np.log(x)
+  def exp(x):
+    return np.exp(x)
+  def relu(x):
+    return np.maximum(x, 0)
+  def expand(x, shp):
+    return np.broadcast_to(x, shp)
+  def amax(x, *args, **kwargs):
+    return np.amax(x, *args, **kwargs)
+  def permute(x, order):
+    return x.transpose(order)
+  def type(x, tt):
+    return x.astype(tt)
+  def toCPU(x):
+    return x
+  @staticmethod
+  def fromCPU(x):
+    return x
 
 # ************* unary ops *************
 class ReLU(Function):
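
The point of subclassing np.ndarray here is that no constructor is needed: any existing array can be reinterpreted in place with .view(), which is exactly how the new _move_data in tensor.py wraps raw numpy data. A minimal standalone sketch of the pattern (trimmed to one op):

import numpy as np

class CPUBuffer(np.ndarray):
  def relu(x):
    return np.maximum(x, 0)

a = np.array([-1.0, 2.0], dtype=np.float32).view(CPUBuffer)  # zero-copy wrap
print(a.relu())  # [0. 2.] -- the ops now hang off the buffer itself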

tinygrad/ops_gpu.py

@@ -1,13 +1,44 @@
 import functools
 import pyopencl as cl
 import numpy as np
-from .tensor import Function, GPUBuffer
+from .tensor import Function
+
+cl_ctx, cl_queue = None, None
+def require_init_gpu():
+  global cl_ctx, cl_queue
+  if cl_queue is None:
+    devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.GPU)
+    if len(devices) == 0:
+      devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.CPU)
+    cl_ctx = cl.Context(devices=devices)
+    # this is an in-order command queue
+    cl_queue = cl.CommandQueue(cl_ctx)
+require_init_gpu()
+
+class GPUBuffer:
+  def __init__(self, shape, hostbuf=None):
+    self.shape, self.dtype = tuple(shape), np.float32
+    self.cl = hostbuf.cl if isinstance(hostbuf, GPUBuffer) else \
+      cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0), 4*np.prod(shape),
+                hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None)
+  def __repr__(self):
+    return f"<GPUBuffer with shape {self.shape!r}>"
+  @staticmethod
+  def fromCPU(x):
+    return GPUBuffer(x.shape, x.view(np.ndarray))
+  def toCPU(self):
+    data = np.empty(self.shape, dtype=np.float32)
+    cl.enqueue_copy(cl_queue, data, self.cl, is_blocking=True)
+    return data
 
 def buffer_new(ctx, shape, zero=False):
   return GPUBuffer(shape, hostbuf=None if not zero else np.zeros(shape, dtype=np.float32))
 
 def buffer_np(ctx, x):
-  return cl.Buffer(ctx.cl_ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x)
+  return cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x)
 
 @functools.lru_cache
 def clbuild(cl_ctx, name, prg):
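
With the class now owning its cl.Buffer, a host array can round-trip through the device without any Tensor machinery. A sketch, assuming pyopencl is installed and an OpenCL platform is available:

import numpy as np
from tinygrad.ops_gpu import GPUBuffer  # importing the module runs require_init_gpu()

x = np.arange(6, dtype=np.float32).reshape(2, 3)
buf = GPUBuffer.fromCPU(x)       # copies host data into a cl.Buffer at construction
print(buf)                       # <GPUBuffer with shape (2, 3)>
assert (buf.toCPU() == x).all()  # blocking enqueue_copy back to the host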
@@ -21,13 +52,13 @@ i32 = np.int32
 def unary_op(ctx, code, x):
   ret = buffer_new(ctx, x.shape)
-  unop = clbuild(ctx.cl_ctx, "unop", """
+  unop = clbuild(cl_ctx, "unop", """
   __kernel void unop(__global const float *a_g, __global float *res_g) {
     int gid = get_global_id(0);
     float a = a_g[gid];
     res_g[gid] = """+code+""";
   }""")
-  unop(ctx.cl_queue, [np.prod(ret.shape)], None, x.cl, ret.cl)
+  unop(cl_queue, [np.prod(ret.shape)], None, x.cl, ret.cl)
   return ret
 
 class ReLU(Function):
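
The body of clbuild falls outside this hunk, but the pattern is compile-and-memoize: functools.lru_cache keys on (cl_ctx, name, prg), so each distinct kernel source is built once and every later call with the same code string reuses the compiled kernel. Roughly (a sketch, not necessarily the exact body):

import functools
import pyopencl as cl

@functools.lru_cache
def clbuild(cl_ctx, name, prg):
  # compile once per unique (context, name, source) triple; cached thereafter
  return cl.Program(cl_ctx, prg).build().__getattr__(name)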
@@ -72,7 +103,7 @@ def reduce_op(ctx, code, code2, inp, axis=None, start="0.0"):
     ret.shape = (1,)
   # TODO: this is insanely slow
-  reduce = clbuild(ctx.cl_ctx, "reduce", """
+  reduce = clbuild(cl_ctx, "reduce", """
   __kernel void reduce(__global const float *a_g, int sz, __global float *res_g, int prod, int n_dims,
                        __global const int *shape_x, __global const int *shape_ret) {
     int gid = get_global_id(0);
@@ -97,7 +128,7 @@ def reduce_op(ctx, code, code2, inp, axis=None, start="0.0"):
     }
     res_g[gid] = """+code2+""";
   }""")
-  reduce(ctx.cl_queue, [np.prod(osize)], None, inp.cl,
+  reduce(cl_queue, [np.prod(osize)], None, inp.cl,
     i32(np.prod(inp.shape)//np.prod(osize)), ret.cl,
     i32(np.prod(osize)), i32(len(osize)),
     buffer_np(ctx, np.array(inp.shape, dtype=np.int32)),
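
For orientation: each work-item of the reduce kernel owns one output element and folds sz = inp.size // out.size inputs into it with `code`, then finishes with `code2`. A host-side reference for a sum over axis 1, in plain numpy:

import numpy as np

inp = np.arange(6, dtype=np.float32).reshape(2, 3)
osize = (2, 1)                         # output shape after reducing axis 1
sz = inp.size // int(np.prod(osize))   # inputs folded per work-item: 3
print(inp.sum(axis=1, keepdims=True))  # [[ 3.] [12.]]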
@@ -174,10 +205,10 @@ def binary_op(ctx, code, x, y):
   for i in range(n_dims): # group together any adjacent dimensions that we can to simplify broadcasting
     push(i32(max(shape_x[i], shape_y[i])), (shape_x[i] > 1, shape_y[i] > 1))
-  prg = get_binop_prg(ctx.cl_ctx, code, tuple(complist))
+  prg = get_binop_prg(cl_ctx, code, tuple(complist))
   ret = buffer_new(ctx, shape_ret, zero=True)
   prod_list = np.array(dimlist, dtype=i32)[-1::-1].cumprod(dtype=i32)[-1::-1] # take cumprod from back to front
-  prg.binop(ctx.cl_queue, [prod_list[0]] if len(dimlist) > 0 else [1], None, x.cl, y.cl, ret.cl, *dimlist, *(prod_list[1:]))
+  prg.binop(cl_queue, [prod_list[0]] if len(dimlist) > 0 else [1], None, x.cl, y.cl, ret.cl, *dimlist, *(prod_list[1:]))
   return ret
 
 def unbroadcast(ctx, out, in_sh):
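
The reversed-cumprod-reversed idiom computes suffix products, i.e. the row-major stride in elements of each grouped dimension. Worth seeing once with numbers:

import numpy as np

dimlist = np.array([2, 3, 4], dtype=np.int32)
prod_list = dimlist[-1::-1].cumprod(dtype=np.int32)[-1::-1]
print(prod_list)  # [24 12  4]
# prod_list[0] = 24 is the total element count, used as the global work size;
# prod_list[1:] = [12, 4] are the strides binop uses to decompose a
# work-item's global id back into per-dimension indices.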
@@ -245,7 +276,7 @@ class Reshape(Function):
 def perm_axis(ctx, inp, order):
   osize = np.array(inp.shape)[list(order)]
   ret = buffer_new(ctx, osize)
-  perm = clbuild(ctx.cl_ctx, "perm", """
+  perm = clbuild(cl_ctx, "perm", """
   __kernel void perm(__global const float *a_g, __global float *res_g, int n_axis,
                      __global const int *shape, __global const int *order) {
     int gid = get_global_id(0);
@@ -259,7 +290,7 @@ def perm_axis(ctx, inp, order):
     }
     res_g[gid] = a_g[idx];
   }""")
-  perm(ctx.cl_queue, [np.prod(osize)], None, inp.cl, ret.cl, i32(len(osize)),
+  perm(cl_queue, [np.prod(osize)], None, inp.cl, ret.cl, i32(len(osize)),
     buffer_np(ctx, np.array(inp.shape, dtype=np.int32)),
     buffer_np(ctx, np.array(order, dtype=np.int32)))
   return ret
@@ -277,7 +308,7 @@ def inner_slice(ctx, x, arg):
   shift = [y[0] for y in arg]
   oshape = [y[1]-y[0] for y in arg]
   ret = buffer_new(ctx, oshape)
-  gslice = clbuild(ctx.cl_ctx, "gslice", """
+  gslice = clbuild(cl_ctx, "gslice", """
   __kernel void gslice(__global const float *input, __global float *output, int prod, int n_dims,
                        __global const int *shape_x, __global const int *shape_ret,
                        __global const int *shift) {
@@ -292,7 +323,7 @@ def inner_slice(ctx, x, arg):
     }
     output[gid] = zero ? input[iptr] : 0.0;
   }""")
-  gslice(ctx.cl_queue, [np.prod(ret.shape)], None,
+  gslice(cl_queue, [np.prod(ret.shape)], None,
     x.cl, ret.cl, i32(np.prod(ret.shape)), i32(len(ret.shape)),
     buffer_np(ctx, np.array(x.shape, dtype=np.int32)),
     buffer_np(ctx, np.array(ret.shape, dtype=np.int32)),
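
inner_slice generalizes slicing to out-of-range bounds: positions inside the source are copied, positions outside come back as 0.0 (despite its name, the kernel's `zero` flag is true when the index is in range). A host-side reference implementation to check against, in plain numpy:

import numpy as np

def ref_inner_slice(x, arg):
  # pad enough to make every requested (start, end) window in-range, then slice
  padding = [(max(0, -s), max(0, e - sh)) for (s, e), sh in zip(arg, x.shape)]
  xp = np.pad(x, padding)
  return xp[tuple(slice(s + p[0], e + p[0]) for (s, e), p in zip(arg, padding))]

x = np.ones((2, 2), dtype=np.float32)
print(ref_inner_slice(x, [(-1, 3), (0, 2)]))  # (4, 2): zero rows above and below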
@@ -318,7 +349,7 @@ class Matmul(Function):
     isize, msize, osize = i32(input.shape[-2]), i32(input.shape[-1]), i32(weight.shape[-1])
     ret = buffer_new(ctx, list(input.shape[0:-2])+[isize, osize])
-    matmul = clbuild(ctx.cl_ctx, "matmul", """
+    matmul = clbuild(cl_ctx, "matmul", """
     __kernel void matmul(
       __global const float *input, __global const float *weight, __global float *res,
       int isize, int is0, int is1, int msize, int ws0, int ws1, int osize
@@ -339,7 +370,7 @@ class Matmul(Function):
     ctx.save_for_backward(input, weight, matmul, cnt)
     # (isize,msize) x (msize,osize) = (isize,osize)
-    matmul(ctx.cl_queue, [isize, osize, cnt], None,
+    matmul(cl_queue, [isize, osize, cnt], None,
       input.cl, weight.cl, ret.cl, isize,
       msize, i32(1), msize, i32(1), osize, osize)
     return ret
@@ -352,12 +383,12 @@ class Matmul(Function):
     grad_weight = buffer_new(ctx, weight.shape)
     # (isize,osize) x (msize,osize) = (isize,msize)
-    matmul(ctx.cl_queue, [isize, msize, cnt], None,
+    matmul(cl_queue, [isize, msize, cnt], None,
       grad_output.cl, weight.cl, grad_input.cl, isize,
       osize, i32(1), osize, osize, i32(1), msize)
     # (isize,msize) x (isize,osize) = (msize,osize)
-    matmul(ctx.cl_queue, [msize, osize, cnt], None,
+    matmul(cl_queue, [msize, osize, cnt], None,
       input.cl, grad_output.cl, grad_weight.cl, msize,
       i32(1), msize, isize, i32(1), osize, osize)
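
One kernel serves the forward pass and both gradients because transposition is encoded entirely in the stride arguments; no transposed copy is ever materialized. A numpy model of the indexing (argument names mirror the kernel's):

import numpy as np

def matmul_strided(a, b, isize, is0, is1, msize, ws0, ws1, osize):
  # res[X,Y] = sum_x a[X*is0 + x*is1] * b[Y*ws0 + x*ws1], as in the kernel
  a, b = a.ravel(), b.ravel()
  res = np.zeros((isize, osize), dtype=np.float32)
  for X in range(isize):
    for Y in range(osize):
      res[X, Y] = sum(a[X*is0 + x*is1] * b[Y*ws0 + x*ws1] for x in range(msize))
  return res

inp = np.random.rand(2, 3).astype(np.float32)
w = np.random.rand(3, 4).astype(np.float32)
go = np.random.rand(2, 4).astype(np.float32)
# forward strides read both operands row-major: inp @ w
assert np.allclose(matmul_strided(inp, w, 2, 3, 1, 3, 1, 4, 4), inp @ w, atol=1e-5)
# grad_input strides walk w transposed: go @ w.T
assert np.allclose(matmul_strided(go, w, 2, 4, 1, 4, 4, 1, 3), go @ w.T, atol=1e-5)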
@@ -383,7 +414,7 @@ class Conv2D(Function):
     # weight = (groups, rcout, cin, H, W)
     # output = (bs, groups, rcout, oy, ox)
-    conv = clbuild(ctx.cl_ctx, "conv", """
+    conv = clbuild(cl_ctx, "conv", """
     __kernel void conv(__global const float *input, __global const float *weight, __global float *output,
       int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs) {
@@ -408,7 +439,7 @@ class Conv2D(Function):
       output[B*groups*rcout*oy*ox + g*rcout*oy*ox + c*oy*ox + Y*ox + X] = acc;
     }""")
-    conv(ctx.cl_queue, [bs*groups*rcout, oy, ox], None,
+    conv(cl_queue, [bs*groups*rcout, oy, ox], None,
       x.cl, w.cl, ret.cl,
       i32(H), i32(W), i32(groups), i32(rcout), i32(cin),
       i32(oy), i32(ox), i32(iy), i32(ix), i32(ys), i32(xs)
@@ -433,7 +464,7 @@ class Conv2D(Function):
     # tensw = (groups*rcout, cin, H, W)
     # ggg = (bs, groups*rout, oy, ox)
-    convw = clbuild(ctx.cl_ctx, "convw", """
+    convw = clbuild(cl_ctx, "convw", """
     __kernel void convw(__global const float *tensx, __global const float *ggg, __global float *dw,
       int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs) {
@@ -454,7 +485,7 @@ class Conv2D(Function):
       }
       dw[get_global_id(0)*H*W + y*W + x] = acc;
     }""")
-    convx = clbuild(ctx.cl_ctx, "convx", """
+    convx = clbuild(cl_ctx, "convx", """
     __kernel void convx(__global const float *tensw, __global const float *ggg, __global float *dx,
       int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs) {
@@ -480,6 +511,6 @@ class Conv2D(Function):
""")
conv_args = i32(H), i32(W), i32(ctx.groups), i32(rcout), i32(cin), i32(oy), i32(ox), i32(iy), i32(ix), i32(ys), i32(xs), i32(bs)
convw(ctx.cl_queue, [ctx.groups*rcout*cin, H, W], None, x.cl, grad_output.cl, dw.cl, *conv_args)
convx(ctx.cl_queue, [bs, ctx.groups, cin], None, w.cl, grad_output.cl, dx.cl, *conv_args)
convw(cl_queue, [ctx.groups*rcout*cin, H, W], None, x.cl, grad_output.cl, dw.cl, *conv_args)
convx(cl_queue, [bs, ctx.groups, cin], None, w.cl, grad_output.cl, dx.cl, *conv_args)
return dx, dw

tinygrad/ops_torch.py

@@ -2,6 +2,13 @@ import torch
 import numpy as np
 from .tensor import Function
+
+class TorchBuffer(torch.Tensor):
+  @staticmethod
+  def fromCPU(data):
+    return TorchBuffer(torch.from_numpy(data).requires_grad_(False))
+  def toCPU(x):
+    return x.numpy()
 
 # ************* unary+binary+reduce ops *************
 from tinygrad.ops_cpu import ReLU, Log, Exp, Add, Sub, Mul, Pow, Sum, Max
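
TorchBuffer can reuse the CPU op classes wholesale because torch.Tensor already exposes the numpy-flavored methods they call; the subclass only adds the two host-transfer endpoints. A quick sketch, assuming torch is installed:

import numpy as np
from tinygrad.ops_torch import TorchBuffer

t = TorchBuffer.fromCPU(np.array([-1.0, 2.0], dtype=np.float32))
print(t.relu())   # torch.Tensor provides relu() itself
print(t.toCPU())  # [-1.  2.] back as a plain np.ndarray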

tinygrad/tensor.py

@@ -24,40 +24,17 @@ class ProfileOp:
     return self
   def __exit__(self, *junk):
     if DEBUG:
-      if cl_queue is not None:
-        cl_queue.finish()
+      # TODO: fix this
+      #if cl_queue is not None:
+      #  cl_queue.finish()
       et = (time.time()-self.st)*1000.
       debug_counts[self.name] += 1
       debug_times[self.name] += et
       print(f"{self.name:>20} : {et:>7.2f} ms {str([y.shape for y in self.x]):>40} {'-> '+str(self.output.shape) if self.output is not None else ''}")
 
-# **** GPU functions ****
-
-cl_ctx, cl_queue = None, None
-def require_init_gpu():
-  if not GPU: raise Exception("No GPU Support, install pyopencl")
-  global cl_ctx, cl_queue
-  if cl_queue is None:
-    devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.GPU)
-    if len(devices) == 0:
-      devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.CPU)
-    cl_ctx = cl.Context(devices=devices)
-    # this is an in-order command queue
-    cl_queue = cl.CommandQueue(cl_ctx)
-
-class GPUBuffer:
-  def __init__(self, shape, hostbuf=None):
-    self.shape, self.dtype = tuple(shape), np.float32
-    self.cl = hostbuf.cl if isinstance(hostbuf, GPUBuffer) else \
-      cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0), 4*np.prod(shape),
-                hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None)
-  def __repr__(self):
-    return f"<GPUBuffer with shape {self.shape!r}>"
-
 # **** start with two base classes, Tensor and Function ****
 
-class Device: CPU, GPU, TORCH = 0, 1, 2
+class Device: CPU, GPU, TORCH, buffers = 0, 1, 2, {}
 DEFAULT_DEVICE = (Device.CPU if os.environ.get("GPU", 0) != "1" else Device.GPU) if os.environ.get("TORCH", 0) != "1" else Device.TORCH
@@ -148,58 +125,22 @@ class Tensor:
         gt = Tensor(g, device=self.device, requires_grad=False)
         t.grad = gt if t.grad is None else (t.grad + gt)
 
-  # ***** tinygrad supports CPU and GPU *****
+  # ***** tinygrad supports many devices *****
 
   @staticmethod
   def _move_data(data, device):
-    if isinstance(data, GPUBuffer):
-      if device == Device.GPU: return data
-      old = data
-      data = np.empty(old.shape, dtype=np.float32)
-      with ProfileOp("toCPU", [data]):
-        cl.enqueue_copy(cl_queue, data, old.cl, is_blocking=True)
-
     if str(type(data)).startswith("torch"):
       data = data.numpy()
     if not isinstance(data, np.ndarray):
       data = np.array(data, dtype=np.float32)
+    if isinstance(data, np.ndarray):
+      data = data.view(Device.buffers[Device.CPU])
+    if isinstance(data, Device.buffers[device]):
+      return data
 
     if data.dtype != np.float32 and not Tensor.did_float_warning:
       # warning? float64 is actually needed for numerical jacobian
       print(f"warning, {data.shape!r} isn't float32, it's {data.dtype}")
       Tensor.did_float_warning = True
 
-    if device == Device.CPU:
-      # add these functions to ndarray
-      class CPUBuffer(np.ndarray):
-        def log(x):
-          return np.log(x)
-        def exp(x):
-          return np.exp(x)
-        def relu(x):
-          return np.maximum(x, 0)
-        def expand(x, shp):
-          return np.broadcast_to(x, shp)
-        def amax(x, *args, **kwargs):
-          return np.amax(x, *args, **kwargs)
-        def permute(x, order):
-          return x.transpose(order)
-        def type(x, tt):
-          return x.astype(tt)
-      data = data.view(CPUBuffer)
-
-    if device == Device.GPU:
-      require_init_gpu()
-      with ProfileOp("toGPU", [data]):
-        return GPUBuffer(data.shape, data)
-
-    if device == Device.TORCH:
-      import torch
-      with ProfileOp("toTORCH", [data]):
-        return torch.from_numpy(data).requires_grad_(False)
-
-    return data
+    data = data.toCPU().view(Device.buffers[Device.CPU])
+    return Device.buffers[device].fromCPU(data)
 
   def to_(self, device):
     self.data, self.device = self._move_data(self.data, device), device
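
This is the payoff of the buffer abstraction: _move_data no longer knows anything device-specific. Every buffer class satisfies a two-method contract, a static fromCPU(np.ndarray) plus an instance toCPU() returning an ndarray, so any-to-any device moves are two hops through host memory. For example (assuming torch is installed so the TORCH backend registered):

from tinygrad.tensor import Tensor, Device

t = Tensor([[1.0, 2.0]])      # data held as a CPUBuffer view
t.to_(Device.TORCH)           # old buffer's toCPU(), then TorchBuffer.fromCPU()
print(type(t.data).__name__)  # TorchBuffer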
@@ -345,7 +286,8 @@ def register(name, fxn, device=Device.CPU):
     tt = [arg for arg in x if isinstance(arg, Tensor)][0]
     x = [Tensor(np.array([arg], dtype=tt.dtype), device=tt.device, requires_grad=False) if not isinstance(arg, Tensor) else arg for arg in x]
     f = Tensor.ops[tt.device][name]
-    f.cl_ctx, f.cl_queue, f.device = cl_ctx, cl_queue, tt.device
+    #f.cl_ctx, f.cl_queue, f.device = cl_ctx, cl_queue, tt.device
+    f.device = tt.device
     return f.apply(f, *x, **kwargs)
   setattr(Tensor, name, dispatch)
   if name in ['add', 'sub', 'mul', 'pow', 'matmul']:
@@ -360,21 +302,18 @@ for device in [device for device in Device.__dict__.keys() if device[0] != "_"]:
 # this registers all the operations
 def _register_ops(namespace, device=Device.CPU):
   for name, cls in inspect.getmembers(namespace, inspect.isclass):
-    if name[0] != "_": register(name.lower(), cls, device=device)
+    if name.endswith("Buffer"): Device.buffers[device] = cls
+    elif name[0] != "_": register(name.lower(), cls, device=device)
 
 # TODO: refactor this
 from tinygrad import ops_cpu
 _register_ops(ops_cpu)
 
 try:
-  import pyopencl as cl
-  # TODO: move this import to require_init_gpu?
   from tinygrad import ops_gpu
   _register_ops(ops_gpu, device=Device.GPU)
-  GPU = True
 except ImportError:
-  # no GPU support
-  GPU = False
+  pass
 
 try:
   import torch
   from tinygrad import ops_torch
   _register_ops(ops_torch, device=Device.TORCH)
 except ImportError:
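
The naming convention is the whole plug-in mechanism: inside an ops module, the class whose name ends in Buffer becomes that device's storage type in Device.buffers, and every other public class is registered as an op. A hypothetical backend module under this convention (FooBuffer and its op are illustrative, not part of tinygrad):

import numpy as np
from tinygrad.tensor import Function

class FooBuffer(np.ndarray):   # picked up by name: Device.buffers[dev] = FooBuffer
  @staticmethod
  def fromCPU(x): return x.view(FooBuffer)
  def toCPU(x): return x.view(np.ndarray)
  def relu(x): return np.maximum(x, 0)

class ReLU(Function):          # registered by name as Tensor.relu for this device
  def forward(ctx, input):
    ctx.save_for_backward(input)
    return input.relu()
  def backward(ctx, grad_output):
    input, = ctx.saved_tensors
    return grad_output * (input >= 0)

# _register_ops(this_module, device=<id>) would then stash FooBuffer in
# Device.buffers and register ReLU, with no edits to tensor.py itself.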