Precompute weights for OpenCL

This commit is contained in:
George Hotz
2022-07-08 10:56:48 -07:00
parent d8e7f1f8bc
commit 68959be05d
3 changed files with 14 additions and 12 deletions

View File

@@ -51,16 +51,12 @@ def get_replacements(prg_src:str, opencl_type:List[str]) -> Dict[str, str]:
def roundup(x, n=4): return (x+(n-1))//n * n
class OpenCLBuffer(GPUBuffer):
def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None):
super().__init__(shape, hostbuf)
def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None, backing:Optional[np.ndarray]=None):
self._image = hostbuf._image if hostbuf is not None else None
super().__init__(shape, hostbuf, backing)
@staticmethod
def fromCPU(x):
ret = OpenCLBuffer(x.shape)
# TODO: this is blocking even though we told it not to
CL.enqueue_copy(ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
return ret
def fromCPU(x): return OpenCLBuffer(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel())
def __repr__(self): return f"<OpenCLBuffer with shape {self.shape!r}>"
@@ -69,8 +65,14 @@ class OpenCLBuffer(GPUBuffer):
if self._buf is None:
if self.st.contiguous:
self._buf = CLBuffer(4*roundup(prod(self.shape)))
if self._backing is not None:
CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
self._backing = None
if self._image is not None:
self._buf = CLBuffer(4*roundup(prod(self._image.shape)*4))
if self._backing is not None:
CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
self._backing = None
#print(f"converting {self.shape} back to buffer, image shape is {self._image.shape}")
CLProgram("from_image", """
__kernel void from_image(
@@ -154,7 +156,6 @@ class OpenCLBuffer(GPUBuffer):
getters.append(f"inline float4 get4_{name}(__global const float *x, const sampler_t smp, int2 loc, int gid) {{"+
f"return (float4)(get_{name}(x,gid+0), get_{name}(x,gid+1), get_{name}(x,gid+2), get_{name}(x,gid+3)); }}")
else:
print("folded")
fakebufs.append(name)
getters.append(f"inline float4 get4_{name}(int gid) {{"+
f"return (float4)(get_{name}(gid+0), get_{name}(gid+1), get_{name}(gid+2), get_{name}(gid+3)); }}")

View File

@@ -56,7 +56,7 @@ def preprocessing_op(x,w,C):
C = C._replace(out_shape = (C.bs*C.oy, C.ox*C.cout//4, 4))
#x = contiguous(ctx, x, x.shapetracker) if not x.shapetracker.contiguous else x
#w = contiguous(ctx, w, w.shapetracker) if not w.shapetracker.contiguous else w
return x,w,C
return x,w.contiguous_op(),C
def postprocessing_op(ret, C, C_initial):
added_output_channels = C.rcout - C_initial.rcout
@@ -75,5 +75,7 @@ def postprocessing_op(ret, C, C_initial):
def processed_conv(x, w, C):
x,w,Cn = preprocessing_op(x,w,C)
# precompute the weight
w.realize().image
ret = x.processing_op(ProcessingOps.CONV, w, Cn)
return postprocessing_op(ret, Cn, C)

View File

@@ -194,14 +194,12 @@ class LazyBuffer:
self.shape = self.st.shape
self.optype, self.op = optype, op
self.realized : Optional[DeviceBuffer] = None
self.device = device
self.device, self.dbuffer = device, Device._buffers[device]
self.children : weakref.WeakSet[LazyBuffer] = weakref.WeakSet()
# NOTE: op should be read only after construction of LazyBuffer
for x in get_lazybuffers(op): x.children.add(self)
if not LAZY: self.realize()
@property
def dbuffer(self) -> DeviceBuffer: return Device._buffers[self.device]
def __repr__(self): return f"<LB {self.shape} op:{self.op.op if self.realized is None else 'realized'}>"
# this produces a device buffer
@@ -225,6 +223,7 @@ class LazyBuffer:
def unary_op(x:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, x)
def binary_op(x:LazyBuffer, op:BinaryOps, y:LazyBuffer) -> LazyBuffer: return elementwise_op(op, x, y)
def contiguous_op(x:LazyBuffer) -> LazyBuffer: return x if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
def reduce_op(x:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
return LazyBuffer(x.device, tuple(new_shape), ReduceOps, LazyOp(op, (x,), tuple(new_shape)))