mirror of https://github.com/tinygrad/tinygrad.git
precompute weights for opencl
@@ -51,16 +51,12 @@ def get_replacements(prg_src:str, opencl_type:List[str]) -> Dict[str, str]:
 def roundup(x, n=4): return (x+(n-1))//n * n
 class OpenCLBuffer(GPUBuffer):
-  def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None):
-    super().__init__(shape, hostbuf)
+  def __init__(self, shape, hostbuf:Optional[OpenCLBuffer]=None, backing:Optional[np.ndarray]=None):
+    self._image = hostbuf._image if hostbuf is not None else None
+    super().__init__(shape, hostbuf, backing)
 
   @staticmethod
-  def fromCPU(x):
-    ret = OpenCLBuffer(x.shape)
-    # TODO: this is blocking even though we told it not to
-    CL.enqueue_copy(ret.cl, x.view(np.ndarray).astype(np.float32).ravel(), is_blocking=False)
-    return ret
+  def fromCPU(x): return OpenCLBuffer(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel())
 
   def __repr__(self): return f"<OpenCLBuffer with shape {self.shape!r}>"
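The old fromCPU enqueued the host-to-device copy immediately (and noted it was blocking anyway); the new version only stashes the flattened float32 array as `backing`, deferring the copy until the buffer is first materialized (next hunk). A minimal stand-alone sketch of that pattern, with class and method names that are ours, not tinygrad's:

import numpy as np

class DeferredBuffer:
  # illustrative stand-in for a device buffer whose host->device copy is deferred
  def __init__(self, shape, backing=None):
    self.shape, self._backing, self._buf = shape, backing, None

  @staticmethod
  def fromCPU(x: np.ndarray) -> "DeferredBuffer":
    # no copy is issued here; the flattened float32 data is just kept around
    return DeferredBuffer(x.shape, backing=x.astype(np.float32).ravel())

  def realize(self):
    # the real code would allocate a CL buffer here and enqueue_copy the backing into it
    if self._buf is None and self._backing is not None:
      self._buf, self._backing = self._backing.copy(), None
    return self._buf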
@@ -69,8 +65,14 @@ class OpenCLBuffer(GPUBuffer):
     if self._buf is None:
       if self.st.contiguous:
         self._buf = CLBuffer(4*roundup(prod(self.shape)))
+        if self._backing is not None:
+          CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
+          self._backing = None
       if self._image is not None:
         self._buf = CLBuffer(4*roundup(prod(self._image.shape)*4))
+        if self._backing is not None:
+          CL.enqueue_copy(self._buf.cl, self._backing, is_blocking=False)
+          self._backing = None
         #print(f"converting {self.shape} back to buffer, image shape is {self._image.shape}")
         CLProgram("from_image", """
           __kernel void from_image(
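In the hunk above, the deferred `_backing` array is uploaded the first time the underlying CL buffer gets created, in both the contiguous-buffer and image-backed branches. A hedged pyopencl sketch of that upload step (the context/queue plumbing and helper name are ours, not tinygrad's):

import numpy as np
import pyopencl as cl

def upload_backing(ctx: cl.Context, queue: cl.CommandQueue, backing: np.ndarray) -> cl.Buffer:
  # allocate the device buffer lazily, then push the stashed host data into it
  buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, size=backing.nbytes)
  cl.enqueue_copy(queue, buf, backing, is_blocking=False)
  return buf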
@@ -154,7 +156,6 @@ class OpenCLBuffer(GPUBuffer):
       getters.append(f"inline float4 get4_{name}(__global const float *x, const sampler_t smp, int2 loc, int gid) {{"+
         f"return (float4)(get_{name}(x,gid+0), get_{name}(x,gid+1), get_{name}(x,gid+2), get_{name}(x,gid+3)); }}")
     else:
-      print("folded")
       fakebufs.append(name)
       getters.append(f"inline float4 get4_{name}(int gid) {{"+
         f"return (float4)(get_{name}(gid+0), get_{name}(gid+1), get_{name}(gid+2), get_{name}(gid+3)); }}")
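For reference, the first `getters.append` branch above generates a vectorized getter like the following; the buffer name `data0` is a hypothetical example, not taken from the commit:

# Python reproduction of the f-string expansion in the hunk above
name = "data0"
getter = (f"inline float4 get4_{name}(__global const float *x, const sampler_t smp, int2 loc, int gid) {{"+
          f"return (float4)(get_{name}(x,gid+0), get_{name}(x,gid+1), get_{name}(x,gid+2), get_{name}(x,gid+3)); }}")
print(getter)
# inline float4 get4_data0(__global const float *x, const sampler_t smp, int2 loc, int gid) {return (float4)(get_data0(x,gid+0), get_data0(x,gid+1), get_data0(x,gid+2), get_data0(x,gid+3)); }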
@@ -56,7 +56,7 @@ def preprocessing_op(x,w,C):
   C = C._replace(out_shape = (C.bs*C.oy, C.ox*C.cout//4, 4))
   #x = contiguous(ctx, x, x.shapetracker) if not x.shapetracker.contiguous else x
   #w = contiguous(ctx, w, w.shapetracker) if not w.shapetracker.contiguous else w
-  return x,w,C
+  return x,w.contiguous_op(),C
 
 def postprocessing_op(ret, C, C_initial):
   added_output_channels = C.rcout - C_initial.rcout
@@ -75,5 +75,7 @@ def postprocessing_op(ret, C, C_initial):
 
 def processed_conv(x, w, C):
   x,w,Cn = preprocessing_op(x,w,C)
+  # precompute the weight
+  w.realize().image
   ret = x.processing_op(ProcessingOps.CONV, w, Cn)
   return postprocessing_op(ret, Cn, C)
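`w.realize().image` is the actual weight precomputation: the weight LazyBuffer is forced to realize and its OpenCL image form is built once, up front, instead of on the first conv that touches it. A rough, self-contained sketch of the build-once-then-reuse idea (the `Weight` class is illustrative, not tinygrad's):

from functools import cached_property
import numpy as np

class Weight:
  def __init__(self, data: np.ndarray): self.data = data

  @cached_property
  def image(self) -> np.ndarray:
    # stand-in for the expensive buffer -> OpenCL image conversion
    return np.ascontiguousarray(self.data.astype(np.float32))

w = Weight(np.random.randn(8, 8))
w.image                     # "precompute the weight" up front
assert w.image is w.image   # later convolutions would reuse the cached form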
@@ -194,14 +194,12 @@ class LazyBuffer:
     self.shape = self.st.shape
     self.optype, self.op = optype, op
     self.realized : Optional[DeviceBuffer] = None
-    self.device = device
+    self.device, self.dbuffer = device, Device._buffers[device]
     self.children : weakref.WeakSet[LazyBuffer] = weakref.WeakSet()
     # NOTE: op should be read only after construction of LazyBuffer
     for x in get_lazybuffers(op): x.children.add(self)
     if not LAZY: self.realize()
 
-  @property
-  def dbuffer(self) -> DeviceBuffer: return Device._buffers[self.device]
   def __repr__(self): return f"<LB {self.shape} op:{self.op.op if self.realized is None else 'realized'}>"
 
   # this produces a device buffer
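The LazyBuffer change caches the device's buffer class on the instance at construction time, replacing the `dbuffer` property that re-read `Device._buffers` on every access. A tiny before/after sketch of that trade (class names here are illustrative):

class Registry:
  _buffers = {"OPENCL": "OpenCLBuffer", "GPU": "GPUBuffer"}

class Before:
  def __init__(self, device): self.device = device
  @property
  def dbuffer(self): return Registry._buffers[self.device]   # dict lookup on every access

class After:
  def __init__(self, device):
    self.device, self.dbuffer = device, Registry._buffers[self.device]  # looked up once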
@@ -225,6 +223,7 @@ class LazyBuffer:
 
 def unary_op(x:LazyBuffer, op:UnaryOps) -> LazyBuffer: return elementwise_op(op, x)
 def binary_op(x:LazyBuffer, op:BinaryOps, y:LazyBuffer) -> LazyBuffer: return elementwise_op(op, x, y)
+def contiguous_op(x:LazyBuffer) -> LazyBuffer: return x if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
 
 def reduce_op(x:LazyBuffer, op:ReduceOps, new_shape:Tuple[int, ...]) -> LazyBuffer:
   return LazyBuffer(x.device, tuple(new_shape), ReduceOps, LazyOp(op, (x,), tuple(new_shape)))
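`contiguous_op` is the new LazyBuffer helper used by `preprocessing_op` above: a buffer whose ShapeTracker is already contiguous passes through untouched, otherwise a NOOP unary op forces a contiguous copy. A plain-numpy analogue of that behaviour (illustrative only):

import numpy as np

def contiguous_op(x: np.ndarray) -> np.ndarray:
  # return the array untouched if it is already contiguous, else materialize a copy
  return x if x.flags["C_CONTIGUOUS"] else np.ascontiguousarray(x)

w = np.arange(12.0).reshape(3, 4).T       # transposed view: not contiguous
assert not w.flags["C_CONTIGUOUS"]
assert contiguous_op(w).flags["C_CONTIGUOUS"]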