diff --git a/docs/design b/docs/design
index b71a5d6ebb..56058ae738 100644
--- a/docs/design
+++ b/docs/design
@@ -38,23 +38,31 @@
 C.shape = max(A.shape, B.shape)

-Movement Ops
+Movement Ops (2 or 1)
 ===

 Reshape, Transpose, Slice

 Depending on your Tensor implementation, these are free.
 Reshape is almost always free.
-Slice can be made free.
+Slice can be made free, but probably shouldn't be.
 Transpose is hard to make free except in trivial cases.
 Regardless, these are "reindexings" of existing arrays
+Transpose and Slice are similar enough I think they can be merged.
+They should use a DMA engine.

-
-Processing Ops
+Processing Ops (4)
 ===

 Matmul is 1 matmul for forward, 2 for backward.
-Conv2D is very complex.
+* It's actually three matmuls with transposed arguments
+* cublasSgemm()
+
+Conv2D is very complex. It seems to need three kernels.
+* cudnnConvolutionForward()
+* cudnnConvolutionBackwardData()
+* cudnnConvolutionBackwardFilter()
+NOTE: Tensor Cores require that the tensors be in the NHWC data layout.
diff --git a/tinygrad/llops/gpu.py b/tinygrad/llops/gpu.py
index 603d1353c7..b4b64efcf5 100644
--- a/tinygrad/llops/gpu.py
+++ b/tinygrad/llops/gpu.py
@@ -1,5 +1,4 @@
 # llops don't know about derivatives
-
 import functools
 import numpy as np
 import pyopencl as cl
@@ -45,11 +44,6 @@ def buffer_new(shape, zero=False):
 def buffer_np(x):
   return cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x)

-def clbuffer(hostbuf, shape):
-  return cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0),
-                   4*np.prod(shape),
-                   hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None)
-
 @functools.lru_cache
 def clbuild(name, prg):
   clprg = cl.Program(cl_ctx, prg).build().__getattr__(name)
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index f1a6d6e7e2..267e44994a 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -287,16 +287,14 @@ class Tensor:
     return self.relu() - (-neg_slope*self).relu()

   def softmax(self):
-    ns = list(self.shape)[:-1]+[1]
-    m = self.max(axis=len(self.shape)-1).reshape(shape=ns)
+    m = self.max(axis=len(self.shape)-1, keepdim=True)
     e = (self - m).exp()
-    ss = e.sum(axis=len(self.shape)-1).reshape(shape=ns)
+    ss = e.sum(axis=len(self.shape)-1, keepdim=True)
     return e.div(ss)

   def logsoftmax(self):
-    ns = list(self.shape)[:-1]+[1]
-    m = self.max(axis=len(self.shape)-1).reshape(shape=ns)
-    ss = m + (self-m).exp().sum(axis=len(self.shape)-1).reshape(shape=ns).log()
+    m = self.max(axis=len(self.shape)-1, keepdim=True)
+    ss = m + (self-m).exp().sum(axis=len(self.shape)-1, keepdim=True).log()
     return self - ss

   def dropout(self, p=0.5):
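
Notes
===

Movement ops as "reindexings", sketched in numpy. This is a hand-rolled
illustration, not tinygrad code: numpy stands in for the Tensor
implementation, and shows why reshape and slice can be free views while
transpose only gives a non-contiguous view.

    import numpy as np

    a = np.arange(12, dtype=np.float32).reshape(3, 4)

    # Reshape: same buffer, new shape -- free whenever the array is contiguous.
    r = a.reshape(4, 3)
    assert np.shares_memory(a, r)

    # Transpose: numpy makes it "free" by swapping strides, but the result is
    # no longer contiguous, so whatever consumes it may pay the copy instead.
    t = a.T
    assert np.shares_memory(a, t) and not t.flags['C_CONTIGUOUS']

    # Slice: free as a strided view; "probably shouldn't be" free because a
    # live view pins the entire original buffer in memory.
    s = a[1:, ::2]
    assert np.shares_memory(a, s)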
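
Why matmul is 1 matmul forward and 2 backward: both backward products are
the same matmul with transposed arguments, which is exactly what
cublasSgemm() exposes through its transa/transb flags. A numpy sketch of
the math, not the GPU path:

    import numpy as np

    A = np.random.randn(3, 4).astype(np.float32)
    B = np.random.randn(4, 5).astype(np.float32)

    C = A @ B             # forward: 1 matmul
    dC = np.ones_like(C)  # upstream gradient of C

    dA = dC @ B.T         # backward: same matmul, B transposed
    dB = A.T @ dC         # backward: same matmul, A transposed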
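
The three Conv2D kernels, sketched for a toy single-channel, stride-1,
unpadded case. This hand-rolled version only illustrates what the three
cuDNN entry points compute, not how cuDNN computes it:

    import numpy as np

    def corr2d(X, W):
      # valid cross-correlation: the core loop shared by all three kernels
      kh, kw = W.shape
      oh, ow = X.shape[0] - kh + 1, X.shape[1] - kw + 1
      out = np.zeros((oh, ow), dtype=X.dtype)
      for i in range(oh):
        for j in range(ow):
          out[i, j] = (X[i:i+kh, j:j+kw] * W).sum()
      return out

    X = np.random.randn(5, 5).astype(np.float32)
    W = np.random.randn(3, 3).astype(np.float32)
    kh, kw = W.shape

    Y = corr2d(X, W)      # ~ cudnnConvolutionForward
    dY = np.ones_like(Y)  # upstream gradient of Y

    # ~ cudnnConvolutionBackwardFilter: correlate the input with the gradient
    dW = corr2d(X, dY)

    # ~ cudnnConvolutionBackwardData: full correlation of the padded gradient
    # with the flipped filter
    dX = corr2d(np.pad(dY, ((kh-1,)*2, (kw-1,)*2)), W[::-1, ::-1])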
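
Finally, what the tensor.py change computes. keepdim=True keeps the reduced
axis around as size 1, so the max and the sum broadcast back against the
input without the old reshape(shape=ns) dance. A numpy sketch of the
stabilized logsoftmax (numpy spells the flag keepdims):

    import numpy as np

    def logsoftmax(x):
      # subtracting the row max keeps exp() from overflowing; adding m back
      # inside the log-sum gives the same result for finite inputs
      m = x.max(axis=-1, keepdims=True)
      ss = m + np.log(np.exp(x - m).sum(axis=-1, keepdims=True))
      return x - ss

    x = np.array([[1.0, 2.0, 3.0], [1000.0, 1000.0, 1000.0]])
    print(logsoftmax(x))  # finite everywhere; naive exp(1000) would overflow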