diff --git a/docs/design b/docs/design
index b71a5d6ebb..56058ae738 100644
--- a/docs/design
+++ b/docs/design
@@ -38,23 +38,31 @@
 C.shape = max(A.shape, B.shape)

-Movement Ops
+Movement Ops (2 or 1)
 ===

 Reshape, Transpose, Slice

 Depending on your Tensor implementation, these are free.
 Reshape is almost always free.
-Slice can be made free.
+Slice can be made free, but probably shouldn't be.
 Transpose is hard to make free except in trivial cases.
 Regardless, these are "reindexings" of existing arrays
+Transpose and Slice are similar enough I think they can be merged.
+They should use a DMA engine.

-
-Processing Ops
+Processing Ops (4)
 ===

 Matmul is 1 matmul for forward, 2 for backward.
-Conv2D is very complex.
+* It's actually three matmuls with transposed arguments
+* cublasSgemm()
+
+Conv2D is very complex. It seems to need three kernels.
+* cudnnConvolutionForward()
+* cudnnConvolutionBackwardData()
+* cudnnConvolutionBackwardFilter()
+NOTE: Tensor Cores require that the tensors be in the NHWC data layout.
diff --git a/tinygrad/llops/gpu.py b/tinygrad/llops/gpu.py
index 603d1353c7..b4b64efcf5 100644
--- a/tinygrad/llops/gpu.py
+++ b/tinygrad/llops/gpu.py
@@ -1,5 +1,4 @@
 # llops don't know about derivatives
-
 import functools
 import numpy as np
 import pyopencl as cl
@@ -45,11 +44,6 @@ def buffer_new(shape, zero=False):
 def buffer_np(x):
   return cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x)

-def clbuffer(hostbuf, shape):
-  return cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0),
-                   4*np.prod(shape),
-                   hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None)
-
 @functools.lru_cache
 def clbuild(name, prg):
   clprg = cl.Program(cl_ctx, prg).build().__getattr__(name)
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index f1a6d6e7e2..267e44994a 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -287,16 +287,14 @@ class Tensor:
     return self.relu() - (-neg_slope*self).relu()

   def softmax(self):
-    ns = list(self.shape)[:-1]+[1]
-    m = self.max(axis=len(self.shape)-1).reshape(shape=ns)
+    m = self.max(axis=len(self.shape)-1, keepdim=True)
     e = (self - m).exp()
-    ss = e.sum(axis=len(self.shape)-1).reshape(shape=ns)
+    ss = e.sum(axis=len(self.shape)-1, keepdim=True)
     return e.div(ss)

   def logsoftmax(self):
-    ns = list(self.shape)[:-1]+[1]
-    m = self.max(axis=len(self.shape)-1).reshape(shape=ns)
-    ss = m + (self-m).exp().sum(axis=len(self.shape)-1).reshape(shape=ns).log()
+    m = self.max(axis=len(self.shape)-1, keepdim=True)
+    ss = m + (self-m).exp().sum(axis=len(self.shape)-1, keepdim=True).log()
     return self - ss

   def dropout(self, p=0.5):
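
Notes
===

Movement ops as "reindexings", sketched in numpy. This is a hand-rolled
illustration, not tinygrad code: numpy stands in for the Tensor
implementation, and shows why reshape and slice can be free views while
transpose only gives a non-contiguous view.

    import numpy as np

    a = np.arange(12, dtype=np.float32).reshape(3, 4)

    # Reshape: same buffer, new shape -- free whenever the array is contiguous.
    r = a.reshape(4, 3)
    assert np.shares_memory(a, r)

    # Transpose: numpy makes it "free" by swapping strides, but the result is
    # no longer contiguous, so whatever consumes it may pay the copy instead.
    t = a.T
    assert np.shares_memory(a, t) and not t.flags['C_CONTIGUOUS']

    # Slice: free as a strided view; "probably shouldn't be" free because a
    # live view pins the entire original buffer in memory.
    s = a[1:, ::2]
    assert np.shares_memory(a, s)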
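
Why matmul is 1 matmul forward and 2 backward: both backward products are
the same matmul with transposed arguments, which is exactly what
cublasSgemm() exposes through its transa/transb flags. A numpy sketch of
the math, not the GPU path:

    import numpy as np

    A = np.random.randn(3, 4).astype(np.float32)
    B = np.random.randn(4, 5).astype(np.float32)

    C = A @ B             # forward: 1 matmul
    dC = np.ones_like(C)  # upstream gradient of C

    dA = dC @ B.T         # backward: same matmul, B transposed
    dB = A.T @ dC         # backward: same matmul, A transposed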
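
The three Conv2D kernels, sketched for a toy single-channel, stride-1,
unpadded case. This hand-rolled version only illustrates what the three
cuDNN entry points compute, not how cuDNN computes it:

    import numpy as np

    def corr2d(X, W):
      # valid cross-correlation: the core loop shared by all three kernels
      kh, kw = W.shape
      oh, ow = X.shape[0] - kh + 1, X.shape[1] - kw + 1
      out = np.zeros((oh, ow), dtype=X.dtype)
      for i in range(oh):
        for j in range(ow):
          out[i, j] = (X[i:i+kh, j:j+kw] * W).sum()
      return out

    X = np.random.randn(5, 5).astype(np.float32)
    W = np.random.randn(3, 3).astype(np.float32)
    kh, kw = W.shape

    Y = corr2d(X, W)      # ~ cudnnConvolutionForward
    dY = np.ones_like(Y)  # upstream gradient of Y

    # ~ cudnnConvolutionBackwardFilter: correlate the input with the gradient
    dW = corr2d(X, dY)

    # ~ cudnnConvolutionBackwardData: full correlation of the padded gradient
    # with the flipped filter
    dX = corr2d(np.pad(dY, ((kh-1,)*2, (kw-1,)*2)), W[::-1, ::-1])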
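
Finally, what the tensor.py change computes. keepdim=True keeps the reduced
axis around as size 1, so the max and the sum broadcast back against the
input without the old reshape(shape=ns) dance. A numpy sketch of the
stabilized logsoftmax (numpy spells the flag keepdims):

    import numpy as np

    def logsoftmax(x):
      # subtracting the row max keeps exp() from overflowing; adding m back
      # inside the log-sum gives the same result for finite inputs
      m = x.max(axis=-1, keepdims=True)
      ss = m + np.log(np.exp(x - m).sum(axis=-1, keepdims=True))
      return x - ss

    x = np.array([[1.0, 2.0, 3.0], [1000.0, 1000.0, 1000.0]])
    print(logsoftmax(x))  # finite everywhere; naive exp(1000) would overflow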