* rawcpu

* add should work when we respect shapetracker

* now that's true

* still have to handle shapetracker

* copyin

* Fix mypy
This commit is contained in:
George Hotz
2022-08-17 11:33:20 +02:00
committed by GitHub
parent 57e5df9f28
commit 783c120a8c
9 changed files with 127 additions and 1 deletions

63
accel/rawcpu/buffer.pyx Normal file
View File

@@ -0,0 +1,63 @@
from cbuffer cimport CBuffer
cimport numpy as np
import numpy as np
from typing import Tuple
from tinygrad.helpers import prod
from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, ReduceOps
from tinygrad.shapetracker import ShapeTracker
cdef class RawCPUBuffer:
  """CPU backend buffer: raw C float storage (CBuffer) plus a ShapeTracker view.

  Movement ops are zero-copy: a child constructed with parent= aliases the
  parent's CBuffer and only carries a new ShapeTracker.
  """
  cdef CBuffer *buf
  st: ShapeTracker
  def __init__(self, shape, RawCPUBuffer parent=None):
    # TODO: copied from ops gpu, generic this?
    self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
    if parent is not None: self.buf = parent.buf  # alias parent storage (zero-copy movement op)
    else: self.buf = new CBuffer(prod(shape))
  @property
  def shape(self): return self.st.shape
  def contiguous_op(RawCPUBuffer x) -> RawCPUBuffer:
    # a NOOP unary op is expected to materialize a contiguous copy
    return x if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
  @staticmethod
  def fromCPU(np.ndarray x):
    # force contiguous float32 so x.data matches CBuffer's flat float layout
    x = np.ascontiguousarray(x, dtype=np.float32)
    ret = RawCPUBuffer([x.shape[i] for i in range(x.ndim)])
    ret.buf.copyin(x.data)
    return ret
  def toCPU(RawCPUBuffer self):
    x: RawCPUBuffer
    print("toCPU", self.buf.size, self.st)
    x = self.contiguous_op()
    buf = memoryview(<float[:prod(x.shape)]> x.buf.buf)  # zero-copy view of the C buffer
    return np.frombuffer(buf, dtype=np.float32).reshape(x.shape)
  # 1 free generic op same as GPU (superclass with shapetracker?)
  def movement_op(RawCPUBuffer x, op, arg): return type(x)(ShapeTracker(x.st).movement_op(op, arg), x)
  # 3 actual ops
  REQUIRES_SIMPLE_REDUCE = True
  def reduce_op(RawCPUBuffer x, op:ReduceOps, new_shape:Tuple[int, ...]):
    # TODO: stub -- does not reduce yet, returns the input unchanged
    return x
  def unary_op(RawCPUBuffer x, op):
    # TODO: stub -- should apply op (and materialize a copy for NOOP/contiguous_op)
    print(op, x.st)
    return x
  # TODO: shape/strides for x and y combined
  def binary_op(RawCPUBuffer x, op, RawCPUBuffer y):
    print(op, x.st, y.st)
    # __init__ already allocates a CBuffer of prod(shape); the old extra
    # `ret.buf = new CBuffer(prod(x.shape))` leaked that first allocation
    ret = RawCPUBuffer(x.shape)
    if op == BinaryOps.ADD: ret.buf.add(x.buf, y.buf)
    elif op == BinaryOps.MUL: ret.buf.mul(x.buf, y.buf)
    else: raise NotImplementedError()
    # TODO: write binary op in c++
    return ret
  # can all be combined into _processing_op

View File

@@ -0,0 +1,10 @@
import os
import numpy as np
def make_ext(modname, pyxfilename):
    """pyximport build hook: return the Extension used to compile *pyxfilename*.

    Compiles the module as C++ and puts the .pyx file's own directory plus the
    numpy headers on the include path.
    """
    from distutils.extension import Extension
    return Extension(
        name=modname,
        include_dirs=[os.path.dirname(pyxfilename), np.get_include()],
        sources=[pyxfilename],
        language='c++',
    )

26
accel/rawcpu/cbuffer.h Normal file
View File

@@ -0,0 +1,26 @@
// Minimal flat float buffer with elementwise ops, exposed to Cython via cbuffer.pxd.
// NOTE(review): intentionally no destructor -- the Python side (buffer.pyx) aliases
// one CBuffer* across several RawCPUBuffers, so freeing here could double-free.
class CBuffer {
  public:
    // size_ is the element count (floats, not bytes). If dat is non-NULL, its
    // first size_ floats are copied in (previously the argument was ignored).
    CBuffer(int size_, void* dat = NULL) {
      size = size_;
      buf = (float*)malloc(size*sizeof(float));  // sizeof(float), not a hard-coded 4
      if (dat != NULL) copyin(dat);
    }
    // Copy size floats from dat into buf. Caller guarantees dat is large enough.
    void copyin(void *dat) {
      memcpy(buf, dat, size*sizeof(float));
    }
    // this = x + y, elementwise. All three buffers must share the same size.
    void add(CBuffer *x, CBuffer *y) {
      for (int i = 0; i < size; i++) {
        buf[i] = x->buf[i] + y->buf[i];
      }
    }
    // this = x * y, elementwise.
    void mul(CBuffer *x, CBuffer *y) {
      for (int i = 0; i < size; i++) {
        buf[i] = x->buf[i] * y->buf[i];
      }
    }
    float *buf;
    int size;
};

12
accel/rawcpu/cbuffer.pxd Normal file
View File

@@ -0,0 +1,12 @@
# distutils: language = c++
# distutils: sources = cbuffer.h
# Cython declarations mirroring the C++ CBuffer in cbuffer.h, used by buffer.pyx.
# Keep in sync with the header (which also accepts an optional dat pointer).
cdef extern from "cbuffer.h":
  cdef cppclass CBuffer:
    CBuffer(int size)                 # allocates storage for `size` floats
    void copyin(void *dat)            # copies `size` floats from dat into buf
    void add(CBuffer *a, CBuffer *b)  # this.buf = a.buf + b.buf, elementwise
    void mul(CBuffer *a, CBuffer *b)  # this.buf = a.buf * b.buf, elementwise
    float *buf
    int size

View File

@@ -0,0 +1,9 @@
# type: ignore
import sys
# Install pyximport only long enough to compile/import the Cython buffer
# module, then remove its hook from sys.meta_path so it cannot interfere
# with any later imports.
import pyximport
py_importer, pyx_importer = pyximport.install()
from accel.rawcpu.buffer import RawCPUBuffer  # noqa: F401 -- re-exported as this backend's buffer class
sys.meta_path.remove(pyx_importer)

View File

@@ -115,6 +115,7 @@ class GPUBuffer:
# Reduce x into new_shape by accumulating buffer "A" into "acc" using the
# op-specific early code/start value from GPUBuffer.code_for_op / start_for_op.
def reduce_op(x, op:ReduceOps, new_shape:Tuple[int, ...]): return type(x)(new_shape)._processing_op([("A", x)], code="acc", earlycode=GPUBuffer.code_for_op[op], earlybufs=set("A"), start=GPUBuffer.start_for_op[op])
#REQUIRES_SIMPLE_REDUCE = True
# is there a downside to REQUIRES_SIMPLE_REDUCE always?
def _processing_op(ret, bufs: List[Tuple[str, GPUBuffer]]=[], code:str="acc", C:Optional[ConvArgs]=None, start="0.0", reduce_shape=None, earlybufs:Set[str]=set(), earlycode:str="acc") -> GPUBuffer:
assert C is None

View File

@@ -0,0 +1 @@
../../accel/rawcpu/ops_rawcpu.py

View File

@@ -251,7 +251,7 @@ class LazyBuffer:
if getattr(x.dbuffer, "REQUIRES_SIMPLE_REDUCE", False) and (len(new_shape) != 2 or new_shape[1] != 1):
num, red = prod([s for s,n in zip(x.shape, new_shape) if n != 1]), prod([s for s,n in zip(x.shape, new_shape) if n == 1])
x = x.movement_op(MovementOps.PERMUTE, [i for i,n in enumerate(new_shape) if n != 1] + [i for i,n in enumerate(new_shape) if n == 1])
x = x.movement_op(MovementOps.RESHAPE, (num, red))
x = x.movement_op(MovementOps.RESHAPE, (num, red)) # remove this reshape, at the end is enough
return x.reduce_op(op, (num, 1)).movement_op(MovementOps.RESHAPE, new_shape)
else:
return LazyBuffer(x.device, tuple(new_shape), ReduceOps, LazyOp(op, (x,), tuple(new_shape)))

View File

@@ -68,6 +68,7 @@ def view_from_shape(shape:Tuple[int, ...]):
class ShapeTracker:
def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]]):
  # Copy the view stack when cloning another tracker; otherwise start from a
  # single fresh view of the given shape.
  self.views : List[ViewTypes] = shape.views[:] if isinstance(shape, ShapeTracker) else [view_from_shape(shape)]
# The "Complex" prefix flags trackers holding more than one stacked view.
def __repr__(self): return f"{'Complex' if len(self.views) > 1 else ''}ShapeTracker<{self.shape}, {self.views}>"
@property
def contiguous(self): return len(self.views) == 1 and self.views[-1].contiguous  # exactly one view, itself contiguous
@@ -85,11 +86,14 @@ class ShapeTracker:
# Dispatch e.g. MovementOps.RESHAPE to self.reshape(*arg); returns self so calls chain.
def movement_op(self, op, arg): getattr(self, str(op).split(".")[1].lower())(*arg); return self
# NOTE(review): presumably a ZeroView can map positions out of range -- hence the valid guard when indexing.
def needs_valid(self): return any(isinstance(v, ZeroView) for v in self.views)
# TODO: this is not really needed, only for testing
def __getitem__(self, val):
  """Map flat index `val` through the view stack.

  Executes the index expression generated by self.expr(), which rewrites
  `idx` and `valid` in the supplied namespace. Returns the mapped index,
  or -1 when the position is masked invalid.
  """
  namespace = {"idx": val, "valid": 1}  # renamed: the old name shadowed the builtin locals()
  exec(self.expr(), None, namespace)
  return namespace["idx"] if namespace["valid"] else -1
# TODO: do we really need this for conv?
# if we replace, confirm the ops taken fold into one view
def strided(self, *arg):
view = View([x[0] for x in arg], [x[1] for x in arg])
if self.contiguous: self.views[-1] = view