* rawcpu

* add should work when we respect shapetracker

* now that's true

* still have to handle shapetracker

* copyin

* Fix mypy
This commit is contained in:
George Hotz
2022-08-17 11:33:20 +02:00
committed by GitHub
parent 57e5df9f28
commit 783c120a8c
9 changed files with 127 additions and 1 deletions

63
accel/rawcpu/buffer.pyx Normal file
View File

@@ -0,0 +1,63 @@
from cbuffer cimport CBuffer
cimport numpy as np
import numpy as np
from typing import Tuple
from tinygrad.helpers import prod
from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, ReduceOps
from tinygrad.shapetracker import ShapeTracker
cdef class RawCPUBuffer:
  """CPU backend buffer: raw C float storage (CBuffer) plus a ShapeTracker view.

  Movement ops are zero-copy: a child constructed with parent= aliases the
  parent's CBuffer and only carries a new ShapeTracker.
  """
  cdef CBuffer *buf
  st: ShapeTracker
  def __init__(self, shape, RawCPUBuffer parent=None):
    # TODO: copied from ops gpu, generic this?
    self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape))
    if parent is not None: self.buf = parent.buf  # alias parent storage (zero-copy movement op)
    else: self.buf = new CBuffer(prod(shape))
  @property
  def shape(self): return self.st.shape
  def contiguous_op(RawCPUBuffer x) -> RawCPUBuffer:
    # a NOOP unary op is expected to materialize a contiguous copy
    return x if x.st.contiguous else x.unary_op(UnaryOps.NOOP)
  @staticmethod
  def fromCPU(np.ndarray x):
    # force contiguous float32 so x.data matches CBuffer's flat float layout
    x = np.ascontiguousarray(x, dtype=np.float32)
    ret = RawCPUBuffer([x.shape[i] for i in range(x.ndim)])
    ret.buf.copyin(x.data)
    return ret
  def toCPU(RawCPUBuffer self):
    x: RawCPUBuffer
    print("toCPU", self.buf.size, self.st)
    x = self.contiguous_op()
    buf = memoryview(<float[:prod(x.shape)]> x.buf.buf)  # zero-copy view of the C buffer
    return np.frombuffer(buf, dtype=np.float32).reshape(x.shape)
  # 1 free generic op same as GPU (superclass with shapetracker?)
  def movement_op(RawCPUBuffer x, op, arg): return type(x)(ShapeTracker(x.st).movement_op(op, arg), x)
  # 3 actual ops
  REQUIRES_SIMPLE_REDUCE = True
  def reduce_op(RawCPUBuffer x, op:ReduceOps, new_shape:Tuple[int, ...]):
    # TODO: stub -- does not reduce yet, returns the input unchanged
    return x
  def unary_op(RawCPUBuffer x, op):
    # TODO: stub -- should apply op (and materialize a copy for NOOP/contiguous_op)
    print(op, x.st)
    return x
  # TODO: shape/strides for x and y combined
  def binary_op(RawCPUBuffer x, op, RawCPUBuffer y):
    print(op, x.st, y.st)
    # __init__ already allocates a CBuffer of prod(shape); the old extra
    # `ret.buf = new CBuffer(prod(x.shape))` leaked that first allocation
    ret = RawCPUBuffer(x.shape)
    if op == BinaryOps.ADD: ret.buf.add(x.buf, y.buf)
    elif op == BinaryOps.MUL: ret.buf.mul(x.buf, y.buf)
    else: raise NotImplementedError()
    # TODO: write binary op in c++
    return ret
  # can all be combined into _processing_op

View File

@@ -0,0 +1,10 @@
import os
import numpy as np
def make_ext(modname, pyxfilename):
    """pyximport build hook: return the Extension used to compile *pyxfilename*.

    Compiles the module as C++ and puts the .pyx file's own directory plus the
    numpy headers on the include path.
    """
    from distutils.extension import Extension
    return Extension(
        name=modname,
        include_dirs=[os.path.dirname(pyxfilename), np.get_include()],
        sources=[pyxfilename],
        language='c++',
    )

26
accel/rawcpu/cbuffer.h Normal file
View File

@@ -0,0 +1,26 @@
// Minimal flat float buffer with elementwise ops, exposed to Cython via cbuffer.pxd.
// NOTE(review): intentionally no destructor -- the Python side (buffer.pyx) aliases
// one CBuffer* across several RawCPUBuffers, so freeing here could double-free.
class CBuffer {
  public:
    // size_ is the element count (floats, not bytes). If dat is non-NULL, its
    // first size_ floats are copied in (previously the argument was ignored).
    CBuffer(int size_, void* dat = NULL) {
      size = size_;
      buf = (float*)malloc(size*sizeof(float));  // sizeof(float), not a hard-coded 4
      if (dat != NULL) copyin(dat);
    }
    // Copy size floats from dat into buf. Caller guarantees dat is large enough.
    void copyin(void *dat) {
      memcpy(buf, dat, size*sizeof(float));
    }
    // this = x + y, elementwise. All three buffers must share the same size.
    void add(CBuffer *x, CBuffer *y) {
      for (int i = 0; i < size; i++) {
        buf[i] = x->buf[i] + y->buf[i];
      }
    }
    // this = x * y, elementwise.
    void mul(CBuffer *x, CBuffer *y) {
      for (int i = 0; i < size; i++) {
        buf[i] = x->buf[i] * y->buf[i];
      }
    }
    float *buf;
    int size;
};

12
accel/rawcpu/cbuffer.pxd Normal file
View File

@@ -0,0 +1,12 @@
# distutils: language = c++
# distutils: sources = cbuffer.h
# Cython declarations mirroring the C++ CBuffer in cbuffer.h, used by buffer.pyx.
# Keep in sync with the header (which also accepts an optional dat pointer).
cdef extern from "cbuffer.h":
  cdef cppclass CBuffer:
    CBuffer(int size)                 # allocates storage for `size` floats
    void copyin(void *dat)            # copies `size` floats from dat into buf
    void add(CBuffer *a, CBuffer *b)  # this.buf = a.buf + b.buf, elementwise
    void mul(CBuffer *a, CBuffer *b)  # this.buf = a.buf * b.buf, elementwise
    float *buf
    int size

View File

@@ -0,0 +1,9 @@
# type: ignore
import sys
# Install pyximport only long enough to compile/import the Cython buffer
# module, then remove its hook from sys.meta_path so it cannot interfere
# with any later imports.
import pyximport
py_importer, pyx_importer = pyximport.install()
from accel.rawcpu.buffer import RawCPUBuffer  # noqa: F401 -- re-exported as this backend's buffer class
sys.meta_path.remove(pyx_importer)

View File

@@ -115,6 +115,7 @@ class GPUBuffer:
# Reduce x into new_shape by accumulating buffer "A" into "acc" using the
# op-specific early code/start value from GPUBuffer.code_for_op / start_for_op.
def reduce_op(x, op:ReduceOps, new_shape:Tuple[int, ...]): return type(x)(new_shape)._processing_op([("A", x)], code="acc", earlycode=GPUBuffer.code_for_op[op], earlybufs=set("A"), start=GPUBuffer.start_for_op[op])
#REQUIRES_SIMPLE_REDUCE = True
# is there a downside to REQUIRES_SIMPLE_REDUCE always?
def _processing_op(ret, bufs: List[Tuple[str, GPUBuffer]]=[], code:str="acc", C:Optional[ConvArgs]=None, start="0.0", reduce_shape=None, earlybufs:Set[str]=set(), earlycode:str="acc") -> GPUBuffer:
assert C is None

View File

@@ -0,0 +1 @@
../../accel/rawcpu/ops_rawcpu.py

View File

@@ -251,7 +251,7 @@ class LazyBuffer:
if getattr(x.dbuffer, "REQUIRES_SIMPLE_REDUCE", False) and (len(new_shape) != 2 or new_shape[1] != 1):
num, red = prod([s for s,n in zip(x.shape, new_shape) if n != 1]), prod([s for s,n in zip(x.shape, new_shape) if n == 1])
x = x.movement_op(MovementOps.PERMUTE, [i for i,n in enumerate(new_shape) if n != 1] + [i for i,n in enumerate(new_shape) if n == 1])
x = x.movement_op(MovementOps.RESHAPE, (num, red))
x = x.movement_op(MovementOps.RESHAPE, (num, red)) # remove this reshape, at the end is enough
return x.reduce_op(op, (num, 1)).movement_op(MovementOps.RESHAPE, new_shape)
else:
return LazyBuffer(x.device, tuple(new_shape), ReduceOps, LazyOp(op, (x,), tuple(new_shape)))

View File

@@ -68,6 +68,7 @@ def view_from_shape(shape:Tuple[int, ...]):
class ShapeTracker:
def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]]):
  # Copy the view stack when cloning another tracker; otherwise start from a
  # single fresh view of the given shape.
  self.views : List[ViewTypes] = shape.views[:] if isinstance(shape, ShapeTracker) else [view_from_shape(shape)]
# The "Complex" prefix flags trackers holding more than one stacked view.
def __repr__(self): return f"{'Complex' if len(self.views) > 1 else ''}ShapeTracker<{self.shape}, {self.views}>"
@property
def contiguous(self): return len(self.views) == 1 and self.views[-1].contiguous  # exactly one view, itself contiguous
@@ -85,11 +86,14 @@ class ShapeTracker:
# Dispatch e.g. MovementOps.RESHAPE to self.reshape(*arg); returns self so calls chain.
def movement_op(self, op, arg): getattr(self, str(op).split(".")[1].lower())(*arg); return self
# NOTE(review): presumably a ZeroView can map positions out of range -- hence the valid guard when indexing.
def needs_valid(self): return any(isinstance(v, ZeroView) for v in self.views)
# TODO: this is not really needed, only for testing
def __getitem__(self, val):
  """Map flat index `val` through the view stack.

  Executes the index expression generated by self.expr(), which rewrites
  `idx` and `valid` in the supplied namespace. Returns the mapped index,
  or -1 when the position is masked invalid.
  """
  namespace = {"idx": val, "valid": 1}  # renamed: the old name shadowed the builtin locals()
  exec(self.expr(), None, namespace)
  return namespace["idx"] if namespace["valid"] else -1
# TODO: do we really need this for conv?
# if we replace, confirm the ops taken fold into one view
def strided(self, *arg):
view = View([x[0] for x in arg], [x[1] for x in arg])
if self.contiguous: self.views[-1] = view