readme, plus reduce ops

George Hotz
2021-06-16 11:21:06 -07:00
parent ff3fdc58e5
commit b1000d866e
3 changed files with 100 additions and 1 deletion


@@ -67,6 +67,10 @@ class BinaryOps(Enum):
  MULACC = 4
  POW = 5

class ReduceOps(Enum):
  SUM = 0
  MAX = 1

for t in Reg:
  regfile[t] = np.zeros((SZ, SZ), dtype=np.float32)
@@ -143,6 +147,14 @@ def riski_mulacc():
def riski_pow():
  regfile[Reg.MATMUL_OUTPUT] = regfile[Reg.MATMUL_INPUT] ** regfile[Reg.MATMUL_WEIGHTS]

@count
def riski_reduce_sum():
  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].sum(axis=0)

@count
def riski_reduce_max():
  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].max(axis=0)

# TODO: make accumulate a bit in the instruction available to all
binops = {BinaryOps.ADD: riski_add,
          BinaryOps.SUB: riski_sub,
@@ -151,6 +163,9 @@ binops = {BinaryOps.ADD: riski_add,
          BinaryOps.MULACC: riski_mulacc,
          BinaryOps.POW: riski_pow}

reduceops = {ReduceOps.SUM: riski_reduce_sum,
             ReduceOps.MAX: riski_reduce_max}
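For orientation, here is a standalone sketch of how the new reduce instructions act on the SZ x SZ register file. SZ, Reg, @count and the real regfile live elsewhere in this file, so the snippet re-declares minimal stand-ins; the names and the SZ value below are illustrative assumptions, not part of the commit.

import numpy as np
from enum import Enum

SZ = 16                      # tile size; stand-in value, the real SZ is defined elsewhere in the file
class Reg(Enum):             # subset of the real register enum, for illustration
  MATMUL_INPUT = 0
  MATMUL_OUTPUT = 1
regfile = {t: np.zeros((SZ, SZ), dtype=np.float32) for t in Reg}

def riski_reduce_sum():
  # column-wise sum of the input tile lands in row 0 of the output register
  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].sum(axis=0)

x = np.random.uniform(size=(SZ, SZ)).astype(np.float32)
regfile[Reg.MATMUL_INPUT][:] = x      # stand-in for loading one tile into the register file
riski_reduce_sum()
np.testing.assert_allclose(regfile[Reg.MATMUL_OUTPUT][0], x.sum(axis=0), rtol=1e-5)

The reduceops table added above gives the compiled CHERRY code a dispatch path analogous to binops.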
@count
# TODO: add masks to matmul instruction?
def riski_matmul():
@@ -210,6 +225,12 @@ def cherry_dmaw(address, shp):
# *** CHERRY code to be compiled ***

def cherry_reduceop(x, op, axis):
  print(op, x.shape, axis)
  cherry_dmar(SLOT(0), x)
  return cherry_dmaw(SLOT(2), x.shape)
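In this commit cherry_reduceop is still a stub: it prints its arguments, DMAs the input into SLOT(0) and DMAs a buffer of the same shape back out. As a hedged sketch of the intended semantics, assuming ReduceOps.SUM/MAX map to a sum/max over the given axis (whether reduced axes are kept or squeezed is left open here; the sketch keeps them):

import numpy as np
from enum import Enum

class ReduceOps(Enum):   # mirrors the enum added above
  SUM = 0
  MAX = 1

def reference_reduceop(x, op, axis):
  # hypothetical host-side reference, not the device implementation
  ax = None if axis is None else tuple(axis) if isinstance(axis, (list, tuple)) else (axis,)
  if op == ReduceOps.SUM: return x.sum(axis=ax, keepdims=True)
  if op == ReduceOps.MAX: return x.max(axis=ax, keepdims=True)

x = np.arange(12, dtype=np.float32).reshape(3, 4)
print(reference_reduceop(x, ReduceOps.SUM, 1))     # shape (3, 1)
print(reference_reduceop(x, ReduceOps.MAX, None))  # shape (1, 1)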
def cherry_unop(x, op):
  cherry_dmar(SLOT(0), x)
  cnt = np.prod(x.shape)
@@ -337,7 +358,7 @@ def cherry_matmul(x, w, transpose_x=False, transpose_w=False):
  return cherry_dmaw(SLOT(2), (*x.shape[0:-2],M,N))

import unittest

-class TestRisk(unittest.TestCase):
+class TestCherry(unittest.TestCase):
  def test_matmul_even(self):
    x = np.random.uniform(size=(SZ*8, SZ*8)).astype(np.float32)
    w = np.random.uniform(size=(SZ*8, SZ*8)).astype(np.float32)


@@ -32,6 +32,37 @@ class Exp(Function):
    ret, = ctx.saved_tensors
    return cherry_binop(grad_output, ret, BinaryOps.MUL)
# ************* reduce ops *************

"""
class Sum(Function):
  def forward(ctx, input, axis=None):
    ctx.save_for_backward(input, axis)
    return cherry_reduceop(input, ReduceOps.SUM, axis)

  def backward(ctx, grad_output):
    input, axis = ctx.saved_tensors
    if isinstance(axis, int): axis = [axis]
    shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
    return grad_output.reshape(shape) + np.zeros_like(input)

class Max(Function):
  def forward(ctx, inp, axis=None):
    if isinstance(axis, int): axis = [axis]
    ret = np.amax(inp, axis=None if axis is None else tuple(axis), keepdims=True)
    ctx.save_for_backward(inp, axis, ret)
    if axis is not None:
      ret = ret.reshape([inp.shape[i] for i in range(len(inp.shape)) if i not in axis])
    return ret

  def backward(ctx, grad_output):
    input, axis, ret = ctx.saved_tensors
    shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
    ret2 = (input==ret.reshape(shape))
    div = ret2.sum(axis=None if axis is None else tuple(axis), keepdims=True)
    return ret2*grad_output.reshape(shape)/div
"""
# ************* binary ops *************
def unbroadcast(out, in_sh):

fpga/README (new file, 47 lines)

@@ -0,0 +1,47 @@
Cherry is designed with thneed in mind. Assuming on-board RAM, it'll run without the host.

Single-core RISC-V, superscalar, out of order. Targeting 1+ instructions/cycle.

Compute is straightforward, but there are two open questions about memory:
* How much striding do we need? How much does it cost us in power and transistors?
* Should the copies to SRAM be explicit, or should the SRAM act as a cache in front of the DDR? Caching is a simpler programming model.

Small Board (Arty A7 100T)
=====
* Support DMA over the Ethernet interface, 12.5 MB/s
* 65k elements in on-board RAM, 18-bit
* Optionally, use the 256 MB of DDR3L on board to hold everything. 2.66 GB/s
* 240 DSP slices, 101k LUTs
* 4x4x4 matmul = 64 mults, perhaps 8x8x8 matmul = 512 mults
* 6.4 GFLOPS @ 50 MHz
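The GFLOPS figure counts each multiply-accumulate as 2 FLOPs; a quick check for the 4x4x4 configuration (the same arithmetic gives the figures for the larger configurations below):

mults = 4 * 4 * 4               # 4x4x4 matmul engine -> 64 multipliers
clock = 50e6                    # 50 MHz
print(mults * 2 * clock / 1e9)  # -> 6.4 GFLOPS (multiply + accumulate each cycle)
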
Big Board (Alveo U250)
=====
* Support DMA over PCI-E, 16 GB/s
* 8M elements in on-board RAM, 18-bit
* Optionally, use the 64 GB of DDR4 on board to hold everything. 77 GB/s
* 12288 DSP slices, 1.7M LUTs
* 16x16x16 matmul = 4096 mults, perhaps 32x32x32 matmul = 32768 mults
* 4 TFLOPS @ 500 MHz
Cherry Two (12nm tapeout)
=====
* Support DMA over PCI-E, 16 GB/s
* 8M elements in on-board RAM, 19-bit, or 18-bit if that's all we need
* Hopefully we don't need any DDR; is host RAM fast enough?
* 32x32x32 matmul = 32768 mults
* 64 TFLOPS @ 1 GHz
* Target 75W, even if underclocked. One slot, no external power.
* This card should be on par with a 3090 and sell for $1000
Cherry Three (5nm tapeout)
=====
* Support DMA over PCI-E 4.0, 32 GB/s
* 16 cores
* 8M elements in the on-board RAM of each core (288 MB SRAM on chip)
* Shared ~16 GB GDDR6 between cores. Something like 512 GB/s
* 16x 32x32x32 matmul = 32768 mults per core
* 1 PFLOP @ 1 GHz (finally, a petaflop chip; quick check below)
* Target 300W
* This card should be on par with a DGX A100 and sell for $2000
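Quick check of the petaflop figure, counting 2 FLOPs per multiply-accumulate and including the 16-core factor:

cores, mults, clock = 16, 32 * 32 * 32, 1e9   # 16 cores, 32x32x32 matmul each, 1 GHz
print(cores * mults * 2 * clock / 1e15)       # -> ~1.05 PFLOPS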