From b1000d866e801f0226baf320f588dfe05301ee66 Mon Sep 17 00:00:00 2001
From: George Hotz
Date: Wed, 16 Jun 2021 11:21:06 -0700
Subject: [PATCH] readme, plus reduce ops

---
 extra/cherry.py     | 23 +++++++++++++++++++++-
 extra/ops_cherry.py | 31 ++++++++++++++++++++++++++++++
 fpga/README         | 47 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 fpga/README

diff --git a/extra/cherry.py b/extra/cherry.py
index 7bf6c9c195..b237579ee9 100755
--- a/extra/cherry.py
+++ b/extra/cherry.py
@@ -67,6 +67,10 @@ class BinaryOps(Enum):
   MULACC = 4
   POW = 5
 
+class ReduceOps(Enum):
+  SUM = 0
+  MAX = 1
+
 for t in Reg:
   regfile[t] = np.zeros((SZ, SZ), dtype=np.float32)
 
@@ -143,6 +147,14 @@ def riski_mulacc():
 def riski_pow():
   regfile[Reg.MATMUL_OUTPUT] = regfile[Reg.MATMUL_INPUT] ** regfile[Reg.MATMUL_WEIGHTS]
 
+@count
+def riski_reduce_sum():
+  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].sum(axis=0)
+
+@count
+def riski_reduce_max():
+  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].max(axis=0)
+
 # TODO: make accumulate a bit in the instruction available to all
 binops = {BinaryOps.ADD: riski_add,
   BinaryOps.SUB: riski_sub,
@@ -151,6 +163,9 @@ binops = {BinaryOps.ADD: riski_add,
   BinaryOps.MULACC: riski_mulacc,
   BinaryOps.POW: riski_pow}
 
+reduceops = {ReduceOps.SUM: riski_reduce_sum,
+  ReduceOps.MAX: riski_reduce_max}
+
 @count
 # TODO: add masks to matmul instruction?
 def riski_matmul():
@@ -210,6 +225,12 @@ def cherry_dmaw(address, shp):
 
 # *** CHERRY code to be compiled ***
 
+def cherry_reduceop(x, op, axis):
+  print(op, x.shape, axis)
+  cherry_dmar(SLOT(0), x)
+
+  return cherry_dmaw(SLOT(2), x.shape)
+
 def cherry_unop(x, op):
   cherry_dmar(SLOT(0), x)
   cnt = np.prod(x.shape)
@@ -337,7 +358,7 @@ def cherry_matmul(x, w, transpose_x=False, transpose_w=False):
   return cherry_dmaw(SLOT(2), (*x.shape[0:-2],M,N))
 
 import unittest
-class TestRisk(unittest.TestCase):
+class TestCherry(unittest.TestCase):
   def test_matmul_even(self):
     x = np.random.uniform(size=(SZ*8, SZ*8)).astype(np.float32)
     w = np.random.uniform(size=(SZ*8, SZ*8)).astype(np.float32)
diff --git a/extra/ops_cherry.py b/extra/ops_cherry.py
index c540a96a40..f0cf74e4aa 100644
--- a/extra/ops_cherry.py
+++ b/extra/ops_cherry.py
@@ -32,6 +32,37 @@ class Exp(Function):
     ret, = ctx.saved_tensors
     return cherry_binop(grad_output, ret, BinaryOps.MUL)
 
+# ************* reduce ops *************
+
+"""
+class Sum(Function):
+  def forward(ctx, input, axis=None):
+    ctx.save_for_backward(input, axis)
+    return cherry_reduceop(input, ReduceOps.SUM, axis)
+
+  def backward(ctx, grad_output):
+    input, axis = ctx.saved_tensors
+    if isinstance(axis, int): axis = [axis]
+    shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
+    return grad_output.reshape(shape) + np.zeros_like(input)
+
+class Max(Function):
+  def forward(ctx, inp, axis=None):
+    if isinstance(axis, int): axis = [axis]
+    ret = np.amax(inp, axis=None if axis is None else tuple(axis), keepdims=True)
+    ctx.save_for_backward(inp, axis, ret)
+    if axis is not None:
+      ret = ret.reshape([inp.shape[i] for i in range(len(inp.shape)) if i not in axis])
+    return ret
+
+  def backward(ctx, grad_output):
+    input, axis, ret = ctx.saved_tensors
+    shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
+    ret2 = (input==ret.reshape(shape))
+    div = ret2.sum(axis=None if axis is None else tuple(axis), keepdims=True)
+    return ret2*grad_output.reshape(shape)/div
+"""
+
 # ************* binary ops *************
 
 def unbroadcast(out, in_sh):
diff --git a/fpga/README b/fpga/README
new file mode 100644
index 0000000000..333ccd115e
--- /dev/null
+++ b/fpga/README
@@ -0,0 +1,47 @@
+Cherry is designed with thneed in mind. Assuming onboard RAM, it'll run without the host.
+
+Single core RISC-V, superscalar, out of order. Targeting 1+ instructions/cycle.
+
+Compute is straightforward, but there are two questions about memory:
+* How much striding do we need? How much does it cost us in power and transistors?
+* Should the copies to SRAM be explicit, or should the DDR be cached? Caching is a simpler programming model.
+
+Small Board (Arty A7 100T)
+=====
+* Support DMA over the Ethernet interface, 12.5 MB/s
+* 65k elements in on-board RAM, 18-bit
+* Optionally, use the 256MB of DDR3L onboard to hold everything. 2.66 GB/s
+* 240 DSP slices, 101k LUTs
+* 4x4x4 matmul = 64 mults, perhaps 8x8x8 matmul = 512 mults
+* 6.4 GFLOPS @ 50 MHz
+
+Big Board (Alveo U250)
+=====
+* Support DMA over PCI-E. 16 GB/s
+* 8M elements in on-board RAM, 18-bit
+* Optionally, use the 64GB of DDR4 onboard to hold everything. 77 GB/s
+* 12288 DSP slices, 1.7M LUTs
+* 16x16x16 matmul = 4096 mults, perhaps 32x32x32 matmul = 32768 mults
+* 4 TFLOPS @ 500 MHz
+
+Cherry Two (12nm tapeout)
+=====
+* Support DMA over PCI-E. 16 GB/s
+* 8M elements in on-board RAM, 19-bit, or 18-bit if that's all we need
+* Hopefully we don't need any DDR; is host RAM fast enough?
+* 32x32x32 matmul = 32768 mults
+* 64 TFLOPS @ 1 GHz
+* Target 75W, even if underclocked. One slot, no external power.
+* This card should be on par with a 3090 and sell for $1000
+
+Cherry Three (5nm tapeout)
+=====
+* Support DMA over PCI-E 4.0. 32 GB/s
+* 16 cores
+* 8M elements in the on-board RAM of each core (288 MB SRAM on chip)
+* Shared ~16GB GDDR6 between cores. Something like 512 GB/s
+* 16x 32x32x32 matmul = 32768 mults per core
+* 1 PFLOP @ 1 GHz (finally, a petaflop chip)
+* Target 300W
+* This card should be on par with a DGX A100 and sell for $2000
+
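
The cherry_reduceop added above is still a stub: it DMAs the input into SLOT(0) and writes the unreduced shape back out of SLOT(2), and the Sum/Max Functions that would call it are committed commented out. As a rough illustration only (not the implementation in this patch), a full-tensor reduction could be driven with the new riski_reduce_* instructions along the lines below. The sketch assumes the names from extra/cherry.py (SZ, Reg, ReduceOps, regfile, reduceops) are in scope, writes tiles into the register file directly instead of staging them through the SRAM slots, and ignores per-axis reduction.

import numpy as np
# assumed import; the exact module path for extra/cherry.py depends on how it is run
# from cherry import SZ, Reg, ReduceOps, regfile, reduceops

def sketch_cherry_reduce_all(x, op):
  # Flatten and pad to whole SZ x SZ tiles; the pad value is the identity of the reduction.
  flat = x.reshape(-1).astype(np.float32)
  pad = (-flat.size) % (SZ * SZ)
  fill = 0.0 if op == ReduceOps.SUM else -np.inf
  flat = np.concatenate([flat, np.full(pad, fill, dtype=np.float32)])

  partials = []
  for tile in flat.reshape(-1, SZ, SZ):
    regfile[Reg.MATMUL_INPUT][:] = tile  # stand-in for loading a tile into the register file
    reduceops[op]()                      # riski_reduce_sum / riski_reduce_max, reduces over axis 0
    row = regfile[Reg.MATMUL_OUTPUT][0]  # SZ per-column partial results
    partials.append(row.sum() if op == ReduceOps.SUM else row.max())

  # Combine the per-tile partials on the host side for clarity.
  return np.float32(sum(partials)) if op == ReduceOps.SUM else np.float32(max(partials))

With these assumptions, sketch_cherry_reduce_all(x, ReduceOps.SUM) should match x.sum() up to float32 rounding, and the MAX case should match x.max() exactly.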
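
The throughput figures quoted in fpga/README follow from (number of multipliers) x 2 FLOPs per multiply-accumulate x clock. A quick reader's sanity check of the quoted numbers (the 2-FLOPs-per-MAC convention is an assumption, not stated in the patch):

def peak_flops(mults, clock_hz):
  # one multiply plus one accumulate per multiplier per cycle
  return mults * 2 * clock_hz

print(peak_flops(64,         50e6))   # Arty A7 100T: 6.4e9    ->  6.4 GFLOPS @ 50 MHz
print(peak_flops(4096,       500e6))  # Alveo U250:   4.096e12 ->  ~4 TFLOPS @ 500 MHz
print(peak_flops(32768,      1e9))    # Cherry Two:   6.55e13  ->  quoted as 64 TFLOPS @ 1 GHz
print(peak_flops(16 * 32768, 1e9))    # Cherry Three: 1.05e15  ->  ~1 PFLOP @ 1 GHz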