mirror of https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 15:08:02 -05:00
readme, plus reduce ops
@@ -67,6 +67,10 @@ class BinaryOps(Enum):
  MULACC = 4
  POW = 5

class ReduceOps(Enum):
  SUM = 0
  MAX = 1

for t in Reg:
  regfile[t] = np.zeros((SZ, SZ), dtype=np.float32)
@@ -143,6 +147,14 @@ def riski_mulacc():
def riski_pow():
  regfile[Reg.MATMUL_OUTPUT] = regfile[Reg.MATMUL_INPUT] ** regfile[Reg.MATMUL_WEIGHTS]

@count
def riski_reduce_sum():
  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].sum(axis=0)

@count
def riski_reduce_max():
  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].max(axis=0)

# TODO: make accumulate a bit in the instruction available to all
binops = {BinaryOps.ADD: riski_add,
          BinaryOps.SUB: riski_sub,
@@ -151,6 +163,9 @@ binops = {BinaryOps.ADD: riski_add,
          BinaryOps.MULACC: riski_mulacc,
          BinaryOps.POW: riski_pow}

reduceops = {ReduceOps.SUM: riski_reduce_sum,
             ReduceOps.MAX: riski_reduce_max}

@count
# TODO: add masks to matmul instruction?
def riski_matmul():
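
As a standalone illustration of how these reduce instructions are meant to be driven, here is a small sketch that mimics the regfile/dispatch pattern above in plain NumPy. The SZ value, the Reg members, and the omission of the @count profiling decorator are simplifications for the example, not the real definitions from this file.

import numpy as np
from enum import Enum

SZ = 16                      # stand-in tile size; the real SZ is defined elsewhere in this file

class ReduceOps(Enum):
  SUM = 0
  MAX = 1

class Reg(Enum):             # only the registers the reduce ops touch
  MATMUL_INPUT = 0
  MATMUL_WEIGHTS = 1
  MATMUL_OUTPUT = 2

regfile = {t: np.zeros((SZ, SZ), dtype=np.float32) for t in Reg}

def riski_reduce_sum():
  # reduce the SZ x SZ input tile down its columns; the result lands in row 0 of the output register
  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].sum(axis=0)

def riski_reduce_max():
  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].max(axis=0)

reduceops = {ReduceOps.SUM: riski_reduce_sum, ReduceOps.MAX: riski_reduce_max}

# issue one reduce "instruction" through the dispatch table and check it against NumPy
x = np.random.uniform(size=(SZ, SZ)).astype(np.float32)
regfile[Reg.MATMUL_INPUT][:] = x
reduceops[ReduceOps.SUM]()
np.testing.assert_allclose(regfile[Reg.MATMUL_OUTPUT][0], x.sum(axis=0), rtol=1e-5)
print("reduce dispatch OK")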
@@ -210,6 +225,12 @@ def cherry_dmaw(address, shp):

# *** CHERRY code to be compiled ***

def cherry_reduceop(x, op, axis):
  print(op, x.shape, axis)
  cherry_dmar(SLOT(0), x)

  return cherry_dmaw(SLOT(2), x.shape)

def cherry_unop(x, op):
  cherry_dmar(SLOT(0), x)
  cnt = np.prod(x.shape)
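
Note that cherry_reduceop is still a stub at this point: it DMAs the input into SLOT(0) and DMAs a result out of SLOT(2) with nothing in between, and it still returns the unreduced x.shape. For reference, the semantics it presumably has to end up with, judging from the commented-out Sum/Max Functions later in this commit, look like the plain-NumPy model below. reduceop_ref is a hypothetical helper name for illustration only; whether reduced axes are kept or dropped is not pinned down by this diff, so the axis handling here just follows the Max Function shown further down.

import numpy as np
from enum import Enum

class ReduceOps(Enum):   # mirrors the enum added above
  SUM = 0
  MAX = 1

def reduceop_ref(x, op, axis=None):
  # reduce over `axis` (an int, a list of ints, or None for all axes), dropping the reduced axes
  if isinstance(axis, int): axis = [axis]
  ax = None if axis is None else tuple(axis)
  if op == ReduceOps.SUM: return x.sum(axis=ax)
  if op == ReduceOps.MAX: return x.max(axis=ax)
  raise NotImplementedError(op)

x = np.arange(12, dtype=np.float32).reshape(3, 4)
print(reduceop_ref(x, ReduceOps.SUM, axis=1))   # [ 6. 22. 38.]
print(reduceop_ref(x, ReduceOps.MAX, axis=0))   # [ 8.  9. 10. 11.]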
@@ -337,7 +358,7 @@ def cherry_matmul(x, w, transpose_x=False, transpose_w=False):
  return cherry_dmaw(SLOT(2), (*x.shape[0:-2],M,N))

import unittest
-class TestRisk(unittest.TestCase):
+class TestCherry(unittest.TestCase):
  def test_matmul_even(self):
    x = np.random.uniform(size=(SZ*8, SZ*8)).astype(np.float32)
    w = np.random.uniform(size=(SZ*8, SZ*8)).astype(np.float32)
@@ -32,6 +32,37 @@ class Exp(Function):
    ret, = ctx.saved_tensors
    return cherry_binop(grad_output, ret, BinaryOps.MUL)

# ************* reduce ops *************

"""
class Sum(Function):
  def forward(ctx, input, axis=None):
    ctx.save_for_backward(input, axis)
    return cherry_reduceop(input, ReduceOps.SUM, axis)

  def backward(ctx, grad_output):
    input, axis = ctx.saved_tensors
    if isinstance(axis, int): axis = [axis]
    shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
    return grad_output.reshape(shape) + np.zeros_like(input)

class Max(Function):
  def forward(ctx, inp, axis=None):
    if isinstance(axis, int): axis = [axis]
    ret = np.amax(inp, axis=None if axis is None else tuple(axis), keepdims=True)
    ctx.save_for_backward(inp, axis, ret)
    if axis is not None:
      ret = ret.reshape([inp.shape[i] for i in range(len(inp.shape)) if i not in axis])
    return ret

  def backward(ctx, grad_output):
    input, axis, ret = ctx.saved_tensors
    shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
    ret2 = (input==ret.reshape(shape))
    div = ret2.sum(axis=None if axis is None else tuple(axis), keepdims=True)
    return ret2*grad_output.reshape(shape)/div
"""
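
The Sum/Max mlops are committed commented out, but the backward formulas in that block are worth a quick sanity check: Sum's backward broadcasts the incoming gradient back over the reduced axis, and Max's routes the gradient only to the max entries, split evenly across ties. A standalone NumPy check of both, reusing the same shape/mask logic as the block above:

import numpy as np

inp = np.array([[1., 3., 3.],
                [2., 0., 2.]], dtype=np.float32)
axis = [1]
grad_out = np.array([10., 20.], dtype=np.float32)            # one incoming gradient per reduced row
shape = [1 if i in axis else inp.shape[i] for i in range(inp.ndim)]

# Sum backward: broadcast grad_output back to the input shape
dsum = grad_out.reshape(shape) + np.zeros_like(inp)
print(dsum)                                                   # [[10. 10. 10.] [20. 20. 20.]]

# Max backward: mask of (tied) maxima, gradient divided by the tie count
ret = np.amax(inp, axis=tuple(axis), keepdims=True)           # [[3.] [2.]]
ret2 = (inp == ret)                                           # argmax mask, ties included
div = ret2.sum(axis=tuple(axis), keepdims=True)               # 2 tied maxima in each row
dmax = ret2 * grad_out.reshape(shape) / div
print(dmax)                                                   # [[ 0.  5.  5.] [10.  0. 10.]]
assert np.allclose(dmax.sum(axis=tuple(axis)), grad_out)      # gradient mass is conserved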

# ************* binary ops *************

def unbroadcast(out, in_sh):
fpga/README (new file, 47 lines)
@@ -0,0 +1,47 @@
Cherry is designed with thneed in mind. Assuming onboard RAM, it'll run without the host.

Single core RISC-V, superscalar, out of order. Targeting 1+ instructions/cycle.

Compute is straightforward, but there are two open questions about memory:
* How much striding do we need, and how much does it cost us in power and transistors?
* Should the copies to SRAM be explicit, or should the DDR be cached? Caching is a simpler programming model.

Small Board (Arty A7 100T)
=====
* Support DMA over the Ethernet interface, 12.5 MB/s
* 65k elements in on-board RAM, 18-bit
* Optionally, use the 256MB of DDR3L onboard to hold everything. 2.66 GB/s
* 240 DSP slices, 101k LUTs
* 4x4x4 matmul = 64 mults, perhaps 8x8x8 matmul = 512 mults
* 6.4 GFLOPS @ 50 MHz

Big Board (Alveo U250)
=====
* Support DMA over PCI-E. 16 GB/s
* 8M elements in on-board RAM, 18-bit
* Optionally, use the 64GB of DDR4 onboard to hold everything. 77 GB/s
* 12288 DSP slices, 1.7M LUTs
* 16x16x16 matmul = 4096 mults, perhaps 32x32x32 matmul = 32768 mults
* 4 TFLOPS @ 500 MHz

Cherry Two (12nm tapeout)
=====
* Support DMA over PCI-E. 16 GB/s
* 8M elements in on-board RAM, 19-bit, or 18-bit if that's all we need
* Hopefully we don't need any DDR; is host RAM fast enough?
* 32x32x32 matmul = 32768 mults
* 64 TFLOPS @ 1 GHz
* Target 75W, even if underclocked. One slot, no external power.
* This card should be on par with a 3090 and sell for $1000

Cherry Three (5nm tapeout)
=====
* Support DMA over PCI-E 4.0. 32 GB/s
* 16 cores
* 8M elements in on-board RAM of each core (288 MB SRAM on chip)
* Shared ~16GB GDDR6 between cores. Something like 512 GB/s
* 16x 32x32x32 matmul = 32768 mults each
* 1 PFLOP @ 1 GHz (finally, a petaflop chip)
* Target 300W
* This card should be on par with a DGX A100 and sell for $2000
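
The throughput and SRAM numbers above all appear to follow two simple rules: peak FLOPS = mults x 2 (a multiply and an accumulate per DSP per cycle) x clock, and SRAM = elements x element width. A quick check of the quoted figures, assuming those conventions and assuming the 18-bit element width carries over to Cherry Three:

# peak FLOPS = mults * 2 ops (multiply + accumulate) * clock
def peak_flops(mults, clock_hz): return mults * 2 * clock_hz

print(peak_flops(4*4*4,       50e6))   # Arty A7 100T:  6.4e9    ->  6.4 GFLOPS
print(peak_flops(16*16*16,   500e6))   # Alveo U250:    4.096e12 ->  ~4 TFLOPS
print(peak_flops(32*32*32,     1e9))   # Cherry Two:    6.55e13  -> ~64 TFLOPS
print(peak_flops(16*32*32*32,  1e9))   # Cherry Three:  1.05e15  ->  ~1 PFLOP

# Cherry Three SRAM: 16 cores * 8M elements/core * 18 bits = 288 MB on chip
print(16 * 8 * 2**20 * 18 / 8 / 2**20)  # -> 288.0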