From b1000d866e801f0226baf320f588dfe05301ee66 Mon Sep 17 00:00:00 2001
From: George Hotz
Date: Wed, 16 Jun 2021 11:21:06 -0700
Subject: [PATCH] readme, plus reduce ops

---
 extra/cherry.py     | 23 +++++++++++++++++++++-
 extra/ops_cherry.py | 31 ++++++++++++++++++++++++++++++
 fpga/README         | 47 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 fpga/README

diff --git a/extra/cherry.py b/extra/cherry.py
index 7bf6c9c195..b237579ee9 100755
--- a/extra/cherry.py
+++ b/extra/cherry.py
@@ -67,6 +67,10 @@ class BinaryOps(Enum):
   MULACC = 4
   POW = 5
 
+class ReduceOps(Enum):
+  SUM = 0
+  MAX = 1
+
 for t in Reg:
   regfile[t] = np.zeros((SZ, SZ), dtype=np.float32)
 
@@ -143,6 +147,14 @@ def riski_mulacc():
 def riski_pow():
   regfile[Reg.MATMUL_OUTPUT] = regfile[Reg.MATMUL_INPUT] ** regfile[Reg.MATMUL_WEIGHTS]
 
+@count
+def riski_reduce_sum():
+  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].sum(axis=0)
+
+@count
+def riski_reduce_max():
+  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].max(axis=0)
+
 # TODO: make accumulate a bit in the instruction available to all
 binops = {BinaryOps.ADD: riski_add,
   BinaryOps.SUB: riski_sub,
@@ -151,6 +163,9 @@ binops = {BinaryOps.ADD: riski_add,
   BinaryOps.MULACC: riski_mulacc,
   BinaryOps.POW: riski_pow}
 
+reduceops = {ReduceOps.SUM: riski_reduce_sum,
+  ReduceOps.MAX: riski_reduce_max}
+
 @count
 # TODO: add masks to matmul instruction?
 def riski_matmul():
@@ -210,6 +225,12 @@ def cherry_dmaw(address, shp):
 
 # *** CHERRY code to be compiled ***
 
+def cherry_reduceop(x, op, axis):
+  print(op, x.shape, axis)
+  cherry_dmar(SLOT(0), x)
+
+  return cherry_dmaw(SLOT(2), x.shape)
+
 def cherry_unop(x, op):
   cherry_dmar(SLOT(0), x)
   cnt = np.prod(x.shape)
@@ -337,7 +358,7 @@ def cherry_matmul(x, w, transpose_x=False, transpose_w=False):
   return cherry_dmaw(SLOT(2), (*x.shape[0:-2],M,N))
 
 import unittest
-class TestRisk(unittest.TestCase):
+class TestCherry(unittest.TestCase):
   def test_matmul_even(self):
     x = np.random.uniform(size=(SZ*8, SZ*8)).astype(np.float32)
     w = np.random.uniform(size=(SZ*8, SZ*8)).astype(np.float32)
diff --git a/extra/ops_cherry.py b/extra/ops_cherry.py
index c540a96a40..f0cf74e4aa 100644
--- a/extra/ops_cherry.py
+++ b/extra/ops_cherry.py
@@ -32,6 +32,37 @@ class Exp(Function):
     ret, = ctx.saved_tensors
     return cherry_binop(grad_output, ret, BinaryOps.MUL)
 
+# ************* reduce ops *************
+
+"""
+class Sum(Function):
+  def forward(ctx, input, axis=None):
+    ctx.save_for_backward(input, axis)
+    return cherry_reduceop(input, ReduceOps.SUM, axis)
+
+  def backward(ctx, grad_output):
+    input, axis = ctx.saved_tensors
+    if isinstance(axis, int): axis = [axis]
+    shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
+    return grad_output.reshape(shape) + np.zeros_like(input)
+
+class Max(Function):
+  def forward(ctx, inp, axis=None):
+    if isinstance(axis, int): axis = [axis]
+    ret = np.amax(inp, axis=None if axis is None else tuple(axis), keepdims=True)
+    ctx.save_for_backward(inp, axis, ret)
+    if axis is not None:
+      ret = ret.reshape([inp.shape[i] for i in range(len(inp.shape)) if i not in axis])
+    return ret
+
+  def backward(ctx, grad_output):
+    input, axis, ret = ctx.saved_tensors
+    shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
+    ret2 = (input==ret.reshape(shape))
+    div = ret2.sum(axis=None if axis is None else tuple(axis), keepdims=True)
+    return ret2*grad_output.reshape(shape)/div
+"""
+
 # ************* binary ops *************
 
 def unbroadcast(out, in_sh):
diff --git a/fpga/README b/fpga/README
new file mode 100644
index 0000000000..333ccd115e
--- /dev/null
+++ b/fpga/README
@@ -0,0 +1,47 @@
+Cherry is designed with thneed in mind. Assuming onboard RAM, it'll run without the host.
+
+Single core RISC-V, superscalar, out of order. Targeting 1+ instructions/cycle.
+
+Compute is straightforward, but there are two questions about memory:
+* How much striding do we need? How much does it cost us in power and transistors?
+* Should the copies to SRAM be explicit, or should the DDR be cached? Caching is a simpler programming model.
+
+Small Board (Arty A7 100T)
+=====
+* Support DMA over the Ethernet interface, 12.5 MB/s
+* 65k elements in on-board RAM, 18-bit
+* Optionally, use the 256MB of DDR3L onboard to hold everything. 2.66 GB/s
+* 240 DSP slices, 101k LUTs
+* 4x4x4 matmul = 64 mults, perhaps 8x8x8 matmul = 512 mults
+* 6.4 GFLOPS @ 50 MHz
+
+Big Board (Alveo U250)
+=====
+* Support DMA over PCI-E. 16 GB/s
+* 8M elements in on-board RAM, 18-bit
+* Optionally, use the 64GB of DDR4 onboard to hold everything. 77 GB/s
+* 12288 DSP slices, 1.7M LUTs
+* 16x16x16 matmul = 4096 mults, perhaps 32x32x32 matmul = 32768 mults
+* 4 TFLOPS @ 500 MHz
+
+Cherry Two (12nm tapeout)
+=====
+* Support DMA over PCI-E. 16 GB/s
+* 8M elements in on-board RAM, 19-bit, or 18-bit if that's all we need
+* Hopefully we don't need any DDR; is host RAM fast enough?
+* 32x32x32 matmul = 32768 mults
+* 64 TFLOPS @ 1 GHz
+* Target 75W, even if underclocked. One slot, no external power.
+* This card should be on par with a 3090 and sell for $1000
+
+Cherry Three (5nm tapeout)
+=====
+* Support DMA over PCI-E 4.0. 32 GB/s
+* 16 cores
+* 8M elements in the on-board RAM of each core (288 MB SRAM on chip)
+* Shared ~16GB GDDR6 between cores. Something like 512 GB/s
+* 16x 32x32x32 matmul = 32768 mults per core
+* 1 PFLOP @ 1 GHz (finally, a petaflop chip)
+* Target 300W
+* This card should be on par with a DGX A100 and sell for $2000
+
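
The cherry_reduceop added above is still a stub: it DMAs the input into SLOT(0) and writes the unreduced shape back out of SLOT(2), and the Sum/Max Functions that would call it are committed commented out. As a rough illustration only (not the implementation in this patch), a full-tensor reduction could be driven with the new riski_reduce_* instructions along the lines below. The sketch assumes the names from extra/cherry.py (SZ, Reg, ReduceOps, regfile, reduceops) are in scope, writes tiles into the register file directly instead of staging them through the SRAM slots, and ignores per-axis reduction.

import numpy as np
# assumed import; the exact module path for extra/cherry.py depends on how it is run
# from cherry import SZ, Reg, ReduceOps, regfile, reduceops

def sketch_cherry_reduce_all(x, op):
  # Flatten and pad to whole SZ x SZ tiles; the pad value is the identity of the reduction.
  flat = x.reshape(-1).astype(np.float32)
  pad = (-flat.size) % (SZ * SZ)
  fill = 0.0 if op == ReduceOps.SUM else -np.inf
  flat = np.concatenate([flat, np.full(pad, fill, dtype=np.float32)])

  partials = []
  for tile in flat.reshape(-1, SZ, SZ):
    regfile[Reg.MATMUL_INPUT][:] = tile  # stand-in for loading a tile into the register file
    reduceops[op]()                      # riski_reduce_sum / riski_reduce_max, reduces over axis 0
    row = regfile[Reg.MATMUL_OUTPUT][0]  # SZ per-column partial results
    partials.append(row.sum() if op == ReduceOps.SUM else row.max())

  # Combine the per-tile partials on the host side for clarity.
  return np.float32(sum(partials)) if op == ReduceOps.SUM else np.float32(max(partials))

With these assumptions, sketch_cherry_reduce_all(x, ReduceOps.SUM) should match x.sum() up to float32 rounding, and the MAX case should match x.max() exactly.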
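
The throughput figures quoted in fpga/README follow from (number of multipliers) x 2 FLOPs per multiply-accumulate x clock. A quick reader's sanity check of the quoted numbers (the 2-FLOPs-per-MAC convention is an assumption, not stated in the patch):

def peak_flops(mults, clock_hz):
  # one multiply plus one accumulate per multiplier per cycle
  return mults * 2 * clock_hz

print(peak_flops(64,         50e6))   # Arty A7 100T: 6.4e9    ->  6.4 GFLOPS @ 50 MHz
print(peak_flops(4096,       500e6))  # Alveo U250:   4.096e12 ->  ~4 TFLOPS @ 500 MHz
print(peak_flops(32768,      1e9))    # Cherry Two:   6.55e13  ->  quoted as 64 TFLOPS @ 1 GHz
print(peak_flops(16 * 32768, 1e9))    # Cherry Three: 1.05e15  ->  ~1 PFLOP @ 1 GHz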