mirror of https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 15:08:02 -05:00
readme, plus reduce ops
@@ -67,6 +67,10 @@ class BinaryOps(Enum):
  MULACC = 4
  POW = 5

class ReduceOps(Enum):
  SUM = 0
  MAX = 1

for t in Reg:
  regfile[t] = np.zeros((SZ, SZ), dtype=np.float32)
@@ -143,6 +147,14 @@ def riski_mulacc():
def riski_pow():
  regfile[Reg.MATMUL_OUTPUT] = regfile[Reg.MATMUL_INPUT] ** regfile[Reg.MATMUL_WEIGHTS]

@count
def riski_reduce_sum():
  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].sum(axis=0)

@count
def riski_reduce_max():
  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].max(axis=0)

# TODO: make accumulate a bit in the instruction available to all
binops = {BinaryOps.ADD: riski_add,
          BinaryOps.SUB: riski_sub,
@@ -151,6 +163,9 @@ binops = {BinaryOps.ADD: riski_add,
          BinaryOps.MULACC: riski_mulacc,
          BinaryOps.POW: riski_pow}

reduceops = {ReduceOps.SUM: riski_reduce_sum,
             ReduceOps.MAX: riski_reduce_max}

@count
# TODO: add masks to matmul instruction?
def riski_matmul():
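
As a standalone illustration of how these reduce instructions are meant to be driven, here is a small sketch that mimics the regfile/dispatch pattern above in plain NumPy. The SZ value, the Reg members, and the omission of the @count profiling decorator are simplifications for the example, not the real definitions from this file.

import numpy as np
from enum import Enum

SZ = 16                      # stand-in tile size; the real SZ is defined elsewhere in this file

class ReduceOps(Enum):
  SUM = 0
  MAX = 1

class Reg(Enum):             # only the registers the reduce ops touch
  MATMUL_INPUT = 0
  MATMUL_WEIGHTS = 1
  MATMUL_OUTPUT = 2

regfile = {t: np.zeros((SZ, SZ), dtype=np.float32) for t in Reg}

def riski_reduce_sum():
  # reduce the SZ x SZ input tile down its columns; the result lands in row 0 of the output register
  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].sum(axis=0)

def riski_reduce_max():
  regfile[Reg.MATMUL_OUTPUT][0] = regfile[Reg.MATMUL_INPUT].max(axis=0)

reduceops = {ReduceOps.SUM: riski_reduce_sum, ReduceOps.MAX: riski_reduce_max}

# issue one reduce "instruction" through the dispatch table and check it against NumPy
x = np.random.uniform(size=(SZ, SZ)).astype(np.float32)
regfile[Reg.MATMUL_INPUT][:] = x
reduceops[ReduceOps.SUM]()
np.testing.assert_allclose(regfile[Reg.MATMUL_OUTPUT][0], x.sum(axis=0), rtol=1e-5)
print("reduce dispatch OK")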
@@ -210,6 +225,12 @@ def cherry_dmaw(address, shp):

# *** CHERRY code to be compiled ***

def cherry_reduceop(x, op, axis):
  print(op, x.shape, axis)
  cherry_dmar(SLOT(0), x)

  return cherry_dmaw(SLOT(2), x.shape)

def cherry_unop(x, op):
  cherry_dmar(SLOT(0), x)
  cnt = np.prod(x.shape)
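
Note that cherry_reduceop is still a stub at this point: it DMAs the input into SLOT(0) and DMAs a result out of SLOT(2) with nothing in between, and it still returns the unreduced x.shape. For reference, the semantics it presumably has to end up with, judging from the commented-out Sum/Max Functions later in this commit, look like the plain-NumPy model below. reduceop_ref is a hypothetical helper name for illustration only; whether reduced axes are kept or dropped is not pinned down by this diff, so the axis handling here just follows the Max Function shown further down.

import numpy as np
from enum import Enum

class ReduceOps(Enum):   # mirrors the enum added above
  SUM = 0
  MAX = 1

def reduceop_ref(x, op, axis=None):
  # reduce over `axis` (an int, a list of ints, or None for all axes), dropping the reduced axes
  if isinstance(axis, int): axis = [axis]
  ax = None if axis is None else tuple(axis)
  if op == ReduceOps.SUM: return x.sum(axis=ax)
  if op == ReduceOps.MAX: return x.max(axis=ax)
  raise NotImplementedError(op)

x = np.arange(12, dtype=np.float32).reshape(3, 4)
print(reduceop_ref(x, ReduceOps.SUM, axis=1))   # [ 6. 22. 38.]
print(reduceop_ref(x, ReduceOps.MAX, axis=0))   # [ 8.  9. 10. 11.]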
@@ -337,7 +358,7 @@ def cherry_matmul(x, w, transpose_x=False, transpose_w=False):
  return cherry_dmaw(SLOT(2), (*x.shape[0:-2],M,N))

import unittest
-class TestRisk(unittest.TestCase):
+class TestCherry(unittest.TestCase):
  def test_matmul_even(self):
    x = np.random.uniform(size=(SZ*8, SZ*8)).astype(np.float32)
    w = np.random.uniform(size=(SZ*8, SZ*8)).astype(np.float32)
@@ -32,6 +32,37 @@ class Exp(Function):
    ret, = ctx.saved_tensors
    return cherry_binop(grad_output, ret, BinaryOps.MUL)

# ************* reduce ops *************

"""
class Sum(Function):
  def forward(ctx, input, axis=None):
    ctx.save_for_backward(input, axis)
    return cherry_reduceop(input, ReduceOps.SUM, axis)

  def backward(ctx, grad_output):
    input, axis = ctx.saved_tensors
    if isinstance(axis, int): axis = [axis]
    shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
    return grad_output.reshape(shape) + np.zeros_like(input)

class Max(Function):
  def forward(ctx, inp, axis=None):
    if isinstance(axis, int): axis = [axis]
    ret = np.amax(inp, axis=None if axis is None else tuple(axis), keepdims=True)
    ctx.save_for_backward(inp, axis, ret)
    if axis is not None:
      ret = ret.reshape([inp.shape[i] for i in range(len(inp.shape)) if i not in axis])
    return ret

  def backward(ctx, grad_output):
    input, axis, ret = ctx.saved_tensors
    shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
    ret2 = (input==ret.reshape(shape))
    div = ret2.sum(axis=None if axis is None else tuple(axis), keepdims=True)
    return ret2*grad_output.reshape(shape)/div
"""
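
The Sum/Max mlops are committed commented out, but the backward formulas in that block are worth a quick sanity check: Sum's backward broadcasts the incoming gradient back over the reduced axis, and Max's routes the gradient only to the max entries, split evenly across ties. A standalone NumPy check of both, reusing the same shape/mask logic as the block above:

import numpy as np

inp = np.array([[1., 3., 3.],
                [2., 0., 2.]], dtype=np.float32)
axis = [1]
grad_out = np.array([10., 20.], dtype=np.float32)            # one incoming gradient per reduced row
shape = [1 if i in axis else inp.shape[i] for i in range(inp.ndim)]

# Sum backward: broadcast grad_output back to the input shape
dsum = grad_out.reshape(shape) + np.zeros_like(inp)
print(dsum)                                                   # [[10. 10. 10.] [20. 20. 20.]]

# Max backward: mask of (tied) maxima, gradient divided by the tie count
ret = np.amax(inp, axis=tuple(axis), keepdims=True)           # [[3.] [2.]]
ret2 = (inp == ret)                                           # argmax mask, ties included
div = ret2.sum(axis=tuple(axis), keepdims=True)               # 2 tied maxima in each row
dmax = ret2 * grad_out.reshape(shape) / div
print(dmax)                                                   # [[ 0.  5.  5.] [10.  0. 10.]]
assert np.allclose(dmax.sum(axis=tuple(axis)), grad_out)      # gradient mass is conserved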

# ************* binary ops *************

def unbroadcast(out, in_sh):
fpga/README (new file, 47 lines)
@@ -0,0 +1,47 @@
Cherry is designed with thneed in mind. Assuming onboard RAM, it'll run without the host.

Single core RISC-V, superscalar, out of order. Targeting 1+ instructions/cycle.

Compute is straightforward, but there are two open questions about memory:
* How much striding do we need, and how much does it cost us in power and transistors?
* Should the copies to SRAM be explicit, or should the DDR be cached? Caching is a simpler programming model.

Small Board (Arty A7 100T)
=====
* Support DMA over the Ethernet interface, 12.5 MB/s
* 65k elements in on-board RAM, 18-bit
* Optionally, use the 256MB of DDR3L onboard to hold everything. 2.66 GB/s
* 240 DSP slices, 101k LUTs
* 4x4x4 matmul = 64 mults, perhaps 8x8x8 matmul = 512 mults
* 6.4 GFLOPS @ 50 MHz

Big Board (Alveo U250)
=====
* Support DMA over PCI-E. 16 GB/s
* 8M elements in on-board RAM, 18-bit
* Optionally, use the 64GB of DDR4 onboard to hold everything. 77 GB/s
* 12288 DSP slices, 1.7M LUTs
* 16x16x16 matmul = 4096 mults, perhaps 32x32x32 matmul = 32768 mults
* 4 TFLOPS @ 500 MHz

Cherry Two (12nm tapeout)
=====
* Support DMA over PCI-E. 16 GB/s
* 8M elements in on-board RAM, 19-bit, or 18-bit if that's all we need
* Hopefully we don't need any DDR; is host RAM fast enough?
* 32x32x32 matmul = 32768 mults
* 64 TFLOPS @ 1 GHz
* Target 75W, even if underclocked. One slot, no external power.
* This card should be on par with a 3090 and sell for $1000

Cherry Three (5nm tapeout)
=====
* Support DMA over PCI-E 4.0. 32 GB/s
* 16 cores
* 8M elements in on-board RAM of each core (288 MB SRAM on chip)
* Shared ~16GB GDDR6 between cores. Something like 512 GB/s
* 16x 32x32x32 matmul = 32768 mults each
* 1 PFLOP @ 1 GHz (finally, a petaflop chip)
* Target 300W
* This card should be on par with a DGX A100 and sell for $2000
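
The throughput and SRAM numbers above all appear to follow two simple rules: peak FLOPS = mults x 2 (a multiply and an accumulate per DSP per cycle) x clock, and SRAM = elements x element width. A quick check of the quoted figures, assuming those conventions and assuming the 18-bit element width carries over to Cherry Three:

# peak FLOPS = mults * 2 ops (multiply + accumulate) * clock
def peak_flops(mults, clock_hz): return mults * 2 * clock_hz

print(peak_flops(4*4*4,       50e6))   # Arty A7 100T:  6.4e9    ->  6.4 GFLOPS
print(peak_flops(16*16*16,   500e6))   # Alveo U250:    4.096e12 ->  ~4 TFLOPS
print(peak_flops(32*32*32,     1e9))   # Cherry Two:    6.55e13  -> ~64 TFLOPS
print(peak_flops(16*32*32*32,  1e9))   # Cherry Three:  1.05e15  ->  ~1 PFLOP

# Cherry Three SRAM: 16 cores * 8M elements/core * 18 bits = 288 MB on chip
print(16 * 8 * 2**20 * 18 / 8 / 2**20)  # -> 288.0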