From 99b605146745b7927014eae20e5019cb9ac2c8cf Mon Sep 17 00:00:00 2001
From: George Hotz
Date: Mon, 29 Nov 2021 12:40:52 -0500
Subject: [PATCH] add ff_dim to transformer

---
 cache/.gitignore        |  1 +
 examples/transformer.py |  2 +-
 examples/vit.py         | 16 ++++++++++++++++
 models/transformer.py   | 10 +++++-----
 test/test_train.py      |  2 +-
 5 files changed, 24 insertions(+), 7 deletions(-)
 create mode 100644 cache/.gitignore
 create mode 100644 examples/vit.py

diff --git a/cache/.gitignore b/cache/.gitignore
new file mode 100644
index 0000000000..72e8ffc0db
--- /dev/null
+++ b/cache/.gitignore
@@ -0,0 +1 @@
+*
diff --git a/examples/transformer.py b/examples/transformer.py
index b6733a8f8b..435c06c323 100755
--- a/examples/transformer.py
+++ b/examples/transformer.py
@@ -27,7 +27,7 @@ def make_dataset():
 from tinygrad.optim import Adam

 if __name__ == "__main__":
-  model = Transformer(10, 6, 2, 128, 4)
+  model = Transformer(10, 6, 2, 128, 4, 32)

   X_train, Y_train, X_test, Y_test = make_dataset()
   lr = 0.003
diff --git a/examples/vit.py b/examples/vit.py
new file mode 100644
index 0000000000..fec7d76f6d
--- /dev/null
+++ b/examples/vit.py
@@ -0,0 +1,16 @@
+
+"""
+fn = "gs://vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz"
+import tensorflow as tf
+with tf.io.gfile.GFile(fn, "rb") as f:
+  dat = f.read()
+  with open("cache/"+ fn.rsplit("/", 1)[1], "wb") as g:
+    g.write(dat)
+"""
+
+import numpy as np
+dat = np.load("cache/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz")
+for x in dat.keys():
+  print(x, dat[x].shape)
+
+
diff --git a/models/transformer.py b/models/transformer.py
index 2b14fbfd1a..333202c5a0 100644
--- a/models/transformer.py
+++ b/models/transformer.py
@@ -11,7 +11,7 @@ def layernorm(x, sz, eps=1e-5):
   return ret.reshape(shape=in_shape)

 class TransformerBlock:
-  def __init__(self, embed_dim, num_heads):
+  def __init__(self, embed_dim, num_heads, ff_dim):
     # Multi-Head Attention
     self.num_heads = num_heads
     self.head_size = embed_dim // num_heads
@@ -24,8 +24,8 @@ class TransformerBlock:

     self.final = Tensor.uniform(embed_dim, embed_dim)

-    self.ff1 = Tensor.uniform(embed_dim, embed_dim)
-    self.ff2 = Tensor.uniform(embed_dim, embed_dim)
+    self.ff1 = Tensor.uniform(embed_dim, ff_dim)
+    self.ff2 = Tensor.uniform(ff_dim, embed_dim)

   def __call__(self, x):
     # bs x T x embed_dim
@@ -54,12 +54,12 @@ class TransformerBlock:

 class Transformer:
   # L = layers, H = embed_dim, A = num_heads
-  def __init__(self, syms, maxlen, layers, embed_dim, num_heads):
+  def __init__(self, syms, maxlen, layers, embed_dim, num_heads, ff_dim):
     self.maxlen, self.syms = maxlen, syms
     self.embed = Tensor.uniform(maxlen+syms, embed_dim, requires_grad=False)
     self.tbs = []
     for i in range(layers):
-      self.tbs.append(TransformerBlock(embed_dim, num_heads))
+      self.tbs.append(TransformerBlock(embed_dim, num_heads, ff_dim))
     self.final = Tensor.uniform(embed_dim, syms)

   def forward(self, x):
diff --git a/test/test_train.py b/test/test_train.py
index 0ccad33cf8..4ba7fa6462 100644
--- a/test/test_train.py
+++ b/test/test_train.py
@@ -33,7 +33,7 @@ class TestTrain(unittest.TestCase):

   def test_transformer(self):
     # this should be small GPT-2, but the param count is wrong
-    model = Transformer(syms=10, maxlen=6, layers=12, embed_dim=768, num_heads=12)
+    model = Transformer(syms=10, maxlen=6, layers=12, embed_dim=768, num_heads=12, ff_dim=768*4)
     X = np.zeros((BS,6), dtype=np.float32)
     Y = np.zeros((BS,6), dtype=np.int32)
     train_one_step(model,X,Y)
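
For reference, a minimal usage sketch of the constructor signature introduced by this patch; the import path is an assumption based on the repo layout above, everything else mirrors the values used in examples/transformer.py:

# Hypothetical sketch -- import path assumed from the models/transformer.py layout shown above.
from models.transformer import Transformer

# New signature: Transformer(syms, maxlen, layers, embed_dim, num_heads, ff_dim)
# ff_dim sets the hidden width of each block's feed-forward layers:
#   ff1 has shape (embed_dim, ff_dim), ff2 has shape (ff_dim, embed_dim)
model = Transformer(10, 6, 2, 128, 4, 32)  # same values as examples/transformer.py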