From 30eb3afbe173b75dc5ec44348cef49fe3eac2421 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Mon, 29 Nov 2021 12:45:27 -0500 Subject: [PATCH] add bias term to transformer --- extra/utils.py | 2 +- models/transformer.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/extra/utils.py b/extra/utils.py index e0c22ae73d..5ac04a98ad 100644 --- a/extra/utils.py +++ b/extra/utils.py @@ -21,7 +21,7 @@ def get_parameters(obj): parameters = [] if isinstance(obj, Tensor): parameters.append(obj) - elif isinstance(obj, list): + elif isinstance(obj, list) or isinstance(obj, tuple): for x in obj: parameters.extend(get_parameters(x)) elif hasattr(obj, '__dict__'): diff --git a/models/transformer.py b/models/transformer.py index 333202c5a0..2f5de17f28 100644 --- a/models/transformer.py +++ b/models/transformer.py @@ -17,10 +17,10 @@ class TransformerBlock: self.head_size = embed_dim // num_heads assert self.head_size * self.num_heads == embed_dim - # looks like bias is useless - self.query_dense = Tensor.uniform(embed_dim, embed_dim) - self.key_dense = Tensor.uniform(embed_dim, embed_dim) - self.value_dense = Tensor.uniform(embed_dim, embed_dim) + # added bias + self.query_dense = (Tensor.uniform(embed_dim, embed_dim), Tensor.uniform(embed_dim)) + self.key_dense = (Tensor.uniform(embed_dim, embed_dim), Tensor.uniform(embed_dim)) + self.value_dense = (Tensor.uniform(embed_dim, embed_dim), Tensor.uniform(embed_dim)) self.final = Tensor.uniform(embed_dim, embed_dim) @@ -34,7 +34,7 @@ class TransformerBlock: inputs = x.reshape(shape=(-1, embed_dim)) # run multi head attention (bs, T, num_heads, head_size) - query, key, value = [inputs.dot(y) \ + query, key, value = [inputs.dot(y[0]).add(y[1].reshape(shape=[1, -1])) \ .reshape(shape=(bs, -1, self.num_heads, self.head_size)) \ for y in [self.query_dense, self.key_dense, self.value_dense]]