diff --git a/examples/llama.py b/examples/llama.py
index 8d5c5edadc..fc91e11d1e 100755
--- a/examples/llama.py
+++ b/examples/llama.py
@@ -10,7 +10,7 @@ from tqdm import tqdm
 np.set_printoptions(linewidth=200)
 from typing import Optional, Tuple
-from tinygrad.helpers import getenv, DEBUG
+from tinygrad.helpers import dtypes, getenv, DEBUG
 from tinygrad.lazy import Device
 from extra.helpers import Timing
 from tinygrad.tensor import Tensor
@@ -143,14 +143,13 @@ class Transformer:
     # get only the part we are using. making it contiguous avoids more kernel calls
     freqs_cis = self.freqs_cis[:, start_pos:start_pos+seqlen].contiguous().realize()
-
     if seqlen > 1:
       mask = np.full((1, 1, seqlen, start_pos + seqlen), float("-inf"), dtype=np.float32)
       mask = np.triu(mask, k=start_pos + 1) # TODO: this is hard to do in tinygrad
       mask = Tensor(mask)
     else:
       mask = None
-
+    # mask = Tensor.full((1, 1, seqlen, start_pos + seqlen), float("-inf"), dtype=dtypes.float32).triu(start_pos+1) if seqlen > 1 else None #TODO: Pending(#942)
     for layer in self.layers:
       h.realize()  # TODO: why do i need this?
       h = layer(h, start_pos, freqs_cis, mask)
diff --git a/examples/stable_diffusion.py b/examples/stable_diffusion.py
index 04c3234b8b..ad250a2011 100644
--- a/examples/stable_diffusion.py
+++ b/examples/stable_diffusion.py
@@ -460,6 +460,7 @@ class CLIPTextTransformer:
     x = self.embeddings(input_ids, list(range(len(input_ids))))
     causal_attention_mask = np.triu(np.ones((1,1,77,77), dtype=np.float32) * -np.inf, k=1)
     x = self.encoder(x, Tensor(causal_attention_mask, device=x.device))
+    # x = self.encoder(x, Tensor.full((1, 1, 77, 77), float("-inf")).triu(1)) # TODO: Pending(#942)
     return self.final_layer_norm(x)
 
 # Clip tokenizer, taken from https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py (MIT license)
diff --git a/extra/onnx_ops.py b/extra/onnx_ops.py
index 610e5c8929..6e24e1daf7 100644
--- a/extra/onnx_ops.py
+++ b/extra/onnx_ops.py
@@ -209,6 +209,10 @@ def Or(x:Tensor, y:Tensor): return Where((x==y), x, Tensor.ones(*x.shape)).cast(
 def Xor(x:Tensor, y:Tensor): return Where((x==y), Tensor.zeros(*x.shape), Tensor.ones(*x.shape)).cast(dtypes.bool)
 def Not(x:Tensor): return Where((x==1), Tensor.zeros(*x.shape), Tensor.ones(*x.shape)).cast(dtypes.bool)
 
+def Trilu(x: Tensor, k: Union[Tensor, int]=0, upper=1):
+  k = int(k.numpy().item()) if isinstance(k, Tensor) else int(k) # onnx passes k as a tensor int64 with one element, default is 0
+  return x.triu(k) if upper else x.tril(k)
+
 def ConstantOfShape(input, value:Tensor=None):
   if value is None: value=Tensor([0.0])
   shape = [int(x) for x in safe_numpy(input)]
diff --git a/test/external/external_test_onnx_backend.py b/test/external/external_test_onnx_backend.py
index 9164ab51b3..1f4def4cea 100644
--- a/test/external/external_test_onnx_backend.py
+++ b/test/external/external_test_onnx_backend.py
@@ -124,8 +124,10 @@ backend_test.exclude('test_bitwise_*')
 backend_test.exclude('test_blackmanwindow_*')
 backend_test.exclude('test_bernoulli_*')
 backend_test.exclude('test_cumsum_*')
-backend_test.exclude('test_tril_*')
-backend_test.exclude('test_triu_*')
+
+backend_test.exclude('test_tril_zero_cpu') # TODO: zero array support
+backend_test.exclude('test_triu_zero_cpu') # TODO: zero array support
+
 backend_test.exclude('test_col2im_*')
 backend_test.exclude('test_hammingwindow_*')
 backend_test.exclude('test_hannwindow_*')
diff --git a/test/external/external_test_opt.py b/test/external/external_test_opt.py
index c840fc9150..4ac2f6557b 100644
--- a/test/external/external_test_opt.py
+++ b/test/external/external_test_opt.py
@@ -85,7 +85,7 @@ class TestInferenceMinKernels(unittest.TestCase):
     args_tiny = {"dim": 512, "multiple_of": 256, "n_heads": 8, "n_layers": 4, "norm_eps": 1e-05, "vocab_size": 1000}
     model = Transformer(**args_tiny)
     for p in get_parameters(model): p.assign(np.zeros(p.shape, dtype=p.dtype.np))
-    with CLCache(85):
+    with CLCache(94):
       model(Tensor([[1,2,3,4]]), 0).realize()
 
   @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
diff --git a/test/test_ops.py b/test/test_ops.py
index e81c31220e..6054cf6c64 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -125,6 +125,18 @@ class TestOps(unittest.TestCase):
     tt2 = Tensor.ones(4, requires_grad=True)
     self.assertRaises(RuntimeError, (tt1 < tt2).sum().backward)
 
+  def test_tril(self):
+    helper_test_op([(3,3)], lambda x: x.tril(), lambda x: x.tril())
+    helper_test_op([(3,3)], lambda x: x.tril(1), lambda x: x.tril(1))
+    helper_test_op([(3,3)], lambda x: x.tril(-1), lambda x: x.tril(-1))
+    helper_test_op([(5,3,3)], lambda x: x.tril(), lambda x: x.tril())
+    helper_test_op([(5,3,3)], lambda x: x.tril(1), lambda x: x.tril(1))
+  def test_triu(self):
+    helper_test_op([(3,3)], lambda x: x.triu(), lambda x: x.triu())
+    helper_test_op([(3,3)], lambda x: x.triu(1), lambda x: x.triu(1))
+    helper_test_op([(3,3)], lambda x: x.triu(-1), lambda x: x.triu(-1))
+    helper_test_op([(5,3,3)], lambda x: x.triu(), lambda x: x.triu())
+    helper_test_op([(5,3,3)], lambda x: x.triu(1), lambda x: x.triu(1))
   def test_maximum(self):
     helper_test_op([(45,65), (45,65)], torch.maximum, Tensor.maximum)
     helper_test_op([(), ()], torch.maximum, Tensor.maximum)
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index d1cf6f87ef..cd359c23b6 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -481,6 +481,12 @@ class Tensor:
   def sin(self): return mlops.Sin.apply(self)
   def cos(self): return ((math.pi/2)-self).sin()
   def tan(self): return self.sin() / self.cos()
+
+  @staticmethod
+  def _tri(r:int, c:int, k:int=0) -> Tensor: return Tensor.arange(r).unsqueeze(1).expand(r,c) <= Tensor.arange(c-k, start=-k).unsqueeze(0).expand(r,c)
+  def triu(self, k:int=0) -> Tensor: return Tensor._tri(self.shape[-2], self.shape[-1], k=k).where(self, Tensor.zeros_like(self))
+  def tril(self, k:int=0) -> Tensor: return Tensor._tri(self.shape[-2], self.shape[-1], k=k+1).where(Tensor.zeros_like(self), self)
+
   # ***** math functions (unary) *****
 
   def __neg__(self): return 0.0-self
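For reference, and not part of the patch itself: the sketch below mirrors the arange-comparison trick that Tensor._tri uses, written in plain NumPy so it runs without tinygrad. The _tri, triu, and tril names here are standalone stand-ins for illustration, and the printed mask approximates the causal attention mask that the commented-out llama.py and stable_diffusion.py lines would build with the new Tensor.triu.

# Illustrative sketch (plain NumPy, independent of tinygrad): _tri builds a
# boolean mask m[i, j] = (j - i >= k); triu keeps x where the mask is True,
# tril keeps x where the mask shifted by one diagonal (k+1) is False.
import numpy as np

def _tri(r: int, c: int, k: int = 0) -> np.ndarray:
  # row indices 0..r-1 as a column vs. column indices -k..c-k-1 as a row,
  # broadcast into an (r, c) boolean mask
  return np.arange(r)[:, None] <= np.arange(-k, c - k)[None, :]

def triu(x: np.ndarray, k: int = 0) -> np.ndarray:
  return np.where(_tri(x.shape[-2], x.shape[-1], k=k), x, 0)

def tril(x: np.ndarray, k: int = 0) -> np.ndarray:
  return np.where(_tri(x.shape[-2], x.shape[-1], k=k + 1), 0, x)

if __name__ == "__main__":
  x = np.arange(1, 10, dtype=np.float32).reshape(3, 3)
  # cross-check the mask trick against NumPy's built-in triu/tril
  assert np.array_equal(triu(x, 1), np.triu(x, 1))
  assert np.array_equal(tril(x, -1), np.tril(x, -1))
  # the kind of causal attention mask the commented-out llama.py line builds
  seqlen, start_pos = 4, 0
  mask = triu(np.full((seqlen, start_pos + seqlen), float("-inf"), dtype=np.float32), start_pos + 1)
  print(mask)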