diff --git a/test/device/test_hcq.py b/test/device/test_hcq.py
index 54b55592bc..2261bdad63 100644
--- a/test/device/test_hcq.py
+++ b/test/device/test_hcq.py
@@ -1,6 +1,7 @@
 import unittest, ctypes, struct, os, random, numpy as np
 from tinygrad import Device, Tensor, dtypes
-from tinygrad.helpers import getenv, CI, mv_address, DEBUG
+from tinygrad.helpers import getenv, mv_address, DEBUG
+from test.helpers import slow
 from tinygrad.device import Buffer, BufferSpec
 from tinygrad.runtime.support.hcq import HCQCompiled, HCQBuffer
 from tinygrad.runtime.autogen import libc
@@ -220,7 +221,7 @@ class TestHCQ(unittest.TestCase):
     mv_buf1 = buf1.as_buffer().cast('Q')
     assert libc.memcmp(mv_address(mv_buf1), buf2._buf.va_addr, sz) == 0

-  @unittest.skipIf(CI, "skip in CI")
+  @slow
   def test_copy_64bit(self):
     if TestHCQ.d0.hw_copy_queue_t is None: self.skipTest("device does not support copy queue")
diff --git a/test/device/test_ocl.py b/test/device/test_ocl.py
index 6f58b909db..3727546f50 100644
--- a/test/device/test_ocl.py
+++ b/test/device/test_ocl.py
@@ -2,12 +2,11 @@ import unittest
 from tinygrad import Device
 from tinygrad.device import Buffer
 from tinygrad.dtype import dtypes
-from tinygrad.helpers import CI
 from tinygrad.runtime.ops_cl import CLDevice, CLAllocator, CLCompiler, CLProgram

 @unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
 class TestCLError(unittest.TestCase):
-  @unittest.skipIf(CI, "dangerous for CI, it allocates tons of memory")
+  @unittest.skip("allocates tons of memory")
   def test_oom(self):
     with self.assertRaises(RuntimeError) as err:
       allocator = CLAllocator(CLDevice())
diff --git a/test/external/external_test_hcq.py b/test/external/external_test_hcq.py
index 2ae0371fe1..df77134cd9 100644
--- a/test/external/external_test_hcq.py
+++ b/test/external/external_test_hcq.py
@@ -261,7 +261,7 @@ class TestHCQ(unittest.TestCase):
     et = _time_queue(q, TestHCQ.d0)
     gb_s = (SZ/1e9)/et
     print(f"same device copy: {et*1e3:.2f} ms, {gb_s:.2f} GB/s")
-    assert (0.3 if CI else 10) <= gb_s <= 1000
+    assert 0.3 <= gb_s <= 1000

   def test_cross_device_copy_bandwidth(self):
     SZ = 2_000_000_000
@@ -273,7 +273,7 @@ class TestHCQ(unittest.TestCase):
     et = _time_queue(q, TestHCQ.d0)
     gb_s = (SZ/1e9)/et
     print(f"cross device copy: {et*1e3:.2f} ms, {gb_s:.2f} GB/s")
-    assert (0.3 if CI else 2) <= gb_s <= 50
+    assert 0.3 <= gb_s <= 50

   def test_interleave_compute_and_copy(self):
     q = TestHCQ.compute_queue()
diff --git a/test/external/external_test_mamba.py b/test/external/external_test_mamba.py
index 3213b7c6b9..6eb6c2a1d5 100644
--- a/test/external/external_test_mamba.py
+++ b/test/external/external_test_mamba.py
@@ -1,12 +1,12 @@
 import unittest
-from tinygrad.helpers import CI
+from test.helpers import slow
 from examples.mamba import Mamba, generate
 from transformers import AutoTokenizer

 PROMPT = 'Why is gravity '
 TOKENIZER = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

-@unittest.skipIf(CI, "model is slow for CI")
+@slow
 class TestMamba(unittest.TestCase):
   def test_mamba_130M(self):
     OUT_130M = '''Why is gravity \nnot a good idea?\n\nA:'''
diff --git a/test/external/fuzz_shape_ops.py b/test/external/fuzz_shape_ops.py
index f60e58e912..9e310c3582 100644
--- a/test/external/fuzz_shape_ops.py
+++ b/test/external/fuzz_shape_ops.py
@@ -8,11 +8,11 @@ from hypothesis.extra import numpy as stn
 import numpy as np
 import torch

 from tinygrad import Tensor
-from tinygrad.helpers import CI, getenv
+from tinygrad.helpers import getenv

 settings.register_profile(__file__, settings.default,
-                          max_examples=100 if CI else 250, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
+                          max_examples=100, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))

 # torch wraparound for large numbers
diff --git a/test/helpers.py b/test/helpers.py
index 0e9661bf41..96524b5bf3 100644
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -1,4 +1,4 @@
-import time, struct, functools
+import os, time, struct, functools, unittest
 from typing import Any, Callable
 import numpy as np
 from tinygrad import Tensor, dtypes, Device
@@ -9,6 +9,9 @@
 from tinygrad.dtype import DType
 from tinygrad.nn.state import get_parameters
 from tinygrad.helpers import T, CI
 from tinygrad.codegen import full_rewrite
+
+# decorator to skip slow tests by default, run with RUN_SLOW=1 to include them
+slow = unittest.skipUnless(os.getenv("RUN_SLOW"), "slow test, set RUN_SLOW=1 to run")
 from tinygrad.runtime.ops_python import PythonProgram, PythonRenderer, PythonCompiler

 def derandomize_model(model):
diff --git a/test/models/test_efficientnet.py b/test/models/test_efficientnet.py
index 3a5b3324ba..df936299d2 100644
--- a/test/models/test_efficientnet.py
+++ b/test/models/test_efficientnet.py
@@ -4,7 +4,8 @@
 import numpy as np
 from PIL import Image

 from tinygrad import Tensor
-from tinygrad.helpers import getenv, CI
+from tinygrad.helpers import getenv
+from test.helpers import slow
 from extra.models.efficientnet import EfficientNet
 from extra.models.vit import ViT
 from extra.models.resnet import ResNet50
@@ -56,12 +57,12 @@ class TestEfficientNet(unittest.TestCase):
   def tearDownClass(cls):
     del cls.model

-  @unittest.skipIf(CI, "covered by test_chicken_car")
+  @slow
   def test_chicken(self):
     labels = _infer(self.model, chicken_img)
     self.assertEqual(_LABELS[labels[0]], "hen")

-  @unittest.skipIf(CI, "covered by test_chicken_car")
+  @slow
   def test_car(self):
     labels = _infer(self.model, car_img)
     self.assertEqual(_LABELS[labels[0]], "sports car, sport car")
diff --git a/test/models/test_mnist.py b/test/models/test_mnist.py
index 2f5e939862..a6ef2879bc 100644
--- a/test/models/test_mnist.py
+++ b/test/models/test_mnist.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 import unittest
 import numpy as np
-from tinygrad import Tensor, Device
-from tinygrad.helpers import CI
+from tinygrad import Tensor
+from test.helpers import slow
 from tinygrad.nn.state import get_parameters
 from tinygrad.nn import optim, BatchNorm2d
 from extra.training import train, evaluate
@@ -49,7 +49,7 @@ class TinyConvNet:
     x = x.reshape(shape=[x.shape[0], -1])
     return x.dot(self.l1)

-@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
+@slow
 class TestMNIST(unittest.TestCase):
   def test_sgd_onestep(self):
     np.random.seed(1337)
diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py
index 7e797a2314..2868906084 100644
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -5,11 +5,12 @@
 from tinygrad.nn import optim
 from tinygrad.nn.state import get_parameters
 from tinygrad.engine.jit import TinyJit
 from tinygrad import Tensor, Device, GlobalCounters, dtypes, Variable
-from tinygrad.helpers import CI, Context
+from tinygrad.helpers import Context
+from test.helpers import slow
 from extra.lr_scheduler import OneCycleLR
 from test.helpers import derandomize_model

-from examples.gpt2 import Transformer as GPT2Transformer, MODEL_PARAMS as GPT2_MODEL_PARAMS
+from examples.gpt2 import Transformer as GPT2Transformer
 from examples.hlb_cifar10 import SpeedyResNet, hyp
 from examples.llama import Transformer as LLaMaTransformer
 from examples.stable_diffusion import UNetModel, unet_params
@@ -20,7 +21,7 @@ global_mem_used = 0
 def helper_test(nm, gen, model, max_memory_allowed, max_kernels_allowed, all_jitted=False):
   with Context(JIT=2):
     tms = []
-    for _ in range(2 if CI else 4):
+    for _ in range(2):
       early_gen = [x.realize() if isinstance(x, Tensor) else x for x in gen()]
       GlobalCounters.reset()
       Device[Device.DEFAULT].synchronize()
@@ -52,7 +53,7 @@ class TestRealWorld(unittest.TestCase):
   def tearDown(self):
     dtypes.default_float = self.old_float

-  @unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow, covered by METAL")
+  @slow
   @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need dtypes.float16")
   def test_stable_diffusion(self):
     params = unet_params
@@ -92,14 +93,14 @@ class TestRealWorld(unittest.TestCase):
     dtypes.default_float = dtypes.float16
     args_tiny = {"dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-5, "vocab_size": 1000}
-    model = GPT2Transformer(**(args_tiny if CI else GPT2_MODEL_PARAMS["gpt2-medium"]))
+    model = GPT2Transformer(**args_tiny)
     derandomize_model(model)
     @TinyJit
     def test(t, v):
       with Context(JIT=0):
         return model(t, v).realize()
-    helper_test("test_gpt2", lambda: (Tensor([[1,]]),Variable("pos", 1, 100).bind(1)), test, 0.23 if CI else 0.9, 160 if CI else 468, all_jitted=True)
+    helper_test("test_gpt2", lambda: (Tensor([[1,]]),Variable("pos", 1, 100).bind(1)), test, 0.23, 160, all_jitted=True)

-  @unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
+  @slow
   def test_train_mnist(self):
     from examples.beautiful_mnist import Model
     with Tensor.train():
@@ -117,7 +118,7 @@ class TestRealWorld(unittest.TestCase):

     helper_test("train_mnist", lambda: (Tensor.randn(BS, 1, 28, 28),), train, 0.017, 103)

-  @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow")
+  @slow
   def test_forward_cifar(self):
     BS = 32
     # with training batchnorm still though
@@ -127,7 +128,7 @@ class TestRealWorld(unittest.TestCase):
     def run(X): return model(X)
     helper_test("forward_cifar", lambda: (Tensor.randn(BS, 3, 32, 32),), run, 0.033, 27)

-  @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow")
+  @slow
   def test_train_cifar(self):
     with Tensor.train():
       model = SpeedyResNet(Tensor.ones((12,3,2,2)))
@@ -157,7 +158,7 @@ class TestRealWorld(unittest.TestCase):
       final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=4)
     assert not np.isnan(lr_scheduler.min_lr), "lr too small or initial_div_facotr too big for half"

-  @unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
+  @slow
   def test_bert(self):
     with Tensor.train():
       args_tiny = {"attention_probs_dropout_prob": 0.0, "hidden_dropout_prob": 0.0, "vocab_size": 30522, "type_vocab_size": 2,
diff --git a/test/models/test_train.py b/test/models/test_train.py
index fe5114742b..4a5288d62a 100644
--- a/test/models/test_train.py
+++ b/test/models/test_train.py
@@ -3,7 +3,8 @@
 import numpy as np
 from tinygrad import Device
 from tinygrad.nn.state import get_parameters
 from tinygrad.nn import optim
-from tinygrad.helpers import getenv, CI
+from tinygrad.helpers import getenv
+from test.helpers import slow
 from extra.training import train
 from extra.models.convnext import ConvNeXt
 from extra.models.efficientnet import EfficientNet
@@ -38,7 +39,7 @@ class TestTrain(unittest.TestCase):
     train_one_step(model,X,Y)
     check_gc()

-  @unittest.skipIf(CI, "slow")
+  @slow
   def test_efficientnet(self):
     model = EfficientNet(0)
     X = np.zeros((BS,3,224,224), dtype=np.float32)
@@ -46,7 +47,7 @@ class TestTrain(unittest.TestCase):
     train_one_step(model,X,Y)
     check_gc()

-  @unittest.skipIf(CI, "slow")
+  @slow
   def test_vit(self):
     model = ViT()
     X = np.zeros((BS,3,224,224), dtype=np.float32)
@@ -54,7 +55,7 @@ class TestTrain(unittest.TestCase):
     train_one_step(model,X,Y)
     check_gc()

-  @unittest.skipIf(CI, "slow")
+  @slow
   def test_transformer(self):
     # this should be small GPT-2, but the param count is wrong
     # (real ff_dim is 768*4)
@@ -64,7 +65,7 @@ class TestTrain(unittest.TestCase):
     train_one_step(model,X,Y)
     check_gc()

-  @unittest.skipIf(CI, "slow")
+  @slow
   def test_resnet(self):
     X = np.zeros((BS, 3, 224, 224), dtype=np.float32)
     Y = np.zeros((BS), dtype=np.int32)
diff --git a/test/models/test_whisper.py b/test/models/test_whisper.py
index 8779f21c9b..9555ab9c7d 100644
--- a/test/models/test_whisper.py
+++ b/test/models/test_whisper.py
@@ -2,7 +2,8 @@
 import unittest
 import pathlib
 from examples.whisper import init_whisper, load_file_waveform, transcribe_file, transcribe_waveform
 import examples.mlperf.metrics as metrics
-from tinygrad.helpers import CI, fetch, CPU_LLVM
+from tinygrad.helpers import fetch
+from test.helpers import slow
 from tinygrad import Device, dtypes
 from tinygrad.device import is_dtype_supported
@@ -75,11 +76,11 @@ class TestWhisper(unittest.TestCase):
   def test_transcribe_file1(self):
     self.assertEqual(transcribe_file(self.model, self.enc, TEST_FILE_1), TRANSCRIPTION_1)

-  @unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too many tests for CI")
+  @slow
   def test_transcribe_file2(self):
     self.assertEqual(transcribe_file(self.model, self.enc, TEST_FILE_2), TRANSCRIPTION_2)

-  @unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too many tests for CI")
+  @slow
   def test_transcribe_batch12(self):
     waveforms = [load_file_waveform(TEST_FILE_1), load_file_waveform(TEST_FILE_2)]
     transcriptions = transcribe_waveform(self.model, self.enc, waveforms)
@@ -95,14 +96,14 @@ class TestWhisper(unittest.TestCase):
     self.assertEqual(TRANSCRIPTION_1, transcriptions[1])

   @unittest.skip("file 3 url is broken")
-  @unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too long for CI")
+  @slow
   def test_transcribe_long(self):
     waveform = [load_file_waveform(fetch(TEST_FILE_3_URL))]
     transcription = transcribe_waveform(self.model, self.enc, waveform)
     self.assertWER(transcription, TRANSCRIPTION_3, 0.085)

   @unittest.skip("file 3 url is broken")
-  @unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too long for CI")
+  @slow
   def test_transcribe_long_no_batch(self):
     waveforms = [load_file_waveform(fetch(TEST_FILE_3_URL)), load_file_waveform(TEST_FILE_1)]
diff --git a/test/opt/test_tensor_cores.py b/test/opt/test_tensor_cores.py
index d293de1283..983b1e3209 100644
--- a/test/opt/test_tensor_cores.py
+++ b/test/opt/test_tensor_cores.py
@@ -7,7 +7,8 @@
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.uop.ops import Ops
 from tinygrad.dtype import DType
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import AMX, CI, AMD_LLVM, CPU_LLVM
+from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM
+from test.helpers import slow
 from tinygrad.engine.realize import CompiledRunner, get_program
 from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
@@ -119,7 +120,7 @@ class TestTensorCores(unittest.TestCase):
       helper_tc_ensure_uops_and_opts_count(tc.dims[0], tc.dims[1], tc.dims[2]//8, tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)

   @unittest.skipIf(Device.DEFAULT == "PYTHON", "not generated on EMULATED device")
-  @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI is really slow here")
+  @slow
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores_multi_reduce(self):
     for tc in Device[Device.DEFAULT].renderer.tensor_cores:
diff --git a/test/test_arange.py b/test/test_arange.py
index 962db05efa..00ef30e290 100644
--- a/test/test_arange.py
+++ b/test/test_arange.py
@@ -1,7 +1,7 @@
 import unittest
 import numpy as np
 from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable
-from tinygrad.helpers import CI, Context, getenv
+from tinygrad.helpers import Context, getenv
 from tinygrad.engine.realize import run_schedule
 from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
 from tinygrad.uop.ops import Ops
@@ -143,7 +143,7 @@ class TestIndexing(unittest.TestCase):
   def test_llama_embedding(self, noopt=1, op_limit=65536):
     # llama3 is 128256
-    vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
+    vocab_size, embed_size = (10, 3)
     emb = nn.Embedding(vocab_size, embed_size)
     emb_w = emb.weight.numpy()
     x = Tensor([1,2,3,4])
@@ -161,7 +161,7 @@ class TestIndexing(unittest.TestCase):
     # TODO: reshape to match torch, should we do this in nn?
     np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)

   # at least the arange is being fused
-  def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1_736_704_000 if CI else 5_898_240_000)
+  def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1_736_704_000)

 if __name__ == "__main__":
   unittest.main()
diff --git a/test/test_graph.py b/test/test_graph.py
index c5b3281ba8..1d009e218c 100644
--- a/test/test_graph.py
+++ b/test/test_graph.py
@@ -3,7 +3,7 @@
 import functools, unittest, ctypes
 from tinygrad.device import Device, Buffer
 from tinygrad.tensor import Tensor, _to_np_dtype
-from tinygrad.helpers import Context, CI, dedup, from_mv
+from tinygrad.helpers import Context, dedup, from_mv
 from tinygrad.dtype import dtypes
 from tinygrad.engine.jit import MultiGraphRunner
 from tinygrad.engine.realize import ExecItem, BufferXfer, get_runner, CompiledRunner
@@ -12,8 +12,8 @@
 from test.helpers import needs_second_gpu

 np.random.seed(1337)
 Tensor.manual_seed(1337)

-BUF_SIZE = 4096 if CI else 4096 * 128
-RUN_CNT = 4 if CI else 32
+BUF_SIZE = 4096
+RUN_CNT = 4

 cached_prgs = {}

 def helper_exec_op(device, outbuf, inbufs):
diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index bbb0bc6952..1409822775 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -2,12 +2,12 @@ import unittest, functools, random
 from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes, Variable
 from tinygrad.device import is_dtype_supported
 from tinygrad.uop.ops import Ops, UOp
-from tinygrad.helpers import CI, getenv, prod, Context
+from tinygrad.helpers import getenv, prod, Context
 from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
 import numpy as np
 from hypothesis import given, strategies as strat, settings
-from test.helpers import REAL_DEV, not_support_multi_device, needs_second_gpu
+from test.helpers import not_support_multi_device, needs_second_gpu, slow

 settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
 settings.load_profile("my_profile")
@@ -420,7 +420,7 @@ class TestMultiTensor(unittest.TestCase):
     np.testing.assert_allclose(y.numpy(), y_shard.numpy(), atol=1e-6, rtol=1e-6)

   # NOTE: this is failing on LLVM CI, no idea why. Works locally.
-  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "CPU", "AMD"), "slow, and flaky on CPU")
+  @slow
   def test_data_parallel_resnet(self):
     from extra.models.resnet import ResNet18
@@ -456,7 +456,7 @@ class TestMultiTensor(unittest.TestCase):
       # sometimes there is zeros in these grads... why?
       np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)

-  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "CPU", "AMD"), "slow, and flaky on CPU")
+  @slow
   @unittest.skip("TODO: pm_rangeify hangs")
   def test_data_parallel_resnet_train_step(self):
     from extra.models.resnet import ResNet18
diff --git a/test/test_nn.py b/test/test_nn.py
index 0ffd7f3d28..df1072202e 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -4,14 +4,14 @@
 import numpy as np
 import torch
 from tinygrad import Tensor, Device, TinyJit, dtypes
 from tinygrad.uop.ops import Ops
-from tinygrad.helpers import GlobalCounters, CI, Context
+from tinygrad.helpers import GlobalCounters, Context
 from tinygrad.nn import Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, Embedding
 from tinygrad.nn import BatchNorm, LayerNorm, LayerNorm2d, GroupNorm, InstanceNorm, RMSNorm, LSTMCell
 from tinygrad.nn.state import load_state_dict
 from tinygrad.engine.realize import run_schedule
-from test.helpers import not_support_multi_device, needs_second_gpu
+from test.helpers import not_support_multi_device, needs_second_gpu, slow

-@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
+@slow
 class TestNN(unittest.TestCase):
   def test_batchnorm2d(self, training=False, threed=False, track_running_stats=True):
     with Tensor.train(training):
diff --git a/test/test_ops.py b/test/test_ops.py
index ce730a19c6..6d4349a35c 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -11,8 +11,7 @@
 if getenv("TINY_BACKEND"):
   import tinygrad.nn.torch # noqa: F401 # pylint: disable=unused-import
   torch.set_default_device("tiny")

-if CI:
-  warnings.filterwarnings("ignore", message="Non-empty compiler output encountered")
+warnings.filterwarnings("ignore", message="Non-empty compiler output encountered")

 FORWARD_ONLY = getenv("FORWARD_ONLY", 0)
 PRINT_TENSORS = getenv("PRINT_TENSORS", 0)
diff --git a/test/test_optim.py b/test/test_optim.py
index 5ed5925710..4d27b60b6e 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -3,9 +3,8 @@
 import torch
 import unittest
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.nn.optim import Adam, SGD, AdamW, Muon, LAMB
-from tinygrad.helpers import CI
 from tinygrad.device import is_dtype_supported
-from test.helpers import needs_second_gpu
+from test.helpers import needs_second_gpu, slow

 np.random.seed(1337)
 x_init = np.random.randn(1,4).astype(np.float32)
@@ -42,7 +41,7 @@ def step(tensor, optim, steps=1, teeny=False, **kwargs):
     optim.step()
   return net.x.detach().numpy(), net.W.detach().numpy()

-@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
+@slow
 class TestOptim(unittest.TestCase):
   def setUp(self):
     self.old_training = Tensor.training
diff --git a/test/test_rangeify.py b/test/test_rangeify.py
index 9dd0dde998..ad480d7949 100644
--- a/test/test_rangeify.py
+++ b/test/test_rangeify.py
@@ -1,6 +1,6 @@
 import unittest
 from tinygrad import Tensor, nn, Device
-from tinygrad.helpers import Context, GlobalCounters, CI, getenv, PCONTIG, DEBUG
+from tinygrad.helpers import Context, GlobalCounters, getenv, PCONTIG, DEBUG
 from tinygrad.uop.ops import graph_rewrite, PatternMatcher, UPat, Ops
 from tinygrad.codegen.opt import OptOps, Opt
 from tinygrad.renderer.ptx import PTXRenderer
@@ -153,199 +153,6 @@ class TestPcontig(unittest.TestCase):
     opts += (Opt(OptOps.UPCAST, 4, 4),)
     self.test_flash_attention(opts)

-# *** non CI rangeify tests below this line ***
-
-N = 256
-
-@unittest.skipIf(CI, "useless in CI, doesn't test anything")
-class TestRangeifyOpt(unittest.TestCase):
-  def test_randperm(self):
-    Tensor.randperm(10000).realize()
-
-  def test_one_getitem(self):
-    X = Tensor.empty(10000)
-    sel = Tensor.arange(1000).contiguous().realize()
-    Xsel = X[sel]
-    Tensor.realize(Xsel)
-
-  def test_two_getitem(self):
-    # this is splitting on the child even when it really shouldn't
-    X = Tensor.empty(10000)
-    Y = Tensor.empty(10000)
-    sel = Tensor.arange(1000).contiguous().realize()
-    Xsel, Ysel = X[sel], Y[sel]
-    Tensor.realize(Xsel, Ysel)
-
-  def test_resnetconv(self):
-    conv1 = nn.Conv2d(3, 8, kernel_size=7, stride=2, bias=False, padding=3)
-    conv1.weight.replace(conv1.weight.empty_like())
-    x = Tensor.empty(1, 3, 56, 56)
-    x = conv1(x).pad([1,1,1,1])+1
-    x.realize()
-
-  # CPU=1 NOOPT=1 DEBUG=4 RANGEIFY=1 python3 test/test_rangeify.py TestRangeifyOpt.test_matmul_reshaped
-  def test_matmul_reshaped(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    (A@B).reshape(N*N).contiguous().realize()
-
-  def test_reduce_reshapes(self):
-    A = Tensor.empty(8,8,8,8).permute(1,0,3,2).flatten()
-    A.sum().realize()
-
-@unittest.skipIf(CI, "useless in CI, doesn't test anything")
-class TestRangeify(unittest.TestCase):
-  def test_groupnorm(self):
-    # ranges 1 and 3 are merging
-    x = nn.GroupNorm(32, 128)
-    x(Tensor.empty(1, 128, 64, 64)).realize()
-
-  def test_expand_children(self):
-    A = Tensor.empty(N, N).sum(axis=1)
-    ba = A.expand(N, N)
-    ((ba+1).sum(axis=1) + (ba+2).sum(axis=0)).realize()
-
-  def test_partial_contig(self):
-    A = Tensor.empty(64, 64, 64)
-    ret = A.sum(axis=2).contiguous(arg=(1,)).sum(axis=1)
-    ret.realize()
-
-  @unittest.skip("RANGEIFY=0 does nothing")
-  def test_double_gemm_real(self):
-    def go():
-      with Context(DEBUG=0):
-        Tensor.manual_seed(1337)
-        A,B,C = [Tensor.randn(N, N) for _ in range(3)]
-        Tensor.realize(A, B, C)
-        GlobalCounters.reset()
-        return (A@B@C).realize()
-    rng = go()
-    with Context(RANGEIFY=0, DEBUG=2):
-      ref = go()
-    mse = ((rng-ref)**2).sum().item()
-    print(f"mse: {mse}")
-    self.assertLessEqual(mse, 1e-2)
-
-  def test_double_gemm(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    (A@B@C).realize()
-
-  def test_double_gemm_exp(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    (((A@B).exp()@C).exp()).realize()
-
-  def test_double_gemm_exp_child(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    # A@B is used with exp, and also on the sum. this is two kernels now, is this right?
-    ret = A@B
-    ((ret.exp()@C)+ret).realize()
-
-  def test_double_gemm_relu(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    (((A@B).relu()@C).relu()).realize()
-
-  def test_double_gemm_relu_half_contig(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    (((A@B).relu().contiguous(arg=(1,))@C).relu()).realize()
-
-  def test_double_gemm_half_contig(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    ((A@B).contiguous(arg=(1,))@C).realize()
-
-  def test_double_gemm_contig(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    ((A@B).contiguous()@C).realize()
-
-  def test_many_gemm(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    D = Tensor.empty(N, N)
-    E = Tensor.empty(N, N)
-    F = Tensor.empty(N, N)
-    (A@B@C@D@E@F).realize()
-
-  def test_conv2d(self):
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 3, 3)
-    x.conv2d(w1).realize()
-
-  def test_conv2d_elu(self):
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 3, 3)
-    x.conv2d(w1).elu().realize()
-
-  def test_conv2d_t(self):
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 3, 3)
-    (x*2).conv2d(w1).realize()
-
-  def test_double_conv2d(self):
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 3, 3)
-    w2 = Tensor.empty(12, 8, 3, 3)
-    x.conv2d(w1).conv2d(w2).realize()
-
-  def test_resnet_conv2d(self):
-    x = Tensor.empty(1, 8, 32, 32)
-    w1 = Tensor.empty(8, 8, 3, 3)
-    w2 = Tensor.empty(8, 8, 1, 1)
-    x.conv2d(w1).conv2d(w2).realize()
-
-  def test_xception_conv2d(self):
-    # NOTE: this fusion is bad, it's recomputing the inner many times
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 1, 1)
-    w2 = Tensor.empty(8, 1, 3, 3)
-    x.conv2d(w1).conv2d(w2, groups=8).realize()
-
-  def test_conv_maxpool_contig(self): self.test_conv_maxpool(True)
-  def test_conv_maxpool(self, contig=False):
-    GlobalCounters.reset()
-    x = Tensor.empty(32, 16, 64, 64)
-    l1 = nn.Conv2d(16, 16, 3)
-    for p in nn.state.get_parameters(l1): p.replace(Tensor.empty(p.shape))
-    x = l1(x)
-    if contig: x = x.contiguous()
-    x.max_pool2d().realize()
-
-  def test_double_conv2d_half_contig(self):
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 3, 3)
-    w2 = Tensor.empty(12, 8, 3, 3)
-    # NOTE: this contiguous doesn't help
-    x.conv2d(w1).contiguous(arg=(1,)).conv2d(w2).permute(0,2,3,1).contiguous().realize()
-
-  def test_double_conv2d_contig(self):
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 3, 3)
-    w2 = Tensor.empty(12, 8, 3, 3)
-    x.conv2d(w1).contiguous().conv2d(w2).realize()
-
-  def test_transformer_ffn(self):
-    from tinygrad.apps.llm import TransformerBlock
-    from tinygrad import nn
-    blk = TransformerBlock(1024, 4096, 1, 1, 1e-5, head_dim=1024, rope_theta=10000.0)
-    for p in nn.state.get_parameters(blk): p.replace(Tensor.empty(p.shape))
-
-    x = Tensor.empty(128, 1024)
-    out = blk._feed_forward(x)
-    out.realize()
-
 # contiguous + reduce can support ranges?
 @unittest.skip("pm_rangeify no longer exists. test this in a different way")
diff --git a/test/test_schedule.py b/test/test_schedule.py
index 9e10966980..d4c77fbc9f 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -1755,7 +1755,7 @@ class TestSchedule(unittest.TestCase):
   @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
   def test_precompute_freqs_cis(self):
     from extra.models.llama import precompute_freqs_cis
-    args = {"dim":32 if CI else 128, "end":2048 if CI else 8192, "theta":10000}
+    args = {"dim":32, "end":2048, "theta":10000}
     fused = precompute_freqs_cis(**args)
     run_schedule(check_schedule(fused, 1))
     if getenv("CHECK", 1):
diff --git a/test/unit/test_disk_tensor.py b/test/unit/test_disk_tensor.py
index 9b0aed2b0b..17b8c4672a 100644
--- a/test/unit/test_disk_tensor.py
+++ b/test/unit/test_disk_tensor.py
@@ -3,7 +3,8 @@
 import numpy as np
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.dtype import DType
 from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
-from tinygrad.helpers import Timing, fetch, temp, CI, OSX
+from tinygrad.helpers import Timing, fetch, temp, OSX
+from test.helpers import slow
 from tinygrad.device import is_dtype_supported

 def compare_weights_both(url):
@@ -340,8 +341,8 @@ class TestDiskTensor(unittest.TestCase):
     on_dev = t.to(Device.DEFAULT).realize()
     np.testing.assert_equal(on_dev.numpy(), t.numpy())

+  @slow
   def test_copy_from_disk_huge(self):
-    if CI and not hasattr(Device["DISK"], 'io_uring'): self.skipTest("slow on ci without iouring")
     fn = pathlib.Path(temp("dt_copy_from_disk_huge"))
     fn.unlink(missing_ok=True)
diff --git a/test/unit/test_dtype_spec.py b/test/unit/test_dtype_spec.py
index c84e1ebcf7..a3dfaea9e0 100644
--- a/test/unit/test_dtype_spec.py
+++ b/test/unit/test_dtype_spec.py
@@ -2,7 +2,8 @@
 import unittest, math, operator, subprocess, struct
 from tinygrad.tensor import Tensor, dtypes, Device
 from tinygrad.dtype import DType, DTYPES_DICT, truncate, float_to_fp16, float_to_bf16, _to_np_dtype, least_upper_dtype, least_upper_float
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import getenv, CI, DEBUG
+from tinygrad.helpers import getenv, DEBUG
+from test.helpers import slow
 from hypothesis import given, settings, strategies as strat
 import numpy as np
 import torch
@@ -594,7 +595,7 @@ class TestAutoCastType(unittest.TestCase):
     dtypes.default_float = old_default_float

   @unittest.skipIf(Device.DEFAULT == "PYTHON", "very slow")
-  @unittest.skipIf(CI and Device.DEFAULT == "AMD", "very slow")
+  @slow
   @unittest.skipIf(Device.DEFAULT == "WEBGPU", "Binding size is larger than the maximum storage buffer binding size")
   @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
   def test_mean_half_precision_underflow(self):
diff --git a/test/unit/test_hashing.py b/test/unit/test_hashing.py
index 6024b3090e..cbee0f6bfb 100644
--- a/test/unit/test_hashing.py
+++ b/test/unit/test_hashing.py
@@ -1,8 +1,8 @@
 from typing_extensions import Callable
 import hashlib, random, unittest
 from tinygrad import Tensor, Device, getenv, dtypes
+from test.helpers import slow
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import CI
 from tinygrad.uop.ops import UOp
 from tinygrad.engine.jit import TinyJit
@@ -58,7 +58,7 @@ class TestKeccak(unittest.TestCase):
     self.assertEqual(bytes(Tensor(b"abc").keccak().tolist()),
                      bytearray.fromhex("3a985da74fe225b2 045c172d6bd390bd 855f086e3e9d525b 46bfe24511431532"))

-  @unittest.skipIf(CI, "times out in ci")
+  @slow
   def test_long(self):
     data = b"\x00" * 4
     self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16))
@@ -74,7 +74,7 @@ class TestKeccak(unittest.TestCase):
     self.assertEqual(bytes(out[1].tolist()), bytearray.fromhex("3a985da74fe225b2 045c172d6bd390bd 855f086e3e9d525b 46bfe24511431532"))
     self.assertEqual(bytes(out[2].tolist()), bytearray.fromhex("8e0d8f672252acb0 ffc5093db8653b18 1513bf9a2097e737 b4f73533dcaf46df"))

-  @unittest.skipIf(CI, "redundant with test_variable_bs")
+  @slow
   def test_variable_bs_jit(self):
     def f(data): return data.keccak()
diff --git a/test/unit/test_helpers.py b/test/unit/test_helpers.py
index be2f7192f0..63eca3f705 100644
--- a/test/unit/test_helpers.py
+++ b/test/unit/test_helpers.py
@@ -1,6 +1,6 @@
 import ctypes, gzip, unittest, timeit
 from tinygrad import Variable
-from tinygrad.helpers import Context, ContextVar, argfix, colored, word_wrap, is_numpy_ndarray, CI, mv_address, get_contraction
+from tinygrad.helpers import Context, ContextVar, argfix, colored, word_wrap, is_numpy_ndarray, mv_address, get_contraction
 from tinygrad.helpers import merge_dicts, strip_parens, prod, round_up, fetch, fully_flatten, from_mv, to_mv, polyN, time_to_str, cdiv, cmod, getbits
 from tinygrad.tensor import Tensor, get_shape
 import numpy as np
@@ -198,7 +198,7 @@ class TestMemoryview(unittest.TestCase):
     mv[0] = 2
     assert base[0] == 2

-  @unittest.skipIf(CI, "dangerous for CI, it allocates tons of memory")
+  @unittest.skip("allocates tons of memory")
   def test_to_mv(self):
     sizes = [
       (16, "16 B"),
diff --git a/test/unit/test_indexing.py b/test/unit/test_indexing.py
index 1b38e52190..ba82e9ba02 100644
--- a/test/unit/test_indexing.py
+++ b/test/unit/test_indexing.py
@@ -5,7 +5,8 @@
 import numpy as np

 from tinygrad import Tensor, dtypes, Device, TinyJit
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import CI, all_same, prod
+from tinygrad.helpers import all_same, prod
+from test.helpers import slow

 random.seed(42)
@@ -1140,7 +1141,7 @@ def get_set_tensor(indexed: Tensor, indexer):
   set_tensor = Tensor.randint(set_count, high=set_count).reshape(set_size) #.cast(dtypes.float64)
   return set_tensor

-@unittest.skipIf(CI and Device.DEFAULT in ["CPU", "CL", "METAL", "NV", "AMD"], "slow")
+@slow
 class TestAdvancedIndexing(unittest.TestCase):
   def test_integer_array_indexing(self):
     # pick a random valid indexer type
diff --git a/test/unit/test_shm_tensor.py b/test/unit/test_shm_tensor.py
index ecb2846d13..2b2b07c052 100644
--- a/test/unit/test_shm_tensor.py
+++ b/test/unit/test_shm_tensor.py
@@ -20,7 +20,7 @@ class TestRawShmBuffer(unittest.TestCase):
     assert np.allclose(t.numpy(), t2.numpy())
     s.unlink()

-  @unittest.skipIf(CI, "CI doesn't like big shared memory")
+  @unittest.skip("big shared memory")
  def test_e2e_big(self):
     # bigger than this doesn't work on Linux, maybe this is a limit somewhere?
     t = Tensor.randn(2048, 128, 8).realize()
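Note (not part of the patch): a minimal, self-contained sketch of how the `slow` marker added in `test/helpers.py` behaves. The test class and method names below are hypothetical, for illustration only:

```python
# illustration only: mirrors the `slow` decorator this diff adds to test/helpers.py
import os, unittest

# skipUnless skips when its condition is falsy; os.getenv returns None when RUN_SLOW
# is unset, so slow tests are skipped by default and included when the var is set
slow = unittest.skipUnless(os.getenv("RUN_SLOW"), "slow test, set RUN_SLOW=1 to run")

class TestExample(unittest.TestCase):  # hypothetical test class
  @slow
  def test_expensive(self):
    self.assertEqual(sum(range(10)), 45)

if __name__ == "__main__":
  # `python this_file.py` skips test_expensive; `RUN_SLOW=1 python this_file.py` runs it
  unittest.main()
```

One consequence of checking `os.getenv` directly is that any non-empty value enables the slow tests, including `RUN_SLOW=0`; the gate is presence of the variable, not its value.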