mark slow tests as slow instead of as CI (#13736)

* mark slow tests as slow instead of as CI

* CI shouldn't have different behavior

* more skips / CI

* slow
George Hotz
2025-12-17 10:29:57 -04:00
committed by GitHub
parent 9015a22523
commit 3dbde178c1
26 changed files with 80 additions and 264 deletions
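
The pattern in brief: instead of probing the CI environment and silently skipping (or shrinking) tests when it is set, slow tests now carry an explicit opt-in marker that behaves the same everywhere. A minimal sketch of the before and after, assuming a stand-in CI flag in place of tinygrad.helpers.CI and placeholder test bodies:

import os, unittest

CI = os.getenv("CI", "") != ""  # stand-in for tinygrad.helpers.CI

class Before(unittest.TestCase):
    @unittest.skipIf(CI, "skip in CI")  # old: CI runs behave differently
    def test_copy_64bit(self): ...

slow = unittest.skipUnless(os.getenv("RUN_SLOW"), "slow test, set RUN_SLOW=1 to run")

class After(unittest.TestCase):
    @slow  # new: opt-in everywhere, CI or local
    def test_copy_64bit(self): ...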

View File

@@ -1,6 +1,7 @@
import unittest, ctypes, struct, os, random, numpy as np
from tinygrad import Device, Tensor, dtypes
from tinygrad.helpers import getenv, CI, mv_address, DEBUG
from tinygrad.helpers import getenv, mv_address, DEBUG
from test.helpers import slow
from tinygrad.device import Buffer, BufferSpec
from tinygrad.runtime.support.hcq import HCQCompiled, HCQBuffer
from tinygrad.runtime.autogen import libc
@@ -220,7 +221,7 @@ class TestHCQ(unittest.TestCase):
mv_buf1 = buf1.as_buffer().cast('Q')
assert libc.memcmp(mv_address(mv_buf1), buf2._buf.va_addr, sz) == 0
@unittest.skipIf(CI, "skip in CI")
@slow
def test_copy_64bit(self):
if TestHCQ.d0.hw_copy_queue_t is None: self.skipTest("device does not support copy queue")

View File

@@ -2,12 +2,11 @@ import unittest
from tinygrad import Device
from tinygrad.device import Buffer
from tinygrad.dtype import dtypes
from tinygrad.helpers import CI
from tinygrad.runtime.ops_cl import CLDevice, CLAllocator, CLCompiler, CLProgram
@unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
class TestCLError(unittest.TestCase):
@unittest.skipIf(CI, "dangerous for CI, it allocates tons of memory")
@unittest.skip("allocates tons of memory")
def test_oom(self):
with self.assertRaises(RuntimeError) as err:
allocator = CLAllocator(CLDevice())

View File

@@ -261,7 +261,7 @@ class TestHCQ(unittest.TestCase):
et = _time_queue(q, TestHCQ.d0)
gb_s = (SZ/1e9)/et
print(f"same device copy: {et*1e3:.2f} ms, {gb_s:.2f} GB/s")
assert (0.3 if CI else 10) <= gb_s <= 1000
assert 0.3 <= gb_s <= 1000
def test_cross_device_copy_bandwidth(self):
SZ = 2_000_000_000
@@ -273,7 +273,7 @@ class TestHCQ(unittest.TestCase):
et = _time_queue(q, TestHCQ.d0)
gb_s = (SZ/1e9)/et
print(f"cross device copy: {et*1e3:.2f} ms, {gb_s:.2f} GB/s")
assert (0.3 if CI else 2) <= gb_s <= 50
assert 0.3 <= gb_s <= 50
def test_interleave_compute_and_copy(self):
q = TestHCQ.compute_queue()

View File

@@ -1,12 +1,12 @@
import unittest
from tinygrad.helpers import CI
from test.helpers import slow
from examples.mamba import Mamba, generate
from transformers import AutoTokenizer
PROMPT = 'Why is gravity '
TOKENIZER = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
@unittest.skipIf(CI, "model is slow for CI")
@slow
class TestMamba(unittest.TestCase):
def test_mamba_130M(self):
OUT_130M = '''Why is gravity \nnot a good idea?\n\nA:'''

View File

@@ -8,11 +8,11 @@ from hypothesis.extra import numpy as stn
import numpy as np
import torch
from tinygrad import Tensor
from tinygrad.helpers import CI, getenv
from tinygrad.helpers import getenv
settings.register_profile(__file__, settings.default,
max_examples=100 if CI else 250, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
max_examples=100, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
# torch wraparound for large numbers
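
For context on the hunk above: hypothesis profiles are registered against a parent settings object and only take effect once loaded; the change pins max_examples at a flat 100 instead of branching on CI. A minimal sketch of the register/load flow (the profile name here is illustrative):

from hypothesis import settings

settings.register_profile("fixed", settings.default, max_examples=100, deadline=None)
settings.load_profile("fixed")  # a registered profile does nothing until loaded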

View File

@@ -1,4 +1,4 @@
import time, struct, functools
import os, time, struct, functools, unittest
from typing import Any, Callable
import numpy as np
from tinygrad import Tensor, dtypes, Device
@@ -9,6 +9,9 @@ from tinygrad.dtype import DType
from tinygrad.nn.state import get_parameters
from tinygrad.helpers import T, CI
from tinygrad.codegen import full_rewrite
# decorator to skip slow tests by default, run with RUN_SLOW=1 to include them
slow = unittest.skipUnless(os.getenv("RUN_SLOW"), "slow test, set RUN_SLOW=1 to run")
from tinygrad.runtime.ops_python import PythonProgram, PythonRenderer, PythonCompiler
def derandomize_model(model):
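
Two properties of the new helper do the work in the rest of this commit: unittest skip decorators apply to classes as well as methods (a class-level @slow skips every test in the class), and unittest.skipUnless skips only when its condition is falsy, so any non-empty RUN_SLOW value opts in. A usage sketch with placeholder test names:

import os, unittest

slow = unittest.skipUnless(os.getenv("RUN_SLOW"), "slow test, set RUN_SLOW=1 to run")

@slow  # class level: every test in the class is skipped unless RUN_SLOW is set
class TestBigModel(unittest.TestCase):
    def test_forward(self): ...

class TestMixed(unittest.TestCase):
    @slow  # method level: only this test is gated
    def test_huge_copy(self): ...

# example invocation: RUN_SLOW=1 python -m pytest test/test_nn.py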

View File

@@ -4,7 +4,8 @@ import numpy as np
from PIL import Image
from tinygrad import Tensor
from tinygrad.helpers import getenv, CI
from tinygrad.helpers import getenv
from test.helpers import slow
from extra.models.efficientnet import EfficientNet
from extra.models.vit import ViT
from extra.models.resnet import ResNet50
@@ -56,12 +57,12 @@ class TestEfficientNet(unittest.TestCase):
def tearDownClass(cls):
del cls.model
@unittest.skipIf(CI, "covered by test_chicken_car")
@slow
def test_chicken(self):
labels = _infer(self.model, chicken_img)
self.assertEqual(_LABELS[labels[0]], "hen")
@unittest.skipIf(CI, "covered by test_chicken_car")
@slow
def test_car(self):
labels = _infer(self.model, car_img)
self.assertEqual(_LABELS[labels[0]], "sports car, sport car")

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad import Tensor, Device
from tinygrad.helpers import CI
from tinygrad import Tensor
from test.helpers import slow
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim, BatchNorm2d
from extra.training import train, evaluate
@@ -49,7 +49,7 @@ class TinyConvNet:
x = x.reshape(shape=[x.shape[0], -1])
return x.dot(self.l1)
@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
@slow
class TestMNIST(unittest.TestCase):
def test_sgd_onestep(self):
np.random.seed(1337)

View File

@@ -5,11 +5,12 @@ from tinygrad.nn import optim
from tinygrad.nn.state import get_parameters
from tinygrad.engine.jit import TinyJit
from tinygrad import Tensor, Device, GlobalCounters, dtypes, Variable
from tinygrad.helpers import CI, Context
from tinygrad.helpers import Context
from test.helpers import slow
from extra.lr_scheduler import OneCycleLR
from test.helpers import derandomize_model
from examples.gpt2 import Transformer as GPT2Transformer, MODEL_PARAMS as GPT2_MODEL_PARAMS
from examples.gpt2 import Transformer as GPT2Transformer
from examples.hlb_cifar10 import SpeedyResNet, hyp
from examples.llama import Transformer as LLaMaTransformer
from examples.stable_diffusion import UNetModel, unet_params
@@ -20,7 +21,7 @@ global_mem_used = 0
def helper_test(nm, gen, model, max_memory_allowed, max_kernels_allowed, all_jitted=False):
with Context(JIT=2):
tms = []
for _ in range(2 if CI else 4):
for _ in range(2):
early_gen = [x.realize() if isinstance(x, Tensor) else x for x in gen()]
GlobalCounters.reset()
Device[Device.DEFAULT].synchronize()
@@ -52,7 +53,7 @@ class TestRealWorld(unittest.TestCase):
def tearDown(self):
dtypes.default_float = self.old_float
@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow, covered by METAL")
@slow
@unittest.skipUnless(is_dtype_supported(dtypes.float16), "need dtypes.float16")
def test_stable_diffusion(self):
params = unet_params
@@ -92,14 +93,14 @@ class TestRealWorld(unittest.TestCase):
dtypes.default_float = dtypes.float16
args_tiny = {"dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-5, "vocab_size": 1000}
model = GPT2Transformer(**(args_tiny if CI else GPT2_MODEL_PARAMS["gpt2-medium"]))
model = GPT2Transformer(**args_tiny)
derandomize_model(model)
@TinyJit
def test(t, v):
with Context(JIT=0): return model(t, v).realize()
helper_test("test_gpt2", lambda: (Tensor([[1,]]),Variable("pos", 1, 100).bind(1)), test, 0.23 if CI else 0.9, 160 if CI else 468, all_jitted=True)
helper_test("test_gpt2", lambda: (Tensor([[1,]]),Variable("pos", 1, 100).bind(1)), test, 0.23, 160, all_jitted=True)
@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
@slow
def test_train_mnist(self):
from examples.beautiful_mnist import Model
with Tensor.train():
@@ -117,7 +118,7 @@ class TestRealWorld(unittest.TestCase):
helper_test("train_mnist", lambda: (Tensor.randn(BS, 1, 28, 28),), train, 0.017, 103)
@unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow")
@slow
def test_forward_cifar(self):
BS = 32
# with training batchnorm still though
@@ -127,7 +128,7 @@ class TestRealWorld(unittest.TestCase):
def run(X): return model(X)
helper_test("forward_cifar", lambda: (Tensor.randn(BS, 3, 32, 32),), run, 0.033, 27)
@unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow")
@slow
def test_train_cifar(self):
with Tensor.train():
model = SpeedyResNet(Tensor.ones((12,3,2,2)))
@@ -157,7 +158,7 @@ class TestRealWorld(unittest.TestCase):
final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=4)
assert not np.isnan(lr_scheduler.min_lr), "lr too small or initial_div_factor too big for half"
@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
@slow
def test_bert(self):
with Tensor.train():
args_tiny = {"attention_probs_dropout_prob": 0.0, "hidden_dropout_prob": 0.0, "vocab_size": 30522, "type_vocab_size": 2,

View File

@@ -3,7 +3,8 @@ import numpy as np
from tinygrad import Device
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim
from tinygrad.helpers import getenv, CI
from tinygrad.helpers import getenv
from test.helpers import slow
from extra.training import train
from extra.models.convnext import ConvNeXt
from extra.models.efficientnet import EfficientNet
@@ -38,7 +39,7 @@ class TestTrain(unittest.TestCase):
train_one_step(model,X,Y)
check_gc()
@unittest.skipIf(CI, "slow")
@slow
def test_efficientnet(self):
model = EfficientNet(0)
X = np.zeros((BS,3,224,224), dtype=np.float32)
@@ -46,7 +47,7 @@ class TestTrain(unittest.TestCase):
train_one_step(model,X,Y)
check_gc()
@unittest.skipIf(CI, "slow")
@slow
def test_vit(self):
model = ViT()
X = np.zeros((BS,3,224,224), dtype=np.float32)
@@ -54,7 +55,7 @@ class TestTrain(unittest.TestCase):
train_one_step(model,X,Y)
check_gc()
@unittest.skipIf(CI, "slow")
@slow
def test_transformer(self):
# this should be small GPT-2, but the param count is wrong
# (real ff_dim is 768*4)
@@ -64,7 +65,7 @@ class TestTrain(unittest.TestCase):
train_one_step(model,X,Y)
check_gc()
@unittest.skipIf(CI, "slow")
@slow
def test_resnet(self):
X = np.zeros((BS, 3, 224, 224), dtype=np.float32)
Y = np.zeros((BS), dtype=np.int32)

View File

@@ -2,7 +2,8 @@ import unittest
import pathlib
from examples.whisper import init_whisper, load_file_waveform, transcribe_file, transcribe_waveform
import examples.mlperf.metrics as metrics
from tinygrad.helpers import CI, fetch, CPU_LLVM
from tinygrad.helpers import fetch
from test.helpers import slow
from tinygrad import Device, dtypes
from tinygrad.device import is_dtype_supported
@@ -75,11 +76,11 @@ class TestWhisper(unittest.TestCase):
def test_transcribe_file1(self):
self.assertEqual(transcribe_file(self.model, self.enc, TEST_FILE_1), TRANSCRIPTION_1)
@unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too many tests for CI")
@slow
def test_transcribe_file2(self):
self.assertEqual(transcribe_file(self.model, self.enc, TEST_FILE_2), TRANSCRIPTION_2)
@unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too many tests for CI")
@slow
def test_transcribe_batch12(self):
waveforms = [load_file_waveform(TEST_FILE_1), load_file_waveform(TEST_FILE_2)]
transcriptions = transcribe_waveform(self.model, self.enc, waveforms)
@@ -95,14 +96,14 @@ class TestWhisper(unittest.TestCase):
self.assertEqual(TRANSCRIPTION_1, transcriptions[1])
@unittest.skip("file 3 url is broken")
@unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too long for CI")
@slow
def test_transcribe_long(self):
waveform = [load_file_waveform(fetch(TEST_FILE_3_URL))]
transcription = transcribe_waveform(self.model, self.enc, waveform)
self.assertWER(transcription, TRANSCRIPTION_3, 0.085)
@unittest.skip("file 3 url is broken")
@unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too long for CI")
@slow
def test_transcribe_long_no_batch(self):
waveforms = [load_file_waveform(fetch(TEST_FILE_3_URL)), load_file_waveform(TEST_FILE_1)]

View File

@@ -7,7 +7,8 @@ from tinygrad.tensor import _to_np_dtype
from tinygrad.uop.ops import Ops
from tinygrad.dtype import DType
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import AMX, CI, AMD_LLVM, CPU_LLVM
from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM
from test.helpers import slow
from tinygrad.engine.realize import CompiledRunner, get_program
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
@@ -119,7 +120,7 @@ class TestTensorCores(unittest.TestCase):
helper_tc_ensure_uops_and_opts_count(tc.dims[0], tc.dims[1], tc.dims[2]//8, tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)
@unittest.skipIf(Device.DEFAULT == "PYTHON", "not generated on EMULATED device")
@unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI is really slow here")
@slow
@unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
def test_tensor_cores_multi_reduce(self):
for tc in Device[Device.DEFAULT].renderer.tensor_cores:

View File

@@ -1,7 +1,7 @@
import unittest
import numpy as np
from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable
from tinygrad.helpers import CI, Context, getenv
from tinygrad.helpers import Context, getenv
from tinygrad.engine.realize import run_schedule
from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
from tinygrad.uop.ops import Ops
@@ -143,7 +143,7 @@ class TestIndexing(unittest.TestCase):
def test_llama_embedding(self, noopt=1, op_limit=65536):
# llama3 is 128256
vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
vocab_size, embed_size = (10, 3)
emb = nn.Embedding(vocab_size, embed_size)
emb_w = emb.weight.numpy()
x = Tensor([1,2,3,4])
@@ -161,7 +161,7 @@ class TestIndexing(unittest.TestCase):
# TODO: reshape to match torch, should we do this in nn?
np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
# at least the arange is being fused
def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1_736_704_000 if CI else 5_898_240_000)
def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1_736_704_000)
if __name__ == "__main__":
unittest.main()

View File

@@ -3,7 +3,7 @@ import functools, unittest, ctypes
from tinygrad.device import Device, Buffer
from tinygrad.tensor import Tensor, _to_np_dtype
from tinygrad.helpers import Context, CI, dedup, from_mv
from tinygrad.helpers import Context, dedup, from_mv
from tinygrad.dtype import dtypes
from tinygrad.engine.jit import MultiGraphRunner
from tinygrad.engine.realize import ExecItem, BufferXfer, get_runner, CompiledRunner
@@ -12,8 +12,8 @@ from test.helpers import needs_second_gpu
np.random.seed(1337)
Tensor.manual_seed(1337)
BUF_SIZE = 4096 if CI else 4096 * 128
RUN_CNT = 4 if CI else 32
BUF_SIZE = 4096
RUN_CNT = 4
cached_prgs = {}
def helper_exec_op(device, outbuf, inbufs):

View File

@@ -2,12 +2,12 @@ import unittest, functools, random
from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes, Variable
from tinygrad.device import is_dtype_supported
from tinygrad.uop.ops import Ops, UOp
from tinygrad.helpers import CI, getenv, prod, Context
from tinygrad.helpers import getenv, prod, Context
from tinygrad.nn.state import get_parameters, get_state_dict
from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
import numpy as np
from hypothesis import given, strategies as strat, settings
from test.helpers import REAL_DEV, not_support_multi_device, needs_second_gpu
from test.helpers import not_support_multi_device, needs_second_gpu, slow
settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
settings.load_profile("my_profile")
@@ -420,7 +420,7 @@ class TestMultiTensor(unittest.TestCase):
np.testing.assert_allclose(y.numpy(), y_shard.numpy(), atol=1e-6, rtol=1e-6)
# NOTE: this is failing on LLVM CI, no idea why. Works locally.
@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "CPU", "AMD"), "slow, and flaky on CPU")
@slow
def test_data_parallel_resnet(self):
from extra.models.resnet import ResNet18
@@ -456,7 +456,7 @@ class TestMultiTensor(unittest.TestCase):
# sometimes there is zeros in these grads... why?
np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)
@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "CPU", "AMD"), "slow, and flaky on CPU")
@slow
@unittest.skip("TODO: pm_rangeify hangs")
def test_data_parallel_resnet_train_step(self):
from extra.models.resnet import ResNet18

View File

@@ -4,14 +4,14 @@ import numpy as np
import torch
from tinygrad import Tensor, Device, TinyJit, dtypes
from tinygrad.uop.ops import Ops
from tinygrad.helpers import GlobalCounters, CI, Context
from tinygrad.helpers import GlobalCounters, Context
from tinygrad.nn import Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, Embedding
from tinygrad.nn import BatchNorm, LayerNorm, LayerNorm2d, GroupNorm, InstanceNorm, RMSNorm, LSTMCell
from tinygrad.nn.state import load_state_dict
from tinygrad.engine.realize import run_schedule
from test.helpers import not_support_multi_device, needs_second_gpu
from test.helpers import not_support_multi_device, needs_second_gpu, slow
@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
@slow
class TestNN(unittest.TestCase):
def test_batchnorm2d(self, training=False, threed=False, track_running_stats=True):
with Tensor.train(training):

View File

@@ -11,8 +11,7 @@ if getenv("TINY_BACKEND"):
import tinygrad.nn.torch # noqa: F401 # pylint: disable=unused-import
torch.set_default_device("tiny")
if CI:
warnings.filterwarnings("ignore", message="Non-empty compiler output encountered")
warnings.filterwarnings("ignore", message="Non-empty compiler output encountered")
FORWARD_ONLY = getenv("FORWARD_ONLY", 0)
PRINT_TENSORS = getenv("PRINT_TENSORS", 0)

View File

@@ -3,9 +3,8 @@ import torch
import unittest
from tinygrad import Tensor, Device, dtypes
from tinygrad.nn.optim import Adam, SGD, AdamW, Muon, LAMB
from tinygrad.helpers import CI
from tinygrad.device import is_dtype_supported
from test.helpers import needs_second_gpu
from test.helpers import needs_second_gpu, slow
np.random.seed(1337)
x_init = np.random.randn(1,4).astype(np.float32)
@@ -42,7 +41,7 @@ def step(tensor, optim, steps=1, teeny=False, **kwargs):
optim.step()
return net.x.detach().numpy(), net.W.detach().numpy()
@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
@slow
class TestOptim(unittest.TestCase):
def setUp(self):
self.old_training = Tensor.training

View File

@@ -1,6 +1,6 @@
import unittest
from tinygrad import Tensor, nn, Device
from tinygrad.helpers import Context, GlobalCounters, CI, getenv, PCONTIG, DEBUG
from tinygrad.helpers import Context, GlobalCounters, getenv, PCONTIG, DEBUG
from tinygrad.uop.ops import graph_rewrite, PatternMatcher, UPat, Ops
from tinygrad.codegen.opt import OptOps, Opt
from tinygrad.renderer.ptx import PTXRenderer
@@ -153,199 +153,6 @@ class TestPcontig(unittest.TestCase):
opts += (Opt(OptOps.UPCAST, 4, 4),)
self.test_flash_attention(opts)
# *** non CI rangeify tests below this line ***
N = 256
@unittest.skipIf(CI, "useless in CI, doesn't test anything")
class TestRangeifyOpt(unittest.TestCase):
def test_randperm(self):
Tensor.randperm(10000).realize()
def test_one_getitem(self):
X = Tensor.empty(10000)
sel = Tensor.arange(1000).contiguous().realize()
Xsel = X[sel]
Tensor.realize(Xsel)
def test_two_getitem(self):
# this is splitting on the child even when it really shouldn't
X = Tensor.empty(10000)
Y = Tensor.empty(10000)
sel = Tensor.arange(1000).contiguous().realize()
Xsel, Ysel = X[sel], Y[sel]
Tensor.realize(Xsel, Ysel)
def test_resnetconv(self):
conv1 = nn.Conv2d(3, 8, kernel_size=7, stride=2, bias=False, padding=3)
conv1.weight.replace(conv1.weight.empty_like())
x = Tensor.empty(1, 3, 56, 56)
x = conv1(x).pad([1,1,1,1])+1
x.realize()
# CPU=1 NOOPT=1 DEBUG=4 RANGEIFY=1 python3 test/test_rangeify.py TestRangeifyOpt.test_matmul_reshaped
def test_matmul_reshaped(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
(A@B).reshape(N*N).contiguous().realize()
def test_reduce_reshapes(self):
A = Tensor.empty(8,8,8,8).permute(1,0,3,2).flatten()
A.sum().realize()
@unittest.skipIf(CI, "useless in CI, doesn't test anything")
class TestRangeify(unittest.TestCase):
def test_groupnorm(self):
# ranges 1 and 3 are merging
x = nn.GroupNorm(32, 128)
x(Tensor.empty(1, 128, 64, 64)).realize()
def test_expand_children(self):
A = Tensor.empty(N, N).sum(axis=1)
ba = A.expand(N, N)
((ba+1).sum(axis=1) + (ba+2).sum(axis=0)).realize()
def test_partial_contig(self):
A = Tensor.empty(64, 64, 64)
ret = A.sum(axis=2).contiguous(arg=(1,)).sum(axis=1)
ret.realize()
@unittest.skip("RANGEIFY=0 does nothing")
def test_double_gemm_real(self):
def go():
with Context(DEBUG=0):
Tensor.manual_seed(1337)
A,B,C = [Tensor.randn(N, N) for _ in range(3)]
Tensor.realize(A, B, C)
GlobalCounters.reset()
return (A@B@C).realize()
rng = go()
with Context(RANGEIFY=0, DEBUG=2):
ref = go()
mse = ((rng-ref)**2).sum().item()
print(f"mse: {mse}")
self.assertLessEqual(mse, 1e-2)
def test_double_gemm(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
(A@B@C).realize()
def test_double_gemm_exp(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
(((A@B).exp()@C).exp()).realize()
def test_double_gemm_exp_child(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
# A@B is used with exp, and also on the sum. this is two kernels now, is this right?
ret = A@B
((ret.exp()@C)+ret).realize()
def test_double_gemm_relu(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
(((A@B).relu()@C).relu()).realize()
def test_double_gemm_relu_half_contig(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
(((A@B).relu().contiguous(arg=(1,))@C).relu()).realize()
def test_double_gemm_half_contig(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
((A@B).contiguous(arg=(1,))@C).realize()
def test_double_gemm_contig(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
((A@B).contiguous()@C).realize()
def test_many_gemm(self):
A = Tensor.empty(N, N)
B = Tensor.empty(N, N)
C = Tensor.empty(N, N)
D = Tensor.empty(N, N)
E = Tensor.empty(N, N)
F = Tensor.empty(N, N)
(A@B@C@D@E@F).realize()
def test_conv2d(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
x.conv2d(w1).realize()
def test_conv2d_elu(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
x.conv2d(w1).elu().realize()
def test_conv2d_t(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
(x*2).conv2d(w1).realize()
def test_double_conv2d(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
w2 = Tensor.empty(12, 8, 3, 3)
x.conv2d(w1).conv2d(w2).realize()
def test_resnet_conv2d(self):
x = Tensor.empty(1, 8, 32, 32)
w1 = Tensor.empty(8, 8, 3, 3)
w2 = Tensor.empty(8, 8, 1, 1)
x.conv2d(w1).conv2d(w2).realize()
def test_xception_conv2d(self):
# NOTE: this fusion is bad, it's recomputing the inner many times
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 1, 1)
w2 = Tensor.empty(8, 1, 3, 3)
x.conv2d(w1).conv2d(w2, groups=8).realize()
def test_conv_maxpool_contig(self): self.test_conv_maxpool(True)
def test_conv_maxpool(self, contig=False):
GlobalCounters.reset()
x = Tensor.empty(32, 16, 64, 64)
l1 = nn.Conv2d(16, 16, 3)
for p in nn.state.get_parameters(l1): p.replace(Tensor.empty(p.shape))
x = l1(x)
if contig: x = x.contiguous()
x.max_pool2d().realize()
def test_double_conv2d_half_contig(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
w2 = Tensor.empty(12, 8, 3, 3)
# NOTE: this contiguous doesn't help
x.conv2d(w1).contiguous(arg=(1,)).conv2d(w2).permute(0,2,3,1).contiguous().realize()
def test_double_conv2d_contig(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
w2 = Tensor.empty(12, 8, 3, 3)
x.conv2d(w1).contiguous().conv2d(w2).realize()
def test_transformer_ffn(self):
from tinygrad.apps.llm import TransformerBlock
from tinygrad import nn
blk = TransformerBlock(1024, 4096, 1, 1, 1e-5, head_dim=1024, rope_theta=10000.0)
for p in nn.state.get_parameters(blk): p.replace(Tensor.empty(p.shape))
x = Tensor.empty(128, 1024)
out = blk._feed_forward(x)
out.realize()
# contiguous + reduce can support ranges?
@unittest.skip("pm_rangeify no longer exists. test this in a different way")

View File

@@ -1755,7 +1755,7 @@ class TestSchedule(unittest.TestCase):
@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
def test_precompute_freqs_cis(self):
from extra.models.llama import precompute_freqs_cis
args = {"dim":32 if CI else 128, "end":2048 if CI else 8192, "theta":10000}
args = {"dim":32, "end":2048, "theta":10000}
fused = precompute_freqs_cis(**args)
run_schedule(check_schedule(fused, 1))
if getenv("CHECK", 1):

View File

@@ -3,7 +3,8 @@ import numpy as np
from tinygrad import Tensor, Device, dtypes
from tinygrad.dtype import DType
from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
from tinygrad.helpers import Timing, fetch, temp, CI, OSX
from tinygrad.helpers import Timing, fetch, temp, OSX
from test.helpers import slow
from tinygrad.device import is_dtype_supported
def compare_weights_both(url):
@@ -340,8 +341,8 @@ class TestDiskTensor(unittest.TestCase):
on_dev = t.to(Device.DEFAULT).realize()
np.testing.assert_equal(on_dev.numpy(), t.numpy())
@slow
def test_copy_from_disk_huge(self):
if CI and not hasattr(Device["DISK"], 'io_uring'): self.skipTest("slow on ci without iouring")
fn = pathlib.Path(temp("dt_copy_from_disk_huge"))
fn.unlink(missing_ok=True)

View File

@@ -2,7 +2,8 @@ import unittest, math, operator, subprocess, struct
from tinygrad.tensor import Tensor, dtypes, Device
from tinygrad.dtype import DType, DTYPES_DICT, truncate, float_to_fp16, float_to_bf16, _to_np_dtype, least_upper_dtype, least_upper_float
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import getenv, CI, DEBUG
from tinygrad.helpers import getenv, DEBUG
from test.helpers import slow
from hypothesis import given, settings, strategies as strat
import numpy as np
import torch
@@ -594,7 +595,7 @@ class TestAutoCastType(unittest.TestCase):
dtypes.default_float = old_default_float
@unittest.skipIf(Device.DEFAULT == "PYTHON", "very slow")
@unittest.skipIf(CI and Device.DEFAULT == "AMD", "very slow")
@slow
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "Binding size is larger than the maximum storage buffer binding size")
@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
def test_mean_half_precision_underflow(self):

View File

@@ -1,8 +1,8 @@
from typing_extensions import Callable
import hashlib, random, unittest
from tinygrad import Tensor, Device, getenv, dtypes
from test.helpers import slow
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import CI
from tinygrad.uop.ops import UOp
from tinygrad.engine.jit import TinyJit
@@ -58,7 +58,7 @@ class TestKeccak(unittest.TestCase):
self.assertEqual(bytes(Tensor(b"abc").keccak().tolist()),
bytearray.fromhex("3a985da74fe225b2 045c172d6bd390bd 855f086e3e9d525b 46bfe24511431532"))
@unittest.skipIf(CI, "times out in ci")
@slow
def test_long(self):
data = b"\x00" * 4
self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16))
@@ -74,7 +74,7 @@ class TestKeccak(unittest.TestCase):
self.assertEqual(bytes(out[1].tolist()), bytearray.fromhex("3a985da74fe225b2 045c172d6bd390bd 855f086e3e9d525b 46bfe24511431532"))
self.assertEqual(bytes(out[2].tolist()), bytearray.fromhex("8e0d8f672252acb0 ffc5093db8653b18 1513bf9a2097e737 b4f73533dcaf46df"))
@unittest.skipIf(CI, "redundant with test_variable_bs")
@slow
def test_variable_bs_jit(self):
def f(data):
return data.keccak()

View File

@@ -1,6 +1,6 @@
import ctypes, gzip, unittest, timeit
from tinygrad import Variable
from tinygrad.helpers import Context, ContextVar, argfix, colored, word_wrap, is_numpy_ndarray, CI, mv_address, get_contraction
from tinygrad.helpers import Context, ContextVar, argfix, colored, word_wrap, is_numpy_ndarray, mv_address, get_contraction
from tinygrad.helpers import merge_dicts, strip_parens, prod, round_up, fetch, fully_flatten, from_mv, to_mv, polyN, time_to_str, cdiv, cmod, getbits
from tinygrad.tensor import Tensor, get_shape
import numpy as np
@@ -198,7 +198,7 @@ class TestMemoryview(unittest.TestCase):
mv[0] = 2
assert base[0] == 2
@unittest.skipIf(CI, "dangerous for CI, it allocates tons of memory")
@unittest.skip("allocates tons of memory")
def test_to_mv(self):
sizes = [
(16, "16 B"),

View File

@@ -5,7 +5,8 @@ import numpy as np
from tinygrad import Tensor, dtypes, Device, TinyJit
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import CI, all_same, prod
from tinygrad.helpers import all_same, prod
from test.helpers import slow
random.seed(42)
@@ -1140,7 +1141,7 @@ def get_set_tensor(indexed: Tensor, indexer):
set_tensor = Tensor.randint(set_count, high=set_count).reshape(set_size) #.cast(dtypes.float64)
return set_tensor
@unittest.skipIf(CI and Device.DEFAULT in ["CPU", "CL", "METAL", "NV", "AMD"], "slow")
@slow
class TestAdvancedIndexing(unittest.TestCase):
def test_integer_array_indexing(self):
# pick a random valid indexer type

View File

@@ -20,7 +20,7 @@ class TestRawShmBuffer(unittest.TestCase):
assert np.allclose(t.numpy(), t2.numpy())
s.unlink()
@unittest.skipIf(CI, "CI doesn't like big shared memory")
@unittest.skip("big shared memory")
def test_e2e_big(self):
# bigger than this doesn't work on Linux, maybe this is a limit somewhere?
t = Tensor.randn(2048, 128, 8).realize()