mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-06 21:53:53 -05:00
mark slow tests as slow instead of as CI (#13736)
* mark slow tests as slow instead of as CI
* CI shouldn't have different behavior
* more skips / CI
* slow
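What this changes: tests that were previously skipped only when the CI environment variable was set are now gated behind a single @slow marker, defined in the test/helpers.py hunk below. A minimal self-contained sketch of the pattern (the test class and method names here are illustrative, not taken from the diff; the slow definition itself is verbatim from this commit):

import os, unittest

# unittest.skipUnless skips whenever its condition is falsy; os.getenv returns
# None while RUN_SLOW is unset, so slow tests are skipped by default
# everywhere, not just in CI
slow = unittest.skipUnless(os.getenv("RUN_SLOW"), "slow test, set RUN_SLOW=1 to run")

class TestExample(unittest.TestCase):
  @slow
  def test_expensive(self):
    # stands in for a long-running model test
    self.assertEqual(sum(range(10**6)), (10**6 - 1) * 10**6 // 2)

if __name__ == "__main__":
  unittest.main()  # RUN_SLOW=1 python3 this_file.py includes the slow tests
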
@@ -1,6 +1,7 @@
 import unittest, ctypes, struct, os, random, numpy as np
 from tinygrad import Device, Tensor, dtypes
-from tinygrad.helpers import getenv, CI, mv_address, DEBUG
+from tinygrad.helpers import getenv, mv_address, DEBUG
+from test.helpers import slow
 from tinygrad.device import Buffer, BufferSpec
 from tinygrad.runtime.support.hcq import HCQCompiled, HCQBuffer
 from tinygrad.runtime.autogen import libc
@@ -220,7 +221,7 @@ class TestHCQ(unittest.TestCase):
     mv_buf1 = buf1.as_buffer().cast('Q')
     assert libc.memcmp(mv_address(mv_buf1), buf2._buf.va_addr, sz) == 0

-  @unittest.skipIf(CI, "skip in CI")
+  @slow
   def test_copy_64bit(self):
     if TestHCQ.d0.hw_copy_queue_t is None: self.skipTest("device does not support copy queue")

@@ -2,12 +2,11 @@ import unittest
 from tinygrad import Device
 from tinygrad.device import Buffer
 from tinygrad.dtype import dtypes
-from tinygrad.helpers import CI
 from tinygrad.runtime.ops_cl import CLDevice, CLAllocator, CLCompiler, CLProgram

 @unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
 class TestCLError(unittest.TestCase):
-  @unittest.skipIf(CI, "dangerous for CI, it allocates tons of memory")
+  @unittest.skip("allocates tons of memory")
   def test_oom(self):
     with self.assertRaises(RuntimeError) as err:
       allocator = CLAllocator(CLDevice())

test/external/external_test_hcq.py (vendored, 4 changes)
@@ -261,7 +261,7 @@ class TestHCQ(unittest.TestCase):
     et = _time_queue(q, TestHCQ.d0)
     gb_s = (SZ/1e9)/et
     print(f"same device copy: {et*1e3:.2f} ms, {gb_s:.2f} GB/s")
-    assert (0.3 if CI else 10) <= gb_s <= 1000
+    assert 0.3 <= gb_s <= 1000

   def test_cross_device_copy_bandwidth(self):
     SZ = 2_000_000_000
@@ -273,7 +273,7 @@ class TestHCQ(unittest.TestCase):
     et = _time_queue(q, TestHCQ.d0)
     gb_s = (SZ/1e9)/et
     print(f"cross device copy: {et*1e3:.2f} ms, {gb_s:.2f} GB/s")
-    assert (0.3 if CI else 2) <= gb_s <= 50
+    assert 0.3 <= gb_s <= 50

   def test_interleave_compute_and_copy(self):
     q = TestHCQ.compute_queue()

test/external/external_test_mamba.py (vendored, 4 changes)
@@ -1,12 +1,12 @@
 import unittest
-from tinygrad.helpers import CI
+from test.helpers import slow
 from examples.mamba import Mamba, generate
 from transformers import AutoTokenizer

 PROMPT = 'Why is gravity '
 TOKENIZER = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

-@unittest.skipIf(CI, "model is slow for CI")
+@slow
 class TestMamba(unittest.TestCase):
   def test_mamba_130M(self):
     OUT_130M = '''Why is gravity \nnot a good idea?\n\nA:'''

test/external/fuzz_shape_ops.py (vendored, 4 changes)
@@ -8,11 +8,11 @@ from hypothesis.extra import numpy as stn
 import numpy as np
 import torch
 from tinygrad import Tensor
-from tinygrad.helpers import CI, getenv
+from tinygrad.helpers import getenv


 settings.register_profile(__file__, settings.default,
-  max_examples=100 if CI else 250, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
+  max_examples=100, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))


 # torch wraparound for large numbers

@@ -1,4 +1,4 @@
-import time, struct, functools
+import os, time, struct, functools, unittest
 from typing import Any, Callable
 import numpy as np
 from tinygrad import Tensor, dtypes, Device
@@ -9,6 +9,9 @@ from tinygrad.dtype import DType
 from tinygrad.nn.state import get_parameters
 from tinygrad.helpers import T, CI
 from tinygrad.codegen import full_rewrite
+
+# decorator to skip slow tests by default, run with RUN_SLOW=1 to include them
+slow = unittest.skipUnless(os.getenv("RUN_SLOW"), "slow test, set RUN_SLOW=1 to run")
 from tinygrad.runtime.ops_python import PythonProgram, PythonRenderer, PythonCompiler

 def derandomize_model(model):

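Because unittest skip decorators compose and can also be applied to a whole TestCase class, the hunks that follow replace per-method skipIf(CI, ...) lines with @slow at either method or class level. A short sketch of both placements, assuming the slow decorator defined above (class and method names are illustrative):

from test.helpers import slow
import unittest

@slow  # class level: every test in the class is gated on RUN_SLOW
class TestBigModel(unittest.TestCase):
  def test_train(self): ...

class TestMixed(unittest.TestCase):
  def test_fast(self): ...  # always runs

  @slow  # method level: only this test is gated
  def test_heavy(self): ...

Each skip decorator marks the test independently, so @slow stacks cleanly with existing @unittest.skipIf and @unittest.skipUnless lines, as in the whisper and dtype hunks below.
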
@@ -4,7 +4,8 @@ import numpy as np
 from PIL import Image

 from tinygrad import Tensor
-from tinygrad.helpers import getenv, CI
+from tinygrad.helpers import getenv
+from test.helpers import slow
 from extra.models.efficientnet import EfficientNet
 from extra.models.vit import ViT
 from extra.models.resnet import ResNet50
@@ -56,12 +57,12 @@ class TestEfficientNet(unittest.TestCase):
   def tearDownClass(cls):
     del cls.model

-  @unittest.skipIf(CI, "covered by test_chicken_car")
+  @slow
   def test_chicken(self):
     labels = _infer(self.model, chicken_img)
     self.assertEqual(_LABELS[labels[0]], "hen")

-  @unittest.skipIf(CI, "covered by test_chicken_car")
+  @slow
   def test_car(self):
     labels = _infer(self.model, car_img)
     self.assertEqual(_LABELS[labels[0]], "sports car, sport car")

@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 import unittest
 import numpy as np
-from tinygrad import Tensor, Device
-from tinygrad.helpers import CI
+from tinygrad import Tensor
+from test.helpers import slow
 from tinygrad.nn.state import get_parameters
 from tinygrad.nn import optim, BatchNorm2d
 from extra.training import train, evaluate
@@ -49,7 +49,7 @@ class TinyConvNet:
     x = x.reshape(shape=[x.shape[0], -1])
     return x.dot(self.l1)

-@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
+@slow
 class TestMNIST(unittest.TestCase):
   def test_sgd_onestep(self):
     np.random.seed(1337)

@@ -5,11 +5,12 @@ from tinygrad.nn import optim
 from tinygrad.nn.state import get_parameters
 from tinygrad.engine.jit import TinyJit
 from tinygrad import Tensor, Device, GlobalCounters, dtypes, Variable
-from tinygrad.helpers import CI, Context
+from tinygrad.helpers import Context
+from test.helpers import slow
 from extra.lr_scheduler import OneCycleLR
 from test.helpers import derandomize_model

-from examples.gpt2 import Transformer as GPT2Transformer, MODEL_PARAMS as GPT2_MODEL_PARAMS
+from examples.gpt2 import Transformer as GPT2Transformer
 from examples.hlb_cifar10 import SpeedyResNet, hyp
 from examples.llama import Transformer as LLaMaTransformer
 from examples.stable_diffusion import UNetModel, unet_params
@@ -20,7 +21,7 @@ global_mem_used = 0
 def helper_test(nm, gen, model, max_memory_allowed, max_kernels_allowed, all_jitted=False):
   with Context(JIT=2):
     tms = []
-    for _ in range(2 if CI else 4):
+    for _ in range(2):
       early_gen = [x.realize() if isinstance(x, Tensor) else x for x in gen()]
       GlobalCounters.reset()
       Device[Device.DEFAULT].synchronize()
@@ -52,7 +53,7 @@ class TestRealWorld(unittest.TestCase):
   def tearDown(self):
     dtypes.default_float = self.old_float

-  @unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow, covered by METAL")
+  @slow
   @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need dtypes.float16")
   def test_stable_diffusion(self):
     params = unet_params
@@ -92,14 +93,14 @@ class TestRealWorld(unittest.TestCase):
     dtypes.default_float = dtypes.float16

     args_tiny = {"dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-5, "vocab_size": 1000}
-    model = GPT2Transformer(**(args_tiny if CI else GPT2_MODEL_PARAMS["gpt2-medium"]))
+    model = GPT2Transformer(**args_tiny)
     derandomize_model(model)
     @TinyJit
     def test(t, v):
       with Context(JIT=0): return model(t, v).realize()
-    helper_test("test_gpt2", lambda: (Tensor([[1,]]),Variable("pos", 1, 100).bind(1)), test, 0.23 if CI else 0.9, 160 if CI else 468, all_jitted=True)
+    helper_test("test_gpt2", lambda: (Tensor([[1,]]),Variable("pos", 1, 100).bind(1)), test, 0.23, 160, all_jitted=True)

-  @unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
+  @slow
   def test_train_mnist(self):
     from examples.beautiful_mnist import Model
     with Tensor.train():
@@ -117,7 +118,7 @@ class TestRealWorld(unittest.TestCase):

     helper_test("train_mnist", lambda: (Tensor.randn(BS, 1, 28, 28),), train, 0.017, 103)

-  @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow")
+  @slow
   def test_forward_cifar(self):
     BS = 32
     # with training batchnorm still though
@@ -127,7 +128,7 @@ class TestRealWorld(unittest.TestCase):
     def run(X): return model(X)
     helper_test("forward_cifar", lambda: (Tensor.randn(BS, 3, 32, 32),), run, 0.033, 27)

-  @unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow")
+  @slow
   def test_train_cifar(self):
     with Tensor.train():
       model = SpeedyResNet(Tensor.ones((12,3,2,2)))
@@ -157,7 +158,7 @@ class TestRealWorld(unittest.TestCase):
       final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=4)
     assert not np.isnan(lr_scheduler.min_lr), "lr too small or initial_div_facotr too big for half"

-  @unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
+  @slow
   def test_bert(self):
     with Tensor.train():
       args_tiny = {"attention_probs_dropout_prob": 0.0, "hidden_dropout_prob": 0.0, "vocab_size": 30522, "type_vocab_size": 2,

@@ -3,7 +3,8 @@ import numpy as np
 from tinygrad import Device
 from tinygrad.nn.state import get_parameters
 from tinygrad.nn import optim
-from tinygrad.helpers import getenv, CI
+from tinygrad.helpers import getenv
+from test.helpers import slow
 from extra.training import train
 from extra.models.convnext import ConvNeXt
 from extra.models.efficientnet import EfficientNet
@@ -38,7 +39,7 @@ class TestTrain(unittest.TestCase):
     train_one_step(model,X,Y)
     check_gc()

-  @unittest.skipIf(CI, "slow")
+  @slow
   def test_efficientnet(self):
     model = EfficientNet(0)
     X = np.zeros((BS,3,224,224), dtype=np.float32)
@@ -46,7 +47,7 @@ class TestTrain(unittest.TestCase):
     train_one_step(model,X,Y)
     check_gc()

-  @unittest.skipIf(CI, "slow")
+  @slow
   def test_vit(self):
     model = ViT()
     X = np.zeros((BS,3,224,224), dtype=np.float32)
@@ -54,7 +55,7 @@ class TestTrain(unittest.TestCase):
     train_one_step(model,X,Y)
     check_gc()

-  @unittest.skipIf(CI, "slow")
+  @slow
   def test_transformer(self):
     # this should be small GPT-2, but the param count is wrong
     # (real ff_dim is 768*4)
@@ -64,7 +65,7 @@ class TestTrain(unittest.TestCase):
     train_one_step(model,X,Y)
     check_gc()

-  @unittest.skipIf(CI, "slow")
+  @slow
   def test_resnet(self):
     X = np.zeros((BS, 3, 224, 224), dtype=np.float32)
     Y = np.zeros((BS), dtype=np.int32)

@@ -2,7 +2,8 @@ import unittest
 import pathlib
 from examples.whisper import init_whisper, load_file_waveform, transcribe_file, transcribe_waveform
 import examples.mlperf.metrics as metrics
-from tinygrad.helpers import CI, fetch, CPU_LLVM
+from tinygrad.helpers import fetch
+from test.helpers import slow
 from tinygrad import Device, dtypes
 from tinygrad.device import is_dtype_supported

@@ -75,11 +76,11 @@ class TestWhisper(unittest.TestCase):
   def test_transcribe_file1(self):
     self.assertEqual(transcribe_file(self.model, self.enc, TEST_FILE_1), TRANSCRIPTION_1)

-  @unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too many tests for CI")
+  @slow
   def test_transcribe_file2(self):
     self.assertEqual(transcribe_file(self.model, self.enc, TEST_FILE_2), TRANSCRIPTION_2)

-  @unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too many tests for CI")
+  @slow
   def test_transcribe_batch12(self):
     waveforms = [load_file_waveform(TEST_FILE_1), load_file_waveform(TEST_FILE_2)]
     transcriptions = transcribe_waveform(self.model, self.enc, waveforms)
@@ -95,14 +96,14 @@ class TestWhisper(unittest.TestCase):
     self.assertEqual(TRANSCRIPTION_1, transcriptions[1])

   @unittest.skip("file 3 url is broken")
-  @unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too long for CI")
+  @slow
   def test_transcribe_long(self):
     waveform = [load_file_waveform(fetch(TEST_FILE_3_URL))]
     transcription = transcribe_waveform(self.model, self.enc, waveform)
     self.assertWER(transcription, TRANSCRIPTION_3, 0.085)

   @unittest.skip("file 3 url is broken")
-  @unittest.skipIf(CI or (Device.DEFAULT == "CPU" and CPU_LLVM), "too long for CI")
+  @slow
   def test_transcribe_long_no_batch(self):
     waveforms = [load_file_waveform(fetch(TEST_FILE_3_URL)), load_file_waveform(TEST_FILE_1)]

@@ -7,7 +7,8 @@ from tinygrad.tensor import _to_np_dtype
 from tinygrad.uop.ops import Ops
 from tinygrad.dtype import DType
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import AMX, CI, AMD_LLVM, CPU_LLVM
+from tinygrad.helpers import AMX, AMD_LLVM, CPU_LLVM
+from test.helpers import slow
 from tinygrad.engine.realize import CompiledRunner, get_program
 from tinygrad.codegen.opt import Opt, OptOps, KernelOptError

@@ -119,7 +120,7 @@ class TestTensorCores(unittest.TestCase):
     helper_tc_ensure_uops_and_opts_count(tc.dims[0], tc.dims[1], tc.dims[2]//8, tc.dtype_in, tc.dtype_out, tc_opt=2, ensure_triggered=False)

   @unittest.skipIf(Device.DEFAULT == "PYTHON", "not generated on EMULATED device")
-  @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI is really slow here")
+  @slow
   @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
   def test_tensor_cores_multi_reduce(self):
     for tc in Device[Device.DEFAULT].renderer.tensor_cores:

@@ -1,7 +1,7 @@
 import unittest
 import numpy as np
 from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable
-from tinygrad.helpers import CI, Context, getenv
+from tinygrad.helpers import Context, getenv
 from tinygrad.engine.realize import run_schedule
 from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
 from tinygrad.uop.ops import Ops
@@ -143,7 +143,7 @@ class TestIndexing(unittest.TestCase):

   def test_llama_embedding(self, noopt=1, op_limit=65536):
     # llama3 is 128256
-    vocab_size, embed_size = (10, 3) if CI else (32000, 4096)
+    vocab_size, embed_size = (10, 3)
     emb = nn.Embedding(vocab_size, embed_size)
     emb_w = emb.weight.numpy()
     x = Tensor([1,2,3,4])
@@ -161,7 +161,7 @@ class TestIndexing(unittest.TestCase):
     # TODO: reshape to match torch, should we do this in nn?
     np.testing.assert_allclose(z.numpy().reshape(4, embed_size), torch_z.detach().numpy(), atol=1e-8, rtol=1e-8)
   # at least the arange is being fused
-  def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1_736_704_000 if CI else 5_898_240_000)
+  def test_llama_embedding_opt(self): self.test_llama_embedding(0, 1_736_704_000)

 if __name__ == "__main__":
   unittest.main()

@@ -3,7 +3,7 @@ import functools, unittest, ctypes

 from tinygrad.device import Device, Buffer
 from tinygrad.tensor import Tensor, _to_np_dtype
-from tinygrad.helpers import Context, CI, dedup, from_mv
+from tinygrad.helpers import Context, dedup, from_mv
 from tinygrad.dtype import dtypes
 from tinygrad.engine.jit import MultiGraphRunner
 from tinygrad.engine.realize import ExecItem, BufferXfer, get_runner, CompiledRunner
@@ -12,8 +12,8 @@ from test.helpers import needs_second_gpu

 np.random.seed(1337)
 Tensor.manual_seed(1337)
-BUF_SIZE = 4096 if CI else 4096 * 128
-RUN_CNT = 4 if CI else 32
+BUF_SIZE = 4096
+RUN_CNT = 4

 cached_prgs = {}
 def helper_exec_op(device, outbuf, inbufs):

@@ -2,12 +2,12 @@ import unittest, functools, random
 from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes, Variable
 from tinygrad.device import is_dtype_supported
 from tinygrad.uop.ops import Ops, UOp
-from tinygrad.helpers import CI, getenv, prod, Context
+from tinygrad.helpers import getenv, prod, Context
 from tinygrad.nn.state import get_parameters, get_state_dict
 from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
 import numpy as np
 from hypothesis import given, strategies as strat, settings
-from test.helpers import REAL_DEV, not_support_multi_device, needs_second_gpu
+from test.helpers import not_support_multi_device, needs_second_gpu, slow

 settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
 settings.load_profile("my_profile")
@@ -420,7 +420,7 @@ class TestMultiTensor(unittest.TestCase):
     np.testing.assert_allclose(y.numpy(), y_shard.numpy(), atol=1e-6, rtol=1e-6)

   # NOTE: this is failing on LLVM CI, no idea why. Works locally.
-  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "CPU", "AMD"), "slow, and flaky on CPU")
+  @slow
   def test_data_parallel_resnet(self):
     from extra.models.resnet import ResNet18
@@ -456,7 +456,7 @@ class TestMultiTensor(unittest.TestCase):
     # sometimes there is zeros in these grads... why?
     np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)

-  @unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "CPU", "AMD"), "slow, and flaky on CPU")
+  @slow
   @unittest.skip("TODO: pm_rangeify hangs")
   def test_data_parallel_resnet_train_step(self):
     from extra.models.resnet import ResNet18

@@ -4,14 +4,14 @@ import numpy as np
 import torch
 from tinygrad import Tensor, Device, TinyJit, dtypes
 from tinygrad.uop.ops import Ops
-from tinygrad.helpers import GlobalCounters, CI, Context
+from tinygrad.helpers import GlobalCounters, Context
 from tinygrad.nn import Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, Embedding
 from tinygrad.nn import BatchNorm, LayerNorm, LayerNorm2d, GroupNorm, InstanceNorm, RMSNorm, LSTMCell
 from tinygrad.nn.state import load_state_dict
 from tinygrad.engine.realize import run_schedule
-from test.helpers import not_support_multi_device, needs_second_gpu
+from test.helpers import not_support_multi_device, needs_second_gpu, slow

-@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
+@slow
 class TestNN(unittest.TestCase):
   def test_batchnorm2d(self, training=False, threed=False, track_running_stats=True):
     with Tensor.train(training):

@@ -11,8 +11,7 @@ if getenv("TINY_BACKEND"):
   import tinygrad.nn.torch  # noqa: F401 # pylint: disable=unused-import
   torch.set_default_device("tiny")

-if CI:
-  warnings.filterwarnings("ignore", message="Non-empty compiler output encountered")
+warnings.filterwarnings("ignore", message="Non-empty compiler output encountered")

 FORWARD_ONLY = getenv("FORWARD_ONLY", 0)
 PRINT_TENSORS = getenv("PRINT_TENSORS", 0)

@@ -3,9 +3,8 @@ import torch
 import unittest
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.nn.optim import Adam, SGD, AdamW, Muon, LAMB
-from tinygrad.helpers import CI
 from tinygrad.device import is_dtype_supported
-from test.helpers import needs_second_gpu
+from test.helpers import needs_second_gpu, slow

 np.random.seed(1337)
 x_init = np.random.randn(1,4).astype(np.float32)
@@ -42,7 +41,7 @@ def step(tensor, optim, steps=1, teeny=False, **kwargs):
     optim.step()
   return net.x.detach().numpy(), net.W.detach().numpy()

-@unittest.skipIf(CI and Device.DEFAULT in {"CUDA", "NV"}, "slow")
+@slow
 class TestOptim(unittest.TestCase):
   def setUp(self):
     self.old_training = Tensor.training

@@ -1,6 +1,6 @@
 import unittest
 from tinygrad import Tensor, nn, Device
-from tinygrad.helpers import Context, GlobalCounters, CI, getenv, PCONTIG, DEBUG
+from tinygrad.helpers import Context, GlobalCounters, getenv, PCONTIG, DEBUG
 from tinygrad.uop.ops import graph_rewrite, PatternMatcher, UPat, Ops
 from tinygrad.codegen.opt import OptOps, Opt
 from tinygrad.renderer.ptx import PTXRenderer
@@ -153,199 +153,6 @@ class TestPcontig(unittest.TestCase):
     opts += (Opt(OptOps.UPCAST, 4, 4),)
     self.test_flash_attention(opts)

-# *** non CI rangeify tests below this line ***
-
-N = 256
-
-@unittest.skipIf(CI, "useless in CI, doesn't test anything")
-class TestRangeifyOpt(unittest.TestCase):
-  def test_randperm(self):
-    Tensor.randperm(10000).realize()
-
-  def test_one_getitem(self):
-    X = Tensor.empty(10000)
-    sel = Tensor.arange(1000).contiguous().realize()
-    Xsel = X[sel]
-    Tensor.realize(Xsel)
-
-  def test_two_getitem(self):
-    # this is splitting on the child even when it really shouldn't
-    X = Tensor.empty(10000)
-    Y = Tensor.empty(10000)
-    sel = Tensor.arange(1000).contiguous().realize()
-    Xsel, Ysel = X[sel], Y[sel]
-    Tensor.realize(Xsel, Ysel)
-
-  def test_resnetconv(self):
-    conv1 = nn.Conv2d(3, 8, kernel_size=7, stride=2, bias=False, padding=3)
-    conv1.weight.replace(conv1.weight.empty_like())
-    x = Tensor.empty(1, 3, 56, 56)
-    x = conv1(x).pad([1,1,1,1])+1
-    x.realize()
-
-  # CPU=1 NOOPT=1 DEBUG=4 RANGEIFY=1 python3 test/test_rangeify.py TestRangeifyOpt.test_matmul_reshaped
-  def test_matmul_reshaped(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    (A@B).reshape(N*N).contiguous().realize()
-
-  def test_reduce_reshapes(self):
-    A = Tensor.empty(8,8,8,8).permute(1,0,3,2).flatten()
-    A.sum().realize()
-
-@unittest.skipIf(CI, "useless in CI, doesn't test anything")
-class TestRangeify(unittest.TestCase):
-  def test_groupnorm(self):
-    # ranges 1 and 3 are merging
-    x = nn.GroupNorm(32, 128)
-    x(Tensor.empty(1, 128, 64, 64)).realize()
-
-  def test_expand_children(self):
-    A = Tensor.empty(N, N).sum(axis=1)
-    ba = A.expand(N, N)
-    ((ba+1).sum(axis=1) + (ba+2).sum(axis=0)).realize()
-
-  def test_partial_contig(self):
-    A = Tensor.empty(64, 64, 64)
-    ret = A.sum(axis=2).contiguous(arg=(1,)).sum(axis=1)
-    ret.realize()
-
-  @unittest.skip("RANGEIFY=0 does nothing")
-  def test_double_gemm_real(self):
-    def go():
-      with Context(DEBUG=0):
-        Tensor.manual_seed(1337)
-        A,B,C = [Tensor.randn(N, N) for _ in range(3)]
-        Tensor.realize(A, B, C)
-      GlobalCounters.reset()
-      return (A@B@C).realize()
-    rng = go()
-    with Context(RANGEIFY=0, DEBUG=2):
-      ref = go()
-    mse = ((rng-ref)**2).sum().item()
-    print(f"mse: {mse}")
-    self.assertLessEqual(mse, 1e-2)
-
-  def test_double_gemm(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    (A@B@C).realize()
-
-  def test_double_gemm_exp(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    (((A@B).exp()@C).exp()).realize()
-
-  def test_double_gemm_exp_child(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    # A@B is used with exp, and also on the sum. this is two kernels now, is this right?
-    ret = A@B
-    ((ret.exp()@C)+ret).realize()
-
-  def test_double_gemm_relu(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    (((A@B).relu()@C).relu()).realize()
-
-  def test_double_gemm_relu_half_contig(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    (((A@B).relu().contiguous(arg=(1,))@C).relu()).realize()
-
-  def test_double_gemm_half_contig(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    ((A@B).contiguous(arg=(1,))@C).realize()
-
-  def test_double_gemm_contig(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    ((A@B).contiguous()@C).realize()
-
-  def test_many_gemm(self):
-    A = Tensor.empty(N, N)
-    B = Tensor.empty(N, N)
-    C = Tensor.empty(N, N)
-    D = Tensor.empty(N, N)
-    E = Tensor.empty(N, N)
-    F = Tensor.empty(N, N)
-    (A@B@C@D@E@F).realize()
-
-  def test_conv2d(self):
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 3, 3)
-    x.conv2d(w1).realize()
-
-  def test_conv2d_elu(self):
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 3, 3)
-    x.conv2d(w1).elu().realize()
-
-  def test_conv2d_t(self):
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 3, 3)
-    (x*2).conv2d(w1).realize()
-
-  def test_double_conv2d(self):
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 3, 3)
-    w2 = Tensor.empty(12, 8, 3, 3)
-    x.conv2d(w1).conv2d(w2).realize()
-
-  def test_resnet_conv2d(self):
-    x = Tensor.empty(1, 8, 32, 32)
-    w1 = Tensor.empty(8, 8, 3, 3)
-    w2 = Tensor.empty(8, 8, 1, 1)
-    x.conv2d(w1).conv2d(w2).realize()
-
-  def test_xception_conv2d(self):
-    # NOTE: this fusion is bad, it's recomputing the inner many times
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 1, 1)
-    w2 = Tensor.empty(8, 1, 3, 3)
-    x.conv2d(w1).conv2d(w2, groups=8).realize()
-
-  def test_conv_maxpool_contig(self): self.test_conv_maxpool(True)
-  def test_conv_maxpool(self, contig=False):
-    GlobalCounters.reset()
-    x = Tensor.empty(32, 16, 64, 64)
-    l1 = nn.Conv2d(16, 16, 3)
-    for p in nn.state.get_parameters(l1): p.replace(Tensor.empty(p.shape))
-    x = l1(x)
-    if contig: x = x.contiguous()
-    x.max_pool2d().realize()
-
-  def test_double_conv2d_half_contig(self):
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 3, 3)
-    w2 = Tensor.empty(12, 8, 3, 3)
-    # NOTE: this contiguous doesn't help
-    x.conv2d(w1).contiguous(arg=(1,)).conv2d(w2).permute(0,2,3,1).contiguous().realize()
-
-  def test_double_conv2d_contig(self):
-    x = Tensor.empty(1, 4, 32, 32)
-    w1 = Tensor.empty(8, 4, 3, 3)
-    w2 = Tensor.empty(12, 8, 3, 3)
-    x.conv2d(w1).contiguous().conv2d(w2).realize()
-
-  def test_transformer_ffn(self):
-    from tinygrad.apps.llm import TransformerBlock
-    from tinygrad import nn
-    blk = TransformerBlock(1024, 4096, 1, 1, 1e-5, head_dim=1024, rope_theta=10000.0)
-    for p in nn.state.get_parameters(blk): p.replace(Tensor.empty(p.shape))
-
-    x = Tensor.empty(128, 1024)
-    out = blk._feed_forward(x)
-    out.realize()
-
 # contiguous + reduce can support ranges?

 @unittest.skip("pm_rangeify no longer exists. test this in a different way")

@@ -1755,7 +1755,7 @@ class TestSchedule(unittest.TestCase):
   @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
   def test_precompute_freqs_cis(self):
     from extra.models.llama import precompute_freqs_cis
-    args = {"dim":32 if CI else 128, "end":2048 if CI else 8192, "theta":10000}
+    args = {"dim":32, "end":2048, "theta":10000}
     fused = precompute_freqs_cis(**args)
     run_schedule(check_schedule(fused, 1))
     if getenv("CHECK", 1):

@@ -3,7 +3,8 @@ import numpy as np
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.dtype import DType
 from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
-from tinygrad.helpers import Timing, fetch, temp, CI, OSX
+from tinygrad.helpers import Timing, fetch, temp, OSX
+from test.helpers import slow
 from tinygrad.device import is_dtype_supported

 def compare_weights_both(url):
@@ -340,8 +341,8 @@ class TestDiskTensor(unittest.TestCase):
     on_dev = t.to(Device.DEFAULT).realize()
     np.testing.assert_equal(on_dev.numpy(), t.numpy())

+  @slow
   def test_copy_from_disk_huge(self):
-    if CI and not hasattr(Device["DISK"], 'io_uring'): self.skipTest("slow on ci without iouring")

     fn = pathlib.Path(temp("dt_copy_from_disk_huge"))
     fn.unlink(missing_ok=True)

@@ -2,7 +2,8 @@ import unittest, math, operator, subprocess, struct
 from tinygrad.tensor import Tensor, dtypes, Device
 from tinygrad.dtype import DType, DTYPES_DICT, truncate, float_to_fp16, float_to_bf16, _to_np_dtype, least_upper_dtype, least_upper_float
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import getenv, CI, DEBUG
+from tinygrad.helpers import getenv, DEBUG
+from test.helpers import slow
 from hypothesis import given, settings, strategies as strat
 import numpy as np
 import torch
@@ -594,7 +595,7 @@ class TestAutoCastType(unittest.TestCase):
     dtypes.default_float = old_default_float

   @unittest.skipIf(Device.DEFAULT == "PYTHON", "very slow")
-  @unittest.skipIf(CI and Device.DEFAULT == "AMD", "very slow")
+  @slow
   @unittest.skipIf(Device.DEFAULT == "WEBGPU", "Binding size is larger than the maximum storage buffer binding size")
   @unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
   def test_mean_half_precision_underflow(self):

@@ -1,8 +1,8 @@
 from typing_extensions import Callable
 import hashlib, random, unittest
 from tinygrad import Tensor, Device, getenv, dtypes
+from test.helpers import slow
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import CI
 from tinygrad.uop.ops import UOp
 from tinygrad.engine.jit import TinyJit

@@ -58,7 +58,7 @@ class TestKeccak(unittest.TestCase):
     self.assertEqual(bytes(Tensor(b"abc").keccak().tolist()),
                      bytearray.fromhex("3a985da74fe225b2 045c172d6bd390bd 855f086e3e9d525b 46bfe24511431532"))

-  @unittest.skipIf(CI, "times out in ci")
+  @slow
   def test_long(self):
     data = b"\x00" * 4
     self.assertEqual(bytes(Tensor(data).keccak("shake_128").tolist()), hashlib.shake_128(data).digest(16))
@@ -74,7 +74,7 @@ class TestKeccak(unittest.TestCase):
     self.assertEqual(bytes(out[1].tolist()), bytearray.fromhex("3a985da74fe225b2 045c172d6bd390bd 855f086e3e9d525b 46bfe24511431532"))
     self.assertEqual(bytes(out[2].tolist()), bytearray.fromhex("8e0d8f672252acb0 ffc5093db8653b18 1513bf9a2097e737 b4f73533dcaf46df"))

-  @unittest.skipIf(CI, "redundant with test_variable_bs")
+  @slow
   def test_variable_bs_jit(self):
     def f(data):
       return data.keccak()

@@ -1,6 +1,6 @@
 import ctypes, gzip, unittest, timeit
 from tinygrad import Variable
-from tinygrad.helpers import Context, ContextVar, argfix, colored, word_wrap, is_numpy_ndarray, CI, mv_address, get_contraction
+from tinygrad.helpers import Context, ContextVar, argfix, colored, word_wrap, is_numpy_ndarray, mv_address, get_contraction
 from tinygrad.helpers import merge_dicts, strip_parens, prod, round_up, fetch, fully_flatten, from_mv, to_mv, polyN, time_to_str, cdiv, cmod, getbits
 from tinygrad.tensor import Tensor, get_shape
 import numpy as np
@@ -198,7 +198,7 @@ class TestMemoryview(unittest.TestCase):
     mv[0] = 2
     assert base[0] == 2

-  @unittest.skipIf(CI, "dangerous for CI, it allocates tons of memory")
+  @unittest.skip("allocates tons of memory")
   def test_to_mv(self):
     sizes = [
       (16, "16 B"),

@@ -5,7 +5,8 @@ import numpy as np

 from tinygrad import Tensor, dtypes, Device, TinyJit
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import CI, all_same, prod
+from tinygrad.helpers import all_same, prod
+from test.helpers import slow

 random.seed(42)

@@ -1140,7 +1141,7 @@ def get_set_tensor(indexed: Tensor, indexer):
   set_tensor = Tensor.randint(set_count, high=set_count).reshape(set_size) #.cast(dtypes.float64)
   return set_tensor

-@unittest.skipIf(CI and Device.DEFAULT in ["CPU", "CL", "METAL", "NV", "AMD"], "slow")
+@slow
 class TestAdvancedIndexing(unittest.TestCase):
   def test_integer_array_indexing(self):
     # pick a random valid indexer type

@@ -20,7 +20,7 @@ class TestRawShmBuffer(unittest.TestCase):
     assert np.allclose(t.numpy(), t2.numpy())
     s.unlink()

-  @unittest.skipIf(CI, "CI doesn't like big shared memory")
+  @unittest.skip("big shared memory")
   def test_e2e_big(self):
     # bigger than this doesn't work on Linux, maybe this is a limit somewhere?
     t = Tensor.randn(2048, 128, 8).realize()