move high level stuff to unit tests [pr] (#10708)

* move high level stuff to unit tests [pr]

* process replay on unit tests

* fix pr, less compute

* set omp num threads

* set 200MB buffer size limit

* delete junk

* fix tests

* faster

* move test_indexing to unit

* faster
Author: George Hotz
Date: 2025-06-08 14:05:56 -07:00
Committed by: GitHub
commit 81b9c04574 (parent 171580e9ec)
14 changed files with 42 additions and 32 deletions

View File

@@ -93,10 +93,13 @@ runs:
           . .venv/bin/activate
         fi
         python -m pip install -e . ${{ inputs.pydeps }}
-    - name: Export venv
+    - name: Set up venv environment
       shell: bash
       run: |
         echo "VIRTUAL_ENV=${{ github.workspace }}/.venv" >> "$GITHUB_ENV"
+        echo "OMP_NUM_THREADS=1" >> "$GITHUB_ENV"
+        # no buffers should be over 300MB in CI
+        echo "MAX_BUFFER_SIZE=300000000" >> "$GITHUB_ENV"
         if [[ "$RUNNER_OS" == "Windows" ]]; then
           echo "${{ github.workspace }}/.venv/Scripts" >> "$GITHUB_PATH"
         else
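For context on the two new knobs: OMP_NUM_THREADS=1 keeps OpenMP/BLAS thread pools from oversubscribing the small CI runners, and MAX_BUFFER_SIZE caps single allocations so a runaway test fails fast instead of OOM-killing the job. A minimal sketch of how such a cap can be honored at allocation time; the AllocationError type and _check_size hook are illustrative, not tinygrad's actual allocator internals:

import os

class AllocationError(RuntimeError): pass  # illustrative error type, not tinygrad's

# read the cap once; unset (0) means no limit, e.g. for local runs
MAX_BUFFER_SIZE = int(os.environ.get("MAX_BUFFER_SIZE", "0"))

def _check_size(nbytes: int) -> None:
  # reject an oversized buffer before asking the device allocator for it
  if MAX_BUFFER_SIZE and nbytes > MAX_BUFFER_SIZE:
    raise AllocationError(f"buffer of {nbytes} bytes exceeds MAX_BUFFER_SIZE={MAX_BUFFER_SIZE}")

try: _check_size(400_000_000)   # over the 300MB CI cap, so this raises
except AllocationError as e: print(e)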

View File

@@ -364,6 +364,8 @@ jobs:
     # run: NULL=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 3 --temperature 0 --timing
     - name: Run GC tests
       run: PYTHONPATH="." python test/external/external_uop_gc.py
+    - name: Run process replay tests
+      uses: ./.github/actions/process-replay
     - name: Repo line count < 14000 lines
       run: MAX_LINE_COUNT=14000 python sz.py
@@ -475,9 +477,9 @@ jobs:
       run: CPU=1 PYTHONPATH=. python3 test/test_quantize_onnx.py
     - name: Run REMOTE=1 Test
       run: |
-        REMOTEDEV=CPU REMOTE=1 python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py
-        REMOTEDEV=GPU REMOTE=1 python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py
-        REMOTEDEV=GPU IMAGE=2 REMOTE=1 python3 -m pytest test/test_tiny.py test/test_image_dtype.py
+        REMOTEDEV=CPU REMOTE=1 python3 -m pytest -n=auto test/test_tiny.py test/test_jit.py test/test_multitensor.py
+        REMOTEDEV=GPU REMOTE=1 python3 -m pytest -n=auto test/test_tiny.py test/test_image_dtype.py test/test_jit.py
+        REMOTEDEV=GPU IMAGE=2 REMOTE=1 python3 -m pytest -n=auto test/test_tiny.py test/test_image_dtype.py
     - name: Test Optimization Helpers
       run: PYTHONPATH="." DEBUG=1 python3 extra/optimization/test_helpers.py
     - name: Test Action Space
@@ -618,7 +620,7 @@ jobs:
       if: matrix.backend=='amdllvm'
       run: python test/test_amd_llvm.py
     - name: Run pytest (amd)
-      run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py test/external/external_test_am.py --durations=20
+      run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/test_hcq.py test/external/external_test_am.py --durations=20
     - name: Run TRANSCENDENTAL math
       run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
     - name: Run TestOps.test_add with SQTT
@@ -654,7 +656,8 @@ jobs:
         PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
         DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
     - name: Run pytest (cuda)
-      run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore test/test_gc.py --durations=20
+      # skip multitensor because it's slow
+      run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore test/test_gc.py --ignore test/test_multitensor.py --durations=20
     - name: Run process replay tests
       uses: ./.github/actions/process-replay

View File

@@ -28,7 +28,7 @@ repos:
       pass_filenames: false
     - id: tests
       name: subset of tests
-      entry: env MAX_BUFFER_SIZE=200000000 PYTHONPATH="." python3 -m pytest -n=4 test/unit/ test/test_ops.py test/test_dtype.py test/test_schedule.py test/test_assign.py test/test_symbolic_shapetracker.py
+      entry: env MAX_BUFFER_SIZE=300000000 PYTHONPATH="." python3 -m pytest -n=4 --ignore=test/unit/test_keccak.py --ignore=test/unit/test_indexing.py test/unit/ test/test_ops.py test/test_dtype.py test/test_schedule.py test/test_assign.py test/test_symbolic_shapetracker.py
       language: system
       always_run: true
       pass_filenames: false

View File

@@ -152,7 +152,7 @@ class TestOpenImagesDataset(ExternalTestDatasets):
     ref_tgt = postprocess_targets(ref_tgt, anchors.unsqueeze(0))
     ref_boxes, ref_labels = ref_tgt[0]["boxes"], ref_tgt[0]["labels"]
-    np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy())
+    np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy(), rtol=1e-6)
     np.testing.assert_equal(tinygrad_boxes[0].numpy(), ref_boxes.numpy())
     np.testing.assert_equal(tinygrad_labels[0].numpy(), ref_labels.numpy())
@@ -165,7 +165,7 @@ class TestOpenImagesDataset(ExternalTestDatasets):
     for ((tinygrad_img, _, _, _), (ref_img, _)) in zip(tinygrad_dataloader, ref_dataloader):
       ref_img, _ = transform(ref_img.unsqueeze(0))
-      np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy())
+      np.testing.assert_allclose(self._normalize_img(tinygrad_img.numpy()), ref_img.tensors.transpose(1, 3).numpy(), rtol=1e-6)

 if __name__ == '__main__':
   unittest.main()
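A note on the tolerance bump: a bare assert_allclose uses NumPy's default rtol=1e-7, tighter than float32 image pipelines can reliably hit when the tinygrad and reference paths order their resize/normalize arithmetic differently. The check is elementwise |actual - desired| <= atol + rtol * |desired|, so rtol=1e-6 still flags real mismatches while absorbing last-ulp noise:

import numpy as np

a = np.array([1.0], dtype=np.float32)
b = a * np.float32(1 + 5e-7)                 # ~4 ulps of float32 rounding noise
np.testing.assert_allclose(a, b, rtol=1e-6)  # passes
# np.testing.assert_allclose(a, b)           # default rtol=1e-7 would raise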

View File

@@ -34,7 +34,7 @@ class ProcessReplayWarning(Warning): pass
 def replay_kernelize(ret:dict[UOp, UOp], big_sink:UOp) -> tuple[str, str, tuple[Any, ...]]:
   UOp.unique_num = itertools.count(max([u.arg for u in big_sink.toposort() if u.op is Ops.UNIQUE], default=0)+1)
   new_sink = big_sink.substitute(get_kernelize_map(big_sink))
-  def to_str(ret:UOp):
+  def to_str(ret:UOp) -> str:
     asts = [repr(u.arg.ast) for u in ret.toposort() if u.op is Ops.KERNEL]
     return "\n".join([f"{len(asts)} kernels", *asts])
   return to_str(new_sink), to_str(ret[big_sink]), (big_sink,)
@@ -44,7 +44,9 @@ def replay_linearize(k:Kernel, _:Kernel, name_override=None, ast_transform=None)
   # this should be made fully functional. It's fine for process replay since copy returns a fresh instance
   k2 = k.copy()
   k2.linearize(name_override=name_override or to_function_name(k.name), ast_transform=ast_transform)
-  def to_str(ret:Kernel): return ret.opts.render(ret.uops)
+  def to_str(ret:Kernel) -> str:
+    try: return ret.opts.render(ret.uops)
+    except NotImplementedError: return "" # NULL backend doesn't have a renderer, this is okay
   return to_str(k2), to_str(k), (k.ast, k.opts, k.applied_opts)
 replayers: dict[str, Callable[..., tuple[str, str, tuple[Any, ...]]]] = {"get_kernelize_map":replay_kernelize, "linearize":replay_linearize}
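The replayer signature makes the contract visible: each traced function name maps to a callable returning (new output, recorded output, context for reporting). A process-replay driver can then be a generic loop over that mapping; this sketch is illustrative of the shape only, with the recorded list standing in for the pickled trace the real harness reads:

import difflib
from typing import Any, Callable

# stand-in for the recorded trace: (traced function name, captured args)
recorded: list[tuple[str, tuple[Any, ...]]] = []
replayers: dict[str, Callable[..., tuple[str, str, tuple[Any, ...]]]] = {}

def compare_all() -> int:
  mismatches = 0
  for name, args in recorded:
    new, old, _ctx = replayers[name](*args)
    if new != old:  # any textual drift in kernels or rendered code is a failure
      mismatches += 1
      print("\n".join(difflib.unified_diff(old.splitlines(), new.splitlines(), lineterm="")))
  return mismatches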

View File

@@ -61,7 +61,7 @@ class TestRealWorld(unittest.TestCase):
     derandomize_model(model)
     @TinyJit
     def test(t, t2): return model(t, Tensor([801]), t2).realize()
-    helper_test("test_sd", lambda: (Tensor.randn(1, 4, 64, 64),Tensor.randn(1, 77, params["ctx_dim"])), test, 18.0, 515)
+    helper_test("test_sd", lambda: (Tensor.randn(1, 4, 32, 32),Tensor.randn(1, 77, params["ctx_dim"])), test, 18.0, 515)

   def test_unet_resblock(self):
     model = [ResBlock(16, 24, 16) for _ in range(4)]

View File

@@ -523,6 +523,7 @@ class TestTypeSpec(unittest.TestCase):
       dtypes.default_float = default_float
       assert dtypes.default_float == default_float

+  @unittest.skip("this test is slow and spawning whole pythons")
   def test_env_set_default_float(self):
     # check default
     subprocess.run(['python3 -c "from tinygrad import dtypes; assert dtypes.default_float == dtypes.float"'],

View File

@@ -1254,6 +1254,7 @@ class TestLinearizerFailures(unittest.TestCase):
     opts = [Opt(op=OptOps.TC, axis=0, arg=(-1, 2, 1))]
     helper_test_lin(Kernel(ast, opts=Device[Device.DEFAULT].renderer), opts=opts, failed_platforms=[])

+  @unittest.skip("allocating over 200MB buffer")
   @unittest.skipIf(CI and Device.DEFAULT in {"METAL"}, "hangs metal gpu CI")
   def test_failure_52(self):
     # resnet beam.

View File

@@ -1213,8 +1213,8 @@ class TestMultiTransformer(unittest.TestCase):
     device = tuple(f"{Device.DEFAULT}:{i}" for i in range(2))
     from extra.models.llama import Transformer
-    args = {"dim": 64, "n_heads": 1, "n_kv_heads": 1, "n_layers": 2, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 1024,
-            "hidden_dim": 64, "max_context": 12}
+    args = {"dim": 32, "n_heads": 1, "n_kv_heads": 1, "n_layers": 2, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 1024,
+            "hidden_dim": 32, "max_context": 12}
     real_model = Transformer(**args)
     shard_model = Transformer(**args)

View File

@@ -2122,8 +2122,8 @@ class TestOps(unittest.TestCase):
                      lambda x,w: Tensor.conv2d(x,w,padding=p).relu())

   def _test_conv2d(self, bs=1, cin=1, cout=6):
-    for H in [1,2,3]:
-      for W in [1,2,3,5]:
+    for H in [2,3]:
+      for W in [1,3,5]:
         for groups in [1,3] if cin == 3 and cout == 6 and H == 3 and W == 3 else [1]:
           with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H, width=W):
             helper_test_op([(bs,cin,5,7), (cout,cin//groups,H,W)],
@@ -2300,7 +2300,7 @@
   def test_max_pool2d(self):
     for ksz in [(2,2), (3,3), 2, 3, (3,2), (5,5), (5,1)]:
       with self.subTest(kernel_size=ksz):
-        helper_test_op([(32,2,110,28)],
+        helper_test_op([(32,2,11,28)],
           lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz),
           lambda x: Tensor.max_pool2d(x, kernel_size=ksz))
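The 110 -> 11 shrink on the height axis keeps every kernel in the sweep valid while cutting the pooled area roughly 10x. With the standard pooling shape rule, floor((n + 2*pad - dilation*(k-1) - 1) / stride) + 1 with stride defaulting to the kernel size, even the largest (5,5) kernel still produces output rows; a quick check of that rule, not tinygrad-specific code:

from typing import Optional

def pool_out(n: int, k: int, stride: Optional[int] = None, pad: int = 0, dilation: int = 1) -> int:
  if stride is None: stride = k  # max_pool2d defaults stride to the kernel size
  return (n + 2*pad - dilation*(k-1) - 1) // stride + 1

for k in (2, 3, 5):
  assert pool_out(11, k) >= 1 and pool_out(28, k) >= 1  # every sweep kernel still fits
print(pool_out(110, 5), "->", pool_out(11, 5))          # output rows of work: 22 -> 2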
@@ -2308,7 +2308,7 @@
     for ksz in [(2,2), (3,3), 2, 3, (3,2)]:
       for p in [1, (1,0), (0,1)]:
         with self.subTest(kernel_size=ksz, padding=p):
-          helper_test_op([(32,2,110,28)],
+          helper_test_op([(32,2,11,28)],
            lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz, padding=p),
            lambda x: Tensor.max_pool2d(x, kernel_size=ksz, padding=p))
     self.helper_test_exception([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), padding=(1,1,1)),
@@ -2324,40 +2324,40 @@
   def test_max_pool2d_padding_int(self):
     ksz = (2,2)
-    helper_test_op([(32,2,110,28)],
+    helper_test_op([(32,2,11,28)],
       lambda x: torch.nn.functional.max_pool2d(x.int(), kernel_size=ksz, padding=1),
       lambda x: Tensor.max_pool2d(x.int(), kernel_size=ksz, padding=1), forward_only=True)

   def test_max_pool2d_bigger_stride(self):
     for stride in [(2,3), (3,2), 2, 3]:
       with self.subTest(stride=stride):
-        helper_test_op([(32,2,110,28)],
+        helper_test_op([(32,2,11,28)],
          lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), stride=stride),
          lambda x: Tensor.max_pool2d(x, kernel_size=(2,2), stride=stride))

   def test_max_pool2d_bigger_stride_dilation(self):
     for stride, dilation in zip([(2,3), (3,2), 2, 3, 4], [(3,2), (2,3), 2, 3, 6]):
       with self.subTest(stride=stride):
-        helper_test_op([(32,2,110,28)],
+        helper_test_op([(32,2,11,28)],
          lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(2,2), stride=stride, dilation=dilation),
          lambda x: Tensor.max_pool2d(x, kernel_size=(2,2), stride=stride, dilation=dilation))

   @unittest.skipIf( Device.DEFAULT in {"CUDA", "NV"}, "CUDA fails on this")
   def test_max_pool2d_unit_stride(self):
-    helper_test_op([(8, 2, 17, 14)],
+    helper_test_op([(3, 2, 17, 14)],
      lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), stride=1),
      lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), stride=1))

   def test_max_pool2d_smaller_stride(self):
     for stride in [(2,3), (3,2), 2, 3]:
       with self.subTest(stride=stride):
-        helper_test_op([(8, 2, 17, 14)],
+        helper_test_op([(3, 2, 17, 14)],
          lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), stride=stride),
          lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), stride=stride))

   def test_max_pool2d_dilation(self):
     for dilation in [(2, 3), (3, 2), 2, 3]:
-      helper_test_op([(8, 2, 17, 14)],
+      helper_test_op([(3, 2, 17, 14)],
        lambda x: torch.nn.functional.max_pool2d(x, kernel_size=(5,5), dilation=dilation),
        lambda x: Tensor.max_pool2d(x, kernel_size=(5,5), dilation=dilation))
@@ -2541,13 +2541,13 @@
   def test_interpolate_nearest_exact(self): self.test_interpolate_nearest("nearest-exact")

   def test_interpolate_bilinear(self):
-    for in_sz, out_sz in [((52,40),(29,31)), ((52,29),(31,40)), ((29,31),(40,52))]:
+    for in_sz, out_sz in [((12,20),(9,31)), ((12,9),(31,20)), ((9,31),(20,12))]:
       helper_test_op([(2,3)+in_sz],
         lambda x: torch.nn.functional.interpolate(x, size=out_sz, mode="bilinear"),
         lambda x: Tensor.interpolate(x, size=out_sz, mode="linear"), atol=1e-4)

   def test_interpolate_bilinear_corners_aligned(self):
-    for in_sz, out_sz in [((52,40),(29,31)), ((52,29),(31,40)), ((29,31),(40,52))]:
+    for in_sz, out_sz in [((12,20),(9,31)), ((12,9),(31,20)), ((9,31),(20,12))]:
       helper_test_op([(2,3)+in_sz],
         lambda x: torch.nn.functional.interpolate(x, size=out_sz, mode="bilinear", align_corners=True),
         lambda x: Tensor.interpolate(x, size=out_sz, mode="linear", align_corners=True), atol=1e-4)
@@ -2838,7 +2838,7 @@
     b = torch.randint(3, size=[3,4,5], dtype=torch.int64, requires_grad=False)
     a = Tensor(b.detach().cpu().numpy().astype(np.int32), dtype=dtypes.int32, requires_grad=False)
     for reduce in ("sum", "prod", "mean", "amin", "amax"):
-      for dim in (0,1,2,-1,-2,-3):
+      for dim in (-1,1,-3):
         helper_test_op([(4,5,6), (4,5,6)],
           lambda x,src: x.scatter_reduce(dim=dim, index=b, src=src, reduce=reduce),
           lambda x,src: x.scatter_reduce(dim=dim, index=a, src=src, reduce=reduce), forward_only=True)
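Trimming the dim sweep from (0,1,2,-1,-2,-3) to (-1,1,-3) halves the scatter_reduce cases without losing coverage: for a rank-3 tensor, negative dims wrap as dim % 3, so the old list hit each axis twice and the new one hits each exactly once:

# for ndim=3, negative dims normalize as dim % 3
assert sorted(d % 3 for d in (-1, 1, -3)) == [0, 1, 2]                     # new sweep: each axis once
assert sorted(d % 3 for d in (0, 1, 2, -1, -2, -3)) == [0, 0, 1, 1, 2, 2]  # old sweep: each axis twice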

View File

@@ -1053,9 +1053,9 @@ class TestSchedule(unittest.TestCase):
   def test_scaled_dot_product_attention_multireduce_fusion(self):
     Tensor.manual_seed(0)
-    q = Tensor.randn(32,8,16,64).realize()
-    k = Tensor.randn(32,8,16,64).realize()
-    v = Tensor.randn(32,8,16,64).realize()
+    q = Tensor.randn(32,8,16,8).realize()
+    k = Tensor.randn(32,8,16,8).realize()
+    v = Tensor.randn(32,8,16,8).realize()
     out = Tensor.scaled_dot_product_attention(q,k,v)
     run_schedule(check_schedule(out, 5))
     if getenv("CHECK", 1):
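For scale: the kernel-count assertion stays at 5 schedules, since it tracks the fusion structure of attention rather than tensor sizes, while shrinking the head dim 64 -> 8 cuts each realized q/k/v buffer 8x. A quick size check, assuming float32 at 4 bytes per element:

# bytes per tensor for shape (32, 8, 16, head_dim) in float32
for hd in (64, 8):
  print(hd, 32*8*16*hd*4 // 1024, "KiB")  # 1024 KiB -> 128 KiB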
@@ -1296,7 +1296,7 @@ class TestSchedule(unittest.TestCase):
   def test_sgd_4convs_fuse(self):
     with Tensor.train():
-      img = Tensor.empty(2,3,64,64)
+      img = Tensor.empty(2,3,16,16)
       c1 = nn.Conv2d(3,4,3,bias=False)
       c2 = nn.Conv2d(4,8,3,bias=False)
       c3 = nn.Conv2d(8,16,3,bias=False)
@@ -1309,7 +1309,7 @@ class TestSchedule(unittest.TestCase):
   def test_sgd_4convs_fuse_conv_bw(self):
     with Tensor.train():
-      img = Tensor.empty(2,3,64,64)
+      img = Tensor.empty(2,3,16,16)
       c1 = nn.Conv2d(3,4,3,bias=False)
       c2 = nn.Conv2d(4,8,3,bias=False)
       c3 = nn.Conv2d(8,16,3,bias=False)