print test durations and add speed (#2107)

* print test durations

* decrease sizes to increase speed

* faster

* GPU/CLANG ONNX in separate runner

* test split, move ONNX CPU CI

* simpler tests

* simpler uops test

* faster

* less cuda apt

* running ninja install

* apt install

* split fancy indexing
Authored by George Hotz on 2023-10-18 13:46:42 -07:00, committed by GitHub
parent e2a1c2aaa6
commit 15da96f393
8 changed files with 123 additions and 112 deletions
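
The change repeated throughout the workflow hunks below is pytest's built-in --durations=N flag, which prints the N slowest tests after a run; -n=auto comes from the pytest-xdist plugin and fans the suite out across all CPU cores. A minimal equivalent invocation from Python, assuming pytest and pytest-xdist are installed:

    # report the 20 slowest tests, running the suite across all cores
    import pytest
    pytest.main(["-n=auto", "test/", "--durations=20"])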

View File

@@ -2,6 +2,8 @@ name: Unit Tests
 on:
   push:
     branches:
     - master
   pull_request:
   workflow_dispatch:
@@ -43,6 +45,22 @@ jobs:
       run: sudo apt install sloccount
     - name: Check <5000 lines
       run: sloccount tinygrad test examples extra; if [ $(sloccount tinygrad | sed -n 's/.*Total Physical Source Lines of Code (SLOC)[ ]*= \([^ ]*\).*/\1/p' | tr -d ',') -gt 5000 ]; then exit 1; fi
+    - name: Test Docs
+      run: python docs/abstractions.py
+    - name: Test Quickstart
+      run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && PYTHONPATH=. python quickstart.py
+    - name: Fuzz Test symbolic
+      run: python test/external/fuzz_symbolic.py
+    - name: Fuzz Test shapetracker
+      run: PYTHONPATH="." python test/external/fuzz_shapetracker.py
+    - name: Use as an external package
+      run: |
+        mkdir $HOME/test_external_dir
+        cd $HOME/test_external_dir
+        python -m venv venv
+        source venv/bin/activate
+        pip install $GITHUB_WORKSPACE
+        python -c "from tinygrad.tensor import Tensor; print(Tensor([1,2,3,4,5]))"

   testcpuimagenet:
     name: CPU and ImageNet to C Tests
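
The Test Quickstart step above strips the fenced python blocks out of docs/quickstart.md with awk and executes them. A rough Python equivalent of that one-liner, for readers who don't speak awk (illustrative only):

    # collect every ```python fenced block from the markdown file, as the awk
    # one-liner does, and write the concatenation to quickstart.py
    import re, pathlib
    md = pathlib.Path("docs/quickstart.md").read_text()
    blocks = re.findall(r"```python\n(.*?)```", md, flags=re.S)
    pathlib.Path("quickstart.py").write_text("\n".join(blocks))
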
@@ -63,32 +81,14 @@ jobs:
         key: testing-packages-${{ hashFiles('**/setup.py') }}
     - name: Install Dependencies
       run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Test Docs
-      run: python docs/abstractions.py
-    - name: Test Quickstart
-      run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && PYTHONPATH=. python quickstart.py
     - name: Run Pytest
-      run: python -m pytest -n=auto test/ -k "not (test_efficientnet and models/test_train.py)"
-    - name: Run ONNX
-      run: CPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py
-    - name: Fuzz Test symbolic
-      run: python test/external/fuzz_symbolic.py
-    - name: Fuzz Test shapetracker
-      run: PYTHONPATH="." python test/external/fuzz_shapetracker.py
+      run: python -m pytest -n=auto test/ -k "not (test_efficientnet and models/test_train.py)" --durations=20
     - name: Compile EfficientNet to C
       run: PYTHONPATH="." CLANG=1 python examples/compile_efficientnet.py > recognize.c
     - name: Compile C to native
       run: clang -O2 recognize.c -lm -o recognize
     - name: Test EfficientNet
       run: curl https://media.istockphoto.com/photos/hen-picture-id831791190 | ./recognize | grep hen
-    - name: Use as an external package
-      run: |
-        mkdir $HOME/test_external_dir
-        cd $HOME/test_external_dir
-        python -m venv venv
-        source venv/bin/activate
-        pip install $GITHUB_WORKSPACE
-        python -c "from tinygrad.tensor import Tensor; print(Tensor([1,2,3,4,5]))"

   testtorch:
     name: Torch Tests
@@ -110,7 +110,7 @@ jobs:
     - name: Install Dependencies
       run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
     - name: Run Pytest
-      run: TORCH=1 python -m pytest -n=auto test/
+      run: TORCH=1 python -m pytest -n=auto test/ --durations=20
     - name: Run ONNX
       run: TORCH=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py
@@ -118,8 +118,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        task: [optimage, openpilot]
-    name: ${{ matrix.task=='optimage'&&'GPU OPT and IMAGE Tests'|| matrix.task=='openpilot'&&'openpilot (OpenCL) Tests'}}
+        task: [optimage, openpilot, onnx]
+    name: ${{ matrix.task=='optimage'&&'GPU OPT and IMAGE Tests' || matrix.task=='openpilot'&&'openpilot (OpenCL) Tests' || matrix.task=='onnx'&&'ONNX Tests' }}
     runs-on: ubuntu-20.04
     timeout-minutes: 20
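
The matrix name expression above emulates a ternary with short-circuiting && and ||, the standard GitHub Actions workaround for the lack of a conditional operator. It behaves exactly like Python's and/or idiom:

    # the first truthy branch wins, mirroring the ${{ ... }} expression above
    def job_name(task: str) -> str:
      return (task == "optimage" and "GPU OPT and IMAGE Tests"
              or task == "openpilot" and "openpilot (OpenCL) Tests"
              or task == "onnx" and "ONNX Tests")

    assert job_name("onnx") == "ONNX Tests"
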
@@ -175,6 +175,15 @@ jobs:
       run: |
         PYTHONPATH="." python test/external/dist/test_world.py
         PYTHONPATH="." python test/external/dist/test_collectives.py
+    - if: ${{ matrix.task == 'onnx' }}
+      name: Test ONNX (CPU)
+      run: CPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
+    - if: ${{ matrix.task == 'onnx' }}
+      name: Test ONNX (GPU)
+      run: GPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
+    - if: ${{ matrix.task == 'onnx' }}
+      name: Test ONNX (CLANG)
+      run: CLANG=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20

   testmetalwebgpu:
     name: Metal and WebGPU Tests
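
The three new steps run the same ONNX backend suite three times, differing only in the CPU=1 / GPU=1 / CLANG=1 prefix that selects the tinygrad backend. A rough sketch of that selection mechanism (assumed here; the real logic lives in tinygrad's Device/getenv helpers):

    # assumed mechanism: the first backend env var set to 1 wins
    import os
    backends = ["GPU", "CUDA", "CLANG", "LLVM", "METAL", "WEBGPU", "TORCH", "CPU"]
    default = next((b for b in backends if os.getenv(b) == "1"), "CPU")
    print(f"ONNX tests would run on: {default}")
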
@@ -293,42 +302,35 @@ jobs:
         DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
     - name: Run pytest (not cuda)
       if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton'
-      run: python -m pytest -n=auto test/ -k '${{matrix.backend=='llvm'&&'not (test_nn.py and test_conv_transpose2d)'||'test'}}' -m 'not exclude_${{matrix.backend}}'
-    - name: Run ONNX (not cuda)
-      if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend!='clang'
-      run: python -m pytest -n=auto test/external/external_test_onnx_backend.py
+      run: python -m pytest -n=auto test/ -k '${{matrix.backend=='llvm'&&'not (test_nn.py and test_conv_transpose2d)'||'test'}}' -m 'not exclude_${{matrix.backend}}' --durations=20
+    - name: Run ONNX (only LLVM)
+      if: matrix.backend == 'llvm'
+      run: python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
     - name: Run pytest (cuda)
-      if: matrix.backend=='cuda'
-      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
-    - name: Run pytest (ptx)
-      if: matrix.backend=='ptx'
-      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
-    - name: Run pytest (triton)
-      if: matrix.backend=='triton'
-      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
+      if: matrix.backend=='cuda'||matrix.backend=='ptx'||matrix.backend=='triton'
+      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models --durations=20

-  testunicorn:
-    name: ARM64 unicorn Test
-    runs-on: ubuntu-latest
-    if: ${{false}}
-    timeout-minutes: 20
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Set up Python 3.11
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.11
-    - name: Cache python packages
-      uses: actions/cache@v3
-      with:
-        path: ${{ env.Python3_ROOT_DIR }}/lib/python3.11/site-packages
-        key: testing-arm-packages-${{ hashFiles('**/setup.py') }}
-    - name: Install cross-assembler
-      run: |
-        sudo apt update -y
-        sudo apt install -y --no-install-recommends gcc-aarch64-linux-gnu
-    - name: Install dependencies
-      run: pip install -e '.[testing,arm]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Test arm
-      run: CI=1 ARM64=1 CLANG=1 python -m pytest -n=auto test/ -k 'not (test_nn.py and (test_conv_transpose2d or test_conv2d))' --ignore=test/models --ignore=test/test_speed_v_torch.py --ignore=test/test_net_speed.py --ignore=test/test_specific_conv.py --ignore=test/unit/test_disk_tensor.py
+#testunicorn:
+# name: ARM64 unicorn Test
+# runs-on: ubuntu-latest
+# timeout-minutes: 20
+# steps:
+# - name: Checkout Code
+# uses: actions/checkout@v3
+# - name: Set up Python 3.11
+# uses: actions/setup-python@v4
+# with:
+# python-version: 3.11
+# - name: Cache python packages
+# uses: actions/cache@v3
+# with:
+# path: ${{ env.Python3_ROOT_DIR }}/lib/python3.11/site-packages
+# key: testing-arm-packages-${{ hashFiles('**/setup.py') }}
+# - name: Install cross-assembler
+# run: |
+# sudo apt update -y
+# sudo apt install -y --no-install-recommends gcc-aarch64-linux-gnu
+# - name: Install dependencies
+# run: pip install -e '.[testing,arm]' --extra-index-url https://download.pytorch.org/whl/cpu
+# - name: Test arm
+# run: CI=1 ARM64=1 CLANG=1 python -m pytest -n=auto test/ -k 'not (test_nn.py and (test_conv_transpose2d or test_conv2d))' --ignore=test/models --ignore=test/test_speed_v_torch.py --ignore=test/test_net_speed.py --ignore=test/test_specific_conv.py --ignore=test/unit/test_disk_tensor.py
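
The -k and -m options used above are pytest's two selection mechanisms: -k filters by test name with a boolean expression, while -m filters by @pytest.mark markers such as the custom exclude_* marks. In isolation:

    # pytest -k 'not half' -m 'not exclude_cuda' would keep only test_add here:
    # test_conv2d_half is dropped by name, test_matmul by its marker
    import pytest

    @pytest.mark.exclude_cuda   # custom mark; register it in pytest config to silence warnings
    def test_matmul(): pass

    def test_conv2d_half(): pass

    def test_add(): pass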

View File

@@ -68,7 +68,7 @@ class TestEfficientNet(unittest.TestCase):
     self.assertEqual(label, "hen")

   def test_chicken_bigbatch(self):
-    label = _infer(self.model, chicken_img, 4)
+    label = _infer(self.model, chicken_img, 2)
     self.assertEqual(label, "hen")

   def test_car(self):

View File

@@ -72,7 +72,7 @@ class TestRealWorld(unittest.TestCase):
     # NOTE: only test one pass, not testing the dynamic shape autoregressive part
     helper_test("test_llama", lambda: (Tensor([[1,]]),), test, 0.22 if CI else 13.5, 126 if CI else 486, all_jitted=True)

-  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["LLVM"], "needs JIT, too long on CI LLVM")
+  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and (Device.DEFAULT not in ["LLVM"] or not CI), "needs JIT, too long on CI LLVM")
   def test_gpt2(self):
     Tensor.default_type = dtypes.float16
@@ -83,7 +83,7 @@ class TestRealWorld(unittest.TestCase):
     def test(t): return model(t, 0).realize()
     helper_test("test_gpt2", lambda: (Tensor([[1,]]),), test, 0.21 if CI else 0.9, 129 if CI else 369, all_jitted=True)

-  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["LLVM"], "needs JIT, too long on CI LLVM")
+  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and (Device.DEFAULT not in ["LLVM", "CLANG"] or not CI), "needs JIT, too long on CI LLVM and CLANG")
   def test_train_cifar(self):
     # TODO: with default device
     #old_default = Device.DEFAULT
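
Both reworked skipUnless conditions follow the same pattern: the slow backends are skipped only when CI is set, so the tests still run locally. The shape of the check, with stand-ins for tinygrad's CI flag and Device.DEFAULT:

    import unittest
    CI, DEVICE = True, "CLANG"   # stand-ins; the real values come from tinygrad.helpers and Device

    class Sketch(unittest.TestCase):
      @unittest.skipUnless(DEVICE not in ["LLVM", "CLANG"] or not CI, "too long on CI LLVM and CLANG")
      def test_train_cifar(self): pass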

View File

@@ -13,7 +13,7 @@ from models.vit import ViT
 from models.resnet import ResNet18
 import pytest

-pytestmark = pytest.mark.exclude_gpu
+pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]

 BS = getenv("BS", 2)
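
A module-level pytestmark applies its marks to every test in the file, and it accepts either a single mark or a list, which is what lets this change tack exclude_clang onto the existing exclude_gpu; the workflow then deselects the whole file on those backends with -m 'not exclude_gpu' / -m 'not exclude_clang':

    import pytest
    # every test in this module now carries both marks
    pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]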

View File

@@ -2,7 +2,7 @@
 import unittest
 import numpy as np
 from extra.utils import WINDOWS
-from tinygrad.helpers import getenv
+from tinygrad.helpers import CI
 from tinygrad.jit import TinyJit
 from tinygrad.tensor import Tensor, Device
 from tinygrad.nn import BatchNorm2d, Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, GroupNorm, LayerNorm, LayerNorm2d, Embedding, InstanceNorm
@@ -90,7 +90,7 @@ class TestNN(unittest.TestCase):
     _test_linear(Tensor.randn(BS, T, in_dim)) # test with more dims

   def test_conv1d(self):
-    BS, C1, W = 4, 16, 224
+    BS, C1, W = 4, 16, 224//4
     C2, K, S, P = 64, 7, 2, 1

     # create in tinygrad
@@ -110,7 +110,7 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)

   def test_conv2d(self):
-    BS, C1, H, W = 4, 16, 224, 224
+    BS, C1, H, W = 4, 16, 224//4, 224//4
     C2, K, S, P = 64, 7, 2, 1

     # create in tinygrad
@@ -166,9 +166,9 @@ class TestNN(unittest.TestCase):
     Tensor.wino = False

-  @unittest.skipIf(getenv("CI", "") != "" and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
+  @unittest.skipIf(CI and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
   def test_conv_transpose1d(self):
-    BS, C1, W = 4, 16, 224
+    BS, C1, W = 4, 16, 224//4
     C2, K, S, P = 64, 7, 2, 1

     # create in tinygrad
@@ -187,9 +187,9 @@ class TestNN(unittest.TestCase):
     torch_z = torch_layer(torch_x)
     np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)

-  @unittest.skipIf(getenv("CI", "") != "" and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
+  @unittest.skipIf(CI and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
   def test_conv_transpose2d(self):
-    BS, C1, H, W = 4, 16, 224, 224
+    BS, C1, H, W = 4, 16, 224//4, 224//4
     C2, K, S, P = 64, 7, 2, 1

     # create in tinygrad
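
These test_nn.py hunks replace the repeated getenv("CI", "") != "" checks with a CI constant imported from tinygrad.helpers. Its definition is not part of this diff; presumably it is just the same check evaluated once at import time, along the lines of:

    # assumed definition of tinygrad.helpers.CI: truthy when the CI env var is set
    import os
    CI = os.getenv("CI", "") != ""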

View File

@@ -847,8 +847,8 @@ class TestOps(unittest.TestCase):
   def test_conv1d(self):
     for bs in [1,8]:
       for cin in [1,3]:
-        for groups in [1,3] if cin == 3 else [1]:
-          for H in [1,2,5]:
+        for H in [1,2,5]:
+          for groups in [1,3] if cin == 3 and H == 5 else [1]:
             with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H):
               helper_test_op([(bs,cin,11), (6,cin//groups,H)],
                 lambda x,w: torch.nn.functional.conv1d(x,w,groups=groups).relu(),
@@ -886,13 +886,13 @@ class TestOps(unittest.TestCase):
                 lambda x,w: Tensor.conv2d(x,w,padding=p).relu(), atol=1e-4)

   def test_conv2d(self):
-    for bs in [1,8]:
+    for bs in [1,4]:
       for cin in [1,3]:
-        for groups in [1,3] if cin == 3 else [1]:
-          for H in [1,2,5]:
-            for W in [1,2,3,5]:
+        for H in [1,2,3]:
+          for W in [1,2,3,5]:
+            for groups in [1,3] if cin == 3 and H == 3 and W == 3 else [1]:
               with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H, width=W):
-                helper_test_op([(bs,cin,11,28), (6,cin//groups,H,W)],
+                helper_test_op([(bs,cin,11,7), (6,cin//groups,H,W)],
                   lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(),
                   lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), atol=1e-4, grad_rtol=1e-5)
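
Moving the groups loop innermost so that groups=3 only runs at the largest kernel size, together with bs [1,8] -> [1,4] and input width 28 -> 7, trims the conv2d sweep from 72 subtests to 50 while keeping the grouped path covered. Worked out:

    # subtest counts for the loops above, before and after the reorder
    before = sum(1 for bs in [1,8] for cin in [1,3]
                   for groups in ([1,3] if cin == 3 else [1])
                   for H in [1,2,5] for W in [1,2,3,5])
    after = sum(1 for bs in [1,4] for cin in [1,3]
                  for H in [1,2,3] for W in [1,2,3,5]
                  for groups in ([1,3] if cin == 3 and H == 3 and W == 3 else [1]))
    print(before, after)  # 72 50
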
@@ -1094,7 +1094,7 @@ class TestOps(unittest.TestCase):
   def test_cat(self):
     for dim in range(-2, 3):
-      helper_test_op([(45,65, 90), (45,65,90), (45,65,90)], lambda x,y,z: torch.cat((x,y,z), dim), lambda x,y,z: x.cat(y, z, dim=dim))
+      helper_test_op([(45,65,9), (45,65,9), (45,65,9)], lambda x,y,z: torch.cat((x,y,z), dim), lambda x,y,z: x.cat(y, z, dim=dim))

     with self.assertRaises(AssertionError):
       a = Tensor(3.14)
@@ -1117,12 +1117,12 @@ class TestOps(unittest.TestCase):
     np.testing.assert_allclose(Tensor.stack([a, a]).numpy(), Tensor([3.14, 3.14]).numpy())

   def test_repeat(self):
-    x = Tensor.randn(45, 65, 3)
+    x = Tensor.randn(4, 6, 3)
     base_repeats = [2, 4, 3]
     for reps in [[], [4], [2, 1], [3, 2, 2]]:
       repeats = base_repeats + reps
-      helper_test_op([(45, 65, 3)], lambda x: x.repeat(*repeats), lambda x: x.repeat(repeats))
+      helper_test_op([(4, 6, 3)], lambda x: x.repeat(*repeats), lambda x: x.repeat(repeats))
       helper_test_op([()], lambda x: x.repeat(*repeats), lambda x: x.repeat(repeats))

     with self.assertRaises(AssertionError):
@@ -1157,7 +1157,7 @@ class TestOps(unittest.TestCase):
     n = (x < 0).where(x, 1).numpy()
     assert np.all(n == 1.)

-  def test_slice_fancy_indexing(self):
+  def _get_index_randoms(self):
     # indices cannot have gradient
     # TODO currently does not support IndexError for out of bounds idx values
     a = torch.randint(low=-1, high=1, size=(2,1,1,1,1,1), dtype=torch.int64, requires_grad=False)
@@ -1166,34 +1166,43 @@ class TestOps(unittest.TestCase):
     d = torch.randint(high=4, size=(2,1,1,5,1,1), dtype=torch.int64, requires_grad=False)
     e = torch.randint(high=1, size=(1,1,1,1,6,1), dtype=torch.int64, requires_grad=False)
     i, j, k, o, p = [Tensor(tor.detach().numpy().astype(np.int32), dtype=dtypes.int32, requires_grad=False) for tor in [a,b,c,d,e]]
+    return a,b,c,d,e,i,j,k,o,p
+
+  def test_slice_fancy_indexing_no_dim_collapse(self):
+    a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     # no dim collapse from int or dim injection from None
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,b,c,d,e], lambda x: x[i,j,k,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[:,b,c,d,e], lambda x: x[:,j,k,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[:,b,c,d,:], lambda x: x[:,j,k,o,:])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,b,...], lambda x: x[i,j,...])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,...,e], lambda x: x[i,...,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[...,c,:,e], lambda x: x[...,k,:,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,b,c,d,e], lambda x: x[i,j,k,o,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[:,b,c,d,:], lambda x: x[:,j,k,o,:])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,b,...], lambda x: x[i,j,...])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,...,e], lambda x: x[i,...,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[...,c,:,e], lambda x: x[...,k,:,p])
+
+  def test_slice_fancy_indexing_dim_collapse_int(self):
+    a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     # dim collapse from int
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[1,b,c,d,e], lambda x: x[1,j,k,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,b,c,d,2], lambda x: x[i,j,k,o,2])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,b,3,d,e], lambda x: x[i,j,3,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[1,b,c,d,2], lambda x: x[1,j,k,o,2])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[1,b,2,d,2], lambda x: x[1,j,2,o,2])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,2,2,2,e], lambda x: x[i,2,2,2,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[1,:,3:11:2,d,0:2], lambda x: x[1,:,3:11:2,o,0:2])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[1,b,c,d,e], lambda x: x[1,j,k,o,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,b,3,d,e], lambda x: x[i,j,3,o,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[1,b,2,d,2], lambda x: x[1,j,2,o,2])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,2,2,2,e], lambda x: x[i,2,2,2,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[1,:,3:11:2,d,0:2], lambda x: x[1,:,3:11:2,o,0:2])
+
+  def test_slice_fancy_indexing_dim_inject_none(self):
+    a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     # dim injection from None
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[None,b,c,d,e], lambda x: x[None,j,k,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,b,c,d,None], lambda x: x[i,j,k,o,None])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,b,None,d,e], lambda x: x[i,j,None,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,None,None,None,e], lambda x: x[i,None,None,None,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[None,b,None,d,None], lambda x: x[None,j,None,o,None])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[None,b,c,d,None], lambda x: x[None,j,k,o,None])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,:,None,d,e], lambda x: x[i,:,None,o,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[None,b,c,d,e], lambda x: x[None,j,k,o,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,b,c,d,None], lambda x: x[i,j,k,o,None])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,b,None,d,e], lambda x: x[i,j,None,o,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[None,b,c,d,None], lambda x: x[None,j,k,o,None])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,:,None,d,e], lambda x: x[i,:,None,o,p])
+
+  def test_slice_fancy_indexing_dim_inject_and_collapse(self):
+    a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     # dim injection and collapse
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[1,b,None,d,1], lambda x: x[1,j,None,o,1])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[None,b,2,d,None], lambda x: x[None,j,2,o,None])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[None,1,None,d,e], lambda x: x[None,1,None,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[...,1,d,None], lambda x: x[...,1,o,None])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[1,b,None,d,1], lambda x: x[1,j,None,o,1])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[None,b,2,d,None], lambda x: x[None,j,2,o,None])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[...,1,d,None], lambda x: x[...,1,o,None])

   def test_slice_fancy_indexing_with_idx(self):
     # indexing using idx with different dim
     helper_test_op([(2,3)], lambda x: x[torch.tensor([[0,0,0],[0,0,0]]), torch.tensor(1)], lambda x: x[Tensor([[0,0,0],[0,0,0]]), Tensor(1)])
     helper_test_op([(2,3)], lambda x: x[torch.tensor([1]), torch.tensor([[0,0,0],[0,0,0]])], lambda x: x[Tensor([1]), Tensor([[0,0,0],[0,0,0]])])
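
The new method names track standard NumPy/PyTorch advanced-indexing semantics: an integer index collapses its dimension, None injects a new one, and index tensors broadcast. A quick reference in plain NumPy:

    import numpy as np
    x = np.zeros((2,5,6,5,3,4))
    idx = np.zeros((2,1,1,1,1,1), dtype=np.int64)   # shaped like index tensor a above
    print(x[1].shape)      # (5, 6, 5, 3, 4): int collapses the first dim
    print(x[None].shape)   # (1, 2, 5, 6, 5, 3, 4): None injects a dim
    print(x[idx].shape)    # (2, 1, 1, 1, 1, 1, 5, 6, 5, 3, 4): fancy index broadcasts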

View File

@@ -44,18 +44,18 @@ class TestUOps(unittest.TestCase):
   def _test_uop_fxn(self, bop, fxn, dt=dtypes.float32):
     for f in [_test_single_value, _test_single_value_const]:
-      for a in [-2.0, 0.0, 1.0, 2.0]:
+      for a in [-2.0, 0.0, 1.0]:
         self._equal(f([a], bop, dt), fxn(a))

   def _test_bop_fxn(self, bop, fxn, dt=dtypes.float32, no_b_zero=False):
     for f in [_test_single_value, _test_single_value_const]:
-      for a in [-2.0, 0.0, 1.0, 2.0]:
-        for b in [-3.0, 1.0, 3.0] + ([] if no_b_zero else [0.0]):
+      for a in [-2.0, 0.0, 1.0]:
+        for b in [-3.0, 1.0] + ([] if no_b_zero else [0.0]):
           self._equal(f([a,b], bop, dt), fxn(a,b))

   def _test_top_fxn(self, bop, fxn, dt=dtypes.float32):
     for f in [_test_single_value, _test_single_value_const]:
-      for a in [-2.0, 0, 1, 2.0]:
+      for a in [-2.0, 0, 1]:
         for b in [-3.0, 3.0]:
           for c in [-4.0, 4.0]:
             self._equal(f([a,b,c], bop, dt), fxn(a,b,c))
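
Since each helper runs every value combination under two wrappers, dropping one value per operand list roughly halves the binary-op cases:

    # cases per op in _test_bop_fxn (with the zero b included): wrappers x |a| x |b|
    before = 2 * 4 * 4   # 32
    after  = 2 * 3 * 3   # 18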

View File

@@ -1,5 +1,5 @@
 import unittest
-from tinygrad.helpers import Timing
+from tinygrad.helpers import Timing, CI
 from tinygrad.tensor import Tensor
 from tinygrad.ops import LoadOps
 from tinygrad.codegen.linearizer import Linearizer
@@ -31,9 +31,9 @@ class TestWinograd(unittest.TestCase):
   def test_profile(self):
     x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
-    pr = start_profile()
+    if not CI: pr = start_profile()
     out = Tensor.conv2d(x,w).realize()
-    stop_profile(pr, sort='time')
+    if not CI: stop_profile(pr, sort='time')
     out.numpy()

 if __name__ == '__main__':
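
start_profile/stop_profile are the repo's own profiling helpers (defined elsewhere, not in this diff); gating them behind `if not CI` drops the profiler overhead in CI while still exercising the winograd conv2d. A generic version of the same guard with the standard library, under the assumption those helpers wrap cProfile:

    # profile locally, skip the profiler entirely under CI
    import cProfile, pstats, os
    CI = os.getenv("CI", "") != ""
    pr = cProfile.Profile()
    if not CI: pr.enable()
    result = sum(i * i for i in range(10**6))   # stand-in for the conv2d under test
    if not CI:
      pr.disable()
      pstats.Stats(pr).sort_stats("time").print_stats(5)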