From 15da96f39379ef51b91c2afd0cc394185d6f8e2c Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Wed, 18 Oct 2023 13:46:42 -0700
Subject: [PATCH] print test durations and add speed (#2107)

* print test durations

* decrease sizes to increase speed

* faster

* GPU/CLANG onnx in separate runner

* test split, move ONNX CPU CI

* simpler tests

* simpler uops test

* faster

* less cuda apt

* running ninja install

* apt install

* split fancy indexing
---
 .github/workflows/test.yml       | 120 ++++++++++++++++---------
 test/models/test_efficientnet.py |   2 +-
 test/models/test_real_world.py   |   4 +-
 test/models/test_train.py        |   2 +-
 test/test_nn.py                  |  14 ++--
 test/test_ops.py                 |  79 +++++++++++---------
 test/test_uops.py                |   8 +--
 test/test_winograd.py            |   6 +-
 8 files changed, 123 insertions(+), 112 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 38d46ceaf0..ab311ddeca 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,6 +2,8 @@ name: Unit Tests
 
 on:
   push:
+    branches:
+    - master
   pull_request:
   workflow_dispatch:
 
@@ -43,6 +45,22 @@ jobs:
       run: sudo apt install sloccount
     - name: Check <5000 lines
       run: sloccount tinygrad test examples extra; if [ $(sloccount tinygrad | sed -n 's/.*Total Physical Source Lines of Code (SLOC)[ ]*= \([^ ]*\).*/\1/p' | tr -d ',') -gt 5000 ]; then exit 1; fi
+    - name: Test Docs
+      run: python docs/abstractions.py
+    - name: Test Quickstart
+      run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && PYTHONPATH=. python quickstart.py
+    - name: Fuzz Test symbolic
+      run: python test/external/fuzz_symbolic.py
+    - name: Fuzz Test shapetracker
+      run: PYTHONPATH="." python test/external/fuzz_shapetracker.py
+    - name: Use as an external package
+      run: |
+        mkdir $HOME/test_external_dir
+        cd $HOME/test_external_dir
+        python -m venv venv
+        source venv/bin/activate
+        pip install $GITHUB_WORKSPACE
+        python -c "from tinygrad.tensor import Tensor; print(Tensor([1,2,3,4,5]))"
 
   testcpuimagenet:
     name: CPU and ImageNet to C Tests
@@ -63,32 +81,14 @@ jobs:
         key: testing-packages-${{ hashFiles('**/setup.py') }}
     - name: Install Dependencies
       run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Test Docs
-      run: python docs/abstractions.py
-    - name: Test Quickstart
-      run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && PYTHONPATH=. python quickstart.py
     - name: Run Pytest
-      run: python -m pytest -n=auto test/ -k "not (test_efficientnet and models/test_train.py)"
-    - name: Run ONNX
-      run: CPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py
-    - name: Fuzz Test symbolic
-      run: python test/external/fuzz_symbolic.py
-    - name: Fuzz Test shapetracker
-      run: PYTHONPATH="." python test/external/fuzz_shapetracker.py
+      run: python -m pytest -n=auto test/ -k "not (test_efficientnet and models/test_train.py)" --durations=20
     - name: Compile EfficientNet to C
       run: PYTHONPATH="." CLANG=1 python examples/compile_efficientnet.py > recognize.c
     - name: Compile C to native
       run: clang -O2 recognize.c -lm -o recognize
     - name: Test EfficientNet
       run: curl https://media.istockphoto.com/photos/hen-picture-id831791190 | ./recognize | grep hen
-    - name: Use as an external package
-      run: |
-        mkdir $HOME/test_external_dir
-        cd $HOME/test_external_dir
-        python -m venv venv
-        source venv/bin/activate
-        pip install $GITHUB_WORKSPACE
-        python -c "from tinygrad.tensor import Tensor; print(Tensor([1,2,3,4,5]))"
 
   testtorch:
     name: Torch Tests
@@ -110,7 +110,7 @@ jobs:
     - name: Install Dependencies
       run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
     - name: Run Pytest
-      run: TORCH=1 python -m pytest -n=auto test/
+      run: TORCH=1 python -m pytest -n=auto test/ --durations=20
     - name: Run ONNX
       run: TORCH=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py
 
@@ -118,8 +118,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        task: [optimage, openpilot]
-    name: ${{ matrix.task=='optimage'&&'GPU OPT and IMAGE Tests'|| matrix.task=='openpilot'&&'openpilot (OpenCL) Tests'}}
+        task: [optimage, openpilot, onnx]
+    name: ${{ matrix.task=='optimage'&&'GPU OPT and IMAGE Tests' || matrix.task=='openpilot'&&'openpilot (OpenCL) Tests' || matrix.task=='onnx'&&'ONNX Tests' }}
     runs-on: ubuntu-20.04
     timeout-minutes: 20
 
@@ -175,6 +175,15 @@ jobs:
       run: |
         PYTHONPATH="." python test/external/dist/test_world.py
         PYTHONPATH="." python test/external/dist/test_collectives.py
+    - if: ${{ matrix.task == 'onnx' }}
+      name: Test ONNX (CPU)
+      run: CPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
+    - if: ${{ matrix.task == 'onnx' }}
+      name: Test ONNX (GPU)
+      run: GPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
+    - if: ${{ matrix.task == 'onnx' }}
+      name: Test ONNX (CLANG)
+      run: CLANG=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
 
   testmetalwebgpu:
     name: Metal and WebGPU Tests
@@ -293,42 +302,35 @@ jobs:
         DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
     - name: Run pytest (not cuda)
       if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton'
-      run: python -m pytest -n=auto test/ -k '${{matrix.backend=='llvm'&&'not (test_nn.py and test_conv_transpose2d)'||'test'}}' -m 'not exclude_${{matrix.backend}}'
-    - name: Run ONNX (not cuda)
-      if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend!='clang'
-      run: python -m pytest -n=auto test/external/external_test_onnx_backend.py
+      run: python -m pytest -n=auto test/ -k '${{matrix.backend=='llvm'&&'not (test_nn.py and test_conv_transpose2d)'||'test'}}' -m 'not exclude_${{matrix.backend}}' --durations=20
+    - name: Run ONNX (only LLVM)
+      if: matrix.backend == 'llvm'
+      run: python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
     - name: Run pytest (cuda)
-      if: matrix.backend=='cuda'
-      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
-    - name: Run pytest (ptx)
-      if: matrix.backend=='ptx'
-      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
-    - name: Run pytest (triton)
-      if: matrix.backend=='triton'
-      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
+      if: matrix.backend=='cuda'||matrix.backend=='ptx'||matrix.backend=='triton'
+      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models --durations=20
 
-  testunicorn:
-    name: ARM64 unicorn Test
-    runs-on: ubuntu-latest
-    if: ${{false}}
-    timeout-minutes: 20
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Set up Python 3.11
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.11
-    - name: Cache python packages
-      uses: actions/cache@v3
-      with:
-        path: ${{ env.Python3_ROOT_DIR }}/lib/python3.11/site-packages
-        key: testing-arm-packages-${{ hashFiles('**/setup.py') }}
-    - name: Install cross-assembler
-      run: |
-        sudo apt update -y
-        sudo apt install -y --no-install-recommends gcc-aarch64-linux-gnu
-    - name: Install dependencies
-      run: pip install -e '.[testing,arm]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Test arm
-      run: CI=1 ARM64=1 CLANG=1 python -m pytest -n=auto test/ -k 'not (test_nn.py and (test_conv_transpose2d or test_conv2d))' --ignore=test/models --ignore=test/test_speed_v_torch.py --ignore=test/test_net_speed.py --ignore=test/test_specific_conv.py --ignore=test/unit/test_disk_tensor.py
+  #testunicorn:
+  #  name: ARM64 unicorn Test
+  #  runs-on: ubuntu-latest
+  #  timeout-minutes: 20
+  #  steps:
+  #  - name: Checkout Code
+  #    uses: actions/checkout@v3
+  #  - name: Set up Python 3.11
+  #    uses: actions/setup-python@v4
+  #    with:
+  #      python-version: 3.11
+  #  - name: Cache python packages
+  #    uses: actions/cache@v3
+  #    with:
+  #      path: ${{ env.Python3_ROOT_DIR }}/lib/python3.11/site-packages
+  #      key: testing-arm-packages-${{ hashFiles('**/setup.py') }}
+  #  - name: Install cross-assembler
+  #    run: |
+  #      sudo apt update -y
+  #      sudo apt install -y --no-install-recommends gcc-aarch64-linux-gnu
+  #  - name: Install dependencies
+  #    run: pip install -e '.[testing,arm]' --extra-index-url https://download.pytorch.org/whl/cpu
+  #  - name: Test arm
+  #    run: CI=1 ARM64=1 CLANG=1 python -m pytest -n=auto test/ -k 'not (test_nn.py and (test_conv_transpose2d or test_conv2d))' --ignore=test/models --ignore=test/test_speed_v_torch.py --ignore=test/test_net_speed.py --ignore=test/test_specific_conv.py --ignore=test/unit/test_disk_tensor.py
diff --git a/test/models/test_efficientnet.py b/test/models/test_efficientnet.py
index 45f509a379..76108a5e6b 100644
--- a/test/models/test_efficientnet.py
+++ b/test/models/test_efficientnet.py
@@ -68,7 +68,7 @@ class TestEfficientNet(unittest.TestCase):
     self.assertEqual(label, "hen")
 
   def test_chicken_bigbatch(self):
-    label = _infer(self.model, chicken_img, 4)
+    label = _infer(self.model, chicken_img, 2)
     self.assertEqual(label, "hen")
 
   def test_car(self):
diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py
index b114ecdcd3..22ffb70a9f 100644
--- a/test/models/test_real_world.py
+++ b/test/models/test_real_world.py
@@ -72,7 +72,7 @@ class TestRealWorld(unittest.TestCase):
     # NOTE: only test one pass, not testing the dynamic shape autoregressive part
     helper_test("test_llama", lambda: (Tensor([[1,]]),), test, 0.22 if CI else 13.5, 126 if CI else 486, all_jitted=True)
 
-  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["LLVM"], "needs JIT, too long on CI LLVM")
+  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and (Device.DEFAULT not in ["LLVM"] or not CI), "needs JIT, too long on CI LLVM")
   def test_gpt2(self):
     Tensor.default_type = dtypes.float16
 
@@ -83,7 +83,7 @@ class TestRealWorld(unittest.TestCase):
     def test(t): return model(t, 0).realize()
     helper_test("test_gpt2", lambda: (Tensor([[1,]]),), test, 0.21 if CI else 0.9, 129 if CI else 369, all_jitted=True)
 
-  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and Device.DEFAULT not in ["LLVM"], "needs JIT, too long on CI LLVM")
+  @unittest.skipUnless(Device.DEFAULT in JIT_SUPPORTED_DEVICE and (Device.DEFAULT not in ["LLVM", "CLANG"] or not CI), "needs JIT, too long on CI LLVM and CLANG")
   def test_train_cifar(self):
     # TODO: with default device
     #old_default = Device.DEFAULT
diff --git a/test/models/test_train.py b/test/models/test_train.py
index 8931fe3640..b987ee39f8 100644
--- a/test/models/test_train.py
+++ b/test/models/test_train.py
@@ -13,7 +13,7 @@ from models.vit import ViT
 from models.resnet import ResNet18
 import pytest
 
-pytestmark = pytest.mark.exclude_gpu
+pytestmark = [pytest.mark.exclude_gpu, pytest.mark.exclude_clang]
 
 BS = getenv("BS", 2)
 
diff --git a/test/test_nn.py b/test/test_nn.py
index b8d04dfa9b..75aac3c49e 100755
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -2,7 +2,7 @@
 import unittest
 import numpy as np
 from extra.utils import WINDOWS
-from tinygrad.helpers import getenv
+from tinygrad.helpers import CI
 from tinygrad.jit import TinyJit
 from tinygrad.tensor import Tensor, Device
 from tinygrad.nn import BatchNorm2d, Conv1d, ConvTranspose1d, Conv2d, ConvTranspose2d, Linear, GroupNorm, LayerNorm, LayerNorm2d, Embedding, InstanceNorm
@@ -90,7 +90,7 @@ class TestNN(unittest.TestCase):
     _test_linear(Tensor.randn(BS, T, in_dim)) # test with more dims
 
   def test_conv1d(self):
-    BS, C1, W = 4, 16, 224
+    BS, C1, W = 4, 16, 224//4
     C2, K, S, P = 64, 7, 2, 1
 
     # create in tinygrad
@@ -110,7 +110,7 @@ class TestNN(unittest.TestCase):
     np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
 
   def test_conv2d(self):
-    BS, C1, H, W = 4, 16, 224, 224
+    BS, C1, H, W = 4, 16, 224//4, 224//4
     C2, K, S, P = 64, 7, 2, 1
 
     # create in tinygrad
@@ -166,9 +166,9 @@ class TestNN(unittest.TestCase):
 
     Tensor.wino = False
 
-  @unittest.skipIf(getenv("CI", "") != "" and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
+  @unittest.skipIf(CI and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
   def test_conv_transpose1d(self):
-    BS, C1, W = 4, 16, 224
+    BS, C1, W = 4, 16, 224//4
     C2, K, S, P = 64, 7, 2, 1
 
     # create in tinygrad
@@ -187,9 +187,9 @@ class TestNN(unittest.TestCase):
     torch_z = torch_layer(torch_x)
     np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=1e-5)
 
-  @unittest.skipIf(getenv("CI", "") != "" and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
+  @unittest.skipIf(CI and (WINDOWS or Device.DEFAULT == "WEBGPU"), "runs out of memory in CI")
   def test_conv_transpose2d(self):
-    BS, C1, H, W = 4, 16, 224, 224
+    BS, C1, H, W = 4, 16, 224//4, 224//4
     C2, K, S, P = 64, 7, 2, 1
 
     # create in tinygrad
diff --git a/test/test_ops.py b/test/test_ops.py
index 943ae4c560..7ff6afd792 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -847,8 +847,8 @@ class TestOps(unittest.TestCase):
   def test_conv1d(self):
     for bs in [1,8]:
       for cin in [1,3]:
-        for groups in [1,3] if cin == 3 else [1]:
-          for H in [1,2,5]:
+        for H in [1,2,5]:
+          for groups in [1,3] if cin == 3 and H == 5 else [1]:
             with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H):
               helper_test_op([(bs,cin,11), (6,cin//groups,H)],
                 lambda x,w: torch.nn.functional.conv1d(x,w,groups=groups).relu(),
@@ -886,13 +886,13 @@ class TestOps(unittest.TestCase):
         lambda x,w: Tensor.conv2d(x,w,padding=p).relu(), atol=1e-4)
 
   def test_conv2d(self):
-    for bs in [1,8]:
+    for bs in [1,4]:
       for cin in [1,3]:
-        for groups in [1,3] if cin == 3 else [1]:
-          for H in [1,2,5]:
-            for W in [1,2,3,5]:
+        for H in [1,2,3]:
+          for W in [1,2,3,5]:
+            for groups in [1,3] if cin == 3 and H == 3 and W == 3 else [1]:
               with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H, width=W):
-                helper_test_op([(bs,cin,11,28), (6,cin//groups,H,W)],
+                helper_test_op([(bs,cin,11,7), (6,cin//groups,H,W)],
                   lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(),
                   lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), atol=1e-4, grad_rtol=1e-5)
 
@@ -1094,7 +1094,7 @@ class TestOps(unittest.TestCase):
 
   def test_cat(self):
     for dim in range(-2, 3):
-      helper_test_op([(45,65, 90), (45,65,90), (45,65,90)], lambda x,y,z: torch.cat((x,y,z), dim), lambda x,y,z: x.cat(y, z, dim=dim))
+      helper_test_op([(45,65,9), (45,65,9), (45,65,9)], lambda x,y,z: torch.cat((x,y,z), dim), lambda x,y,z: x.cat(y, z, dim=dim))
 
     with self.assertRaises(AssertionError):
       a = Tensor(3.14)
@@ -1117,12 +1117,12 @@ class TestOps(unittest.TestCase):
     np.testing.assert_allclose(Tensor.stack([a, a]).numpy(), Tensor([3.14, 3.14]).numpy())
 
   def test_repeat(self):
-    x = Tensor.randn(45, 65, 3)
+    x = Tensor.randn(4, 6, 3)
     base_repeats = [2, 4, 3]
 
     for reps in [[], [4], [2, 1], [3, 2, 2]]:
       repeats = base_repeats + reps
-      helper_test_op([(45, 65, 3)], lambda x: x.repeat(*repeats), lambda x: x.repeat(repeats))
+      helper_test_op([(4, 6, 3)], lambda x: x.repeat(*repeats), lambda x: x.repeat(repeats))
       helper_test_op([()], lambda x: x.repeat(*repeats), lambda x: x.repeat(repeats))
 
     with self.assertRaises(AssertionError):
@@ -1157,7 +1157,7 @@ class TestOps(unittest.TestCase):
       n = (x < 0).where(x, 1).numpy()
       assert np.all(n == 1.)
 
-  def test_slice_fancy_indexing(self):
+  def _get_index_randoms(self):
     # indices cannot have gradient
     # TODO currently does not support IndexError for out of bounds idx values
     a = torch.randint(low=-1, high=1, size=(2,1,1,1,1,1), dtype=torch.int64, requires_grad=False)
@@ -1166,34 +1166,43 @@ class TestOps(unittest.TestCase):
     d = torch.randint(high=4, size=(2,1,1,5,1,1), dtype=torch.int64, requires_grad=False)
     e = torch.randint(high=1, size=(1,1,1,1,6,1), dtype=torch.int64, requires_grad=False)
     i, j, k, o, p = [Tensor(tor.detach().numpy().astype(np.int32), dtype=dtypes.int32, requires_grad=False) for tor in [a,b,c,d,e]]
+    return a,b,c,d,e,i,j,k,o,p
+
+  def test_slice_fancy_indexing_no_dim_collapse(self):
+    a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     # no dim collapse from int or dim injection from None
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,b,c,d,e], lambda x: x[i,j,k,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[:,b,c,d,e], lambda x: x[:,j,k,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[:,b,c,d,:], lambda x: x[:,j,k,o,:])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,b,...], lambda x: x[i,j,...])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,...,e], lambda x: x[i,...,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[...,c,:,e], lambda x: x[...,k,:,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,b,c,d,e], lambda x: x[i,j,k,o,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[:,b,c,d,:], lambda x: x[:,j,k,o,:])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,b,...], lambda x: x[i,j,...])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,...,e], lambda x: x[i,...,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[...,c,:,e], lambda x: x[...,k,:,p])
+
+  def test_slice_fancy_indexing_dim_collapse_int(self):
+    a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     # dim collapse from int
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[1,b,c,d,e], lambda x: x[1,j,k,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,b,c,d,2], lambda x: x[i,j,k,o,2])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,b,3,d,e], lambda x: x[i,j,3,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[1,b,c,d,2], lambda x: x[1,j,k,o,2])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[1,b,2,d,2], lambda x: x[1,j,2,o,2])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,2,2,2,e], lambda x: x[i,2,2,2,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[1,:,3:11:2,d,0:2], lambda x: x[1,:,3:11:2,o,0:2])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[1,b,c,d,e], lambda x: x[1,j,k,o,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,b,3,d,e], lambda x: x[i,j,3,o,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[1,b,2,d,2], lambda x: x[1,j,2,o,2])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,2,2,2,e], lambda x: x[i,2,2,2,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[1,:,3:11:2,d,0:2], lambda x: x[1,:,3:11:2,o,0:2])
+
+  def test_slice_fancy_indexing_dim_inject_none(self):
+    a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     # dim injection from None
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[None,b,c,d,e], lambda x: x[None,j,k,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,b,c,d,None], lambda x: x[i,j,k,o,None])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,b,None,d,e], lambda x: x[i,j,None,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,None,None,None,e], lambda x: x[i,None,None,None,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[None,b,None,d,None], lambda x: x[None,j,None,o,None])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[None,b,c,d,None], lambda x: x[None,j,k,o,None])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[a,:,None,d,e], lambda x: x[i,:,None,o,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[None,b,c,d,e], lambda x: x[None,j,k,o,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,b,c,d,None], lambda x: x[i,j,k,o,None])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,b,None,d,e], lambda x: x[i,j,None,o,p])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[None,b,c,d,None], lambda x: x[None,j,k,o,None])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[a,:,None,d,e], lambda x: x[i,:,None,o,p])
+
+  def test_slice_fancy_indexing_dim_inject_and_collapse(self):
+    a,b,c,d,e,i,j,k,o,p = self._get_index_randoms()
     # dim injection and collapse
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[1,b,None,d,1], lambda x: x[1,j,None,o,1])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[None,b,2,d,None], lambda x: x[None,j,2,o,None])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[None,1,None,d,e], lambda x: x[None,1,None,o,p])
-    helper_test_op([(2,5,15,5,3,4)], lambda x: x[...,1,d,None], lambda x: x[...,1,o,None])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[1,b,None,d,1], lambda x: x[1,j,None,o,1])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[None,b,2,d,None], lambda x: x[None,j,2,o,None])
+    helper_test_op([(2,5,6,5,3,4)], lambda x: x[...,1,d,None], lambda x: x[...,1,o,None])
+
+  def test_slice_fancy_indexing_with_idx(self):
     # indexing using idx with different dim
     helper_test_op([(2,3)], lambda x: x[torch.tensor([[0,0,0],[0,0,0]]), torch.tensor(1)], lambda x: x[Tensor([[0,0,0],[0,0,0]]), Tensor(1)])
     helper_test_op([(2,3)], lambda x: x[torch.tensor([1]), torch.tensor([[0,0,0],[0,0,0]])], lambda x: x[Tensor([1]), Tensor([[0,0,0],[0,0,0]])])
diff --git a/test/test_uops.py b/test/test_uops.py
index b5fc990849..23aa787a67 100644
--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -44,18 +44,18 @@ class TestUOps(unittest.TestCase):
 
   def _test_uop_fxn(self, bop, fxn, dt=dtypes.float32):
     for f in [_test_single_value, _test_single_value_const]:
-      for a in [-2.0, 0.0, 1.0, 2.0]:
+      for a in [-2.0, 0.0, 1.0]:
         self._equal(f([a], bop, dt), fxn(a))
 
   def _test_bop_fxn(self, bop, fxn, dt=dtypes.float32, no_b_zero=False):
     for f in [_test_single_value, _test_single_value_const]:
-      for a in [-2.0, 0.0, 1.0, 2.0]:
-        for b in [-3.0, 1.0, 3.0] + ([] if no_b_zero else [0.0]):
+      for a in [-2.0, 0.0, 1.0]:
+        for b in [-3.0, 1.0] + ([] if no_b_zero else [0.0]):
          self._equal(f([a,b], bop, dt), fxn(a,b))
 
   def _test_top_fxn(self, bop, fxn, dt=dtypes.float32):
     for f in [_test_single_value, _test_single_value_const]:
-      for a in [-2.0, 0, 1, 2.0]:
+      for a in [-2.0, 0, 1]:
       for b in [-3.0, 3.0]:
         for c in [-4.0, 4.0]:
           self._equal(f([a,b,c], bop, dt), fxn(a,b,c))
diff --git a/test/test_winograd.py b/test/test_winograd.py
index 50fdd2efc9..297bff87e1 100644
--- a/test/test_winograd.py
+++ b/test/test_winograd.py
@@ -1,5 +1,5 @@
 import unittest
-from tinygrad.helpers import Timing
+from tinygrad.helpers import Timing, CI
 from tinygrad.tensor import Tensor
 from tinygrad.ops import LoadOps
 from tinygrad.codegen.linearizer import Linearizer
@@ -31,9 +31,9 @@ class TestWinograd(unittest.TestCase):
 
   def test_profile(self):
     x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
-    pr = start_profile()
+    if not CI: pr = start_profile()
     out = Tensor.conv2d(x,w).realize()
-    stop_profile(pr, sort='time')
+    if not CI: stop_profile(pr, sort='time')
    out.numpy()
 
 if __name__ == '__main__':