diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 53398e2b4f..d2604827f9 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -91,8 +91,8 @@ jobs:
         python-version: 3.8
     - name: Install Dependencies
       run: pip install -e '.[llvm,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Run Pytest OPT=2
-      run: OPT=2 LLVM=1 python -m pytest -s -v -n=auto
+    - name: Run Pytest
+      run: LLVM=1 python -m pytest -s -v -n=auto
 
   testtorch:
     name: Torch Tests
@@ -132,9 +132,9 @@ jobs:
     - name: Install Dependencies
       run: pip install -e '.[gpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
     - name: Run Optimizer Test
-      run: OPT=2 GPU=1 python test/external_test_opt.py
+      run: GPU=1 python test/external_test_opt.py
     - name: Run Pytest (default)
-      run: OPT=1 GPU=1 python -m pytest -s -v -n=auto
+      run: GPU=1 python -m pytest -s -v -n=auto
 
   testopencl:
     name: openpilot (OpenCL) Test
diff --git a/test/test_conv_shapetracker.py b/test/test_conv_shapetracker.py
index cf3bdd392b..cedb6fd631 100644
--- a/test/test_conv_shapetracker.py
+++ b/test/test_conv_shapetracker.py
@@ -14,7 +14,7 @@ class TestConvShapetracker(unittest.TestCase):
     conv(inp).realize()
     test = GlobalCounters.cache
     GlobalCounters.cache = None
-    assert len(test) == 1
+    assert len(test) == 1, f"conv should only have one kernel {[x[0].clprg.name for x in test]}"
     print(test[0][0].clprg.prg)
     for arg in test[0][1]:
       print(arg.st)
diff --git a/tinygrad/nn/__init__.py b/tinygrad/nn/__init__.py
index 7117cc028f..178aed2350 100644
--- a/tinygrad/nn/__init__.py
+++ b/tinygrad/nn/__init__.py
@@ -44,7 +44,7 @@ class Conv2d:
     self.padding = (padding, ) * 4 if isinstance(padding, int) else ((padding[0], padding[0], padding[1], padding[1]) if len(padding) == 2 else padding)
     # TODO: why is this realize needed? shouldn't it realize on the first run?
     self.weight = Tensor.glorot_uniform(out_channels, in_channels, self.kernel_size[0], self.kernel_size[1]).realize()
-    self.bias = Tensor.zeros(out_channels) if bias else None
+    self.bias = Tensor.zeros(out_channels).contiguous().realize() if bias else None
 
   def __call__(self, x):
     return x.conv2d(self.weight, self.bias, padding=self.padding, stride=self.stride)
@@ -52,7 +52,7 @@ class Conv2d:
 class Linear:
   def __init__(self, in_features, out_features, bias=True):
     self.weight = Tensor.glorot_uniform(out_features, in_features).realize()
-    self.bias = Tensor.zeros(out_features) if bias else None
+    self.bias = Tensor.zeros(out_features).contiguous().realize() if bias else None  # NOTE(review): presumably materializes the zero bias as its own buffer at init — confirm and record why
 
   def __call__(self, x):
     return x.linear(self.weight.transpose(), self.bias)