diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7378618db8..3f64f9cb08 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -17,9 +17,6 @@ jobs:
       with:
         python-version: 3.8
     - name: Install Dependencies
-      run: pip install ipython numpy tqdm requests torch
-    - name: Run mnist test
-      run: ipython3 test/mnist.py
-    - name: Run compare to torch test
-      run: ipython3 test/test.py
-
+      run: pip install pytest numpy tqdm requests torch  # pytest replaces ipython as the test runner
+    - name: Run Pytest  # single step replacing the separate mnist and compare-to-torch steps
+      run: python -m pytest -s -v  # -s: do not capture output, so print/progress output reaches the CI log; -v: one line per test
diff --git a/test/test_mnist.py b/test/test_mnist.py
index ec60d2bdef..da12322cc2 100644
--- a/test/test_mnist.py
+++ b/test/test_mnist.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 import os
+import unittest
 import numpy as np
 from tinygrad.tensor import Tensor
 from tinygrad.utils import layer_init_uniform, fetch_mnist
-import tinygrad.optim as optim
+import tinygrad.optim as tinygrad_optim
 from tqdm import trange
 np.random.seed(1337)
 
@@ -33,51 +34,57 @@ class TinyConvNet:
     x = x.conv2d(self.c1).reshape(Tensor(np.array((-1, 26*26*self.chans)))).relu()
     return x.dot(self.l1).relu().dot(self.l2).logsoftmax()
 
-if os.getenv("CONV") == "1":
-  model = TinyConvNet()
-  optim = optim.Adam([model.c1, model.l1, model.l2], lr=0.001)
-  steps = 400
-else:
-  model = TinyBobNet()
-  optim = optim.SGD([model.l1, model.l2], lr=0.001)
-  steps = 1000
 
-BS = 128
-losses, accuracies = [], []
-for i in (t := trange(steps)):
-  samp = np.random.randint(0, X_train.shape[0], size=(BS))
-  
-  x = Tensor(X_train[samp].reshape((-1, 28*28)).astype(np.float32))
-  Y = Y_train[samp]
-  y = np.zeros((len(samp),10), np.float32)
-  # correct loss for NLL, torch NLL loss returns one per row
-  y[range(y.shape[0]),Y] = -10.0
-  y = Tensor(y)
-  
-  # network
-  out = model.forward(x)
+class TestMNIST(unittest.TestCase):  # end-to-end MNIST training check; runnable via pytest or unittest
 
-  # NLL loss function
-  loss = out.mul(y).mean()
-  loss.backward()
-  optim.step()
-  
-  cat = np.argmax(out.data, axis=1)
-  accuracy = (cat == Y).mean()
-  
-  # printing
-  loss = loss.data
-  losses.append(loss)
-  accuracies.append(accuracy)
-  t.set_description("loss %.2f accuracy %.2f" % (loss, accuracy))
+  def test_mnist(self):  # train a small net on MNIST, then require > 0.95 accuracy on X_test / Y_test
+    if os.getenv("CONV") == "1":  # CONV=1 selects TinyConvNet + Adam; anything else uses TinyBobNet + SGD
+      model = TinyConvNet()
+      optim = tinygrad_optim.Adam([model.c1, model.l1, model.l2], lr=0.001)
+      steps = 400
+    else:
+      model = TinyBobNet()
+      optim = tinygrad_optim.SGD([model.l1, model.l2], lr=0.001)
+      steps = 1000
 
-# evaluate
-def numpy_eval():
-  Y_test_preds_out = model.forward(Tensor(X_test.reshape((-1, 28*28))))
-  Y_test_preds = np.argmax(Y_test_preds_out.data, axis=1)
-  return (Y_test == Y_test_preds).mean()
+    BS = 128  # mini-batch size
+    losses, accuracies = [], []  # per-step history; appended to below but not read again in this test
+    for i in (t := trange(steps)):  # keep the tqdm bar in `t` so its description can be updated each step
+      samp = np.random.randint(0, X_train.shape[0], size=(BS))  # BS random training indices, drawn with replacement
+      
+      x = Tensor(X_train[samp].reshape((-1, 28*28)).astype(np.float32))  # flatten each sample to 28*28 float32 values
+      Y = Y_train[samp]
+      y = np.zeros((len(samp),10), np.float32)
+      # NLL target mask: -10.0 at each true class; torch NLL loss returns one per row, so -10 presumably offsets .mean() averaging over all 10 columns
+      y[range(y.shape[0]),Y] = -10.0
+      y = Tensor(y)
+      
+      # forward pass
+      out = model.forward(x)
 
-accuracy = numpy_eval()
-print("test set accuracy is %f" % accuracy)
-assert accuracy > 0.95
+      # NLL loss: mask the model outputs with y, then average
+      loss = out.mul(y).mean()
+      loss.backward()
+      optim.step()
+      
+      cat = np.argmax(out.data, axis=1)  # predicted class per sample
+      accuracy = (cat == Y).mean()  # accuracy on this mini-batch only
+      
+      # record this step and show it on the progress bar
+      loss = loss.data  # rebind to the Tensor's underlying data for formatting
+      losses.append(loss)
+      accuracies.append(accuracy)
+      t.set_description("loss %.2f accuracy %.2f" % (loss, accuracy))
 
+    # evaluate on X_test / Y_test
+    def numpy_eval():  # fraction of test samples whose argmax prediction equals the label
+      Y_test_preds_out = model.forward(Tensor(X_test.reshape((-1, 28*28))))
+      Y_test_preds = np.argmax(Y_test_preds_out.data, axis=1)
+      return (Y_test == Y_test_preds).mean()
+
+    accuracy = numpy_eval()
+    print("test set accuracy is %f" % accuracy)
+    self.assertGreater(accuracy, 0.95)  # TestCase assertion: survives `python -O` and reports both values on failure
+
+if __name__ == '__main__':
+  unittest.main()  # lets this file be executed directly to run the tests without pytest