diff --git a/docs/quickstart.md b/docs/quickstart.md
index ac26c42cec..e236f32516 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -115,13 +115,13 @@ class TinyNet:
     x = self.l1(x)
     x = x.leakyrelu()
     x = self.l2(x)
-    return x.log_softmax()
+    return x
 
 net = TinyNet()
 ```
 
 We can see that the forward pass of our neural network is just the sequence of operations performed on the input tensor `x`.
-We can also see that functional operations like `leakyrelu` and `log_softmax` are not defined as classes and instead are just methods we can just call.
+We can also see that functional operations like `leakyrelu` are not defined as classes and instead are just methods we can just call.
 Finally, we just initialize an instance of our neural network, and we are ready to start training it.
 
 ## Training
@@ -137,18 +137,18 @@ First we need to set the training flag in `Tensor`:
 Tensor.training = True
 ```
 
-For our loss function we will be using cross entropy loss.
+For our loss function we will be using sparse categorical cross entropy loss.
 
 ```python
-# from extra.training import sparse_categorical_crossentropy
-def cross_entropy(out, Y):
+# from tinygrad.tensor import sparse_categorical_crossentropy
+def sparse_categorical_crossentropy(out, Y, ignore_index=-1):
+  loss_mask = Y != ignore_index
   num_classes = out.shape[-1]
-  YY = Y.flatten().astype(np.int32)
-  y = np.zeros((YY.shape[0], num_classes), np.float32)
-  y[range(y.shape[0]),YY] = -1.0*num_classes
-  y = y.reshape(list(Y.shape)+[num_classes])
-  y = Tensor(y)
-  return out.mul(y).mean()
+  y_counter = Tensor.arange(num_classes, requires_grad=False).unsqueeze(0).expand(Y.numel(), num_classes)
+  y = (y_counter == Y.flatten().reshape(-1, 1)).where(-1.0, 0)
+  y = y * loss_mask.reshape(-1, 1)
+  y = y.reshape(*Y.shape, num_classes)
+  return out.log_softmax().mul(y).sum() / loss_mask.sum()
 ```
 
 As we can see in this implementation of cross entropy loss, there are certain operations that tinygrad does not support.
@@ -187,13 +187,13 @@ for step in range(1000):
   samp = np.random.randint(0, X_train.shape[0], size=(64))
   batch = Tensor(X_train[samp], requires_grad=False)
   # get the corresponding labels
-  labels = Y_train[samp]
+  labels = Tensor(Y_train[samp])
 
   # forward pass
   out = net(batch)
 
   # compute loss
-  loss = cross_entropy(out, labels)
+  loss = sparse_categorical_crossentropy(out, labels)
 
   # zero gradients
   opt.zero_grad()
diff --git a/examples/serious_mnist.py b/examples/serious_mnist.py
index c6baaad108..87c3937e95 100644
--- a/examples/serious_mnist.py
+++ b/examples/serious_mnist.py
@@ -8,7 +8,7 @@ from tinygrad.nn import BatchNorm2d, optim
 from tinygrad.helpers import getenv
 from extra.datasets import fetch_mnist
 from extra.augment import augment_img
-from extra.training import train, evaluate, sparse_categorical_crossentropy
+from extra.training import train, evaluate
 GPU = getenv("GPU")
 QUICK = getenv("QUICK")
 DEBUG = getenv("DEBUG")
@@ -93,7 +93,7 @@ class BigConvNet:
     x1 = x.avg_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
     x2 = x.max_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
     xo = x1.dot(self.weight1) + x2.dot(self.weight2)
-    return xo.log_softmax()
+    return xo
 
 
 if __name__ == "__main__":
@@ -102,7 +102,7 @@ if __name__ == "__main__":
   BS = 32
 
   lmbd = 0.00025
-  lossfn = lambda out,y: sparse_categorical_crossentropy(out, y) + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
+  lossfn = lambda out,y: out.sparse_categorical_crossentropy(y) + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
   X_train, Y_train, X_test, Y_test = fetch_mnist()
   X_train = X_train.reshape(-1, 28, 28).astype(np.uint8)
   X_test = X_test.reshape(-1, 28, 28).astype(np.uint8)
diff --git a/extra/training.py b/extra/training.py
index 6344390af6..8d8086ec79 100644
--- a/extra/training.py
+++ b/extra/training.py
@@ -3,24 +3,14 @@ from tqdm import trange
 from tinygrad.tensor import Tensor, Device
 from tinygrad.helpers import getenv
 
-def sparse_categorical_crossentropy(out, Y):
-  num_classes = out.shape[-1]
-  YY = Y.flatten().astype(np.int32)
-  y = np.zeros((YY.shape[0], num_classes), np.float32)
-  # correct loss for NLL, torch NLL loss returns one per row
-  y[range(y.shape[0]),YY] = -1.0*num_classes
-  y = y.reshape(list(Y.shape)+[num_classes])
-  y = Tensor(y)
-  return out.mul(y).mean()
-
-def train(model, X_train, Y_train, optim, steps, BS=128, lossfn=sparse_categorical_crossentropy,
+def train(model, X_train, Y_train, optim, steps, BS=128, lossfn=lambda out,y: out.sparse_categorical_crossentropy(y),
           transform=lambda x: x, target_transform=lambda x: x, noloss=False):
   Tensor.training = True
   losses, accuracies = [], []
   for i in (t := trange(steps, disable=getenv('CI', False))):
     samp = np.random.randint(0, X_train.shape[0], size=(BS))
     x = Tensor(transform(X_train[samp]), requires_grad=False)
-    y = target_transform(Y_train[samp])
+    y = Tensor(target_transform(Y_train[samp]))
 
     # network
     out = model.forward(x) if hasattr(model, 'forward') else model(x)
diff --git a/test/test_nn.py b/test/test_nn.py
index e84b2087e6..9c98875ffd 100755
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -12,6 +12,17 @@ import pytest
 pytestmark = [pytest.mark.exclude_cuda]
 
 class TestNN(unittest.TestCase):
+  def test_sparse_cat_cross_entropy(self):
+    input = torch.randn(3, 5)
+    target = torch.empty(3, dtype=torch.long).random_(5)
+    loss_fun = torch.nn.CrossEntropyLoss(reduction='mean')
+    loss = loss_fun(input, target)
+
+    input_tiny = Tensor(input.detach().numpy())
+    target_tiny = Tensor(target.detach().numpy())
+    loss_tiny = input_tiny.sparse_categorical_crossentropy(target_tiny)
+
+    np.testing.assert_allclose(loss_tiny.numpy(), loss.detach().numpy(), atol=1e-5, rtol=1e-6)
 
   def test_batchnorm2d(self, training=False):
     szs = [4, 8, 16, 32]
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 5e3824eaf1..cce984b8af 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -708,6 +708,12 @@ class Tensor:
     if attn_mask is not None and attn_mask.dtype == dtypes.bool: attn_mask = (attn_mask == 0).where(-float("inf"), attn_mask)
     return (self @ key.transpose(-2,-1) / sqrt(self.shape[-1]) + attn_mask).softmax(-1).dropout(dropout_p) @ value
 
+  def sparse_categorical_crossentropy(self, Y, ignore_index=-1) -> Tensor:
+    loss_mask = Y != ignore_index
+    y_counter = Tensor.arange(self.shape[-1], requires_grad=False).unsqueeze(0).expand(Y.numel(), self.shape[-1])
+    y = ((y_counter == Y.flatten().reshape(-1, 1)).where(-1.0, 0) * loss_mask.reshape(-1, 1)).reshape(*Y.shape, self.shape[-1])
+    return self.log_softmax().mul(y).sum() / loss_mask.sum()
+
   # ***** cast ops *****
   def cast(self, dtype:DType) -> Tensor: return mlops.Cast.apply(self, dtype=dtype) if self.dtype != dtype else self
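For reference, a minimal sketch of how the new `Tensor.sparse_categorical_crossentropy` added in `tinygrad/tensor.py` above is meant to be called. The batch size, class count, random logits, and example labels below are illustrative assumptions, not part of the change:

```python
import numpy as np
from tinygrad.tensor import Tensor

# logits for a batch of 4 examples over 10 classes (assumed shapes, random values)
out = Tensor(np.random.randn(4, 10).astype(np.float32))
# integer class labels; entries equal to ignore_index (-1 by default) are masked out of the mean
labels = Tensor(np.array([3, 7, 0, -1], dtype=np.int32))

loss = out.sparse_categorical_crossentropy(labels)  # scalar Tensor
print(loss.numpy())
```

This is the same call pattern the updated quickstart uses after `out = net(batch)`, and the one `extra/training.py` now relies on for its default `lossfn`.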