diff --git a/docs/tensor/ops.md b/docs/tensor/ops.md
index 2c04a11589..d613aad00e 100644
--- a/docs/tensor/ops.md
+++ b/docs/tensor/ops.md
@@ -44,3 +44,4 @@
 ::: tinygrad.Tensor.binary_crossentropy_logits
 ::: tinygrad.Tensor.sparse_categorical_crossentropy
 ::: tinygrad.Tensor.cross_entropy
+::: tinygrad.Tensor.nll_loss
diff --git a/test/test_ops.py b/test/test_ops.py
index d545fa97a2..e479d77ed6 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -2201,6 +2201,36 @@ class TestOps(unittest.TestCase):
       helper_test_op([(32,10), (32,10)], lambda x,y: torch.nn.functional.cross_entropy(x, y, label_smoothing=ls),
                                          lambda x,y: x.cross_entropy(y, label_smoothing=ls))
 
+  def test_nll_loss(self):
+    helper_test_op([(32,10), (32)], lambda x,y: torch.nn.functional.nll_loss(torch.nn.functional.log_softmax(x, dim=1),
+                                                                             torch.clip(y,0).type(torch.long)),
+                                    lambda x,y: x.log_softmax().nll_loss(y.clip(0).cast(dtypes.long)), forward_only=True)
+
+  def test_nll_loss_reductions(self):
+    for r in ("mean", "sum", "none"):
+      helper_test_op([(32,10), (32)], lambda x,y: torch.nn.functional.nll_loss(torch.nn.functional.log_softmax(x, dim=1),
+                                                                               torch.clip(y,0).type(torch.long), reduction=r),
+                                      lambda x,y: x.log_softmax().nll_loss(y.clip(0).cast(dtypes.long), reduction=r), forward_only=True)
+    self.helper_test_exception([(32,10), (32)], lambda x,y: torch.nn.functional.nll_loss(x, torch.clip(y,0).type(torch.long), reduction="typo"),
+                                                lambda x,y: x.nll_loss(y.clip(0).cast(dtypes.long), reduction="typo"), expected=ValueError)
+
+  def test_nll_loss_weight(self):
+    for r in ("mean", "sum", "none"):
+      helper_test_op([(32,10), (32), (10)], lambda x,y,z: torch.nn.functional.nll_loss(torch.nn.functional.log_softmax(x, dim=1),
+                                                                                       torch.clip(y,0).type(torch.long), weight=z, reduction=r),
+                                            lambda x,y,z: x.log_softmax().nll_loss(y.clip(0).cast(dtypes.long),
+                                                                                   weight=z, reduction=r), forward_only=True)
+
+  def test_nll_loss_ignore_index(self):
+    logits = [[2.0, 0.5, -1.0],
+              [1.5, 2.5, -0.5],
+              [0.0, -2.0, 1.0]]
+    targets = [0, 1, 2]
+    helper_test_op(None, lambda x,y: torch.nn.functional.nll_loss(torch.nn.functional.log_softmax(x, dim=1),
+                                                                  torch.clip(y,0).type(torch.long), ignore_index=1),
+                         lambda x,y: x.log_softmax().nll_loss(y.clip(0).cast(dtypes.long), ignore_index=1),
+                         forward_only=True, vals=[logits, targets])
+
   def test_one_hot(self):
     data = [1, 2, 4]
     helper_test_op([], lambda: torch.nn.functional.one_hot(torch.tensor(data), 6).type(torch.int32),
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 21b2c72592..b7c9c29e7b 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -3336,6 +3336,32 @@ class Tensor(SimpleMathTrait): # pylint: disable=abstract-method
     ret = -self.log_softmax(axis=1).mul(Y).sum(axis=1)
     return ret._do_reduction(reduction)
 
+  def nll_loss(self, Y:Tensor, weight:Optional[Tensor]=None, ignore_index:Optional[int]=None, reduction:ReductionStr="mean") -> Tensor:
+    """
+    Compute the negative log likelihood loss between log-probabilities and target labels.
+
+    NOTE: `self` is log-probabilities and `Y` is the target class labels.
+
+    See: https://pytorch.org/docs/stable/generated/torch.nn.functional.nll_loss.html
+
+    ```python exec="true" source="above" session="tensor" result="python"
+    t = Tensor([[-1, 2, -3], [1, -2, 3]])
+    Y = Tensor([1, 2])
+    print(t.log_softmax().nll_loss(Y).item())
+    ```
+    ```python exec="true" source="above" session="tensor" result="python"
+    t = Tensor([[-1, 2, -3], [1, -2, 3]])
+    Y = Tensor([1, 2])
+    print(t.log_softmax().nll_loss(Y, reduction='none').numpy())
+    ```
+    """
+    t, Y, target_shape = self.reshape(None, None, -1), Y.reshape(None, -1), Y.shape
+    mask = Tensor.ones_like(Y, requires_grad=False, device=t.device) if ignore_index is None else (Y != ignore_index)
+    masked_weight = mask if weight is None else weight[Y] * mask
+    ret = (-t.gather(1, Y.unsqueeze(1)).squeeze(1) * masked_weight).reshape(target_shape)
+    if reduction == "mean": return ret.sum() / masked_weight.sum()
+    return ret._do_reduction(reduction)
+
   # ***** Tensor Properties *****
 
   @property
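For reviewers trying the new API beyond the docstring examples, here is a minimal usage sketch of how `Tensor.nll_loss` composes with `log_softmax` and the `weight` / `ignore_index` / `reduction` arguments. It assumes a tinygrad checkout with this patch applied; the class count, logits, targets, and class weights below are illustrative values, not part of the change.

```python
# Usage sketch for the new Tensor.nll_loss (assumes this patch is applied).
# The logits, targets, and class weights here are illustrative only.
from tinygrad import Tensor, dtypes

# batch of 4 samples over 3 classes, plus integer class targets
logits = Tensor([[2.0, 0.5, -1.0],
                 [1.5, 2.5, -0.5],
                 [0.0, -2.0, 1.0],
                 [0.3, 0.3, 0.3]])
targets = Tensor([0, 1, 2, 1], dtype=dtypes.long)

# nll_loss expects log-probabilities, so apply log_softmax first
log_probs = logits.log_softmax()

print(log_probs.nll_loss(targets).item())                     # default "mean" reduction
print(log_probs.nll_loss(targets, reduction="sum").item())    # summed loss
print(log_probs.nll_loss(targets, reduction="none").numpy())  # per-sample losses

# per-class weights and an ignored label, mirroring the new tests
class_weights = Tensor([1.0, 2.0, 0.5])
print(log_probs.nll_loss(targets, weight=class_weights, ignore_index=1).item())
```

As in `torch.nn.functional.nll_loss`, the `mean` reduction divides by the sum of the masked per-sample weights rather than by the batch size, which is what the `masked_weight.sum()` denominator in the patch implements.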