diff --git a/docs/quickstart.md b/docs/quickstart.md
index ac26c42cec..e236f32516 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -115,13 +115,13 @@ class TinyNet:
     x = self.l1(x)
     x = x.leakyrelu()
     x = self.l2(x)
-    return x.log_softmax()
+    return x
 
 net = TinyNet()
 ```
 
 We can see that the forward pass of our neural network is just the sequence of operations performed on the input tensor `x`.
-We can also see that functional operations like `leakyrelu` and `log_softmax` are not defined as classes and instead are just methods we can just call.
+We can also see that functional operations like `leakyrelu` are not defined as classes and instead are just methods we can just call.
 Finally, we just initialize an instance of our neural network, and we are ready to start training it.
 
 ## Training
@@ -137,18 +137,18 @@ First we need to set the training flag in `Tensor`:
 Tensor.training = True
 ```
 
-For our loss function we will be using cross entropy loss.
+For our loss function we will be using sparse categorical cross entropy loss.
 
 ```python
-# from extra.training import sparse_categorical_crossentropy
-def cross_entropy(out, Y):
+# from tinygrad.tensor import sparse_categorical_crossentropy
+def sparse_categorical_crossentropy(out, Y, ignore_index=-1):
+  loss_mask = Y != ignore_index
   num_classes = out.shape[-1]
-  YY = Y.flatten().astype(np.int32)
-  y = np.zeros((YY.shape[0], num_classes), np.float32)
-  y[range(y.shape[0]),YY] = -1.0*num_classes
-  y = y.reshape(list(Y.shape)+[num_classes])
-  y = Tensor(y)
-  return out.mul(y).mean()
+  y_counter = Tensor.arange(num_classes, requires_grad=False).unsqueeze(0).expand(Y.numel(), num_classes)
+  y = (y_counter == Y.flatten().reshape(-1, 1)).where(-1.0, 0)
+  y = y * loss_mask.reshape(-1, 1)
+  y = y.reshape(*Y.shape, num_classes)
+  return out.log_softmax().mul(y).sum() / loss_mask.sum()
 ```
 
 As we can see in this implementation of cross entropy loss, there are certain operations that tinygrad does not support.
@@ -187,13 +187,13 @@ for step in range(1000):
   samp = np.random.randint(0, X_train.shape[0], size=(64))
   batch = Tensor(X_train[samp], requires_grad=False)
   # get the corresponding labels
-  labels = Y_train[samp]
+  labels = Tensor(Y_train[samp])
 
   # forward pass
   out = net(batch)
 
   # compute loss
-  loss = cross_entropy(out, labels)
+  loss = sparse_categorical_crossentropy(out, labels)
 
   # zero gradients
   opt.zero_grad()
diff --git a/examples/serious_mnist.py b/examples/serious_mnist.py
index c6baaad108..87c3937e95 100644
--- a/examples/serious_mnist.py
+++ b/examples/serious_mnist.py
@@ -8,7 +8,7 @@ from tinygrad.nn import BatchNorm2d, optim
 from tinygrad.helpers import getenv
 from extra.datasets import fetch_mnist
 from extra.augment import augment_img
-from extra.training import train, evaluate, sparse_categorical_crossentropy
+from extra.training import train, evaluate
 GPU = getenv("GPU")
 QUICK = getenv("QUICK")
 DEBUG = getenv("DEBUG")
@@ -93,7 +93,7 @@ class BigConvNet:
     x1 = x.avg_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
     x2 = x.max_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
     xo = x1.dot(self.weight1) + x2.dot(self.weight2)
-    return xo.log_softmax()
+    return xo
 
 
 if __name__ == "__main__":
@@ -102,7 +102,7 @@ if __name__ == "__main__":
   BS = 32
 
   lmbd = 0.00025
-  lossfn = lambda out,y: sparse_categorical_crossentropy(out, y) + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
+  lossfn = lambda out,y: out.sparse_categorical_crossentropy(y) + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
   X_train, Y_train, X_test, Y_test = fetch_mnist()
   X_train = X_train.reshape(-1, 28, 28).astype(np.uint8)
   X_test = X_test.reshape(-1, 28, 28).astype(np.uint8)
diff --git a/extra/training.py b/extra/training.py
index 6344390af6..8d8086ec79 100644
--- a/extra/training.py
+++ b/extra/training.py
@@ -3,24 +3,14 @@ from tqdm import trange
 from tinygrad.tensor import Tensor, Device
 from tinygrad.helpers import getenv
 
-def sparse_categorical_crossentropy(out, Y):
-  num_classes = out.shape[-1]
-  YY = Y.flatten().astype(np.int32)
-  y = np.zeros((YY.shape[0], num_classes), np.float32)
-  # correct loss for NLL, torch NLL loss returns one per row
-  y[range(y.shape[0]),YY] = -1.0*num_classes
-  y = y.reshape(list(Y.shape)+[num_classes])
-  y = Tensor(y)
-  return out.mul(y).mean()
-
-def train(model, X_train, Y_train, optim, steps, BS=128, lossfn=sparse_categorical_crossentropy,
+def train(model, X_train, Y_train, optim, steps, BS=128, lossfn=lambda out,y: out.sparse_categorical_crossentropy(y),
           transform=lambda x: x, target_transform=lambda x: x, noloss=False):
   Tensor.training = True
   losses, accuracies = [], []
   for i in (t := trange(steps, disable=getenv('CI', False))):
     samp = np.random.randint(0, X_train.shape[0], size=(BS))
     x = Tensor(transform(X_train[samp]), requires_grad=False)
-    y = target_transform(Y_train[samp])
+    y = Tensor(target_transform(Y_train[samp]))
 
     # network
     out = model.forward(x) if hasattr(model, 'forward') else model(x)
diff --git a/test/test_nn.py b/test/test_nn.py
index e84b2087e6..9c98875ffd 100755
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -12,6 +12,17 @@ import pytest
 pytestmark = [pytest.mark.exclude_cuda]
 
 class TestNN(unittest.TestCase):
+  def test_sparse_cat_cross_entropy(self):
+    input = torch.randn(3, 5)
+    target = torch.empty(3, dtype=torch.long).random_(5)
+    loss_fun = torch.nn.CrossEntropyLoss(reduction='mean')
+    loss = loss_fun(input, target)
+
+    input_tiny = Tensor(input.detach().numpy())
+    target_tiny = Tensor(target.detach().numpy())
+    loss_tiny = input_tiny.sparse_categorical_crossentropy(target_tiny)
+
+    np.testing.assert_allclose(loss_tiny.numpy(), loss.detach().numpy(), atol=1e-5, rtol=1e-6)
 
   def test_batchnorm2d(self, training=False):
     szs = [4, 8, 16, 32]
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 5e3824eaf1..cce984b8af 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -708,6 +708,12 @@ class Tensor:
     if attn_mask is not None and attn_mask.dtype == dtypes.bool: attn_mask = (attn_mask == 0).where(-float("inf"), attn_mask)
     return (self @ key.transpose(-2,-1) / sqrt(self.shape[-1]) + attn_mask).softmax(-1).dropout(dropout_p) @ value
 
+  def sparse_categorical_crossentropy(self, Y, ignore_index=-1) -> Tensor:
+    loss_mask = Y != ignore_index
+    y_counter = Tensor.arange(self.shape[-1], requires_grad=False).unsqueeze(0).expand(Y.numel(), self.shape[-1])
+    y = ((y_counter == Y.flatten().reshape(-1, 1)).where(-1.0, 0) * loss_mask.reshape(-1, 1)).reshape(*Y.shape, self.shape[-1])
+    return self.log_softmax().mul(y).sum() / loss_mask.sum()
+
   # ***** cast ops *****
   def cast(self, dtype:DType) -> Tensor: return mlops.Cast.apply(self, dtype=dtype) if self.dtype != dtype else self
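For reference, a minimal sketch of how the new `Tensor.sparse_categorical_crossentropy` added in `tinygrad/tensor.py` above is meant to be called. The batch size, class count, random logits, and example labels below are illustrative assumptions, not part of the change:

```python
import numpy as np
from tinygrad.tensor import Tensor

# logits for a batch of 4 examples over 10 classes (assumed shapes, random values)
out = Tensor(np.random.randn(4, 10).astype(np.float32))
# integer class labels; entries equal to ignore_index (-1 by default) are masked out of the mean
labels = Tensor(np.array([3, 7, 0, -1], dtype=np.int32))

loss = out.sparse_categorical_crossentropy(labels)  # scalar Tensor
print(loss.numpy())
```

This is the same call pattern the updated quickstart uses after `out = net(batch)`, and the one `extra/training.py` now relies on for its default `lossfn`.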