diff --git a/test/test_mnist.py b/test/test_mnist.py
index c157e14ef8..8095bc2997 100644
--- a/test/test_mnist.py
+++ b/test/test_mnist.py
@@ -74,21 +74,21 @@ def evaluate(model):
   assert accuracy > 0.95
 
 class TestMNIST(unittest.TestCase):
-  def conv(self):
+  def test_conv(self):
     np.random.seed(1337)
     model = TinyConvNet()
     optimizer = optim.Adam([model.c1, model.l1, model.l2], lr=0.001)
     train(model, optimizer, steps=400)
     evaluate(model)
 
-  def sgd(self):
+  def test_sgd(self):
     np.random.seed(1337)
     model = TinyBobNet()
     optimizer = optim.SGD([model.l1, model.l2], lr=0.001)
     train(model, optimizer, steps=1000)
     evaluate(model)
 
-  def rmsprop(self):
+  def test_rmsprop(self):
     np.random.seed(1337)
     model = TinyBobNet()
     optimizer = optim.RMSprop([model.l1, model.l2], lr=0.0002)
diff --git a/tinygrad/gradcheck.py b/tinygrad/gradcheck.py
index 1c0ccf1155..b5f4c8093e 100644
--- a/tinygrad/gradcheck.py
+++ b/tinygrad/gradcheck.py
@@ -8,7 +8,7 @@ def jacobian(func, input):
 
   ji = input.data.reshape(-1).shape[-1]
   jo = output.data.reshape(-1).shape[-1]
-  J = np.zeros((jo,ji))
+  J = np.zeros((jo,ji), dtype=np.float32)
 
   for o in range(jo):
     # tinygrad doesn't support slicing, tiny-hack to select
@@ -25,7 +25,7 @@ def numerical_jacobian(func, input, eps = 1e-6):
 
   ji = input.data.reshape(-1).shape[-1]
   jo = output.data.reshape(-1).shape[-1]
-  NJ = np.zeros((jo, ji))
+  NJ = np.zeros((jo, ji), dtype=np.float32)
 
   for o in range(jo):
     for i in range(ji):
diff --git a/tinygrad/optim.py b/tinygrad/optim.py
index 40e3130fc8..b2bb5985f3 100644
--- a/tinygrad/optim.py
+++ b/tinygrad/optim.py
@@ -1,3 +1,5 @@
+# sorted in order of increasing complexity
+
 import numpy as np
 
 class Optimizer:
@@ -13,7 +15,20 @@ class SGD(Optimizer):
     for t in self.params:
       t.data -= self.lr * t.grad
 
-# 80% sure this is right?
+class RMSprop(Optimizer):
+  def __init__(self, params, lr=0.001, decay=0.9, eps=1e-8):
+    super(RMSprop, self).__init__(params)
+    self.lr = lr
+    self.decay = decay
+    self.eps = eps
+
+    self.v = [np.zeros_like(t.data) for t in self.params]
+
+  def step(self):
+    for i, t in enumerate(self.params):
+      self.v[i] = self.decay * self.v[i] + (1 - self.decay) * np.square(t.grad)
+      t.data -= self.lr / (np.sqrt(self.v[i]) + self.eps) * t.grad
+
 class Adam(Optimizer):
   def __init__(self, params, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
     super(Adam, self).__init__(params)
@@ -35,17 +50,3 @@ class Adam(Optimizer):
       vhat = self.v[i] / (1. - self.b2**self.t)
       t.data -= self.lr * mhat / (np.sqrt(vhat) + self.eps)
 
-# fill the 20% uncertainty of the above optim
-class RMSprop(Optimizer):
-  def __init__(self, params, lr=0.001, decay=0.9, eps=1e-8):
-    super(RMSprop, self).__init__(params)
-    self.lr = lr
-    self.decay = decay
-    self.eps = eps
-
-    self.v = [np.zeros_like(t.data) for t in self.params]
-
-  def step(self):
-    for i, t in enumerate(self.params):
-      self.v[i] = self.decay * self.v[i] + (1 - self.decay) * np.square(t.grad)
-      t.data -= self.lr / (np.sqrt(self.v[i]) + self.eps) * t.grad
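
For reference, the RMSprop.step() relocated above keeps a per-parameter running average of squared gradients (v) and scales each update by 1/(sqrt(v) + eps). Below is a minimal standalone sketch of that same rule on plain NumPy arrays; the names w, grad, and v are illustrative only and not part of the tinygrad API.

import numpy as np

# Sketch of the RMSprop rule from tinygrad/optim.py (illustrative names, not library code):
#   v <- decay * v + (1 - decay) * grad^2
#   w <- w - lr * grad / (sqrt(v) + eps)
lr, decay, eps = 0.001, 0.9, 1e-8
w = np.array([1.0, -2.0, 3.0], dtype=np.float32)     # parameters
grad = np.array([0.5, 0.1, -0.3], dtype=np.float32)  # gradients from backprop
v = np.zeros_like(w)                                  # running average of squared gradients

for _ in range(3):                                    # a few updates with a fixed gradient
  v = decay * v + (1 - decay) * np.square(grad)
  w -= lr / (np.sqrt(v) + eps) * grad
print(w)

In the library itself the optimizer is driven as in the renamed tests: construct it with the model's parameter tensors, e.g. optim.RMSprop([model.l1, model.l2], lr=0.0002), then call optimizer.step() after each backward pass inside train().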