From 8ae9a054aed93e8ffeeff458b36f67bdbab7d650 Mon Sep 17 00:00:00 2001
From: Reza Rezvan
Date: Mon, 3 Jul 2023 00:07:30 +0200
Subject: [PATCH] Refactor nn.optim (#1091)

* Refactor: nn.optim.py

* Refactor: nn.optim.py; Fix all tests

* Refactor: Replace all optim.get_parameters()

* Refactor: Revert list comp.

* Refactor: Replace optim.get_state_dict

* Refactor: Change quickstart.md
---
 docs/quickstart.md                             |  2 +-
 examples/benchmark_train_efficientnet.py       |  3 ++-
 examples/deep_deterministic_policy_gradient.py | 13 +++++++------
 examples/hlb_cifar10.py                        |  7 ++++---
 examples/mnist_gan.py                          |  5 +++--
 examples/serious_mnist.py                      | 11 ++++++-----
 examples/train_efficientnet.py                 |  3 ++-
 examples/train_resnet.py                       |  3 ++-
 test/external/external_test_opt.py             | 10 +++++-----
 test/external/graph_batchnorm.py               |  3 ++-
 test/extra/test_lr_scheduler.py                |  3 ++-
 test/models/test_end2end.py                    |  5 +++--
 test/models/test_mnist.py                      |  5 +++--
 test/models/test_train.py                      |  3 ++-
 tinygrad/nn/optim.py                           |  4 ----
 15 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/docs/quickstart.md b/docs/quickstart.md
index f269f1fcd0..f6d430e59a 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -165,7 +165,7 @@ opt = SGD([net.l1.weight, net.l2.weight], lr=3e-4)
 
 We can see that we are passing in the parameters of our neural network to the optimizer.
 This is due to the fact that the optimizer needs to know which parameters to update.
-There is a simpler way to do this just by using `get_parameters(net)` from `tinygrad.nn.optim` which will return a list of all the parameters in the neural network.
+There is a simpler way to do this just by using `get_parameters(net)` from `tinygrad.state` which will return a list of all the parameters in the neural network.
 The parameters are just listed out explicitly here for clarity.
 
 Now that we have our network, loss function, and optimizer defined all we are missing is the data to train on!
diff --git a/examples/benchmark_train_efficientnet.py b/examples/benchmark_train_efficientnet.py
index 80d3f6c294..963823a487 100644
--- a/examples/benchmark_train_efficientnet.py
+++ b/examples/benchmark_train_efficientnet.py
@@ -3,6 +3,7 @@ import gc
 import time
 from tqdm import trange
 from models.efficientnet import EfficientNet
+from tinygrad.state import get_parameters
 from tinygrad.nn import optim
 from tinygrad.tensor import Tensor
 from tinygrad.ops import GlobalCounters
@@ -22,7 +23,7 @@ CLCACHE = getenv("CLCACHE", 0)
 if __name__ == "__main__":
   print(f"NUM:{NUM} BS:{BS} CNT:{CNT}")
   model = EfficientNet(NUM, classes=1000, has_se=False, track_running_stats=False)
-  parameters = optim.get_parameters(model)
+  parameters = get_parameters(model)
   for p in parameters: p.realize()
   if ADAM: optimizer = optim.Adam(parameters, lr=0.001)
   else: optimizer = optim.SGD(parameters, lr=0.001)
diff --git a/examples/deep_deterministic_policy_gradient.py b/examples/deep_deterministic_policy_gradient.py
index 30be069abd..5c5953a6d8 100644
--- a/examples/deep_deterministic_policy_gradient.py
+++ b/examples/deep_deterministic_policy_gradient.py
@@ -1,6 +1,7 @@
 from typing import Optional, Tuple
 from numpy.typing import NDArray
 
+from tinygrad.state import get_parameters
 from tinygrad.tensor import Tensor
 from tinygrad.nn import optim
 from tinygrad.helpers import getenv
@@ -152,10 +153,10 @@ class DeepDeterministicPolicyGradient:
     self.target_actor = Actor(self.num_actions, self.num_states, hidden_size)
     self.target_critic = Critic(self.num_actions + self.num_states, hidden_size)
 
-    actor_params = optim.get_parameters(self.actor)
-    critic_params = optim.get_parameters(self.critic)
-    target_actor_params = optim.get_parameters(self.target_actor)
-    target_critic_params = optim.get_parameters(self.target_critic)
+    actor_params = get_parameters(self.actor)
+    critic_params = get_parameters(self.critic)
+    target_actor_params = get_parameters(self.target_actor)
+    target_critic_params = get_parameters(self.target_critic)
 
     if DEVICE == "GPU":
       [x.gpu_() for x in actor_params + critic_params + target_actor_params + target_critic_params]
@@ -171,12 +172,12 @@ class DeepDeterministicPolicyGradient:
     tau = self.tau
 
     for param, target_param in zip(
-      optim.get_parameters(self.actor), optim.get_parameters(self.target_actor)
+      get_parameters(self.actor), get_parameters(self.target_actor)
     ):
       target_param.assign(param.detach() * tau + target_param * (1.0 - tau))
 
     for param, target_param in zip(
-      optim.get_parameters(self.critic), optim.get_parameters(self.target_critic)
+      get_parameters(self.critic), get_parameters(self.target_critic)
     ):
       target_param.assign(param.detach() * tau + target_param * (1.0 - tau))
 
diff --git a/examples/hlb_cifar10.py b/examples/hlb_cifar10.py
index 7c9c120c06..2ad5b9811e 100644
--- a/examples/hlb_cifar10.py
+++ b/examples/hlb_cifar10.py
@@ -7,6 +7,7 @@ import time
 import numpy as np
 from datasets import fetch_cifar
 from tinygrad import nn
+from tinygrad.state import get_parameters, get_state_dict
 from tinygrad.nn import optim
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import getenv
@@ -89,7 +90,7 @@ def train_cifar():
   if getenv("TORCHWEIGHTS"):
     from examples.hlb_cifar10_torch import SpeedyResNet as SpeedyResNetTorch
     torch_model = SpeedyResNetTorch()
-    model_state_dict = optim.get_state_dict(model)
+    model_state_dict = get_state_dict(model)
     torch_state_dict = torch_model.state_dict()
     for k,v in torch_state_dict.items():
       old_mean_std = model_state_dict[k].mean().numpy(), model_state_dict[k].std().numpy()
@@ -99,9 +100,9 @@ def train_cifar():
     exit(0)
 
   if getenv("ADAM"):
-    optimizer = optim.Adam(optim.get_parameters(model), lr=Tensor([0.001]).realize())
+    optimizer = optim.Adam(get_parameters(model), lr=Tensor([0.001]).realize())
   else:
-    optimizer = optim.SGD(optim.get_parameters(model), lr=0.01, momentum=0.85, nesterov=True)
+    optimizer = optim.SGD(get_parameters(model), lr=0.01, momentum=0.85, nesterov=True)
 
   # 97 steps in 2 seconds = 20ms / step
   # step is 1163.42 GOPS = 56 TFLOPS!!!, 41% of max 136
diff --git a/examples/mnist_gan.py b/examples/mnist_gan.py
index d77eb116b0..3ef893da30 100644
--- a/examples/mnist_gan.py
+++ b/examples/mnist_gan.py
@@ -3,6 +3,7 @@ import numpy as np
 from tqdm import trange
 import torch
 from torchvision.utils import make_grid, save_image
+from tinygrad.state import get_parameters
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import getenv
 from tinygrad.nn import optim
@@ -84,8 +85,8 @@ if __name__ == "__main__":
   output_dir = Path(".").resolve() / "outputs"
   output_dir.mkdir(exist_ok=True)
   # optimizers
-  optim_g = optim.Adam(optim.get_parameters(generator),lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
-  optim_d = optim.Adam(optim.get_parameters(discriminator),lr=0.0002, b1=0.5)
+  optim_g = optim.Adam(get_parameters(generator),lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
+  optim_d = optim.Adam(get_parameters(discriminator),lr=0.0002, b1=0.5)
   # training loop
   for epoch in (t := trange(epochs)):
     loss_g, loss_d = 0.0, 0.0
diff --git a/examples/serious_mnist.py b/examples/serious_mnist.py
index 9a1318d93d..f5697ad5f9 100644
--- a/examples/serious_mnist.py
+++ b/examples/serious_mnist.py
@@ -2,6 +2,7 @@
 #inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
 import sys
 import numpy as np
+from tinygrad.state import get_parameters
 from tinygrad.tensor import Tensor
 from tinygrad.nn import BatchNorm2d, optim
 from tinygrad.helpers import getenv
@@ -57,7 +58,7 @@ class BigConvNet:
 
   def parameters(self):
     if DEBUG: #keeping this for a moment
-      pars = [par for par in optim.get_parameters(self) if par.requires_grad]
+      pars = [par for par in get_parameters(self) if par.requires_grad]
       no_pars = 0
       for par in pars:
         print(par.shape)
@@ -65,17 +66,17 @@ class BigConvNet:
       print('no of parameters', no_pars)
       return pars
     else:
-      return optim.get_parameters(self)
+      return get_parameters(self)
 
   def save(self, filename):
     with open(filename+'.npy', 'wb') as f:
-      for par in optim.get_parameters(self):
+      for par in get_parameters(self):
        #if par.requires_grad:
        np.save(f, par.cpu().numpy())
 
   def load(self, filename):
     with open(filename+'.npy', 'rb') as f:
-      for par in optim.get_parameters(self):
+      for par in get_parameters(self):
        #if par.requires_grad:
        try:
          par.cpu().numpy()[:] = np.load(f)
@@ -122,7 +123,7 @@ if __name__ == "__main__":
       print('could not load weights "'+sys.argv[1]+'".')
 
   if GPU:
-    params = optim.get_parameters(model)
+    params = get_parameters(model)
     [x.gpu_() for x in params]
 
   for lr, epochs in zip(lrs, epochss):
diff --git a/examples/train_efficientnet.py b/examples/train_efficientnet.py
index c13746404e..b53767fefc 100644
--- a/examples/train_efficientnet.py
+++ b/examples/train_efficientnet.py
@@ -3,6 +3,7 @@ import time
 from multiprocessing import Process, Queue
 import numpy as np
 from tqdm import trange
+from tinygrad.state import get_parameters
 from tinygrad.nn import optim
 from tinygrad.helpers import getenv
 from tinygrad.tensor import Tensor
@@ -37,7 +38,7 @@ if __name__ == "__main__":
   else:
     model = EfficientNet(getenv("NUM", 0), classes, has_se=False)
 
-  parameters = optim.get_parameters(model)
+  parameters = get_parameters(model)
   print("parameter count", len(parameters))
   optimizer = optim.Adam(parameters, lr=0.001)
 
diff --git a/examples/train_resnet.py b/examples/train_resnet.py
index 970eeba333..e6c63c0c23 100755
--- a/examples/train_resnet.py
+++ b/examples/train_resnet.py
@@ -2,6 +2,7 @@
 import numpy as np
 from PIL import Image
 
+from tinygrad.state import get_parameters
 from tinygrad.nn import optim
 from tinygrad.helpers import getenv
 from extra.training import train, evaluate
@@ -37,7 +38,7 @@ if __name__ == "__main__":
     lambda x: np.tile(np.expand_dims(x, 1), (1, 3, 1, 1)).astype(np.float32),
   ])
   for _ in range(5):
-    optimizer = optim.SGD(optim.get_parameters(model), lr=lr, momentum=0.9)
+    optimizer = optim.SGD(get_parameters(model), lr=lr, momentum=0.9)
     train(model, X_train, Y_train, optimizer, 100, BS=32, transform=transform)
     evaluate(model, X_test, Y_test, num_classes=classes, transform=transform)
     lr /= 1.2
diff --git a/test/external/external_test_opt.py b/test/external/external_test_opt.py
index 4ac2f6557b..89a682add9 100644
--- a/test/external/external_test_opt.py
+++ b/test/external/external_test_opt.py
@@ -34,7 +34,7 @@ from models.convnext import ConvNeXt
 from models.efficientnet import EfficientNet
 from models.resnet import ResNet18
 from models.vit import ViT
-from tinygrad.nn.optim import get_parameters
+from tinygrad.state import get_parameters
 
 @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
 class TestInferenceMinKernels(unittest.TestCase):
@@ -182,7 +182,7 @@ class TestOpt(unittest.TestCase):
     Tensor.training = True
     img = Tensor.ones(2,3,4,4)
     c1 = nn.Conv2d(3,32,3)
-    opt = optim.SGD(optim.get_parameters(c1))
+    opt = optim.SGD(get_parameters(c1))
     with CLCache():
       opt.zero_grad()
       c1(img).relu().sum().backward()
@@ -199,7 +199,7 @@ class TestOpt(unittest.TestCase):
     img = Tensor.ones(2,3,64,64)
     c1 = nn.Conv2d(3,16,3,bias=False)
     c2 = nn.Conv2d(16,32,3,bias=False)
-    opt = optim.SGD(optim.get_parameters([c1, c2]))
+    opt = optim.SGD(get_parameters([c1, c2]))
     with CLCache(allowed=9):
       opt.zero_grad()
       c2(c1(img).relu()).relu().sum().backward()
@@ -214,7 +214,7 @@ class TestOpt(unittest.TestCase):
     c2 = nn.Conv2d(4,8,3,bias=False)
     c3 = nn.Conv2d(8,16,3,bias=False)
     c4 = nn.Conv2d(16,32,3,bias=False)
-    opt = optim.SGD(optim.get_parameters([c1, c2, c3, c4]))
+    opt = optim.SGD(get_parameters([c1, c2, c3, c4]))
     with CLCache(allowed=19):
       opt.zero_grad()
       c4(c3(c2(c1(img).relu()).relu()).relu()).relu().sum().backward()
@@ -227,7 +227,7 @@ class TestOpt(unittest.TestCase):
     img = Tensor.ones(1,3,4,4)
     c1 = nn.Conv2d(3,32,3)
     bn = nn.BatchNorm2d(32, track_running_stats=False)
-    opt = optim.SGD(optim.get_parameters([c1, bn]))
+    opt = optim.SGD(get_parameters([c1, bn]))
     with CLCache(allowed=18): # this is too high
       img_bn = bn(c1(img)).elu().sum()
       opt.zero_grad()
diff --git a/test/external/graph_batchnorm.py b/test/external/graph_batchnorm.py
index 08343c0070..2fa98b05c8 100644
--- a/test/external/graph_batchnorm.py
+++ b/test/external/graph_batchnorm.py
@@ -1,11 +1,12 @@
 import unittest
+from tinygrad.state import get_parameters
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Conv2d, BatchNorm2d, optim
 
 def model_step(lm):
   Tensor.training = True
   x = Tensor.ones(8,12,128,256, requires_grad=False)
-  optimizer = optim.SGD(optim.get_parameters(lm), lr=0.001)
+  optimizer = optim.SGD(get_parameters(lm), lr=0.001)
   loss = lm.forward(x).sum()
   optimizer.zero_grad()
   loss.backward()
diff --git a/test/extra/test_lr_scheduler.py b/test/extra/test_lr_scheduler.py
index c06a823c56..60d6fbe7cb 100644
--- a/test/extra/test_lr_scheduler.py
+++ b/test/extra/test_lr_scheduler.py
@@ -2,7 +2,8 @@ import numpy as np
 import torch
 import unittest
 from tinygrad.tensor import Tensor
-from tinygrad.nn.optim import Adam, get_parameters
+from tinygrad.state import get_parameters
+from tinygrad.nn.optim import Adam
 from extra.lr_scheduler import MultiStepLR, ReduceLROnPlateau, CosineAnnealingLR
 from extra.training import train, evaluate
 from datasets import fetch_mnist
diff --git a/test/models/test_end2end.py b/test/models/test_end2end.py
index da326118e8..b9021343d1 100644
--- a/test/models/test_end2end.py
+++ b/test/models/test_end2end.py
@@ -2,6 +2,7 @@ import torch
 from torch import nn
 import unittest
 import numpy as np
+from tinygrad.state import get_parameters, get_state_dict
 from tinygrad.nn import optim, Linear, Conv2d, BatchNorm2d
 from tinygrad.tensor import Tensor
 from datasets import fetch_mnist
@@ -9,12 +10,12 @@ from datasets import fetch_mnist
 def compare_tiny_torch(model, model_torch, X, Y):
   Tensor.training = True
   model_torch.train()
-  model_state_dict = optim.get_state_dict(model)
+  model_state_dict = get_state_dict(model)
   for k,v in model_torch.named_parameters():
     print(f"initting {k} from torch")
     model_state_dict[k].assign(Tensor(v.detach().numpy())).realize()
 
-  optimizer = optim.SGD(optim.get_parameters(model), lr=0.01)
+  optimizer = optim.SGD(get_parameters(model), lr=0.01)
   optimizer_torch = torch.optim.SGD(model_torch.parameters(), lr=0.01)
 
   Xt = torch.Tensor(X.numpy())
diff --git a/test/models/test_mnist.py b/test/models/test_mnist.py
index ec1dd8554a..b47aacb192 100644
--- a/test/models/test_mnist.py
+++ b/test/models/test_mnist.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import unittest
 import numpy as np
+from tinygrad.state import get_parameters
 from tinygrad.tensor import Tensor, Device
 from tinygrad.nn import optim, BatchNorm2d
 from extra.training import train, evaluate
@@ -16,7 +17,7 @@ class TinyBobNet:
     self.l2 = Tensor.scaled_uniform(128, 10)
 
   def parameters(self):
-    return optim.get_parameters(self)
+    return get_parameters(self)
 
   def forward(self, x):
     return x.dot(self.l1).relu().dot(self.l2).log_softmax()
@@ -38,7 +39,7 @@ class TinyConvNet:
     self.bn1, self.bn2 = lambda x: x, lambda x: x
 
   def parameters(self):
-    return optim.get_parameters(self)
+    return get_parameters(self)
 
   def forward(self, x:Tensor):
     x = x.reshape(shape=(-1, 1, 28, 28)) # hacks
diff --git a/test/models/test_train.py b/test/models/test_train.py
index e0c2778dd8..c3280611be 100644
--- a/test/models/test_train.py
+++ b/test/models/test_train.py
@@ -1,6 +1,7 @@
 import unittest
 import time
 import numpy as np
+from tinygrad.state import get_parameters
 from tinygrad.nn import optim
 from tinygrad.tensor import Device
 from tinygrad.helpers import getenv
@@ -14,7 +15,7 @@ from models.resnet import ResNet18
 BS = getenv("BS", 2)
 
 def train_one_step(model,X,Y):
-  params = optim.get_parameters(model)
+  params = get_parameters(model)
   pcount = 0
   for p in params:
     pcount += np.prod(p.shape)
diff --git a/tinygrad/nn/optim.py b/tinygrad/nn/optim.py
index e20795e61a..c4f6202182 100644
--- a/tinygrad/nn/optim.py
+++ b/tinygrad/nn/optim.py
@@ -67,7 +67,3 @@ class LAMB(Optimizer):
         r = 1.0
       t.assign(t.detach() - self.lr * r * up)
     self.realize([self.t] + self.m + self.v)
-
-# TODO: remove this
-from tinygrad.state import get_state_dict, get_parameters # pylint: disable=unused-import # noqa: F401
-
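
After this patch, `get_parameters` and `get_state_dict` come from `tinygrad.state` rather than `tinygrad.nn.optim`. A minimal sketch of the new import path, reusing the TinyBobNet model from test/models/test_mnist.py above (the learning rate is an arbitrary illustrative value, not something this patch prescribes):

from tinygrad.state import get_parameters
from tinygrad.nn import optim
from tinygrad.tensor import Tensor

class TinyBobNet:
  def __init__(self):
    self.l1 = Tensor.scaled_uniform(784, 128)
    self.l2 = Tensor.scaled_uniform(128, 10)

  def forward(self, x):
    return x.dot(self.l1).relu().dot(self.l2).log_softmax()

model = TinyBobNet()
# get_parameters(model) walks the object's attributes and collects every Tensor it finds,
# so the parameter list no longer has to be written out by hand or pulled from optim.
optimizer = optim.SGD(get_parameters(model), lr=0.01)  # lr chosen only for illustration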