Refactor nn.optim (#1091)

* Refactor: nn.optim.py

* Refactor: nn.optim.py; Fix all tests

* Refactor: Replace all optim.get_parameters()

* Refactor: Revert list comp.

* Refactor: Replace optim.get_state_dict

* Refactor: Change quickstart.md
Author: Reza Rezvan
Committed by: GitHub
Date: 2023-07-03 00:07:30 +02:00
Parent: 10f1aeb144
Commit: 8ae9a054ae
15 changed files with 44 additions and 36 deletions
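
In short, the tensor-walking helpers `get_parameters` and `get_state_dict` move from `tinygrad.nn.optim` to `tinygrad.state`: every call site below is updated and the temporary re-export at the bottom of the optim module is dropped. A minimal sketch of the new usage (illustrative only, not code taken from this diff):

from tinygrad.nn import Linear, optim
from tinygrad.state import get_parameters, get_state_dict  # new home of the helpers

layers = [Linear(784, 128), Linear(128, 10)]  # any object, list, or dict holding Tensors works
params = get_parameters(layers)               # previously: optim.get_parameters(layers)
state = get_state_dict(layers)                # previously: optim.get_state_dict(layers)
opt = optim.SGD(params, lr=0.01)              # the optimizer classes themselves stay in tinygrad.nn.optim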


@@ -165,7 +165,7 @@ opt = SGD([net.l1.weight, net.l2.weight], lr=3e-4)
We can see that we are passing in the parameters of our neural network to the optimizer.
This is due to the fact that the optimizer needs to know which parameters to update.
-There is a simpler way to do this just by using `get_parameters(net)` from `tinygrad.nn.optim` which will return a list of all the parameters in the neural network.
+There is a simpler way to do this just by using `get_parameters(net)` from `tinygrad.state` which will return a list of all the parameters in the neural network.
The parameters are just listed out explicitly here for clarity.
Now that we have our network, loss function, and optimizer defined all we are missing is the data to train on!
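
A compact sketch of how those pieces fit together with `get_parameters` imported from `tinygrad.state` (the network, the fake batch, and the loss below are illustrative stand-ins for the guide's versions, not its exact code):

import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.nn import Linear
from tinygrad.nn.optim import SGD
from tinygrad.state import get_parameters

class Net:  # stand-in for the guide's network
  def __init__(self):
    self.l1, self.l2 = Linear(784, 128), Linear(128, 10)
  def __call__(self, x: Tensor) -> Tensor:
    return self.l2(self.l1(x).relu()).log_softmax()

net = Net()
opt = SGD(get_parameters(net), lr=3e-4)  # no need to list net.l1.weight, net.l2.weight, ... by hand

x = Tensor.randn(32, 784)                                                            # fake input batch
y_onehot = Tensor(np.eye(10, dtype=np.float32)[np.random.randint(0, 10, size=32)])   # fake one-hot targets
loss = -(net(x) * y_onehot).sum(axis=1).mean()                                       # simple NLL on the log-probs

opt.zero_grad()
loss.backward()
opt.step()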


@@ -3,6 +3,7 @@ import gc
import time
from tqdm import trange
from models.efficientnet import EfficientNet
+from tinygrad.state import get_parameters
from tinygrad.nn import optim
from tinygrad.tensor import Tensor
from tinygrad.ops import GlobalCounters
@@ -22,7 +23,7 @@ CLCACHE = getenv("CLCACHE", 0)
if __name__ == "__main__":
print(f"NUM:{NUM} BS:{BS} CNT:{CNT}")
model = EfficientNet(NUM, classes=1000, has_se=False, track_running_stats=False)
-parameters = optim.get_parameters(model)
+parameters = get_parameters(model)
for p in parameters: p.realize()
if ADAM: optimizer = optim.Adam(parameters, lr=0.001)
else: optimizer = optim.SGD(parameters, lr=0.001)


@@ -1,6 +1,7 @@
from typing import Optional, Tuple
from numpy.typing import NDArray
+from tinygrad.state import get_parameters
from tinygrad.tensor import Tensor
from tinygrad.nn import optim
from tinygrad.helpers import getenv
@@ -152,10 +153,10 @@ class DeepDeterministicPolicyGradient:
self.target_actor = Actor(self.num_actions, self.num_states, hidden_size)
self.target_critic = Critic(self.num_actions + self.num_states, hidden_size)
-actor_params = optim.get_parameters(self.actor)
-critic_params = optim.get_parameters(self.critic)
-target_actor_params = optim.get_parameters(self.target_actor)
-target_critic_params = optim.get_parameters(self.target_critic)
+actor_params = get_parameters(self.actor)
+critic_params = get_parameters(self.critic)
+target_actor_params = get_parameters(self.target_actor)
+target_critic_params = get_parameters(self.target_critic)
if DEVICE == "GPU":
[x.gpu_() for x in actor_params + critic_params + target_actor_params + target_critic_params]
@@ -171,12 +172,12 @@ class DeepDeterministicPolicyGradient:
tau = self.tau
for param, target_param in zip(
-optim.get_parameters(self.actor), optim.get_parameters(self.target_actor)
+get_parameters(self.actor), get_parameters(self.target_actor)
):
target_param.assign(param.detach() * tau + target_param * (1.0 - tau))
for param, target_param in zip(
-optim.get_parameters(self.critic), optim.get_parameters(self.target_critic)
+get_parameters(self.critic), get_parameters(self.target_critic)
):
target_param.assign(param.detach() * tau + target_param * (1.0 - tau))


@@ -7,6 +7,7 @@ import time
import numpy as np
from datasets import fetch_cifar
from tinygrad import nn
+from tinygrad.state import get_parameters, get_state_dict
from tinygrad.nn import optim
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv
@@ -89,7 +90,7 @@ def train_cifar():
if getenv("TORCHWEIGHTS"):
from examples.hlb_cifar10_torch import SpeedyResNet as SpeedyResNetTorch
torch_model = SpeedyResNetTorch()
-model_state_dict = optim.get_state_dict(model)
+model_state_dict = get_state_dict(model)
torch_state_dict = torch_model.state_dict()
for k,v in torch_state_dict.items():
old_mean_std = model_state_dict[k].mean().numpy(), model_state_dict[k].std().numpy()
@@ -99,9 +100,9 @@ def train_cifar():
exit(0)
if getenv("ADAM"):
-optimizer = optim.Adam(optim.get_parameters(model), lr=Tensor([0.001]).realize())
+optimizer = optim.Adam(get_parameters(model), lr=Tensor([0.001]).realize())
else:
-optimizer = optim.SGD(optim.get_parameters(model), lr=0.01, momentum=0.85, nesterov=True)
+optimizer = optim.SGD(get_parameters(model), lr=0.01, momentum=0.85, nesterov=True)
# 97 steps in 2 seconds = 20ms / step
# step is 1163.42 GOPS = 56 TFLOPS!!!, 41% of max 136


@@ -3,6 +3,7 @@ import numpy as np
from tqdm import trange
import torch
from torchvision.utils import make_grid, save_image
+from tinygrad.state import get_parameters
from tinygrad.tensor import Tensor
from tinygrad.helpers import getenv
from tinygrad.nn import optim
@@ -84,8 +85,8 @@ if __name__ == "__main__":
output_dir = Path(".").resolve() / "outputs"
output_dir.mkdir(exist_ok=True)
# optimizers
-optim_g = optim.Adam(optim.get_parameters(generator),lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
-optim_d = optim.Adam(optim.get_parameters(discriminator),lr=0.0002, b1=0.5)
+optim_g = optim.Adam(get_parameters(generator),lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
+optim_d = optim.Adam(get_parameters(discriminator),lr=0.0002, b1=0.5)
# training loop
for epoch in (t := trange(epochs)):
loss_g, loss_d = 0.0, 0.0


@@ -2,6 +2,7 @@
#inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
import sys
import numpy as np
+from tinygrad.state import get_parameters
from tinygrad.tensor import Tensor
from tinygrad.nn import BatchNorm2d, optim
from tinygrad.helpers import getenv
@@ -57,7 +58,7 @@ class BigConvNet:
def parameters(self):
if DEBUG: #keeping this for a moment
-pars = [par for par in optim.get_parameters(self) if par.requires_grad]
+pars = [par for par in get_parameters(self) if par.requires_grad]
no_pars = 0
for par in pars:
print(par.shape)
@@ -65,17 +66,17 @@ class BigConvNet:
print('no of parameters', no_pars)
return pars
else:
-return optim.get_parameters(self)
+return get_parameters(self)
def save(self, filename):
with open(filename+'.npy', 'wb') as f:
-for par in optim.get_parameters(self):
+for par in get_parameters(self):
#if par.requires_grad:
np.save(f, par.cpu().numpy())
def load(self, filename):
with open(filename+'.npy', 'rb') as f:
-for par in optim.get_parameters(self):
+for par in get_parameters(self):
#if par.requires_grad:
try:
par.cpu().numpy()[:] = np.load(f)
@@ -122,7 +123,7 @@ if __name__ == "__main__":
print('could not load weights "'+sys.argv[1]+'".')
if GPU:
-params = optim.get_parameters(model)
+params = get_parameters(model)
[x.gpu_() for x in params]
for lr, epochs in zip(lrs, epochss):


@@ -3,6 +3,7 @@ import time
from multiprocessing import Process, Queue
import numpy as np
from tqdm import trange
+from tinygrad.state import get_parameters
from tinygrad.nn import optim
from tinygrad.helpers import getenv
from tinygrad.tensor import Tensor
@@ -37,7 +38,7 @@ if __name__ == "__main__":
else:
model = EfficientNet(getenv("NUM", 0), classes, has_se=False)
-parameters = optim.get_parameters(model)
+parameters = get_parameters(model)
print("parameter count", len(parameters))
optimizer = optim.Adam(parameters, lr=0.001)


@@ -2,6 +2,7 @@
import numpy as np
from PIL import Image
+from tinygrad.state import get_parameters
from tinygrad.nn import optim
from tinygrad.helpers import getenv
from extra.training import train, evaluate
@@ -37,7 +38,7 @@ if __name__ == "__main__":
lambda x: np.tile(np.expand_dims(x, 1), (1, 3, 1, 1)).astype(np.float32),
])
for _ in range(5):
-optimizer = optim.SGD(optim.get_parameters(model), lr=lr, momentum=0.9)
+optimizer = optim.SGD(get_parameters(model), lr=lr, momentum=0.9)
train(model, X_train, Y_train, optimizer, 100, BS=32, transform=transform)
evaluate(model, X_test, Y_test, num_classes=classes, transform=transform)
lr /= 1.2


@@ -34,7 +34,7 @@ from models.convnext import ConvNeXt
from models.efficientnet import EfficientNet
from models.resnet import ResNet18
from models.vit import ViT
-from tinygrad.nn.optim import get_parameters
+from tinygrad.state import get_parameters
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
class TestInferenceMinKernels(unittest.TestCase):
@@ -182,7 +182,7 @@ class TestOpt(unittest.TestCase):
Tensor.training = True
img = Tensor.ones(2,3,4,4)
c1 = nn.Conv2d(3,32,3)
-opt = optim.SGD(optim.get_parameters(c1))
+opt = optim.SGD(get_parameters(c1))
with CLCache():
opt.zero_grad()
c1(img).relu().sum().backward()
@@ -199,7 +199,7 @@ class TestOpt(unittest.TestCase):
img = Tensor.ones(2,3,64,64)
c1 = nn.Conv2d(3,16,3,bias=False)
c2 = nn.Conv2d(16,32,3,bias=False)
-opt = optim.SGD(optim.get_parameters([c1, c2]))
+opt = optim.SGD(get_parameters([c1, c2]))
with CLCache(allowed=9):
opt.zero_grad()
c2(c1(img).relu()).relu().sum().backward()
@@ -214,7 +214,7 @@ class TestOpt(unittest.TestCase):
c2 = nn.Conv2d(4,8,3,bias=False)
c3 = nn.Conv2d(8,16,3,bias=False)
c4 = nn.Conv2d(16,32,3,bias=False)
-opt = optim.SGD(optim.get_parameters([c1, c2, c3, c4]))
+opt = optim.SGD(get_parameters([c1, c2, c3, c4]))
with CLCache(allowed=19):
opt.zero_grad()
c4(c3(c2(c1(img).relu()).relu()).relu()).relu().sum().backward()
@@ -227,7 +227,7 @@ class TestOpt(unittest.TestCase):
img = Tensor.ones(1,3,4,4)
c1 = nn.Conv2d(3,32,3)
bn = nn.BatchNorm2d(32, track_running_stats=False)
-opt = optim.SGD(optim.get_parameters([c1, bn]))
+opt = optim.SGD(get_parameters([c1, bn]))
with CLCache(allowed=18): # this is too high
img_bn = bn(c1(img)).elu().sum()
opt.zero_grad()


@@ -1,11 +1,12 @@
import unittest
+from tinygrad.state import get_parameters
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d, BatchNorm2d, optim
def model_step(lm):
Tensor.training = True
x = Tensor.ones(8,12,128,256, requires_grad=False)
-optimizer = optim.SGD(optim.get_parameters(lm), lr=0.001)
+optimizer = optim.SGD(get_parameters(lm), lr=0.001)
loss = lm.forward(x).sum()
optimizer.zero_grad()
loss.backward()


@@ -2,7 +2,8 @@ import numpy as np
import torch
import unittest
from tinygrad.tensor import Tensor
-from tinygrad.nn.optim import Adam, get_parameters
+from tinygrad.state import get_parameters
+from tinygrad.nn.optim import Adam
from extra.lr_scheduler import MultiStepLR, ReduceLROnPlateau, CosineAnnealingLR
from extra.training import train, evaluate
from datasets import fetch_mnist


@@ -2,6 +2,7 @@ import torch
from torch import nn
import unittest
import numpy as np
+from tinygrad.state import get_parameters, get_state_dict
from tinygrad.nn import optim, Linear, Conv2d, BatchNorm2d
from tinygrad.tensor import Tensor
from datasets import fetch_mnist
@@ -9,12 +10,12 @@ from datasets import fetch_mnist
def compare_tiny_torch(model, model_torch, X, Y):
Tensor.training = True
model_torch.train()
-model_state_dict = optim.get_state_dict(model)
+model_state_dict = get_state_dict(model)
for k,v in model_torch.named_parameters():
print(f"initting {k} from torch")
model_state_dict[k].assign(Tensor(v.detach().numpy())).realize()
-optimizer = optim.SGD(optim.get_parameters(model), lr=0.01)
+optimizer = optim.SGD(get_parameters(model), lr=0.01)
optimizer_torch = torch.optim.SGD(model_torch.parameters(), lr=0.01)
Xt = torch.Tensor(X.numpy())
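
The torch-to-tinygrad weight copy exercised by this test is a useful pattern in its own right; a minimal sketch with the relocated `get_state_dict` (hedged: `tiny_model` and `torch_model` are hypothetical arguments that must expose matching parameter names and shapes):

import torch
from tinygrad.tensor import Tensor
from tinygrad.state import get_state_dict

def copy_torch_weights(tiny_model, torch_model: torch.nn.Module):
  sd = get_state_dict(tiny_model)  # {name: Tensor} for the tinygrad model
  for name, param in torch_model.named_parameters():
    # assumes identical naming on both sides; a mismatch raises KeyError
    sd[name].assign(Tensor(param.detach().numpy())).realize()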


@@ -1,6 +1,7 @@
#!/usr/bin/env python
import unittest
import numpy as np
+from tinygrad.state import get_parameters
from tinygrad.tensor import Tensor, Device
from tinygrad.nn import optim, BatchNorm2d
from extra.training import train, evaluate
@@ -16,7 +17,7 @@ class TinyBobNet:
self.l2 = Tensor.scaled_uniform(128, 10)
def parameters(self):
-return optim.get_parameters(self)
+return get_parameters(self)
def forward(self, x):
return x.dot(self.l1).relu().dot(self.l2).log_softmax()
@@ -38,7 +39,7 @@ class TinyConvNet:
self.bn1, self.bn2 = lambda x: x, lambda x: x
def parameters(self):
-return optim.get_parameters(self)
+return get_parameters(self)
def forward(self, x:Tensor):
x = x.reshape(shape=(-1, 1, 28, 28)) # hacks


@@ -1,6 +1,7 @@
import unittest
import time
import numpy as np
+from tinygrad.state import get_parameters
from tinygrad.nn import optim
from tinygrad.tensor import Device
from tinygrad.helpers import getenv
@@ -14,7 +15,7 @@ from models.resnet import ResNet18
BS = getenv("BS", 2)
def train_one_step(model,X,Y):
-params = optim.get_parameters(model)
+params = get_parameters(model)
pcount = 0
for p in params:
pcount += np.prod(p.shape)


@@ -67,7 +67,3 @@ class LAMB(Optimizer):
r = 1.0
t.assign(t.detach() - self.lr * r * up)
self.realize([self.t] + self.m + self.v)
-# TODO: remove this
-from tinygrad.state import get_state_dict, get_parameters # pylint: disable=unused-import # noqa: F401