Refactor nn.optim (#1091)
* Refactor: nn.optim.py
* Refactor: nn.optim.py; Fix all tests
* Refactor: Replace all optim.get_parameters()
* Refactor: Revert list comp.
* Refactor: Replace optim.get_state_dict
* Refactor: Change quickstart.md
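In user code the change is purely an import move: `get_parameters` and `get_state_dict` now live in `tinygrad.state` instead of being re-exported from `tinygrad.nn.optim`. A minimal sketch of the migration, using an illustrative stand-in model rather than code from this PR:

```python
# Before this commit the helpers were reachable via tinygrad.nn.optim:
#   from tinygrad.nn.optim import get_parameters, get_state_dict
# After this commit, import them from tinygrad.state instead:
from tinygrad.state import get_parameters, get_state_dict
from tinygrad.nn import optim, Linear

model = Linear(4, 2)                                    # illustrative stand-in model
optimizer = optim.SGD(get_parameters(model), lr=0.01)   # list of every parameter Tensor
state = get_state_dict(model)                           # mapping of attribute names to Tensors
```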
@@ -165,7 +165,7 @@ opt = SGD([net.l1.weight, net.l2.weight], lr=3e-4)
 
 We can see that we are passing in the parameters of our neural network to the optimizer.
 This is due to the fact that the optimizer needs to know which parameters to update.
-There is a simpler way to do this just by using `get_parameters(net)` from `tinygrad.nn.optim` which will return a list of all the parameters in the neural network.
+There is a simpler way to do this just by using `get_parameters(net)` from `tinygrad.state` which will return a list of all the parameters in the neural network.
 The parameters are just listed out explicitly here for clarity.
 
 Now that we have our network, loss function, and optimizer defined all we are missing is the data to train on!
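To make the quickstart change concrete, here is a minimal sketch of the pattern it describes after this PR; the layer shapes and `bias=False` are illustrative, not necessarily the quickstart's exact model:

```python
from tinygrad.nn import Linear
from tinygrad.nn.optim import SGD
from tinygrad.state import get_parameters  # new location of the helper

class TinyNet:
  def __init__(self):
    self.l1 = Linear(784, 128, bias=False)  # illustrative sizes
    self.l2 = Linear(128, 10, bias=False)
  def __call__(self, x):
    return self.l2(self.l1(x).relu()).log_softmax()

net = TinyNet()
opt = SGD([net.l1.weight, net.l2.weight], lr=3e-4)  # parameters listed explicitly for clarity
opt = SGD(get_parameters(net), lr=3e-4)             # the same set, collected by the helper
```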
@@ -3,6 +3,7 @@ import gc
 import time
 from tqdm import trange
 from models.efficientnet import EfficientNet
+from tinygrad.state import get_parameters
 from tinygrad.nn import optim
 from tinygrad.tensor import Tensor
 from tinygrad.ops import GlobalCounters
@@ -22,7 +23,7 @@ CLCACHE = getenv("CLCACHE", 0)
 if __name__ == "__main__":
   print(f"NUM:{NUM} BS:{BS} CNT:{CNT}")
   model = EfficientNet(NUM, classes=1000, has_se=False, track_running_stats=False)
-  parameters = optim.get_parameters(model)
+  parameters = get_parameters(model)
   for p in parameters: p.realize()
   if ADAM: optimizer = optim.Adam(parameters, lr=0.001)
   else: optimizer = optim.SGD(parameters, lr=0.001)

@@ -1,6 +1,7 @@
 from typing import Optional, Tuple
 from numpy.typing import NDArray
 
+from tinygrad.state import get_parameters
 from tinygrad.tensor import Tensor
 from tinygrad.nn import optim
 from tinygrad.helpers import getenv
@@ -152,10 +153,10 @@ class DeepDeterministicPolicyGradient:
     self.target_actor = Actor(self.num_actions, self.num_states, hidden_size)
     self.target_critic = Critic(self.num_actions + self.num_states, hidden_size)
 
-    actor_params = optim.get_parameters(self.actor)
-    critic_params = optim.get_parameters(self.critic)
-    target_actor_params = optim.get_parameters(self.target_actor)
-    target_critic_params = optim.get_parameters(self.target_critic)
+    actor_params = get_parameters(self.actor)
+    critic_params = get_parameters(self.critic)
+    target_actor_params = get_parameters(self.target_actor)
+    target_critic_params = get_parameters(self.target_critic)
 
     if DEVICE == "GPU":
       [x.gpu_() for x in actor_params + critic_params + target_actor_params + target_critic_params]
@@ -171,12 +172,12 @@ class DeepDeterministicPolicyGradient:
     tau = self.tau
 
     for param, target_param in zip(
-      optim.get_parameters(self.actor), optim.get_parameters(self.target_actor)
+      get_parameters(self.actor), get_parameters(self.target_actor)
     ):
       target_param.assign(param.detach() * tau + target_param * (1.0 - tau))
 
     for param, target_param in zip(
-      optim.get_parameters(self.critic), optim.get_parameters(self.target_critic)
+      get_parameters(self.critic), get_parameters(self.target_critic)
     ):
       target_param.assign(param.detach() * tau + target_param * (1.0 - tau))

@@ -7,6 +7,7 @@ import time
 import numpy as np
 from datasets import fetch_cifar
 from tinygrad import nn
+from tinygrad.state import get_parameters, get_state_dict
 from tinygrad.nn import optim
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import getenv
@@ -89,7 +90,7 @@ def train_cifar():
   if getenv("TORCHWEIGHTS"):
     from examples.hlb_cifar10_torch import SpeedyResNet as SpeedyResNetTorch
     torch_model = SpeedyResNetTorch()
-    model_state_dict = optim.get_state_dict(model)
+    model_state_dict = get_state_dict(model)
     torch_state_dict = torch_model.state_dict()
     for k,v in torch_state_dict.items():
       old_mean_std = model_state_dict[k].mean().numpy(), model_state_dict[k].std().numpy()
@@ -99,9 +100,9 @@ def train_cifar():
     exit(0)
 
   if getenv("ADAM"):
-    optimizer = optim.Adam(optim.get_parameters(model), lr=Tensor([0.001]).realize())
+    optimizer = optim.Adam(get_parameters(model), lr=Tensor([0.001]).realize())
   else:
-    optimizer = optim.SGD(optim.get_parameters(model), lr=0.01, momentum=0.85, nesterov=True)
+    optimizer = optim.SGD(get_parameters(model), lr=0.01, momentum=0.85, nesterov=True)
 
   # 97 steps in 2 seconds = 20ms / step
   # step is 1163.42 GOPS = 56 TFLOPS!!!, 41% of max 136

@@ -3,6 +3,7 @@ import numpy as np
 from tqdm import trange
 import torch
 from torchvision.utils import make_grid, save_image
+from tinygrad.state import get_parameters
 from tinygrad.tensor import Tensor
 from tinygrad.helpers import getenv
 from tinygrad.nn import optim
@@ -84,8 +85,8 @@ if __name__ == "__main__":
   output_dir = Path(".").resolve() / "outputs"
   output_dir.mkdir(exist_ok=True)
   # optimizers
-  optim_g = optim.Adam(optim.get_parameters(generator),lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
-  optim_d = optim.Adam(optim.get_parameters(discriminator),lr=0.0002, b1=0.5)
+  optim_g = optim.Adam(get_parameters(generator),lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
+  optim_d = optim.Adam(get_parameters(discriminator),lr=0.0002, b1=0.5)
   # training loop
   for epoch in (t := trange(epochs)):
     loss_g, loss_d = 0.0, 0.0

@@ -2,6 +2,7 @@
 #inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
 import sys
 import numpy as np
+from tinygrad.state import get_parameters
 from tinygrad.tensor import Tensor
 from tinygrad.nn import BatchNorm2d, optim
 from tinygrad.helpers import getenv
@@ -57,7 +58,7 @@ class BigConvNet:
 
   def parameters(self):
     if DEBUG: #keeping this for a moment
-      pars = [par for par in optim.get_parameters(self) if par.requires_grad]
+      pars = [par for par in get_parameters(self) if par.requires_grad]
       no_pars = 0
       for par in pars:
         print(par.shape)
@@ -65,17 +66,17 @@ class BigConvNet:
       print('no of parameters', no_pars)
       return pars
     else:
-      return optim.get_parameters(self)
+      return get_parameters(self)
 
   def save(self, filename):
     with open(filename+'.npy', 'wb') as f:
-      for par in optim.get_parameters(self):
+      for par in get_parameters(self):
        #if par.requires_grad:
         np.save(f, par.cpu().numpy())
 
   def load(self, filename):
     with open(filename+'.npy', 'rb') as f:
-      for par in optim.get_parameters(self):
+      for par in get_parameters(self):
        #if par.requires_grad:
         try:
           par.cpu().numpy()[:] = np.load(f)
@@ -122,7 +123,7 @@ if __name__ == "__main__":
     print('could not load weights "'+sys.argv[1]+'".')
 
   if GPU:
-    params = optim.get_parameters(model)
+    params = get_parameters(model)
     [x.gpu_() for x in params]
 
   for lr, epochs in zip(lrs, epochss):

@@ -3,6 +3,7 @@ import time
 from multiprocessing import Process, Queue
 import numpy as np
 from tqdm import trange
+from tinygrad.state import get_parameters
 from tinygrad.nn import optim
 from tinygrad.helpers import getenv
 from tinygrad.tensor import Tensor
@@ -37,7 +38,7 @@ if __name__ == "__main__":
   else:
     model = EfficientNet(getenv("NUM", 0), classes, has_se=False)
 
-  parameters = optim.get_parameters(model)
+  parameters = get_parameters(model)
   print("parameter count", len(parameters))
   optimizer = optim.Adam(parameters, lr=0.001)
 

@@ -2,6 +2,7 @@
 import numpy as np
 from PIL import Image
 
+from tinygrad.state import get_parameters
 from tinygrad.nn import optim
 from tinygrad.helpers import getenv
 from extra.training import train, evaluate
@@ -37,7 +38,7 @@ if __name__ == "__main__":
     lambda x: np.tile(np.expand_dims(x, 1), (1, 3, 1, 1)).astype(np.float32),
   ])
   for _ in range(5):
-    optimizer = optim.SGD(optim.get_parameters(model), lr=lr, momentum=0.9)
+    optimizer = optim.SGD(get_parameters(model), lr=lr, momentum=0.9)
     train(model, X_train, Y_train, optimizer, 100, BS=32, transform=transform)
     evaluate(model, X_test, Y_test, num_classes=classes, transform=transform)
     lr /= 1.2

test/external/external_test_opt.py
@@ -34,7 +34,7 @@ from models.convnext import ConvNeXt
 from models.efficientnet import EfficientNet
 from models.resnet import ResNet18
 from models.vit import ViT
-from tinygrad.nn.optim import get_parameters
+from tinygrad.state import get_parameters
 
 @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
 class TestInferenceMinKernels(unittest.TestCase):
@@ -182,7 +182,7 @@ class TestOpt(unittest.TestCase):
     Tensor.training = True
     img = Tensor.ones(2,3,4,4)
     c1 = nn.Conv2d(3,32,3)
-    opt = optim.SGD(optim.get_parameters(c1))
+    opt = optim.SGD(get_parameters(c1))
     with CLCache():
       opt.zero_grad()
       c1(img).relu().sum().backward()
@@ -199,7 +199,7 @@ class TestOpt(unittest.TestCase):
     img = Tensor.ones(2,3,64,64)
     c1 = nn.Conv2d(3,16,3,bias=False)
     c2 = nn.Conv2d(16,32,3,bias=False)
-    opt = optim.SGD(optim.get_parameters([c1, c2]))
+    opt = optim.SGD(get_parameters([c1, c2]))
     with CLCache(allowed=9):
       opt.zero_grad()
       c2(c1(img).relu()).relu().sum().backward()
@@ -214,7 +214,7 @@ class TestOpt(unittest.TestCase):
     c2 = nn.Conv2d(4,8,3,bias=False)
     c3 = nn.Conv2d(8,16,3,bias=False)
     c4 = nn.Conv2d(16,32,3,bias=False)
-    opt = optim.SGD(optim.get_parameters([c1, c2, c3, c4]))
+    opt = optim.SGD(get_parameters([c1, c2, c3, c4]))
     with CLCache(allowed=19):
       opt.zero_grad()
       c4(c3(c2(c1(img).relu()).relu()).relu()).relu().sum().backward()
@@ -227,7 +227,7 @@ class TestOpt(unittest.TestCase):
     img = Tensor.ones(1,3,4,4)
     c1 = nn.Conv2d(3,32,3)
     bn = nn.BatchNorm2d(32, track_running_stats=False)
-    opt = optim.SGD(optim.get_parameters([c1, bn]))
+    opt = optim.SGD(get_parameters([c1, bn]))
     with CLCache(allowed=18): # this is too high
       img_bn = bn(c1(img)).elu().sum()
       opt.zero_grad()

test/external/graph_batchnorm.py
@@ -1,11 +1,12 @@
 import unittest
+from tinygrad.state import get_parameters
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Conv2d, BatchNorm2d, optim
 
 def model_step(lm):
   Tensor.training = True
   x = Tensor.ones(8,12,128,256, requires_grad=False)
-  optimizer = optim.SGD(optim.get_parameters(lm), lr=0.001)
+  optimizer = optim.SGD(get_parameters(lm), lr=0.001)
   loss = lm.forward(x).sum()
   optimizer.zero_grad()
   loss.backward()

@@ -2,7 +2,8 @@ import numpy as np
 import torch
 import unittest
 from tinygrad.tensor import Tensor
-from tinygrad.nn.optim import Adam, get_parameters
+from tinygrad.state import get_parameters
+from tinygrad.nn.optim import Adam
 from extra.lr_scheduler import MultiStepLR, ReduceLROnPlateau, CosineAnnealingLR
 from extra.training import train, evaluate
 from datasets import fetch_mnist

@@ -2,6 +2,7 @@ import torch
 from torch import nn
 import unittest
 import numpy as np
+from tinygrad.state import get_parameters, get_state_dict
 from tinygrad.nn import optim, Linear, Conv2d, BatchNorm2d
 from tinygrad.tensor import Tensor
 from datasets import fetch_mnist
@@ -9,12 +10,12 @@ from datasets import fetch_mnist
 def compare_tiny_torch(model, model_torch, X, Y):
   Tensor.training = True
   model_torch.train()
-  model_state_dict = optim.get_state_dict(model)
+  model_state_dict = get_state_dict(model)
   for k,v in model_torch.named_parameters():
     print(f"initting {k} from torch")
     model_state_dict[k].assign(Tensor(v.detach().numpy())).realize()
 
-  optimizer = optim.SGD(optim.get_parameters(model), lr=0.01)
+  optimizer = optim.SGD(get_parameters(model), lr=0.01)
   optimizer_torch = torch.optim.SGD(model_torch.parameters(), lr=0.01)
 
   Xt = torch.Tensor(X.numpy())

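For intuition about why a single helper can be indexed with the same keys as `named_parameters` above, here is a rough sketch of the idea behind `get_state_dict`/`get_parameters`: recursively walk an object's attributes and collect every `Tensor` under a dotted name. This is a simplified illustration, not the actual `tinygrad.state` implementation:

```python
from tinygrad.tensor import Tensor

def sketch_state_dict(obj, prefix=""):
  # Tensors are leaves: record them under the dotted path that reached them.
  if isinstance(obj, Tensor): return {prefix.rstrip("."): obj}
  out = {}
  if isinstance(obj, (list, tuple)):
    for i, v in enumerate(obj): out.update(sketch_state_dict(v, f"{prefix}{i}."))
  elif hasattr(obj, "__dict__"):
    for k, v in obj.__dict__.items(): out.update(sketch_state_dict(v, f"{prefix}{k}."))
  return out

def sketch_parameters(obj):
  # get_parameters is essentially "the values of the state dict".
  return list(sketch_state_dict(obj).values())
```

Under this scheme a model with `self.l1 = Linear(...)` would yield keys like `l1.weight` and `l1.bias`, which is what lets `model_state_dict[k]` line up with the torch parameter names when the attribute names match.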
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import unittest
 import numpy as np
+from tinygrad.state import get_parameters
 from tinygrad.tensor import Tensor, Device
 from tinygrad.nn import optim, BatchNorm2d
 from extra.training import train, evaluate
@@ -16,7 +17,7 @@ class TinyBobNet:
     self.l2 = Tensor.scaled_uniform(128, 10)
 
   def parameters(self):
-    return optim.get_parameters(self)
+    return get_parameters(self)
 
   def forward(self, x):
     return x.dot(self.l1).relu().dot(self.l2).log_softmax()
@@ -38,7 +39,7 @@ class TinyConvNet:
     self.bn1, self.bn2 = lambda x: x, lambda x: x
 
   def parameters(self):
-    return optim.get_parameters(self)
+    return get_parameters(self)
 
   def forward(self, x:Tensor):
     x = x.reshape(shape=(-1, 1, 28, 28)) # hacks

@@ -1,6 +1,7 @@
 import unittest
 import time
 import numpy as np
+from tinygrad.state import get_parameters
 from tinygrad.nn import optim
 from tinygrad.tensor import Device
 from tinygrad.helpers import getenv
@@ -14,7 +15,7 @@ from models.resnet import ResNet18
 BS = getenv("BS", 2)
 
 def train_one_step(model,X,Y):
-  params = optim.get_parameters(model)
+  params = get_parameters(model)
   pcount = 0
   for p in params:
     pcount += np.prod(p.shape)

@@ -67,7 +67,3 @@ class LAMB(Optimizer):
         r = 1.0
       t.assign(t.detach() - self.lr * r * up)
     self.realize([self.t] + self.m + self.v)
-
-# TODO: remove this
-from tinygrad.state import get_state_dict, get_parameters # pylint: disable=unused-import # noqa: F401