Mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-02-18 10:31:41 -05:00)
Over 90% on CIFAR with examples/hlb_cifar10.py (#1073)
* fix eval, lr decay, best eval
* 82.27
* 82.64
* 82.79, reproducible
* add lr sched, 85.26
* 87.42
* 87.94
* 87.42
* tta with flip
* training flip aug
* refactor
* using Tensor for LR is faster
* 89.5
* refactor, flip only train set
* 90.01
* 90.64
* eval jit
* refactor
* only JIT model
* fix eval JIT
* fix eval JIT
* 90.82
* STEPS=900 reaches 90.22
* TTA envvar
* TTA default 0
* fully jit training
* refactor optim
* fix sched
* add label smoothing
* param changes
* partial gelu
* OneCycle with pause
* gelu maybe works
* 90.12
* remove pause lr
* maybe fix lr schedulers
* scheduler test passing
* comments
* try mixup
* shuffle!
* add back the missing last eval
* fix shuffle bugs
* add mixup prob
* fix mixup prob
* 90.19
* correct mixup
* correct mixup
* correct mixup
* 90.24
* 90.33
* refactor, add type hints
* add gradient clipping
* maybe fix test
* full JIT
* back to relu for now
* pass mixup prob as param
* add type hints
* maybe CI works
* try erf gelu
* CI, types
* remove useless import
* refactor optim
* refactor optim
* try leakyrelu
* try celu
* gelu
* 90.67
* remove grad clip
* remove grad clip tests
* revert params
* add test for OneCycleLR
* 90.62
* fix eval timing
* fix eval timing again
* so where I calculate mixup_prob matters

---------

Co-authored-by: Kunwar Raj Singh <kunwar31@pop-os.localdomain>
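The commit message above lists the training-recipe changes (flip-based test-time augmentation, mixup, label smoothing, a one-cycle learning-rate schedule) alongside the optimizer refactor shown in the diff below. As a rough illustration of two of those augmentations, here is a hedged NumPy sketch of mixup and flip TTA; the function names, the alpha value, the NCHW batch layout, and the model callable are illustrative assumptions, not code taken from examples/hlb_cifar10.py.

# Illustrative NumPy sketch of mixup and horizontal-flip TTA (not the actual
# hlb_cifar10.py code); alpha, the NCHW layout, and `model` are assumptions.
import numpy as np

def mixup(x: np.ndarray, y_onehot: np.ndarray, alpha: float = 0.2):
  # one mixing coefficient for the batch, plus a random pairing of examples
  lam = np.random.beta(alpha, alpha)
  perm = np.random.permutation(x.shape[0])
  x_mix = lam * x + (1 - lam) * x[perm]
  y_mix = lam * y_onehot + (1 - lam) * y_onehot[perm]
  return x_mix, y_mix

def tta_flip(model, x: np.ndarray) -> np.ndarray:
  # average the model's predictions over the image and its horizontal mirror
  return 0.5 * (model(x) + model(x[..., ::-1]))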
@@ -4,13 +4,14 @@ from tinygrad.helpers import dedup
 from tinygrad.tensor import Tensor

 class Optimizer:
-  def __init__(self, params: List[Tensor]):
+  def __init__(self, params: List[Tensor], lr: float):
     # if it's None, but being put into an optimizer, set it to True
     for x in params:
       if x.requires_grad is None: x.requires_grad = True

     self.params: List[Tensor] = dedup([x for x in params if x.requires_grad])
     self.buffers: List[Tensor] = dedup([x for x in params if not x.requires_grad])  # buffers are still realized
+    self.lr = Tensor([lr], requires_grad=False)

   def zero_grad(self):
     for param in self.params: param.grad = None
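This hunk moves the learning rate into the Optimizer base class and stores it as a one-element Tensor rather than a Python float (the commit message's "using Tensor for LR is faster"), so a schedule can overwrite the same buffer between compiled steps instead of rebuilding the graph. A minimal sketch of that usage follows; the tinygrad.nn.optim import path and the Tensor.assign call are assumed to match the repo at this commit.

# Sketch only: decay the Tensor-valued learning rate in place.
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import SGD   # assumed import path at this point in the repo

w = Tensor.uniform(10, 10)          # requires_grad is None, so the Optimizer flips it to True
opt = SGD([w], lr=0.1, momentum=0.9)

# opt.lr is Tensor([0.1]); halving it in place means any compiled training
# step that reads opt.lr picks up the new value without retracing
opt.lr.assign(opt.lr * 0.5).realize()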
@@ -23,8 +24,8 @@ class Optimizer:

 class SGD(Optimizer):
   def __init__(self, params: List[Tensor], lr=0.001, momentum=0, weight_decay=0.0, nesterov=False):
-    super().__init__(params)
-    self.lr, self.momentum, self.wd, self.nesterov = lr, momentum, weight_decay, nesterov
+    super().__init__(params, lr)
+    self.momentum, self.wd, self.nesterov = momentum, weight_decay, nesterov
     self.b = [Tensor.zeros(*t.shape, device=t.device, requires_grad=False) for t in self.params] if self.momentum else []

   # https://pytorch.org/docs/stable/generated/torch.optim.SGD.html
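SGD now passes lr up to the base class and keeps only its own hyperparameters; the linked PyTorch page describes the update rule its step follows. For reference, a hedged NumPy transcription of that rule (classic L2 weight decay folded into the gradient, optional momentum buffer, optional Nesterov lookahead); the real step() operates on tinygrad Tensors and is not part of this hunk.

# NumPy sketch of the SGD-with-momentum update (per the PyTorch docs link above);
# buf plays the role of the self.b velocity buffers allocated in the diff.
import numpy as np

def sgd_step(w, grad, buf, lr, momentum=0.0, wd=0.0, nesterov=False):
  g = grad + wd * w                       # L2 weight decay added to the gradient
  if momentum:
    buf[:] = momentum * buf + g           # update the velocity buffer in place
    g = g + momentum * buf if nesterov else buf
  w -= lr * g                             # gradient descent step
  return w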
@@ -44,8 +45,8 @@ def Adam(params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8): return LAM

 class LAMB(Optimizer):
   def __init__(self, params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-6, wd=0.0, adam=False):
-    super().__init__(params)
-    self.lr, self.b1, self.b2, self.eps, self.wd, self.adam, self.t = lr, b1, b2, eps, wd, adam, Tensor([0], requires_grad=False).realize()
+    super().__init__(params, lr)
+    self.b1, self.b2, self.eps, self.wd, self.adam, self.t = b1, b2, eps, wd, adam, Tensor([0], requires_grad=False).realize()
     self.m = [Tensor.zeros(*t.shape, device=t.device, requires_grad=False) for t in self.params]
     self.v = [Tensor.zeros(*t.shape, device=t.device, requires_grad=False) for t in self.params]

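LAMB (and Adam, which the truncated hunk header defines in terms of LAMB) gets the same treatment: lr moves to the base class while the per-parameter m and v moment buffers stay here. With lr held in a Tensor, the one-cycle schedule the commit message mentions can simply rewrite it every step. A rough sketch of that pattern follows; the warmup fraction, peak lr, step count, and the adam=True shortcut are assumptions for illustration, and the OneCycleLR the tests refer to lives elsewhere in the repo with different parameters.

# Illustrative only: a simple warmup + cosine one-cycle schedule driving opt.lr.
import math
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import LAMB   # assumed import path

opt = LAMB([Tensor.uniform(8, 8)], lr=0.0, adam=True)   # Adam in the repo is LAMB with adam=True

def one_cycle(step, total, max_lr=0.01, pct_start=0.3):
  warmup = max(1, int(total * pct_start))
  if step < warmup: return max_lr * step / warmup        # linear warmup
  t = (step - warmup) / max(1, total - warmup)
  return max_lr * 0.5 * (1 + math.cos(math.pi * t))      # cosine decay

for step in range(100):
  opt.lr.assign(Tensor([one_cycle(step, 100)])).realize()
  # forward pass, loss.backward(), and opt.step() would go here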