Over 90% on CIFAR with examples/hlb_cifar10.py (#1073)

* fix eval, lr decay, best eval

* 82.27

* 82.64

* 82.79, reproducible

* add lr sched, 85.26

* 87.42

* 87.94

* 87.42

* tta with flip

* training flip aug

* refactor

* using Tensor for LR is faster

* 89.5

* refactor, flip only train set

* 90.01

* 90.64

* eval jit

* refactor

* only JIT model

* fix eval JIT

* fix eval JIT

* 90.82

* STEPS=900 reaches 90.22

* TTA envvar

* TTA default 0

* fully jit training

* refactor optim

* fix sched

* add label smoothing

* param changes

* partial gelu

* OneCycle with pause

* gelu maybe works

* 90.12

* remove pause lr

* maybe fix lr schedulers

* scheduler test passing

* comments

* try mixup

* shuffle!

* add back the missing last eval

* fix shuffle bugs

* add mixup prob

* fix mixup prob

* 90.19

* correct mixup

* correct mixup

* correct mixup

* 90.24

* 90.33

* refactor, add type hints

* add gradient clipping

* maybe fix test

* full JIT

* back to relu for now

* pass mixup prob as param

* add typehints

* maybe CI works

* try erf gelu

* CI, types

* remove useless import

* refactor optim

* refactor optim

* try leakyrelu

* try celu

* gelu

* 90.67

* remove grad clip

* remove grad clip tests

* revert params

* add test for OneCycleLR

* 90.62

* fix eval timing

* fix eval timing again

* so where I calculate mixup_prob matters (see the sketch below)
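
As a companion to the log above, here is a minimal NumPy sketch of three of the data-level tricks it mentions: label smoothing, mixup applied with a probability, and test-time augmentation (TTA) with a horizontal flip. The function names, the Beta(alpha, alpha) mixing distribution, and every hyperparameter below are illustrative assumptions, not the code in examples/hlb_cifar10.py.

```python
import numpy as np

def smooth_labels(y: np.ndarray, num_classes: int, smoothing: float = 0.2) -> np.ndarray:
  # label smoothing: move `smoothing` probability mass from the true class
  # onto a uniform distribution over all classes
  off = smoothing / num_classes
  targets = np.full((y.shape[0], num_classes), off, dtype=np.float32)
  targets[np.arange(y.shape[0]), y] = 1.0 - smoothing + off
  return targets

def mixup(x: np.ndarray, y: np.ndarray, alpha: float = 0.2, prob: float = 0.5,
          rng: np.random.Generator = np.random.default_rng(0)):
  # with probability (1 - prob) leave the batch untouched; otherwise blend each
  # sample (and its soft target) with a randomly permuted partner, using one
  # lambda ~ Beta(alpha, alpha) per batch
  if rng.random() > prob:
    return x, y
  lam = rng.beta(alpha, alpha)
  perm = rng.permutation(x.shape[0])
  return lam * x + (1.0 - lam) * x[perm], lam * y + (1.0 - lam) * y[perm]

def tta_flip(predict, x: np.ndarray) -> np.ndarray:
  # test-time augmentation: average predictions over the image and its
  # horizontal mirror (flip of the last, width, axis of NCHW input)
  return 0.5 * (predict(x) + predict(x[..., ::-1]))

# usage on a dummy batch and a dummy model
x = np.random.rand(8, 3, 32, 32).astype(np.float32)
y = smooth_labels(np.random.randint(0, 10, size=8), num_classes=10)
x_mix, y_mix = mixup(x, y)
probs = tta_flip(lambda xb: np.full((xb.shape[0], 10), 0.1, dtype=np.float32), x)
```

In this sketch, the "where I calculate mixup_prob matters" remark would correspond to where the `rng.random() > prob` gate is evaluated (once per batch versus once per epoch), which changes how often mixup actually fires.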

---------

Co-authored-by: Kunwar Raj Singh <kunwar31@pop-os.localdomain>
Author: Kunwar Raj Singh
Date: 2023-07-07 09:16:22 +05:30
Committed by: GitHub
Commit: 8391648822 (parent c5aea13a65)
5 changed files with 139 additions and 83 deletions

@@ -4,13 +4,14 @@ from tinygrad.helpers import dedup
 from tinygrad.tensor import Tensor
 
 class Optimizer:
-  def __init__(self, params: List[Tensor]):
+  def __init__(self, params: List[Tensor], lr: float):
     # if it's None, but being put into an optimizer, set it to True
     for x in params:
       if x.requires_grad is None: x.requires_grad = True
 
     self.params: List[Tensor] = dedup([x for x in params if x.requires_grad])
     self.buffers: List[Tensor] = dedup([x for x in params if not x.requires_grad])   # buffers are still realized
+    self.lr = Tensor([lr], requires_grad=False)
 
   def zero_grad(self):
     for param in self.params: param.grad = None
@@ -23,8 +24,8 @@ class Optimizer:
 
 class SGD(Optimizer):
   def __init__(self, params: List[Tensor], lr=0.001, momentum=0, weight_decay=0.0, nesterov=False):
-    super().__init__(params)
-    self.lr, self.momentum, self.wd, self.nesterov = lr, momentum, weight_decay, nesterov
+    super().__init__(params, lr)
+    self.momentum, self.wd, self.nesterov = momentum, weight_decay, nesterov
     self.b = [Tensor.zeros(*t.shape, device=t.device, requires_grad=False) for t in self.params] if self.momentum else []
 
   # https://pytorch.org/docs/stable/generated/torch.optim.SGD.html
@@ -44,8 +45,8 @@ def Adam(params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8): return LAM
 
 class LAMB(Optimizer):
   def __init__(self, params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-6, wd=0.0, adam=False):
-    super().__init__(params)
-    self.lr, self.b1, self.b2, self.eps, self.wd, self.adam, self.t = lr, b1, b2, eps, wd, adam, Tensor([0], requires_grad=False).realize()
+    super().__init__(params, lr)
+    self.b1, self.b2, self.eps, self.wd, self.adam, self.t = b1, b2, eps, wd, adam, Tensor([0], requires_grad=False).realize()
     self.m = [Tensor.zeros(*t.shape, device=t.device, requires_grad=False) for t in self.params]
     self.v = [Tensor.zeros(*t.shape, device=t.device, requires_grad=False) for t in self.params]
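
The diff above moves the learning rate into the base Optimizer as a one-element Tensor (`self.lr = Tensor([lr], requires_grad=False)`), which lines up with the "using Tensor for LR is faster" and "fully jit training" commits: a scheduler can overwrite that buffer in place, so a JIT-captured training step keeps reading the current value rather than baking a Python float into compiled kernels. Below is a minimal sketch of that pattern, assuming the mid-2023 `tinygrad.nn.optim` module path and a made-up OneCycle-like warmup/cosine shape; it is not the exact schedule or training step from the PR.

```python
import math
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import SGD  # module path assumed for mid-2023 tinygrad

def one_cycle(step: int, total: int, lr_max: float = 0.01, pct_start: float = 0.3) -> float:
  # simple warmup-then-cosine-anneal shape; illustrative, not the PR's exact schedule
  warmup = max(1, int(total * pct_start))
  if step < warmup:
    return lr_max * step / warmup
  t = (step - warmup) / max(1, total - warmup)
  return lr_max * 0.5 * (1.0 + math.cos(math.pi * t))

# toy parameter and optimizer; after this PR, opt.lr is a 1-element Tensor
w = Tensor.randn(16, 16, requires_grad=True)
opt = SGD([w], lr=0.01, momentum=0.85)

TOTAL_STEPS = 100
for step in range(TOTAL_STEPS):
  # overwrite the lr buffer in place each step; anything that captured opt.lr
  # (e.g. a JIT-ed train step) sees the new value without recompilation
  opt.lr.assign(Tensor([one_cycle(step, TOTAL_STEPS)], requires_grad=False)).realize()
  opt.zero_grad()
  (w * w).sum().backward()
  opt.step()
```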