Over 90% on CIFAR with examples/hlb_cifar10.py (#1073)

* fix eval, lr decay, best eval

* 82.27

* 82.64

* 82.79, reproducible

* add lr sched, 85.26

* 87.42

* 87.94

* 87.42

* tta with flip

* training flip aug

* refactor

* using Tensor for LR is faster

* 89.5

* refactor, flip only train set

* 90.01

* 90.64

* eval jit

* refactor

* only JIT model

* fix eval JIT

* fix eval JIT

* 90.82

* STEPS=900 reaches 90.22

* TTA envvar

* TTA default 0

* fully jit training

* refactor optim

* fix sched

* add label smoothing

* param changes

* partial gelu

* OneCycle with pause

* gelu maybe works

* 90.12

* remove pause lr

* maybe fix lr schedulers

* scheduler test passing

* comments

* try mixup

* shuffle!

* add back the missing last eval

* fix shuffle bugs

* add mixup prob

* fix mixup prob

* 90.19

* correct mixup

* correct mixup

* correct mixup

* 90.24

* 90.33

* refactor, add type hints

* add gradient clipping

* maybe fix test

* full JIT

* back to relu for now

* pass mixup prob as param

* add typehints

* maybe CI works

* try erf gelu

* CI, types

* remove useless import

* refactor optim

* refactor optim

* try leakyrelu

* try celu

* gelu

* 90.67

* remove grad clip

* remove grad clip tests

* revert params

* add test for OneCycleLR

* 90.62

* fix eval timing

* fix eval timing again

* so where I calculate mixup_prob matters (see the sketch below)
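
As a companion to the log above, here is a minimal NumPy sketch of three of the data-level tricks it mentions: label smoothing, mixup applied with a probability, and test-time augmentation (TTA) with a horizontal flip. The function names, the Beta(alpha, alpha) mixing distribution, and every hyperparameter below are illustrative assumptions, not the code in examples/hlb_cifar10.py.

```python
import numpy as np

def smooth_labels(y: np.ndarray, num_classes: int, smoothing: float = 0.2) -> np.ndarray:
  # label smoothing: move `smoothing` probability mass from the true class
  # onto a uniform distribution over all classes
  off = smoothing / num_classes
  targets = np.full((y.shape[0], num_classes), off, dtype=np.float32)
  targets[np.arange(y.shape[0]), y] = 1.0 - smoothing + off
  return targets

def mixup(x: np.ndarray, y: np.ndarray, alpha: float = 0.2, prob: float = 0.5,
          rng: np.random.Generator = np.random.default_rng(0)):
  # with probability (1 - prob) leave the batch untouched; otherwise blend each
  # sample (and its soft target) with a randomly permuted partner, using one
  # lambda ~ Beta(alpha, alpha) per batch
  if rng.random() > prob:
    return x, y
  lam = rng.beta(alpha, alpha)
  perm = rng.permutation(x.shape[0])
  return lam * x + (1.0 - lam) * x[perm], lam * y + (1.0 - lam) * y[perm]

def tta_flip(predict, x: np.ndarray) -> np.ndarray:
  # test-time augmentation: average predictions over the image and its
  # horizontal mirror (flip of the last, width, axis of NCHW input)
  return 0.5 * (predict(x) + predict(x[..., ::-1]))

# usage on a dummy batch and a dummy model
x = np.random.rand(8, 3, 32, 32).astype(np.float32)
y = smooth_labels(np.random.randint(0, 10, size=8), num_classes=10)
x_mix, y_mix = mixup(x, y)
probs = tta_flip(lambda xb: np.full((xb.shape[0], 10), 0.1, dtype=np.float32), x)
```

In this sketch, the "where I calculate mixup_prob matters" remark would correspond to where the `rng.random() > prob` gate is evaluated (once per batch versus once per epoch), which changes how often mixup actually fires.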

---------

Co-authored-by: Kunwar Raj Singh <kunwar31@pop-os.localdomain>
Author: Kunwar Raj Singh
Date: 2023-07-07 09:16:22 +05:30
Committed by: GitHub
Commit: 8391648822 (parent c5aea13a65)
5 changed files with 139 additions and 83 deletions

@@ -4,13 +4,14 @@ from tinygrad.helpers import dedup
 from tinygrad.tensor import Tensor
 
 class Optimizer:
-  def __init__(self, params: List[Tensor]):
+  def __init__(self, params: List[Tensor], lr: float):
     # if it's None, but being put into an optimizer, set it to True
     for x in params:
       if x.requires_grad is None: x.requires_grad = True
 
     self.params: List[Tensor] = dedup([x for x in params if x.requires_grad])
     self.buffers: List[Tensor] = dedup([x for x in params if not x.requires_grad])   # buffers are still realized
+    self.lr = Tensor([lr], requires_grad=False)
 
   def zero_grad(self):
     for param in self.params: param.grad = None
@@ -23,8 +24,8 @@ class Optimizer:
 
 class SGD(Optimizer):
   def __init__(self, params: List[Tensor], lr=0.001, momentum=0, weight_decay=0.0, nesterov=False):
-    super().__init__(params)
-    self.lr, self.momentum, self.wd, self.nesterov = lr, momentum, weight_decay, nesterov
+    super().__init__(params, lr)
+    self.momentum, self.wd, self.nesterov = momentum, weight_decay, nesterov
     self.b = [Tensor.zeros(*t.shape, device=t.device, requires_grad=False) for t in self.params] if self.momentum else []
 
   # https://pytorch.org/docs/stable/generated/torch.optim.SGD.html
@@ -44,8 +45,8 @@ def Adam(params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8): return LAM
 
 class LAMB(Optimizer):
   def __init__(self, params: List[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-6, wd=0.0, adam=False):
-    super().__init__(params)
-    self.lr, self.b1, self.b2, self.eps, self.wd, self.adam, self.t = lr, b1, b2, eps, wd, adam, Tensor([0], requires_grad=False).realize()
+    super().__init__(params, lr)
+    self.b1, self.b2, self.eps, self.wd, self.adam, self.t = b1, b2, eps, wd, adam, Tensor([0], requires_grad=False).realize()
     self.m = [Tensor.zeros(*t.shape, device=t.device, requires_grad=False) for t in self.params]
     self.v = [Tensor.zeros(*t.shape, device=t.device, requires_grad=False) for t in self.params]
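
The diff above moves the learning rate into the base Optimizer as a one-element Tensor (`self.lr = Tensor([lr], requires_grad=False)`), which lines up with the "using Tensor for LR is faster" and "fully jit training" commits: a scheduler can overwrite that buffer in place, so a JIT-captured training step keeps reading the current value rather than baking a Python float into compiled kernels. Below is a minimal sketch of that pattern, assuming the mid-2023 `tinygrad.nn.optim` module path and a made-up OneCycle-like warmup/cosine shape; it is not the exact schedule or training step from the PR.

```python
import math
from tinygrad.tensor import Tensor
from tinygrad.nn.optim import SGD  # module path assumed for mid-2023 tinygrad

def one_cycle(step: int, total: int, lr_max: float = 0.01, pct_start: float = 0.3) -> float:
  # simple warmup-then-cosine-anneal shape; illustrative, not the PR's exact schedule
  warmup = max(1, int(total * pct_start))
  if step < warmup:
    return lr_max * step / warmup
  t = (step - warmup) / max(1, total - warmup)
  return lr_max * 0.5 * (1.0 + math.cos(math.pi * t))

# toy parameter and optimizer; after this PR, opt.lr is a 1-element Tensor
w = Tensor.randn(16, 16, requires_grad=True)
opt = SGD([w], lr=0.01, momentum=0.85)

TOTAL_STEPS = 100
for step in range(TOTAL_STEPS):
  # overwrite the lr buffer in place each step; anything that captured opt.lr
  # (e.g. a JIT-ed train step) sees the new value without recompilation
  opt.lr.assign(Tensor([one_cycle(step, TOTAL_STEPS)], requires_grad=False)).realize()
  opt.zero_grad()
  (w * w).sum().backward()
  opt.step()
```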