Files
tinygrad/examples/mlperf/optim.py
2026-03-19 22:12:38 -07:00

60 lines
2.7 KiB
Python

from tinygrad.tensor import Tensor
from tinygrad.dtype import dtypes
from tinygrad.nn.optim import Optimizer
from tinygrad.helpers import FUSE_OPTIM
class GradAccClipAdamW(Optimizer):
  """Adam-style optimizer that rescales, norm-clips and then applies explicitly supplied gradients.

  Each step performs, in order:
    1. divide the gradients by `grad_acc` (presumably the caller summed them over that many micro-batches -- confirm against the caller),
    2. measure the global L2 norm of the rescaled gradients,
    3. scale the gradients by `min(1, clip_norm / (norm + 1e-6))`,
    4. update the first/second moment estimates and build a bias-corrected, lr-scaled update,
    5. add the weight-decay term (only for tensors with `ndim >= 3`) and subtract the result from the parameter.

  `self.params`, `self.buffers`, `self.lr`, `self.device`, `self.fused`, `self.pos_params` and `_new_optim_param`
  are provided by the `Optimizer` base class, which is not part of this file.
  """

  def __init__(
    self, params:list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-6, weight_decay=0.0, grad_acc=1, clip_norm=1.0,
    device=None, fused=FUSE_OPTIM,
  ):
    """Store the hyperparameters and allocate the optimizer state.

    Args:
      params: tensors to optimize (forwarded to the base class).
      lr: learning rate (forwarded to the base class).
      b1, b2: decay rates of the first and second moment estimates.
      eps: constant added to `sqrt(v_hat)` in the update denominator.
      weight_decay: decay coefficient, used only by `_apply_update`.
      grad_acc: divisor applied to every gradient before clipping.
      clip_norm: maximum global gradient norm.
      device, fused: forwarded to the base class.
    """
    super().__init__(params, lr, device, fused)
    self.grad_acc, self.clip_norm = grad_acc, clip_norm
    self.b1, self.b2, self.eps, self.wd = b1, b2, eps, weight_decay
    # running products b1**t and b2**t: start at 1 and get multiplied in place on every `_step`
    self.b1_t = Tensor.ones((1,), dtype=dtypes.float32, device=self.device, requires_grad=False)
    self.b2_t = Tensor.ones((1,), dtype=dtypes.float32, device=self.device, requires_grad=False)
    # first and second moment state, laid out however the base class decides
    self.m = self._new_optim_param()
    self.v = self._new_optim_param()

  def fstep(self, grads:list[Tensor]):
    """Run one optimizer step from `grads`, realize everything, and return the gradient-norm tensor.

    In fused mode only the first output of `_step` is used; it is sliced by `self.pos_params` and reshaped
    back to each parameter's shape. The returned tensor is the last entry of `_step`'s extras, i.e. the norm
    measured after the `grad_acc` division and before clipping.
    """
    if not self.fused:
      updates, extra = self._step([], grads)
    else:
      flat_out, extra = self._step([], grads)
      bounds = self.pos_params
      updates = [flat_out[0][bounds[idx]:bounds[idx+1]].reshape(p.shape) for idx, p in enumerate(self.params)]
    # index-based on purpose: a short `updates` list must fail loudly rather than be truncated
    for idx, p in enumerate(self.params):
      p.assign(self._apply_update(p, updates[idx]))
    Tensor.realize(*extra, *self.params, *self.buffers)
    return extra[-1]

  def _step(self, params:list[Tensor], grads:list[Tensor]) -> tuple[list[Tensor], list[Tensor]]:
    """Rescale and clip `grads`, update the moments, and return `(updates, extra)`.

    `params` is not read here. `updates` holds one lr-scaled update per gradient (weight decay is not
    included). `extra` is `[b1_t, b2_t, *m, *v, total_norm]`. The gradient tensors in the working list are
    overwritten through `assign`.
    """
    def _clip(g:Tensor, norm:Tensor) -> Tensor:
      # scale factor is clip_norm / (norm + 1e-6), capped at 1 so small gradients pass through unchanged
      return (g * (self.clip_norm / (norm + 1e-6)).clamp(max_=1.0)).cast(g.dtype)

    # fresh list with every gradient placed on the device of its moment buffer
    grads = [g.to(self.m[idx].device) if g.device != self.m[idx].device else g for idx, g in enumerate(grads)]

    if self.fused:
      # fused mode touches only the first gradient (presumably the single flat gradient -- confirm with caller)
      flat = grads[0]
      flat.assign(flat / self.grad_acc)
      total_norm = flat.float().square().sum().sqrt()
      flat.assign(_clip(flat, total_norm))
    else:
      for g in grads:
        g.assign(g / self.grad_acc)
      # global norm: per-tensor sums of squares in float, stacked and reduced once
      total_norm = Tensor.stack(*[g.float().square().sum() for g in grads]).sum().sqrt().contiguous()
      for g in grads:
        g.assign(_clip(g, total_norm))

    # in-place so the step-count state carries over to the next call
    self.b1_t *= self.b1
    self.b2_t *= self.b2

    updates = []
    for idx, g in enumerate(grads):
      m, v = self.m[idx], self.v[idx]
      m.assign((self.b1 * m + (1.0 - self.b1) * g).cast(m.dtype))
      v.assign((self.b2 * v + (1.0 - self.b2) * (g * g)).cast(v.dtype))
      # bias-corrected moments, kept in the dtype of the stored state
      m_hat = (m / (1.0 - self.b1_t)).cast(m.dtype)
      v_hat = (v / (1.0 - self.b2_t)).cast(v.dtype)
      direction = m_hat / (v_hat.sqrt() + self.eps)
      updates.append((self.lr * direction).cast(g.dtype))
    return updates, [self.b1_t, self.b2_t, *self.m, *self.v, total_norm]

  def _apply_update(self, t:Tensor, up:Tensor) -> Tensor:
    """Return the new value of `t`: `t - (up + lr * wd * t)`, with `wd` forced to 0 when `t.ndim < 3`."""
    decay = 0.0 if t.ndim < 3 else self.wd
    delta = up.shard_like(t) + self.lr.to(t.device) * decay * t.detach()
    return t.detach() - delta.cast(t.dtype)