mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-07 03:00:26 -04:00
write out all the functions, no auto binding (#543)
* write out all the functions, no auto binding * cleanups, more types * Slice is for internal calls only * improve typing * ugh, put slice back
This commit is contained in:
@@ -19,7 +19,7 @@ class BatchNorm2D:
|
||||
batch_mean = x_detached.mean(axis=(0,2,3))
|
||||
y = (x_detached - batch_mean.reshape(shape=[1, -1, 1, 1]))
|
||||
batch_var = (y*y).mean(axis=(0,2,3))
|
||||
batch_invstd = batch_var.add(self.eps)**-0.5
|
||||
batch_invstd = batch_var.add(self.eps).pow(-0.5)
|
||||
self.batch_invstd = None
|
||||
|
||||
# NOTE: wow, this is done all throughout training in most PyTorch models
|
||||
@@ -31,7 +31,7 @@ class BatchNorm2D:
|
||||
batch_mean, batch_var = self.running_mean, self.running_var
|
||||
# NOTE: this can be precomputed for static inference. if you manually update running_var, you have to reset this
|
||||
if not hasattr(self, "batch_invstd") or not self.batch_invstd:
|
||||
self.batch_invstd = batch_var.add(self.eps)**-0.5
|
||||
self.batch_invstd = batch_var.add(self.eps).pow(-0.5)
|
||||
batch_invstd = self.batch_invstd
|
||||
|
||||
return x.batchnorm(self.weight, self.bias, batch_mean, batch_invstd)
|
||||
@@ -61,7 +61,7 @@ class GroupNorm:
|
||||
self.num_groups, self.num_channels, self.eps, self.affine = num_groups, num_channels, eps, affine
|
||||
self.weight, self.bias = (Tensor.ones(num_channels), Tensor.zeros(num_channels)) if affine else (None, None)
|
||||
|
||||
def __call__(self, x):
|
||||
def __call__(self, x:Tensor):
|
||||
# reshape for layernorm to work as group norm
|
||||
# subtract mean and divide stddev
|
||||
x = x.reshape(x.shape[0], self.num_groups, -1).layernorm(eps=self.eps).reshape(x.shape)
|
||||
@@ -76,7 +76,7 @@ class LayerNorm:
|
||||
self.axis, self.eps, self.elementwise_affine = tuple(-1-i for i in range(len(normalized_shape))), eps, elementwise_affine
|
||||
self.weight, self.bias = (Tensor.ones(*normalized_shape), Tensor.zeros(*normalized_shape)) if elementwise_affine else (None, None)
|
||||
|
||||
def __call__(self, x:Tensor):
  """Apply layer normalization over self.axis, then the optional elementwise affine transform.

  Normalizes x with the eps configured at construction; when elementwise_affine
  is disabled, weight/bias are None and the normalized tensor is returned as-is.
  """
  x = x.layernorm(eps=self.eps, axis=self.axis)
  # fast path: no affine parameters to apply
  if not self.elementwise_affine: return x
  return x * self.weight + self.bias
|
||||
|
||||
@@ -14,6 +14,7 @@ class Optimizer:
|
||||
# TODO: this probably shouldn't change the gradients, just the ones used by the optimizer
def clipnorm(self, amount=1):
  """Clip every parameter's gradient in place to the range [-amount**2, amount**2].

  NOTE(review): despite the name, this clips element-wise by value at amount**2;
  it does not rescale by the gradient's L2 norm — confirm this is intended.
  """
  for param in self.params:
    assert param.grad is not None
    # clipnorm is the L2 norm, not value: is this right?
    param.grad.assign(param.grad.clip(-(amount**2), (amount**2)))
|
||||
|
||||
@@ -31,8 +32,9 @@ class SGD(Optimizer):
|
||||
super().__init__(params)
|
||||
self.lr = lr
|
||||
|
||||
def step(self) -> None:
  """One plain SGD update: t <- t - lr * t.grad for every parameter, then realize.

  detach() keeps the update itself out of the autograd graph; every parameter
  must have a gradient (asserted) before stepping.
  """
  for t in self.params:
    assert t.grad is not None
    t.assign(t.detach() - t.grad * self.lr)
  self.realize()
|
||||
|
||||
@@ -43,8 +45,9 @@ class RMSprop(Optimizer):
|
||||
|
||||
self.v = [Tensor.zeros(*t.shape, device=params[0].device, requires_grad=False) for t in self.params]
|
||||
|
||||
def step(self) -> None:
  """One RMSprop update per parameter.

  Maintains the running average of squared gradients:
    v <- decay * v + (1 - decay) * grad^2
    t <- t - lr * grad / (sqrt(v) + eps)
  The v buffers are passed to realize() along with the parameters.
  """
  for i, t in enumerate(self.params):
    assert t.grad is not None
    self.v[i] = self.decay * self.v[i] + (1.0 - self.decay) * (t.grad * t.grad)
    t.assign(t.detach() - (t.grad * self.lr).div(self.v[i].sqrt() + self.eps))
  self.realize(self.v)
|
||||
@@ -57,10 +60,11 @@ class Adam(Optimizer):
|
||||
self.m = [Tensor.zeros(*t.shape, device=params[0].device, requires_grad=False) for t in self.params]
|
||||
self.v = [Tensor.zeros(*t.shape, device=params[0].device, requires_grad=False) for t in self.params]
|
||||
|
||||
def step(self) -> None:
  """One Adam update per parameter.

  Increments the timestep and folds both bias corrections into a single
  step size `a` (the standard Adam reformulation), then updates the first
  (m) and second (v) moment estimates:
    m <- b1 * m + (1 - b1) * grad
    v <- b2 * v + (1 - b2) * grad^2
    t <- t - a * m / (sqrt(v) + eps)
  """
  self.t = self.t + 1
  # bias-corrected step size: lr * sqrt(1 - b2^t) / (1 - b1^t)
  a = self.lr * ((1.0 - self.b2**self.t)**0.5) / (1.0 - self.b1**self.t)
  for i, t in enumerate(self.params):
    assert t.grad is not None
    self.m[i] = self.b1 * self.m[i] + (1.0 - self.b1) * t.grad
    self.v[i] = self.b2 * self.v[i] + (1.0 - self.b2) * (t.grad * t.grad)
    t.assign(t.detach() - a * self.m[i].div(self.v[i].sqrt() + self.eps))
|
||||
|
||||
Reference in New Issue
Block a user