correct mispelled words (#10165)

@@ -25,7 +25,7 @@ if __name__ == "__main__":
   Tensor.training = True
   optimizer = nn.optim.Adam(nn.state.get_parameters(model), lr=1e-4)
   warmup_count = getenv("WARMUP", 3)
-  for i in range(warmup_count): # TODO: why does it take three and not two to stablize
+  for i in range(warmup_count): # TODO: why does it take three and not two to stabilize
     GlobalCounters.reset()
     X = Tensor.empty(4, 64, dtype=dtypes.int).reshape(B, T)
     Y = Tensor.empty(4, 64, dtype=dtypes.int).reshape(B, T)
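
Note on the hunk above: the loop only warms the step up before measurement (GlobalCounters.reset() discards the counters from the warm-up runs). A minimal sketch of the same warm-up-then-measure pattern, assuming a hypothetical step() callable; this is an illustration, not code from the repo:

import time

def benchmark(step, warmup=3, iters=10):
  # early calls pay one-off compile/JIT-capture costs, so they are not timed
  for _ in range(warmup): step()
  t0 = time.perf_counter()
  for _ in range(iters): step()
  return (time.perf_counter() - t0) / iters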

@@ -34,7 +34,7 @@ pm_unbind = PatternMatcher([
 # **** schedule linearizer

 def create_schedule_with_vars(sched_sink:UOp) -> tuple[list[ScheduleItem], dict[Variable, int], dict[UOp, UOp]]:
-  # construnct the KERNEL children graph based on assigns
+  # construct the KERNEL children graph based on assigns
   children: defaultdict[UOp, list[UOp]] = defaultdict(list)
   in_degree: dict[UOp, int] = {}
   for u in (toposort:=sched_sink.toposort()):
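
The children / in_degree pair built above is the standard setup for a Kahn-style topological sort of the kernel graph. A generic sketch of that pattern over an assumed plain edge list (not the repo's scheduler):

from collections import defaultdict, deque

def toposort(edges):  # edges: list of (parent, child) pairs
  children, in_degree = defaultdict(list), defaultdict(int)
  for p, c in edges:
    children[p].append(c)
    in_degree[c] += 1
    in_degree[p] += 0                 # register roots with degree 0
  queue = deque(n for n, d in in_degree.items() if d == 0)
  order = []
  while queue:
    n = queue.popleft()
    order.append(n)
    for c in children[n]:             # release children whose parents are all scheduled
      in_degree[c] -= 1
      if in_degree[c] == 0: queue.append(c)
  return order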

@@ -54,7 +54,7 @@ class OptimizerGroup(Optimizer):
   def zero_grad(self): [o.zero_grad() for o in self.optimizers]
   def schedule_step(self) -> list[Tensor]: return [x for o in self.optimizers for x in o.schedule_step()]

-# LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 its just standard SGD.
+# LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 it's just standard SGD.
 def SGD(params: list[Tensor], lr=0.001, momentum=0.0, weight_decay=0.0, nesterov=False, classic=False):
   """
   Stochastic Gradient Descent (SGD) optimizer with optional momentum and weight decay.
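
The reworded comment describes LARS as SGD scaled by a per-layer trust ratio. A hedged single-tensor numpy sketch of that idea (the usual LARS formula, not the repo's exact code); with tcoef set to 0.0 the ratio is taken as 1.0 and the update collapses to plain SGD:

import numpy as np

def lars_step(w, g, lr=0.001, tcoef=0.001, weight_decay=0.0):
  if weight_decay: g = g + weight_decay * w
  # trust ratio: layer-wise |w| / |g|, scaled by the trust coefficient
  r = tcoef * np.linalg.norm(w) / (np.linalg.norm(g) + 1e-12) if tcoef != 0.0 else 1.0
  return w - lr * r * g   # tcoef == 0.0 -> r == 1.0 -> standard SGD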

@@ -88,7 +88,7 @@ class LARS(Optimizer):
       # classic momentum does post learning rate update
       if self.classic: g = g * r * self.lr
       if self.momentum:
-        # TODO: this contiguous is required for correctness becuase self.b[i] becomes a non contiguous view
+        # TODO: this contiguous is required for correctness because self.b[i] becomes a non contiguous view
         # the scheduler should detect this and just insert contiguous
         self.b[i].assign(self.momentum * self.b[i].contiguous() + g) # NOTE: self.b[i] is zero on the first run, no if required
         g = (g + self.momentum * self.b[i]) if self.nesterov else self.b[i]
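
For reference, the momentum recurrence the .assign() line implements, restated as a plain numpy sketch (illustration only; in the repo self.b[i] is a lazy Tensor, which is why the .contiguous() is needed there):

import numpy as np

def momentum_update(w, g, b, lr, momentum=0.9, nesterov=False):
  b = momentum * b + g                        # velocity buffer; b starts at zero
  d = (g + momentum * b) if nesterov else b   # nesterov peeks one step ahead
  return w - lr * d, b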

@@ -97,7 +97,7 @@ class LARS(Optimizer):
       t.assign((t.detach() - g).cast(t.dtype))
     return self.b

-# LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 its just Adam/W.
+# LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 it's just Adam/W.
 def AdamW(params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8, weight_decay=0.01):
   """
   AdamW optimizer with optional weight decay.
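
The reworded LAMB comment says the same trust ratio is layered on top of Adam/W. A hedged single-tensor numpy sketch of that relationship (the textbook formulation, not the repo's implementation); forcing the ratio to 1.0 recovers plain Adam/W:

import numpy as np

def lamb_step(w, g, m, v, t, lr=0.001, b1=0.9, b2=0.999, eps=1e-8, wd=0.01, adam=False):
  m = b1 * m + (1 - b1) * g
  v = b2 * v + (1 - b2) * g * g
  u = (m / (1 - b1**t)) / (np.sqrt(v / (1 - b2**t)) + eps) + wd * w      # Adam/W direction
  r = 1.0 if adam else np.linalg.norm(w) / (np.linalg.norm(u) + 1e-12)  # trust ratio
  return w - lr * r * u, m, v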

@@ -2297,7 +2297,7 @@ class Tensor(SimpleMathTrait):
     winograd_Bt = [[4, 0, -5, 0, 1, 0], [0, -4, -4, 1, 1, 0], [0, 4, -4, -1, 1, 0], [0, -2, -1, 2, 1, 0], [0, 2, -1, -2, 1, 0], [0, 4, 0, -5, 0, 1]]
     winograd_At = [[1, 1, 1, 1, 1, 0], [0, 1, -1, 2, -2, 0], [0, 1, 1, 4, 4, 0], [0, 1, -1, 8, -8, 1]] # applying At in pre-order doubles compile time

-    # todo: stride == dilation
+    # TODO: stride == dilation
     # use padding to round up to 4x4 output tiles
     # (bs, cin_, tyx, HWI)
     d = self.pad(sum([[padding_[i*2], padding_[i*2+1] + (-(dim + sum(padding_[i * 2:(i + 1) * 2]) - 2) % 4)] for i, dim in enumerate(self.shape[-len(HW):])], []))._pool(HWI, HWO) # noqa: E501
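
The pad() expression rounds each spatial dim up so the 3x3 Winograd kernel produces whole 4-wide output tiles. The per-dimension arithmetic, pulled out into an illustrative helper (hypothetical, not repo code):

def round_up_pad(dim, pad_left, pad_right, tile=4, kernel=3):
  out = dim + pad_left + pad_right - (kernel - 1)  # conv output extent for this dim
  extra = -out % tile                              # extra right padding to fill the last tile
  return pad_left, pad_right + extra

# e.g. dim=10, pads=(1, 1): output extent 10, so 2 extra -> padded output 12 = 3 tiles of 4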

@@ -2414,7 +2414,7 @@ class Tensor(SimpleMathTrait):
   def _split_cumalu(self, axis:int, op:Ops) -> Tensor:
     axis = self._resolve_dim(axis)
     if self.ndim == 0 or 0 in self.shape: return self
-    # TODO: someday the optimizer will find this on it's own
+    # TODO: someday the optimizer will find this on its own
     # for now this is a two stage cumsum
     SPLIT = 256
     if not isinstance(s:=self.shape[axis], int) or s <= SPLIT*2: return self._cumalu(axis, op)
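
The "two stage cumsum" the comment refers to splits the axis into SPLIT-sized chunks, scans inside each chunk, then adds the running totals of the preceding chunks. A numpy sketch of that idea (illustration only, not the repo's implementation):

import numpy as np

def two_stage_cumsum(x, SPLIT=256):
  n = len(x)
  xp = np.pad(x, (0, -n % SPLIT)).reshape(-1, SPLIT)              # pad to whole chunks
  local = np.cumsum(xp, axis=1)                                   # stage 1: per-chunk cumsum
  offsets = np.concatenate(([0], np.cumsum(local[:, -1])[:-1]))   # stage 2: totals of earlier chunks
  return (local + offsets[:, None]).reshape(-1)[:n]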