correct misspelled words (#10165)

Kevin Buhler
2025-05-05 09:12:41 -06:00
committed by GitHub
parent 98f4a831c8
commit 363481e2fb
4 changed files with 7 additions and 7 deletions


@@ -25,7 +25,7 @@ if __name__ == "__main__":
Tensor.training = True
optimizer = nn.optim.Adam(nn.state.get_parameters(model), lr=1e-4)
warmup_count = getenv("WARMUP", 3)
-for i in range(warmup_count): # TODO: why does it take three and not two to stablize
+for i in range(warmup_count): # TODO: why does it take three and not two to stabilize
GlobalCounters.reset()
X = Tensor.empty(4, 64, dtype=dtypes.int).reshape(B, T)
Y = Tensor.empty(4, 64, dtype=dtypes.int).reshape(B, T)
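The WARMUP loop above exists because tinygrad's TinyJit usually needs a few calls before timings settle: the first call runs eagerly, a later call captures the launched kernels, and only the calls after that replay the capture. A minimal hedged sketch of that pattern (the step function, shapes, and iteration count here are illustrative, not from this example):

from tinygrad import Tensor, TinyJit

@TinyJit
def step(x: Tensor) -> Tensor:
  return (x * 2).sum().realize()   # any fixed-shape computation; the JIT records the kernels it launches

for i in range(3):                 # discard these as warmup before timing, as the WARMUP loop above does
  step(Tensor.rand(4, 64))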


@@ -34,7 +34,7 @@ pm_unbind = PatternMatcher([
# **** schedule linearizer
def create_schedule_with_vars(sched_sink:UOp) -> tuple[list[ScheduleItem], dict[Variable, int], dict[UOp, UOp]]:
-# construnct the KERNEL children graph based on assigns
+# construct the KERNEL children graph based on assigns
children: defaultdict[UOp, list[UOp]] = defaultdict(list)
in_degree: dict[UOp, int] = {}
for u in (toposort:=sched_sink.toposort()):
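The children/in_degree bookkeeping built above is the standard setup for a Kahn-style topological linearization: record, for every node, which nodes depend on it and how many of its own dependencies are still unscheduled, then repeatedly emit nodes whose count has reached zero. A generic hedged sketch of that pattern (plain strings instead of UOps; not the actual scheduler):

from collections import defaultdict, deque

def linearize(deps: dict[str, list[str]]) -> list[str]:
  # deps maps each node to the nodes it depends on; every node appears as a key
  children: defaultdict[str, list[str]] = defaultdict(list)
  in_degree: dict[str, int] = {n: 0 for n in deps}
  for node, parents in deps.items():
    for p in parents:
      children[p].append(node)  # p must be scheduled before node
      in_degree[node] += 1
  queue = deque([n for n, d in in_degree.items() if d == 0])
  order: list[str] = []
  while queue:
    n = queue.popleft()
    order.append(n)
    for c in children[n]:
      in_degree[c] -= 1
      if in_degree[c] == 0: queue.append(c)
  return order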


@@ -54,7 +54,7 @@ class OptimizerGroup(Optimizer):
def zero_grad(self): [o.zero_grad() for o in self.optimizers]
def schedule_step(self) -> list[Tensor]: return [x for o in self.optimizers for x in o.schedule_step()]
-# LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 its just standard SGD.
+# LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 it's just standard SGD.
def SGD(params: list[Tensor], lr=0.001, momentum=0.0, weight_decay=0.0, nesterov=False, classic=False):
"""
Stochastic Gradient Descent (SGD) optimizer with optional momentum and weight decay.
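Per the comment above, SGD in this module is LARS with the trust coefficient disabled, so it exposes the usual optimizer interface. A hedged usage sketch (model, x, and y are placeholders; keyword names follow the signature shown above):

from tinygrad import Tensor, nn

Tensor.training = True                                   # optimizer steps run in training mode, as in the example above
opt = nn.optim.SGD(nn.state.get_parameters(model), lr=0.001, momentum=0.9)  # `model` is a placeholder module
opt.zero_grad()
loss = (model(x) - y).square().mean()                    # placeholder forward pass and loss
loss.backward()
opt.step()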
@@ -88,7 +88,7 @@ class LARS(Optimizer):
# classic momentum does post learning rate update
if self.classic: g = g * r * self.lr
if self.momentum:
-# TODO: this contiguous is required for correctness becuase self.b[i] becomes a non contiguous view
+# TODO: this contiguous is required for correctness because self.b[i] becomes a non contiguous view
# the scheduler should detect this and just insert contiguous
self.b[i].assign(self.momentum * self.b[i].contiguous() + g) # NOTE: self.b[i] is zero on the first run, no if required
g = (g + self.momentum * self.b[i]) if self.nesterov else self.b[i]
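Stripping away the trust ratio and the classic/pre-lr path, the buffer update above is plain (optionally Nesterov) momentum. An eager NumPy sketch of the same recurrence, illustrative only:

import numpy as np

def momentum_step(param: np.ndarray, g: np.ndarray, b: np.ndarray, lr: float, momentum: float = 0.9, nesterov: bool = False):
  b[:] = momentum * b + g                    # velocity buffer; zero on the first run, so no special case needed
  upd = g + momentum * b if nesterov else b  # Nesterov looks ahead along the updated velocity
  param -= lr * upd
  return param, b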
@@ -97,7 +97,7 @@ class LARS(Optimizer):
t.assign((t.detach() - g).cast(t.dtype))
return self.b
-# LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 its just Adam/W.
+# LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 it's just Adam/W.
def AdamW(params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8, weight_decay=0.01):
"""
AdamW optimizer with optional weight decay.
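Symmetrically, the comment above says AdamW is LAMB with the trust ratio pinned to 1.0, so it is constructed like any other optimizer here and drops into the same zero_grad/backward/step loop sketched after the SGD hunk. A hedged one-liner (again `model` is a placeholder; keyword names follow the signature shown above):

opt = nn.optim.AdamW(nn.state.get_parameters(model), lr=3e-4, weight_decay=0.01)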


@@ -2297,7 +2297,7 @@ class Tensor(SimpleMathTrait):
winograd_Bt = [[4, 0, -5, 0, 1, 0], [0, -4, -4, 1, 1, 0], [0, 4, -4, -1, 1, 0], [0, -2, -1, 2, 1, 0], [0, 2, -1, -2, 1, 0], [0, 4, 0, -5, 0, 1]]
winograd_At = [[1, 1, 1, 1, 1, 0], [0, 1, -1, 2, -2, 0], [0, 1, 1, 4, 4, 0], [0, 1, -1, 8, -8, 1]] # applying At in pre-order doubles compile time
-# todo: stride == dilation
+# TODO: stride == dilation
# use padding to round up to 4x4 output tiles
# (bs, cin_, tyx, HWI)
d = self.pad(sum([[padding_[i*2], padding_[i*2+1] + (-(dim + sum(padding_[i * 2:(i + 1) * 2]) - 2) % 4)] for i, dim in enumerate(self.shape[-len(HW):])], []))._pool(HWI, HWO) # noqa: E501
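The pad expression above rounds each spatial dim up so the 3x3-kernel output splits into whole 4-wide tiles: the output length along an axis is dim + pads - 2, and -(dim + pads - 2) % 4 is the extra right-padding needed to make that a multiple of 4. A tiny worked check with illustrative numbers:

dim, pads = 9, 0                           # one spatial axis, no user padding
extra = -(dim + pads - 2) % 4              # output length 7 needs 1 more to reach 8
assert (dim + pads + extra - 2) % 4 == 0   # 8 -> exactly two 4-wide output tiles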
@@ -2414,7 +2414,7 @@ class Tensor(SimpleMathTrait):
def _split_cumalu(self, axis:int, op:Ops) -> Tensor:
axis = self._resolve_dim(axis)
if self.ndim == 0 or 0 in self.shape: return self
-# TODO: someday the optimizer will find this on it's own
+# TODO: someday the optimizer will find this on its own
# for now this is a two stage cumsum
SPLIT = 256
if not isinstance(s:=self.shape[axis], int) or s <= SPLIT*2: return self._cumalu(axis, op)
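The two-stage scheme mentioned above is the standard trick for long scans: run the cumulative op inside fixed-size chunks, then add the running totals of all preceding chunks. A hedged NumPy sketch of the idea for a 1-D cumsum (not the Tensor implementation):

import numpy as np

def two_stage_cumsum(x: np.ndarray, split: int = 256) -> np.ndarray:
  n = x.shape[0]
  pad = -n % split                                           # pad so the length divides into chunks of `split`
  chunks = np.concatenate([x, np.zeros(pad, x.dtype)]).reshape(-1, split)
  local = np.cumsum(chunks, axis=1)                          # stage 1: scan inside each chunk
  offsets = np.concatenate(([0], local[:-1, -1])).cumsum()   # stage 2: totals of the preceding chunks
  return (local + offsets[:, None]).reshape(-1)[:n]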