diff --git a/examples/llm.c/export.py b/examples/llm.c/export.py
index b041aba10e..b4b8a3ed38 100755
--- a/examples/llm.c/export.py
+++ b/examples/llm.c/export.py
@@ -25,7 +25,7 @@ if __name__ == "__main__":
   Tensor.training = True
   optimizer = nn.optim.Adam(nn.state.get_parameters(model), lr=1e-4)
   warmup_count = getenv("WARMUP", 3)
-  for i in range(warmup_count): # TODO: why does it take three and not two to stablize
+  for i in range(warmup_count): # TODO: why does it take three and not two to stabilize
     GlobalCounters.reset()
     X = Tensor.empty(4, 64, dtype=dtypes.int).reshape(B, T)
     Y = Tensor.empty(4, 64, dtype=dtypes.int).reshape(B, T)
diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 4785e9239f..981a56473f 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -34,7 +34,7 @@ pm_unbind = PatternMatcher([
 # **** schedule linearizer
 
 def create_schedule_with_vars(sched_sink:UOp) -> tuple[list[ScheduleItem], dict[Variable, int], dict[UOp, UOp]]:
-  # construnct the KERNEL children graph based on assigns
+  # construct the KERNEL children graph based on assigns
   children: defaultdict[UOp, list[UOp]] = defaultdict(list)
   in_degree: dict[UOp, int] = {}
   for u in (toposort:=sched_sink.toposort()):
diff --git a/tinygrad/nn/optim.py b/tinygrad/nn/optim.py
index 6baffe99c2..fc872389d2 100644
--- a/tinygrad/nn/optim.py
+++ b/tinygrad/nn/optim.py
@@ -54,7 +54,7 @@ class OptimizerGroup(Optimizer):
   def zero_grad(self): [o.zero_grad() for o in self.optimizers]
   def schedule_step(self) -> list[Tensor]: return [x for o in self.optimizers for x in o.schedule_step()]
 
-# LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 its just standard SGD.
+# LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 it's just standard SGD.
 def SGD(params: list[Tensor], lr=0.001, momentum=0.0, weight_decay=0.0, nesterov=False, classic=False):
   """
   Stochastic Gradient Descent (SGD) optimizer with optional momentum and weight decay.
@@ -88,7 +88,7 @@ class LARS(Optimizer):
       # classic momentum does post learning rate update
       if self.classic: g = g * r * self.lr
       if self.momentum:
-        # TODO: this contiguous is required for correctness becuase self.b[i] becomes a non contiguous view
+        # TODO: this contiguous is required for correctness because self.b[i] becomes a non contiguous view
         # the scheduler should detect this and just insert contiguous
         self.b[i].assign(self.momentum * self.b[i].contiguous() + g) # NOTE: self.b[i] is zero on the first run, no if required
         g = (g + self.momentum * self.b[i]) if self.nesterov else self.b[i]
@@ -97,7 +97,7 @@ class LARS(Optimizer):
       t.assign((t.detach() - g).cast(t.dtype))
     return self.b
 
-# LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 its just Adam/W.
+# LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 it's just Adam/W.
 def AdamW(params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8, weight_decay=0.01):
   """
   AdamW optimizer with optional weight decay.
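
Note on the LARS/LAMB comments touched above: the trust ratio scales each layer's update by a ratio of the weight norm to the gradient norm, and forcing the trust coefficient to 0.0 collapses that ratio to 1.0, leaving plain SGD with momentum. Below is a minimal NumPy sketch of that idea only; the lars_update name, its signature, the simplified ratio, and the NumPy usage are illustrative assumptions, not tinygrad's Tensor-based implementation shown in the diff.

# Illustrative sketch only (assumption), mirroring the LARS comment above; not tinygrad's code.
import numpy as np

def lars_update(param, grad, buf, lr=0.001, momentum=0.9, weight_decay=1e-4, trust_coef=0.001):
  g = grad + weight_decay * param
  if trust_coef != 0.0:
    w_norm, g_norm = np.linalg.norm(param), np.linalg.norm(g)
    # layer-wise trust ratio; fall back to 1.0 if either norm is zero
    r = trust_coef * w_norm / g_norm if w_norm > 0 and g_norm > 0 else 1.0
  else:
    r = 1.0  # trust coeff 0.0 -> the step below is standard SGD with momentum
  buf[:] = momentum * buf + g   # dense ndarray buffer, so no .contiguous() concern here
  param -= lr * r * buf         # lr applied after the momentum update
  return param, buf

With trust_coef=0.0 the step reduces to param -= lr * buf, i.e. SGD with momentum, which is the point of the fixed comment; LAMB applies the same trust ratio on top of the Adam/W update instead.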
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index f0b70c183d..158312622a 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -2297,7 +2297,7 @@ class Tensor(SimpleMathTrait):
     winograd_Bt = [[4, 0, -5, 0, 1, 0], [0, -4, -4, 1, 1, 0], [0, 4, -4, -1, 1, 0], [0, -2, -1, 2, 1, 0], [0, 2, -1, -2, 1, 0], [0, 4, 0, -5, 0, 1]]
     winograd_At = [[1, 1, 1, 1, 1, 0], [0, 1, -1, 2, -2, 0], [0, 1, 1, 4, 4, 0], [0, 1, -1, 8, -8, 1]] # applying At in pre-order doubles compile time
 
-    # todo: stride == dilation
+    # TODO: stride == dilation
     # use padding to round up to 4x4 output tiles
     # (bs, cin_, tyx, HWI)
     d = self.pad(sum([[padding_[i*2], padding_[i*2+1] + (-(dim + sum(padding_[i * 2:(i + 1) * 2]) - 2) % 4)] for i, dim in enumerate(self.shape[-len(HW):])], []))._pool(HWI, HWO) # noqa: E501
@@ -2414,7 +2414,7 @@ class Tensor(SimpleMathTrait):
   def _split_cumalu(self, axis:int, op:Ops) -> Tensor:
     axis = self._resolve_dim(axis)
     if self.ndim == 0 or 0 in self.shape: return self
-    # TODO: someday the optimizer will find this on it's own
+    # TODO: someday the optimizer will find this on its own
     # for now this is a two stage cumsum
     SPLIT = 256
     if not isinstance(s:=self.shape[axis], int) or s <= SPLIT*2: return self._cumalu(axis, op)
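
Note on the _split_cumalu comment touched above: the two-stage trick chunks the axis, takes a cumsum inside each chunk, then adds back the running totals of the preceding chunks, so no single reduction spans the whole axis. Below is a small NumPy sketch of that idea only; the two_stage_cumsum name, its signature, and the NumPy usage are illustrative assumptions, not tinygrad's implementation.

# Illustrative sketch only (assumption) of a two-stage cumsum; not tinygrad's code.
import numpy as np

def two_stage_cumsum(x: np.ndarray, split: int = 256) -> np.ndarray:
  if x.size <= 2 * split: return np.cumsum(x)        # small axis: single-stage cumsum is fine
  pad = (-x.size) % split
  local = np.cumsum(np.pad(x, (0, pad)).reshape(-1, split), axis=1)  # stage 1: per-chunk cumsum
  offsets = np.concatenate(([0], local[:-1, -1].cumsum()))           # stage 2: carry totals of previous chunks
  return (local + offsets[:, None]).reshape(-1)[:x.size]

x = np.random.rand(10_000)
assert np.allclose(two_stage_cumsum(x), np.cumsum(x))

The SPLIT = 256 constant in the diff plays the same role as split here: below 2*SPLIT elements along the axis, the single-stage path is used directly.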