correct misspelled words (#10165)

Kevin Buhler
2025-05-05 09:12:41 -06:00
committed by GitHub
parent 98f4a831c8
commit 363481e2fb
4 changed files with 7 additions and 7 deletions


@@ -25,7 +25,7 @@ if __name__ == "__main__":
Tensor.training = True
optimizer = nn.optim.Adam(nn.state.get_parameters(model), lr=1e-4)
warmup_count = getenv("WARMUP", 3)
-for i in range(warmup_count): # TODO: why does it take three and not two to stablize
+for i in range(warmup_count): # TODO: why does it take three and not two to stabilize
GlobalCounters.reset()
X = Tensor.empty(4, 64, dtype=dtypes.int).reshape(B, T)
Y = Tensor.empty(4, 64, dtype=dtypes.int).reshape(B, T)
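The WARMUP loop above exists because tinygrad's TinyJit usually needs a few calls before timings settle: the first call runs eagerly, a later call captures the launched kernels, and only the calls after that replay the capture. A minimal hedged sketch of that pattern (the step function, shapes, and iteration count here are illustrative, not from this example):

from tinygrad import Tensor, TinyJit

@TinyJit
def step(x: Tensor) -> Tensor:
  return (x * 2).sum().realize()   # any fixed-shape computation; the JIT records the kernels it launches

for i in range(3):                 # discard these as warmup before timing, as the WARMUP loop above does
  step(Tensor.rand(4, 64))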


@@ -34,7 +34,7 @@ pm_unbind = PatternMatcher([
# **** schedule linearizer
def create_schedule_with_vars(sched_sink:UOp) -> tuple[list[ScheduleItem], dict[Variable, int], dict[UOp, UOp]]:
-# construnct the KERNEL children graph based on assigns
+# construct the KERNEL children graph based on assigns
children: defaultdict[UOp, list[UOp]] = defaultdict(list)
in_degree: dict[UOp, int] = {}
for u in (toposort:=sched_sink.toposort()):
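The children/in_degree bookkeeping built above is the standard setup for a Kahn-style topological linearization: record, for every node, which nodes depend on it and how many of its own dependencies are still unscheduled, then repeatedly emit nodes whose count has reached zero. A generic hedged sketch of that pattern (plain strings instead of UOps; not the actual scheduler):

from collections import defaultdict, deque

def linearize(deps: dict[str, list[str]]) -> list[str]:
  # deps maps each node to the nodes it depends on; every node appears as a key
  children: defaultdict[str, list[str]] = defaultdict(list)
  in_degree: dict[str, int] = {n: 0 for n in deps}
  for node, parents in deps.items():
    for p in parents:
      children[p].append(node)  # p must be scheduled before node
      in_degree[node] += 1
  queue = deque([n for n, d in in_degree.items() if d == 0])
  order: list[str] = []
  while queue:
    n = queue.popleft()
    order.append(n)
    for c in children[n]:
      in_degree[c] -= 1
      if in_degree[c] == 0: queue.append(c)
  return order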


@@ -54,7 +54,7 @@ class OptimizerGroup(Optimizer):
def zero_grad(self): [o.zero_grad() for o in self.optimizers]
def schedule_step(self) -> list[Tensor]: return [x for o in self.optimizers for x in o.schedule_step()]
-# LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 its just standard SGD.
+# LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 it's just standard SGD.
def SGD(params: list[Tensor], lr=0.001, momentum=0.0, weight_decay=0.0, nesterov=False, classic=False):
"""
Stochastic Gradient Descent (SGD) optimizer with optional momentum and weight decay.
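Per the comment above, SGD in this module is LARS with the trust coefficient disabled, so it exposes the usual optimizer interface. A hedged usage sketch (model, x, and y are placeholders; keyword names follow the signature shown above):

from tinygrad import Tensor, nn

Tensor.training = True                                   # optimizer steps run in training mode, as in the example above
opt = nn.optim.SGD(nn.state.get_parameters(model), lr=0.001, momentum=0.9)  # `model` is a placeholder module
opt.zero_grad()
loss = (model(x) - y).square().mean()                    # placeholder forward pass and loss
loss.backward()
opt.step()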
@@ -88,7 +88,7 @@ class LARS(Optimizer):
# classic momentum does post learning rate update
if self.classic: g = g * r * self.lr
if self.momentum:
-# TODO: this contiguous is required for correctness becuase self.b[i] becomes a non contiguous view
+# TODO: this contiguous is required for correctness because self.b[i] becomes a non contiguous view
# the scheduler should detect this and just insert contiguous
self.b[i].assign(self.momentum * self.b[i].contiguous() + g) # NOTE: self.b[i] is zero on the first run, no if required
g = (g + self.momentum * self.b[i]) if self.nesterov else self.b[i]
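Stripping away the trust ratio and the classic/pre-lr path, the buffer update above is plain (optionally Nesterov) momentum. An eager NumPy sketch of the same recurrence, illustrative only:

import numpy as np

def momentum_step(param: np.ndarray, g: np.ndarray, b: np.ndarray, lr: float, momentum: float = 0.9, nesterov: bool = False):
  b[:] = momentum * b + g                    # velocity buffer; zero on the first run, so no special case needed
  upd = g + momentum * b if nesterov else b  # Nesterov looks ahead along the updated velocity
  param -= lr * upd
  return param, b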
@@ -97,7 +97,7 @@ class LARS(Optimizer):
t.assign((t.detach() - g).cast(t.dtype))
return self.b
-# LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 its just Adam/W.
+# LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 it's just Adam/W.
def AdamW(params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8, weight_decay=0.01):
"""
AdamW optimizer with optional weight decay.
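Symmetrically, the comment above says AdamW is LAMB with the trust ratio pinned to 1.0, so it is constructed like any other optimizer here and drops into the same zero_grad/backward/step loop sketched after the SGD hunk. A hedged one-liner (again `model` is a placeholder; keyword names follow the signature shown above):

opt = nn.optim.AdamW(nn.state.get_parameters(model), lr=3e-4, weight_decay=0.01)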


@@ -2297,7 +2297,7 @@ class Tensor(SimpleMathTrait):
winograd_Bt = [[4, 0, -5, 0, 1, 0], [0, -4, -4, 1, 1, 0], [0, 4, -4, -1, 1, 0], [0, -2, -1, 2, 1, 0], [0, 2, -1, -2, 1, 0], [0, 4, 0, -5, 0, 1]]
winograd_At = [[1, 1, 1, 1, 1, 0], [0, 1, -1, 2, -2, 0], [0, 1, 1, 4, 4, 0], [0, 1, -1, 8, -8, 1]] # applying At in pre-order doubles compile time
-# todo: stride == dilation
+# TODO: stride == dilation
# use padding to round up to 4x4 output tiles
# (bs, cin_, tyx, HWI)
d = self.pad(sum([[padding_[i*2], padding_[i*2+1] + (-(dim + sum(padding_[i * 2:(i + 1) * 2]) - 2) % 4)] for i, dim in enumerate(self.shape[-len(HW):])], []))._pool(HWI, HWO) # noqa: E501
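The pad expression above rounds each spatial dim up so the 3x3-kernel output splits into whole 4-wide tiles: the output length along an axis is dim + pads - 2, and -(dim + pads - 2) % 4 is the extra right-padding needed to make that a multiple of 4. A tiny worked check with illustrative numbers:

dim, pads = 9, 0                           # one spatial axis, no user padding
extra = -(dim + pads - 2) % 4              # output length 7 needs 1 more to reach 8
assert (dim + pads + extra - 2) % 4 == 0   # 8 -> exactly two 4-wide output tiles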
@@ -2414,7 +2414,7 @@ class Tensor(SimpleMathTrait):
def _split_cumalu(self, axis:int, op:Ops) -> Tensor:
axis = self._resolve_dim(axis)
if self.ndim == 0 or 0 in self.shape: return self
-# TODO: someday the optimizer will find this on it's own
+# TODO: someday the optimizer will find this on its own
# for now this is a two stage cumsum
SPLIT = 256
if not isinstance(s:=self.shape[axis], int) or s <= SPLIT*2: return self._cumalu(axis, op)
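The two-stage scheme mentioned above is the standard trick for long scans: run the cumulative op inside fixed-size chunks, then add the running totals of all preceding chunks. A hedged NumPy sketch of the idea for a 1-D cumsum (not the Tensor implementation):

import numpy as np

def two_stage_cumsum(x: np.ndarray, split: int = 256) -> np.ndarray:
  n = x.shape[0]
  pad = -n % split                                           # pad so the length divides into chunks of `split`
  chunks = np.concatenate([x, np.zeros(pad, x.dtype)]).reshape(-1, split)
  local = np.cumsum(chunks, axis=1)                          # stage 1: scan inside each chunk
  offsets = np.concatenate(([0], local[:-1, -1])).cumsum()   # stage 2: totals of the preceding chunks
  return (local + offsets[:, None]).reshape(-1)[:n]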