diff --git a/examples/llm.c/export.py b/examples/llm.c/export.py
index b041aba10e..b4b8a3ed38 100755
--- a/examples/llm.c/export.py
+++ b/examples/llm.c/export.py
@@ -25,7 +25,7 @@ if __name__ == "__main__":
   Tensor.training = True
   optimizer = nn.optim.Adam(nn.state.get_parameters(model), lr=1e-4)
   warmup_count = getenv("WARMUP", 3)
-  for i in range(warmup_count): # TODO: why does it take three and not two to stablize
+  for i in range(warmup_count): # TODO: why does it take three and not two to stabilize
     GlobalCounters.reset()
     X = Tensor.empty(4, 64, dtype=dtypes.int).reshape(B, T)
     Y = Tensor.empty(4, 64, dtype=dtypes.int).reshape(B, T)
diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py
index 4785e9239f..981a56473f 100644
--- a/tinygrad/engine/schedule.py
+++ b/tinygrad/engine/schedule.py
@@ -34,7 +34,7 @@ pm_unbind = PatternMatcher([
 # **** schedule linearizer
 
 def create_schedule_with_vars(sched_sink:UOp) -> tuple[list[ScheduleItem], dict[Variable, int], dict[UOp, UOp]]:
-  # construnct the KERNEL children graph based on assigns
+  # construct the KERNEL children graph based on assigns
   children: defaultdict[UOp, list[UOp]] = defaultdict(list)
   in_degree: dict[UOp, int] = {}
   for u in (toposort:=sched_sink.toposort()):
diff --git a/tinygrad/nn/optim.py b/tinygrad/nn/optim.py
index 6baffe99c2..fc872389d2 100644
--- a/tinygrad/nn/optim.py
+++ b/tinygrad/nn/optim.py
@@ -54,7 +54,7 @@ class OptimizerGroup(Optimizer):
   def zero_grad(self): [o.zero_grad() for o in self.optimizers]
   def schedule_step(self) -> list[Tensor]: return [x for o in self.optimizers for x in o.schedule_step()]
 
-# LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 its just standard SGD.
+# LARS is essentially just trust ratio to SGD so if we just set the trust coeff 0.0 it's just standard SGD.
 def SGD(params: list[Tensor], lr=0.001, momentum=0.0, weight_decay=0.0, nesterov=False, classic=False):
   """
   Stochastic Gradient Descent (SGD) optimizer with optional momentum and weight decay.
@@ -88,7 +88,7 @@ class LARS(Optimizer):
       # classic momentum does post learning rate update
       if self.classic: g = g * r * self.lr
       if self.momentum:
-        # TODO: this contiguous is required for correctness becuase self.b[i] becomes a non contiguous view
+        # TODO: this contiguous is required for correctness because self.b[i] becomes a non contiguous view
         # the scheduler should detect this and just insert contiguous
         self.b[i].assign(self.momentum * self.b[i].contiguous() + g) # NOTE: self.b[i] is zero on the first run, no if required
         g = (g + self.momentum * self.b[i]) if self.nesterov else self.b[i]
@@ -97,7 +97,7 @@ class LARS(Optimizer):
       t.assign((t.detach() - g).cast(t.dtype))
     return self.b
 
-# LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 its just Adam/W.
+# LAMB is essentially just the trust ratio part of LARS applied to Adam/W so if we just set the trust ratio to 1.0 it's just Adam/W.
 def AdamW(params: list[Tensor], lr=0.001, b1=0.9, b2=0.999, eps=1e-8, weight_decay=0.01):
   """
   AdamW optimizer with optional weight decay.
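
Note on the LARS/LAMB comments touched above: the trust ratio scales each layer's update by a ratio of the weight norm to the gradient norm, and forcing the trust coefficient to 0.0 collapses that ratio to 1.0, leaving plain SGD with momentum. Below is a minimal NumPy sketch of that idea only; the lars_update name, its signature, the simplified ratio, and the NumPy usage are illustrative assumptions, not tinygrad's Tensor-based implementation shown in the diff.

# Illustrative sketch only (assumption), mirroring the LARS comment above; not tinygrad's code.
import numpy as np

def lars_update(param, grad, buf, lr=0.001, momentum=0.9, weight_decay=1e-4, trust_coef=0.001):
  g = grad + weight_decay * param
  if trust_coef != 0.0:
    w_norm, g_norm = np.linalg.norm(param), np.linalg.norm(g)
    # layer-wise trust ratio; fall back to 1.0 if either norm is zero
    r = trust_coef * w_norm / g_norm if w_norm > 0 and g_norm > 0 else 1.0
  else:
    r = 1.0  # trust coeff 0.0 -> the step below is standard SGD with momentum
  buf[:] = momentum * buf + g   # dense ndarray buffer, so no .contiguous() concern here
  param -= lr * r * buf         # lr applied after the momentum update
  return param, buf

With trust_coef=0.0 the step reduces to param -= lr * buf, i.e. SGD with momentum, which is the point of the fixed comment; LAMB applies the same trust ratio on top of the Adam/W update instead.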
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index f0b70c183d..158312622a 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -2297,7 +2297,7 @@ class Tensor(SimpleMathTrait):
     winograd_Bt = [[4, 0, -5, 0, 1, 0], [0, -4, -4, 1, 1, 0], [0, 4, -4, -1, 1, 0], [0, -2, -1, 2, 1, 0], [0, 2, -1, -2, 1, 0], [0, 4, 0, -5, 0, 1]]
     winograd_At = [[1, 1, 1, 1, 1, 0], [0, 1, -1, 2, -2, 0], [0, 1, 1, 4, 4, 0], [0, 1, -1, 8, -8, 1]] # applying At in pre-order doubles compile time
 
-    # todo: stride == dilation
+    # TODO: stride == dilation
     # use padding to round up to 4x4 output tiles
     # (bs, cin_, tyx, HWI)
     d = self.pad(sum([[padding_[i*2], padding_[i*2+1] + (-(dim + sum(padding_[i * 2:(i + 1) * 2]) - 2) % 4)] for i, dim in enumerate(self.shape[-len(HW):])], []))._pool(HWI, HWO) # noqa: E501
@@ -2414,7 +2414,7 @@ class Tensor(SimpleMathTrait):
   def _split_cumalu(self, axis:int, op:Ops) -> Tensor:
     axis = self._resolve_dim(axis)
     if self.ndim == 0 or 0 in self.shape: return self
-    # TODO: someday the optimizer will find this on it's own
+    # TODO: someday the optimizer will find this on its own
     # for now this is a two stage cumsum
     SPLIT = 256
     if not isinstance(s:=self.shape[axis], int) or s <= SPLIT*2: return self._cumalu(axis, op)
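
Note on the _split_cumalu comment touched above: the two-stage trick chunks the axis, takes a cumsum inside each chunk, then adds back the running totals of the preceding chunks, so no single reduction spans the whole axis. Below is a small NumPy sketch of that idea only; the two_stage_cumsum name, its signature, and the NumPy usage are illustrative assumptions, not tinygrad's implementation.

# Illustrative sketch only (assumption) of a two-stage cumsum; not tinygrad's code.
import numpy as np

def two_stage_cumsum(x: np.ndarray, split: int = 256) -> np.ndarray:
  if x.size <= 2 * split: return np.cumsum(x)        # small axis: single-stage cumsum is fine
  pad = (-x.size) % split
  local = np.cumsum(np.pad(x, (0, pad)).reshape(-1, split), axis=1)  # stage 1: per-chunk cumsum
  offsets = np.concatenate(([0], local[:-1, -1].cumsum()))           # stage 2: carry totals of previous chunks
  return (local + offsets[:, None]).reshape(-1)[:x.size]

x = np.random.rand(10_000)
assert np.allclose(two_stage_cumsum(x), np.cumsum(x))

The SPLIT = 256 constant in the diff plays the same role as split here: below 2*SPLIT elements along the axis, the single-stage path is used directly.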