default opt level 2

Author: George Hotz
Date: 2022-07-17 14:54:40 -07:00
Commit: eda6f071b2
Parent: 73b0471b25
3 changed files with 10 additions and 6 deletions


@@ -31,9 +31,6 @@ def train(model, X_train, Y_train, optim, steps, BS=128, lossfn=sparse_categoric
     loss.backward()
     optim.step()
-
-    # TODO: corealize
-    for p in optim.params: p.realize()
 
     # printing
     if not noloss:
       cat = np.argmax(out.cpu().data, axis=-1)
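
With realization moved into optim.step() (see the optimizer diff below), the training loop no longer forces each parameter by hand. A minimal, self-contained sketch of the resulting loop shape; MockLoss and MockOptim are illustrative stand-ins, not tinygrad's real classes:

class MockLoss:
  def backward(self): print("backward: grads populated")

class MockOptim:
  def step(self): print("step: update applied, params realized inside step()")

loss, optim = MockLoss(), MockOptim()
loss.backward()
optim.step()  # no trailing "for p in optim.params: p.realize()" needed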


@@ -22,12 +22,12 @@ OpType = Union[Type[UnaryOps], Type[BinaryOps], Type[ReduceOps], Type[MovementOp
 DEBUG = int(os.getenv("DEBUG", "0"))
 GRAPH = int(os.getenv("GRAPH", "0"))
-OPT = int(os.getenv("OPT", "1"))
+OPT = int(os.getenv("OPT", "2"))
 NOCONV = int(os.getenv("NOCONV", "0"))
 # TODO: movement ops that only change shape are really nops. treat them as such
 REMOVE_MOVEMENT_NOPS, MERGE_UNARY_OPS, MERGE_ELEMENTWISE_INTO_REDUCE = OPT>=1, OPT>=1, OPT>=1
-MERGE_ELEMENTWISE_OPS, MERGE_ONE_CONV_INTO_ELEMENTWISE, SHUFFLE_RESHAPE_OPS = OPT>=2, OPT>=2, OPT>=2
+MERGE_ELEMENTWISE_OPS, MERGE_ONE_CONV_INTO_ELEMENTWISE = OPT>=2, OPT>=2
 SHUFFLE_MOVEMENT_OPS = OPT>=3
 SHUFFLE_PAD_OPS = OPT>=4 # NOTE: 0/0 is NaN if you pad, so this can change the output
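
This flag block is the headline change: a single integer environment variable gates successive tiers of graph rewrites, and the default tier moves from 1 to 2, so elementwise merging and one-conv fusion are now on out of the box. Dropping SHUFFLE_RESHAPE_OPS also means reshapes are no longer shuffled at tier 2; every movement-op shuffle now waits for OPT>=3. A runnable sketch of the same gating pattern (flags_demo.py is a hypothetical filename):

import os

# one env var, successive tiers of rewrites; tier 2 is the new default
OPT = int(os.getenv("OPT", "2"))
MERGE_ELEMENTWISE_OPS = OPT >= 2  # on by default after this commit
SHUFFLE_MOVEMENT_OPS = OPT >= 3   # still opt-in
SHUFFLE_PAD_OPS = OPT >= 4        # opt-in: padding can turn 0/0 into NaN
print(f"OPT={OPT} merge={MERGE_ELEMENTWISE_OPS} shuffle={SHUFFLE_MOVEMENT_OPS} pad={SHUFFLE_PAD_OPS}")

Running it as OPT=3 python flags_demo.py flips the next tier on, the same way you would raise the level for tinygrad itself.
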
@@ -251,7 +251,7 @@ class LazyBuffer:
     # some permutes are actually just reshapes
     if op == MovementOps.PERMUTE and ShapeTracker(x.shape).movement_op(op, arg).contiguous: return x.movement_op(MovementOps.RESHAPE, tuple(x.shape[i] for i in arg))
-    if (SHUFFLE_MOVEMENT_OPS or (SHUFFLE_RESHAPE_OPS and op == MovementOps.RESHAPE)) and x.optype == BinaryOps and x.realized is None and (SHUFFLE_PAD_OPS or op != MovementOps.PAD) and op != MovementOps.STRIDED:
+    if SHUFFLE_MOVEMENT_OPS and x.optype == BinaryOps and x.realized is None and (SHUFFLE_PAD_OPS or op != MovementOps.PAD) and op != MovementOps.STRIDED:
       # if this MovementOp is being applied to a BinaryOp, apply the MovementOp to all the BinaryOp inputs instead
       def replace_with_movement_op(y:Union[LazyOp, LazyBuffer]) -> LazyBuffer:
         if isinstance(y, LazyBuffer): return y.movement_op(op, arg)
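
The hunk above is the consumer of the deleted flag: with SHUFFLE_RESHAPE_OPS gone, the guard simplifies to SHUFFLE_MOVEMENT_OPS alone. The rewrite it guards is sound for elementwise BinaryOps because applying a movement op to the output is the same as applying it to each input first; a quick check of that identity (numpy used purely for illustration):

import numpy as np

a = np.arange(6.0).reshape(2, 3)
b = np.full((2, 3), 2.0)

move_after = (a + b).reshape(3, 2)               # movement op on the BinaryOp output
move_before = a.reshape(3, 2) + b.reshape(3, 2)  # movement op pushed onto the inputs
assert np.array_equal(move_after, move_before)

PAD is the documented exception: it fills with zeros, and for a division 0/0 is NaN, which is why SHUFFLE_PAD_OPS stays behind OPT>=4.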


@@ -9,6 +9,10 @@ class Optimizer:
     for param in self.params:
       param.grad = None
 
+  def realize(self, extra=[]):
+    # TODO: corealize
+    for p in self.params + extra: p.realize()
+
 class SGD(Optimizer):
   def __init__(self, params, lr=0.001):
     super().__init__(params)
@@ -17,6 +21,7 @@ class SGD(Optimizer):
   def step(self):
     for t in self.params:
       t -= t.grad * self.lr
+    self.realize()
 
 class RMSprop(Optimizer):
   def __init__(self, params, lr=0.001, decay=0.9, eps=1e-8):
@@ -29,6 +34,7 @@ class RMSprop(Optimizer):
     for i, t in enumerate(self.params):
       self.v[i] = self.decay * self.v[i] + (1.0 - self.decay) * (t.grad * t.grad)
       t -= (t.grad * self.lr).div(self.v[i].sqrt() + self.eps)
+    self.realize(self.v)
 
 class Adam(Optimizer):
   def __init__(self, params, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
@@ -45,3 +51,4 @@ class Adam(Optimizer):
       self.m[i] = self.b1 * self.m[i] + (1.0 - self.b1) * t.grad
       self.v[i] = self.b2 * self.v[i] + (1.0 - self.b2) * (t.grad * t.grad)
       t -= a * self.m[i].div(self.v[i].sqrt() + self.eps)
+    self.realize(self.m + self.v)
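
All three optimizers now funnel through the shared realize() helper, passing their state buffers in via extra so the momentum and variance tensors get forced together with the params. A minimal, self-contained mock of the pattern; Buf is a stand-in for a lazy tensor, not tinygrad's API:

class Buf:
  def __init__(self): self.realized = None
  def realize(self):
    self.realized = True  # stand-in for executing the lazy graph
    return self

class Optimizer:
  def __init__(self, params): self.params = params
  def realize(self, extra=[]):
    # still one realize per buffer; the upstream TODO is to "corealize" them in one pass
    for p in self.params + extra: p.realize()

class RMSprop(Optimizer):
  def __init__(self, params):
    super().__init__(params)
    self.v = [Buf() for _ in params]  # running average of squared grads
  def step(self):
    # ...parameter update math elided...
    self.realize(self.v)  # params plus optimizer state, all forced here

opt = RMSprop([Buf(), Buf()])
opt.step()
assert all(b.realized for b in opt.params + opt.v)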