From 2bb6b95e9f6eb11f43fe34e35db6a6e2c12910d3 Mon Sep 17 00:00:00 2001 From: ignaciosica Date: Sat, 12 Oct 2024 01:46:22 -0300 Subject: [PATCH 01/17] refactor _make_hip_code_for_op into pm rules (#7001) --- tinygrad/renderer/cstyle.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index d57deae1d1..35ca86d007 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -369,16 +369,6 @@ code_for_op_hip = { UnaryOps.SQRT: lambda x,dtype: f"__ocml_sqrt_f{ {dtypes.half # TODO: MAX with int uses fmax_f32? BinaryOps.MAX: lambda a,b,dtype: f"__ocml_fmax_f{ {dtypes.half:16, dtypes.double:64}.get(dtype, 32) }({a},{b})",} -def _make_hip_code_for_op(): - def wrapper(key, func): - def cast_bf16(*args): - if args[-1] == dtypes.bfloat16: - operands = tuple(f"(float)({arg})" for arg in (args[1:-1] if key is TernaryOps.WHERE else args[:-1])) - return f"(hip_bfloat16)({func(*(((args[0],) if key is TernaryOps.WHERE else ()) + operands), dtypes.float)})" - return func(*args) - return cast_bf16 - return { k:wrapper(k,v) for k,v in {**CStyleLanguage.code_for_op, **code_for_op_hip}.items() } - class AMDRenderer(CStyleLanguage): device = "AMD" shared_max = 65536 @@ -397,13 +387,18 @@ class AMDRenderer(CStyleLanguage): kernel_prefix += '\nextern "C" __attribute__((global))' code_for_workitem = {"g": lambda x: f"__ockl_get_group_id({x})", "l": lambda x: f"__ockl_get_local_id({x})", "i": lambda x: f"(__ockl_get_group_id({x})*__ockl_get_local_size({x})+__ockl_get_local_id({x}))"} - code_for_op = _make_hip_code_for_op() + code_for_op = { **CStyleLanguage.code_for_op, **code_for_op_hip } smem_prefix = "__attribute__((shared))" barrier = '__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");' + '__builtin_amdgcn_s_barrier();' + \ '__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");' float4 = "make_float4" uses_ptr_arithmetic = False # NOTE: this fixes TestLinearizerOverflowAlt type_map = {dtypes.bfloat16: "hip_bfloat16"} + extra_matcher = PatternMatcher([ + (UPat(UOps.ALU, arg=TernaryOps.WHERE, src=(UPat.var("b"), UPat.var("x", dtype=dtypes.bfloat16), UPat.var("y", dtype=dtypes.bfloat16))), + lambda b,x,y: UOp(UOps.ALU, arg=TernaryOps.WHERE, dtype=dtypes.float, src=(b,x.cast(dtypes.float),y.cast(dtypes.float))).cast(dtypes.bfloat16)), + *[(UPat(UOps.ALU, dtype=dtypes.bfloat16, name="x"), + lambda x: (UOp(x.op, dtypes.float, tuple(vv.cast(dtypes.float) for vv in x.src), x.arg).cast(dtypes.bfloat16)))]]) + extra_pm def render_vector_prefix(self, dtype:DType) -> str: vec, scal = self.render_dtype(dtype), self.render_dtype(dtype.scalar()) From f79e05cac0be3a3f0d37e4c072a070283e62cb9a Mon Sep 17 00:00:00 2001 From: Bhavya Gada Date: Fri, 11 Oct 2024 23:42:14 -0700 Subject: [PATCH 02/17] add types in all nn/init.py classes (#7002) * add types in batchnorm class * fix lint error in batchnorm types * add types to conv1d function * add types to convtranspose1d func and conv2d, convtranspose2d classes * add types to all remaining classes * change conv1d padding type to also accept str * less is more; only keep non-obvious types * mkdocs need types --- tinygrad/nn/__init__.py | 52 ++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/tinygrad/nn/__init__.py b/tinygrad/nn/__init__.py index bd7c7eb37b..6eff505072 100644 --- a/tinygrad/nn/__init__.py +++ b/tinygrad/nn/__init__.py @@ -1,5 +1,6 @@ +from __future__ import annotations import math -from typing import 
Optional, Union, Tuple +from typing import Optional, Union, Tuple, List from tinygrad.tensor import Tensor from tinygrad.helpers import prod, make_pair from tinygrad.nn import optim, state, datasets # noqa: F401 @@ -39,7 +40,7 @@ class BatchNorm: if track_running_stats: self.running_mean, self.running_var = Tensor.zeros(sz, requires_grad=False), Tensor.ones(sz, requires_grad=False) def calc_stats(self, x:Tensor) -> Tuple[Tensor, Tensor]: - shape_mask = [1, -1, *([1]*(x.ndim-2))] + shape_mask: List[int] = [1, -1, *([1]*(x.ndim-2))] if self.track_running_stats and not Tensor.training: return self.running_mean, self.running_var.reshape(shape=shape_mask).expand(x.shape) # This requires two full memory accesses to x # https://github.com/pytorch/pytorch/blob/c618dc13d2aa23625cb0d7ada694137532a4fa33/aten/src/ATen/native/cuda/Normalization.cuh @@ -49,7 +50,7 @@ class BatchNorm: batch_var = (y*y).mean(axis=reduce_axes) return batch_mean, batch_var - def __call__(self, x:Tensor): + def __call__(self, x:Tensor) -> Tensor: batch_mean, batch_var = self.calc_stats(x) # NOTE: wow, this is done all throughout training in most PyTorch models if self.track_running_stats and Tensor.training: @@ -59,7 +60,7 @@ class BatchNorm: return x.batchnorm(self.weight, self.bias, batch_mean, batch_var.add(self.eps).rsqrt()) BatchNorm2d = BatchNorm3d = BatchNorm -def Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True): +def Conv1d(in_channels:int, out_channels:int, kernel_size:int, stride=1, padding:Union[int, str]=0, dilation=1, groups=1, bias=True) -> Conv2d: """ Applies a 1D convolution over an input signal composed of several input planes. @@ -93,22 +94,24 @@ class Conv2d: print(t.numpy()) ``` """ - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True): - self.kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size) + def __init__(self, in_channels:int, out_channels:int, kernel_size:Union[int, Tuple[int, ...]], stride=1, padding:Union[int, str]=0, + dilation=1, groups=1, bias=True): + self.kernel_size: Tuple[int, ...] 
= (kernel_size, kernel_size) if isinstance(kernel_size, int) else tuple(kernel_size) if isinstance(padding, str): if padding.lower() != 'same': raise ValueError(f"Invalid padding string {padding!r}, only 'same' is supported") if stride != 1: raise ValueError("padding='same' is not supported for strided convolutions") - self.padding = [p for d,k in zip(make_pair(dilation,len(self.kernel_size)), self.kernel_size[::-1]) for p in (d*(k-1)//2, d*(k-1) - d*(k-1)//2)] + self.padding: Union[int, List[int]] = [p for d,k in zip(make_pair(dilation,len(self.kernel_size)), self.kernel_size[::-1]) for p in (d*(k-1)//2, d*(k-1) - d*(k-1)//2)] #noqa:E501 else: self.padding = padding self.stride, self.dilation, self.groups = stride, dilation, groups scale = 1 / math.sqrt(in_channels * prod(self.kernel_size)) self.weight = Tensor.uniform(out_channels, in_channels//groups, *self.kernel_size, low=-scale, high=scale) - self.bias = Tensor.uniform(out_channels, low=-scale, high=scale) if bias else None + self.bias: Optional[Tensor] = Tensor.uniform(out_channels, low=-scale, high=scale) if bias else None - def __call__(self, x:Tensor): + def __call__(self, x:Tensor) -> Tensor: return x.conv2d(self.weight, self.bias, padding=self.padding, stride=self.stride, dilation=self.dilation, groups=self.groups) -def ConvTranspose1d(in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, dilation=1, groups=1, bias=True): +def ConvTranspose1d(in_channels:int, out_channels:int, kernel_size:int, stride=1, padding=0, output_padding=0, dilation=1, + groups=1, bias=True) -> ConvTranspose2d: """ Applies a 1D transposed convolution operator over an input signal composed of several input planes. @@ -142,13 +145,14 @@ class ConvTranspose2d(Conv2d): print(t.numpy()) ``` """ - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, dilation=1, groups=1, bias=True): + def __init__(self, in_channels:int, out_channels:int, kernel_size:Union[int, Tuple[int, ...]], stride=1, padding=0, output_padding=0, + dilation=1, groups=1, bias=True): super().__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias) scale = 1 / math.sqrt(in_channels * prod(self.kernel_size)) self.weight = Tensor.uniform(in_channels, out_channels//groups, *self.kernel_size, low=-scale, high=scale) self.output_padding = output_padding - def __call__(self, x:Tensor): + def __call__(self, x:Tensor) -> Tensor: return x.conv_transpose2d(self.weight, self.bias, padding=self.padding, output_padding=self.output_padding, stride=self.stride, dilation=self.dilation, groups=self.groups) @@ -168,12 +172,12 @@ class Linear: print(t.numpy()) ``` """ - def __init__(self, in_features, out_features, bias=True): + def __init__(self, in_features:int, out_features:int, bias=True): bound = 1 / math.sqrt(in_features) self.weight = Tensor.uniform(out_features, in_features, low=-bound, high=bound) self.bias = Tensor.uniform(out_features, low=-bound, high=bound) if bias else None - def __call__(self, x:Tensor): + def __call__(self, x:Tensor) -> Tensor: return x.linear(self.weight.transpose(), self.bias) class GroupNorm: @@ -193,12 +197,12 @@ class GroupNorm: print(t.mean().item(), t.std().item()) ``` """ - def __init__(self, num_groups:int, num_channels:int, eps:float=1e-5, affine:bool=True): + def __init__(self, num_groups:int, num_channels:int, eps=1e-5, affine=True): self.num_groups, self.num_channels, self.eps = num_groups, num_channels, eps self.weight: Optional[Tensor] = Tensor.ones(num_channels) 
if affine else None self.bias: Optional[Tensor] = Tensor.zeros(num_channels) if affine else None - def __call__(self, x:Tensor): + def __call__(self, x:Tensor) -> Tensor: # reshape for layernorm to work as group norm # subtract mean and divide stddev x = x.reshape(x.shape[0], self.num_groups, -1).layernorm(eps=self.eps).reshape(x.shape) @@ -224,12 +228,12 @@ class InstanceNorm: print(t.mean().item(), t.std().item()) ``` """ - def __init__(self, num_features:int, eps:float=1e-5, affine:bool=True): + def __init__(self, num_features:int, eps=1e-5, affine=True): self.num_features, self.eps = num_features, eps self.weight: Optional[Tensor] = Tensor.ones(num_features) if affine else None self.bias: Optional[Tensor] = Tensor.zeros(num_features) if affine else None - def __call__(self, x:Tensor): + def __call__(self, x:Tensor) -> Tensor: x = x.reshape(x.shape[0], self.num_features, -1).layernorm(eps=self.eps).reshape(x.shape) if self.weight is None or self.bias is None: return x return x * self.weight.reshape(1, -1, *[1] * (len(x.shape)-2)) + self.bias.reshape(1, -1, *[1] * (len(x.shape)-2)) @@ -251,12 +255,12 @@ class LayerNorm: print(t.mean().item(), t.std().item()) ``` """ - def __init__(self, normalized_shape:Union[int, Tuple[int, ...]], eps:float=1e-5, elementwise_affine:bool=True): - self.normalized_shape = (normalized_shape,) if isinstance(normalized_shape, int) else tuple(normalized_shape) + def __init__(self, normalized_shape:Union[int, Tuple[int, ...]], eps=1e-5, elementwise_affine=True): + self.normalized_shape: Tuple[int, ...] = (normalized_shape,) if isinstance(normalized_shape, int) else tuple(normalized_shape) self.axis, self.eps, self.elementwise_affine = tuple(-1-i for i in range(len(self.normalized_shape))), eps, elementwise_affine self.weight, self.bias = (Tensor.ones(*self.normalized_shape), Tensor.zeros(*self.normalized_shape)) if elementwise_affine else (None, None) - def __call__(self, x:Tensor): + def __call__(self, x:Tensor) -> Tensor: assert self.normalized_shape == x.shape[-len(self.normalized_shape):], f"last dimensions of {x.shape} must match {self.normalized_shape}" x = x.layernorm(eps=self.eps, axis=self.axis) if not self.elementwise_affine: return x @@ -278,7 +282,7 @@ class LayerNorm2d(LayerNorm): print(t.mean().item(), t.std().item()) ``` """ - def __call__(self, x): return super().__call__(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + def __call__(self, x: Tensor) -> Tensor: return super().__call__(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) class RMSNorm: """ @@ -296,9 +300,9 @@ class RMSNorm: print(norm(t).numpy()) ``` """ - def __init__(self, dim, eps=1e-6): self.eps, self.weight = eps, Tensor.ones(dim) + def __init__(self, dim:int, eps=1e-6): self.eps, self.weight = eps, Tensor.ones(dim) - def _norm(self, x:Tensor): return x * (x.square().mean(-1, keepdim=True) + self.eps).rsqrt() + def _norm(self, x:Tensor) -> Tensor: return x * (x.square().mean(-1, keepdim=True) + self.eps).rsqrt() def __call__(self, x:Tensor) -> Tensor: return self._norm(x.float()).cast(x.dtype) * self.weight From 5ae2de9845e4cd5c1a05cfb66e812d83a7dd88d7 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Sat, 12 Oct 2024 18:20:44 +0800 Subject: [PATCH 03/17] UOp.variable (#7010) * UOp.variable [pr] * fix tests * clean * improve name rendering * last bug --- extra/optimization/helpers.py | 3 ++- extra/optimization/test_beam_search.py | 3 +-- test/external/fuzz_symbolic.py | 3 ++- test/models/test_real_world.py | 3 +-- test/test_sample.py | 3 +-- 
test/test_symbolic_jit.py | 4 +-- test/test_symbolic_ops.py | 2 +- test/test_symbolic_shapetracker.py | 3 ++- test/test_uops.py | 3 +-- test/unit/test_helpers.py | 3 ++- test/unit/test_shapetracker.py | 3 ++- test/unit/test_shapetracker_math.py | 2 +- test/unit/test_uop_symbolic.py | 2 +- tinygrad/__init__.py | 3 ++- tinygrad/codegen/kernel.py | 3 ++- tinygrad/ops.py | 35 ++++++++++++++++++-------- tinygrad/renderer/__init__.py | 2 +- tinygrad/shape/symbolic.py | 19 +++----------- tinygrad/shape/view.py | 2 +- 19 files changed, 52 insertions(+), 49 deletions(-) diff --git a/extra/optimization/helpers.py b/extra/optimization/helpers.py index 56e66f274f..66a6409411 100644 --- a/extra/optimization/helpers.py +++ b/extra/optimization/helpers.py @@ -1,12 +1,13 @@ # stuff needed to unpack a kernel from typing import Tuple from extra.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, BufferOps, MemBuffer, ConstBuffer, MetaOps +from tinygrad import Variable from tinygrad.codegen.kernel import Opt, OptOps from tinygrad.ops import UOp, UOps, KernelInfo from tinygrad.dtype import dtypes, PtrDType from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View -from tinygrad.shape.symbolic import Variable, NumNode +from tinygrad.shape.symbolic import NumNode inf, nan = float('inf'), float('nan') # kernel unpacker diff --git a/extra/optimization/test_beam_search.py b/extra/optimization/test_beam_search.py index e43502863e..c37e8e5785 100644 --- a/extra/optimization/test_beam_search.py +++ b/extra/optimization/test_beam_search.py @@ -2,8 +2,7 @@ import unittest import numpy as np from tinygrad.helpers import BEAM, Timing, CI -from tinygrad.shape.symbolic import Variable -from tinygrad.tensor import Tensor +from tinygrad import Variable, Tensor from tinygrad.nn import Conv2d def rand(*shape): diff --git a/test/external/fuzz_symbolic.py b/test/external/fuzz_symbolic.py index 79eb52f978..a3c505f93f 100644 --- a/test/external/fuzz_symbolic.py +++ b/test/external/fuzz_symbolic.py @@ -1,7 +1,8 @@ import itertools import random +from tinygrad import Variable from tinygrad.helpers import DEBUG -from tinygrad.shape.symbolic import Variable, NumNode +from tinygrad.shape.symbolic import NumNode random.seed(42) def add_v(expr, rng=None): diff --git a/test/models/test_real_world.py b/test/models/test_real_world.py index fdf80a76d0..9b543f0def 100644 --- a/test/models/test_real_world.py +++ b/test/models/test_real_world.py @@ -3,9 +3,8 @@ import numpy as np from tinygrad.nn import optim from tinygrad.nn.state import get_parameters from tinygrad.engine.jit import TinyJit -from tinygrad import Tensor, Device, GlobalCounters, dtypes +from tinygrad import Tensor, Device, GlobalCounters, dtypes, Variable from tinygrad.helpers import CI, Context -from tinygrad.shape.symbolic import Variable from extra.lr_scheduler import OneCycleLR from test.helpers import derandomize_model, is_dtype_supported diff --git a/test/test_sample.py b/test/test_sample.py index 3bb4cf76f6..9af7557ca1 100644 --- a/test/test_sample.py +++ b/test/test_sample.py @@ -1,7 +1,6 @@ import unittest import numpy as np -from tinygrad.tensor import Tensor -from tinygrad.shape.symbolic import Variable +from tinygrad import Tensor, Variable class TestSample(unittest.TestCase): def test_sample(self): diff --git a/test/test_symbolic_jit.py b/test/test_symbolic_jit.py index e4e876907e..05608180aa 100644 --- a/test/test_symbolic_jit.py +++ b/test/test_symbolic_jit.py @@ -1,9 +1,7 @@ import unittest from test.helpers import 
assert_jit_cache_len -from tinygrad.engine.jit import TinyJit -from tinygrad.shape.symbolic import Variable -from tinygrad.tensor import Tensor +from tinygrad import Variable, Tensor, TinyJit import numpy as np class TestSymbolicJit(unittest.TestCase): diff --git a/test/test_symbolic_ops.py b/test/test_symbolic_ops.py index 8142c5137d..ada84bd90a 100644 --- a/test/test_symbolic_ops.py +++ b/test/test_symbolic_ops.py @@ -1,5 +1,5 @@ import unittest -from tinygrad.shape.symbolic import Variable +from tinygrad import Variable from tinygrad.helpers import getenv from tinygrad.tensor import Tensor from examples.gpt2 import Attention diff --git a/test/test_symbolic_shapetracker.py b/test/test_symbolic_shapetracker.py index 17f644f2f0..f89583b40d 100644 --- a/test/test_symbolic_shapetracker.py +++ b/test/test_symbolic_shapetracker.py @@ -1,6 +1,7 @@ import unittest from tinygrad.shape.shapetracker import ShapeTracker, View -from tinygrad.shape.symbolic import Variable, NumNode +from tinygrad import Variable +from tinygrad.shape.symbolic import NumNode from tinygrad.tensor import Tensor class TestSymbolic(unittest.TestCase): diff --git a/test/test_uops.py b/test/test_uops.py index 870c6cc917..0ddc026606 100644 --- a/test/test_uops.py +++ b/test/test_uops.py @@ -12,7 +12,6 @@ from tinygrad.engine.schedule import create_schedule, reduceop_fusor from tinygrad.engine.realize import CompiledRunner, lower_schedule_item, get_kernel from tinygrad.codegen.linearize import linearize_uop from tinygrad.codegen.uopgraph import full_graph_rewrite, sym -from tinygrad.shape.symbolic import Variable from test.helpers import is_dtype_supported, assert_equiv_uops def to_uops_list(u:List[UOp], opts=None, skip_check=False) -> List[UOp]: return linearize_uop(full_graph_rewrite(UOp.sink(*u), opts), skip_check) @@ -360,7 +359,7 @@ class TestUOpMethod(unittest.TestCase): assert (add < mul) or (mul < add), "add and mul with same src should have an order" def test_uop_variables(self): - a = Variable("a", 1, 10) + a = UOp.variable("a", 1, 10) uop_var = UOp.const(dtypes.int, a) st_var = UOp(UOps.LOAD, dtypes.float, (UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.float), (), 0), ShapeTracker.from_shape((2, a)).to_uop())) diff --git a/test/unit/test_helpers.py b/test/unit/test_helpers.py index 77580491fc..c8f4fa2b77 100644 --- a/test/unit/test_helpers.py +++ b/test/unit/test_helpers.py @@ -1,10 +1,11 @@ import gzip, unittest from PIL import Image +from tinygrad import Variable from tinygrad.helpers import Context, ContextVar from tinygrad.helpers import merge_dicts, strip_parens, prod, round_up, fetch, fully_flatten, from_mv, to_mv from tinygrad.tensor import get_shape from tinygrad.codegen.lowerer import get_contraction -from tinygrad.shape.symbolic import Variable, NumNode +from tinygrad.shape.symbolic import NumNode import numpy as np VARIABLE = ContextVar("VARIABLE", 0) diff --git a/test/unit/test_shapetracker.py b/test/unit/test_shapetracker.py index f55dab829b..d7082543b2 100644 --- a/test/unit/test_shapetracker.py +++ b/test/unit/test_shapetracker.py @@ -4,7 +4,8 @@ import numpy as np from tinygrad.dtype import dtypes from tinygrad.helpers import prod from tinygrad.shape.shapetracker import ShapeTracker, View -from tinygrad.shape.symbolic import Variable, NumNode +from tinygrad import Variable +from tinygrad.shape.symbolic import NumNode from tinygrad.ops import UOp, UOps, graph_rewrite from tinygrad.codegen.uopgraph import sym from itertools import product diff --git a/test/unit/test_shapetracker_math.py 
b/test/unit/test_shapetracker_math.py index cf9a7561a4..96af3d8288 100644 --- a/test/unit/test_shapetracker_math.py +++ b/test/unit/test_shapetracker_math.py @@ -3,7 +3,7 @@ from typing import List from tinygrad.helpers import prod from tinygrad.shape.view import View from tinygrad.shape.shapetracker import ShapeTracker -from tinygrad.shape.symbolic import Variable +from tinygrad import Variable from test.unit.test_shapetracker import shapetracker_getitem class MultiShapeTracker: diff --git a/test/unit/test_uop_symbolic.py b/test/unit/test_uop_symbolic.py index a95d17701d..f280b040b1 100644 --- a/test/unit/test_uop_symbolic.py +++ b/test/unit/test_uop_symbolic.py @@ -12,7 +12,7 @@ from tinygrad.dtype import dtypes, PtrDType, ConstType from tinygrad.codegen.linearize import linearize_uop from tinygrad.codegen.uopgraph import full_graph_rewrite from tinygrad.ops import BinaryOps, UOp, UOps, print_uops -from tinygrad.shape.symbolic import Variable +from tinygrad import Variable import functools def render(self) -> Tuple[str, ConstType, ConstType]: diff --git a/tinygrad/__init__.py b/tinygrad/__init__.py index 841215def6..947d03c9ed 100644 --- a/tinygrad/__init__.py +++ b/tinygrad/__init__.py @@ -4,7 +4,8 @@ if int(os.getenv("TYPED", "0")): install_import_hook(__name__) from tinygrad.tensor import Tensor # noqa: F401 from tinygrad.engine.jit import TinyJit # noqa: F401 -from tinygrad.shape.symbolic import Variable # noqa: F401 +from tinygrad.ops import UOp +Variable = UOp.variable from tinygrad.dtype import dtypes # noqa: F401 from tinygrad.helpers import GlobalCounters, fetch, Context, getenv # noqa: F401 from tinygrad.device import Device # noqa: F401 diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 6a6c4b3dee..2eb5f47332 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -603,7 +603,8 @@ class Kernel: # kernel name (before late upcast) name = ("r" if self.reduceop is not None else ("C" if all(x.op in BUFFER_UOPS for x in self.ast.parents) else "E")) + \ (f"{len(self.ast.src)}_" if len(self.ast.src) > 1 else "_") + \ - colored('_', 'BLACK').join([colored(str(x), c) for x,c in zip(self.full_shape, self.colors())]) + colored('_', 'BLACK').join([colored(str(x.render() if isinstance(x, UOp) else x), c) \ + for x,c in zip(self.full_shape, self.colors())]) # name the function something unique Kernel.kernel_cnt[(function_name := to_function_name(name))] += 1 diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 8ed9838bd5..837b759c0d 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -185,6 +185,7 @@ class UOp(MathTrait): __slots__ = ["op", "dtype", "src", "arg"] def __init__(self, op: UOps, dtype:DType=dtypes.void, src: Tuple[UOp,...]=tuple(), arg:Any=None): # TODO: instant check rules here make debugging easier + #assert op in UOps and isinstance(dtype, DType), f"bad UOp creation with {op} {dtype}" #if op is UOps.ALU and arg is BinaryOps.CMPNE: assert dtype.scalar() == dtypes.bool #if op is UOps.VECTORIZE and dtype != dtypes.void: assert len(src) == dtype.count, f"{len(src)} invalid for {dtype}" #if op is UOps.ALU and arg not in (BinaryOps.CMPNE, BinaryOps.CMPLT, TernaryOps.WHERE): assert all_same([dtype] + [x.dtype for x in src]) @@ -276,14 +277,35 @@ class UOp(MathTrait): if isinstance(b, UOp): return b.unbind()[0] if b.op is UOps.BIND else b if isinstance(b, tuple) and all_same(b): b = b[0] # doesn't have to be a VCONST if they are all the same return UOp(UOps.VCONST if isinstance(b, tuple) else UOps.CONST, dtype, arg=dtypes.as_const(b, 
dtype) if dtype is not None else b) # type: ignore + + # *** Variable stuff *** + + @staticmethod + def variable(name:str, min_val:ConstType, max_val:ConstType): return UOp(UOps.DEFINE_VAR, dtypes.int, arg=(name, min_val, max_val)) @staticmethod def define_var(name:str, dtype:DType, min_val:ConstType, max_val:ConstType): return UOp(UOps.DEFINE_VAR, dtype, arg=(name, min_val, max_val)) + @property + def expr(self): + assert self.op is UOps.DEFINE_VAR, f"op is {self.op}, need DEFINE_VAR" + return self.arg[0] + def bind(self, val:int): + assert self.op is UOps.DEFINE_VAR, f"op is {self.op}, need DEFINE_VAR" + assert self.arg[1] <= val and val <= self.arg[2], f"bind {val} not in range {self.arg[1]}-{self.arg[2]}" + return UOp(UOps.BIND, self.dtype, (self, self.const_like(val))) def unbind(self) -> Tuple[Variable, int]: assert self.op is UOps.BIND and self.src[0].op is UOps.DEFINE_VAR and self.src[1].op is UOps.CONST, f"can't unbind {self}" - from tinygrad.shape.symbolic import Variable - return cast(Variable, self.src[0]), self.src[1].arg + return self.src[0], self.src[1].arg @property def val(self) -> int: return self.unbind()[1] + def vars(self) -> Set[UOp]: + bound_vars = set([x for x in self.sparents if x.op is UOps.BIND and x.src[0].op is UOps.DEFINE_VAR]) + bound_var_base = set(x.src[0] for x in bound_vars) + all_vars = set([x for x in self.sparents if x.op is UOps.DEFINE_VAR]) + return bound_vars.union(set([x for x in all_vars if x not in bound_var_base])) + def variables(self) -> List[Variable]: + st_vars: List[Set[Variable]] = [x.st_arg.vars() for x in self.sparents if x.op in BUFFER_UOPS] + return sorted(set.union(*st_vars, [x.unbind()[0] if x.op is not UOps.DEFINE_VAR else x for x in self.vars()]), key=lambda v: v.arg) + # TODO: this is context rewrite def substitute(self, dvars:Dict[UOp, UOp]): if self in dvars: return dvars[self] @@ -299,15 +321,6 @@ class UOp(MathTrait): @functools.cached_property def full_shape(self) -> Tuple[sint, ...]: return self.arg.shape if self.op is UOps.VIEW else tuple(smax(x) for x in zip(*[x.full_shape for x in self.src if x.has_st])) - def vars(self) -> Set[UOp]: - bound_vars = set([x for x in self.sparents if x.op is UOps.BIND and x.src[0].op is UOps.DEFINE_VAR]) - bound_var_base = set(x.src[0] for x in bound_vars) - all_vars = set([x for x in self.sparents if x.op is UOps.DEFINE_VAR]) - return bound_vars.union(set([x for x in all_vars if x not in bound_var_base])) - def variables(self) -> List[Variable]: - st_vars: List[Set[Variable]] = [x.st_arg.vars() for x in self.sparents if x.op in BUFFER_UOPS] - from tinygrad.shape.symbolic import Variable - return sorted(set.union(*st_vars, [x.unbind()[0] if not isinstance(x, Variable) else x for x in self.vars()]), key=lambda v: v.arg) def const_factor(self) -> int: """largest known int that divides self""" if self.op is UOps.CONST: return self.arg diff --git a/tinygrad/renderer/__init__.py b/tinygrad/renderer/__init__.py index 1a8006c166..985b160452 100644 --- a/tinygrad/renderer/__init__.py +++ b/tinygrad/renderer/__init__.py @@ -43,7 +43,7 @@ class Program: if not self._ran_post_init and self.uops is not None: # single pass through the uops for u in self.uops: - if u.op is UOps.DEFINE_VAR: self.vars.append(Variable(u.arg[0], u.arg[1], u.arg[2])) + if u.op is UOps.DEFINE_VAR: self.vars.append(u) if u.op is UOps.DEFINE_GLOBAL: self.globals.append(u.arg) if u.op is UOps.STORE: self.outs.extend([x.arg for x in u.src[0].sparents if x.op is UOps.DEFINE_GLOBAL]) if u.op is UOps.SPECIAL: diff --git 
a/tinygrad/shape/symbolic.py b/tinygrad/shape/symbolic.py index 605e214194..c24dade84a 100644 --- a/tinygrad/shape/symbolic.py +++ b/tinygrad/shape/symbolic.py @@ -1,28 +1,17 @@ from __future__ import annotations -from typing import Union, Optional, Dict, cast +from typing import Union, Optional, Dict from tinygrad.dtype import dtypes -from tinygrad.ops import UOp, UOps, exec_alu, ConstType +from tinygrad.ops import UOp, UOps, exec_alu sint = Union[int, UOp] def NumNode(val:int): return UOp.const(dtypes.int, val) -class Variable(UOp): - def __reduce__(self): return Variable, self.arg - def __new__(cls, expr:str, nmin:ConstType, nmax:ConstType): # pylint: disable=signature-differs - return super().__new__(cls, UOps.DEFINE_VAR, dtypes.int, arg=(expr, nmin, nmax)) - def __init__(self, expr:str, nmin:ConstType, nmax:ConstType): - super().__init__(UOps.DEFINE_VAR, dtypes.int, arg=(expr, nmin, nmax)) - def bind(self, val:int): - assert self.op is UOps.DEFINE_VAR, f"op is {self.op}" - assert self.arg[1] <= val and val <= self.arg[2], f"bind {val} not in range {self.arg[1]}-{self.arg[2]}" - return UOp(UOps.BIND, self.dtype, (self, self.const_like(val))) - @property - def expr(self): return self.arg[0] +Variable = UOp def sym_infer(uop: Union[UOp, int], var_vals: Optional[Dict[Variable, int]]) -> int: if isinstance(uop, (int, float)): return uop # TODO: ugh, the float is a hack for qcom if uop.op == UOps.CONST: return uop.arg - if uop.op == UOps.DEFINE_VAR and var_vals is not None: return var_vals[cast(Variable, uop)] + if uop.op == UOps.DEFINE_VAR and var_vals is not None: return var_vals[uop] if uop.op == UOps.BIND: return uop.src[1].arg # bound variable returns bound value if uop.op == UOps.ALU: src_values = [sym_infer(src, var_vals) for src in uop.src] diff --git a/tinygrad/shape/view.py b/tinygrad/shape/view.py index 55dc5a8a98..719703859b 100644 --- a/tinygrad/shape/view.py +++ b/tinygrad/shape/view.py @@ -164,7 +164,7 @@ class View: # Merge dimensions in vm2 if required. # NB: Merging too many dimensions can make it difficult to project vm2's mask, hence only combining when required. 
- idxs: List[UOp] = [Variable(f"idx{i}", 0, s-1) for i,s in enumerate(vm1.shape)] + idxs: List[UOp] = [UOp.variable(f"idx{i}", 0, s-1) for i,s in enumerate(vm1.shape)] merged_size, merged_term = 1, NumNode(0) extents: List[Tuple[sint, UOp]] = [] for term, s, o in zip(reversed(terms), reversed(vm2.shape), reversed(origin)): From b737ee5bac40fc4711fda7877c115cc28015167e Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Sat, 12 Oct 2024 18:20:57 +0800 Subject: [PATCH 04/17] move to_indexed_uops to uops (#7011) * move to_indexed_uops to uops * UOp.range --- tinygrad/codegen/lowerer.py | 3 ++- tinygrad/ops.py | 5 +++-- tinygrad/shape/shapetracker.py | 22 ++++------------------ tinygrad/shape/view.py | 13 +++++++++++++ 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/tinygrad/codegen/lowerer.py b/tinygrad/codegen/lowerer.py index 3cb804769b..da6915dc5a 100644 --- a/tinygrad/codegen/lowerer.py +++ b/tinygrad/codegen/lowerer.py @@ -3,7 +3,8 @@ from __future__ import annotations import functools, itertools, operator from dataclasses import dataclass from typing import List, Tuple, cast, Optional -from tinygrad.shape.shapetracker import ShapeTracker, variable_to_uop +from tinygrad.shape.shapetracker import ShapeTracker +from tinygrad.shape.view import variable_to_uop from tinygrad.shape.symbolic import sint from tinygrad.dtype import dtypes from tinygrad.ops import KernelInfo, BinaryOps, UOp, UOps, graph_rewrite, PatternMatcher, UPat, resolve diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 837b759c0d..d780b30419 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -311,8 +311,9 @@ class UOp(MathTrait): if self in dvars: return dvars[self] return self.replace(src=tuple(x.substitute(dvars) for x in self.src)) @staticmethod - def range(dtype:DType, start:ConstType, end:ConstType, idx:int): - return UOp(UOps.RANGE, dtype=dtype, src=(UOp.const(dtype, start), UOp.const(dtype, end)), arg=(idx,)) + def range(dtype:DType, start:ConstType|UOp, end:ConstType|UOp, idx:int): + return UOp(UOps.RANGE, dtype=dtype, src=(UOp.const(dtype, start) if not isinstance(start, UOp) else start, + UOp.const(dtype, end) if not isinstance(end, UOp) else end), arg=idx) def reduce(self, op:BinaryOps, *rng:UOp): return UOp(UOps.REDUCE, self.dtype, (self,) + rng, op) @functools.cached_property def parents(self) -> Dict[UOp, None]: return {**{x:None for x in self.src}, **{k:None for x in self.src for k in x.parents}} diff --git a/tinygrad/shape/shapetracker.py b/tinygrad/shape/shapetracker.py index 7bad9f8890..3284858f36 100644 --- a/tinygrad/shape/shapetracker.py +++ b/tinygrad/shape/shapetracker.py @@ -6,18 +6,7 @@ from tinygrad.helpers import merge_dicts, getenv from tinygrad.shape.symbolic import Variable, sint from tinygrad.shape.view import View, strides_for_shape from tinygrad.dtype import dtypes -from tinygrad.ops import UOp, UOps, BinaryOps, graph_rewrite, resolve, _get_chain, symbolic_flat - -def variable_to_uop(x, ctx=None) -> UOp: return UOp.const(dtypes.pyint, x) if isinstance(x, int) else x -def _uop_view(view:View, idxs:List[UOp], vexpr:UOp) -> Tuple[UOp, UOp]: - # TODO: dtypes.realint - iexpr = variable_to_uop(view.offset) - for idx,sh,st,m in zip(idxs, view.shape, view.strides, view.mask if view.mask is not None else [None]*len(view.shape)): - if resolve(sh != 1) and resolve(st != 0): iexpr = iexpr + idx*variable_to_uop(st) - if m is not None: - if resolve(m[0] != 0): vexpr = vexpr * idx.ge(variable_to_uop(m[0])) - if resolve(m[1] != sh): vexpr = vexpr * 
idx.lt(variable_to_uop(m[1])) - return iexpr, vexpr +from tinygrad.ops import UOp, UOps, BinaryOps, graph_rewrite, _get_chain, symbolic_flat @dataclass(frozen=True) class ShapeTracker: @@ -55,17 +44,14 @@ class ShapeTracker: def to_uop(self) -> UOp: return UOp(UOps.VIEW, dtypes.void, (), self) def to_indexed_uops(self, _idxs:Optional[List[UOp]]=None) -> Tuple[UOp, UOp]: - idxs = [UOp(UOps.RANGE, dtypes.pyint, (UOp.const(dtypes.pyint, 0), variable_to_uop(s)), i) for i,s in enumerate(self.shape)] \ - if _idxs is None else _idxs - idx, valid = _uop_view(self.views[-1], idxs, UOp.const(dtypes.bool, True)) + idx, valid = self.views[-1].to_indexed_uops(_idxs) for view in reversed(self.views[0:-1]): view = view.minify() acc, idxs = 1, [] - for _d in reversed(view.shape): - d = variable_to_uop(_d) + for d in reversed(view.shape): idxs.append((idx//acc)%d) acc *= d - idx, valid = _uop_view(view, idxs[::-1], valid) + idx, valid = view.to_indexed_uops(idxs[::-1], valid) return idx, valid def real_size(self) -> int: diff --git a/tinygrad/shape/view.py b/tinygrad/shape/view.py index 719703859b..99fbf7fff0 100644 --- a/tinygrad/shape/view.py +++ b/tinygrad/shape/view.py @@ -2,6 +2,7 @@ from __future__ import annotations import functools, operator, itertools, math from dataclasses import dataclass from typing import Tuple, List, Optional, Dict, Set, cast, Union +from tinygrad.dtype import dtypes from tinygrad.ops import resolve, UOp from tinygrad.helpers import prod, all_int, argsort from tinygrad.shape.symbolic import NumNode, Variable, sint, sym_infer @@ -82,6 +83,8 @@ def un1d(shape:Tuple[sint, ...], offs:sint) -> List[sint]: offs -= here * stride return result +def variable_to_uop(x, ctx=None) -> UOp: return UOp.const(dtypes.pyint, x) if isinstance(x, int) else x + @dataclass(frozen=True) class View: shape:Tuple[sint, ...] @@ -90,6 +93,16 @@ class View: mask:Optional[Tuple[Tuple[sint, sint], ...]] contiguous:bool + def to_indexed_uops(self:View, _idxs:Optional[List[UOp]]=None, vexpr:UOp=UOp.const(dtypes.bool, True)) -> Tuple[UOp, UOp]: + idxs = [UOp.range(dtypes.pyint, 0, s, i) for i,s in enumerate(self.shape)] if _idxs is None else _idxs + iexpr = variable_to_uop(self.offset) + for idx,sh,st,m in zip(idxs, self.shape, self.strides, self.mask if self.mask is not None else [None]*len(self.shape)): + if resolve(sh != 1) and resolve(st != 0): iexpr = iexpr + idx*st + if m is not None: + if resolve(m[0] != 0): vexpr = vexpr * idx.ge(m[0]) + if resolve(m[1] != sh): vexpr = vexpr * idx.lt(m[1]) + return iexpr, vexpr + @functools.lru_cache(maxsize=None) # pylint: disable=method-cache-max-size-none def size(self) -> int: # NOTE: Variable and the Node derived from it in symbolic shapes can only have int as max. 
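A minimal usage sketch of the API after PATCH 03 and PATCH 04, assuming the tree as patched above (UOp.variable, bind/unbind, and ShapeTracker.to_indexed_uops are the names introduced in these diffs; the rendered string in the last comment is illustrative, not an exact output guarantee):

  from tinygrad.ops import UOp
  from tinygrad.shape.shapetracker import ShapeTracker

  # a Variable is now just a DEFINE_VAR UOp (PATCH 03)
  a = UOp.variable("a", 1, 10)
  ab = a.bind(4)                      # BIND node pairing the var with a value in [1, 10]
  base, v = ab.unbind()               # recovers (a, 4); ab.val is shorthand for the 4

  # index/valid expressions are now built from RANGE UOps on the View (PATCH 04)
  st = ShapeTracker.from_shape((4, 8))
  idx, valid = st.to_indexed_uops()   # symbolic index expression and validity mask
  print(idx.render())                 # e.g. "((ridx0*8)+ridx1)"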
From 16271189eae777aad5f1c76f9605657ea33dac78 Mon Sep 17 00:00:00 2001
From: George Hotz
Date: Sat, 12 Oct 2024 18:20:19 +0800
Subject: [PATCH 05/17] hotfix: don't spend lines on a (broken) favicon

---
 tinygrad/viz/favicon.svg | 25 -------------------------
 tinygrad/viz/index.html | 2 +-
 tinygrad/viz/serve.py | 8 +-------
 3 files changed, 2 insertions(+), 33 deletions(-)
 delete mode 100644 tinygrad/viz/favicon.svg

diff --git a/tinygrad/viz/favicon.svg b/tinygrad/viz/favicon.svg
deleted file mode 100644
index 420a33e0f8..0000000000
--- a/tinygrad/viz/favicon.svg
+++ /dev/null
@@ -1,25 +0,0 @@
-[25 lines of SVG markup deleted by this patch; the XML was stripped when the patch was quoted and is not reproduced here]
diff --git a/tinygrad/viz/index.html b/tinygrad/viz/index.html
index 86a6414cd7..657e3bc17d 100644
--- a/tinygrad/viz/index.html
+++ b/tinygrad/viz/index.html
@@ -3,7 +3,7 @@
 [hunk body mostly lost in quoting: only the title text "tinygrad viz" survives; the change swaps one favicon <link> line for a replacement]
diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py
index e74c57a1bc..4ee84f4f74 100755
--- a/tinygrad/viz/serve.py
+++ b/tinygrad/viz/serve.py
@@ -89,13 +89,7 @@ class Handler(BaseHTTPRequestHandler):
   def do_GET(self):
-    if (url:=urlparse(self.path)).path == "/favicon.svg":
-      self.send_response(200)
-      self.send_header("Content-type", "image/svg+xml")
-      self.end_headers()
-      with open(os.path.join(os.path.dirname(__file__), "favicon.svg"), "rb") as f:
-        ret = f.read()
-    if url.path == "/":
+    if (url:=urlparse(self.path)).path == "/":
       self.send_response(200)
       self.send_header("Content-type", "text/html")
       self.end_headers()

From a71bb09ec38d2dd70b87deb807b6dcc3026024e7 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Sat, 12 Oct 2024 18:44:44 +0800
Subject: [PATCH 06/17] remove symbolic file [pr] (#7012)

---
 examples/handcode_opt.py | 3 +--
 extra/assembly/assembly.py | 2 +-
 extra/backends/hsa_graph.py | 2 +-
 extra/ops.py | 2 +-
 extra/optimization/extract_policynet.py | 2 +-
 extra/optimization/extract_sa_pairs.py | 2 +-
 extra/optimization/helpers.py | 2 +-
 extra/optimization/pretrain_valuenet.py | 2 +-
 extra/to_movement_ops.py | 2 +-
 test/external/fuzz_symbolic.py | 2 +-
 test/external/fuzz_uops.py | 2 +-
 test/helpers.py | 2 +-
 test/test_linearizer.py | 2 +-
 test/test_symbolic_shapetracker.py | 2 +-
 test/unit/test_helpers.py | 2 +-
 test/unit/test_shapetracker.py | 2 +-
 test/unit/test_uop_symbolic.py | 2 +-
 tinygrad/codegen/kernel.py | 3 +--
 tinygrad/codegen/lowerer.py | 3 +--
 tinygrad/engine/jit.py | 3 +--
 tinygrad/engine/lazy.py | 3 +--
 tinygrad/engine/realize.py | 8 ++++----
 tinygrad/engine/schedule.py | 3 +--
 tinygrad/engine/search.py | 6 ++----
 tinygrad/function.py | 3 +--
 tinygrad/ops.py | 20 ++++++++++++++------
 tinygrad/renderer/__init__.py | 3 +--
 tinygrad/runtime/graph/clang.py | 2 +-
 tinygrad/runtime/graph/cuda.py | 2 +-
 tinygrad/runtime/graph/hcq.py | 2 +-
 tinygrad/runtime/graph/metal.py | 2 +-
 tinygrad/shape/shapetracker.py | 3 +--
 tinygrad/shape/symbolic.py | 19 -------------------
 tinygrad/shape/view.py | 3 +--
 tinygrad/tensor.py | 3 +--
 35 files changed, 51 insertions(+), 75 deletions(-)
 delete mode 100644 tinygrad/shape/symbolic.py

diff --git a/examples/handcode_opt.py b/examples/handcode_opt.py
index ec30733ceb..42df2b216d 100644
--- a/examples/handcode_opt.py
+++ b/examples/handcode_opt.py
@@ -4,12 +4,11 @@
 from extra.mcts_search import mcts_search
 from examples.mlperf.helpers import get_mlperf_bert_model
 from tinygrad import Tensor, Device, dtypes, nn
 from tinygrad.codegen.kernel import Kernel
-from tinygrad.ops import UOps
+from tinygrad.ops import UOps, sym_infer
from tinygrad.device import Compiled from tinygrad.engine.schedule import create_schedule from tinygrad.engine.search import time_linearizer, beam_search, bufs_from_lin from tinygrad.helpers import DEBUG, ansilen, getenv, colored, TRACEMETA -from tinygrad.shape.symbolic import sym_infer def get_sched_resnet(): mdl = ResNet50() diff --git a/extra/assembly/assembly.py b/extra/assembly/assembly.py index f0349a8e5f..1133675d23 100644 --- a/extra/assembly/assembly.py +++ b/extra/assembly/assembly.py @@ -3,7 +3,7 @@ from tinygrad.codegen.kernel import UOps, MemOp, UOp from tinygrad.ops import BinaryOps, UnaryOps from tinygrad.dtype import DType, dtypes from tinygrad.helpers import DEBUG -from tinygrad.shape.symbolic import Variable, NumNode, MulNode, DivNode, ModNode, LtNode, SumNode, AndNode +from tinygrad.ops import Variable, NumNode, MulNode, DivNode, ModNode, LtNode, SumNode, AndNode import functools import math from collections import defaultdict diff --git a/extra/backends/hsa_graph.py b/extra/backends/hsa_graph.py index 210b7a84a2..22476e9ac0 100644 --- a/extra/backends/hsa_graph.py +++ b/extra/backends/hsa_graph.py @@ -3,7 +3,7 @@ from typing import List, Any, Dict, cast, Optional, Tuple from tinygrad.helpers import init_c_var, round_up from tinygrad.device import Buffer, BufferOptions from tinygrad.device import Compiled, Device -from tinygrad.shape.symbolic import Variable +from tinygrad.ops import Variable from tinygrad.runtime.ops_hsa import HSADevice, PROFILE, Profiler from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner from tinygrad.engine.jit import MultiGraphRunner, GraphException diff --git a/extra/ops.py b/extra/ops.py index f612c7c423..76367a12c4 100644 --- a/extra/ops.py +++ b/extra/ops.py @@ -6,7 +6,7 @@ from dataclasses import dataclass from tinygrad.helpers import dedup, prod from tinygrad.ops import ReduceOps, UnaryOps, BinaryOps, TernaryOps, UOp, UOps, pretty_print from tinygrad.dtype import ImageDType, PtrDType, dtypes, DType, ConstType -from tinygrad.shape.symbolic import Variable, sint +from tinygrad.ops import Variable, sint from tinygrad.shape.shapetracker import ShapeTracker # these ops are deleted after AST is UOp diff --git a/extra/optimization/extract_policynet.py b/extra/optimization/extract_policynet.py index 6e4f4c2041..129208d4f7 100644 --- a/extra/optimization/extract_policynet.py +++ b/extra/optimization/extract_policynet.py @@ -15,7 +15,7 @@ from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, Buf from tinygrad.dtype import dtypes from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View -from tinygrad.shape.symbolic import Variable +from tinygrad.ops import Variable inf, nan = float('inf'), float('nan') from tinygrad.codegen.kernel import Opt, OptOps diff --git a/extra/optimization/extract_sa_pairs.py b/extra/optimization/extract_sa_pairs.py index 6b90a5a4ef..82f6eb002e 100644 --- a/extra/optimization/extract_sa_pairs.py +++ b/extra/optimization/extract_sa_pairs.py @@ -8,7 +8,7 @@ from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, Buf from tinygrad.dtype import dtypes from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View -from tinygrad.shape.symbolic import Variable +from tinygrad.ops import Variable inf, nan = float('inf'), float('nan') from tinygrad.codegen.kernel import Opt, OptOps diff --git a/extra/optimization/helpers.py b/extra/optimization/helpers.py index 66a6409411..6c5f8d99e5 100644 --- 
a/extra/optimization/helpers.py +++ b/extra/optimization/helpers.py @@ -7,7 +7,7 @@ from tinygrad.ops import UOp, UOps, KernelInfo from tinygrad.dtype import dtypes, PtrDType from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View -from tinygrad.shape.symbolic import NumNode +from tinygrad.ops import NumNode inf, nan = float('inf'), float('nan') # kernel unpacker diff --git a/extra/optimization/pretrain_valuenet.py b/extra/optimization/pretrain_valuenet.py index 6b49a30a14..2216b24d56 100644 --- a/extra/optimization/pretrain_valuenet.py +++ b/extra/optimization/pretrain_valuenet.py @@ -12,7 +12,7 @@ from tinygrad.ops import LazyOp, TernaryOps, BinaryOps, UnaryOps, ReduceOps, Buf from tinygrad.dtype import dtypes from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View -from tinygrad.shape.symbolic import Variable +from tinygrad.ops import Variable inf, nan = float('inf'), float('nan') from tinygrad.codegen.kernel import Opt, OptOps diff --git a/extra/to_movement_ops.py b/extra/to_movement_ops.py index 3545799712..af9da9b88c 100644 --- a/extra/to_movement_ops.py +++ b/extra/to_movement_ops.py @@ -6,7 +6,7 @@ from extra.optimization.helpers import load_worlds, ast_str_to_ast from tinygrad.helpers import prod, tqdm from tinygrad.ops import UOp, UOps from tinygrad.shape.shapetracker import ShapeTracker -from tinygrad.shape.symbolic import sym_infer, Node +from tinygrad.ops import sym_infer, Node class MovementOps(Enum): RESHAPE = auto(); PERMUTE = auto(); EXPAND = auto(); PAD = auto(); SHRINK = auto(); STRIDE = auto(); AS_STRIDED = auto() # noqa: E702 diff --git a/test/external/fuzz_symbolic.py b/test/external/fuzz_symbolic.py index a3c505f93f..834853b39c 100644 --- a/test/external/fuzz_symbolic.py +++ b/test/external/fuzz_symbolic.py @@ -2,7 +2,7 @@ import itertools import random from tinygrad import Variable from tinygrad.helpers import DEBUG -from tinygrad.shape.symbolic import NumNode +from tinygrad.ops import NumNode random.seed(42) def add_v(expr, rng=None): diff --git a/test/external/fuzz_uops.py b/test/external/fuzz_uops.py index af9f8d7fdf..476b707bcf 100644 --- a/test/external/fuzz_uops.py +++ b/test/external/fuzz_uops.py @@ -7,7 +7,7 @@ from tinygrad.ops import END_FOR_UOP, UOp, print_uops from tinygrad.device import Buffer, Device from tinygrad.engine.realize import CompiledRunner from tinygrad.helpers import DEBUG, colored -from tinygrad.shape.symbolic import Variable +from tinygrad.ops import Variable from tinygrad.tensor import _to_np_dtype from test.external.fuzz_schedule import FUZZ_SCHEDULE_MAX_PATHS, find_all_toposorts diff --git a/test/helpers.py b/test/helpers.py index ab313d42a3..9fbed0c1b6 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -10,7 +10,7 @@ from tinygrad.engine.realize import Runner from tinygrad.dtype import ConstType, DType from tinygrad.nn.state import get_parameters from tinygrad.helpers import Context, CI, OSX, getenv -from tinygrad.shape.symbolic import sint +from tinygrad.ops import sint def derandomize_model(model): with Context(GRAPH=0): diff --git a/test/test_linearizer.py b/test/test_linearizer.py index 8d32d93455..c1b9f179cb 100644 --- a/test/test_linearizer.py +++ b/test/test_linearizer.py @@ -10,7 +10,7 @@ from tinygrad.ops import UOp, UOps, BinaryOps, TernaryOps, UnaryOps from tinygrad.device import Device, Buffer from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View -# from tinygrad.shape.symbolic import Variable +# from tinygrad.ops 
import Variable from tinygrad.tensor import Tensor, _to_np_dtype from tinygrad.engine.schedule import BUF_LIMIT, create_schedule from tinygrad.engine.realize import run_schedule, lower_schedule, CompiledRunner diff --git a/test/test_symbolic_shapetracker.py b/test/test_symbolic_shapetracker.py index f89583b40d..859b0e0e07 100644 --- a/test/test_symbolic_shapetracker.py +++ b/test/test_symbolic_shapetracker.py @@ -1,7 +1,7 @@ import unittest from tinygrad.shape.shapetracker import ShapeTracker, View from tinygrad import Variable -from tinygrad.shape.symbolic import NumNode +from tinygrad.ops import NumNode from tinygrad.tensor import Tensor class TestSymbolic(unittest.TestCase): diff --git a/test/unit/test_helpers.py b/test/unit/test_helpers.py index c8f4fa2b77..4a8cb29108 100644 --- a/test/unit/test_helpers.py +++ b/test/unit/test_helpers.py @@ -5,7 +5,7 @@ from tinygrad.helpers import Context, ContextVar from tinygrad.helpers import merge_dicts, strip_parens, prod, round_up, fetch, fully_flatten, from_mv, to_mv from tinygrad.tensor import get_shape from tinygrad.codegen.lowerer import get_contraction -from tinygrad.shape.symbolic import NumNode +from tinygrad.ops import NumNode import numpy as np VARIABLE = ContextVar("VARIABLE", 0) diff --git a/test/unit/test_shapetracker.py b/test/unit/test_shapetracker.py index d7082543b2..5bd6927b94 100644 --- a/test/unit/test_shapetracker.py +++ b/test/unit/test_shapetracker.py @@ -5,7 +5,7 @@ from tinygrad.dtype import dtypes from tinygrad.helpers import prod from tinygrad.shape.shapetracker import ShapeTracker, View from tinygrad import Variable -from tinygrad.shape.symbolic import NumNode +from tinygrad.ops import NumNode from tinygrad.ops import UOp, UOps, graph_rewrite from tinygrad.codegen.uopgraph import sym from itertools import product diff --git a/test/unit/test_uop_symbolic.py b/test/unit/test_uop_symbolic.py index f280b040b1..5cf0043c65 100644 --- a/test/unit/test_uop_symbolic.py +++ b/test/unit/test_uop_symbolic.py @@ -1,7 +1,7 @@ #!/usr/bin/env python import unittest, pickle from typing import Tuple -#from tinygrad.shape.symbolic import MulNode, SumNode, Variable, NumNode, LtNode, ModNode, Node, sym_render, sym_infer, create_lt_node, create_ge_node +#from tinygrad.ops import MulNode, SumNode, Variable, NumNode, LtNode, ModNode, Node, sym_render, sym_infer, create_lt_node, create_ge_node # TODO: fix all the @unittest.expectedFailure diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 2eb5f47332..5425f6099b 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -6,14 +6,13 @@ from typing import Optional, List, Tuple, cast, Dict, Final, DefaultDict from enum import Enum, auto from tinygrad.ops import UNSAFE_PAD_OPS, BUFFER_UOPS, BinaryOps, KernelInfo, UOp, UOps, PatternMatcher, print_uops, type_verify, resolve, \ - graph_rewrite, track_rewrites + graph_rewrite, track_rewrites, Variable, sint from tinygrad.device import Device from tinygrad.renderer import Renderer, TensorCore, Program from tinygrad.dtype import ImageDType, PtrDType from tinygrad.helpers import all_same, colored, ansilen, dedup, getenv, prod, round_up, all_int, to_function_name, diskcache_put from tinygrad.helpers import DEBUG, TC_OPT, USE_TC, AMX from tinygrad.shape.shapetracker import ShapeTracker -from tinygrad.shape.symbolic import Variable, sint from tinygrad.shape.view import strides_for_shape from tinygrad.codegen.linearize import linearize_uop from tinygrad.codegen.uopgraph import full_graph_rewrite diff --git 
a/tinygrad/codegen/lowerer.py b/tinygrad/codegen/lowerer.py index da6915dc5a..96a1dbb320 100644 --- a/tinygrad/codegen/lowerer.py +++ b/tinygrad/codegen/lowerer.py @@ -5,9 +5,8 @@ from dataclasses import dataclass from typing import List, Tuple, cast, Optional from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import variable_to_uop -from tinygrad.shape.symbolic import sint from tinygrad.dtype import dtypes -from tinygrad.ops import KernelInfo, BinaryOps, UOp, UOps, graph_rewrite, PatternMatcher, UPat, resolve +from tinygrad.ops import KernelInfo, BinaryOps, UOp, UOps, graph_rewrite, PatternMatcher, UPat, resolve, sint from tinygrad.renderer import Renderer from tinygrad.helpers import all_int, prod, partition, flatten diff --git a/tinygrad/engine/jit.py b/tinygrad/engine/jit.py index b9b928e644..d2c458d0a3 100644 --- a/tinygrad/engine/jit.py +++ b/tinygrad/engine/jit.py @@ -6,9 +6,8 @@ from tinygrad.engine.lazy import LazyBuffer from tinygrad.helpers import flatten, merge_dicts, DEBUG, Context, GRAPH, BEAM, getenv, colored, JIT, dedup, partition from tinygrad.device import Buffer, Compiled, Device from tinygrad.dtype import DType -from tinygrad.ops import UOp, ssimplify +from tinygrad.ops import UOp, ssimplify, Variable, sint, sym_infer from tinygrad.shape.shapetracker import ShapeTracker -from tinygrad.shape.symbolic import Variable, sint, sym_infer from tinygrad.engine.realize import ExecItem, capturing, EmptyOp, ViewOp, BufferXfer, CompiledRunner, Runner, _internal_memory_planner from tinygrad.nn.state import get_parameters from dataclasses import dataclass diff --git a/tinygrad/engine/lazy.py b/tinygrad/engine/lazy.py index 3868dccf7d..6b632c646d 100644 --- a/tinygrad/engine/lazy.py +++ b/tinygrad/engine/lazy.py @@ -3,8 +3,7 @@ from typing import Union, Optional, Any, Tuple, List, get_args from tinygrad.dtype import dtypes, DType, ConstType, to_dtype, ImageDType from tinygrad.helpers import prod, getenv, all_int, all_same, DEBUG, _METADATA, Metadata, SPLIT_REDUCEOP from tinygrad.ops import MetaOps, UnaryOps, BinaryOps, TernaryOps, ReduceOps, Op, exec_alu, python_alu, REDUCE_ALU -from tinygrad.ops import identity_element, MathTrait, resolve, UOp -from tinygrad.shape.symbolic import sint +from tinygrad.ops import identity_element, MathTrait, resolve, UOp, sint from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.device import Buffer from weakref import ref, ReferenceType, WeakValueDictionary diff --git a/tinygrad/engine/realize.py b/tinygrad/engine/realize.py index 156c5a47bf..9c1808e5aa 100644 --- a/tinygrad/engine/realize.py +++ b/tinygrad/engine/realize.py @@ -4,10 +4,9 @@ from collections import defaultdict from dataclasses import dataclass, replace from tinygrad.helpers import colored, getenv, DEBUG, GlobalCounters, ansilen, BEAM, NOOPT, all_int, CAPTURING, Metadata, Context, TRACEMETA, dedup from tinygrad.helpers import NO_MEMORY_PLANNER -from tinygrad.ops import UOps, UOp +from tinygrad.ops import UOps, UOp, Variable, sym_infer, sint from tinygrad.dtype import dtypes from tinygrad.device import Device, Buffer -from tinygrad.shape.symbolic import Variable, sym_infer, sint from tinygrad.renderer import Renderer, Program from tinygrad.codegen.kernel import Kernel from tinygrad.engine.schedule import ScheduleItem @@ -164,9 +163,10 @@ class ExecItem: prg: Runner bufs: List[Optional[Buffer]] metadata: Optional[Tuple[Metadata, ...]] = None - def run(self, var_vals:Optional[Dict[Variable, int]]=None, wait=False, jit=False, do_update_stats=True) -> 
Optional[float]: + def run(self, _var_vals:Optional[Dict[Variable, int]]=None, wait=False, jit=False, do_update_stats=True) -> Optional[float]: + var_vals = {} if _var_vals is None else _var_vals bufs = [cast(Buffer, x) for x in self.bufs] if jit else [cast(Buffer, x).ensure_allocated() for x in self.bufs] - et = self.prg(bufs, var_vals if var_vals is not None else {}, wait=wait or DEBUG >= 2) + et = self.prg(bufs, var_vals, wait=wait or DEBUG >= 2) if do_update_stats: GlobalCounters.kernel_count += 1 GlobalCounters.global_ops += (op_est:=sym_infer(self.prg.op_estimate, var_vals)) diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py index 552c1abcb5..b4113baeb8 100644 --- a/tinygrad/engine/schedule.py +++ b/tinygrad/engine/schedule.py @@ -3,10 +3,9 @@ from collections import defaultdict, deque from dataclasses import dataclass from typing import Callable, Tuple, List, Dict, Optional, DefaultDict, cast from tinygrad.ops import REDUCE_ALU, UNSAFE_PAD_OPS, MetaOps, ReduceOps, TernaryOps, UnaryOps, UOp, UOps, PatternMatcher, UPat, resolve, \ - graph_rewrite, track_rewrites + graph_rewrite, track_rewrites, Variable, sint from tinygrad.helpers import GRAPH, DEBUG, MULTIOUTPUT, SAVE_SCHEDULE, FUSE_CONV_BW, FUSE_ARANGE, GlobalCounters, Metadata, all_same, \ colored, diskcache_put, prod, dedup, all_int, merge_dicts, getenv, unwrap -from tinygrad.shape.symbolic import Variable, sint from tinygrad.dtype import ImageDType, dtypes from tinygrad.engine.lazy import LazyBuffer from tinygrad.shape.shapetracker import ShapeTracker diff --git a/tinygrad/engine/search.py b/tinygrad/engine/search.py index d7acd272c9..e727da0f62 100644 --- a/tinygrad/engine/search.py +++ b/tinygrad/engine/search.py @@ -2,14 +2,12 @@ from typing import Dict, List, cast, DefaultDict, Optional, Tuple, Callable import itertools, functools, random, math, time, multiprocessing, traceback, signal from collections import defaultdict from dataclasses import replace -from tinygrad.ops import UOp, UOps +from tinygrad.ops import UOp, UOps, Variable, sym_infer from tinygrad.device import Device, Buffer, Compiler from tinygrad.helpers import prod, flatten, DEBUG, CACHELEVEL, diskcache_get, diskcache_put, getenv, Context, colored, to_function_name from tinygrad.dtype import ImageDType -from tinygrad.codegen.kernel import Kernel -from tinygrad.codegen.kernel import Opt, OptOps, KernelOptError +from tinygrad.codegen.kernel import Kernel, Opt, OptOps, KernelOptError from tinygrad.tensor import Tensor -from tinygrad.shape.symbolic import Variable, sym_infer from tinygrad.engine.realize import CompiledRunner from tinygrad.renderer import Program diff --git a/tinygrad/function.py b/tinygrad/function.py index c31ba0a1f5..b27a2e6a4f 100644 --- a/tinygrad/function.py +++ b/tinygrad/function.py @@ -3,10 +3,9 @@ import math from typing import Tuple, Optional from tinygrad.helpers import argsort from tinygrad.dtype import dtypes, DType, sum_acc_dtype -from tinygrad.ops import ReduceOps, resolve +from tinygrad.ops import ReduceOps, resolve, sint from tinygrad.tensor import Function from tinygrad.engine.lazy import LazyBuffer -from tinygrad.shape.symbolic import sint class Contiguous(Function): def forward(self, x:LazyBuffer) -> LazyBuffer: return x.contiguous() diff --git a/tinygrad/ops.py b/tinygrad/ops.py index d780b30419..37fd162244 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -7,7 +7,6 @@ from weakref import WeakValueDictionary from tinygrad.dtype import ConstType, ImageDType, PtrDType, dtypes, DType, truncate from 
tinygrad.helpers import ContextVar, prod, getenv, all_same, Context if TYPE_CHECKING: - from tinygrad.shape.symbolic import Variable, sint from tinygrad.shape.shapetracker import ShapeTracker # wrapper around IntEnum that preserves Enum.__str__ and makes auto() unique across all FastEnum subclasses @@ -154,7 +153,7 @@ COMMUTATIVE = {BinaryOps.ADD, BinaryOps.MUL, BinaryOps.MAX, BinaryOps.CMPNE, Bin END_FOR_UOP = {UOps.IF:(UOps.STORE, UOps.ENDIF), UOps.RANGE:(UOps.ASSIGN, UOps.ENDRANGE)} # With True as the default, this matches the old symbolic behavior -# python3 -c 'from tinygrad.shape.symbolic import Variable; print(bool(Variable("a", 1, 10) < 4))' -> True +# python3 -c 'from tinygrad.ops import Variable; print(bool(Variable("a", 1, 10) < 4))' -> True def resolve(x, default:bool=True): if not isinstance(x, UOp): return bool(x) assert x.dtype is dtypes.bool, "UOp in resolve must be bool" @@ -306,10 +305,8 @@ class UOp(MathTrait): st_vars: List[Set[Variable]] = [x.st_arg.vars() for x in self.sparents if x.op in BUFFER_UOPS] return sorted(set.union(*st_vars, [x.unbind()[0] if x.op is not UOps.DEFINE_VAR else x for x in self.vars()]), key=lambda v: v.arg) - # TODO: this is context rewrite - def substitute(self, dvars:Dict[UOp, UOp]): - if self in dvars: return dvars[self] - return self.replace(src=tuple(x.substitute(dvars) for x in self.src)) + def substitute(self, dvars:Dict[UOp, UOp]): return graph_rewrite(self, substitute, dvars) + @staticmethod def range(dtype:DType, start:ConstType|UOp, end:ConstType|UOp, idx:int): return UOp(UOps.RANGE, dtype=dtype, src=(UOp.const(dtype, start) if not isinstance(start, UOp) else start, @@ -967,6 +964,8 @@ symbolic_flat = symbolic+PatternMatcher([ ((UPat.var("x", dtypes.ints) + UPat.var("y")) * UPat.cvar("c"), lambda x,y,c: x*c+y*c), ]) +substitute = PatternMatcher([(UPat(tuple(UOps), name="x"), lambda ctx,x: ctx.get(x,None))]) + # for debug renderer = PatternMatcher([ (UPat(UOps.DEFINE_VAR, name="x"), lambda x: UOp(UOps.NOOP, arg=x.arg[0])), @@ -979,3 +978,12 @@ renderer = PatternMatcher([ (UPat(UOps.ALU, src=UPat(UOps.NOOP), arg=BinaryOps.CMPLT, name="x"), lambda x: UOp(UOps.NOOP, arg=f"({x.src[0].arg}<{x.src[1].arg})")), (UPat(UOps.ALU, src=UPat(UOps.NOOP), arg=BinaryOps.CMPNE, name="x"), lambda x: UOp(UOps.NOOP, arg=f"({x.src[0].arg}!={x.src[1].arg})")), ]) + +# *** what was symbolic.py *** + +sint = Union[int, UOp] +Variable = UOp + +def NumNode(val:int): return UOp.const(dtypes.int, val) +def sym_infer(uop: Union[UOp, int], var_vals: Dict[UOp, int]) -> int: + return int(uop.substitute({k:k.const_like(v) for k,v in var_vals.items()})) if isinstance(uop, UOp) else uop diff --git a/tinygrad/renderer/__init__.py b/tinygrad/renderer/__init__.py index 985b160452..5c1f976f5e 100644 --- a/tinygrad/renderer/__init__.py +++ b/tinygrad/renderer/__init__.py @@ -2,8 +2,7 @@ from typing import Optional, List, Tuple, Dict, Callable, Any import functools from dataclasses import dataclass, field from tinygrad.helpers import to_function_name, dedup, prod -from tinygrad.ops import Op, UOps, UOp, flops_mem -from tinygrad.shape.symbolic import sym_infer, sint, Variable +from tinygrad.ops import Op, UOps, UOp, flops_mem, sym_infer, sint, Variable from tinygrad.dtype import DType @dataclass(frozen=True) diff --git a/tinygrad/runtime/graph/clang.py b/tinygrad/runtime/graph/clang.py index 6c516f3882..038ad03469 100644 --- a/tinygrad/runtime/graph/clang.py +++ b/tinygrad/runtime/graph/clang.py @@ -4,7 +4,7 @@ from tinygrad.helpers import dedup, cpu_time_execution, DEBUG 
from tinygrad.engine.jit import GraphRunner, GraphException from tinygrad.device import Buffer, Device from tinygrad.engine.realize import ExecItem, CompiledRunner -from tinygrad.shape.symbolic import Variable +from tinygrad.ops import Variable from tinygrad.runtime.ops_clang import ClangProgram from tinygrad.renderer.cstyle import ClangRenderer render_dtype = ClangRenderer().render_dtype diff --git a/tinygrad/runtime/graph/cuda.py b/tinygrad/runtime/graph/cuda.py index 658041b809..03d14325d9 100644 --- a/tinygrad/runtime/graph/cuda.py +++ b/tinygrad/runtime/graph/cuda.py @@ -4,7 +4,7 @@ import tinygrad.runtime.autogen.cuda as cuda from tinygrad.helpers import init_c_var, dedup from tinygrad.device import Buffer, Device from tinygrad.runtime.ops_cuda import CUDADevice, check, encode_args, cu_time_execution -from tinygrad.shape.symbolic import Variable +from tinygrad.ops import Variable from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner from tinygrad.engine.jit import MultiGraphRunner, GraphException diff --git a/tinygrad/runtime/graph/hcq.py b/tinygrad/runtime/graph/hcq.py index f3e7448b28..61f27072c7 100644 --- a/tinygrad/runtime/graph/hcq.py +++ b/tinygrad/runtime/graph/hcq.py @@ -3,7 +3,7 @@ from typing import List, Any, Dict, cast, Optional, Tuple, Set from tinygrad.helpers import round_up, PROFILE, memsize_to_str from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQSignal, HCQBuffer, HWCommandQueue, HWComputeQueue, HWCopyQueue, HCQArgsState from tinygrad.device import Buffer, BufferOptions, Compiled, Device -from tinygrad.shape.symbolic import Variable +from tinygrad.ops import Variable from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner from tinygrad.engine.jit import MultiGraphRunner diff --git a/tinygrad/runtime/graph/metal.py b/tinygrad/runtime/graph/metal.py index ff8df6d128..9b0ad0aea8 100644 --- a/tinygrad/runtime/graph/metal.py +++ b/tinygrad/runtime/graph/metal.py @@ -5,7 +5,7 @@ from tinygrad.helpers import dedup, getenv from tinygrad.device import Buffer from tinygrad.engine.realize import ExecItem, CompiledRunner from tinygrad.engine.jit import GraphRunner, GraphException -from tinygrad.shape.symbolic import Variable +from tinygrad.ops import Variable from tinygrad.runtime.ops_metal import wait_check, msg, libobjc, to_struct, objc_instance,\ MTLResourceOptions, elapsed_time, objc_id diff --git a/tinygrad/shape/shapetracker.py b/tinygrad/shape/shapetracker.py index 3284858f36..bea81181e6 100644 --- a/tinygrad/shape/shapetracker.py +++ b/tinygrad/shape/shapetracker.py @@ -3,10 +3,9 @@ from __future__ import annotations from dataclasses import dataclass from typing import Tuple, List, Optional, Dict, Set from tinygrad.helpers import merge_dicts, getenv -from tinygrad.shape.symbolic import Variable, sint from tinygrad.shape.view import View, strides_for_shape from tinygrad.dtype import dtypes -from tinygrad.ops import UOp, UOps, BinaryOps, graph_rewrite, _get_chain, symbolic_flat +from tinygrad.ops import UOp, UOps, BinaryOps, graph_rewrite, _get_chain, symbolic_flat, Variable, sint @dataclass(frozen=True) class ShapeTracker: diff --git a/tinygrad/shape/symbolic.py b/tinygrad/shape/symbolic.py deleted file mode 100644 index c24dade84a..0000000000 --- a/tinygrad/shape/symbolic.py +++ /dev/null @@ -1,19 +0,0 @@ -from __future__ import annotations -from typing import Union, Optional, Dict -from tinygrad.dtype import dtypes -from tinygrad.ops import UOp, UOps, exec_alu - -sint = Union[int, UOp] - -def NumNode(val:int): 
return UOp.const(dtypes.int, val) -Variable = UOp - -def sym_infer(uop: Union[UOp, int], var_vals: Optional[Dict[Variable, int]]) -> int: - if isinstance(uop, (int, float)): return uop # TODO: ugh, the float is a hack for qcom - if uop.op == UOps.CONST: return uop.arg - if uop.op == UOps.DEFINE_VAR and var_vals is not None: return var_vals[uop] - if uop.op == UOps.BIND: return uop.src[1].arg # bound variable returns bound value - if uop.op == UOps.ALU: - src_values = [sym_infer(src, var_vals) for src in uop.src] - return exec_alu(uop.arg, uop.dtype, src_values) - raise NotImplementedError(f"Unsupported UOp {uop.op}") diff --git a/tinygrad/shape/view.py b/tinygrad/shape/view.py index 99fbf7fff0..eea20ea490 100644 --- a/tinygrad/shape/view.py +++ b/tinygrad/shape/view.py @@ -3,9 +3,8 @@ import functools, operator, itertools, math from dataclasses import dataclass from typing import Tuple, List, Optional, Dict, Set, cast, Union from tinygrad.dtype import dtypes -from tinygrad.ops import resolve, UOp +from tinygrad.ops import resolve, UOp, NumNode, Variable, sint, sym_infer from tinygrad.helpers import prod, all_int, argsort -from tinygrad.shape.symbolic import NumNode, Variable, sint, sym_infer @functools.lru_cache(maxsize=None) def canonicalize_strides(shape:Tuple[sint, ...], strides:Tuple[sint, ...]) -> Tuple[sint, ...]: diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 27b4718c49..357866e43f 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -9,9 +9,8 @@ from tinygrad.dtype import DType, DTypeLike, dtypes, ImageDType, ConstType, leas from tinygrad.helpers import argfix, make_pair, flatten, prod, all_int, round_up, merge_dicts, argsort, getenv, all_same, fully_flatten, dedup from tinygrad.helpers import IMAGE, DEBUG, WINO, _METADATA, Metadata, TRACEMETA, ceildiv from tinygrad.multi import MultiLazyBuffer -from tinygrad.ops import MetaOps, truncate, smax, resolve, UOp, UOps, BinaryOps +from tinygrad.ops import MetaOps, truncate, smax, resolve, UOp, UOps, BinaryOps, sint, Variable from tinygrad.device import Device, Buffer, BufferOptions -from tinygrad.shape.symbolic import sint, Variable from tinygrad.engine.lazy import LazyBuffer from tinygrad.engine.realize import run_schedule, memory_planner from tinygrad.engine.schedule import ScheduleItem, create_schedule_with_vars From 334f499e6af2f7186bd8146427524b9735166c2a Mon Sep 17 00:00:00 2001 From: ignaciosica Date: Sat, 12 Oct 2024 07:56:47 -0300 Subject: [PATCH 07/17] consistent render of recip in cuda with CStyleLanguage (#6980) --- tinygrad/renderer/cstyle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py index 35ca86d007..482a3bb2b5 100644 --- a/tinygrad/renderer/cstyle.py +++ b/tinygrad/renderer/cstyle.py @@ -294,7 +294,7 @@ class MetalRenderer(CStyleLanguage): return {arg[3].name}2(c.thread_elements()[0], c.thread_elements()[1]);\n}}""") return super().render_kernel(function_name, kernel, bufs, uops, prefix) -code_for_op_half = {UnaryOps.RECIP: lambda x,dtype: f"hrcp({x})" if dtype in (dtypes.half, dtypes.bfloat16) else f"1/{x}", +code_for_op_half = {UnaryOps.RECIP: lambda x,dtype: f"hrcp({x})" if dtype in (dtypes.half, dtypes.bfloat16) else f"(1/{x})", BinaryOps.MAX: lambda a,b,dtype: f"__hmax({a},{b})" if dtype in (dtypes.half, dtypes.bfloat16) else f"max({a},{b})", UnaryOps.SQRT: lambda x,dtype: f"hsqrt({x})" if dtype in (dtypes.half, dtypes.bfloat16) else f"sqrt({x})", UnaryOps.SIN: lambda x,dtype: f"hsin({x})" if dtype in (dtypes.half, 
dtypes.bfloat16) else f"sin({x})", From 746a1f8c86872a7ec1208bac2a79501e41da814b Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Sat, 12 Oct 2024 14:09:32 +0300 Subject: [PATCH 08/17] prep uoping diff for big graph [pr] (#7014) --- tinygrad/engine/schedule.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/tinygrad/engine/schedule.py b/tinygrad/engine/schedule.py index b4113baeb8..fc6a2bff83 100644 --- a/tinygrad/engine/schedule.py +++ b/tinygrad/engine/schedule.py @@ -161,20 +161,19 @@ def _recursive_uop(buf:LazyBuffer, st:ShapeTracker, outputs:Tuple[LazyBuffer, .. if buf not in assign_targets and buf not in inputs: inputs.append(buf) return UOp(UOps.LOAD, dtype, (ubuf, unbound_st.to_uop())) - # reduce ops change ShapeTracker - if buf.op in ReduceOps: - rsrc = _recursive_uop(buf.srcs[0], ShapeTracker.from_shape(buf.srcs[0].shape), outputs, var_vals, inputs, buf_uops, assign_targets, cache) - return cache.setdefault((buf, st), UOp(UOps.REDUCE_AXIS, dtype, (rsrc,), (REDUCE_ALU[cast(ReduceOps, buf.op)], buf.arg)).view(st)) - - # elementwise ops pass shapetracker - in_uops = tuple(_recursive_uop(x, st, outputs, var_vals, inputs, buf_uops, assign_targets, cache) for x in buf.srcs) - if buf.op is MetaOps.CONTIGUOUS: + # only reduceop changes shape + src_st = ShapeTracker.from_shape(buf.srcs[0].shape) if buf.op in ReduceOps else st + src: List[UOp] = [_recursive_uop(x, src_st, outputs, var_vals, inputs, buf_uops, assign_targets, cache) for x in buf.srcs] + if buf.op in ReduceOps: ret = UOp(UOps.REDUCE_AXIS, dtype, tuple(src), (REDUCE_ALU[cast(ReduceOps, buf.op)], buf.arg)).view(st) + elif buf.op is MetaOps.CONTIGUOUS: assert buf in outputs, f"{buf.op} must be writable" - return in_uops[0] - if buf.op is MetaOps.ASSIGN: return cache.setdefault((buf, st), UOp(UOps.ASSIGN, dtype, (in_uops[1].src[0], in_uops[0]))) - if buf.op is UnaryOps.CAST: return cache.setdefault((buf, st), UOp(UOps.CAST, dtype, in_uops)) - if buf.op is UnaryOps.BITCAST: return cache.setdefault((buf, st), UOp(UOps.BITCAST, dtype, in_uops)) - return cache.setdefault((buf, st), UOp(UOps.ALU, dtype, in_uops, buf.op)) + ret = src[0] + elif buf.op is MetaOps.ASSIGN: ret = UOp(UOps.ASSIGN, dtype, (src[1].src[0], src[0])) + elif buf.op is UnaryOps.CAST: ret = src[0].cast(dtype) + elif buf.op is UnaryOps.BITCAST: ret = src[0].bitcast(dtype) + else: ret = UOp(UOps.ALU, dtype, tuple(src), buf.op) + cache[(buf, st)] = ret + return ret def _lower_lazybuffer(outs:List[LazyBuffer], buf_uops:Dict[Buffer, UOp]) -> Tuple[LBScheduleItem, Dict[Variable, int]]: """describe the computation for a LazyBuffer with UOp + inputs + var_vals""" From cba4b9a058015f061252c175a8534096ba6be192 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Sat, 12 Oct 2024 19:53:52 +0800 Subject: [PATCH 09/17] clean up ops file [pr] (#7013) --- tinygrad/ops.py | 58 ++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 37fd162244..10c95caeac 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -153,7 +153,6 @@ COMMUTATIVE = {BinaryOps.ADD, BinaryOps.MUL, BinaryOps.MAX, BinaryOps.CMPNE, Bin END_FOR_UOP = {UOps.IF:(UOps.STORE, UOps.ENDIF), UOps.RANGE:(UOps.ASSIGN, UOps.ENDRANGE)} # With True as the default, this matches the old symbolic behavior -# python3 -c 'from tinygrad.ops import Variable; print(bool(Variable("a", 1, 10) < 4))' -> True def resolve(x, 
default:bool=True): if not isinstance(x, UOp): return bool(x) assert x.dtype is dtypes.bool, "UOp in resolve must be bool" @@ -195,6 +194,18 @@ class UOp(MathTrait): new_args = (kwargs.get("op", self.op), kwargs.get("dtype", self.dtype), kwargs.get("src", self.src), kwargs.get("arg", self.arg)) if (self.op, self.dtype, self.src, self.arg) == new_args: return self return UOp(*new_args) + @functools.cached_property + def key(self) -> bytes: + return hashlib.sha256(str((self.op, self.dtype, self.arg)).encode() + b"".join([s.key for s in self.src])).digest() + def __repr__(self): return pretty_print(self, lambda x: f"{type(self).__name__}({x.op}, {x.dtype}, arg={x.argstr()}, src=(%s))") + def argstr(self): return f'({", ".join(map(str, self.arg))})' if self.op is UOps.REDUCE_AXIS else self.arg + @functools.cached_property + def parents(self) -> Dict[UOp, None]: return {**{x:None for x in self.src}, **{k:None for x in self.src for k in x.parents}} + @property # parents with self + def sparents(self) -> Dict[UOp, None]: return {**self.parents, self:None} + + # *** uop shape stuff *** + @property def has_st(self) -> bool: return self.op not in {UOps.DEFINE_LOCAL, UOps.DEFINE_GLOBAL, UOps.BUFFER, UOps.CONST, UOps.DEFINE_VAR} @functools.cached_property @@ -207,11 +218,11 @@ class UOp(MathTrait): from tinygrad.shape.shapetracker import ShapeTracker return ShapeTracker.from_shape(src_sts[0].reduce(self.axis_arg)) if self.op is UOps.REDUCE_AXIS else src_sts[0] @functools.cached_property - def key(self) -> bytes: - return hashlib.sha256(str((self.op, self.dtype, self.arg)).encode() + b"".join([s.key for s in self.src])).digest() - def __repr__(self): return pretty_print(self, lambda x: f"{type(self).__name__}({x.op}, {x.dtype}, arg={x.argstr()}, src=(%s))") - def argstr(self): return f'({", ".join(map(str, self.arg))})' if self.op is UOps.REDUCE_AXIS else self.arg + def full_shape(self) -> Tuple[sint, ...]: + return self.arg.shape if self.op is UOps.VIEW else tuple(smax(x) for x in zip(*[x.full_shape for x in self.src if x.has_st])) + # *** uop evaluation *** + def simplify(self): with Context(TRACK_MATCH_STATS=0): return graph_rewrite(self, symbolic) @@ -226,7 +237,10 @@ class UOp(MathTrait): def __bool__(self): return self._eval((dtypes.bool,), bool) def __int__(self): return self._eval(dtypes.ints, int) def __float__(self): return self._eval(dtypes.floats, float) - # *** uop syntactic sugar + def substitute(self, dvars:Dict[UOp, UOp]): return graph_rewrite(self, _substitute, dvars) + + # *** uop syntactic sugar *** + @property def st_arg(self) -> ShapeTracker: assert self.op in BUFFER_UOPS, f"st_arg called on {self.op}" @@ -276,8 +290,13 @@ class UOp(MathTrait): if isinstance(b, UOp): return b.unbind()[0] if b.op is UOps.BIND else b if isinstance(b, tuple) and all_same(b): b = b[0] # doesn't have to be a VCONST if they are all the same return UOp(UOps.VCONST if isinstance(b, tuple) else UOps.CONST, dtype, arg=dtypes.as_const(b, dtype) if dtype is not None else b) # type: ignore + @staticmethod + def range(dtype:DType, start:ConstType|UOp, end:ConstType|UOp, idx:int): + return UOp(UOps.RANGE, dtype=dtype, src=(UOp.const(dtype, start) if not isinstance(start, UOp) else start, + UOp.const(dtype, end) if not isinstance(end, UOp) else end), arg=idx) + def reduce(self, op:BinaryOps, *rng:UOp): return UOp(UOps.REDUCE, self.dtype, (self,) + rng, op) - # *** Variable stuff *** + # *** uop Variable stuff *** @staticmethod def variable(name:str, min_val:ConstType, max_val:ConstType): return 
UOp(UOps.DEFINE_VAR, dtypes.int, arg=(name, min_val, max_val)) @@ -305,20 +324,8 @@ class UOp(MathTrait): st_vars: List[Set[Variable]] = [x.st_arg.vars() for x in self.sparents if x.op in BUFFER_UOPS] return sorted(set.union(*st_vars, [x.unbind()[0] if x.op is not UOps.DEFINE_VAR else x for x in self.vars()]), key=lambda v: v.arg) - def substitute(self, dvars:Dict[UOp, UOp]): return graph_rewrite(self, substitute, dvars) + # *** uop symbolic stuff *** - @staticmethod - def range(dtype:DType, start:ConstType|UOp, end:ConstType|UOp, idx:int): - return UOp(UOps.RANGE, dtype=dtype, src=(UOp.const(dtype, start) if not isinstance(start, UOp) else start, - UOp.const(dtype, end) if not isinstance(end, UOp) else end), arg=idx) - def reduce(self, op:BinaryOps, *rng:UOp): return UOp(UOps.REDUCE, self.dtype, (self,) + rng, op) - @functools.cached_property - def parents(self) -> Dict[UOp, None]: return {**{x:None for x in self.src}, **{k:None for x in self.src for k in x.parents}} - @property # parents with self - def sparents(self) -> Dict[UOp, None]: return {**self.parents, self:None} - @functools.cached_property - def full_shape(self) -> Tuple[sint, ...]: - return self.arg.shape if self.op is UOps.VIEW else tuple(smax(x) for x in zip(*[x.full_shape for x in self.src if x.has_st])) def const_factor(self) -> int: """largest known int that divides self""" if self.op is UOps.CONST: return self.arg @@ -413,13 +420,6 @@ def exec_alu(op:Op, dtype:DType, operands): return tuple([exec_alu(op, dtype.scalar(), [x[i] if isinstance(x, tuple) else x for x in operands]) for i in range(dtype.count)]) return truncate.get(dtype, lambda x: x)(python_alu[op](*operands)) -def uop_alu_resolve(u:UOp) -> sint: - if u.op is UOps.CONST: return u.arg - if u.op is UOps.DEFINE_VAR: return u - #if u.op is UOps.DEFINE_VAR: return Variable(u.arg[0], u.arg[1], u.arg[2]) - if u.op is UOps.ALU: return exec_alu(u.arg, u.dtype, tuple(map(uop_alu_resolve, u.src))) - raise RuntimeError(f"ALU resolve fail @ {u.op}") - # ***** uop helpers ***** def print_uops(uops:List[UOp]): @@ -446,7 +446,7 @@ def flops_mem(uops:List[UOp], ignore_indexing=False) -> Tuple[sint, sint]: for u in uops: if u.op is UOps.RANGE: mult_stack.append(mults) - mults *= uop_alu_resolve(u.src[1] - u.src[0]) + mults *= (u.src[1] - u.src[0]).ssimplify() elif u.op is UOps.ENDRANGE: mults = mult_stack.pop(-1) elif u.op is UOps.SPECIAL: @@ -964,7 +964,7 @@ symbolic_flat = symbolic+PatternMatcher([ ((UPat.var("x", dtypes.ints) + UPat.var("y")) * UPat.cvar("c"), lambda x,y,c: x*c+y*c), ]) -substitute = PatternMatcher([(UPat(tuple(UOps), name="x"), lambda ctx,x: ctx.get(x,None))]) +_substitute = PatternMatcher([(UPat(tuple(UOps), name="x"), lambda ctx,x: ctx.get(x,None))]) # for debug renderer = PatternMatcher([ From ed1ed9e4ff709ccdce3f427d2bc4cfed30cfd5be Mon Sep 17 00:00:00 2001 From: chenyu Date: Sat, 12 Oct 2024 09:41:56 -0400 Subject: [PATCH 10/17] bert use BS=72 (#7015) memory 131 -> 138 green tflops 201 -> 209 red tflops 160 -> 169 --- .../benchmarks/bert/implementations/tinybox_green/dev_beam.sh | 2 +- .../benchmarks/bert/implementations/tinybox_green/dev_run.sh | 2 +- .../bert/implementations/tinybox_green/run_and_time.sh | 2 +- .../benchmarks/bert/implementations/tinybox_red/dev_beam.sh | 2 +- .../benchmarks/bert/implementations/tinybox_red/dev_run.sh | 2 +- .../benchmarks/bert/implementations/tinybox_red/run_and_time.sh | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh index 8814f62919..a6b45747f7 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_beam.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=6 export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh index 8df5f5dadd..a1bb814f07 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/dev_run.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=6 export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh index 8adeb7a877..50542450c9 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh @@ -3,7 +3,7 @@ export PYTHONPATH="." export MODEL="bert" export SUBMISSION_PLATFORM="tinybox_green" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=6 export BEAM=4 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=512 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh index 649d3942ef..5af5fd2d5f 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_beam.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=6 export BEAM=3 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh index 9a6ee03432..a4636ac6f9 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/dev_run.sh @@ -2,7 +2,7 @@ export PYTHONPATH="." 
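
The same one-line flip repeats in each of these scripts: BS is the global batch size split across the GPUS devices, so going from 66 to 72 takes each of the 6 GPUs from 11 to 12 samples per step while staying evenly divisible. A quick sanity check of that arithmetic (an illustration only, assuming the usual even data-parallel split):

```python
GPUS, OLD_BS, NEW_BS = 6, 66, 72
assert OLD_BS % GPUS == 0 and NEW_BS % GPUS == 0  # both batch sizes split evenly
print(OLD_BS // GPUS, "->", NEW_BS // GPUS)       # 11 -> 12 samples per device per step
```
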
export MODEL="bert" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=6 export BEAM=3 export IGNORE_JIT_FIRST_BEAM=1 diff --git a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh index 9ca38adbcc..458fcac97a 100755 --- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh +++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh @@ -3,7 +3,7 @@ export PYTHONPATH="." export MODEL="bert" export SUBMISSION_PLATFORM="tinybox_red" -export DEFAULT_FLOAT="HALF" GPUS=6 BS=66 EVAL_BS=6 +export DEFAULT_FLOAT="HALF" GPUS=6 BS=72 EVAL_BS=6 export BEAM=3 export IGNORE_JIT_FIRST_BEAM=1 From 38d45dfba54f2f665fb7efad0cde692abfdec0d0 Mon Sep 17 00:00:00 2001 From: George Hotz Date: Sat, 12 Oct 2024 22:03:04 +0800 Subject: [PATCH 11/17] hotfix: no rng in test/external/external_benchmark_schedule.py --- test/external/external_benchmark_schedule.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/external/external_benchmark_schedule.py b/test/external/external_benchmark_schedule.py index 0bc58f5e96..9a1dc20fc8 100644 --- a/test/external/external_benchmark_schedule.py +++ b/test/external/external_benchmark_schedule.py @@ -1,6 +1,6 @@ from typing import List from extra.models.resnet import ResNet50 -from tinygrad import Tensor, Device +from tinygrad import Tensor, Device, nn from tinygrad.helpers import Profiling, Timing, getenv, BEAM, NOOPT, DEBUG, Context, ansilen from tinygrad.ops import UOps from tinygrad.codegen.kernel import Kernel @@ -11,6 +11,7 @@ from tinygrad.engine.search import beam_search, bufs_from_lin if __name__ == "__main__": mdl = ResNet50() + for p in nn.state.get_parameters(mdl): p.replace(Tensor.empty(p.shape)) img = Tensor.empty(64, 3, 224, 224) PROFILE = getenv("PROFILE", 0) From 85a45164fb89fcba9032b95682184efa09a3fd4e Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Sat, 12 Oct 2024 22:36:24 +0800 Subject: [PATCH 12/17] remove pyint [pr] (#7016) * remove pyint * bump time on tp [pr] * dont truncate in const fold * remove dead code * Revert "dont truncate in const fold" This reverts commit 29c81db0f7880848b001c2728aa555a1ef17e7d3. 
* remove define_var --- test/helpers.py | 1 - test/test_dtype.py | 4 +-- test/test_profiler.py | 2 +- test/test_uop_graph.py | 26 +++++++++--------- test/unit/test_graph_rewrite.py | 16 +++++------ test/unit/test_shapetracker.py | 2 +- test/unit/test_simplify_valid_idx.py | 2 +- test/unit/test_uop_resolve.py | 30 ++++++++++----------- test/unit/test_uop_vmin_vmax.py | 40 ++++++++++++++-------------- tinygrad/codegen/lowerer.py | 10 +++---- tinygrad/codegen/uopgraph.py | 10 ++----- tinygrad/dtype.py | 3 +-- tinygrad/ops.py | 13 +++------ tinygrad/shape/view.py | 4 +-- 14 files changed, 74 insertions(+), 89 deletions(-) diff --git a/test/helpers.py b/test/helpers.py index 9fbed0c1b6..ac566493f8 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -32,7 +32,6 @@ def assert_jit_cache_len(fxn, expected_len): assert len(fxn.jit_cache[0].prg.jit_cache) == expected_len def is_dtype_supported(dtype: DType, device: str = Device.DEFAULT): - if dtype == dtypes.pyint and device != "PYTHON": return False if dtype == dtypes.bfloat16: # NOTE: this requires bf16 buffer support return device in {"AMD"} or (device in {"CUDA", "NV"} and not CI and not getenv("PTX")) diff --git a/test/test_dtype.py b/test/test_dtype.py index 49506a621a..67082a0e9a 100644 --- a/test/test_dtype.py +++ b/test/test_dtype.py @@ -12,7 +12,7 @@ from test.helpers import is_dtype_supported, rand_for_dtype settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False)) settings.load_profile("my_profile") -core_dtypes = list([v for k,v in DTYPES_DICT.items() if k != 'pyint']) +core_dtypes = list(DTYPES_DICT.values()) if Device.DEFAULT == "CPU": core_dtypes.remove(dtypes.bfloat16) # NOTE: this is for teenygrad, don't remove dtype_ints = [dt for dt in core_dtypes if dtypes.is_int(dt) and is_dtype_supported(dt)] dtype_floats = [dt for dt in core_dtypes if dtypes.is_float(dt) and is_dtype_supported(dt)] @@ -20,7 +20,7 @@ dtype_floats = [dt for dt in core_dtypes if dtypes.is_float(dt) and is_dtype_sup def get_available_cast_dtypes(dtype: DType) -> List[DType]: if not is_dtype_supported(dtype): return [] # dont cast internal dtypes - return [v for k, v in DTYPES_DICT.items() if v != dtype and is_dtype_supported(v) and not k.startswith("_") and k != 'pyint'] + return [v for k, v in DTYPES_DICT.items() if v != dtype and is_dtype_supported(v) and not k.startswith("_")] def _test_to_np(a:Tensor, np_dtype, target): if DEBUG >= 2: print(a) diff --git a/test/test_profiler.py b/test/test_profiler.py index f06a5008f4..bd2deffa66 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -147,7 +147,7 @@ class TestProfiler(unittest.TestCase): transfer_node_1 = helper_profile_filter_node(profile, name=f"{Device.DEFAULT} -> {Device.DEFAULT}:1")[0] helper_validate_node(transfer_node_1, profile=profile, pid_name=Device.DEFAULT, tid_name="DMA") - assert 80 < transfer_node_1['dur'] < (16000 if CI else 1400), f"Duration is not in the range: {transfer_node_1['dur']}" + assert 80 < transfer_node_1['dur'] < (20000 if CI else 1400), f"Duration is not in the range: {transfer_node_1['dur']}" @unittest.skipIf(MOCKGPU and Device.DEFAULT == "AMD", "AMD mockgpu with indirect buffers does not support queue wait interrupts") def test_profile_deps(self): diff --git a/test/test_uop_graph.py b/test/test_uop_graph.py index f3bdac5ba2..741ca2045e 100644 --- a/test/test_uop_graph.py +++ b/test/test_uop_graph.py @@ -172,10 +172,10 @@ class TestGraphRewrite(unittest.TestCase): self.assertEqual(nout.src[1].arg, 3.0) def 
test_consts_go_last(self): - a = UOp.define_var('a', dtypes.int, 0, 1) - b = UOp.define_var('b', dtypes.int, 0, 1) - c = UOp.define_var('c', dtypes.int, 0, 1) - d = UOp.define_var('d', dtypes.int, 0, 1) + a = UOp.variable('a', 0, 1) + b = UOp.variable('b', 0, 1) + c = UOp.variable('c', 0, 1) + d = UOp.variable('d', 0, 1) outs = [2+a, 2+a+d+3+b+c+4, UOp(UOps.ALU, a.dtype, src=(a.const_like(2), a), arg=BinaryOps.ADD), (4+d)+c+(2+a)+b] for out in outs: sink = graph_rewrite(out, sym) @@ -196,7 +196,7 @@ class TestUOpGraph(unittest.TestCase): self.assertEqual(out.arg, 3.0) def test_where_same_fold(self): - v = UOp.define_var('tmp', dtypes.int, 0, 1) + v = UOp.variable('tmp', 0, 1) c0 = UOp(UOps.CONST, dtypes.int, arg=0) vc = UOp(UOps.ALU, dtypes.bool, (v, c0), BinaryOps.CMPNE) c1 = UOp(UOps.CONST, dtypes.float, arg=1.0) @@ -290,7 +290,7 @@ class TestUOpGraph(unittest.TestCase): for i in [2, 4, 8]: vec = UOp(UOps.VECTORIZE, dtypes.half.vec(i), tuple(UOp.const(dtypes.half, 0.0) for _ in range(i))) var = UOp(UOps.DEFINE_VAR, dtypes.half.vec(i)) - acc = UOp.define_var('acc', dtypes.half.vec(i), 0, 1) + acc = UOp.variable('acc', 0, 1, dtypes.half.vec(i)) wmma = UOp(UOps.WMMA, dtypes.half.vec(i), (vec, var, acc)) uops = to_uops_list([wmma]) assert_equiv_uops(uops[0], acc) @@ -299,7 +299,7 @@ class TestUOpGraph(unittest.TestCase): for i in [2, 4, 8]: var = UOp(UOps.DEFINE_VAR, dtypes.half.vec(i)) vec = UOp(UOps.VECTORIZE, dtypes.half.vec(i), tuple(UOp.const(dtypes.half, 0.0) for _ in range(i))) - acc = UOp.define_var('acc', dtypes.half.vec(i), 0, 1) + acc = UOp.variable('acc', 0, 1, dtypes.half.vec(i)) wmma = UOp(UOps.WMMA, dtypes.half.vec(i), (var, vec, acc)) uops = to_uops_list([wmma]) assert_equiv_uops(uops[0], acc) @@ -366,7 +366,7 @@ class TestUOpGraph(unittest.TestCase): self.assertEqual(len([x for x in uops if x.op is UOps.CAST]), 1) def test_depth_2_const_fold(self): - v = UOp.define_var("tmp", dtypes.int, 0, 1) + v = UOp.variable("tmp", 0, 1) c2 = UOp(UOps.CONST, dtypes.int, arg=2) c4 = UOp(UOps.CONST, dtypes.int, arg=4) vc = UOp(UOps.ALU, dtypes.int, (v, c2), BinaryOps.ADD) @@ -620,8 +620,8 @@ class TestLoadStoreFolder(unittest.TestCase): def test_simple_load_dont_fold_different_gated(self): buf = UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.float)) - gate = UOp.define_var("g1", dtypes.bool, False, True) - gate2 = UOp.define_var("g2", dtypes.bool, False, True) + gate = UOp.variable("g1", False, True, dtypes.bool) + gate2 = UOp.variable("g2", False, True, dtypes.bool) load = [UOp(UOps.LOAD, dtypes.float, (buf, UOp.const(dtypes.int, i), UOp.const(dtypes.float, i), gate if i == 0 else gate2)) for i in range(4)] sink = UOp(UOps.VECTORIZE, dtypes.float.vec(len(load)), tuple(load)) sink = float4_rewrite(sink) @@ -636,7 +636,7 @@ class TestLoadStoreFolder(unittest.TestCase): def test_simple_store_fold_gate(self): buf = UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.float)) - gate = UOp.define_var("g1", dtypes.bool, False, True) + gate = UOp.variable("g1", False, True, dtypes.bool) load = [UOp(UOps.STORE, dtypes.float, (buf, UOp.const(dtypes.int, i), UOp.const(dtypes.float, i), gate)) for i in range(4)] sink = UOp(UOps.SINK, dtypes.void, tuple(load)) sink = float4_rewrite(sink) @@ -647,8 +647,8 @@ class TestLoadStoreFolder(unittest.TestCase): def test_simple_store_dont_fold(self): buf = UOp(UOps.DEFINE_GLOBAL, PtrDType(dtypes.float)) - gate = UOp.define_var("g1", dtypes.bool, False, True) - gate2 = UOp.define_var("g2", dtypes.bool, False, True) + gate = UOp.variable("g1", False, True, dtypes.bool) + gate2 = 
UOp.variable("g2", False, True, dtypes.bool) load = [UOp(UOps.STORE, dtypes.float, (buf, UOp.const(dtypes.int, i), UOp.const(dtypes.float, i), gate if i == 0 else gate2)) for i in range(4)] sink = UOp(UOps.SINK, dtypes.void, tuple(load)) sink = float4_rewrite(sink) diff --git a/test/unit/test_graph_rewrite.py b/test/unit/test_graph_rewrite.py index b5221f7e7d..4a92572f34 100644 --- a/test/unit/test_graph_rewrite.py +++ b/test/unit/test_graph_rewrite.py @@ -94,20 +94,20 @@ class TestFoldingAndReduction(unittest.TestCase): class TestModuloAndDivisionFolding(unittest.TestCase): def test_full_graph_rewrite_modulo_folding_with_define_var(self): - x_var_uop = UOp.define_var('x', dtypes.int32, 0, 100) + x_var_uop = UOp.variable('x', 0, 100) optimized_mod_uop = apply_rewrite(((x_var_uop * 4) + 2) % 4) self.assertEqual(optimized_mod_uop.op, UOps.CONST) self.assertEqual(optimized_mod_uop.arg, 2) def test_full_graph_rewrite_division_folding_with_define_var(self): - n_var_uop = UOp.define_var('n', dtypes.int32, 1, 1000) + n_var_uop = UOp.variable('n', 1, 1000) optimized_div_uop = apply_rewrite((n_var_uop * 6) // 3) self.assertEqual(optimized_div_uop.op, UOps.ALU) self.assertEqual(optimized_div_uop.arg, BinaryOps.MUL) self.assertEqual(optimized_div_uop.src[1].arg, 2) def test_full_graph_rewrite_complex_mod_div_folding(self): - k_var_uop = UOp.define_var('k', dtypes.int32, 0, 50) + k_var_uop = UOp.variable('k', 0, 50) optimized_div_uop = apply_rewrite(((k_var_uop * 12 + 8) % 6) // 2) self.assertEqual(optimized_div_uop.op, UOps.CONST) self.assertEqual(optimized_div_uop.arg, 1) @@ -124,17 +124,17 @@ class TestModuloAndDivisionFolding(unittest.TestCase): if opt.op is UOps.VECTORIZE: self.assertFalse(all_same(opt.src)) def test_full_graph_rewrite_modulo_large_divisor(self): - x_var_uop = UOp.define_var('x', dtypes.int32, 1, 5) + x_var_uop = UOp.variable('x', 1, 5) self.assertIs(apply_rewrite(x_var_uop % 10), x_var_uop) def test_full_graph_rewrite_division_with_remainder(self): - x_var_uop = UOp.define_var('x', dtypes.int32, 7, 9) + x_var_uop = UOp.variable('x', 7, 9) optimized_sink = apply_rewrite(x_var_uop // 2) for x_value in range(7, 10): self.assertEqual(x_value // 2, evaluate_uop(optimized_sink, {'x': x_value})) def test_full_graph_rewrite_complex_mod_div_expression(self): - x_var_uop = UOp.define_var('x', dtypes.int32, 1, 10) + x_var_uop = UOp.variable('x', 1, 10) optimized_sink = apply_rewrite(((x_var_uop * 5) % 3) // 2) for x_value in range(1, 11): original_result = ((x_value * 5) % 3) // 2 @@ -152,14 +152,14 @@ class TestEdgeCasesAndSpecialOperations(unittest.TestCase): @unittest.skip("broken") def test_full_graph_rewrite_modulo_negative_dividend(self): - x_var_uop = UOp.define_var('x', dtypes.int32, -5, -1) + x_var_uop = UOp.variable('x', -5, -1) optimized_sink = full_graph_rewrite((x_var_uop % 3).sink()) for x_value in range(-5, 0): self.assertEqual(x_value % 3, evaluate_uop(optimized_sink.src[0], {'x': x_value})) @unittest.skip("broken") def test_full_graph_rewrite_division_negative_divisor(self): - x_var_uop = UOp.define_var('x', dtypes.int32, 1, 5) + x_var_uop = UOp.variable('x', 1, 5) optimized_sink = full_graph_rewrite((x_var_uop // -2).sink()) for x_value in range(1, 6): self.assertEqual(x_value // -2, evaluate_uop(optimized_sink.src[0], {'x': x_value})) diff --git a/test/unit/test_shapetracker.py b/test/unit/test_shapetracker.py index 5bd6927b94..0da63f62c1 100644 --- a/test/unit/test_shapetracker.py +++ b/test/unit/test_shapetracker.py @@ -11,7 +11,7 @@ from tinygrad.codegen.uopgraph 
import sym from itertools import product def shapetracker_getitem(st:ShapeTracker, val:int): - idx, valid = st.reshape((st.size,)).to_indexed_uops([UOp.const(dtypes.pyint, val)]) + idx, valid = st.reshape((st.size,)).to_indexed_uops([UOp.const(dtypes.int, val)]) idx, valid = graph_rewrite(idx, sym), graph_rewrite(valid, sym) assert idx.op is UOps.CONST and valid.op is UOps.CONST return idx.arg, valid.arg diff --git a/test/unit/test_simplify_valid_idx.py b/test/unit/test_simplify_valid_idx.py index 85c0af81a5..266d184ddf 100644 --- a/test/unit/test_simplify_valid_idx.py +++ b/test/unit/test_simplify_valid_idx.py @@ -32,7 +32,7 @@ def render(uop:UOp) -> str: return fxn.split("val0 = ")[1].split(";")[0] def Special(expr, nmax): return UOp(UOps.SPECIAL, dtypes.int, (), (expr, nmax)) -def Variable(expr, nmin, nmax): return UOp.define_var(expr, dtypes.int, nmin, nmax) +def Variable(expr, nmin, nmax): return UOp.variable(expr, nmin, nmax) def Range(n, nmax): return UOp(UOps.RANGE, dtypes.int, arg=(n, True), src=(UOp.const(dtypes.int, 0), UOp.const(dtypes.int, nmax),)) diff --git a/test/unit/test_uop_resolve.py b/test/unit/test_uop_resolve.py index 3f84d683d8..81b634750f 100644 --- a/test/unit/test_uop_resolve.py +++ b/test/unit/test_uop_resolve.py @@ -44,7 +44,7 @@ class TestUOpResolve(unittest.TestCase): self.assertEqual((8 * UOp.const(dtypes.int, 4)).ssimplify(), 32) def test_ambiguous_less_than(self): - u = UOp.define_var("i", dtypes.pyint, 1, 10) + u = UOp.variable("i", 1, 10) self.assertTrue(resolve(u < 4)) self.assertFalse(resolve(u < 4, False)) self.assertTrue(resolve(u < 11, False)) @@ -56,64 +56,64 @@ class TestUOpResolve(unittest.TestCase): self.assertEqual(float(u), 11.5) def test_var_cmp_t(self): - u = UOp.define_var("i", dtypes.pyint, 1, 10) < 20 + u = UOp.variable("i", 1, 10) < 20 self.assertTrue(u) def test_var_cmp_t2(self): - u = UOp.define_var("i", dtypes.pyint, 1, 10)//2 < 20 + u = UOp.variable("i", 1, 10)//2 < 20 self.assertTrue(u) def test_var_cmp_f(self): - u = UOp.define_var("i", dtypes.pyint, 1, 10) < 1 + u = UOp.variable("i", 1, 10) < 1 self.assertFalse(u) def test_var_cmp_f2(self): - u = UOp.define_var("i", dtypes.pyint, 1, 10) > 11 + u = UOp.variable("i", 1, 10) > 11 self.assertFalse(u) def test_or_true(self): - u = UOp.define_var("b", dtypes.bool, False, True) | True + u = UOp.variable("b", False, True, dtypes.bool) | True self.assertTrue(u) def test_or_false(self): with self.assertRaises(ValueError): - u = UOp.define_var("b", dtypes.bool, False, True) | False + u = UOp.variable("b", False, True, dtypes.bool) | False self.assertTrue(u) def test_and_false(self): - u = UOp.define_var("b", dtypes.bool, False, True) & False + u = UOp.variable("b", False, True, dtypes.bool) & False self.assertFalse(u) def test_max(self): - x = UOp.define_var("x", dtypes.pyint, 1, 10) - y = UOp.define_var("y", dtypes.pyint, 5, 10) + x = UOp.variable("x", 1, 10) + y = UOp.variable("y", 5, 10) u = x.max(y) self.assertTrue(u < 20) self.assertFalse(u < 3) def test_x_lt_x(self): - x = UOp.define_var("i", dtypes.pyint, 1, 10) + x = UOp.variable("i", 1, 10) self.assertFalse(x < x) @unittest.expectedFailure def test_x_lt_xp1(self): - x = UOp.define_var("i", dtypes.pyint, 1, 10) + x = UOp.variable("i", 1, 10) self.assertTrue(x < (x+1)) def test_and_true(self): with self.assertRaises(ValueError): - u = UOp.define_var("b", dtypes.bool, False, True) & True + u = UOp.variable("b", False, True, dtypes.bool) & True self.assertFalse(u) @unittest.expectedFailure def test_var_cmp_range(self): - v = 
UOp.define_var("i", dtypes.pyint, 1, 10) + v = UOp.variable("i", 1, 10) u = (v > 4) | (v < 6) self.assertTrue(u) def test_var_cmp_assert(self): with self.assertRaises(ValueError): - u = UOp.define_var("i", dtypes.pyint, 1, 10) < 5 + u = UOp.variable("i", 1, 10) < 5 self.assertFalse(u) if __name__ == '__main__': diff --git a/test/unit/test_uop_vmin_vmax.py b/test/unit/test_uop_vmin_vmax.py index 6728e2d957..fbac2895ae 100644 --- a/test/unit/test_uop_vmin_vmax.py +++ b/test/unit/test_uop_vmin_vmax.py @@ -19,36 +19,36 @@ class TestVminVmaxProperties(unittest.TestCase): def test_vmin_vmax_addition_with_variable(self): # vmin and vmax for addition with a variable - x = UOp.define_var('x', dtypes.int32, 10, 20) + x = UOp.variable('x', 10, 20) uop = x + 5 self.assertEqual(uop.vmin, 15) self.assertEqual(uop.vmax, 25) def test_vmin_vmax_multiplication_with_variable(self): # vmin and vmax for multiplication with a variable - x = UOp.define_var('x', dtypes.int32, -3, 4) + x = UOp.variable('x', -3, 4) uop = x * 2 self.assertEqual(uop.vmin, -6) self.assertEqual(uop.vmax, 8) def test_vmin_vmax_with_negative_multiplication(self): # vmin and vmax when multiplying by a negative number - x = UOp.define_var('x', dtypes.int32, 2, 5) + x = UOp.variable('x', 2, 5) uop = x * -3 self.assertEqual(uop.vmin, -15) self.assertEqual(uop.vmax, -6) def test_vmin_vmax_nested_min_max(self): # vmin and vmax with nested min/max operations - x = UOp.define_var('x', dtypes.int32, 0, 10) + x = UOp.variable('x', 0, 10) uop = x.max(5).min(8) self.assertEqual(uop.vmin, 5) self.assertEqual(uop.vmax, 8) def test_vmin_vmax_where(self): - x = UOp.define_var('x', dtypes.int, 0, 10) - y = UOp.define_var('y', dtypes.int, 1, 11) - z = UOp.define_var('z', dtypes.int, 2, 12) + x = UOp.variable('x', 0, 10) + y = UOp.variable('y', 1, 11) + z = UOp.variable('z', 2, 12) uop = x.lt(5).where(y, z) self.assertEqual(uop.vmin, 1) self.assertEqual(uop.vmax, 12) @@ -56,21 +56,21 @@ class TestVminVmaxProperties(unittest.TestCase): class TestVminVmaxDivMod(unittest.TestCase): def test_vmin_vmax_division_positive(self): # vmin and vmax for division of a variable by a positive constant - x = UOp.define_var('x', dtypes.int32, 10, 20) + x = UOp.variable('x', 10, 20) uop = x // 2 self.assertEqual(uop.vmin, 5) self.assertEqual(uop.vmax, 10) def test_vmin_vmax_division_negative(self): # vmin and vmax for division of a variable by a negative constant - x = UOp.define_var('x', dtypes.int32, 10, 20) + x = UOp.variable('x', 10, 20) uop = x // -2 self.assertEqual(uop.vmin, -10) self.assertEqual(uop.vmax, -5) def test_vmin_vmax_mod_positive(self): # vmin and vmax for modulo of a variable by a positive constant - x = UOp.define_var('x', dtypes.int32, 10, 20) + x = UOp.variable('x', 10, 20) uop = x % 3 self.assertEqual(uop.vmin, 0) self.assertEqual(uop.vmax, 2) @@ -78,21 +78,21 @@ class TestVminVmaxDivMod(unittest.TestCase): @unittest.skip("broken") def test_vmin_vmax_mod_negative(self): # vmin and vmax for modulo of a variable by a negative constant - x = UOp.define_var('x', dtypes.int32, 10, 20) + x = UOp.variable('x', 10, 20) uop = x % -3 self.assertEqual(uop.vmin, -2) self.assertEqual(uop.vmax, 0) def test_vmin_vmax_division_with_mixed_range(self): # vmin and vmax for division of a variable with a range crossing zero - x = UOp.define_var('x', dtypes.int32, -10, 10) + x = UOp.variable('x', -10, 10) uop = x // 3 self.assertEqual(uop.vmin, -4) # -10//3 = -4 self.assertEqual(uop.vmax, 3) # 10//3 = 3 def test_vmin_vmax_mod_with_mixed_range(self): # vmin and vmax for 
modulo of a variable with a range crossing zero - x = UOp.define_var('x', dtypes.int32, -10, 10) + x = UOp.variable('x', -10, 10) uop = x % 4 self.assertEqual(uop.vmin, 0) # modulo always positive or zero when divisor is positive self.assertEqual(uop.vmax, 3) # max possible mod is 3 when dividing by 4 @@ -146,26 +146,26 @@ class TestConstFactor(unittest.TestCase): def test_const_factor_with_variable(self): # const_factor for an expression involving a variable - x = UOp.define_var('x', dtypes.int32, 10, 20) + x = UOp.variable('x', 10, 20) uop = x * 3 self.assertEqual(uop.const_factor(), 3) def test_const_factor_division(self): # const_factor for an expression with division - x = UOp.define_var('x', dtypes.int32, 10, 20) + x = UOp.variable('x', 10, 20) uop = x // 4 self.assertEqual(uop.const_factor(), 1) # Division reduces the const_factor to 1 def test_const_factor_multiplication_of_var_and_const(self): # const_factor for multiplication of a variable and a constant - x = UOp.define_var('x', dtypes.int32, 6, 18) + x = UOp.variable('x', 6, 18) uop = x * 4 self.assertEqual(uop.const_factor(), 4) # Constant factor 4 @unittest.skip("broken") def test_const_factor_multiplication_of_consts_and_vars(self): # Multiplying constants and variables - x = UOp.define_var('x', dtypes.int32, 10, 20) + x = UOp.variable('x', 10, 20) uop = (x * 3) * 5 self.assertEqual(uop.const_factor(), 15) # Constant multipliers are combined (3 * 5 = 15) @@ -186,7 +186,7 @@ class TestDivides(unittest.TestCase): @unittest.skip("broken") def test_divides_variable_and_constant(self): # Multiplying a variable by a constant, then dividing by the same constant - x = UOp.define_var('x', dtypes.int32, 10, 20) + x = UOp.variable('x', 10, 20) uop = x * 6 result = uop.divides(6) self.assertIsNotNone(result) @@ -194,7 +194,7 @@ class TestDivides(unittest.TestCase): def test_divides_complex_expression(self): # Dividing a more complex expression - x = UOp.define_var('x', dtypes.int32, 10, 20) + x = UOp.variable('x', 10, 20) uop = (x * 6) + 18 result = uop.divides(6) self.assertIsNotNone(result) @@ -202,7 +202,7 @@ class TestDivides(unittest.TestCase): def test_divides_with_inexact_factors(self): # Multiplying by a constant but dividing by a non-exact divisor - x = UOp.define_var('x', dtypes.int32, 15, 45) + x = UOp.variable('x', 15, 45) uop = x * 4 result = uop.divides(3) self.assertIsNone(result) # Cannot divide by 3, since 4 is not divisible by 3 diff --git a/tinygrad/codegen/lowerer.py b/tinygrad/codegen/lowerer.py index 96a1dbb320..59e009b646 100644 --- a/tinygrad/codegen/lowerer.py +++ b/tinygrad/codegen/lowerer.py @@ -33,7 +33,7 @@ def _limit_dims(dims:Tuple[sint, ...], max_sizes:Tuple[int, ...]): def get_grouped_dims(prefix, dims:Tuple[sint, ...], max_sizes:Optional[Tuple[int, ...]], reverse=False) -> List[UOp]: if reverse: dims = dims[::-1] limited = _limit_dims(dims, max_sizes) if max_sizes is not None else dims - ret = raw_idxs = [UOp(UOps.SPECIAL, dtypes.pyint, (), (f"{prefix}{i}", s)) for i,s in enumerate(limited)] + ret = raw_idxs = [UOp(UOps.SPECIAL, dtypes.int, (), (f"{prefix}{i}", s)) for i,s in enumerate(limited)] if limited != dims: ret = [] # cast for mypy, get_contraction won't be None @@ -75,22 +75,22 @@ def get_index(ast:UOp, opts:Renderer) -> IndexContext: get_grouped_dims("lidx", full_shape[global_dims:first_reduce+group_for_reduces], opts.local_max) else: # all loops are RANGES - idxs = [UOp(UOps.RANGE, dtypes.pyint, (UOp.const(dtypes.pyint, 0), variable_to_uop(g)), (i, False)) + idxs = [UOp(UOps.RANGE, 
dtypes.int, (UOp.const(dtypes.int, 0), variable_to_uop(g)), (i, False)) for i,g in enumerate(full_shape[:first_reduce])] # reduce loops - idxs += [UOp(UOps.RANGE, dtypes.pyint, (UOp.const(dtypes.pyint, 0), variable_to_uop(g)), (i, True)) + idxs += [UOp(UOps.RANGE, dtypes.int, (UOp.const(dtypes.int, 0), variable_to_uop(g)), (i, True)) for i,g in enumerate(full_shape[first_reduce+group_for_reduces:first_upcasted], start=first_reduce+group_for_reduces)] # upcast loops for i,g in enumerate(full_shape[first_upcasted:], start=first_upcasted): assert isinstance(g, int), "needs to be int to upcast/unroll" - idxs.append(UOp(UOps.EXPAND, dtypes.pyint, (UOp.const(dtypes.pyint.vec(g), tuple(range(g))),), ((i,g),))) + idxs.append(UOp(UOps.EXPAND, dtypes.int, (UOp.const(dtypes.int.vec(g), tuple(range(g))),), ((i,g),))) # late indexes (group for reduce) ridxs = idxs[:] for a in range(first_reduce, first_reduce+group_for_reduces): - ridxs[a] = UOp(UOps.RANGE, dtypes.pyint, (UOp.const(dtypes.pyint, 0), variable_to_uop(full_shape[a])), (1000+a, True)) + ridxs[a] = UOp(UOps.RANGE, dtypes.int, (UOp.const(dtypes.int, 0), variable_to_uop(full_shape[a])), (1000+a, True)) return IndexContext(idxs, ridxs) diff --git a/tinygrad/codegen/uopgraph.py b/tinygrad/codegen/uopgraph.py index 6d6d3ce7bc..ef9977c5c6 100644 --- a/tinygrad/codegen/uopgraph.py +++ b/tinygrad/codegen/uopgraph.py @@ -118,9 +118,9 @@ def idx_given_valid(valid:UOp, idx:UOp) -> Optional[UOp]: candidates = [] if uop.op is UOps.ALU and uop.arg is BinaryOps.ADD and all(is_irreducible(u) and u.vmin == 0 for u in _get_chain(uop, BinaryOps.ADD)): # if the constraint is a simplex: X0 + X1 + ... > 0, we can check if all Xi > 0 simplify into the same output - candidates.append([(Xi, UOp.define_var("fake", Xi.dtype, 1, Xi.vmax)) for Xi in _get_chain(uop, BinaryOps.ADD)]) + candidates.append([(Xi, UOp.variable("fake", 1, Xi.vmax, Xi.dtype)) for Xi in _get_chain(uop, BinaryOps.ADD)]) # try checking the whole clause - candidates.append([(uop, UOp.define_var("fake", uop.dtype, uop.vmin if v[0] is None else v[0], uop.vmax if v[1] is None else v[1]))]) + candidates.append([(uop, UOp.variable("fake", uop.vmin if v[0] is None else v[0], uop.vmax if v[1] is None else v[1], uop.dtype))]) for candidate in candidates: newidxs:List[List[UOp]] = [[], []] @@ -538,9 +538,6 @@ reducer = PatternMatcher([ (UPat(UOps.LOAD, src=(UPat.var("buf"), UPat()), allow_any_len=True, name="load"), simplify_valid_image_load), ]) -no_pyint = PatternMatcher([(UPat((UOps.CONST, UOps.VCONST, UOps.ALU, UOps.SPECIAL, UOps.RANGE, UOps.EXPAND, UOps.VECTORIZE, UOps.DEFINE_VAR), - name="x"), lambda x: UOp(x.op, dtypes.int32.vec(x.dtype.count), x.src, x.arg) if x.dtype.scalar() == dtypes.pyint else None)]) - # *** uop graph *** linearize_cnt = 0 @@ -552,9 +549,6 @@ def full_graph_rewrite(sink:UOp, opts:Optional[Renderer]=None) -> UOp: acc_number = 0 sink = graph_rewrite(sink, sym) - # rewrite pyint to int32 - sink = graph_rewrite(sink, no_pyint) - # expand linearize_cnt += 1 if linearize_cnt != (de:=getenv("DEBUG_EXPAND", 0)) and de != -1: diff --git a/tinygrad/dtype.py b/tinygrad/dtype.py index 10707e4006..8a2adb18ff 100644 --- a/tinygrad/dtype.py +++ b/tinygrad/dtype.py @@ -86,7 +86,6 @@ class dtypes: def fields() -> Dict[str, DType]: return DTYPES_DICT # TODO: priority should be higher than bool void: Final[DType] = DType(-1, 0, "void", None, 1) - pyint: Final[DType] = DType(-1, 8, "pyint", None, 1) # arbitrary precision integer, same itemsize to int64 so min/max works bool: Final[DType] = DType(0, 
1, "bool", '?', 1) int8: Final[DType] = DType(1, 1, "char", 'b', 1) uint8: Final[DType] = DType(2, 1, "unsigned char", 'B', 1) @@ -118,7 +117,7 @@ class dtypes: floats = (float16, bfloat16, float32, float64) uints = (uint8, uint16, uint32, uint64) - sints = (int8, int16, int32, int64, pyint) + sints = (int8, int16, int32, int64) ints = uints + sints if (env_default_float := getenv("DEFAULT_FLOAT", "")): diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 10c95caeac..cbf0cf0c47 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -282,11 +282,8 @@ class UOp(MathTrait): out_dtype = dtypes.bool.vec(out_dtype.count) if out_dtype.count > 1 else dtypes.bool return UOp(UOps.ALU, out_dtype, (self,)+src, arg) @staticmethod - def const(dtype:DType, b:Tuple[ConstType, ...]|ConstType|Variable): return UOp._const(dtype, b) - @staticmethod - def _const(dtype:DType, b:Tuple[ConstType, ...]|ConstType|Variable): + def const(dtype:DType, b:Tuple[ConstType, ...]|ConstType|Variable): # TODO: fix dtype of b.max after Variable is just an UOp - #if isinstance(b, Variable): return UOp.define_var(b.expr, dtype, b.min, cast(int, b.max)) if isinstance(b, UOp): return b.unbind()[0] if b.op is UOps.BIND else b if isinstance(b, tuple) and all_same(b): b = b[0] # doesn't have to be a VCONST if they are all the same return UOp(UOps.VCONST if isinstance(b, tuple) else UOps.CONST, dtype, arg=dtypes.as_const(b, dtype) if dtype is not None else b) # type: ignore @@ -299,9 +296,8 @@ class UOp(MathTrait): # *** uop Variable stuff *** @staticmethod - def variable(name:str, min_val:ConstType, max_val:ConstType): return UOp(UOps.DEFINE_VAR, dtypes.int, arg=(name, min_val, max_val)) - @staticmethod - def define_var(name:str, dtype:DType, min_val:ConstType, max_val:ConstType): return UOp(UOps.DEFINE_VAR, dtype, arg=(name, min_val, max_val)) + def variable(name:str, min_val:ConstType, max_val:ConstType, dtype:DType=dtypes.int): + return UOp(UOps.DEFINE_VAR, dtype, arg=(name, min_val, max_val)) @property def expr(self): assert self.op is UOps.DEFINE_VAR, f"op is {self.op}, need DEFINE_VAR" @@ -694,9 +690,6 @@ spec = PatternMatcher([ (UPat(UOps.RANGE, src=(UPat(name="x"), UPat(name="y")), name="rng"), lambda rng,x,y: rng.dtype == x.dtype == y.dtype), (UPat(UOps.SPECIAL, src=()), lambda: True), - # no pyint allowed here! 
- (UPat(UOps.ALU, dtype=dtypes.pyint), lambda: False), - # TODO: confirm the args of both of these are shapetrackers (UPat(UOps.VIEW, src=()), lambda: True), (UPat(UOps.VIEW, src=(UPat(),)), lambda: True), diff --git a/tinygrad/shape/view.py b/tinygrad/shape/view.py index eea20ea490..0fe9a78e48 100644 --- a/tinygrad/shape/view.py +++ b/tinygrad/shape/view.py @@ -82,7 +82,7 @@ def un1d(shape:Tuple[sint, ...], offs:sint) -> List[sint]: offs -= here * stride return result -def variable_to_uop(x, ctx=None) -> UOp: return UOp.const(dtypes.pyint, x) if isinstance(x, int) else x +def variable_to_uop(x, ctx=None) -> UOp: return UOp.const(dtypes.int, x) if isinstance(x, int) else x @dataclass(frozen=True) class View: @@ -93,7 +93,7 @@ class View: contiguous:bool def to_indexed_uops(self:View, _idxs:Optional[List[UOp]]=None, vexpr:UOp=UOp.const(dtypes.bool, True)) -> Tuple[UOp, UOp]: - idxs = [UOp.range(dtypes.pyint, 0, s, i) for i,s in enumerate(self.shape)] if _idxs is None else _idxs + idxs = [UOp.range(dtypes.int, 0, s, i) for i,s in enumerate(self.shape)] if _idxs is None else _idxs iexpr = variable_to_uop(self.offset) for idx,sh,st,m in zip(idxs, self.shape, self.strides, self.mask if self.mask is not None else [None]*len(self.shape)): if resolve(sh != 1) and resolve(st != 0): iexpr = iexpr + idx*st From 23faeacb23a65eeec0d36bdf47ec74e8567878b2 Mon Sep 17 00:00:00 2001 From: chenyu Date: Sat, 12 Oct 2024 10:51:07 -0400 Subject: [PATCH 13/17] remove outdated comments (#7018) --- tinygrad/dtype.py | 1 - tinygrad/function.py | 1 - tinygrad/ops.py | 1 - 3 files changed, 3 deletions(-) diff --git a/tinygrad/dtype.py b/tinygrad/dtype.py index 8a2adb18ff..a440754e8d 100644 --- a/tinygrad/dtype.py +++ b/tinygrad/dtype.py @@ -84,7 +84,6 @@ class dtypes: return {dtypes.float16: (5, 10), dtypes.bfloat16: (8, 7), dtypes.float32: (8, 23), dtypes.float64: (11, 52)}[dtype] @staticmethod def fields() -> Dict[str, DType]: return DTYPES_DICT - # TODO: priority should be higher than bool void: Final[DType] = DType(-1, 0, "void", None, 1) bool: Final[DType] = DType(0, 1, "bool", '?', 1) int8: Final[DType] = DType(1, 1, "char", 'b', 1) diff --git a/tinygrad/function.py b/tinygrad/function.py index b27a2e6a4f..70169cb858 100644 --- a/tinygrad/function.py +++ b/tinygrad/function.py @@ -39,7 +39,6 @@ class Sin(Function): def backward(self, grad_output:LazyBuffer) -> LazyBuffer: return (math.pi/2 - self.x).sin() * grad_output -# NOTE: maximum(x, 0) behaves differently where x=0 class Relu(Function): def forward(self, x:LazyBuffer) -> LazyBuffer: self.ret = x.max(0) diff --git a/tinygrad/ops.py b/tinygrad/ops.py index cbf0cf0c47..b2e5e00fde 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -283,7 +283,6 @@ class UOp(MathTrait): return UOp(UOps.ALU, out_dtype, (self,)+src, arg) @staticmethod def const(dtype:DType, b:Tuple[ConstType, ...]|ConstType|Variable): - # TODO: fix dtype of b.max after Variable is just an UOp if isinstance(b, UOp): return b.unbind()[0] if b.op is UOps.BIND else b if isinstance(b, tuple) and all_same(b): b = b[0] # doesn't have to be a VCONST if they are all the same return UOp(UOps.VCONST if isinstance(b, tuple) else UOps.CONST, dtype, arg=dtypes.as_const(b, dtype) if dtype is not None else b) # type: ignore From 5ce224ceb316ff15092161c85cbfbdb208b9e597 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Sat, 12 Oct 2024 23:16:27 +0800 Subject: [PATCH 14/17] handle arbitrary multiplication case (#7019) * handle arbitrary multiplication case * remove count 
restriction --- tinygrad/ops.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tinygrad/ops.py b/tinygrad/ops.py index b2e5e00fde..729601c5c1 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -354,17 +354,20 @@ class UOp(MathTrait): if self.op is UOps.SPECIAL: return 0, self.arg[1]-1 if isinstance(self.arg[1], int) else dtypes.max(self.dtype) if self.op is UOps.CONST: return self.arg, self.arg if self.op is UOps.VCONST: return (min(self.arg), max(self.arg)) - if self.op is UOps.ALU and self.dtype.count == 1: + if self.op is UOps.ALU: s0,s1,s2 = [cast(UOp, self.src[i] if i < len(self.src) else None) for i in range(3)] if self.arg is BinaryOps.ADD: return s0.vmin+s1.vmin, s0.vmax+s1.vmax if self.arg is BinaryOps.MUL: # both are non-positive if (s0.vmax <= 0 and s1.vmax <= 0): return s0.vmax*s1.vmax, s0.vmin*s1.vmin - # at lease one is non-negative + # at least one is non-negative if (s0.vmin >= 0 or s1.vmin >= 0): Lmin, Lmax = (s0.vmin, s0.vmax) if s1.vmin >= 0 else (s0.vmax, s0.vmin) Rmin, Rmax = (s1.vmin, s1.vmax) if s0.vmin >= 0 else (s1.vmax, s1.vmin) return Lmin*Rmin, Lmax*Rmax + # arbitrary + products = [s0.vmin * s1.vmin, s0.vmin * s1.vmax, s0.vmax * s1.vmin, s0.vmax * s1.vmax] + return min(products), max(products) if self.arg is BinaryOps.MOD and s1.vmin > 0: return 0, s1.vmax-1 if self.arg is BinaryOps.IDIV and s1.op is UOps.CONST: if s1.arg > 0: return s0.vmin//s1.arg, s0.vmax//s1.arg From cae1c417553a95a7527444ae1a772b5fbd0b6b9f Mon Sep 17 00:00:00 2001 From: chenyu Date: Sat, 12 Oct 2024 15:46:32 -0400 Subject: [PATCH 15/17] test case of softmax backward kernel count (#7022) --- test/test_schedule.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test/test_schedule.py b/test/test_schedule.py index f63c66c334..29a972c071 100644 --- a/test/test_schedule.py +++ b/test/test_schedule.py @@ -7,19 +7,17 @@ import numpy as np import functools from typing import List, Optional, Union, cast -from tinygrad import nn, dtypes -from tinygrad.device import Device +from tinygrad import nn, dtypes, Device, Tensor from tinygrad.dtype import DType, PtrDType from tinygrad.shape.shapetracker import ShapeTracker from tinygrad.shape.view import View -from tinygrad.tensor import Tensor from tinygrad.ops import BinaryOps, MetaOps, UOp, UnaryOps, UOps, graph_rewrite from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, GlobalCounters, flatten, getenv, SPLIT_REDUCEOP, unwrap, prod from tinygrad.codegen.kernel import Kernel, verify_ast from tinygrad.engine.schedule import BUF_LIMIT, create_schedule, reduceop_fusor, st_fixup from tinygrad.engine.realize import CompiledRunner, run_schedule -from test.helpers import ast_const, is_dtype_supported, Context, timeit from tinygrad.engine.lazy import LazyBuffer, view_supported_devices +from test.helpers import ast_const, is_dtype_supported, Context, timeit from extra.models.llama import precompute_freqs_cis class KernelCountException(Exception): pass @@ -876,11 +874,16 @@ class TestSchedule(unittest.TestCase): Tensor.manual_seed(0) x = Tensor.randn(4, 12, 64, 64).realize() out = x.softmax() - # run_schedule(check_schedule(out, 2)) run_schedule(check_schedule(out, 3)) expected = (x_exp:=np.exp(x.numpy()-x.numpy().max(-1, keepdims=True)))/x_exp.sum(-1, keepdims=True) np.testing.assert_allclose(out.numpy(), expected, atol=1e-4, rtol=1e-4) + def test_softmax_backward(self): + Tensor.manual_seed(0) + x = Tensor.randn(4, 12, 64, 64, requires_grad=True).realize() + x.softmax().sum().backward() + 
run_schedule(check_schedule(x.grad, 6))
+
   # changed by: multireduce spec
   def test_layernorm_onelayer_fusion(self):
     Tensor.manual_seed(0)

From 04d9b46d5162e857df3d03994d539e825ed7c838 Mon Sep 17 00:00:00 2001
From: chenyu
Date: Sat, 12 Oct 2024 15:59:23 -0400
Subject: [PATCH 16/17] derivative of softmax is independent of max (#7009)

* derivative of softmax is independent of max

* update test
---
 test/test_schedule.py | 2 +-
 tinygrad/tensor.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_schedule.py b/test/test_schedule.py
index 29a972c071..19466bd6e6 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -882,7 +882,7 @@ class TestSchedule(unittest.TestCase):
     Tensor.manual_seed(0)
     x = Tensor.randn(4, 12, 64, 64, requires_grad=True).realize()
     x.softmax().sum().backward()
-    run_schedule(check_schedule(x.grad, 6))
+    run_schedule(check_schedule(x.grad, 4))

   # changed by: multireduce spec
   def test_layernorm_onelayer_fusion(self):
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 357866e43f..60f13f804c 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -1681,7 +1681,7 @@ class Tensor:

   def _softmax(self, axis, dtype:Optional[DTypeLike]=None):
     x = self.cast(dtype) if dtype is not None else self
-    m = x - x.max(axis=axis, keepdim=True)
+    m = x - x.max(axis=axis, keepdim=True).detach()
     e = m.exp()
     return m, e, e.sum(axis=axis, keepdim=True)

From 942a17109aa6ec609f8b3a5e08e24dee22725eaf Mon Sep 17 00:00:00 2001
From: nimlgen <138685161+nimlgen@users.noreply.github.com>
Date: Sat, 12 Oct 2024 23:44:36 +0300
Subject: [PATCH 17/17] qcom use QCOMBuffer for all allocated buffers (#7023)

* qcom use QCOMBuffer for all allocated buffers

* checks
---
 tinygrad/runtime/ops_qcom.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py
index a54fb97e87..7e2577a8aa 100644
--- a/tinygrad/runtime/ops_qcom.py
+++ b/tinygrad/runtime/ops_qcom.py
@@ -281,8 +281,11 @@ class QCOMProgram(HCQProgram):
     if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferOptions(cpu_access=True, nolru=True))

 class QCOMBuffer(HCQBuffer):
-  def __init__(self, va_addr:int, size:int, desc=None, ibo=None, pitch=None, real_stride=None):
-    self.va_addr, self.size, self.desc, self.ibo, self.pitch, self.real_stride = va_addr, size, desc, ibo, pitch, real_stride
+  def __init__(self, va_addr:int, size:int, info=None, mapped=False, desc=None, ibo=None, pitch=None, real_stride=None, **kwargs):
+    self.va_addr, self.size, self.info, self.mapped = va_addr, size, info, mapped
+
+    # Texture specific definitions
+    self.desc, self.ibo, self.pitch, self.real_stride = [0] * 16, [0] * 16, pitch, real_stride

 class QCOMAllocator(HCQAllocator):
   def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
@@ -298,8 +301,7 @@
     if options.external_ptr: texture = QCOMBuffer(options.external_ptr, size)
     else: texture = self.device._gpu_alloc(pitch * imgh, kgsl.KGSL_MEMTYPE_TEXTURE, map_to_cpu=True)

-    # Extend HCQBuffer with texture-related info.
- texture.pitch, texture.real_stride, texture.desc, texture.ibo = pitch, real_stride, [0] * 16, [0] * 16 + texture.pitch, texture.real_stride = pitch, real_stride tex_fmt = adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT texture.desc[0] = qreg.a6xx_tex_const_0(swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt) @@ -318,12 +320,12 @@ class QCOMAllocator(HCQAllocator): src_off, dest_off = src_off+src_stride, dest_off+dest_stride def copyin(self, dest:HCQBuffer, src:memoryview): - if hasattr(qd:=cast(QCOMBuffer, dest), 'pitch'): self._do_copy(mv_address(src), qd.va_addr, len(src), qd.real_stride, qd.real_stride, qd.pitch) + if (qd:=cast(QCOMBuffer, dest)).pitch is not None: self._do_copy(mv_address(src), qd.va_addr, len(src), qd.real_stride, qd.real_stride, qd.pitch) else: ctypes.memmove(dest.va_addr, mv_address(src), src.nbytes) def copyout(self, dest:memoryview, src:HCQBuffer): self.device.synchronize() - if hasattr(qs:=cast(QCOMBuffer, src), 'pitch'): self._do_copy(qs.va_addr, mv_address(dest), qs.size, qs.real_stride, qs.pitch, qs.real_stride) + if (qs:=cast(QCOMBuffer, src)).pitch is not None: self._do_copy(qs.va_addr, mv_address(dest), qs.size, qs.real_stride, qs.pitch, qs.real_stride) else: ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes) def as_buffer(self, src:HCQBuffer) -> memoryview: @@ -379,7 +381,7 @@ class QCOMDevice(HCQCompiled): va_addr = libc.mmap(va_addr, va_len, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.fd, alloc.id * 0x1000) if fill_zeroes: ctypes.memset(va_addr, 0, va_len) - return SimpleNamespace(va_addr=va_addr, size=size, mapped=map_to_cpu, info=alloc) + return QCOMBuffer(va_addr=va_addr, size=size, mapped=map_to_cpu, info=alloc) def _gpu_free(self, mem): kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.info.id)
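
A note on the fallback added in [PATCH 14/17]: over two intervals, a product is always extremized at the interval endpoints, so when neither operand of a MUL has a fixed sign the min and max of the four corner products still bound the result; the sign-specialized branches above it are the same rule with the winning corners known in advance. A minimal standalone sketch of the interval rule for concrete bounds (Python, hypothetical names, independent of the UOp machinery):

def mul_bounds(lmin:int, lmax:int, rmin:int, rmax:int) -> tuple:
  # the product over [lmin, lmax] x [rmin, rmax] is extremized at a corner,
  # e.g. [-2, 3] * [-5, 4]: corners are 10, -8, -15, 12 -> bounds (-15, 12)
  products = [lmin*rmin, lmin*rmax, lmax*rmin, lmax*rmax]
  return min(products), max(products)

assert mul_bounds(-2, 3, -5, 4) == (-15, 12)  # mixed signs: the new fallback path
assert mul_bounds(1, 2, 3, 4) == (3, 8)       # both non-negative: matches the fast path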
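A note on [PATCH 16/17]: the max subtraction in _softmax exists only for numerical stability and cancels out of the forward value, so no gradient has to flow through it. For any constant c,

\mathrm{softmax}(x)_i = \frac{e^{x_i - c}}{\sum_j e^{x_j - c}} = \frac{e^{-c}\,e^{x_i}}{e^{-c}\sum_j e^{x_j}} = \frac{e^{x_i}}{\sum_j e^{x_j}}

Since the output is independent of c, taking c = max(x) and detaching it leaves the derivative unchanged while pruning the max reduction from the backward graph, which is what lowers the expected kernel count in test_softmax_backward from 6 to 4.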
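A note on [PATCH 17/17]: once _gpu_alloc returns a QCOMBuffer for every allocation, texture-only fields such as pitch exist on all buffers and default to None, so copyin/copyout can branch on "pitch is not None" instead of probing with hasattr; the **kwargs in QCOMBuffer.__init__ presumably absorbs any extra fields callers used to stash on the old SimpleNamespace. A minimal sketch of the pattern (Python, hypothetical names, not the real QCOM classes):

class Buf:
  def __init__(self, va_addr:int, size:int, pitch=None, real_stride=None, **kwargs):
    # one type for every buffer; image-only fields simply stay None on linear buffers
    self.va_addr, self.size, self.pitch, self.real_stride = va_addr, size, pitch, real_stride

def copy_kind(buf:Buf) -> str:
  # a None check on a uniform field replaces the fragile hasattr probe
  return "image" if buf.pitch is not None else "linear"

assert copy_kind(Buf(0x1000, 64)) == "linear"
assert copy_kind(Buf(0x1000, 4096, pitch=256, real_stride=16)) == "image"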