diff --git a/tinygrad/codegen/optimizer.py b/tinygrad/codegen/optimizer.py
index 399c6d641e..65902eb71d 100644
--- a/tinygrad/codegen/optimizer.py
+++ b/tinygrad/codegen/optimizer.py
@@ -337,10 +337,6 @@ class OptimizedKernel(Kernel):
     # no more opt if we are grouping
     if self.group_for_reduce: return
 
-    # no more opt if there's non ints in any shapes
-    # TODO: this is due to a bug. repro by commenting this one while running GPT-2 with the JIT
-    if self.has_variable_shape(): return
-
     # **** below this line need to be optional and benchmarked ****
 
     # if there are small dims with lots of valid masks, upcast them (they might be from Tensor.stack)
@@ -349,7 +345,8 @@ class OptimizedKernel(Kernel):
     # upcast leading axes first (hack-ish for winograd; we actually want to upcast masked axes with low stride first)
     for axis in range(self.first_reduce):
       # we might want to be able to split axes that are masked, or refuse to merge them in simplify_merge_adjacent
-      if self.full_shape[axis] <= 7 and any(st.axis_is_masked(axis) for st in self.sts) and prod(self.full_shape[self.shape_len - self.upcasted:]) * self.full_shape[axis] <= 7 * 7:
+      # for now skip upcasting here if there is a symbolic axis
+      if isinstance(self.full_shape[axis], int) and self.full_shape[axis] <= 7 and any(st.axis_is_masked(axis) for st in self.sts) and prod(self.full_shape[self.shape_len - self.upcasted:]) * self.full_shape[axis] <= 7 * 7:
         if DEBUG >= 4: print(f"upcasting masked axis : {axis}")
         to_upcast.append(axis)
     for axis in to_upcast[::-1]:
@@ -408,4 +405,4 @@ class OptimizedKernel(Kernel):
     for axis, local_sz in sorted(to_local[:3]):
       self.shift_to(axis, local_sz, insert_before=self.first_reduce)
       self.local_dims += 1
-    self.simplify_ones()
+    self.simplify_ones()
\ No newline at end of file
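
For context on the `isinstance(self.full_shape[axis], int)` guard: with symbolic shapes, an axis length can be a symbolic variable rather than a plain `int`, so the "small axis" heuristic (`<= 7`) cannot be evaluated at codegen time and the axis should simply be skipped. A minimal standalone sketch of that pattern (not tinygrad's actual classes; `SymVar` and `axes_to_upcast` are hypothetical stand-ins, tinygrad's real symbolic nodes live in `tinygrad.shape.symbolic`):

```python
from dataclasses import dataclass
from typing import List, Union

@dataclass
class SymVar:
    # stand-in for a symbolic shape variable whose concrete value is
    # only known at runtime, within [min_val, max_val]
    name: str
    min_val: int
    max_val: int

Dim = Union[int, SymVar]

def axes_to_upcast(full_shape: List[Dim], limit: int = 7) -> List[int]:
    to_upcast = []
    for axis, dim in enumerate(full_shape):
        # skip symbolic axes: their concrete size is unknown at codegen
        # time, so a numeric comparison like `dim <= limit` is meaningless
        if isinstance(dim, int) and dim <= limit:
            to_upcast.append(axis)
    return to_upcast

print(axes_to_upcast([4, SymVar("seq_len", 1, 1024), 3]))  # -> [0, 2]
```

Guarding each heuristic individually like this is what lets the diff drop the blanket `has_variable_shape()` early-return while keeping symbolic-shape kernels (e.g. GPT-2 under the JIT) safe from the small-axis upcast.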