diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 50a202ebe2..86c582afc2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,12 @@
 repos:
   - repo: local
     hooks:
+      - id: ruff
+        name: ruff
+        entry: ruff .  # checks the current directory tree (`.`) rather than individual files
+        language: system  # uses whatever `ruff` is already on PATH; pre-commit does not install it
+        always_run: true  # run even when no staged file matches this hook
+        pass_filenames: false  # staged filenames are not appended to `entry`
       - id: docs
         name: docs
         entry: python3 docs/abstractions.py
@@ -15,19 +21,13 @@ repos:
         pass_filenames: false
       - id: mypy
         name: mypy
-        entry: mypy tinygrad/ extra/helpers.py # --warn-return-any
+        entry: mypy tinygrad/ extra/helpers.py
         language: system
         always_run: true
         pass_filenames: false
       - id: tests
         name: subset of (CPU) tests
-        entry: env CPU=1 pytest test/unit/ test/test_ops.py test/test_dtype.py test/test_schedule.py
-        language: system
-        always_run: true
-        pass_filenames: false
-      - id: pylint
-        name: pylint
-        entry: pylint tinygrad/
+        entry: env CPU=1 pytest test/unit/ test/test_ops.py test/test_dtype.py test/test_schedule.py test/test_custom_function.py test/test_assign.py test/test_symbolic_shapetracker.py
         language: system
         always_run: true
         pass_filenames: false
diff --git a/examples/handcode_resnet50_opt.py b/examples/handcode_resnet50_opt.py
index c81f43118d..04fec585d5 100644
--- a/examples/handcode_resnet50_opt.py
+++ b/examples/handcode_resnet50_opt.py
@@ -6,7 +6,7 @@ from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.codegen.linearizer import Linearizer
 from tinygrad.runtime.ops_metal import renderer, MetalProgram, RawMetalBuffer
 from tinygrad.helpers import ansilen, DEBUG
-from extra.utils import print_tree
+from tinygrad.graph import print_tree
 
 if __name__ == "__main__":
   mdl = ResNet50()
diff --git a/extra/utils.py b/extra/utils.py
index 93268b4d19..aa416bfe9d 100644
--- a/extra/utils.py
+++ b/extra/utils.py
@@ -212,13 +212,3 @@ def get_child(parent, key):
     else:
       obj = getattr(obj, k)
   return obj
-
-def _tree(lazydata, prefix=""):
-  if type(lazydata).__name__ == "LazyBuffer": return [f"━━ realized {lazydata.dtype.name} {lazydata.shape}"] if (lazydata.realized) else _tree(lazydata.op, "LB ")
-  if len(lazydata.src) == 0: return [f"━━ {prefix}{lazydata.op.name} {lazydata.arg if lazydata.arg else ''}"]
-  lines = [f"━┳ {prefix}{lazydata.op.name} {lazydata.arg if lazydata.arg else ''}"]
-  childs = [_tree(c) for c in lazydata.src[:]]
-  for c in childs[:-1]: lines += [f" ┣{c[0]}"] + [f" ┃{l}" for l in c[1:]]
-  return lines + [" ┗"+childs[-1][0]] + ["  "+l for l in childs[-1][1:]]
-
-def print_tree(tensor:Union[Tensor, LazyBuffer]):print("\n".join([f"{str(i).rjust(3)} {s}" for i,s in enumerate(_tree(tensor if not isinstance(tensor, Tensor) else tensor.lazydata))]))
diff --git a/test/test_schedule.py b/test/test_schedule.py
index 3e97f1bc07..80702fdbf6 100644
--- a/test/test_schedule.py
+++ b/test/test_schedule.py
@@ -8,7 +8,7 @@ from tinygrad.tensor import Tensor
 from tinygrad.ops import LoadOps, Device, Compiled
 from tinygrad.helpers import DEBUG, dtypes
 from tinygrad.codegen.linearizer import Linearizer
-from tinygrad.graph import log_schedule_item
+from tinygrad.graph import log_schedule_item, print_tree
 from tinygrad import nn
 
 def check_schedule(t:Tensor, allowed:int, to_prerealize:Optional[List[Tensor]]=None, filter_loadops=True):
@@ -23,7 +23,6 @@ def check_schedule(t:Tensor, allowed:int, to_prerealize:Optional[List[Tensor]]=N
   if filter_loadops: sched = [s for s in sched if s[0].op not in LoadOps]
   if len(sched) != allowed: print(f"SCHEDULE ISSUE, expecting {allowed} got {len(sched)}")
   if len(sched) != allowed or DEBUG >= 3:
-    from extra.utils import print_tree
     for i, s in enumerate(sched):
       print("op", i)
       print_tree(s[0])
diff --git a/test/test_winograd.py b/test/test_winograd.py
index b062946a1f..ac8c338d1a 100644
--- a/test/test_winograd.py
+++ b/test/test_winograd.py
@@ -3,6 +3,7 @@ from tinygrad.helpers import Timing
 from tinygrad.tensor import Tensor
 from tinygrad.ops import LoadOps
 from tinygrad.codegen.linearizer import Linearizer
+from test.test_net_speed import start_profile, stop_profile
 
 class TestWinograd(unittest.TestCase):
   def setUp(self):
@@ -28,5 +29,12 @@ class TestWinograd(unittest.TestCase):
         l.hand_coded_optimizations()
         l.linearize()
 
+  def test_profile(self):  # profiling run of one conv2d; makes no assertions, so it only fails if something raises
+    x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()  # inputs are realized before the profiler starts, keeping their creation out of the profiled region
+    pr = start_profile()  # helper imported from test.test_net_speed; presumably returns a profiler handle — confirm there
+    out = Tensor.conv2d(x,w).realize()  # the only work between start_profile and stop_profile
+    stop_profile(pr, sort='time')  # NOTE(review): presumably reports stats ordered by time — confirm in test.test_net_speed
+    out.numpy()  # runs after the profiler is stopped, so it is not part of the profile
+
 if __name__ == '__main__':
   unittest.main(verbosity=2)
\ No newline at end of file
diff --git a/tinygrad/graph.py b/tinygrad/graph.py
index b93324fd63..f9264e4bf2 100644
--- a/tinygrad/graph.py
+++ b/tinygrad/graph.py
@@ -72,3 +72,13 @@ def log_schedule_item(iop: LazyOp, ret: 'LazyBuffer', inp: Tuple['LazyBuffer', .
     G.nodes[nm(ret)]['fillcolor'] = top_colors[optype]
     G.nodes[nm(ret)]['color'] = 'black'
     G.nodes[nm(ret)]['style'] = 'filled'
+
+def _tree(lazydata, prefix=""):  # returns a list of strings, one box-drawing line per node of the tree rooted at lazydata; prefix labels only this node's own line
+  if type(lazydata).__name__ == "LazyBuffer": return [f"━━ realized {lazydata.dtype.name} {lazydata.shape}"] if (lazydata.realized) else _tree(lazydata.op, "LB ")  # LazyBuffer is matched by class name, not isinstance (presumably so it need not be imported here — confirm); realized -> one leaf line, otherwise descend into its op tagged "LB "
+  if len(lazydata.src) == 0: return [f"━━ {prefix}{lazydata.op.name} {lazydata.arg if lazydata.arg else ''}"]  # no sources: a single leaf line; any falsy arg (None, 0, empty) prints as ''
+  lines = [f"━┳ {prefix}{lazydata.op.name} {lazydata.arg if lazydata.arg else ''}"]  # header line for a node that has sources
+  childs = [_tree(c) for c in lazydata.src[:]]  # each source is rendered with the default empty prefix
+  for c in childs[:-1]: lines += [f" ┣{c[0]}"] + [f" ┃{l}" for l in c[1:]]  # every child but the last: ┣ on its first line, ┃ continues the vertical bar down its remaining lines
+  return lines + [" ┗"+childs[-1][0]] + ["  "+l for l in childs[-1][1:]]  # last child: ┗ closes the bar and its remaining lines are indented with two spaces; childs is non-empty here because len(src) > 0
+
+def print_tree(lazydata:LazyOp): print("\n".join([f"{str(i).rjust(3)} {s}" for i,s in enumerate(_tree(lazydata))]))  # prints the _tree lines, each preceded by its 0-based index right-justified to width 3; NOTE(review): annotated LazyOp, but _tree also accepts objects whose class is named LazyBuffer, and nothing here unwraps a Tensor
\ No newline at end of file
diff --git a/tinygrad/realize.py b/tinygrad/realize.py
index 427cc1ed1a..5e21d1ec07 100644
--- a/tinygrad/realize.py
+++ b/tinygrad/realize.py
@@ -1,15 +1,13 @@
 from typing import List, Tuple, cast, Dict, Callable
 import numpy as np
 from tinygrad.ops import LazyOp, LoadOps, Device, UnaryOps, BufferOps, MemBuffer, get_lazyop_info
-from tinygrad.graph import log_schedule_item
+from tinygrad.graph import log_schedule_item, print_tree
 from tinygrad.lazy import LazyBuffer
 from tinygrad.helpers import DEBUG, prod, all_int, getenv, IMAGE, ImageDType, dtypes
 
 from tinygrad.runtime.lib import RawBufferMapped, RawBufferTransfer
 from tinygrad.runtime.ops_disk import RawDiskBuffer
 
-P2P = getenv("P2P", 0)
-
 def fix_schedule_for_images(schedule:List[Tuple[LazyOp, LazyBuffer, Tuple[LazyBuffer, ...]]]):
   # this is the fundamental fix, find unwritable or unreadable images and convert them to normal float32 (TODO: should it be float16?)
   for op,out,buffers in schedule:
@@ -41,6 +39,7 @@ def fix_schedule_for_images(schedule:List[Tuple[LazyOp, LazyBuffer, Tuple[LazyBu
     fixed_schedule.append((op, out, buffers))
   return fixed_schedule
 
+# *** this is where the schedule actually gets executed ***
 
 def run_schedule(schedule:List[Tuple[LazyOp, LazyBuffer, Tuple[LazyBuffer, ...]]]):
   # HACK: images can be not usable due to shape
@@ -51,9 +50,7 @@ def run_schedule(schedule:List[Tuple[LazyOp, LazyBuffer, Tuple[LazyBuffer, ...]]
     op,out,buffers = schedule.pop(0)
     log_schedule_item(op, out, buffers)
     assert all(x.realized for x in buffers), "can't run schedule, some buffers aren't realized"
-    if DEBUG >= 3:
-      from extra.utils import print_tree   # type: ignore
-      print_tree(op)
+    if DEBUG >= 3: print_tree(op)
     if op.op in LoadOps:
       # confirm the LoadOps are contiguous and in order
       for i,s in enumerate(op.src): assert isinstance(s, LazyOp) and s.op == BufferOps.MEM and s.arg.idx == i+1 and s.arg.st.contiguous, f"bad LoadOps src {i}: {s}"
@@ -89,7 +86,7 @@ def _realize_from(buffer: LazyBuffer, src: LazyBuffer) -> None:
     assert all_int(buffer.shape), "does not support symbolic shape"
     buffer.realized = Device[buffer.device].buffer(prod(buffer.shape), buffer.dtype, **buffer._device_extra_args())
     src.realized.readinto(cast(RawBufferMapped, buffer.realized)._buffer())
-  elif isinstance(src.realized, RawBufferTransfer) and issubclass(Device[buffer.device].buffer, RawBufferTransfer) and P2P >= 1:
+  elif isinstance(src.realized, RawBufferTransfer) and issubclass(Device[buffer.device].buffer, RawBufferTransfer) and getenv("P2P", 0) >= 1:
     buffer.realized = cast(RawBufferTransfer, Device[buffer.device].buffer).transfer(src.realized, buffer.shape, buffer.dtype, **buffer._device_extra_args())
   else:
     # TODO: schedule this as FROM to go to CPU, and a FROM to go to device