diff --git a/docs/design b/docs/design
deleted file mode 100644
index 56058ae738..0000000000
--- a/docs/design
+++ /dev/null
@@ -1,68 +0,0 @@
-Getting the core instruction set correct is the value of tinygrad
-
-Max size tensor is 6-D for the pool2d
-
-Unary Ops
-===
-
-These are the simplest to reason about, and have pointwise mem access.
-A and B are always the same size
-
-Forward          :    A      -> B
-Backward (binary):   (B', A) -> A'
-
-
-
-Reduce Ops (with axis)
-===
-
-These take in an axis argument. B is smaller than A
-Max and Sum are pretty different, do we really need Max?
-
-Forward          :    A      -> B
-Backward         :    B'     -> A'
-
-
-
-Binary Ops (with broadcasting)
-===
-
-Pointwise mem access also.
-Broadcasting adds complexity, aliased input.
-Unbroadcasting for grad is a sum, but should be combined with the ternary op.
-
-Forward           :   (A,  B)     -> C
-Backward (ternary):   (C', A, B)  -> (A', B')
-
-C.shape = max(A.shape, B.shape)
-
-
-
-Movement Ops (2 or 1)
-===
-
-Reshape, Transpose, Slice
-
-Depending on your Tensor implementation, these are free.
-Reshape is almost always free.
-Slice can be made free, but probably shouldn't be.
-Transpose is hard to make free except in trivial cases.
-
-Regardless, these are "reindexings" of existing arrays
-Transpose and Slice are similar enough I think they can be merged.
-They should use a DMA engine
-
-
-
-Processing Ops (4)
-===
-
-Matmul is 1 matmul for forward, 2 for backward.
-* It's actually three matmuls transposed
-* cublasSgemm()
-
-Conv2D is very complex. It seems to need three.
-* cudnnConvolutionForward()
-* cudnnConvolutionBackwardData()
-* cudnnConvolutionBackwardFilter()
-NOTE: Tensor Cores require that the tensors be in the NHWC data layout
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 3b10e512f9..c54d7d1561 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -20,7 +20,14 @@ def mnum(i) -> str: return str(i) if i >= 0 else f"m{-i}"
 @functools.lru_cache(maxsize=None)
 def getenv(key, default=0): return type(default)(os.getenv(key, default))
 
-DEBUG, IMAGE = getenv("DEBUG", 0), getenv("IMAGE", 0)
+# NOTE: hack to allow DEBUG(x) to set the debug value at runtime
+class DebugSingleton:  # mutable, callable holder for the debug level
+  def __init__(self): self.value = getenv("DEBUG", 0)  # initial level is read from the DEBUG env var (0 if unset)
+  def __call__(self, x): self.value = x  # DEBUG(x) replaces the stored level
+  def __bool__(self): return self.value != 0  # `if DEBUG:` is true for any nonzero level
+  def __ge__(self, x): return self.value >= x  # `DEBUG >= n` compares the stored level; this is the only comparison method defined here
+
+DEBUG, IMAGE = DebugSingleton(), getenv("IMAGE", 0)  # DEBUG is a DebugSingleton instance; IMAGE is the plain value returned by getenv
 
 # **** tinygrad now supports dtypes! *****
 
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index 72240ceeec..0cd44a4a93 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 from typing import Optional, Tuple, Union, List, Dict, Any, cast
 import sys, weakref, importlib, inspect, functools, pathlib
 from weakref import WeakValueDictionary
-from tinygrad.helpers import prod, getenv, DType, dtypes, LazyNumpyArray, flatten, ImageDType
+from tinygrad.helpers import prod, getenv, DType, dtypes, LazyNumpyArray, flatten, ImageDType, DEBUG
 from tinygrad.shape.shapetracker import ShapeTracker, get_contraction
 from tinygrad.ops import Compiled, Interpreted, UnaryOps, BinaryOps, ReduceOps, MovementOps, LoadOps, OpType, LazyOp, get_lazyops, get_buffers, map_buffers
 from tinygrad.runtime.lib import RawConst, RawBuffer
@@ -111,6 +111,7 @@ class LazyBuffer:
         if prod(self.op.arg.shape) == 1 and hasattr(Device[self.device].codegen, 'supports_constant_folding'):
           self.realized = RawConst(1, dtypes.from_np(self.op.arg.dtype), self.op.arg().flatten()[0])
         else:
+          if DEBUG >= 4: print(f"copying {self.op.arg.shape}:{dtypes.from_np(self.op.arg.dtype)} -> {self.device}")
           self.realized = Device[self.device].buffer.fromCPU(self.op.arg(), **self._device_extra_args())
       elif self.op.op == LoadOps.CONTIGUOUS:
         realized = self.op.src[0].realize().realized