diff --git a/examples/train_efficientnet.py b/examples/train_efficientnet.py
index 688793c105..4346ef534b 100644
--- a/examples/train_efficientnet.py
+++ b/examples/train_efficientnet.py
@@ -12,7 +12,7 @@ if __name__ == "__main__":
 
   img = np.zeros((BS,3,224,224), dtype=np.float32)
 
   for i in range(32):
-    print("running batch %d" % i)
+    print("running batch %d, %d tensors allocated" % (i, Tensor.allocated))
     st = time.time()
     out = model.forward(Tensor(img))
@@ -31,3 +31,5 @@ if __name__ == "__main__":
 
     et = time.time()
     print("backward %.2f s" % (et-st))
+    del out, y, loss
+
diff --git a/test/test_gc.py b/test/test_gc.py
new file mode 100644
index 0000000000..524c9d0206
--- /dev/null
+++ b/test/test_gc.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+import unittest
+from tinygrad.tensor import Tensor, GPU
+
+class TestGC(unittest.TestCase):
+  gpu = False
+  def test_gc(self):
+    a = Tensor.zeros(4,4, gpu=self.gpu)
+    b = Tensor.zeros(4,4, gpu=self.gpu)
+    (a*b).mean().backward()
+    assert(Tensor.allocated > 0)
+    del a,b
+    assert(Tensor.allocated == 0)
+
+  def test_gc_complex(self):
+    a = Tensor.zeros(4,4, gpu=self.gpu)
+    b = Tensor.zeros(4,4, gpu=self.gpu)
+    assert(Tensor.allocated == 2)
+    (a*b).mean().backward()
+    assert(Tensor.allocated == 4)
+    del b
+    assert(Tensor.allocated == 2)
+    b = Tensor.zeros(4,4, gpu=self.gpu)
+    print(Tensor.allocated)
+    (a*b).mean().backward()
+    print(Tensor.allocated)
+    assert(Tensor.allocated == 4)
+    del b
+    assert(Tensor.allocated == 2)
+
+
+
+if GPU:
+  class TestGCGPU(TestGC):
+    gpu = True
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index ba46bb008e..838acb7629 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -59,9 +59,21 @@ class GPUBuffer:
 
 # **** start with two base classes ****
 
+def deepwalk(node, visited=None, nodes=None):
+  if visited == None and nodes == None:
+    visited, nodes = set(), []
+  visited.add(node)
+  if node._ctx:
+    for i in node._ctx.parents:
+      if i not in visited:
+        deepwalk(i, visited, nodes)
+  nodes.append(node)
+  return nodes
+
 class Tensor:
   did_float_warning = False
   default_gpu = False
+  allocated = 0
 
   def __init__(self, data, gpu=None):
     if gpu is None:
@@ -89,6 +101,12 @@
 
     # internal variables used for autograd graph construction
     self._ctx = None
+    Tensor.allocated += 1
+
+  def __del__(self):
+    #print("cleanup", self.shape)
+    Tensor.allocated -= 1
+
 
   def __repr__(self):
     return "Tensor %r with grad %r" % (self.data, self.grad.data if self.grad else None)
@@ -104,20 +122,20 @@
     return self.data.dtype
 
   @staticmethod
-  def zeros(*shape):
-    return Tensor(np.zeros(shape, dtype=np.float32))
+  def zeros(*shape, gpu=None):
+    return Tensor(np.zeros(shape, dtype=np.float32), gpu)
 
   @staticmethod
-  def ones(*shape):
-    return Tensor(np.ones(shape, dtype=np.float32))
+  def ones(*shape, gpu=None):
+    return Tensor(np.ones(shape, dtype=np.float32), gpu)
 
   @staticmethod
-  def randn(*shape):
-    return Tensor(np.random.randn(*shape).astype(np.float32))
+  def randn(*shape, gpu=None):
+    return Tensor(np.random.randn(*shape).astype(np.float32), gpu)
 
   @staticmethod
-  def eye(dim):
-    return Tensor(np.eye(dim).astype(np.float32))
+  def eye(dim, gpu=None):
+    return Tensor(np.eye(dim).astype(np.float32), gpu)
 
   def backward(self, allow_fill=True):
     if self._ctx is None:
@@ -129,17 +147,7 @@
       assert self.shape == (1,)
       self.grad = Tensor(np.ones(self.shape, dtype=self.dtype), gpu=self.gpu)
 
-    visited, nodes = set(), []
-    def deepwalk(node):
-      visited.add(node)
-      if node._ctx:
-        for i in node._ctx.parents:
-          if i not in visited:
-            deepwalk(i)
-      nodes.append(node)
-    deepwalk(self)
-
-    for t0 in reversed(nodes):
+    for t0 in reversed(deepwalk(self)):
       assert (t0.grad is not None)
       with ProfileOp(t0._ctx.__class__.__name__, [t0.grad], backward=True):
         grads = t0._ctx.backward(t0._ctx, t0.grad.data)
@@ -151,6 +159,8 @@
           assert g.shape == t.shape, \
             "grad shape must match tensor shape in %r, %r != %r" % (self._ctx, g.shape, t.shape)
           t.grad = Tensor(g) if t.grad is None else (t.grad + Tensor(g))
+          del t.grad._ctx # no backward pass through the add
+
 
 # ***** tinygrad supports CPU and GPU *****
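
Note (illustration, not part of the patch): a minimal sketch of the lifecycle
the new counter tracks, mirroring test/test_gc.py. Tensor.__init__ increments
Tensor.allocated and __del__ decrements it, so tensors are reclaimed by
CPython's reference counting the moment the last reference disappears. The
`del t.grad._ctx` in backward() helps here: per the inline comment, it drops
the accumulation op's context, so an accumulated grad neither retains its
operands nor gets traversed by a later backward pass.

  from tinygrad.tensor import Tensor

  a = Tensor.zeros(4, 4)     # Tensor.allocated == 1
  b = Tensor.zeros(4, 4)     # Tensor.allocated == 2
  (a*b).mean().backward()    # unreferenced intermediates are freed as the
                             # statement ends; a.grad and b.grad remain,
                             # so Tensor.allocated == 4
  del a, b                   # the grads die with their tensors, and with
                             # grad._ctx severed nothing else holds them
  assert Tensor.allocated == 0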