diff --git a/test/test_assign.py b/test/test_assign.py
index 115f1674c0..7e7ed73bc3 100644
--- a/test/test_assign.py
+++ b/test/test_assign.py
@@ -281,6 +281,57 @@ class TestAssign(unittest.TestCase):
       #assert ba1 == ba2 and ba1 != bb1
       np.testing.assert_allclose(a.numpy(), np.arange(N*N).reshape((N,N)) + np.arange(N*N).reshape((N,N)).transpose(1,0))
 
+  def test_simple_assignment_multioutput(self):  # three assigns that share one reduce, realized together
+    a = Tensor.randn(32, 32).realize()
+    b = Tensor.full((32, ), 1.).contiguous().realize()
+    c = Tensor.full((32, ), 2.).contiguous().realize()
+    d = Tensor.full((32, ), 3.).contiguous().realize()
+
+    r = a.sum(axis=1)  # reduce shared by the three assigns below
+    b.assign(r + b)
+    c.assign(r + c)
+    d.assign(r + d)
+
+    kc = GlobalCounters.kernel_count  # snapshot the counter before realizing
+    Tensor.realize(b, c, d)
+    assert GlobalCounters.kernel_count - kc == 1  # realizing b, c and d must add exactly one to the kernel count
+    np.testing.assert_allclose(b.numpy(), a.sum(1).numpy()+1)  # each target is the axis-1 sum of a plus its fill value
+    np.testing.assert_allclose(c.numpy(), a.sum(1).numpy()+2)
+    np.testing.assert_allclose(d.numpy(), a.sum(1).numpy()+3)
+
+  # NOTE: if the assign target is read/write in a single kernel, it should be contiguous
+
+  def test_permuted_assignment_correct(self):  # assign into a permuted tensor whose new value also reads it
+    a = Tensor.arange(4 * 4).reshape(4, 4).contiguous().realize()
+    b = Tensor.arange(4 * 4).reshape(4, 4).contiguous().realize()
+    # TODO: scheduler limitation, should NOT raise AssertionError from numpy. NOTE(review): presumably the assign should succeed -- confirm
+    with self.assertRaises(RuntimeError):  # as written, the test passes only when a RuntimeError is raised in this block
+      a = a.permute(1, 0)  # rebinds a to its permutation (axes swapped)
+      new_val = a + b  # reads the permuted a
+      a.assign(new_val)  # the assign target is the same permuted a that new_val reads
+      np.testing.assert_equal(a.numpy(), np.arange(4 * 4).reshape(4, 4).transpose(1, 0) + np.arange(4 * 4).reshape(4, 4))
+
+  def test_permuted_reduceop_child_dual_use(self):  # b is the assign target and, permuted, an input of the assigned value
+    a = Tensor.randn(32, 32, 32).realize()
+    b = Tensor.full((32, 32), 1.).contiguous().realize()
+    with self.assertRaises(RuntimeError):  # a RuntimeError is expected somewhere in this block
+      r = a.sum(axis=1)  # reduce over axis 1 of the (32, 32, 32) input
+      b.assign(r + b.permute(1, 0))  # the new value reads b through a permute while b is being written
+      b.realize()
+
+  def test_permuted_reduceop_multioutput_dual_use(self):  # two assigns share one reduce; c's value reads b permuted
+    a = Tensor.randn(32, 32, 32).realize()
+    b = Tensor.full((32, 32), 1.).contiguous().realize()
+    c = Tensor.full((32, 32), 2.).contiguous().realize()
+
+    # TODO: this currently fails with a cycle error; it should fail earlier.
+    with self.assertRaises(RuntimeError):
+      r = a.sum(axis=1)  # reduce shared by both assigns below
+      b_perm = b.permute(1, 0)  # taken before b is assigned on the next line
+      b.assign(r + b)
+      c.assign(r + b_perm)  # reads the permuted b while b is also an assign target
+      Tensor.realize(b, c)  # both targets are realized in one call
+
   # TODO: is there a way to sneak in a permute such that it returns the wrong answer?
 
   @unittest.skip("don't use output buffer, and mismatch dtype no longer supported")