# tensor tests that pass on NULL backend (no copyout needed)
import numpy as np
import unittest
from tinygrad import Tensor, Device, dtypes
from tinygrad.device import is_dtype_supported
from tinygrad.uop.ops import Ops, UOp
from tinygrad.renderer.ptx import PTXRenderer
from tinygrad.renderer.nir import NIRRenderer
from tinygrad.engine.realize import get_program
from tinygrad.dtype import DType

x_init = np.random.randn(1,3).astype(np.float32)
W_init = np.random.randn(3,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)

class TestTrainMode(unittest.TestCase):
  def test_train_mode(self):
    assert not Tensor.training
    @Tensor.train()
    def f():
      assert Tensor.training
    f()
    assert not Tensor.training

class TestInferenceMode(unittest.TestCase):
  def test_inference(self):
    x = Tensor(x_init, requires_grad=True)
    m = Tensor(m_init, requires_grad=True)
    W = Tensor(W_init, requires_grad=True)
    tmp = x.mul(m)
    mm = tmp.matmul(W)
    out = mm.relu()
    out = out.sum()
    #out.backward()
    assert x.grad is None
    assert m.grad is None
    assert tmp.grad is None
    assert mm.grad is None
    assert W.grad is None
    assert W.requires_grad

  def test_no_grad_mode_context_manager(self):
    x = Tensor(x_init, requires_grad=True)
    m = Tensor(m_init, requires_grad=True)
    W = Tensor(W_init, requires_grad=True)
    def f(x, m, W):
      tmp = x.mul(m)
      mm = tmp.matmul(W)
      out = mm.relu()
      out = out.sum()
      #out.backward()
      assert x.grad is None
      assert m.grad is None
      assert tmp.grad is None
      assert mm.grad is None
      assert W.grad is None
    f(x, m, W)

class TestIdxUpcast(unittest.TestCase):
  def _find_op(self, ast: UOp, op: Ops):
    if ast.op is op: return ast
    for src in ast.src:
      if (ret:=self._find_op(src, op)) is not None: return ret
  def _schedule_render(self, a: Tensor):
    schedule, _ = a.schedule_with_vars()
    for s in schedule:
      if s.ast.op is Ops.SINK:
        renderer = Device[s.bufs[0].device].renderer
        prg = get_program(s.ast, renderer)
        return prg.uops

  def _assert(self, dtype: DType, a: Tensor):
    uops = self._schedule_render(a)
    # Assert the dtype of the INDEX value, This will need be updated if UOp spec changes
    store = next(uop for uop in uops if uop.op is Ops.STORE)
    assert store.op is Ops.STORE
    idx = self._find_op(store, Ops.INDEX)
    # PTX and NIR turn Ops.INDEX into pointer arithmetic earlier than cstyle, plus it's already cast to int64
    if not isinstance(Device[Device.DEFAULT].renderer, (PTXRenderer, NIRRenderer)):
      assert idx.op is Ops.INDEX
      idx_val = idx.src[1]
      assert idx_val.dtype is dtype

  # use expand to generate kernel that uses large idx
  def do_op_then_assert(self, dtype: DType, dim1, dim2, dim3):
    self._assert(dtype, Tensor.empty(dim1, dim2, 1).expand(-1, -1, dim3).contiguous())

  @unittest.skipUnless(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_overflow(self):
    # 2**11, 2**11, 2**11 -> 2**33 will overflow when indexed
    self.do_op_then_assert(dtypes.long, 2048, 2048, 2048)

  @unittest.skipUnless(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_overflow_sym(self):
    self.do_op_then_assert(dtypes.long, 2048, 2048, UOp.variable("dim3", 1, 2048).bind(32))

  def test_regular(self):
    self.do_op_then_assert(dtypes.int, 64, 64, 64)

  def test_regular_sym(self):
    self.do_op_then_assert(dtypes.int, 2048, 2048, UOp.variable("dim3", 1, 64).bind(32))

  @unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, (PTXRenderer, NIRRenderer)), "PTX and NIR always converts Ops.INDEX to int64")
  def test_symfold(self):
    # This would cause an overflow, but after sym fold it's within int32
    a = Tensor.arange(65535)
    uops = self._schedule_render(a)
    assert all(uop.dtype is not dtypes.long for uop in uops)

  def test_arange_raise_overflow(self):
    with self.assertRaises(ValueError):
      self._schedule_render(Tensor.arange(2**33, dtype=dtypes.int))

  @unittest.skipIf(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_int64_unsupported_overflow_sym(self):
    with self.assertRaises(KeyError):
      self.do_op_then_assert(dtypes.long, 2048, 2048, UOp.variable("dim3", 1, 2048).bind(32))

  @unittest.skipIf(is_dtype_supported(dtypes.long), "int64 is supported")
  @unittest.expectedFailure  # bug in gpu dims limiting
  def test_int64_unsupported_overflow(self):
    with self.assertRaises(KeyError):
      self.do_op_then_assert(dtypes.long, 2048, 2048, 2048)

  @unittest.skip("This is kept for reference, it requires large memory to run")
  def test_overflow_kernel_run(self):
    # This creates a total of 2**31+10 elements, requiring at least 2147 MB memory to run
    # Modified example from issue 3271
    a = Tensor.empty(2**11, 2**11, 1, dtype=dtypes.int8).permute((2, 0, 1)).expand((2**9+10, -1, -1)).contiguous()
    a.realize()

class TestTensorUnique(unittest.TestCase):
  def test_empty_bufs_unique(self):
    a = Tensor.empty(10, 10).contiguous()
    b = Tensor.empty(10, 10).contiguous()
    Tensor.realize(a,b)
    self.assertIsNot(a.uop.buffer, b.uop.buffer)

  def test_zeros_bufs_unique_sep(self):
    a = Tensor.zeros(10, 10).contiguous()
    Tensor.realize(a)
    b = Tensor.zeros(10, 10).contiguous()
    Tensor.realize(b)
    self.assertIsNot(a.uop.buffer, b.uop.buffer)

  def test_zeros_bufs_unique(self):
    a = Tensor.zeros(10, 10).contiguous()
    b = Tensor.zeros(10, 10).contiguous()
    Tensor.realize(a,b)
    self.assertIsNot(a.uop.buffer, b.uop.buffer)

  def test_eye_bufs_unique(self):
    a = Tensor.eye(10).contiguous()
    b = Tensor.eye(10).contiguous()
    Tensor.realize(a,b)
    self.assertIsNot(a.uop.buffer, b.uop.buffer)

  def test_times_2_not_unique(self):
    a = Tensor.zeros(10, 10).contiguous()
    b = a * 2
    c = a * 2
    Tensor.realize(b,c)
    self.assertIs(b.uop.buffer, c.uop.buffer)

if __name__ == '__main__':
  unittest.main()