# simple tests
import unittest
import torch
import numpy as np
from tinygrad.helpers import getenv, GlobalCounters

if getenv("TINY_BACKEND2"):
  import extra.torch_backend.backend2
  device = "cpu"
else:
  import extra.torch_backend.backend
  device = "tiny"

class TestTorchBackend(unittest.TestCase):
  def test_randperm_generator_out(self):
    n = 10
    out = torch.empty(n, dtype=torch.long, device=device)
    res = torch.randperm(n, out=out).cpu().numpy()
    np.testing.assert_equal(set(res), set(range(n)))
    np.testing.assert_equal(out.cpu().numpy(), res)
    res2 = torch.randperm(n).cpu().numpy()
    np.testing.assert_equal(set(res2), set(range(n)))

  def test_numpy_ones(self):
    a = torch.ones(4, device=device)
    np.testing.assert_equal(a.cpu().numpy(), [1,1,1,1])

  def test_numpy_ones_int32(self):
    a = torch.ones(4, dtype=torch.int32, device=device)
    assert a.dtype == torch.int32
    np.testing.assert_equal(a.cpu().numpy(), [1,1,1,1])

  def test_plus(self):
    a = torch.ones(4, device=device)
    b = torch.ones(4, device=device)
    c = a+b
    np.testing.assert_equal(c.cpu().numpy(), [2,2,2,2])

  def test_expand(self):
    a = torch.Tensor([1,2,3,4]).to(device)
    out = a.reshape(4,1).expand(4,4)
    np.testing.assert_equal(out.cpu().numpy(), [[1,1,1,1],[2,2,2,2],[3,3,3,3],[4,4,4,4]])

  def test_reshape(self):
    a = torch.Tensor([[1,2],[3,4]]).to(device)
    np.testing.assert_equal(a.reshape(4).cpu().numpy(), [1,2,3,4])
    np.testing.assert_equal(a.reshape(2,1,2).cpu().numpy(), [[[1,2]],[[3,4]]])
    np.testing.assert_equal(a.unsqueeze(1).cpu().numpy(), [[[1,2]],[[3,4]]])
    np.testing.assert_equal(a.unsqueeze(1).unsqueeze(1).cpu().numpy(), [[[[1,2]]],[[[3,4]]]])
    np.testing.assert_equal(a.unsqueeze(1).unsqueeze(1).squeeze().cpu().numpy(), [[1,2],[3,4]])

  def test_permute(self):
    a = torch.Tensor([[1,2],[3,4]]).to(device)
    print(a.stride())
    null = a.permute(0,1)
    perm = a.permute(1,0)
    back = perm.permute(1,0)
    np.testing.assert_equal(a.cpu().numpy(), [[1,2],[3,4]])
    np.testing.assert_equal(null.cpu().numpy(), [[1,2],[3,4]])
    np.testing.assert_equal(perm.cpu().numpy(), [[1,3],[2,4]])
    np.testing.assert_equal(back.cpu().numpy(), [[1,2],[3,4]])

  def test_shrink(self):
    a = torch.Tensor([1,2,3,4]).to(device)
    np.testing.assert_equal(a[:3].cpu().numpy(), [1,2,3])
    np.testing.assert_equal(a[1:].cpu().numpy(), [2,3,4])

  def test_as_strided(self):
    a = torch.arange(70, device=device).reshape(1,1,10,7)
    a = a.as_strided((1,1,10,5), (0,0,7,1), storage_offset=0)
    a = a.as_strided((1,1,5,5), (50,50,7,1), storage_offset=21)
    np.testing.assert_equal(a.cpu().numpy().sum(-1), [[[115,150,185,220,255]]])
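  # Added note on the expected values above: with storage_offset=21 and strides
  # (50,50,7,1), row i of the (1,1,5,5) view reads 5 consecutive elements
  # starting at flat index 21+7*i, so the row sums are sum(21..25)=115,
  # sum(28..32)=150, sum(35..39)=185, sum(42..46)=220 and sum(49..53)=255.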
  def test_plus_inplace(self):
    a = torch.ones(4, device=device)
    b = torch.ones(4, device=device)
    a += b
    a += b
    np.testing.assert_equal(a.cpu().numpy(), [3,3,3,3])

  def test_exp2(self):
    a = torch.ones(4, device=device)
    b = a.exp2()
    np.testing.assert_equal(b.cpu().numpy(), [2,2,2,2])

  def test_amax(self):
    x = torch.tensor([[[ 1.5,  2.3,  3.1,  4.7],
                       [ 5.2,  6.8,  7.4, 12.9],
                       [ 9.0, 12.3, 11.6, 10.1]],
                      [[13.2, 16.9, 15.5, 14.1],
                       [17.1, 24.9, 19.8, 20.2],
                       [21.0, 22.3, 23.6, 18.4]]], device=device)
    y1 = torch.amax(x)
    expected = np.array([24.9], dtype=np.float32)
    np.testing.assert_equal(y1.cpu().numpy(), expected)
    y2 = torch.amax(x, dim=(1,2))
    expected = np.array([12.9, 24.9], dtype=np.float32)
    np.testing.assert_equal(y2.cpu().numpy(), expected)
    y3 = torch.amax(x, dim=2)
    expected = np.array([[4.7, 12.9, 12.3], [16.9, 24.9, 23.6]], dtype=np.float32)
    np.testing.assert_equal(y3.cpu().numpy(), expected)

  def test_amin(self):
    x = torch.tensor([[[ 1.5,  2.3,  3.1,  4.7],
                       [ 5.2,  6.8,  7.4, 12.9],
                       [ 9.0, 12.3, 11.6, 10.1]],
                      [[13.2, 16.9, 15.5, 14.1],
                       [17.1, 24.9, 19.8, 20.2],
                       [21.0, 22.3, 23.6, 18.4]]], device=device)
    y1 = torch.amin(x)
    expected = np.array([1.5], dtype=np.float32)
    np.testing.assert_equal(y1.cpu().numpy(), expected)
    y2 = torch.amin(x, dim=(1,2))
    expected = np.array([1.5, 13.2], dtype=np.float32)
    np.testing.assert_equal(y2.cpu().numpy(), expected)
    y3 = torch.amin(x, dim=2)
    expected = np.array([[1.5, 5.2, 9.0], [13.2, 17.1, 18.4]], dtype=np.float32)
    np.testing.assert_equal(y3.cpu().numpy(), expected)

  def test_isfinite(self):
    a = torch.ones(4, device=device)
    np.testing.assert_equal(torch.isfinite(a).cpu().numpy(), [True, True, True, True])

  def test_eq(self):
    a = torch.ones(4, device=device)
    b = torch.ones(4, device=device)
    c = a == b
    print(c.cpu())

  def test_maxpool2d_backward(self):
    x = torch.arange(3*3, dtype=torch.float32, device=device).reshape(1, 1, 3, 3).requires_grad_(True)
    torch.nn.functional.max_pool2d(x, kernel_size=2, stride=1).sum().backward()
    np.testing.assert_equal(x.grad.squeeze().cpu().numpy(), [[0, 0, 0], [0, 1, 1], [0, 1, 1]])
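  # Added note: with kernel_size=2 and stride=1 on the 3x3 arange input, the
  # four pooling windows take their maxima at positions (1,1), (1,2), (2,1) and
  # (2,2) (the bottom-right corner of each window, since values increase), so
  # each of those positions receives a gradient of 1 from the sum.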
  def test_copy_cast(self):
    x = torch.zeros(4, device=device, dtype=torch.int64)
    y = torch.ones(4, device=device, dtype=torch.float32).to(dtype=torch.int64)
    res1 = x ^ y  # an operation that only works on int types
    print(res1.cpu())
    y = y.cpu().float().to(device=device, dtype=torch.int64)
    res2 = x ^ y
    print(res2.cpu())

  def test_topk(self):
    # test topk return_types
    a = torch.tensor([1, 3, 2, 4], device=device)
    out = torch.topk(a, k=2)
    np.testing.assert_equal(out.values.cpu().numpy(), [4, 3])
    np.testing.assert_equal(out.indices.cpu().numpy(), [3, 1])

  def test_masked_select(self):
    a = torch.tensor([4, 3, 2, 1], device=device)
    mask = torch.tensor([True, False, True, False], device=device)
    out = torch.masked_select(a, mask)
    np.testing.assert_equal(out.cpu().numpy(), [4, 2])
    mask = torch.tensor(True, device=device)
    out = torch.masked_select(a, mask)
    np.testing.assert_equal(out.cpu().numpy(), [4, 3, 2, 1])

  def test_isin_tensor_tensor_out(self):
    a = torch.tensor([1, 2, 3], device=device)
    b = torch.tensor([2, 4], device=device)
    expected_base = torch.tensor([False, True, False], device=device)
    for assume_unique in [False, True]:
      for invert, expected in [(False, expected_base), (True, ~expected_base)]:
        out = torch.empty_like(a, dtype=torch.bool)
        res = torch.ops.aten.isin.Tensor_Tensor_out(a, b, invert=invert, assume_unique=assume_unique, out=out)
        np.testing.assert_equal(out.cpu().numpy(), expected.cpu().numpy())

  def test_uniform(self):
    for torch_dtype in [torch.float32, torch.float16]:
      a = torch.rand(10, 10, device=device, dtype=torch_dtype)
      self.assertEqual(a.dtype, torch_dtype)

  def test_normal(self):
    for torch_dtype in [torch.float32, torch.float16]:
      a = torch.randn(10, 10, device=device, dtype=torch_dtype)
      self.assertEqual(a.dtype, torch_dtype)

  def test_equal(self):
    tensor_a = torch.tensor([[1, 2], [3, 4]], device=device)
    tensor_b = torch.tensor([[1, 2], [3, 4]], device=device)
    tensor_c = torch.tensor([[1, 2], [1, 2]], device=device)
    assert torch.equal(tensor_a, tensor_b)
    assert not torch.equal(tensor_a, tensor_c)

  @unittest.skip("# TODO: this test is slow")
  def test_linalg_svd(self):
    A = torch.randn(5, 5, device=device)
    U, S, Vh = torch.linalg.svd(A)
    np.testing.assert_equal(U.shape, (5,5))
    np.testing.assert_equal(Vh.shape, (5,5))
    np.testing.assert_allclose(torch.dist(A, U @ torch.diag(S) @ Vh).cpu().numpy(), 0, atol=1e-5)
    A = torch.randn(5, 3, device=device)
    U, S, Vh = torch.linalg.svd(A, full_matrices=False)
    np.testing.assert_equal(U.shape, (5,3))
    np.testing.assert_equal(Vh.shape, (3,3))
    np.testing.assert_allclose(torch.dist(A, U @ torch.diag(S) @ Vh).cpu().numpy(), 0, atol=1e-5)

  def test_linalg_eigh(self):
    a = torch.tensor([[1, 2], [2, 1]], dtype=torch.float32, device=device)
    w, v = torch.linalg.eigh(a)
    np.testing.assert_equal(w.cpu().numpy(), [-1, 3])
    recon = (v @ torch.diag(w) @ v.T).cpu().numpy()
    np.testing.assert_allclose(recon, a.cpu().numpy(), atol=1e-6)

  def test_linalg_det(self):
    a = torch.diag(torch.tensor([1,2,3,4,5], dtype=torch.float32, device=device))
    b = torch.linalg.det(a)
    np.testing.assert_equal(b.cpu().numpy(), 120.0)

  def test_linalg_cross(self):
    a = torch.tensor([[1, 0, 0], [0, 1, 0]], dtype=torch.float32, device=device)
    b = torch.tensor([[0, 0, 1]], dtype=torch.float32, device=device)
    cross = torch.linalg.cross(a, b)
    np.testing.assert_equal(cross.cpu().numpy(), np.array([[0, -1, 0], [1, 0, 0]], dtype=np.float32))

  def test_scalar_assign(self):
    a = torch.tensor([1, 2, 3], device=device)
    a[1] = 4
    np.testing.assert_equal(a.cpu().numpy(), [1, 4, 3])

  @unittest.skip("meh")
  def test_str(self):
    a = torch.ones(4, device=device)
    print(str(a))

  def test_floor_div(self):
    a = torch.tensor([10., 7., 5.], device=device)
    b = torch.tensor([3., 2., 2.], device=device)
    result = a // b
    np.testing.assert_equal(result.cpu().numpy(), [3., 3., 2.])

  def test_mnist_index(self):
    GlobalCounters.reset()
    from tinygrad.nn.datasets import mnist
    X_train, Y_train, _, _ = mnist()
    X_train = torch.tensor(X_train.float().numpy(), device=device)
    Y_train = torch.tensor(Y_train.cast('int64').numpy(), device=device)
    samples = torch.randint(0, X_train.shape[0], (32,))
    X, Y = X_train[samples], Y_train[samples]
    X.cpu(), Y.cpu()
    self.assertLessEqual(GlobalCounters.global_ops, 10_000_000)

  def _test_diagonal(self, *shape):
    a = torch.randn(*shape, dtype=torch.float32, device=device)
    ref = np.diagonal(a.cpu().numpy(), axis1=-2, axis2=-1)
    diag = torch.linalg.diagonal(a)
    np.testing.assert_equal(diag.cpu().numpy(), ref)
    np.testing.assert_equal(diag[-1].cpu().numpy(), ref[-1])

  def test_diagonal_cube(self): self._test_diagonal(3, 3, 3)
  def test_diagonal_rectangular(self): self._test_diagonal(4, 5, 6)
  def test_diagonal_4d(self): self._test_diagonal(2, 3, 4, 5)

  def test_pad_circular_simple(self):
    a = torch.arange(4, dtype=torch.float32, device=device).reshape(1,1,2,2)
    padded = torch.nn.functional.pad(a, (1,1,1,1), mode="circular")
    expected = np.array([[[[3.,2.,3.,2.],
                           [1.,0.,1.,0.],
                           [3.,2.,3.,2.],
                           [1.,0.,1.,0.]]]], dtype=np.float32)
    np.testing.assert_allclose(padded.cpu().numpy(), expected)

  def test_pad_circular_backward(self):
    a = torch.arange(4, dtype=torch.float32, device=device).reshape(1,1,2,2).requires_grad_(True)
    padded = torch.nn.functional.pad(a, (1,1,1,1), mode="circular")
    loss = padded.sum()
    loss.backward()
    expected_grad = np.array([[[[4., 4.], [4., 4.]]]], dtype=np.float32)
    np.testing.assert_allclose(a.grad.cpu().numpy(), expected_grad)

  def test_matmul_backward(self):
    x = torch.randn(3, 4, device=device, dtype=torch.float32, requires_grad=True)
    y = torch.randn(4, 5, device=device, dtype=torch.float32, requires_grad=True)
    z = (x @ y).sum()
    z.backward()
    assert x.grad is not None
    assert y.grad is not None
    assert x.grad.shape == x.shape
    assert y.grad.shape == y.shape

  def test_matmul_broadcast_backward(self):
    x = torch.randn(2, 3, 4, device=device, dtype=torch.float32, requires_grad=True)
    y = torch.randn(4, 5, device=device, dtype=torch.float32, requires_grad=True)
    z = (x @ y).sum()
    z.backward()
    assert x.grad is not None
    assert y.grad is not None
    assert x.grad.shape == x.shape
    assert y.grad.shape == y.shape
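  # Added note: in the broadcast case above, z = x @ y with x of shape (2,3,4)
  # and y of shape (4,5) gives dL/dx = dL/dz @ y.T (shape (2,3,4)), while dL/dy
  # sums x.transpose(-1,-2) @ dL/dz over the batch dim back to shape (4,5),
  # which is why both gradients match their input shapes.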
  def test_diag_vector_to_matrix(self):
    vec = torch.tensor([1., 2., 3., 4., 5.], dtype=torch.float32, device=device)
    mat = torch.diag(vec)
    expected = np.diag([1., 2., 3., 4., 5.])
    np.testing.assert_allclose(mat.cpu().numpy(), expected, rtol=1e-5)
    assert mat.shape == (5, 5)

  def test_diagonal_matrix_to_vector(self):
    mat = torch.tensor([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]], dtype=torch.float32, device=device)
    vec = torch.linalg.diagonal(mat)
    expected = np.array([1., 5., 9.])
    np.testing.assert_allclose(vec.cpu().numpy(), expected, rtol=1e-5)
    assert vec.shape == (3,)

  def test_permute_2(self):
    a = torch.randn(2, 3, 4, dtype=torch.float32, device=device)
    b = a.permute(2, 0, 1)
    assert b.shape == (4, 2, 3)
    np.testing.assert_equal(b.cpu().numpy(), a.cpu().numpy().transpose(2, 0, 1))

  def test_batchnorm_unsqueeze(self):
    bn = torch.nn.BatchNorm2d(4).to(device)
    x = torch.randn(8, 4, 3, 3, device=device)
    out = bn(x)
    self.assertEqual(out.shape, x.shape)

  def test_slice_inplace_zero(self):
    a = torch.ones((3, 3), device=device)
    b = a[1:, 1:]
    b.zero_()
    expected = np.array([[1., 1., 1.], [1., 0., 0.], [1., 0., 0.]])
    np.testing.assert_equal(a.cpu().numpy(), expected)

  def test_slice_inplace_fill(self):
    a = torch.ones((3, 3), device=device)
    b = a[1:, 1:]
    b.fill_(5.0)
    expected = np.array([[1., 1., 1.], [1., 5., 5.], [1., 5., 5.]])
    np.testing.assert_equal(a.cpu().numpy(), expected)

  def test_fill_tensor_value(self):
    a = torch.zeros((2, 2), dtype=torch.float32, device=device)
    value = torch.tensor(3, dtype=torch.int64, device=device)
    a.fill_(value)
    expected = np.full((2, 2), 3, dtype=np.float32)
    np.testing.assert_equal(a.cpu().numpy(), expected)

  def test_slice_inplace_mul(self):
    a = torch.ones((3, 3), device=device)
    b = a[1:, 1:]
    b *= 2
    expected = np.array([[1., 1., 1.], [1., 2., 2.], [1., 2., 2.]])
    np.testing.assert_equal(a.cpu().numpy(), expected)

  def test_permute_slice_zero(self):
    a = torch.ones((3, 3), device=device)
    b = a[1:, 1:].permute(1, 0)
    b.zero_()
    expected = np.array([[1., 1., 1.], [1., 0., 0.], [1., 0., 0.]])
    np.testing.assert_equal(a.cpu().numpy(), expected)

  def test_permute_slice_mul(self):
    a = torch.ones((3, 3), device=device)
    b = a[1:, 1:].permute(1, 0)
    b *= 2
    expected = np.array([[1., 1., 1.], [1., 2., 2.], [1., 2., 2.]])
    np.testing.assert_equal(a.cpu().numpy(), expected)

  def test_simple_slice_setitem(self):
    a = torch.tensor([10, 20, 30], device=device)
    a[1] = 99
    np.testing.assert_equal(a.cpu().numpy(), [10, 99, 30])

  def test_2d_slice_setitem(self):
    a = torch.zeros((3, 3), device=device)
    a[1, 2] = 99
    self.assertEqual(a[1, 2].item(), 99)
    self.assertEqual(a.sum().item(), 99)

  def test_view_copy(self):
    a = torch.tensor([10, 20, 30], device=device)
    view = a[1]
    view.copy_(torch.tensor(88, device=device))
    np.testing.assert_equal(a.cpu().numpy(), [10, 88, 30])

  def test_diag_2d_input(self):
    a = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], device=device)
    d = torch.diag(a)
    np.testing.assert_equal(d.cpu().numpy(), [1, 5, 9])

  def test_diag_1d_input(self):
    a = torch.tensor([1, 2, 3], device=device)
    d = torch.diag(a)
    expected = [[1, 0, 0], [0, 2, 0], [0, 0, 3]]
    np.testing.assert_equal(d.cpu().numpy(), expected)

  def test_permute_view_tracking(self):
    a = torch.ones((2, 3, 4), device=device)
    b = a.permute(2, 0, 1)
    self.assertEqual(b.shape, (4, 2, 3))

  def test_detach_view_creation(self):
    a = torch.tensor([1.0, 2.0, 3.0], device=device)
    b = a.detach()
    np.testing.assert_equal(b.cpu().numpy(), [1.0, 2.0, 3.0])
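  # Added note on the view tests above and below: basic indexing and permute in
  # PyTorch return views that alias the base tensor's storage, so an in-place op
  # like zero_(), fill_() or *= on the view must write through to the base
  # tensor; these tests check that the backend preserves that aliasing.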
  def test_view_zero_inplace(self):
    a = torch.ones((4, 4), device=device)
    view = a[1:3, 1:3]
    view.zero_()
    self.assertEqual(view.sum().item(), 0)

  def test_view_fill_inplace(self):
    a = torch.zeros((4, 4), device=device)
    view = a[1:3, 1:3]
    view.fill_(5)
    self.assertEqual(view.sum().item(), 20)

  def test_permute_contiguous(self):
    a = torch.tensor([[1, 2], [3, 4]], device=device)
    b = a.permute(1, 0)
    c = b.contiguous()
    expected = [[1, 3], [2, 4]]
    np.testing.assert_equal(c.cpu().numpy(), expected)

  def test_diag_2d_extract_diagonal(self):
    a = torch.tensor([[1, 2], [3, 4]], device=device)
    result = torch.diag(a)
    np.testing.assert_equal(result.cpu().numpy(), [1, 4])

  def test_slice_inplace_multiply_offset_preservation(self):
    a = torch.tensor([1, 2, 3], device=device)
    a[1:] *= 2
    np.testing.assert_equal(a.cpu().numpy(), [1, 4, 6])

  def test_slice_inplace_mul_pattern(self):
    a = torch.tensor([1, 2, 3, 4], device=device)
    a[:2] *= 3
    a[2:] *= 2
    np.testing.assert_equal(a.cpu().numpy(), [3, 6, 6, 8])

  def test_chained_slice_column(self):
    a = torch.arange(16, dtype=torch.float32, device=device).reshape(4, 4)
    torch_res = a[:, 1:2][:, 0:1].cpu().numpy()
    cpu_res = torch.arange(16, dtype=torch.float32).reshape(4, 4)[:, 1:2][:, 0:1].numpy()
    np.testing.assert_equal(torch_res, cpu_res)

  def test_slice_with_step(self):
    a = torch.arange(20, dtype=torch.float32, device=device)
    torch_res = a[::2][1:4].cpu().numpy()
    cpu_res = torch.arange(20, dtype=torch.float32)[::2][1:4].numpy()
    np.testing.assert_equal(torch_res, cpu_res)

  def test_slice_negative_dim(self):
    a = torch.arange(13, dtype=torch.int32, device=device).repeat(8, 1)
    torch_chunks = a.chunk(3, -1)
    cpu_chunks = torch.arange(13, dtype=torch.int32).repeat(8, 1).chunk(3, -1)
    assert len(torch_chunks) == len(cpu_chunks)
    for i in range(len(torch_chunks)):
      np.testing.assert_equal(torch_chunks[i].cpu().numpy(), cpu_chunks[i].numpy())

  def test_dot_vector_matrix(self):
    a = torch.arange(65, dtype=torch.float32, device=device)
    b = torch.arange(65*45, dtype=torch.float32, device=device).reshape(65, 45)
    torch_res = a.matmul(b).reshape(-1).cpu().numpy()
    cpu_res = torch.arange(65, dtype=torch.float32).matmul(torch.arange(65*45, dtype=torch.float32).reshape(65, 45)).numpy()
    np.testing.assert_equal(torch_res, cpu_res)

  def test_alias_passthrough(self):
    a = torch.randn(3, 3, device=device)
    alias_view = torch.ops.aten.alias(a)
    alias_view += 1
    np.testing.assert_equal(a.cpu().numpy(), alias_view.cpu().numpy())

  def test_split_simple_vector(self):
    a = torch.arange(10, dtype=torch.float32, device=device)
    torch_chunks = a.split([1,4,5])
    cpu_chunks = torch.arange(10, dtype=torch.float32).split([1,4,5])
    for tc, cc in zip(torch_chunks, cpu_chunks):
      np.testing.assert_equal(tc.cpu().numpy(), cc.cpu().numpy())

  def test_split_matches_torch(self):
    a = torch.arange(10, dtype=torch.float32, device=device)
    torch_chunks = a.split([1,4,5])
    tiny_chunks = [chunk.cpu().numpy() for chunk in torch_chunks]
    cpu_chunks = [torch.arange(10, dtype=torch.float32).split([1,4,5])[i].numpy() for i in range(3)]
    for tr, cr in zip(tiny_chunks, cpu_chunks):
      np.testing.assert_equal(tr, cr)

  def test_sum_matches_torch(self):
    a = torch.arange(6, dtype=torch.float32, device=device).reshape(2,3)
    torch_res = a.sum().cpu().numpy()
    cpu_res = torch.arange(6, dtype=torch.float32).reshape(2,3).sum().numpy()
    np.testing.assert_equal(torch_res, cpu_res)

  def test_view_matches_torch(self):
    a = torch.arange(6, dtype=torch.float32, device=device)
    torch_res = a.view(2, 3).cpu().numpy()
    cpu_res = torch.arange(6, dtype=torch.float32).view(2, 3).numpy()
    np.testing.assert_equal(torch_res, cpu_res)
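  # Sketch (added for illustration; the helper name is hypothetical and not part
  # of the original suite): the "*_matches_torch" tests above all follow the
  # same device-vs-CPU comparison pattern, which could be factored as:
  def _assert_matches_cpu_sketch(self, fn):
    # run the same op on the tiny device and on CPU, then compare via numpy
    a = torch.arange(6, dtype=torch.float32, device=device)
    b = torch.arange(6, dtype=torch.float32)
    np.testing.assert_equal(fn(a).cpu().numpy(), fn(b).numpy())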
  def test_view_zero_with_indices(self):
    a = torch.tensor([1, 2, 3, 4], device=device)
    a[1:3].zero_()
    np.testing.assert_equal(a.cpu().numpy(), [1, 0, 0, 4])

  def test_view_fill_with_indices(self):
    a = torch.tensor([1, 2, 3, 4], device=device)
    a[::2].fill_(9)
    np.testing.assert_equal(a.cpu().numpy(), [9, 2, 9, 4])

  def test_nested_slice_inplace_ops(self):
    a = torch.tensor([1, 2, 3, 4, 5, 6], device=device)
    a[:3] += 10
    a[3:] *= 2
    np.testing.assert_equal(a.cpu().numpy(), [11, 12, 13, 8, 10, 12])

  def test_diag_1d(self):
    a = torch.tensor([1, 2, 3], device=device)
    result = torch.diag(a)
    expected = [[1, 0, 0], [0, 2, 0], [0, 0, 3]]
    np.testing.assert_equal(result.cpu().numpy(), expected)

  def test_diag_backward(self):
    a = torch.randn(5, dtype=torch.float32, device=device, requires_grad=True)
    b = torch.diag(a)
    b.sum().backward()
    assert a.grad is not None

  def test_diagonal(self):
    a = torch.tensor([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]], dtype=torch.float32, device=device, requires_grad=True)
    b = torch.diagonal(a)
    expected = torch.tensor([1., 5., 9.], dtype=torch.float32)
    self.assertEqual(b.shape, (3,))
    np.testing.assert_allclose(b.detach().cpu().numpy(), expected.numpy(), rtol=1e-5)

  def test_diagonal_backward(self):
    a = torch.randn(5, 5, dtype=torch.float32, device=device, requires_grad=True)
    b = torch.diagonal(a)
    b.sum().backward()
    assert a.grad is not None

  def test_expand_backward(self):
    a = torch.randn(4, 3, 1, 6, dtype=torch.float32, device=device, requires_grad=True)
    b = a.expand(4, 3, 2, 6)
    b.sum().backward()
    assert a.grad is not None

  def test_einsum_backward(self):
    a = torch.randn(10, 10, dtype=torch.float32, device=device, requires_grad=True)
    b = torch.einsum('ij->ji', a)
    b.sum().backward()
    assert a.grad is not None

  def test_diag_backward_gradient_values(self):
    a = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32, device=device, requires_grad=True)
    b = torch.diag(a)
    loss = b.sum()
    loss.backward()
    expected_grad = torch.ones(3, dtype=torch.float32)
    np.testing.assert_allclose(a.grad.cpu().numpy(), expected_grad.numpy(), rtol=1e-5)

  def test_diag_backward_gradient_values_2d_to_1d(self):
    a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], dtype=torch.float32, device=device, requires_grad=True)
    b = torch.diagonal(a)
    loss = b.sum()
    loss.backward()
    expected_grad = torch.tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]], dtype=torch.float32)
    np.testing.assert_allclose(a.grad.cpu().numpy(), expected_grad.numpy(), rtol=1e-5)

  def test_expand_backward_gradient_values(self):
    a = torch.tensor([[1.0], [2.0], [3.0]], dtype=torch.float32, device=device, requires_grad=True)
    b = a.expand(3, 4)
    loss = b.sum()
    loss.backward()
    expected_grad = torch.tensor([[4.0], [4.0], [4.0]], dtype=torch.float32)
    np.testing.assert_allclose(a.grad.cpu().numpy(), expected_grad.numpy(), rtol=1e-5)

  def test_expand_backward_with_leading_dims(self):
    a = torch.tensor([[1.0, 2.0]], dtype=torch.float32, device=device, requires_grad=True)
    b = a.expand(3, 1, 2)
    loss = b.sum()
    loss.backward()
    expected_grad = torch.tensor([[3.0, 3.0]], dtype=torch.float32)
    np.testing.assert_allclose(a.grad.cpu().numpy(), expected_grad.numpy(), rtol=1e-5)
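  # Added note: expand only broadcasts, so its backward sums the incoming
  # gradient over every expanded dim. A (3,1) tensor expanded to (3,4) therefore
  # gets a gradient of 4 per element from sum(), and a (1,2) tensor expanded to
  # (3,1,2) gets 3 per element, matching the expected values above.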
  def test_diag_2d_to_1d_backward(self):
    a = torch.tensor([[1.0, 2.0], [3.0, 4.0]], dtype=torch.float32, device=device, requires_grad=True)
    b = torch.diag(a)
    loss = b.sum()
    loss.backward()
    expected_grad = torch.tensor([[1.0, 0.0], [0.0, 1.0]], dtype=torch.float32)
    np.testing.assert_allclose(a.grad.cpu().numpy(), expected_grad.numpy(), rtol=1e-5)

  def test_expand_complex_backward(self):
    a = torch.tensor([[[1.0, 2.0]]], dtype=torch.float32, device=device, requires_grad=True)
    b = a.expand(2, 3, 2)
    loss = b.sum()
    loss.backward()
    expected_grad = torch.tensor([[[6.0, 6.0]]], dtype=torch.float32)
    np.testing.assert_allclose(a.grad.cpu().numpy(), expected_grad.numpy(), rtol=1e-5)

  def test_diag_backward_with_scaling(self):
    a = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32, device=device, requires_grad=True)
    b = torch.diag(a)
    loss = (b * torch.tensor([[2.0, 0.0, 0.0], [0.0, 3.0, 0.0], [0.0, 0.0, 4.0]], device=device)).sum()
    loss.backward()
    expected_grad = torch.tensor([2.0, 3.0, 4.0], dtype=torch.float32)
    np.testing.assert_allclose(a.grad.cpu().numpy(), expected_grad.numpy(), rtol=1e-5)

  def test_repeat_basic(self):
    a = torch.tensor([1, 2, 3], dtype=torch.float32, device=device)
    b = a.repeat(2, 1)
    expected = torch.tensor([[1, 2, 3], [1, 2, 3]], dtype=torch.float32)
    np.testing.assert_equal(b.cpu().numpy(), expected.numpy())

  def test_repeat_multidim(self):
    a = torch.arange(6, dtype=torch.float32, device=device).reshape(2, 3)
    b = a.repeat(2, 3)
    expected = torch.arange(6, dtype=torch.float32).reshape(2, 3).repeat(2, 3)
    np.testing.assert_equal(b.cpu().numpy(), expected.numpy())

  def test_repeat_backward(self):
    a = torch.tensor([[1.0, 2.0]], dtype=torch.float32, device=device, requires_grad=True)
    b = a.repeat(3, 2)
    loss = b.sum()
    loss.backward()
    expected_grad = torch.tensor([[6.0, 6.0]], dtype=torch.float32)
    np.testing.assert_allclose(a.grad.cpu().numpy(), expected_grad.numpy(), rtol=1e-5)

  def test_cumsum_1d(self):
    a = torch.tensor([1, 2, 3, 4], dtype=torch.float32, device=device)
    b = torch.cumsum(a, dim=0)
    expected = torch.tensor([1, 3, 6, 10], dtype=torch.float32)
    np.testing.assert_equal(b.cpu().numpy(), expected.numpy())

  def test_cumsum_2d(self):
    a = torch.arange(12, dtype=torch.float32, device=device).reshape(3, 4)
    b = torch.cumsum(a, dim=0)
    expected = torch.arange(12, dtype=torch.float32).reshape(3, 4).cumsum(dim=0)
    np.testing.assert_equal(b.cpu().numpy(), expected.numpy())
    c = torch.cumsum(a, dim=1)
    expected = torch.arange(12, dtype=torch.float32).reshape(3, 4).cumsum(dim=1)
    np.testing.assert_equal(c.cpu().numpy(), expected.numpy())

  def test_cumsum_backward(self):
    a = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32, device=device, requires_grad=True)
    b = torch.cumsum(a, dim=0)
    loss = b.sum()
    loss.backward()
    expected_grad = torch.tensor([4.0, 3.0, 2.0, 1.0], dtype=torch.float32)
    np.testing.assert_allclose(a.grad.cpu().numpy(), expected_grad.numpy(), rtol=1e-5)
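  # Added note: for loss = sum(cumsum(a)), element a[i] contributes to every
  # prefix sum from position i onward, so d(loss)/d(a[i]) = n - i. With n=4 that
  # gives the [4, 3, 2, 1] gradient checked above (equivalently, the backward of
  # cumsum is a reversed cumsum of the incoming gradient).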
  def test_constant_pad_nd_1d(self):
    a = torch.tensor([1, 2, 3], dtype=torch.float32, device=device)
    b = torch.nn.functional.pad(a, (1, 2), mode='constant', value=0)
    expected = torch.tensor([0, 1, 2, 3, 0, 0], dtype=torch.float32)
    np.testing.assert_equal(b.cpu().numpy(), expected.numpy())

  def test_constant_pad_nd_2d(self):
    a = torch.arange(6, dtype=torch.float32, device=device).reshape(2, 3)
    b = torch.nn.functional.pad(a, (1, 1, 1, 1), mode='constant', value=0)
    expected = torch.nn.functional.pad(torch.arange(6, dtype=torch.float32).reshape(2, 3), (1, 1, 1, 1), mode='constant', value=0)
    np.testing.assert_equal(b.cpu().numpy(), expected.numpy())

  def test_constant_pad_nd_2d_backward(self):
    a = torch.tensor([[1.0, 2.0], [3.0, 4.0]], dtype=torch.float32, device=device, requires_grad=True)
    b = torch.nn.functional.pad(a, (1, 1, 1, 1), mode='constant', value=0)
    loss = b.sum()
    loss.backward()
    expected_grad = torch.ones((2, 2), dtype=torch.float32)
    np.testing.assert_allclose(a.grad.cpu().numpy(), expected_grad.numpy(), rtol=1e-5)

  def test_negative_strides_cumsum_backward(self):
    a = torch.randn(5, device=device, requires_grad=True)
    b = torch.cumsum(a, dim=0)
    b.sum().backward()
    grad = a.grad.cpu().numpy()
    self.assertEqual(len(grad), 5)

  def test_cumsum_fix_gradient_values(self):
    a = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32, device=device, requires_grad=True)
    b = torch.cumsum(a, dim=0)
    loss = b.sum()
    loss.backward()
    expected = np.array([4.0, 3.0, 2.0, 1.0])
    np.testing.assert_allclose(a.grad.cpu().numpy(), expected, rtol=1e-5)

  def test_cumsum_arange_large(self):
    # Tests cumsum with an unrealized arange input whose size exceeds 512 (the split threshold).
    # This exercises the _split_cumalu path, which uses a two-stage algorithm.
    for size in [513, 1022]:
      a = torch.arange(size, dtype=torch.float32, device=device)
      result = torch.cumsum(a, dim=0)
      expected = torch.arange(size, dtype=torch.float32).cumsum(dim=0)
      np.testing.assert_allclose(result.cpu().numpy(), expected.numpy(), rtol=1e-5)

  def test_diag_1d_to_2d(self):
    a = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32, device=device, requires_grad=True)
    b = torch.diag(a)
    expected = [[1, 0, 0], [0, 2, 0], [0, 0, 3]]
    np.testing.assert_equal(b.detach().cpu().numpy(), expected)

  def test_diag_2d_to_1d(self):
    c = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float32, device=device)
    d = torch.diag(c)
    np.testing.assert_equal(d.cpu().numpy(), [1, 5, 9])

  def test_biased_conv2d(self):
    # Test case with two sequential conv2d sharing the same weights/bias and a ReLU
    # in between; this is a special case from test_ops.py.
    torch.manual_seed(0)
    C = 8
    x_cpu = torch.randn(1, C, 5, 5, requires_grad=True)
    w_cpu = torch.randn(C, C, 1, 1, requires_grad=True)
    b_cpu = torch.randn(C, requires_grad=True)
    x_tiny = x_cpu.detach().to(device).requires_grad_(True)
    w_tiny = w_cpu.detach().to(device).requires_grad_(True)
    b_tiny = b_cpu.detach().to(device).requires_grad_(True)
    out_cpu = torch.nn.functional.conv2d(torch.nn.functional.conv2d(x_cpu, w_cpu, b_cpu).relu(), w_cpu, b_cpu)
    out_tiny = torch.nn.functional.conv2d(torch.nn.functional.conv2d(x_tiny, w_tiny, b_tiny).relu(), w_tiny, b_tiny)
    grad_out = torch.randn_like(out_cpu)
    out_cpu.backward(grad_out)
    out_tiny.backward(grad_out.to(device))
    np.testing.assert_allclose(x_tiny.grad.cpu().numpy(), x_cpu.grad.numpy(), atol=1e-4, rtol=1e-3)
    np.testing.assert_allclose(w_tiny.grad.cpu().numpy(), w_cpu.grad.numpy(), atol=1e-4, rtol=1e-3)
    np.testing.assert_allclose(b_tiny.grad.cpu().numpy(), b_cpu.grad.numpy(), atol=1e-4, rtol=1e-3)

from tinygrad import Tensor

class TestBackendHelpers(unittest.TestCase):
  def test_calculate_storage_offset_no_shrink(self):
    t = Tensor.ones(3, 4)
    assert extra.torch_backend.backend.calculate_storage_offset(t) == 0

  def test_calculate_storage_offset_with_shrink(self):
    t = Tensor.ones(10, 10)[2:5, 3:7]
    # strides for (10, 10) are [10, 1]
    # offset = 2*10 + 3*1 = 23
    assert extra.torch_backend.backend.calculate_storage_offset(t) == 23

  def test_calculate_storage_offset_multiple_shrinks(self):
    t = Tensor.ones(5, 6, 7)[1:3, 2:4, 3:5]
    # strides for (5, 6, 7) are [42, 7, 1]
    # offset = 1*42 + 2*7 + 3*1 = 42 + 14 + 3 = 59
    assert extra.torch_backend.backend.calculate_storage_offset(t) == 59

  def test_calculate_storage_offset_with_reshape(self):
    t = Tensor.ones(10, 10)
    orig_offset = extra.torch_backend.backend.calculate_storage_offset(t)
    assert orig_offset == 0
    t = t.reshape(100)
    assert extra.torch_backend.backend.calculate_storage_offset(t) == orig_offset
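  # Sketch (added for illustration; the test name is hypothetical): the same
  # stride/offset arithmetic can be cross-checked against numpy, which exposes a
  # view's element offset through its data pointer.
  def test_storage_offset_matches_numpy_sketch(self):
    base = np.ones((10, 10))
    view = base[2:5, 3:7]
    # distance in bytes between the view's buffer start and the base's
    byte_offset = view.__array_interface__['data'][0] - base.__array_interface__['data'][0]
    assert byte_offset // base.itemsize == 2*10 + 3*1  # == 23, as asserted above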
  def test_slice_values_match_torch(self):
    torch_cpu = torch.arange(100, dtype=torch.float32).reshape(10, 10)
    torch_tiny = torch_cpu.to(device)
    sliced_cpu = torch_cpu[2:5, 3:7]
    sliced_tiny = torch_tiny[2:5, 3:7]
    np.testing.assert_equal(sliced_tiny.cpu().numpy(), sliced_cpu.numpy())

  def test_slice_values_match_torch_3d(self):
    torch_cpu_3d = torch.arange(210, dtype=torch.float32).reshape(5, 6, 7)
    torch_tiny_3d = torch_cpu_3d.to(device)
    sliced_cpu_3d = torch_cpu_3d[1:3, 2:4, 3:5]
    sliced_tiny_3d = torch_tiny_3d[1:3, 2:4, 3:5]
    np.testing.assert_equal(sliced_tiny_3d.cpu().numpy(), sliced_cpu_3d.numpy())

  def test_topk_out(self):
    a = torch.tensor([1, 3, 2, 4], device=device)
    values = torch.empty(2, device=device)
    indices = torch.empty(2, dtype=torch.int64, device=device)
    ret_values, ret_indices = torch.topk(a, k=2, out=(values, indices))
    np.testing.assert_equal(values.cpu().numpy(), [4, 3])
    np.testing.assert_equal(indices.cpu().numpy(), [3, 1])
    assert ret_values is values
    assert ret_indices is indices

  def test_sort_out(self):
    a = torch.tensor([3, 1, 4, 2], device=device)
    values = torch.empty(4, device=device)
    indices = torch.empty(4, dtype=torch.int64, device=device)
    ret_values, ret_indices = torch.sort(a, out=(values, indices))
    np.testing.assert_equal(values.cpu().numpy(), [1, 2, 3, 4])
    np.testing.assert_equal(indices.cpu().numpy(), [1, 3, 0, 2])
    assert ret_values is values
    assert ret_indices is indices

  def test_cat_out(self):
    a = torch.tensor([1, 2], device=device)
    b = torch.tensor([3, 4], device=device)
    out = torch.empty(4, device=device)
    ret = torch.cat([a, b], out=out)
    np.testing.assert_equal(out.cpu().numpy(), [1, 2, 3, 4])
    assert ret is out

  def test_scatter_add_out(self):
    src = torch.tensor([[1, 2, 3], [4, 5, 6]], device=device, dtype=torch.float32)
    index = torch.tensor([[0, 1, 2], [0, 1, 2]], device=device)
    input = torch.zeros(3, 3, device=device, dtype=torch.float32)
    out = torch.zeros(3, 3, device=device, dtype=torch.float32)
    ret = torch.scatter_add(input, 0, index, src, out=out)
    expected = torch.tensor([[5, 0, 0], [0, 7, 0], [0, 0, 9]], dtype=torch.float32)
    np.testing.assert_allclose(out.cpu().numpy(), expected.cpu().numpy())
    assert ret is out

  def test_floor_divide_inplace_identity(self):
    x = torch.tensor([10, 20, 30, 40], dtype=torch.int32, device=device)
    y = torch.tensor([2, 4, 5, 8], dtype=torch.int32, device=device)
    ret = x.floor_divide_(y)
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [5, 5, 6, 5])

  def test_lshift_inplace_identity(self):
    x = torch.tensor([1, 2, 3, 4], dtype=torch.int32, device=device)
    ret = x.__ilshift__(2)
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [4, 8, 12, 16])

  def test_rshift_inplace_identity(self):
    x = torch.tensor([16, 32, 48, 64], dtype=torch.int32, device=device)
    ret = x.__irshift__(2)
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [4, 8, 12, 16])

  def test_relu_inplace_identity(self):
    x = torch.tensor([-1.0, 2.0, -3.0, 4.0], device=device)
    ret = x.relu_()
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [0.0, 2.0, 0.0, 4.0])
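  # Added note on the "_identity" tests above and below: PyTorch's convention is
  # that an in-place op (trailing underscore) mutates its input and returns the
  # very same tensor object, so each test asserts `ret is x` to catch a backend
  # that accidentally materializes a fresh tensor.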
  def test_random_inplace_identity(self):
    x = torch.zeros(10, dtype=torch.int32, device=device)
    ret = x.random_()
    assert ret is x
    assert x.shape == (10,)

  def test_random_from_inplace_identity(self):
    x = torch.zeros(10, dtype=torch.int32, device=device)
    ret = x.random_(5, 10)
    assert ret is x
    # values should be in range [5, 10)
    assert torch.all(x >= 5).item() and torch.all(x < 10).item()

  def test_uniform_inplace_identity(self):
    x = torch.zeros(10, device=device)
    ret = x.uniform_(0.0, 1.0)
    assert ret is x
    # values should be in range [0, 1)
    assert torch.all(x >= 0.0).item() and torch.all(x < 1.0).item()

  def test_normal_inplace_identity(self):
    x = torch.zeros(100, device=device)
    ret = x.normal_(0.0, 1.0)
    assert ret is x
    # just check that values changed from zeros
    assert not torch.all(x == 0.0).item()

  def test_logical_or_inplace_identity(self):
    x = torch.tensor([True, False, True, False], device=device)
    y = torch.tensor([False, False, True, True], device=device)
    ret = x.logical_or_(y)
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [True, False, True, True])

  def test_masked_fill_scalar_inplace_identity(self):
    x = torch.tensor([1.0, 2.0, 3.0, 4.0], device=device)
    mask = torch.tensor([True, False, True, False], device=device)
    ret = x.masked_fill_(mask, 0.0)
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [0.0, 2.0, 0.0, 4.0])

  def test_masked_fill_tensor_inplace_identity(self):
    x = torch.tensor([1.0, 2.0, 3.0, 4.0], device=device)
    mask = torch.tensor([True, False, True, False], device=device)
    value = torch.tensor(99.0, device=device)
    ret = x.masked_fill_(mask, value)
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [99.0, 2.0, 99.0, 4.0])

  def test_zero_inplace_identity(self):
    x = torch.tensor([1.0, 2.0, 3.0, 4.0], device=device)
    ret = x.zero_()
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [0.0, 0.0, 0.0, 0.0])

  def test_fill_scalar_inplace_identity(self):
    x = torch.tensor([1.0, 2.0, 3.0, 4.0], device=device)
    ret = x.fill_(5.0)
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [5.0, 5.0, 5.0, 5.0])

  def test_fill_tensor_inplace_identity(self):
    x = torch.tensor([1.0, 2.0, 3.0, 4.0], device=device)
    value = torch.tensor(7.0, device=device)
    ret = x.fill_(value)
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [7.0, 7.0, 7.0, 7.0])

  def test_add_tensor_inplace_identity(self):
    x = torch.tensor([1.0, 2.0, 3.0, 4.0], device=device)
    y = torch.tensor([10.0, 20.0, 30.0, 40.0], device=device)
    ret = x.add_(y)
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [11.0, 22.0, 33.0, 44.0])

  def test_add_scalar_inplace_identity(self):
    x = torch.tensor([1.0, 2.0, 3.0, 4.0], device=device)
    ret = x.add_(10.0)
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [11.0, 12.0, 13.0, 14.0])

  def test_mul_tensor_inplace_identity(self):
    x = torch.tensor([1.0, 2.0, 3.0, 4.0], device=device)
    y = torch.tensor([2.0, 3.0, 4.0, 5.0], device=device)
    ret = x.mul_(y)
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [2.0, 6.0, 12.0, 20.0])

  def test_mul_scalar_inplace_identity(self):
    x = torch.tensor([1.0, 2.0, 3.0, 4.0], device=device)
    ret = x.mul_(2.0)
    assert ret is x
    np.testing.assert_equal(x.cpu().numpy(), [2.0, 4.0, 6.0, 8.0])

if __name__ == "__main__":
  unittest.main()