From 03b367c014847e671c754879ee7bf5f1f312e2ae Mon Sep 17 00:00:00 2001
From: chenyu
Date: Mon, 17 Jun 2024 21:12:52 -0400
Subject: [PATCH] handle float16 overflow in PYTHON (#5022)

* handle float16 overflow in PYTHON

use `truncate` when constructing tensor from list to make sure all
values are packable (might be slow, but should be correct).

add truncate_fp16 to cast overflowed values to inf/-inf.

* all valid fmt supports truncate
---
 test/test_dtype.py             |  1 -
 test/test_tensor.py            | 19 ++++++++++++++++++-
 tinygrad/ops.py                | 13 ++++++++++---
 tinygrad/runtime/ops_python.py |  4 +++-
 tinygrad/tensor.py             |  5 +++--
 5 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/test/test_dtype.py b/test/test_dtype.py
index 6aaa5eaa18..c9c7de9984 100644
--- a/test/test_dtype.py
+++ b/test/test_dtype.py
@@ -544,7 +544,6 @@ class TestAutoCastType(unittest.TestCase):
     assert (Tensor([0, 1], dtype=dtypes.float32)).sum().dtype == dtypes.float32
     assert (Tensor([0, 1], dtype=dtypes.float64)).sum().dtype == dtypes.float64
 
-  @unittest.skipIf(Device.DEFAULT == "PYTHON", "TODO: support inf to half in PYTHON backend")
   @unittest.skipUnless(is_dtype_supported(dtypes.float16), "need float16")
   def test_sum_acc_dtype(self):
     t = Tensor([40000, 40000], dtype=dtypes.float16)
diff --git a/test/test_tensor.py b/test/test_tensor.py
index e3ee3f903c..39380452e6 100644
--- a/test/test_tensor.py
+++ b/test/test_tensor.py
@@ -1,10 +1,11 @@
 import numpy as np
 import torch
-import unittest, copy, mmap, random
+import unittest, copy, mmap, random, math
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.helpers import getenv, temp, CI
 from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
 from hypothesis import given, settings, strategies as strat
+from test.helpers import is_dtype_supported
 
 settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
 settings.load_profile("my_profile")
@@ -302,6 +303,22 @@ class TestTinygrad(unittest.TestCase):
       data = _generate_data(depth)
       np.testing.assert_allclose(Tensor(data).numpy(), np.array(data))
 
+  def test_tensor_list_special_values(self):
+    if is_dtype_supported(dtypes.float16):
+      data = [math.nan, -math.inf, 65504, 65519, 65519.999, 65520, 65520.1]
+      data = data + [-x for x in data]
+      np.testing.assert_allclose(Tensor(data, dtype=dtypes.float16).numpy(), np.array(data, dtype=np.float16))
+
+    # uint32
+    data = [1 << 33, 1 << 32, 1 << 32 - 1, 1]
+    data = data + [-x for x in data]
+    np.testing.assert_allclose(Tensor(data, dtype=dtypes.uint32).numpy(), np.array(data, dtype=np.uint32))
+
+    # int32
+    data = [1 << 33, 1 << 32, 1 << 32 - 1, 1]
+    data = data + [-x for x in data]
+    np.testing.assert_allclose(Tensor(data, dtype=dtypes.int32).numpy(), np.array(data, dtype=np.int32))
+
   def test_tensor_bytes(self):
     data = b"abc123"
     t = Tensor(data)
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index acae977351..37daea008a 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from typing import Union, Tuple, Any, List, Dict, Callable
-import functools, hashlib, math, operator, ctypes
+import functools, hashlib, math, operator, ctypes, struct
 from enum import Enum, auto
 from dataclasses import dataclass
 from tinygrad.helpers import prod, dedup
@@ -128,9 +128,16 @@ python_alu = {
   BinaryOps.MOD: lambda x,y: abs(int(x))%abs(int(y))*(1,-1)[x<0],
   BinaryOps.IDIV: lambda x, y: int(x/y) if y != 0 else x*math.inf,
   TernaryOps.WHERE: lambda x,y,z: y if x else z}
+def truncate_fp16(x):
+  try:
+    x = float(x)
+    struct.pack("@e", x)
+    return x
+  except OverflowError: return x * math.inf
+
 truncate: Dict[DType, Callable] = {dtypes.bool: bool,
-  # TODO: float16 and bfloat16?
-  dtypes.float32: lambda x: ctypes.c_float(x).value, dtypes.float64: lambda x: ctypes.c_double(x).value,
+  # TODO: bfloat16
+  dtypes.float16: truncate_fp16, dtypes.float32: lambda x: ctypes.c_float(x).value, dtypes.float64: lambda x: ctypes.c_double(x).value,
   dtypes.uint8: lambda x: ctypes.c_uint8(x).value, dtypes.uint16: lambda x: ctypes.c_uint16(x).value,
   dtypes.uint32: lambda x: ctypes.c_uint32(x).value, dtypes.uint64: lambda x: ctypes.c_uint64(x).value,
   dtypes.int8: lambda x: ctypes.c_int8(x).value, dtypes.int16: lambda x: ctypes.c_int16(x).value,
diff --git a/tinygrad/runtime/ops_python.py b/tinygrad/runtime/ops_python.py
index 35f8ec854b..4d0b89cf5b 100644
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -7,7 +7,7 @@ from tinygrad.dtype import DType, dtypes, ImageDType
 from tinygrad.helpers import all_same, getenv, flatten
 from tinygrad.device import Compiled, Compiler, Allocator
 from tinygrad.codegen.uops import UOpGraph, UOps
-from tinygrad.ops import BinaryOps, TernaryOps, exec_alu
+from tinygrad.ops import BinaryOps, TernaryOps, exec_alu, truncate
 from tinygrad.renderer import Renderer
 from tinygrad.renderer.cstyle import CUDARenderer, MetalRenderer, AMDRenderer
 
@@ -110,6 +110,8 @@ class PythonProgram:
         if dtypes.is_int(dtype):
           overflow_adjust = 2**(dtype.itemsize*8 - 1) if not dtypes.is_unsigned(dtype) else 0
           casted = [((x + overflow_adjust) % 2**(dtype.itemsize*8) - overflow_adjust) for x in casted]
+        elif dtypes.is_float(dtype):
+          casted = [truncate.get(dtype, lambda dt: dt)(x) for x in casted]
         ul[i] = list(struct.unpack(unpack_format, struct.pack(unpack_format, *casted)))
       elif uop is UOps.LOAD:
         if isinstance(dtp[0], ImageDType):
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 9242779bd5..490131bbf6 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -11,7 +11,7 @@ from tinygrad.helpers import argfix, make_pair, flatten, prod, all_int, round_up
 from tinygrad.helpers import IMAGE, DEBUG, WINO, THREEFRY
 from tinygrad.lazy import LazyBuffer
 from tinygrad.multi import MultiLazyBuffer
-from tinygrad.ops import LoadOps
+from tinygrad.ops import LoadOps, truncate
 from tinygrad.device import Device, Buffer, BufferOptions
 from tinygrad.shape.symbolic import sint, Variable, MulNode, SumNode, NumNode, Node
 from tinygrad.engine.realize import run_schedule
@@ -51,7 +51,8 @@ def _frompy(x:Union[List, Tuple, bytes], dtype:DType) -> LazyBuffer:
   else:
     ret = LazyBuffer.loadop(LoadOps.EMPTY, get_shape(x), dtype, "PYTHON")
     assert dtype.fmt is not None, f"{dtype=} has None fmt"
-    data = struct.pack(f"@{ret.size}{dtype.fmt}", *fully_flatten(x))
+    truncate_function = truncate[dtype]
+    data = struct.pack(f"@{ret.size}{dtype.fmt}", *[truncate_function(xi) for xi in fully_flatten(x)])
   # fake realize
   ret.buffer.allocate(memoryview(data))
   del ret.srcs
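
Note (not part of the patch): truncate_fp16 leans on CPython's struct module.
"@e" packs a Python float to IEEE-754 half precision and raises OverflowError
only for finite values whose nearest fp16 value would be infinite (|x| >= 65520);
smaller values round to a finite fp16 when the buffer is packed later, e.g.
65519.999 is stored as 65504.0, which is what test_tensor_list_special_values
checks. A minimal standalone sketch of the probe, runnable outside tinygrad:

    import math, struct

    def truncate_fp16(x):
      try:
        x = float(x)
        struct.pack("@e", x)  # probe only: raises OverflowError iff no finite fp16 exists for x
        return x
      except OverflowError: return x * math.inf  # sign-preserving: 65520 -> inf, -65520 -> -inf

    for v in [65504, 65519.999, 65520, -65520.1, math.inf, math.nan]:
      y = truncate_fp16(v)
      stored = struct.unpack("@e", struct.pack("@e", y))[0]  # the fp16 value actually stored
      print(f"{v!r:>10} -> truncate {y!r} -> stored {stored!r}")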
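A second note on the integer cases in test_tensor_list_special_values (where,
by Python precedence, `1 << 32 - 1` parses as `1 << 31`): the CAST path in
ops_python.py, shown as context lines above, wraps out-of-range integers the
way a C cast would, and the ctypes entries in `truncate` do the equivalent at
tensor-construction time. A sketch of that modular arithmetic, with wrap_int
as a hypothetical stand-in name:

    def wrap_int(x, bits=32, signed=True):
      # shift signed values so the modulo result lands in [-2**(bits-1), 2**(bits-1))
      overflow_adjust = 2**(bits - 1) if signed else 0
      return (x + overflow_adjust) % 2**bits - overflow_adjust

    assert wrap_int(1 << 33, signed=False) == 0           # uint32: 2**33 wraps to 0
    assert wrap_int(-1, signed=False) == 0xFFFFFFFF       # uint32: -1 wraps around
    assert wrap_int(1 << 31, signed=True) == -(1 << 31)   # int32: 2**31 goes negative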