move bf16 cast hack to Tensor.llvm_bf16_cast (#3788)

chenyu authored 2024-03-17 18:51:22 -04:00, committed by GitHub
parent 311cf2b7d3
commit 639bd5dbfc
5 changed files with 13 additions and 16 deletions

View File

@@ -34,7 +34,7 @@ if __name__ == "__main__":
part2 = nn.state.torch_load(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/pytorch_model-00002-of-00002.bin?download=true"))
# fix bf16, TODO: check if device supports bf16
-def fix_bf16(weights): return {k:v.llvm().cast(dtypes.float16).to(Device.DEFAULT) if v.dtype == dtypes.bfloat16 else v for k,v in weights.items()}
+def fix_bf16(weights): return {k:v.llvm_bf16_cast(dtypes.half).to(v.device) if v.dtype == dtypes.bfloat16 else v for k,v in weights.items()}
with Timing("weights -> model: "):
nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part1, model, 32, 8)), strict=False)
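For readers unfamiliar with the trick fix_bf16 now delegates to llvm_bf16_cast: a bfloat16 value is just the upper 16 bits of a float32, so widening it is a zero-padded 16-bit shift before the final cast to half. A minimal numpy sketch of that bit manipulation (the 0x3FC0 example value is illustrative, not from the diff):

import numpy as np

def bf16_bits_to_f16(raw_bf16: np.ndarray) -> np.ndarray:
  # raw_bf16 holds bfloat16 bit patterns as uint16; shifting them into the high half
  # of a uint32 and viewing the result as float32 pads the mantissa with zeros
  as_f32 = (raw_bf16.astype(np.uint32) << 16).view(np.float32)
  return as_f32.astype(np.float16)   # then narrow to half, as fix_bf16 does

print(bf16_bits_to_f16(np.array([0x3FC0], dtype=np.uint16)))  # [1.5] -- 0x3FC0 is the bfloat16 pattern for 1.5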

View File

@@ -117,7 +117,7 @@ def _test_ops(a_dtype:DType, b_dtype:DType, target_dtype=None):
_assert_eq(Tensor([[1,2],[3,4]], dtype=a_dtype)@Tensor.eye(2, dtype=b_dtype), target_dtype, [[1,2],[3,4]])
_assert_eq(Tensor([1,1,1,1], dtype=a_dtype)+Tensor.ones((4,4), dtype=b_dtype), target_dtype, 2*Tensor.ones(4,4).numpy())
-@unittest.skipUnless(Device.DEFAULT in ["LLVM", "HIP", "METAL"], "bfloat16 not supported")
+@unittest.skipUnless(is_dtype_supported(dtypes.bfloat16), "bfloat16 not supported")
class TestBFloat16(unittest.TestCase):
def test_bf16_creation_numpy(self):
data = [-1, 1, 2]
@@ -127,21 +127,19 @@ class TestBFloat16(unittest.TestCase):
assert tnp.dtype == np.float32
np.testing.assert_allclose(tnp, np.array(data))
@unittest.skipIf(Device.DEFAULT=="LLVM", "no LLVM bf16 buffer")
def test_bf16_ones(self):
# TODO: fix this with correct bfloat16 cast
t = Tensor.ones(3, 5, dtype=dtypes.bfloat16)
assert t.dtype == dtypes.bfloat16
np.testing.assert_allclose(t.numpy(), np.ones((3, 5)))
@unittest.skipIf(Device.DEFAULT=="LLVM", "no LLVM bf16 buffer")
def test_bf16_eye(self):
# TODO: fix this with correct bfloat16 cast
t = Tensor.eye(3, dtype=dtypes.bfloat16)
assert t.dtype == dtypes.bfloat16
np.testing.assert_allclose(t.numpy(), np.eye(3))
-@unittest.skipUnless(Device.DEFAULT in ["LLVM", "HIP"], "bfloat16 not supported")
+@unittest.skipUnless(is_dtype_supported(dtypes.bfloat16), "bfloat16 not supported")
class TestBFloat16DType(unittest.TestCase):
def test_bf16_to_float(self):
_test_cast(Tensor([100000], dtype=dtypes.bfloat16), dtypes.float32)
@@ -157,7 +155,7 @@ class TestBFloat16DType(unittest.TestCase):
back = t.cast(dtypes.float32)
assert tuple(back.numpy().tolist()) == (9984., -1, -1000, -9984, 20)
-@unittest.skipUnless(Device.DEFAULT in ["HIP"], "bfloat16 not supported")
+@unittest.skipUnless(is_dtype_supported(dtypes.bfloat16), "bfloat16 not supported")
class TestBFloat16DTypeCast(unittest.TestCase):
def test_f16_to_bf16_conversion(self):
original_tensor = Tensor([1.0, 2.0, 3.0], dtype=dtypes.float16)
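The expected values (9984., -1, -1000, -9984, 20) in the context above fall out of bfloat16's 7-bit mantissa: 10000 is not representable, so keeping only the top 16 bits of its float32 pattern lands on 9984. A standalone sketch using only the standard library, not tinygrad:

import struct

def bf16_truncate(x: float) -> float:
  bits = struct.unpack("<I", struct.pack("<f", x))[0]                    # float32 bit pattern
  return struct.unpack("<f", struct.pack("<I", bits & 0xFFFF0000))[0]    # keep only the top 16 bits

print([bf16_truncate(v) for v in [10000, -1, -1000, -10000, 20]])
# [9984.0, -1.0, -1000.0, -9984.0, 20.0] -- the values the asserts above expect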

View File

@@ -216,7 +216,7 @@ class TestDiskTensor(unittest.TestCase):
@unittest.skipIf(Device.DEFAULT == "RHIP", "no real HIP device exists in CI")
def test_bf16_disk_write_read(self):
-t = Tensor([10000, -1, -1000, -10000, 20]).cast(dtypes.float32)
+t = Tensor([10000, -1, -1000, -10000, 20], dtype=dtypes.float32)
t.to(f"disk:{temp('f32')}").realize()
# hack to "cast" f32 -> bf16
@@ -224,9 +224,8 @@ class TestDiskTensor(unittest.TestCase):
adat = b''.join([dat[i+2:i+4] for i in range(0, len(dat), 4)])
with open(temp('bf16'), "wb") as f: f.write(adat)
-t = Tensor.empty(5, dtype=dtypes.bfloat16, device=f"disk:{temp('bf16')}").llvm().realize()
-back = t.cast(dtypes.float32)
-assert tuple(back.numpy().tolist()) == (9984., -1, -1000, -9984, 20)
+t = Tensor.empty(5, dtype=dtypes.bfloat16, device=f"disk:{temp('bf16')}").llvm_bf16_cast(dtypes.float)
+assert t.numpy().tolist() == [9984., -1, -1000, -9984, 20]
if __name__ == "__main__":
unittest.main()
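The byte-slicing "hack to cast f32 -> bf16" in this test works because a little-endian float32 stores its high-order (bfloat16) bytes last, so taking bytes 2..3 of every 4-byte value yields a raw bf16 stream. A numpy sketch of the same round trip, assuming a little-endian host as the test does:

import numpy as np

f32 = np.array([10000, -1, -1000, -10000, 20], dtype=np.float32)
dat = f32.tobytes()                                                  # native (little-endian) float32 byte stream
bf16_bytes = b"".join(dat[i+2:i+4] for i in range(0, len(dat), 4))   # keep the upper 2 bytes of each value

# reading back: widen each 16-bit pattern into the high half of a float32
back = (np.frombuffer(bf16_bytes, dtype=np.uint16).astype(np.uint32) << 16).view(np.float32)
print(back.tolist())  # [9984.0, -1.0, -1000.0, -9984.0, 20.0]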

View File

@@ -91,8 +91,8 @@ def torch_load(fn:str) -> Dict[str, Tensor]:
if tuple(permute_indexes) != tuple(range(len(permute_indexes))):
intermediate_shape = tuple([shape_strides[x][0] for x in argsort(permute_indexes)])
assert tuple([shape_strides[i][1] for i in argsort(permute_indexes)]) == strides_for_shape(intermediate_shape), "nonpermutable strides"
-if DEBUG >= 3: print(f"WARNING: this torch load is slow. CPU to permute {intermediate_shape} with {permute_indexes}")
-assert storage[1] != dtypes.bfloat16, "can't CPU permute BF16"
+if DEBUG >= 3: print(f"WARNING: this torch load is slow. CLANG to permute {intermediate_shape} with {permute_indexes}")
+assert storage[1] != dtypes.bfloat16, "can't CLANG permute BF16"
# TODO: find a nice way to support all shapetracker on disktensors
# TODO: BUG: a ".realize()" is needed here for 'GPU=1 python3 test/models/test_efficientnet.py TestEfficientNet.test_car'
ret = ret.clang().reshape(intermediate_shape).permute(permute_indexes).realize()
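Context for the unchanged lines above: when a checkpoint stores a permuted view, torch_load inverts the permutation with argsort to recover the pre-permute shape, then asserts that the saved strides are exactly the contiguous strides of that shape. A self-contained sketch with simplified local stand-ins for argsort and strides_for_shape and made-up example values:

def argsort(xs): return sorted(range(len(xs)), key=xs.__getitem__)

def strides_for_shape(shape):
  # contiguous row-major strides (simplified stand-in for the real helper)
  strides, acc = [], 1
  for s in reversed(shape):
    strides.insert(0, acc)
    acc *= s
  return tuple(strides)

permute_indexes = (1, 0)
shape_strides = [(3, 1), (4, 3)]                                   # a permuted view of a contiguous (4, 3) tensor
inverse = argsort(permute_indexes)                                 # [1, 0]
intermediate_shape = tuple(shape_strides[x][0] for x in inverse)   # (4, 3): the pre-permute shape
assert tuple(shape_strides[i][1] for i in inverse) == strides_for_shape(intermediate_shape)
print(intermediate_shape)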

View File

@@ -993,11 +993,11 @@ class Tensor:
# ***** cast ops *****
-def cast(self, dtype:DType) -> Tensor:
-  if self.dtype == dtype: return self
+def llvm_bf16_cast(self, dtype:DType):
# hack for devices that don't support bfloat16
-  if self.dtype == dtypes.bfloat16: return self.bitcast(dtypes.uint16).cast(dtypes.uint32).mul(1<<16).bitcast(dtypes.float32).cast(dtype)
-  return mlops.Cast.apply(self, dtype=dtype)
+  assert self.dtype == dtypes.bfloat16
+  return self.to("LLVM").bitcast(dtypes.uint16).cast(dtypes.uint32).mul(1<<16).bitcast(dtypes.float32).cast(dtype)
+def cast(self, dtype:DType) -> Tensor: return self if self.dtype == dtype else mlops.Cast.apply(self, dtype=dtype)
def bitcast(self, dtype:DType) -> Tensor:
assert self.dtype.itemsize == dtype.itemsize, "can't bitcast mismatched dtype itemsizes"
return mlops.Cast.apply(self, dtype=dtype, bitcast=True) if self.dtype != dtype else self
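Finally, a hedged usage sketch of the new method, mirroring the updated disk-tensor test above; the top-level import and the disk path are assumptions, and an LLVM build of tinygrad must be available:

from tinygrad import Tensor, dtypes

# "/tmp/weights.bf16" is a hypothetical file that already holds 5 raw bfloat16 values
weights = Tensor.empty(5, dtype=dtypes.bfloat16, device="disk:/tmp/weights.bf16")
as_f32 = weights.llvm_bf16_cast(dtypes.float32)   # round-trips through LLVM, widening bf16 -> float32
print(as_f32.numpy())

Moving the hack out of cast means the generic cast path no longer silently routes bfloat16 through a lossy bit trick: callers on devices without bf16 buffer support now opt into the LLVM round trip explicitly.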