make the bf16 include dynamic (#3642)

* dynamic prefix

* add common ones above

these are common dtypes

aesthetics

* regression test

fuzz it

test

* run in CI

* use .append

* faster
Author: qazal
Date: 2024-03-07 17:31:35 +02:00
Committed by: GitHub
Parent: 4552248c84
Commit: bdd62c7fd8
3 changed files with 53 additions and 8 deletions
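In short: HIPLanguage previously baked the bfloat16 include into a static kernel_prefix, so every kernel paid for it. After this change, render_kernel assembles the prefix per kernel and only appends the bf16 header when a bfloat16 uop is actually present, falling back to the common float/int vector constructors otherwise. A minimal sketch of the pattern (simplified; build_prefix is a made-up stand-in, not tinygrad's API):

# Simplified illustration of the dynamic-prefix idea; build_prefix is hypothetical.
def build_prefix(uop_dtypes):
  # headers every HIP kernel needs
  prefix = ["#include <hip/hip_common.h>", "typedef long unsigned int size_t;"]
  if "bfloat16" in uop_dtypes:
    # only kernels that actually touch bfloat16 pull in the bf16 header
    prefix.append("#include <hip/amd_detail/amd_hip_bfloat16.h>")
  else:
    # everyone else just gets the common vector-dtype constructors
    prefix.append("/* make_float2 / make_float4 / make_int2 / make_int4 */")
  return "\n".join(prefix)

assert "amd_hip_bfloat16" in build_prefix({"bfloat16", "float"})
assert "amd_hip_bfloat16" not in build_prefix({"int", "float"})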

.github/workflows/test.yml

@@ -466,7 +466,7 @@ jobs:
      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --durations=20
    - name: Run pytest (hip)
      if: matrix.backend=='hip'
-     run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/imported/test_indexing.py --durations=20
+     run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/imported/test_indexing.py test/external/external_test_hip_compile.py --durations=20
  #testunicorn:
  #  name: ARM64 unicorn Test
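The new regression test is wired into the hip CI run above. To reproduce that step locally you need a HIP machine with HIP selected as the default device (e.g. via the HIP=1 environment variable), something like:

HIP=1 python -m pytest test/external/external_test_hip_compile.py --durations=20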

test/external/external_test_hip_compile.py (new file)

@@ -0,0 +1,39 @@
import time, unittest
from tinygrad.runtime.driver.hip_comgr import compile_hip
from tinygrad import Tensor
from tinygrad.device import Device
from tinygrad.realize import create_schedule
from tinygrad.codegen.linearizer import Linearizer

class TestHIPCompileSpeed(unittest.TestCase):
  @unittest.skipIf(Device.DEFAULT != "HIP", "only run on HIP")
  def test_hip_compile(self):
    a, b = Tensor([1,2,3,4,5]), Tensor([1,2,3,4,5])
    out = a + b
    lin = Linearizer(create_schedule([out.lazydata])[-1].ast)
    lin.linearize()

    reference = """
#include <hip/hip_common.h>
typedef long unsigned int size_t;
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
extern "C" __attribute__((global))void {name}(int* data0, const int* data1, const int* data2) {{
int gidx0 = __ockl_get_group_id(0); /* 5 */
int val0 = data1[gidx0];
int val1 = data2[gidx0];
data0[gidx0] = (val0+val1);
}}
"""

    def time_compile(code):
      st = time.perf_counter()
      compile_hip(code)
      return (time.perf_counter() - st) * 1000

    tinygrad_tm = min([time_compile(Device[Device.DEFAULT].compiler.render(f"test{i}", lin.uops)) for i in range(10)])
    ref_tm = min([time_compile(reference.format(name=f"test{i}")) for i in range(10)])
    print(f"tinygrad {tinygrad_tm:6.2f} ms")
    print(f"reference {ref_tm:6.2f} ms")
    assert (tinygrad_tm - ref_tm) <= 10

tinygrad/renderer/cstyle.py

@@ -266,10 +266,8 @@ def _make_hip_dtype(base_type, name, cnt):
") { return {" + ', '.join(nms) + "}; }"
class HIPLanguage(CStyleLanguage):
kernel_prefix = "#include <hip/hip_common.h>\n#define INFINITY (__builtin_inff())\n#define NAN (__builtin_nanf(\"\"))" + """
typedef long unsigned int size_t;
kernel_prefix = """
#define half _Float16
#include <hip/hip_bfloat16.h>
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
@@ -291,10 +289,9 @@ class HIPLanguage(CStyleLanguage):
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
__attribute__((device)) _Float16 __ocml_sin_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
-}\n""" + '\n'.join([
-  _make_hip_dtype(*x) for x in [("_Float16", "half", 2), ("_Float16", "half", 4), ("_Float16", "half", 8), ("_Float16", "half", 16),
-  ("float", "float", 8)
-]]) + """
+}\n""" + '\n'.join([_make_hip_dtype(*x) for x in [
+  ("_Float16", "half", 2), ("_Float16", "half", 4), ("_Float16", "half", 8), ("_Float16", "half", 16),
+  ("float", "float", 8)]]) + """
static __attribute__((device)) half8 __hip_wmma_f16_f16(half16 a, half16 b, half8 c) {
half16 c_frag = {}; half8 d; for (int n = 0; n < 8; n++) { c_frag[n*2] = c[n]; }
c_frag = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a, b, c_frag, false);
@@ -309,4 +306,13 @@ class HIPLanguage(CStyleLanguage):
  float4 = "make_float4"
  uses_ptr_arithmetic = False # NOTE: this fixes TestLinearizerOverflowAlt
  type_map = {dtypes.bfloat16: "hip_bfloat16"}
+  def render_kernel(self, function_name, kernel, bufs, uops, prefix=None) -> str:
+    prefix = ["#include <hip/hip_common.h>\n#define INFINITY (__builtin_inff())\n#define NAN (__builtin_nanf(\"\"))",
+              "typedef long unsigned int size_t;"]
+    if any(uop.dtype == dtypes.bfloat16 for uop in uops): prefix.append("#include <hip/amd_detail/amd_hip_bfloat16.h>")
+    else: prefix.append('\n'.join(_make_hip_dtype(*x) for x in [("float", "float", 2), ("float", "float", 4),
+                                                                ("signed int", "int", 4), ("signed int", "int", 2)]))
+    return super().render_kernel(function_name, kernel, bufs, uops, prefix)
HIPRenderer = functools.partial(uops_to_cstyle, HIPLanguage())
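As a quick end-to-end check of the new behavior (a rough sketch, not part of the commit; it assumes HIP is the default device and reuses the same render call the regression test above makes): a kernel with no bfloat16 uops should no longer contain the bf16 include, because render_kernel takes the else branch and emits the plain vector-dtype helpers instead.

# Sketch: an int-only kernel should not pull in the bf16 header after this change.
# Assumes Device.DEFAULT == "HIP"; mirrors external_test_hip_compile.py above.
from tinygrad import Tensor
from tinygrad.device import Device
from tinygrad.realize import create_schedule
from tinygrad.codegen.linearizer import Linearizer

out = Tensor([1, 2, 3]) + Tensor([4, 5, 6])
lin = Linearizer(create_schedule([out.lazydata])[-1].ast)
lin.linearize()
src = Device[Device.DEFAULT].compiler.render("example", lin.uops)
assert "hip_bfloat16" not in src  # no bf16 uops -> the dynamic prefix skips the include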