diff --git a/tinygrad/codegen/kernel.py b/tinygrad/codegen/kernel.py index 286058d802..285f47aff9 100644 --- a/tinygrad/codegen/kernel.py +++ b/tinygrad/codegen/kernel.py @@ -59,6 +59,7 @@ tensor_cores: Dict[str, List[TensorCore]] = { TensorCore(dims=[8,16,16], dtype_in=dtypes.half, dtype_out=dtypes.float, wmma_func="__cuda_mma_m16n8k16_f16_f32", threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[-2],[5],[0],[0],[-1,1,2,-3],[3,4]], [[5],[0],[0],[4],[3],[-1,1,2,-2],[0]], [[2],[-2],[5],[1],[-1],[0],[3,4]] ]), # noqa: E501 ], } +tensor_cores["HSA"] = tensor_cores["HIP"] class LocalBuffer(NamedTuple): name: str diff --git a/tinygrad/features/search.py b/tinygrad/features/search.py index 09d81dd2b7..9af2e3ac1b 100644 --- a/tinygrad/features/search.py +++ b/tinygrad/features/search.py @@ -104,7 +104,7 @@ def beam_search(lin:Linearizer, rawbufs, amt:int, allow_test_size=True) -> Linea beam: List[Tuple[Linearizer, float]] = [] seen_libs = set() - default_parallel, min_progress_micros = 1 if lin.opts.device in {"CUDA", "HIP"} else 0, getenv("BEAM_MIN_PROGRESS",0) + default_parallel, min_progress_micros = 1 if lin.opts.device in {"CUDA", "HIP", "HSA"} else 0, getenv("BEAM_MIN_PROGRESS",0) if beam_pool is None and getenv("PARALLEL", default_parallel): beam_pool = multiprocessing.Pool(multiprocessing.cpu_count(), _init_worker) try: @@ -115,7 +115,7 @@ def beam_search(lin:Linearizer, rawbufs, amt:int, allow_test_size=True) -> Linea while not exiting: acted_lins = flatten([get_linearizer_actions(lin, include_0=False).values() for lin,_ in beam]) if len(beam) else [lin] timed_lins: List[Tuple[Linearizer, float]] = [] - _compile_fn = functools.partial(_try_compile_linearized_w_idx, compiler=Device[lin.opts.device].compiler) + _compile_fn = functools.partial(_try_compile_linearized_w_idx, compiler=dev.compiler) for i,proc in (map(_compile_fn, enumerate(acted_lins)) if beam_pool is None else beam_pool.imap_unordered(_compile_fn, enumerate(acted_lins))): if proc is None: continue lib, global_size, local_size, vars = proc diff --git a/tinygrad/runtime/ops_hsa.py b/tinygrad/runtime/ops_hsa.py index 379b8b10bf..2853dfa3ca 100644 --- a/tinygrad/runtime/ops_hsa.py +++ b/tinygrad/runtime/ops_hsa.py @@ -4,10 +4,12 @@ from typing import Tuple, TypeVar, List, Dict import tinygrad.runtime.autogen.hsa as hsa from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t from tinygrad.device import Compiled, LRUAllocator, BufferOptions +from tinygrad.codegen.kernel import LinearizerOptions from tinygrad.runtime.ops_hip import HIPCompiler from tinygrad.runtime.driver.hsa import check, scan_agents, find_memory_pool, AQLQueue -HSACompiler = HIPCompiler +class HSACompiler(HIPCompiler): + linearizer_opts = LinearizerOptions("HSA", has_tensor_cores=True) class HSAProgram: def __init__(self, device:HSADevice, name:str, lib:bytes):