From 24933ab5517ff3eedb41dd2c147b007b343ebbe4 Mon Sep 17 00:00:00 2001 From: terafo <19949489+terafo@users.noreply.github.com> Date: Sun, 6 Aug 2023 20:35:25 +0300 Subject: [PATCH] Actually flip local_max in CUDA (#1462) * Actually do the flip * Fixed typo --------- Co-authored-by: terafo --- tinygrad/codegen/linearizer.py | 1 + tinygrad/runtime/ops_cuda.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py index 0a9c4635e6..fb5ddaebbf 100644 --- a/tinygrad/codegen/linearizer.py +++ b/tinygrad/codegen/linearizer.py @@ -139,6 +139,7 @@ class LinearizerOptions(NamedTuple): supports_float4: bool = True supports_float4_alu: bool = True has_local: bool = True + # NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered global_max: Optional[List[int]] = None local_max: Optional[List[int]] = None diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py index b020d01942..942d41b1b4 100644 --- a/tinygrad/runtime/ops_cuda.py +++ b/tinygrad/runtime/ops_cuda.py @@ -92,4 +92,4 @@ renderer = functools.partial(uops_to_cstyle, CStyleLanguage( __device__ __forceinline__ explicit operator float4() const {return make_float4(__half2float(x.x), __half2float(x.y), __half2float(y.x), __half2float(y.y)); } }; """)) -CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [1024, 1024, 64]), renderer, CUDAProgram, cuda.Context.synchronize) +CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024]), renderer, CUDAProgram, cuda.Context.synchronize)