mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-23 05:48:08 -05:00
Actually flip local_max in CUDA (#1462)
* Actually do the flip
* Fixed typo

---------

Co-authored-by: terafo <terafo@protonmail.com>
This commit is contained in:
@@ -139,6 +139,7 @@ class LinearizerOptions(NamedTuple):
|
||||
supports_float4: bool = True
|
||||
supports_float4_alu: bool = True
|
||||
has_local: bool = True
|
||||
# NOTE: these two should be in z,y,x (reversed) order for cstyle backends; they are flipped when the kernel is rendered
|
||||
global_max: Optional[List[int]] = None
|
||||
local_max: Optional[List[int]] = None
|
||||
|
||||
|
||||
@@ -92,4 +92,4 @@ renderer = functools.partial(uops_to_cstyle, CStyleLanguage(
|
||||
__device__ __forceinline__ explicit operator float4() const {return make_float4(__half2float(x.x), __half2float(x.y), __half2float(y.x), __half2float(y.y)); }
|
||||
};
|
||||
"""))
|
||||
CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [1024, 1024, 64]), renderer, CUDAProgram, cuda.Context.synchronize)
|
||||
CUDABuffer = Compiled(RawCUDABuffer, LinearizerOptions(supports_float4_alu=False, global_max = [65535, 65535, 2147483647], local_max = [64, 1024, 1024]), renderer, CUDAProgram, cuda.Context.synchronize)
|
||||
|
||||
Reference in New Issue
Block a user