move tc to renderers (#4631)

* move tc to renderers

* missed import

* fix typo

* fix

* fix imports

* remove from tests

* fix 4607

* nv emulate timestamp

* time is int

* correct time
nimlgen
2024-05-18 00:36:29 +03:00
committed by GitHub
parent d70988dddf
commit daf57af3eb
10 changed files with 60 additions and 67 deletions

tinygrad/renderer/__init__.py

@@ -4,6 +4,18 @@ from dataclasses import dataclass
 from tinygrad.helpers import to_function_name
 from tinygrad.codegen.uops import UOpGraph
 from tinygrad.shape.symbolic import sym_infer, sint, Variable
+from tinygrad.dtype import DType
+
+@dataclass(frozen=True)
+class TensorCore: # D = A * B + C, A is (M x K), B is (K x N), C and D are (M x N)
+  dims: Tuple[int,int,int] # N, M, K
+  dtype_in: DType # dtype for A and B
+  dtype_out: DType # dtype for C and D
+  threads: List[Tuple[int,int]] # list of (TC dim,amt) that construct the warp thread structure
+  thread_local_aliases: List[List[List[int]]] # a list of [threads_1, ..., threads_n, upcast_1(unrolled), upcast_2(upcast)] defining the alias (-1 is upcast, 1-n is warp threads) for each TC dim # noqa: E501
+  thread_local_sizes: List[List[int]] # in each thread, the number of elements stored in registers for each TC dim
+  def __str__(self): return "_".join(["WMMA"] + list(map(str, self.dims)) + [self.dtype_in.name, self.dtype_out.name])
+  def num_upcasts(self): return len(self.thread_local_aliases[0]) - len(self.threads)
 
 @dataclass(frozen=True)
 class Program:
@@ -40,10 +52,10 @@ class Renderer:
   supports_float4: bool = True
   has_local: bool = True
   has_shared: bool = True
-  has_tensor_cores: bool = False
   # NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
   global_max: Optional[List[int]] = None
   local_max: Optional[List[int]] = None
   shared_max: int = 32768
+  tensor_cores: List[TensorCore] = []
 
   def render(self, name:str, uops:UOpGraph) -> str: raise NotImplementedError("needs a renderer")
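To make the new interface concrete, here is a minimal sketch (not part of this commit; the field values are copied from the Metal entry further down) of constructing a TensorCore and reading its derived properties:

from tinygrad.renderer import TensorCore
from tinygrad.dtype import dtypes

tc = TensorCore(dims=(8,8,8), dtype_in=dtypes.half, dtype_out=dtypes.float,
                threads=[(0,2),(1,4),(0,2),(1,2)], thread_local_sizes=[[2],[2],[2]],
                thread_local_aliases=[[[0],[2],[0],[4],[-1,1,3],[0]], [[1],[0],[3],[0],[2,4],[-1]], [[1],[2],[3],[4],[0],[-1]]])
print(tc)               # WMMA_8_8_8_half_float
print(tc.num_upcasts()) # 2 == len(thread_local_aliases[0]) - len(threads) == 6 - 4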

tinygrad/renderer/assembly.py

@@ -5,7 +5,7 @@ from tinygrad.codegen.linearizer import UOps, UOp
 from tinygrad.ops import BinaryOps, UnaryOps, TernaryOps, Op
 from tinygrad.dtype import dtypes, DType, PtrDType, ConstType
 from tinygrad.codegen.uops import UOpGraph, PatternMatcher
-from tinygrad.renderer import Renderer
+from tinygrad.renderer import Renderer, TensorCore
 
 def render_val(x, dtype):
   if dtypes.is_float(dtype):
@@ -48,11 +48,11 @@ def optimize_gated_loads(uops: UOpGraph):
 class PTXRenderer(Renderer):
   device = "CUDA"
   suffix = "PTX"
-  global_max=[65535, 65535, 2147483647]
-  local_max=[64, 1024, 1024]
-  shared_max=49152
-  has_tensor_cores = False
-  def __init__(self, arch:str): self.has_tensor_cores=int(arch[3:]) >= 80
+  global_max = [65535, 65535, 2147483647]
+  local_max = [64, 1024, 1024]
+  shared_max = 49152
+  tensor_cores = [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[0],[5],[-2],[0],[-1,1,2,-3],[3,4]], [[3],[4],[0],[0],[5],[-1,1,2,-2],[0]], [[-1],[1],[5],[-2],[2],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)])] # noqa: E501
+  def __init__(self, arch:str): self.tensor_cores = PTXRenderer.tensor_cores if int(arch[3:]) >= 80 else []
 
   # language options
   kernel_prefix = """.version VERSION

tinygrad/renderer/cstyle.py

@@ -6,7 +6,7 @@ from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
 from tinygrad.helpers import strip_parens, getenv, prod
 from tinygrad.dtype import ImageDType, dtypes, DType, PtrDType, ConstType
 from tinygrad.codegen.uops import UOpGraph
-from tinygrad.renderer import Renderer
+from tinygrad.renderer import Renderer, TensorCore
 
 class CStyleLanguage(Renderer):
   kernel_prefix: str = ""
@@ -206,8 +206,9 @@ class OpenCLRenderer(CStyleLanguage):
 class MetalRenderer(CStyleLanguage):
   device = "METAL"
-  has_tensor_cores=os.uname().machine == "arm64"
-  shared_max=32768
+  shared_max = 32768
+  tensor_cores = [TensorCore(dims=(8,8,8), threads=[(0,2),(1,4),(0,2),(1,2)], thread_local_sizes=[[2],[2],[2]], thread_local_aliases=[ [[0],[2],[0],[4],[-1, 1, 3],[0]], [[1],[0],[3],[0],[2, 4],[-1]], [[1],[2],[3],[4],[0],[-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.float, dtypes.float), (dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]] # noqa: E501
+  def __init__(self): self.tensor_cores = MetalRenderer.tensor_cores if os.uname().machine == "arm64" else []
 
   # language options
   kernel_prefix = "kernel "
@@ -251,11 +252,11 @@ def _make_cuda_dtype(base_type, name, cnt):
 class CUDARenderer(CStyleLanguage):
   device = "CUDA"
-  global_max=[65535, 65535, 2147483647]
-  local_max=[64, 1024, 1024]
-  shared_max=49152
-  has_tensor_cores = False
-  def __init__(self, arch:str): self.has_tensor_cores=int(arch[3:]) >= 80
+  global_max = [65535, 65535, 2147483647]
+  local_max = [64, 1024, 1024]
+  shared_max = 49152
+  tensor_cores = [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[0],[5],[-2],[0],[-1,1,2,-3],[3,4]], [[3],[4],[0],[0],[5],[-1,1,2,-2],[0]], [[-1],[1],[5],[-2],[2],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)])] # noqa: E501
+  def __init__(self, arch:str): self.tensor_cores = CUDARenderer.tensor_cores if int(arch[3:]) >= 80 else []
 
   # language options
   kernel_prefix = "extern \"C\" __global__ "
@@ -313,8 +314,8 @@ def _make_hip_dtype(base_type, name, cnt):
 class HIPRenderer(CStyleLanguage):
   device = "HSA"
-  has_tensor_cores = True
   shared_max = 65536
+  tensor_cores = [TensorCore(dims=(16,16,16), threads=[(0,8),(0,2),(1,2)], thread_local_sizes=[[16],[16],[4,2]], thread_local_aliases=[ [[0],[0],[2],[-1],[1]], [[1],[2],[0],[-1],[0]], [[1],[2],[-2],[0],[3,-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]] # noqa: E501
 
   # language options
   kernel_prefix = """extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
@@ -378,3 +379,6 @@ static __attribute__((device)) bool operator==(hip_bfloat16 a, hip_bfloat16 b) {
     # https://clang.llvm.org/docs/AttributeReference.html#amdgpu-flat-work-group-size
     # NOTE: this makes hlb_cifar10 twice as fast, there may be more gains in tweaking these parameters
     return f"__attribute__((amdgpu_flat_work_group_size(1, {requiredMaxThreadsPerBlock})))"
+
+class NVRenderer(CUDARenderer): device = "NV"
+class AMDRenderer(HIPRenderer): device = "AMD"
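For reference, a sketch of how the two new aliases behave (illustrative only, not part of this commit; it follows directly from the class attributes above):

from tinygrad.renderer.cstyle import HIPRenderer, NVRenderer, AMDRenderer

nv = NVRenderer("sm_89")  # reuses CUDARenderer.__init__, so sm_89 >= sm_80 populates tensor_cores
print(nv.device, len(nv.tensor_cores))                           # NV 2
amd = AMDRenderer()       # no custom __init__: the class-level HIP tensor_cores are inherited as-is
print(amd.device, amd.tensor_cores is HIPRenderer.tensor_cores)  # AMD True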