mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-07 03:00:26 -04:00
move tc to renderers (#4631)
* move tc to renderers * missed import * fix typo * fix * fix imports * remove from tests * fix 4607 * nv emulate timestamp * time is int * correct time
This commit is contained in:
@@ -4,6 +4,18 @@ from dataclasses import dataclass
|
||||
from tinygrad.helpers import to_function_name
|
||||
from tinygrad.codegen.uops import UOpGraph
|
||||
from tinygrad.shape.symbolic import sym_infer, sint, Variable
|
||||
from tinygrad.dtype import DType
|
||||
|
||||
@dataclass(frozen=True)
class TensorCore:
  """Describes one hardware tensor-core operation: D = A * B + C, where A is (M x K), B is (K x N), and C, D are (M x N)."""
  dims: Tuple[int,int,int]                     # (N, M, K)
  dtype_in: DType                              # element dtype of A and B
  dtype_out: DType                             # element dtype of C and D
  threads: List[Tuple[int,int]]                # (TC dim, amt) pairs that construct the warp thread structure
  thread_local_aliases: List[List[List[int]]]  # a list of [threads_1, ..., threads_n, upcast_1(unrolled), upcast_2(upcast)] defining the alias (-1 is upcast, 1-n is warp threads) for each TC dim  # noqa: E501
  thread_local_sizes: List[List[int]]          # in each thread, the number of elements stored in registers for each TC dim
  def __str__(self):
    # canonical name, e.g. "WMMA_8_16_16_half_float"
    n, m, k = self.dims
    return f"WMMA_{n}_{m}_{k}_{self.dtype_in.name}_{self.dtype_out.name}"
  def num_upcasts(self):
    # alias entries beyond the warp-thread entries are the upcast axes
    return len(self.thread_local_aliases[0]) - len(self.threads)
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Program:
|
||||
@@ -40,10 +52,10 @@ class Renderer:
|
||||
supports_float4: bool = True
|
||||
has_local: bool = True
|
||||
has_shared: bool = True
|
||||
has_tensor_cores: bool = False
|
||||
# NOTE: these two should be in z,y,x(reversed) order for cstyle backends, they are flipped when kernel is rendered
|
||||
global_max: Optional[List[int]] = None
|
||||
local_max: Optional[List[int]] = None
|
||||
shared_max: int = 32768
|
||||
tensor_cores: List[TensorCore] = []
|
||||
|
||||
def render(self, name:str, uops:UOpGraph) -> str:
  """Render the uop graph into backend source code; concrete renderer subclasses must override this."""
  raise NotImplementedError("needs a renderer")
|
||||
|
||||
@@ -5,7 +5,7 @@ from tinygrad.codegen.linearizer import UOps, UOp
|
||||
from tinygrad.ops import BinaryOps, UnaryOps, TernaryOps, Op
|
||||
from tinygrad.dtype import dtypes, DType, PtrDType, ConstType
|
||||
from tinygrad.codegen.uops import UOpGraph, PatternMatcher
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.renderer import Renderer, TensorCore
|
||||
|
||||
def render_val(x, dtype):
|
||||
if dtypes.is_float(dtype):
|
||||
@@ -48,11 +48,11 @@ def optimize_gated_loads(uops: UOpGraph):
|
||||
class PTXRenderer(Renderer):
|
||||
device = "CUDA"
|
||||
suffix = "PTX"
|
||||
global_max=[65535, 65535, 2147483647]
|
||||
local_max=[64, 1024, 1024]
|
||||
shared_max=49152
|
||||
has_tensor_cores = False
|
||||
def __init__(self, arch:str): self.has_tensor_cores=int(arch[3:]) >= 80
|
||||
global_max = [65535, 65535, 2147483647]
|
||||
local_max = [64, 1024, 1024]
|
||||
shared_max = 49152
|
||||
tensor_cores = [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[0],[5],[-2],[0],[-1,1,2,-3],[3,4]], [[3],[4],[0],[0],[5],[-1,1,2,-2],[0]], [[-1],[1],[5],[-2],[2],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float)])] # noqa: E501
|
||||
def __init__(self, arch:str): self.tensor_cores = PTXRenderer.tensor_cores if int(arch[3:]) >= 80 else []
|
||||
|
||||
# language options
|
||||
kernel_prefix = """.version VERSION
|
||||
|
||||
@@ -6,7 +6,7 @@ from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
|
||||
from tinygrad.helpers import strip_parens, getenv, prod
|
||||
from tinygrad.dtype import ImageDType, dtypes, DType, PtrDType, ConstType
|
||||
from tinygrad.codegen.uops import UOpGraph
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.renderer import Renderer, TensorCore
|
||||
|
||||
class CStyleLanguage(Renderer):
|
||||
kernel_prefix: str = ""
|
||||
@@ -206,8 +206,9 @@ class OpenCLRenderer(CStyleLanguage):
|
||||
|
||||
class MetalRenderer(CStyleLanguage):
|
||||
device = "METAL"
|
||||
has_tensor_cores=os.uname().machine == "arm64"
|
||||
shared_max=32768
|
||||
shared_max = 32768
|
||||
tensor_cores = [TensorCore(dims=(8,8,8), threads=[(0,2),(1,4),(0,2),(1,2)], thread_local_sizes=[[2],[2],[2]], thread_local_aliases=[ [[0],[2],[0],[4],[-1, 1, 3],[0]], [[1],[0],[3],[0],[2, 4],[-1]], [[1],[2],[3],[4],[0],[-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.float, dtypes.float), (dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]] # noqa: E501
|
||||
def __init__(self): self.tensor_cores = MetalRenderer.tensor_cores if os.uname().machine == "arm64" else []
|
||||
|
||||
# language options
|
||||
kernel_prefix = "kernel "
|
||||
@@ -251,11 +252,11 @@ def _make_cuda_dtype(base_type, name, cnt):
|
||||
|
||||
class CUDARenderer(CStyleLanguage):
|
||||
device = "CUDA"
|
||||
global_max=[65535, 65535, 2147483647]
|
||||
local_max=[64, 1024, 1024]
|
||||
shared_max=49152
|
||||
has_tensor_cores = False
|
||||
def __init__(self, arch:str): self.has_tensor_cores=int(arch[3:]) >= 80
|
||||
global_max = [65535, 65535, 2147483647]
|
||||
local_max = [64, 1024, 1024]
|
||||
shared_max = 49152
|
||||
tensor_cores = [TensorCore(dims=(8,16,16), threads=[(0,2),(0,2),(1,2),(1,2),(0,2)], thread_local_sizes=[[2,2,2],[2,2],[2,2]], thread_local_aliases=[ [[0],[0],[5],[-2],[0],[-1,1,2,-3],[3,4]], [[3],[4],[0],[0],[5],[-1,1,2,-2],[0]], [[-1],[1],[5],[-2],[2],[0],[3,4]] ], dtype_in=di, dtype_out=do) for (di, do) in ([(dtypes.half, dtypes.float), (dtypes.bfloat16, dtypes.float)])] # noqa: E501
|
||||
def __init__(self, arch:str): self.tensor_cores = CUDARenderer.tensor_cores if int(arch[3:]) >= 80 else []
|
||||
|
||||
# language options
|
||||
kernel_prefix = "extern \"C\" __global__ "
|
||||
@@ -313,8 +314,8 @@ def _make_hip_dtype(base_type, name, cnt):
|
||||
|
||||
class HIPRenderer(CStyleLanguage):
|
||||
device = "HSA"
|
||||
has_tensor_cores = True
|
||||
shared_max = 65536
|
||||
tensor_cores = [TensorCore(dims=(16,16,16), threads=[(0,8),(0,2),(1,2)], thread_local_sizes=[[16],[16],[4,2]], thread_local_aliases=[ [[0],[0],[2],[-1],[1]], [[1],[2],[0],[-1],[0]], [[1],[2],[-2],[0],[3,-1]] ], dtype_in=di, dtype_out=do) for (di, do) in [(dtypes.half, dtypes.float), (dtypes.half, dtypes.half)]] # noqa: E501
|
||||
|
||||
# language options
|
||||
kernel_prefix = """extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
|
||||
@@ -378,3 +379,6 @@ static __attribute__((device)) bool operator==(hip_bfloat16 a, hip_bfloat16 b) {
|
||||
# https://clang.llvm.org/docs/AttributeReference.html#amdgpu-flat-work-group-size
|
||||
# NOTE: this makes hlb_cifar10 twice as fast, there may be more gains in tweaking these parameters
|
||||
return f"__attribute__((amdgpu_flat_work_group_size(1, {requiredMaxThreadsPerBlock})))"
|
||||
|
||||
class NVRenderer(CUDARenderer):
  # NV backend reuses the CUDA renderer wholesale; only the device label differs.
  device = "NV"
|
||||
class AMDRenderer(HIPRenderer):
  # AMD backend reuses the HIP renderer wholesale; only the device label differs.
  device = "AMD"
|
||||
|
||||
Reference in New Issue
Block a user