diff --git a/extra/thunder/cuda/include/ops/group/group.cuh b/extra/thunder/cuda/include/ops/group/group.cuh index 1a9d69971c..b9e23634c6 100644 --- a/extra/thunder/cuda/include/ops/group/group.cuh +++ b/extra/thunder/cuda/include/ops/group/group.cuh @@ -46,9 +46,9 @@ __device__ static inline void arrive(int id) { #include "memory/memory.cuh" #include "shared/shared.cuh" #include "register/register.cuh" +#include "mma/mma.cuh" #ifdef KITTENS_HOPPER -#include "mma/mma.cuh" template __device__ static inline void increase_registers() { static_assert(n_reg % 8 == 0, "n_reg must be a multiple of 8"); @@ -93,4 +93,4 @@ __device__ static inline void sync() { using warp = group<1>; // scope used by most pre-Hopper GPUs, and also for most register operations. using warpgroup = group<4>; // special scope commonly used by Hopper and later. -} \ No newline at end of file +} diff --git a/extra/thunder/cuda/matmul.py b/extra/thunder/cuda/matmul.py index fe3bd577e4..ea0454edcb 100644 --- a/extra/thunder/cuda/matmul.py +++ b/extra/thunder/cuda/matmul.py @@ -6,7 +6,7 @@ from tinygrad.runtime.support.compiler_cuda import pretty_ptx, NVCCCompiler if __name__ == "__main__": code = (pathlib.Path(__file__).parent / "matmul.cu").read_text() device = Device["CUDA"] - kitten_args = [f"-I{(pathlib.Path(__file__).parent / 'include').as_posix()}", "-std=c++20", "--expt-relaxed-constexpr", "-DKITTENS_HOPPER"] + kitten_args = [f"-I{(pathlib.Path(__file__).parent / 'include').as_posix()}", "-std=c++20", "--expt-relaxed-constexpr"] lib = NVCCCompiler(device.compiler.arch, kitten_args).compile(code) kernel_name = lib.decode().split(".globl\t")[1].split("\n")[0] print("kernel name", kernel_name)