fix: don't use KITTENS_HOPPER for 4090 (#12954)

This commit is contained in:
wozeparrot
2025-10-27 17:19:53 -07:00
committed by GitHub
parent 372d9e5753
commit 24884c6768
2 changed files with 3 additions and 3 deletions

View File

@@ -46,9 +46,9 @@ __device__ static inline void arrive(int id) {
#include "memory/memory.cuh"
#include "shared/shared.cuh"
#include "register/register.cuh"
+#include "mma/mma.cuh"
 #ifdef KITTENS_HOPPER
-#include "mma/mma.cuh"
template<int n_reg> __device__ static inline void increase_registers() {
static_assert(n_reg % 8 == 0, "n_reg must be a multiple of 8");
@@ -93,4 +93,4 @@ __device__ static inline void sync() {
using warp = group<1>; // scope used by most pre-Hopper GPUs, and also for most register operations.
using warpgroup = group<4>; // special scope commonly used by Hopper and later.
-}
+}

View File

@@ -6,7 +6,7 @@ from tinygrad.runtime.support.compiler_cuda import pretty_ptx, NVCCCompiler
if __name__ == "__main__":
code = (pathlib.Path(__file__).parent / "matmul.cu").read_text()
device = Device["CUDA"]
-kitten_args = [f"-I{(pathlib.Path(__file__).parent / 'include').as_posix()}", "-std=c++20", "--expt-relaxed-constexpr", "-DKITTENS_HOPPER"]
+kitten_args = [f"-I{(pathlib.Path(__file__).parent / 'include').as_posix()}", "-std=c++20", "--expt-relaxed-constexpr"]
lib = NVCCCompiler(device.compiler.arch, kitten_args).compile(code)
kernel_name = lib.decode().split(".globl\t")[1].split("\n")[0]
print("kernel name", kernel_name)