fix: don't use KITTENS_HOPPER for 4090 (#12954)

2026-01-09 15:08:02 -05:00 · 2025-10-27 17:19:53 -07:00
parent 372d9e5753
commit 24884c6768
2 changed files with 3 additions and 3 deletions
--- a/extra/thunder/cuda/include/ops/group/group.cuh
+++ b/extra/thunder/cuda/include/ops/group/group.cuh
@@ -46,9 +46,9 @@ __device__ static inline void arrive(int id) {
 #include "memory/memory.cuh"
 #include "shared/shared.cuh"
 #include "register/register.cuh"
+#include "mma/mma.cuh"

 #ifdef KITTENS_HOPPER
-#include "mma/mma.cuh"

 template<int n_reg> __device__ static inline void increase_registers() {
    static_assert(n_reg % 8 == 0, "n_reg must be a multiple of 8");
@@ -93,4 +93,4 @@ __device__ static inline void sync() {
 using warp = group<1>;      // scope used by most pre-Hopper GPUs, and also for most register operations.
 using warpgroup = group<4>; // special scope commonly used by Hopper and later.

-}
+}
--- a/extra/thunder/cuda/matmul.py
+++ b/extra/thunder/cuda/matmul.py
@@ -6,7 +6,7 @@ from tinygrad.runtime.support.compiler_cuda import pretty_ptx, NVCCCompiler
 if __name__ == "__main__":
  code = (pathlib.Path(__file__).parent / "matmul.cu").read_text()
  device = Device["CUDA"]
-  kitten_args = [f"-I{(pathlib.Path(__file__).parent / 'include').as_posix()}", "-std=c++20", "--expt-relaxed-constexpr", "-DKITTENS_HOPPER"]
+  kitten_args = [f"-I{(pathlib.Path(__file__).parent / 'include').as_posix()}", "-std=c++20", "--expt-relaxed-constexpr"]
  lib = NVCCCompiler(device.compiler.arch, kitten_args).compile(code)
  kernel_name = lib.decode().split(".globl\t")[1].split("\n")[0]
  print("kernel name", kernel_name)