diff --git a/extra/thunder/cuda/include/ops/group/group.cuh b/extra/thunder/cuda/include/ops/group/group.cuh
index 1a9d69971c..b9e23634c6 100644
--- a/extra/thunder/cuda/include/ops/group/group.cuh
+++ b/extra/thunder/cuda/include/ops/group/group.cuh
@@ -46,9 +46,9 @@ __device__ static inline void arrive(int id) {
 #include "memory/memory.cuh"
 #include "shared/shared.cuh"
 #include "register/register.cuh"
+#include "mma/mma.cuh"
 
 #ifdef KITTENS_HOPPER
-#include "mma/mma.cuh"
 
 template<int n_reg> __device__ static inline void increase_registers() {
     static_assert(n_reg % 8 == 0, "n_reg must be a multiple of 8");
@@ -93,4 +93,4 @@ __device__ static inline void sync() {
 using warp = group<1>;      // scope used by most pre-Hopper GPUs, and also for most register operations.
 using warpgroup = group<4>; // special scope commonly used by Hopper and later.
 
-}
\ No newline at end of file
+}
diff --git a/extra/thunder/cuda/matmul.py b/extra/thunder/cuda/matmul.py
index fe3bd577e4..ea0454edcb 100644
--- a/extra/thunder/cuda/matmul.py
+++ b/extra/thunder/cuda/matmul.py
@@ -6,7 +6,7 @@ from tinygrad.runtime.support.compiler_cuda import pretty_ptx, NVCCCompiler
 if __name__ == "__main__":
   code = (pathlib.Path(__file__).parent / "matmul.cu").read_text()
   device = Device["CUDA"]
-  kitten_args = [f"-I{(pathlib.Path(__file__).parent / 'include').as_posix()}", "-std=c++20", "--expt-relaxed-constexpr", "-DKITTENS_HOPPER"]
+  kitten_args = [f"-I{(pathlib.Path(__file__).parent / 'include').as_posix()}", "-std=c++20", "--expt-relaxed-constexpr"]
   lib = NVCCCompiler(device.compiler.arch, kitten_args).compile(code)
   kernel_name = lib.decode().split(".globl\t")[1].split("\n")[0]
   print("kernel name", kernel_name)