tinygrad/extra/thunder/cuda/include/ops/device/device.cuh
/**
* @file
* @brief An aggregate header of all device (multi-GPU) operations defined by ThunderKittens
*/
#pragma once
#include "../../types/types.cuh"
namespace kittens {
template<int _NUM_DEVICES>
struct device {
    static_assert(_NUM_DEVICES >= 0 && _NUM_DEVICES <= 72, "Invalid number of devices");
    static constexpr int NUM_DEVICES = _NUM_DEVICES;
#ifdef KITTENS_HOPPER
    using barrier_t = pgl<gl<int, 1, 1, 1, -1>, NUM_DEVICES, true>;
    /**
     * @brief Multi-GPU synchronization barrier for coordinated kernel exit
     *
     * Performs a synchronization across all devices so that every GPU reaches
     * this point before any kernel exits. Does not synchronize threads or
     * threadblocks within a device; only one thread per device participates.
     * A usage sketch is appended at the end of this file.
     *
     * @param barrier Pre-allocated barrier structure; must be initialized to 0
     * @param dev_idx Current device index (0 to NUM_DEVICES - 1)
     * @param id Synchronization point identifier (default: 0); the default is fine for most cases
     */
    __device__ static inline void sync_on_exit(const barrier_t &barrier, const int dev_idx, const int id = 0) {
        if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 &&
            threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
            cuda::atomic_ref<int, cuda::thread_scope_system> barrier_uc(barrier[dev_idx][{id}]);
            // Inter-node check-in: add 1 to every device's counter through the multicast pointer
            multimem<int>::red<reduce_op::ADD>(barrier.mc_ptr_at({id}), 1);
            // Order the multicast-alias write against the unicast reads below
            asm volatile ("{fence.proxy.alias;}" ::: "memory");
            // Spin until all devices have checked in, then reset the counter for reuse
            while (barrier_uc.load(cuda::memory_order_acquire) < NUM_DEVICES);
            barrier_uc.fetch_sub(NUM_DEVICES, cuda::memory_order_release);
        }
    }
#endif
};
} // namespace kittens
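/**
 * @par Example
 * A minimal, illustrative usage sketch (not part of the original header): each
 * device's kernel calls sync_on_exit() as its final step. It assumes KITTENS_HOPPER
 * is defined and that the host has already allocated the barrier as a
 * one-int-per-device multicast pgl, initialized it to 0, and passes each kernel its
 * own device index; the kernel name, device count, and by-value barrier parameter
 * below are placeholders, not APIs guaranteed by this header.
 * @code
 * using dev4 = kittens::device<4>;
 *
 * __global__ void my_kernel(dev4::barrier_t barrier, int dev_idx) {
 *     // ... per-device work ...
 *
 *     // Final step before returning: block until all 4 devices have reached this point.
 *     dev4::sync_on_exit(barrier, dev_idx); // id defaults to 0
 * }
 * @endcode
 */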