diff --git a/include/negation.h b/include/negation.h
new file mode 100644
index 000000000..5be088a00
--- /dev/null
+++ b/include/negation.h
@@ -0,0 +1,20 @@
+#ifndef CUDA_NEGATE_H_
+#define CUDA_NEGATE_H_
+
+#include <cstdint>
+
+extern "C" {
+
+void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
+                                          void *lwe_array_out,
+                                          void *lwe_array_in,
+                                          uint32_t input_lwe_dimension,
+                                          uint32_t input_lwe_ciphertext_count);
+void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
+                                          void *lwe_array_out,
+                                          void *lwe_array_in,
+                                          uint32_t input_lwe_dimension,
+                                          uint32_t input_lwe_ciphertext_count);
+}
+
+#endif // CUDA_NEGATE_H_
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5260a879e..33f7a144b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,6 @@
-set(SOURCES ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h 
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h)
+set(SOURCES ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
+        ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
+        ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/negation.h)
 file(GLOB SOURCES
      "*.cu"
      "*.h"
diff --git a/src/negation.cu b/src/negation.cu
new file mode 100644
index 000000000..054465c1e
--- /dev/null
+++ b/src/negation.cu
@@ -0,0 +1,23 @@
+#include "negation.cuh"
+#include "negation.h"
+
+void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
+                                          void *lwe_array_out,
+                                          void *lwe_array_in,
+                                          uint32_t input_lwe_dimension,
+                                          uint32_t input_lwe_ciphertext_count) {
+
+  host_negation(v_stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
+                static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
+                input_lwe_ciphertext_count);
+}
+void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
+                                          void *lwe_array_out,
+                                          void *lwe_array_in,
+                                          uint32_t input_lwe_dimension,
+                                          uint32_t input_lwe_ciphertext_count) {
+
+  host_negation(v_stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
+                static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
+                input_lwe_ciphertext_count);
+}
diff --git a/src/negation.cuh b/src/negation.cuh
new file mode 100644
index 000000000..17647af3d
--- /dev/null
+++ b/src/negation.cuh
@@ -0,0 +1,46 @@
+#ifdef __CDT_PARSER__
+#undef __CUDA_RUNTIME_H__
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+#endif
+
+#ifndef CUDA_NEGATE
+#define CUDA_NEGATE
+
+#include "utils/kernel_dimensions.cuh"
+#include <thread>
+
+template <typename T>
+__global__ void negation(T *output, T *input, uint32_t num_entries) {
+
+  int tid = threadIdx.x;
+  if (tid < num_entries) {
+    int index = blockIdx.x * blockDim.x + tid;
+    // Here we take advantage of the wrapping behaviour of uint
+    output[index] = -input[index];
+  }
+}
+
+template <typename T>
+__host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output,
+                            T *input, uint32_t input_lwe_dimension,
+                            uint32_t input_lwe_ciphertext_count) {
+
+  cudaSetDevice(gpu_index);
+  // lwe_size includes the presence of the body
+  // whereas lwe_dimension is the number of elements in the mask
+  int lwe_size = input_lwe_dimension + 1;
+  // Create a 1-dimensional grid of threads
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = input_lwe_ciphertext_count * lwe_size;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  auto stream = static_cast<cudaStream_t *>(v_stream);
+  negation<<<grid, thds, 0, *stream>>>(output, input, num_entries);
+
+  cudaStreamSynchronize(*stream);
+}
+
+#endif // CUDA_NEGATE
diff --git a/src/utils/kernel_dimensions.cuh b/src/utils/kernel_dimensions.cuh
index afd7e008d..367ec6f33 100644
--- a/src/utils/kernel_dimensions.cuh
+++ b/src/utils/kernel_dimensions.cuh
@@ -1,4 +1,6 @@
-int nextPow2(int x) {
+#include "math_functions.h"
+
+__host__ __device__ int nextPow2(int x) {
   --x;
   x |= x >> 1;
   x |= x >> 2;
@@ -8,8 +10,10 @@ int nextPow2(int x) {
   return ++x;
 }
 
-void getNumBlocksAndThreads(const int n, const int maxBlockSize, int &blocks,
-                            int &threads) {
-  threads = (n < maxBlockSize * 2) ? nextPow2((n + 1) / 2) : maxBlockSize;
+__host__ __device__ void getNumBlocksAndThreads(const int n,
+                                                const int maxBlockSize,
+                                                int &blocks, int &threads) {
+  threads =
+      (n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize;
   blocks = (n + threads - 1) / threads;
 }