diff --git a/compiler/Makefile b/compiler/Makefile
index d90ad9ad8..3ee64bb03 100644
--- a/compiler/Makefile
+++ b/compiler/Makefile
@@ -231,6 +231,14 @@ run-end-to-end-dataflow-tests: build-end-to-end-dataflow-tests
 	$(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_jit_auto_parallelization
 	$(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_jit_distributed
 
+## GPU tests
+
+build-end-to-end-gpu-tests: build-initialized
+	cmake --build $(BUILD_DIR) --target end_to_end_gpu_test
+
+run-end-to-end-gpu-tests: build-end-to-end-gpu-tests
+	$(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_gpu_test
+
 # benchmark
 
 build-benchmarks: build-initialized
diff --git a/compiler/include/concretelang/Conversion/ConcreteToBConcrete/Pass.h b/compiler/include/concretelang/Conversion/ConcreteToBConcrete/Pass.h
index aae7d3f4a..e39beaaff 100644
--- a/compiler/include/concretelang/Conversion/ConcreteToBConcrete/Pass.h
+++ b/compiler/include/concretelang/Conversion/ConcreteToBConcrete/Pass.h
@@ -12,7 +12,7 @@ namespace mlir {
 namespace concretelang {
 /// Create a pass to convert `Concrete` dialect to `BConcrete` dialect.
 std::unique_ptr>
-createConvertConcreteToBConcretePass(bool loopParallelize, bool useGPU);
+createConvertConcreteToBConcretePass(bool loopParallelize, bool emitGPUOps);
 } // namespace concretelang
 } // namespace mlir
diff --git a/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td b/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td
index bb2217e80..0af28d5d2 100644
--- a/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td
+++ b/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td
@@ -155,7 +155,7 @@ def BConcrete_AwaitFutureOp :
 
 // This is a different op in BConcrete just because of the way we are lowering to CAPI
 // When the CAPI lowering is detached from bufferization, we can remove this op, and lower
-// to the appropriate CAPI (gpu or cpu) depending on the useGPU compilation option
+// to the appropriate CAPI (gpu or cpu) depending on the emitGPUOps compilation option
 def BConcrete_BootstrapLweGPUBufferOp : BConcrete_Op<"bootstrap_lwe_gpu_buffer"> {
   let arguments = (ins
     1DTensorOf<[I64]>:$input_ciphertext,
@@ -172,7 +172,7 @@ def BConcrete_BootstrapLweGPUBufferOp : BConcrete_Op<"bootstrap_lwe_gpu_buffer">
 
 // This is a different op in BConcrete just because of the way we are lowering to CAPI
 // When the CAPI lowering is detached from bufferization, we can remove this op, and lower
-// to the appropriate CAPI (gpu or cpu) depending on the useGPU compilation option
+// to the appropriate CAPI (gpu or cpu) depending on the emitGPUOps compilation option
 def BConcrete_KeySwitchLweGPUBufferOp : BConcrete_Op<"keyswitch_lwe_gpu_buffer"> {
   let arguments = (ins
     1DTensorOf<[I64]>:$ciphertext,
diff --git a/compiler/include/concretelang/Support/CompilerEngine.h b/compiler/include/concretelang/Support/CompilerEngine.h
index e50c321e3..6ef2f289b 100644
--- a/compiler/include/concretelang/Support/CompilerEngine.h
+++ b/compiler/include/concretelang/Support/CompilerEngine.h
@@ -55,7 +55,7 @@ struct CompilationOptions {
   bool asyncOffload;
   bool optimizeConcrete;
   /// use GPU during execution by generating GPU operations if possible
-  bool useGPU;
+  bool emitGPUOps;
   llvm::Optional> fhelinalgTileSizes;
   llvm::Optional clientParametersFuncName;
 
@@ -66,7 +66,7 @@ struct CompilationOptions {
       : v0FHEConstraints(llvm::None), verifyDiagnostics(false),
        autoParallelize(false), loopParallelize(false),
        dataflowParallelize(false), asyncOffload(false), optimizeConcrete(true),
-        useGPU(false), clientParametersFuncName(llvm::None),
+        emitGPUOps(false), clientParametersFuncName(llvm::None),
         optimizerConfig(optimizer::DEFAULT_CONFIG){};
 
   CompilationOptions(std::string funcname) : CompilationOptions() {
diff --git a/compiler/include/concretelang/Support/Pipeline.h b/compiler/include/concretelang/Support/Pipeline.h
index 21c0276c5..ed85f5b76 100644
--- a/compiler/include/concretelang/Support/Pipeline.h
+++ b/compiler/include/concretelang/Support/Pipeline.h
@@ -47,7 +47,7 @@ lowerTFHEToConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module,
 mlir::LogicalResult
 lowerConcreteToBConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module,
                          std::function enablePass,
-                         bool parallelizeLoops, bool useGPU);
+                         bool parallelizeLoops, bool emitGPUOps);
 
 mlir::LogicalResult
 optimizeConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module,
diff --git a/compiler/lib/Conversion/ConcreteToBConcrete/ConcreteToBConcrete.cpp b/compiler/lib/Conversion/ConcreteToBConcrete/ConcreteToBConcrete.cpp
index a0d580165..dc138b2ab 100644
--- a/compiler/lib/Conversion/ConcreteToBConcrete/ConcreteToBConcrete.cpp
+++ b/compiler/lib/Conversion/ConcreteToBConcrete/ConcreteToBConcrete.cpp
@@ -48,12 +48,12 @@ struct ConcreteToBConcretePass : public ConcreteToBConcreteBase {
   void runOnOperation() final;
 
   ConcreteToBConcretePass() = delete;
-  ConcreteToBConcretePass(bool loopParallelize, bool useGPU)
-      : loopParallelize(loopParallelize), useGPU(useGPU){};
+  ConcreteToBConcretePass(bool loopParallelize, bool emitGPUOps)
+      : loopParallelize(loopParallelize), emitGPUOps(emitGPUOps){};
 
 private:
   bool loopParallelize;
-  bool useGPU;
+  bool emitGPUOps;
 };
 } // namespace
 
@@ -919,7 +919,7 @@ void ConcreteToBConcretePass::runOnOperation() {
                  LowToBConcrete>(&getContext());
 
-  if (this->useGPU) {
+  if (this->emitGPUOps) {
     patterns
         .insert>
-createConvertConcreteToBConcretePass(bool loopParallelize, bool useGPU) {
-  return std::make_unique(loopParallelize, useGPU);
+createConvertConcreteToBConcretePass(bool loopParallelize, bool emitGPUOps) {
+  return std::make_unique(loopParallelize, emitGPUOps);
 }
 } // namespace concretelang
 } // namespace mlir
diff --git a/compiler/lib/Support/CompilerEngine.cpp b/compiler/lib/Support/CompilerEngine.cpp
index fc5c0015f..244f4e331 100644
--- a/compiler/lib/Support/CompilerEngine.cpp
+++ b/compiler/lib/Support/CompilerEngine.cpp
@@ -356,7 +356,7 @@ CompilerEngine::compile(llvm::SourceMgr &sm, Target target, OptionalLib lib) {
   // Concrete -> BConcrete
   if (mlir::concretelang::pipeline::lowerConcreteToBConcrete(
           mlirContext, module, this->enablePass, loopParallelize,
-          options.useGPU)
+          options.emitGPUOps)
           .failed()) {
     return StreamStringError(
         "Lowering from Concrete to Bufferized Concrete failed");
diff --git a/compiler/lib/Support/Pipeline.cpp b/compiler/lib/Support/Pipeline.cpp
index 7ca12616f..a58c1a447 100644
--- a/compiler/lib/Support/Pipeline.cpp
+++ b/compiler/lib/Support/Pipeline.cpp
@@ -242,13 +242,13 @@ optimizeConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module,
 mlir::LogicalResult
 lowerConcreteToBConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module,
                          std::function enablePass,
-                         bool parallelizeLoops, bool useGPU) {
+                         bool parallelizeLoops, bool emitGPUOps) {
   mlir::PassManager pm(&context);
   pipelinePrinting("ConcreteToBConcrete", pm, context);
 
   std::unique_ptr conversionPass =
       mlir::concretelang::createConvertConcreteToBConcretePass(parallelizeLoops,
-                                                               useGPU);
+                                                               emitGPUOps);
 
   bool passEnabled = enablePass(conversionPass.get());
diff --git a/compiler/src/main.cpp b/compiler/src/main.cpp
index 76b0873b8..52e0a3b87 100644
--- a/compiler/src/main.cpp
+++ b/compiler/src/main.cpp
@@ -98,8 +98,8 @@ llvm::cl::opt
                    "dialects. (Enabled by default)"),
     llvm::cl::init(true));
 
-llvm::cl::opt useGPU(
-    "use-gpu",
+llvm::cl::opt emitGPUOps(
+    "emit-gpu-ops",
     llvm::cl::desc(
         "enable/disable generating GPU operations (Disabled by default)"),
     llvm::cl::init(false));
@@ -289,7 +289,7 @@ cmdlineCompilationOptions() {
   options.loopParallelize = cmdline::loopParallelize;
   options.dataflowParallelize = cmdline::dataflowParallelize;
   options.optimizeConcrete = cmdline::optimizeConcrete;
-  options.useGPU = cmdline::useGPU;
+  options.emitGPUOps = cmdline::emitGPUOps;
 
   if (!cmdline::v0Constraint.empty()) {
     if (cmdline::v0Constraint.size() != 2) {
diff --git a/compiler/tests/check_tests/Conversion/ConcreteToBConcrete/gpu_ops.mlir b/compiler/tests/check_tests/Conversion/ConcreteToBConcrete/gpu_ops.mlir
index 42622555c..435f2ffb7 100644
--- a/compiler/tests/check_tests/Conversion/ConcreteToBConcrete/gpu_ops.mlir
+++ b/compiler/tests/check_tests/Conversion/ConcreteToBConcrete/gpu_ops.mlir
@@ -1,4 +1,4 @@
-// RUN: concretecompiler --passes concrete-to-bconcrete --action=dump-bconcrete --use-gpu %s 2>&1| FileCheck %s
+// RUN: concretecompiler --passes concrete-to-bconcrete --action=dump-bconcrete --emit-gpu-ops %s 2>&1| FileCheck %s
 
 //CHECK: func.func @main(%arg0: tensor<1025xi64>) -> tensor<1025xi64> {
diff --git a/compiler/tests/end_to_end_benchmarks/end_to_end_benchmark.cpp b/compiler/tests/end_to_end_benchmarks/end_to_end_benchmark.cpp
index 615660af4..6db099920 100644
--- a/compiler/tests/end_to_end_benchmarks/end_to_end_benchmark.cpp
+++ b/compiler/tests/end_to_end_benchmarks/end_to_end_benchmark.cpp
@@ -137,7 +137,7 @@ static int registerEndToEndTestFromFile(std::string prefix, std::string path,
   registe("loop", loop);
 #ifdef CONCRETELANG_CUDA_SUPPORT
   mlir::concretelang::CompilationOptions gpu;
-  gpu.useGPU = true;
+  gpu.emitGPUOps = true;
   registe("gpu", gpu);
 #endif
   // mlir::concretelang::CompilationOptions dataflow;
diff --git a/compiler/tests/end_to_end_benchmarks/end_to_end_mlbench.cpp b/compiler/tests/end_to_end_benchmarks/end_to_end_mlbench.cpp
index b9a01f91f..bde8f0b74 100644
--- a/compiler/tests/end_to_end_benchmarks/end_to_end_mlbench.cpp
+++ b/compiler/tests/end_to_end_benchmarks/end_to_end_mlbench.cpp
@@ -138,7 +138,7 @@ static int registerEndToEndTestFromFile(std::string prefix, std::string path,
   registe("loop", loop);
 #ifdef CONCRETELANG_CUDA_SUPPORT
   mlir::concretelang::CompilationOptions gpu;
-  gpu.useGPU = true;
+  gpu.emitGPUOps = true;
   registe("gpu", gpu);
 #endif
 #ifdef CONCRETELANG_DATAFLOW_EXECUTION_ENABLED
diff --git a/compiler/tests/end_to_end_tests/CMakeLists.txt b/compiler/tests/end_to_end_tests/CMakeLists.txt
index 88ca6fda1..0ca05beae 100644
--- a/compiler/tests/end_to_end_tests/CMakeLists.txt
+++ b/compiler/tests/end_to_end_tests/CMakeLists.txt
@@ -70,3 +70,11 @@ if(CONCRETELANG_DATAFLOW_EXECUTION_ENABLED)
     globals.cc
   )
 endif()
+
+if(CONCRETELANG_CUDA_SUPPORT)
+  add_concretecompiler_unittest(
+    end_to_end_gpu_test
+    end_to_end_gpu_test.cc
+    globals.cc
+  )
+endif()
diff --git a/compiler/tests/end_to_end_tests/end_to_end_gpu_test.cc b/compiler/tests/end_to_end_tests/end_to_end_gpu_test.cc
new file mode 100644
index 000000000..c4610c59d
--- /dev/null
+++ b/compiler/tests/end_to_end_tests/end_to_end_gpu_test.cc
@@ -0,0 +1,55 @@
+
+#include 
+#include 
+#include 
+
"end_to_end_gpu_test.h" +#include "tests_tools/GtestEnvironment.h" + +TEST(GPULookupTable, lut_precision2) { + checkedJit(lambda, R"XXX( +func.func @main(%arg0: !FHE.eint<2>) -> !FHE.eint<2> { + %arg1 = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi64> + %1 = "FHE.apply_lookup_table"(%arg0, %arg1): (!FHE.eint<2>, tensor<4xi64>) -> (!FHE.eint<2>) + return %1: !FHE.eint<2> +} +)XXX", + "main", true); + + ASSERT_EXPECTED_VALUE(lambda(0_u64), (uint64_t)1); + ASSERT_EXPECTED_VALUE(lambda(1_u64), (uint64_t)2); + ASSERT_EXPECTED_VALUE(lambda(2_u64), (uint64_t)3); + ASSERT_EXPECTED_VALUE(lambda(3_u64), (uint64_t)0); +} + +TEST(GPULookupTable, lut_precision4) { + checkedJit(lambda, R"XXX( +func.func @main(%arg0: !FHE.eint<4>) -> !FHE.eint<4> { + %arg1 = arith.constant dense<[1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1, 0]> : tensor<16xi64> + %1 = "FHE.apply_lookup_table"(%arg0, %arg1): (!FHE.eint<4>, tensor<16xi64>) -> (!FHE.eint<4>) + return %1: !FHE.eint<4> +} +)XXX", + "main", true); + + ASSERT_EXPECTED_VALUE(lambda(0_u64), (uint64_t)1); + ASSERT_EXPECTED_VALUE(lambda(1_u64), (uint64_t)2); + ASSERT_EXPECTED_VALUE(lambda(7_u64), (uint64_t)8); + ASSERT_EXPECTED_VALUE(lambda(15_u64), (uint64_t)0); +} + +TEST(GPULookupTable, lut_precision7) { + checkedJit(lambda, R"XXX( +func.func @main(%arg0: !FHE.eint<7>) -> !FHE.eint<7> { + %arg1 = arith.constant dense<[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0]> : tensor<128xi64> + %1 = "FHE.apply_lookup_table"(%arg0, %arg1): (!FHE.eint<7>, tensor<128xi64>) -> (!FHE.eint<7>) + return %1: !FHE.eint<7> +} +)XXX", + "main", true); + + ASSERT_EXPECTED_VALUE(lambda(0_u64), (uint64_t)1); + ASSERT_EXPECTED_VALUE(lambda(1_u64), (uint64_t)2); + ASSERT_EXPECTED_VALUE(lambda(120_u64), (uint64_t)121); + ASSERT_EXPECTED_VALUE(lambda(127_u64), (uint64_t)0); +} diff --git a/compiler/tests/end_to_end_tests/end_to_end_gpu_test.h b/compiler/tests/end_to_end_tests/end_to_end_gpu_test.h new file mode 100644 index 000000000..3712df0dd --- /dev/null +++ b/compiler/tests/end_to_end_tests/end_to_end_gpu_test.h @@ -0,0 +1,60 @@ +#ifndef END_TO_END_GPU_TEST_H +#define END_TO_END_GPU_TEST_H + +#include + +#include "../tests_tools/keySetCache.h" + +#include "concretelang/Support/CompilerEngine.h" +#include "concretelang/Support/JITSupport.h" + +#include "end_to_end_test.h" +#include "globals.h" +#include "tests_tools/assert.h" + +// Jit-compiles the function specified by `func` from `src` and +// returns the corresponding lambda. Any compilation errors are caught +// and reult in abnormal termination. 
+inline llvm::Expected<
+    mlir::concretelang::ClientServer>
+internalCheckedJit(llvm::StringRef src, llvm::StringRef func = "main",
+                   bool useDefaultFHEConstraints = false,
+                   bool dataflowParallelize = false,
+                   bool loopParallelize = false) {
+
+  auto options =
+      mlir::concretelang::CompilationOptions(std::string(func.data()));
+  options.emitGPUOps = true;
+
+  if (useDefaultFHEConstraints) {
+    options.v0FHEConstraints = defaultV0Constraints;
+    options.optimizerConfig.strategy_v0 = true;
+  }
+
+  // Allow loop parallelism in all cases
+  options.loopParallelize = loopParallelize;
+#ifdef CONCRETELANG_DATAFLOW_EXECUTION_ENABLED
+#ifdef CONCRETELANG_DATAFLOW_TESTING_ENABLED
+  options.dataflowParallelize = true;
+  options.loopParallelize = true;
+#else
+  options.dataflowParallelize = dataflowParallelize;
+#endif
+#endif
+
+  auto lambdaOrErr =
+      mlir::concretelang::ClientServer::create(
+          src, options, getTestKeySetCache(), mlir::concretelang::JITSupport());
+
+  return lambdaOrErr;
+}
+
+// Wrapper around `internalCheckedJit` that causes
+// `ASSERT_EXPECTED_SUCCESS` to use the file and line number of the
+// caller instead of `internalCheckedJit`.
+#define checkedJit(VARNAME, ...)                                               \
+  auto VARNAMEOrErr = internalCheckedJit(__VA_ARGS__);                         \
+  ASSERT_EXPECTED_SUCCESS(VARNAMEOrErr);                                       \
+  auto VARNAME = std::move(*VARNAMEOrErr);
+
+#endif
diff --git a/compiler/tests/end_to_end_tests/end_to_end_jit_test.h b/compiler/tests/end_to_end_tests/end_to_end_jit_test.h
index 1e41dd3c2..18e15b767 100644
--- a/compiler/tests/end_to_end_tests/end_to_end_jit_test.h
+++ b/compiler/tests/end_to_end_tests/end_to_end_jit_test.h
@@ -8,6 +8,7 @@
 #include "concretelang/Support/CompilerEngine.h"
 #include "concretelang/Support/JITSupport.h"
 
+#include "end_to_end_test.h"
 #include "globals.h"
 #include "tests_tools/assert.h"
 
@@ -46,16 +47,6 @@ internalCheckedJit(llvm::StringRef src, llvm::StringRef func = "main",
   return lambdaOrErr;
 }
 
-// Shorthands to create integer literals of a specific type
-static inline uint8_t operator"" _u8(unsigned long long int v) { return v; }
-static inline uint16_t operator"" _u16(unsigned long long int v) { return v; }
-static inline uint32_t operator"" _u32(unsigned long long int v) { return v; }
-static inline uint64_t operator"" _u64(unsigned long long int v) { return v; }
-
-// Evaluates to the number of elements of a statically initialized
-// array
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
-
 // Wrapper around `internalCheckedJit` that causes
 // `ASSERT_EXPECTED_SUCCESS` to use the file and line number of the
 // caller instead of `internalCheckedJit`.
diff --git a/compiler/tests/end_to_end_tests/end_to_end_test.h b/compiler/tests/end_to_end_tests/end_to_end_test.h
new file mode 100644
index 000000000..29bd5bfff
--- /dev/null
+++ b/compiler/tests/end_to_end_tests/end_to_end_test.h
@@ -0,0 +1,16 @@
+#ifndef END_TO_END_TEST_H
+#define END_TO_END_TEST_H
+
+#include 
+
+// Shorthands to create integer literals of a specific type
+static inline uint8_t operator"" _u8(unsigned long long int v) { return v; }
+static inline uint16_t operator"" _u16(unsigned long long int v) { return v; }
+static inline uint32_t operator"" _u32(unsigned long long int v) { return v; }
+static inline uint64_t operator"" _u64(unsigned long long int v) { return v; }
+
+// Evaluates to the number of elements of a statically initialized
+// array
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
+
+#endif
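Note on adding further GPU cases: `checkedJit` in end_to_end_gpu_test.h JIT-compiles the given `main` with `options.emitGPUOps = true`, and each `TEST` then checks the returned lambda against its lookup table. The sketch below is illustrative only and not part of this patch; the 3-bit table, its values, and the test name `lut_precision3` are made up to show how another case could be added to end_to_end_gpu_test.cc.

// Illustrative sketch (not in this patch): a hypothetical 3-bit lookup-table
// test in the style of lut_precision2/4/7 above.
TEST(GPULookupTable, lut_precision3) {
  // checkedJit compiles `main` with emitGPUOps enabled (see
  // end_to_end_gpu_test.h) and binds the resulting lambda.
  checkedJit(lambda, R"XXX(
func.func @main(%arg0: !FHE.eint<3>) -> !FHE.eint<3> {
  %arg1 = arith.constant dense<[7, 6, 5, 4, 3, 2, 1, 0]> : tensor<8xi64>
  %1 = "FHE.apply_lookup_table"(%arg0, %arg1): (!FHE.eint<3>, tensor<8xi64>) -> (!FHE.eint<3>)
  return %1: !FHE.eint<3>
}
)XXX",
             "main", true);

  // The table reverses its index, so input i should map to 7 - i.
  ASSERT_EXPECTED_VALUE(lambda(0_u64), (uint64_t)7);
  ASSERT_EXPECTED_VALUE(lambda(3_u64), (uint64_t)4);
  ASSERT_EXPECTED_VALUE(lambda(7_u64), (uint64_t)0);
}

On a build configured with CONCRETELANG_CUDA_SUPPORT, such a case is compiled by the new CMake unittest target and picked up by `make run-end-to-end-gpu-tests`.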