From 93d5a06557e4abb7015a0d097368564730d38e70 Mon Sep 17 00:00:00 2001
From: Antoniu Pop
Date: Mon, 25 Apr 2022 11:09:22 +0100
Subject: [PATCH] test: add a test for distributed execution.

---
 compiler/Makefile                                  |   2 +
 .../tests/end_to_end_tests/CMakeLists.txt          |   5 +
 .../end_to_end_jit_distributed.cc                  | 148 ++++++++++++++++++
 .../end_to_end_jit_distributed.sh                  |  23 +++
 compiler/tests/tests_tools/assert.h                |   4 +
 5 files changed, 182 insertions(+)
 create mode 100644 compiler/tests/end_to_end_tests/end_to_end_jit_distributed.cc
 create mode 100644 compiler/tests/end_to_end_tests/end_to_end_jit_distributed.sh

diff --git a/compiler/Makefile b/compiler/Makefile
index 9b4ba776e..75450d580 100644
--- a/compiler/Makefile
+++ b/compiler/Makefile
@@ -173,9 +173,11 @@ build-end-to-end-jit-lambda: build-initialized
 build-end-to-end-dataflow-tests: build-initialized
 	cmake --build $(BUILD_DIR) --target end_to_end_jit_auto_parallelization
+	cmake --build $(BUILD_DIR) --target end_to_end_jit_distributed
 
 run-end-to-end-dataflow-tests: build-end-to-end-dataflow-tests
 	$(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_jit_auto_parallelization
+	$(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_jit_distributed
 
 # benchmark
diff --git a/compiler/tests/end_to_end_tests/CMakeLists.txt b/compiler/tests/end_to_end_tests/CMakeLists.txt
index aefa463d3..a03393e7e 100644
--- a/compiler/tests/end_to_end_tests/CMakeLists.txt
+++ b/compiler/tests/end_to_end_tests/CMakeLists.txt
@@ -64,4 +64,9 @@ if(CONCRETELANG_PARALLEL_EXECUTION_ENABLED)
     end_to_end_jit_auto_parallelization.cc
     globals.cc
   )
+  add_concretecompiler_unittest(
+    end_to_end_jit_distributed
+    end_to_end_jit_distributed.cc
+    globals.cc
+  )
 endif()
diff --git a/compiler/tests/end_to_end_tests/end_to_end_jit_distributed.cc b/compiler/tests/end_to_end_tests/end_to_end_jit_distributed.cc
new file mode 100644
index 000000000..b8f5821e7
--- /dev/null
+++ b/compiler/tests/end_to_end_tests/end_to_end_jit_distributed.cc
@@ -0,0 +1,148 @@
+#include
+#include
+#include
+#include
+#include
+
+#include "end_to_end_jit_test.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// Distributed execution of independent FHE ops ////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+std::vector<uint64_t> distributed_results;
+
+TEST(Distributed, nn_med_nested) {
+  checkedJit(lambda, R"XXX(
+func @main(%arg0: tensor<200x4x!FHE.eint<4>>) -> tensor<200x8x!FHE.eint<4>> {
+  %cst = arith.constant
dense<"0x01010100010100000001010101000101010101010101010001000101000001010001010100000101000001000001010001000001010100010001000000010100010001010001000001000101010101000100010001000000000100010001000101000001000101010100010001000000000101000100000000000001000100000100000100000001010000010001000101000100010001000100000100000100010101010000000000000000010001010000000100000100010100000100000000010001000101000100000000000101010101000101010101010100010100010100000000000101010100000100010100000001000101000000010101000101000100000101010100010101010000010101010100010000000000000001010101000100010101000001010001010000010001010101000000000000000001000001000000010100000100000101010100010001000000000000010100010101000000010100000100010001010001000000000100010001000101010100010100000001010100010101010100010100010001000001000000000101000101010001000100000101010100000101010100000100010101000100000101000101010100010001000101010100010001010001010000010000010001010000000001000101010001000000000101000000010000010100010001000001000001010101000100010001010100000101000000010001000000000101000101000000010000000001000101010100010001000000000001010000010001000001010101000101010101010100000000000001000100000100000001000000010101010101000000000101010101000100000101000100000000000001000100000101000101010100010000000101000000000100000100000101010000010100000000010000000000010001000100000101010001010101000000000000010000010101010001000000010001010001010000000000000101000000010101010101000001010101000001000001010100000000010001010100000100000101000101010100010001010001000001000100000101000100010100000100010000000101000000010000010001010101010000000101000000010101000001010100000100010001000000000001010000000100010000000000000000000000000001010101010101010101000001010101000001010100000001000101010101010000010101000101010100010101010000010101010100000100000000000101010000000000010101010000000001000000010100000100000001000101010000000001000001000001010001010000010001000101010001010001010101000100010000000100000100010101000000000101010101010001000100000000000101010000010101000001010001010000000001010100000101000001010000000001010101000100010000010101000000000001000101000001010101000101000001000001000000010100010001000101010100010001010000000101000000010001000001000100000101010001000001000001000101010000010001000001000101000000000000000101010000010000000101010100010100010001010101010000000000010001000101010000000001010100000000010001010100010001000001000101000000010100010000010000010001010100010000010001010100010000010100010101010001000100010100010101000100000101010100000100010100000100000000010101000000010001000001010000000101000100000100010101000000010100000101000001010001010100010000000101010000000001010001000000010100010101010001000100010001000001010101000000010001000100000100010101000000000000010100010000000100000000010100010000000100000101010000010101000100010000010100000001000100000000000100000001010101010101000100010001000000010101010100000001000001000001010001000101010100000001010001010100010101000101000000010001010100010101000100000101000101000001000001000001000101010100010001010000000100000101010100000001000000000000010101000100010001000001000001000000000000010100000100000001"> : tensor<200x8xi5> + %cst_0 = arith.constant dense<[[1, 0, 0, 0, 1, 0, 0, 1], [0, 0, 1, 1, 0, 0, 0, 0], [1, 1, 0, 1, 1, 0, 1, 1], [1, 1, 0, 0, 1, 0, 1, 1]]> : tensor<4x8xi5> + %cst_1 = arith.constant dense<[0, 3, 7, 10, 14, 17, 21, 24, 28, 31, 35, 38, 42, 45, 49, 52]> : tensor<16xi64> + %0 = "FHELinalg.matmul_eint_int"(%arg0, %cst_0) : 
(tensor<200x4x!FHE.eint<4>>, tensor<4x8xi5>) -> tensor<200x8x!FHE.eint<4>> + %1 = "FHELinalg.add_eint_int"(%0, %cst) : (tensor<200x8x!FHE.eint<4>>, tensor<200x8xi5>) -> tensor<200x8x!FHE.eint<4>> + + %res = "FHE.zero_tensor"() : () -> tensor<200x8x!FHE.eint<4>> + + %slice_A = tensor.extract_slice %1[0, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_B = tensor.extract_slice %1[25, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_C = tensor.extract_slice %1[50, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_D = tensor.extract_slice %1[75, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_E = tensor.extract_slice %1[100, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_F = tensor.extract_slice %1[125, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_G = tensor.extract_slice %1[150, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_H = tensor.extract_slice %1[175, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_I = tensor.extract_slice %1[0, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_J = tensor.extract_slice %1[25, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_K = tensor.extract_slice %1[50, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_L = tensor.extract_slice %1[75, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_M = tensor.extract_slice %1[100, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_N = tensor.extract_slice %1[125, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_O = tensor.extract_slice %1[150, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + %slice_P = tensor.extract_slice %1[175, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>> + + %part_A = "FHELinalg.apply_lookup_table"(%slice_A, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>> + %part_B = "FHELinalg.apply_lookup_table"(%slice_B, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>> + %part_C = "FHELinalg.apply_lookup_table"(%slice_C, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>> + %part_D = "FHELinalg.apply_lookup_table"(%slice_D, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>> + %part_E = "FHELinalg.apply_lookup_table"(%slice_E, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>> + %part_F = "FHELinalg.apply_lookup_table"(%slice_F, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>> + %part_G = "FHELinalg.apply_lookup_table"(%slice_G, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>> + %part_H = "FHELinalg.apply_lookup_table"(%slice_H, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>> + %part_I = "FHELinalg.apply_lookup_table"(%slice_I, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>> + %part_J = "FHELinalg.apply_lookup_table"(%slice_J, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>> + %part_K = "FHELinalg.apply_lookup_table"(%slice_K, %cst_1) : 
(tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
+  %part_L = "FHELinalg.apply_lookup_table"(%slice_L, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
+  %part_M = "FHELinalg.apply_lookup_table"(%slice_M, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
+  %part_N = "FHELinalg.apply_lookup_table"(%slice_N, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
+  %part_O = "FHELinalg.apply_lookup_table"(%slice_O, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
+  %part_P = "FHELinalg.apply_lookup_table"(%slice_P, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
+
+  %res_A = tensor.insert_slice %part_A into %res [0, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_B = tensor.insert_slice %part_B into %res_A[25, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_C = tensor.insert_slice %part_C into %res_B[50, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_D = tensor.insert_slice %part_D into %res_C[75, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_E = tensor.insert_slice %part_E into %res_D[100, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_F = tensor.insert_slice %part_F into %res_E[125, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_G = tensor.insert_slice %part_G into %res_F[150, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_H = tensor.insert_slice %part_H into %res_G[175, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_I = tensor.insert_slice %part_I into %res_H[0, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_J = tensor.insert_slice %part_J into %res_I[25, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_K = tensor.insert_slice %part_K into %res_J[50, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_L = tensor.insert_slice %part_L into %res_K[75, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_M = tensor.insert_slice %part_M into %res_L[100, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_N = tensor.insert_slice %part_N into %res_M[125, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_O = tensor.insert_slice %part_O into %res_N[150, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+  %res_P = tensor.insert_slice %part_P into %res_O[175, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
+
+  return %res_P : tensor<200x8x!FHE.eint<4>>
+}
+)XXX",
+             "main", false, true, true);
+
+  const size_t numDim = 2;
+  const size_t dim0 = 200;
+  const size_t dim1 = 4;
+  const size_t dim2 = 8;
+  const int64_t dims[numDim]{dim0, dim1};
+  const llvm::ArrayRef<int64_t> shape2D(dims, numDim);
+  std::vector<uint8_t> input;
+  input.reserve(dim0 * dim1);
+
+  for (int i = 0; i < dim0 * dim1; ++i)
+    input.push_back(i % 17 % 4);
+
+  mlir::concretelang::TensorLambdaArgument<
+      mlir::concretelang::IntLambdaArgument<uint8_t>>
+      arg(input, shape2D);
+
+  if (mlir::concretelang::dfr::_dfr_is_root_node()) {
+    llvm::Expected<std::vector<uint64_t>> res =
+        lambda.operator()<std::vector<uint64_t>>({&arg});
+    ASSERT_EXPECTED_SUCCESS(res);
+    ASSERT_EQ(res->size(), dim0 * dim2);
+
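+    // Keep the root node's results from the distributed run in the global
+    // distributed_results so the sequential test below can compare against them.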
distributed_results = *res; + } else + ASSERT_EXPECTED_FAILURE(lambda.operator()>()); +} + +TEST(Distributed, nn_med_sequential) { + if (mlir::concretelang::dfr::_dfr_is_root_node()) { + checkedJit(lambda, R"XXX( + func @main(%arg0: tensor<200x4x!FHE.eint<4>>) -> tensor<200x8x!FHE.eint<4>> { + %cst = arith.constant dense<"0x01010100010100000001010101000101010101010101010001000101000001010001010100000101000001000001010001000001010100010001000000010100010001010001000001000101010101000100010001000000000100010001000101000001000101010100010001000000000101000100000000000001000100000100000100000001010000010001000101000100010001000100000100000100010101010000000000000000010001010000000100000100010100000100000000010001000101000100000000000101010101000101010101010100010100010100000000000101010100000100010100000001000101000000010101000101000100000101010100010101010000010101010100010000000000000001010101000100010101000001010001010000010001010101000000000000000001000001000000010100000100000101010100010001000000000000010100010101000000010100000100010001010001000000000100010001000101010100010100000001010100010101010100010100010001000001000000000101000101010001000100000101010100000101010100000100010101000100000101000101010100010001000101010100010001010001010000010000010001010000000001000101010001000000000101000000010000010100010001000001000001010101000100010001010100000101000000010001000000000101000101000000010000000001000101010100010001000000000001010000010001000001010101000101010101010100000000000001000100000100000001000000010101010101000000000101010101000100000101000100000000000001000100000101000101010100010000000101000000000100000100000101010000010100000000010000000000010001000100000101010001010101000000000000010000010101010001000000010001010001010000000000000101000000010101010101000001010101000001000001010100000000010001010100000100000101000101010100010001010001000001000100000101000100010100000100010000000101000000010000010001010101010000000101000000010101000001010100000100010001000000000001010000000100010000000000000000000000000001010101010101010101000001010101000001010100000001000101010101010000010101000101010100010101010000010101010100000100000000000101010000000000010101010000000001000000010100000100000001000101010000000001000001000001010001010000010001000101010001010001010101000100010000000100000100010101000000000101010101010001000100000000000101010000010101000001010001010000000001010100000101000001010000000001010101000100010000010101000000000001000101000001010101000101000001000001000000010100010001000101010100010001010000000101000000010001000001000100000101010001000001000001000101010000010001000001000101000000000000000101010000010000000101010100010100010001010101010000000000010001000101010000000001010100000000010001010100010001000001000101000000010100010000010000010001010100010000010001010100010000010100010101010001000100010100010101000100000101010100000100010100000100000000010101000000010001000001010000000101000100000100010101000000010100000101000001010001010100010000000101010000000001010001000000010100010101010001000100010001000001010101000000010001000100000100010101000000000000010100010000000100000000010100010000000100000101010000010101000100010000010100000001000100000000000100000001010101010101000100010001000000010101010100000001000001000001010001000101010100000001010001010100010101000101000000010001010100010101000100000101000101000001000001000001000101010100010001010000000100000101010100000001000000000000010101000100010001000001000001000000000000010100000100000001"> : tensor<200x8xi5> + 
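+  // Same weights, bias and lookup table as the distributed test above, applied
+  // to the whole 200x8 tensor without manual slicing; this run produces the
+  // sequential reference results for the element-wise comparison below.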
%cst_0 = arith.constant dense<[[1, 0, 0, 0, 1, 0, 0, 1], [0, 0, 1, 1, 0, 0, 0, 0], [1, 1, 0, 1, 1, 0, 1, 1], [1, 1, 0, 0, 1, 0, 1, 1]]> : tensor<4x8xi5>
+    %0 = "FHELinalg.matmul_eint_int"(%arg0, %cst_0) : (tensor<200x4x!FHE.eint<4>>, tensor<4x8xi5>) -> tensor<200x8x!FHE.eint<4>>
+    %1 = "FHELinalg.add_eint_int"(%0, %cst) : (tensor<200x8x!FHE.eint<4>>, tensor<200x8xi5>) -> tensor<200x8x!FHE.eint<4>>
+    %cst_1 = arith.constant dense<[0, 3, 7, 10, 14, 17, 21, 24, 28, 31, 35, 38, 42, 45, 49, 52]> : tensor<16xi64>
+    %2 = "FHELinalg.apply_lookup_table"(%1, %cst_1) : (tensor<200x8x!FHE.eint<4>>, tensor<16xi64>) -> tensor<200x8x!FHE.eint<4>>
+    return %2 : tensor<200x8x!FHE.eint<4>>
+  }
+)XXX",
+               "main", false, false, false);
+
+    const size_t numDim = 2;
+    const size_t dim0 = 200;
+    const size_t dim1 = 4;
+    const size_t dim2 = 8;
+    const int64_t dims[numDim]{dim0, dim1};
+    const llvm::ArrayRef<int64_t> shape2D(dims, numDim);
+    std::vector<uint8_t> input;
+    input.reserve(dim0 * dim1);
+
+    for (int i = 0; i < dim0 * dim1; ++i)
+      input.push_back(i % 17 % 4);
+
+    mlir::concretelang::TensorLambdaArgument<
+        mlir::concretelang::IntLambdaArgument<uint8_t>>
+        arg(input, shape2D);
+
+    llvm::Expected<std::vector<uint64_t>> res =
+        lambda.operator()<std::vector<uint64_t>>({&arg});
+
+    ASSERT_EXPECTED_SUCCESS(res);
+    ASSERT_EQ(res->size(), dim0 * dim2);
+    for (size_t i = 0; i < dim0 * dim2; i++)
+      EXPECT_EQ(distributed_results[i], (*res)[i])
+          << "result differ at pos " << i;
+  }
+}
diff --git a/compiler/tests/end_to_end_tests/end_to_end_jit_distributed.sh b/compiler/tests/end_to_end_tests/end_to_end_jit_distributed.sh
new file mode 100644
index 000000000..3e4566bfa
--- /dev/null
+++ b/compiler/tests/end_to_end_tests/end_to_end_jit_distributed.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+#SBATCH --job-name=end_to_end_jit_distributed
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=antoniu.pop@zama.ai
+#SBATCH --nodes=4
+#SBATCH --cpus-per-task=8
+#SBATCH --time=00:20:00
+#SBATCH --output=end_to_end_jit_distributed_%j.log
+
+echo "Date = $(date)"
+echo "Hostname = $(hostname -s)"
+echo "Working Directory = $(pwd)"
+echo ""
+echo "Number of Nodes Allocated = $SLURM_JOB_NUM_NODES"
+echo "Number of Tasks Allocated = $SLURM_NTASKS"
+echo "Number of Cores/Task Allocated = $SLURM_CPUS_PER_TASK"
+
+export OMP_NUM_THREADS=8
+export DFR_NUM_THREADS=2
+
+srun ./build/bin/end_to_end_jit_distributed
+
+date
diff --git a/compiler/tests/tests_tools/assert.h b/compiler/tests/tests_tools/assert.h
index 4486fe431..197e010f0 100644
--- a/compiler/tests/tests_tools/assert.h
+++ b/compiler/tests/tests_tools/assert.h
@@ -32,6 +32,10 @@ static bool assert_expected_success(llvm::Expected<T> &&val) {
 template <typename T>
 static bool assert_expected_failure(llvm::Expected<T> &&val) {
   if (!((bool)val)) {
+    if (!mlir::concretelang::dfr::_dfr_is_root_node()) {
+      llvm::toString(val.takeError());
+      return true;
+    }
     // We need to consume the error, so let's do it here
     llvm::errs() << "assert_expected_failure: "
                  << llvm::toString(val.takeError()) << "\n";