From fdeb97629ef8964a5d9040328ee63734884ac874 Mon Sep 17 00:00:00 2001 From: Nugine Date: Fri, 8 Nov 2024 15:19:38 +0800 Subject: [PATCH] Optimize PFCOUNT, PFMERGE command by SIMD acceleration (#13558) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR optimizes the performance of HyperLogLog commands (PFCOUNT, PFMERGE) by adding AVX2 fast paths. Two AVX2 functions are added for conversion between raw representation and dense representation. They are 15 ~ 30 times faster than the scalar implementation. Note that sparse representation is not accelerated. AVX2 fast paths are enabled when the CPU supports AVX2 (checked at runtime) and the hyperloglog configuration is default (HLL_REGISTERS == 16384 && HLL_BITS == 6). When merging 3 dense hll structures, the benchmark shows a 12x speedup compared to the scalar version. ``` pfcount key1 key2 key3 pfmerge keyall key1 key2 key3 ``` ``` ====================================================================================================== Type Ops/sec Avg. 
Latency p50 Latency p99 Latency p99.9 Latency KB/sec ------------------------------------------------------------------------------------------------------ PFCOUNT-scalar 5570.09 35.89060 32.51100 65.27900 69.11900 299.17 PFCOUNT-avx2 72604.92 2.82072 2.73500 5.50300 7.13500 3899.68 ------------------------------------------------------------------------------------------------------ PFMERGE-scalar 7879.13 25.52156 24.19100 46.33500 48.38300 492.45 PFMERGE-avx2 126448.64 1.58120 1.53500 3.08700 4.89500 7903.04 ------------------------------------------------------------------------------------------------------ scalar: redis:unstable 9906daf5c9fdb836a5b3f04829c75701a4e90eb4 avx2: Nugine:hll-simd 02e09f85ac07eace50ebdddd0fd70822f7b9152d CPU: 13th Gen Intel® Core™ i9-13900H × 20 Memory: 32.0 GiB OS: Ubuntu 22.04.5 LTS ``` Experiment repo: https://github.com/Nugine/redis-hyperloglog Benchmark script: https://github.com/Nugine/redis-hyperloglog/blob/main/scripts/memtier.sh Algorithm: https://github.com/Nugine/redis-hyperloglog/blob/main/cpp/bench.cpp resolves #13551 --------- Co-authored-by: Yuan Wang Co-authored-by: debing.sun --- src/config.h | 13 +++ src/hyperloglog.c | 264 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 265 insertions(+), 12 deletions(-) diff --git a/src/config.h b/src/config.h index ae072c9dfb..c7fadf5ff1 100644 --- a/src/config.h +++ b/src/config.h @@ -318,4 +318,17 @@ void setcpuaffinity(const char *cpulist); #define ATTRIBUTE_TARGET_POPCNT #endif +/* Check if we can compile AVX2 code */ +#if defined (__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4)) +#if defined(__has_attribute) && __has_attribute(target) +#define HAVE_AVX2 +#endif +#endif + +#if defined (HAVE_AVX2) +#define ATTRIBUTE_TARGET_AVX2 __attribute__((target("avx2"))) +#else +#define ATTRIBUTE_TARGET_AVX2 +#endif + #endif diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 6ed88c13ba..aa51d4eab9 100644 --- 
a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -13,6 +13,10 @@ #include #include +#ifdef HAVE_AVX2 +#include +#endif + /* The Redis HyperLogLog implementation is based on the following ideas: * * * The use of a 64 bit hash function as proposed in [1], in order to estimate @@ -1041,6 +1045,132 @@ int hllAdd(robj *o, unsigned char *ele, size_t elesize) { } } +#ifdef HAVE_AVX2 +/* A specialized version of hllMergeDense, optimized for default configurations. + * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllMergeDense) + * + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) { + const __m256i shuffle = _mm256_setr_epi8( // + 4, 5, 6, -1, // + 7, 8, 9, -1, // + 10, 11, 12, -1, // + 13, 14, 15, -1, // + 0, 1, 2, -1, // + 3, 4, 5, -1, // + 6, 7, 8, -1, // + 9, 10, 11, -1 // + ); + + /* Merge the first 8 registers (6 bytes) normally + * as the AVX2 algorithm needs 4 padding bytes at the start */ + uint8_t val; + for (int i = 0; i < 8; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } + + /* Dense to Raw: + * + * 4 registers in 3 bytes: + * {bbaaaaaa|ccccbbbb|ddddddcc} + * + * LOAD 32 bytes (32 registers) per iteration: + * 4(padding) + 12(16 registers) + 12(16 registers) + 4(padding) + * {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX} + * + * SHUFFLE to: + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 3 valid bytes (4 registers) and a zero byte. 
+ * + * extract registers in each group with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (<<0) + * {00000000|00bbbbbb|00000000|00000000} x8 (<<2) + * {00000000|00000000|00cccccc|00000000} x8 (<<4) + * {00000000|00000000|00000000|00dddddd} x8 (<<6) + * + * merge the extracted registers with OR: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * Finally, compute MAX(reg_raw, merged) and STORE it back to reg_raw + */ + + /* Skip 8 registers (6 bytes) */ + const uint8_t *r = reg_dense + 6 - 4; + uint8_t *t = reg_raw + 8; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x0, x; + x0 = _mm256_loadu_si256((__m256i *)r); + x = _mm256_shuffle_epi8(x0, shuffle); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00000fc0)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x0003f000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x00fc0000)); + + a2 = _mm256_slli_epi32(a2, 2); + a3 = _mm256_slli_epi32(a3, 4); + a4 = _mm256_slli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + + __m256i z = _mm256_loadu_si256((__m256i *)t); + + z = _mm256_max_epu8(z, y); + + _mm256_storeu_si256((__m256i *)t, z); + + r += 24; + t += 32; + } + + /* Merge the last 24 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 24; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} +#endif + +/* Merge dense-encoded registers to raw registers array. 
*/ +void hllMergeDense(uint8_t* reg_raw, const uint8_t* reg_dense) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (__builtin_cpu_supports("avx2")) { + hllMergeDenseAVX2(reg_raw, reg_dense); + return; + } + } +#endif + + uint8_t val; + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} + /* Merge by computing MAX(registers[i],hll[i]) the HyperLogLog 'hll' * with an array of uint8_t HLL_REGISTERS registers pointed by 'max'. * @@ -1054,12 +1184,7 @@ int hllMerge(uint8_t *max, robj *hll) { int i; if (hdr->encoding == HLL_DENSE) { - uint8_t val; - - for (i = 0; i < HLL_REGISTERS; i++) { - HLL_DENSE_GET_REGISTER(val,hdr->registers,i); - if (val > max[i]) max[i] = val; - } + hllMergeDense(max, hdr->registers); } else { uint8_t *p = hll->ptr, *end = p + sdslen(hll->ptr); long runlen, regval; @@ -1091,6 +1216,117 @@ int hllMerge(uint8_t *max, robj *hll) { return C_OK; } +#ifdef HAVE_AVX2 +/* A specialized version of hllDenseCompress, optimized for default configurations. + * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllDenseCompress) + * + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) { + const __m256i shuffle = _mm256_setr_epi8( // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1, // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1 // + ); + + /* Raw to Dense: + * + * LOAD 32 bytes (32 registers) per iteration: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 4 registers. 
+ * + * move the registers to correct positions with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (>>0) + * {bb000000|0000bbbb|00000000|00000000} x8 (>>2) + * {00000000|cccc0000|000000cc|00000000} x8 (>>4) + * {00000000|00000000|dddddd00|00000000} x8 (>>6) + * + * merge the registers with OR: + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * + * SHUFFLE to: + * {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000} + * + * STORE the lower half and higher half respectively: + * AAABBBCCCDDD0000 + * EEEFFFGGGHHH0000 + * AAABBBCCCDDDEEEFFFGGGHHH0000 + * + * Note that the last 4 bytes are padding bytes. + */ + + const uint8_t *r = reg_raw; + uint8_t *t = reg_dense; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x = _mm256_loadu_si256((__m256i *)r); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00003f00)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x003f0000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x3f000000)); + + a2 = _mm256_srli_epi32(a2, 2); + a3 = _mm256_srli_epi32(a3, 4); + a4 = _mm256_srli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + y = _mm256_shuffle_epi8(y, shuffle); + + __m128i lower, higher; + lower = _mm256_castsi256_si128(y); + higher = _mm256_extracti128_si256(y, 1); + + _mm_storeu_si128((__m128i *)t, lower); + _mm_storeu_si128((__m128i *)(t + 12), higher); + + r += 32; + t += 24; + } + + /* Merge the last 32 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 32; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} +#endif + +/* Compress raw registers to dense representation. 
*/ +void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (__builtin_cpu_supports("avx2")) { + hllDenseCompressAVX2(reg_dense, reg_raw); + return; + } + } +#endif + + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} + /* ========================== HyperLogLog commands ========================== */ /* Create an HLL object. We always create the HLL using sparse encoding. @@ -1350,12 +1586,16 @@ void pfmergeCommand(client *c) { /* Write the resulting HLL to the destination HLL registers and * invalidate the cached value. */ - for (j = 0; j < HLL_REGISTERS; j++) { - if (max[j] == 0) continue; - hdr = o->ptr; - switch(hdr->encoding) { - case HLL_DENSE: hllDenseSet(hdr->registers,j,max[j]); break; - case HLL_SPARSE: hllSparseSet(o,j,max[j]); break; + if (use_dense) { + hllDenseCompress(hdr->registers, max); + } else { + for (j = 0; j < HLL_REGISTERS; j++) { + if (max[j] == 0) continue; + hdr = o->ptr; + switch (hdr->encoding) { + case HLL_DENSE: hllDenseSet(hdr->registers,j,max[j]); break; + case HLL_SPARSE: hllSparseSet(o,j,max[j]); break; + } } } hdr = o->ptr; /* o->ptr may be different now, as a side effect of