From fdeb97629ef8964a5d9040328ee63734884ac874 Mon Sep 17 00:00:00 2001 From: Nugine Date: Fri, 8 Nov 2024 15:19:38 +0800 Subject: [PATCH] Optimize PFCOUNT, PFMERGE command by SIMD acceleration (#13558) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR optimizes the performance of HyperLogLog commands (PFCOUNT, PFMERGE) by adding AVX2 fast paths. Two AVX2 functions are added for conversion between raw representation and dense representation. They are 15 ~ 30 times faster than the scalar implementation. Note that sparse representation is not accelerated. AVX2 fast paths are enabled when the CPU supports AVX2 (checked at runtime) and the hyperloglog configuration is default (HLL_REGISTERS == 16384 && HLL_BITS == 6). When merging 3 dense hll structures, the benchmark shows a 12x speedup compared to the scalar version. ``` pfcount key1 key2 key3 pfmerge keyall key1 key2 key3 ``` ``` ====================================================================================================== Type Ops/sec Avg. 
Latency p50 Latency p99 Latency p99.9 Latency KB/sec ------------------------------------------------------------------------------------------------------ PFCOUNT-scalar 5570.09 35.89060 32.51100 65.27900 69.11900 299.17 PFCOUNT-avx2 72604.92 2.82072 2.73500 5.50300 7.13500 3899.68 ------------------------------------------------------------------------------------------------------ PFMERGE-scalar 7879.13 25.52156 24.19100 46.33500 48.38300 492.45 PFMERGE-avx2 126448.64 1.58120 1.53500 3.08700 4.89500 7903.04 ------------------------------------------------------------------------------------------------------ scalar: redis:unstable 9906daf5c9fdb836a5b3f04829c75701a4e90eb4 avx2: Nugine:hll-simd 02e09f85ac07eace50ebdddd0fd70822f7b9152d CPU: 13th Gen Intel® Core™ i9-13900H × 20 Memory: 32.0 GiB OS: Ubuntu 22.04.5 LTS ``` Experiment repo: https://github.com/Nugine/redis-hyperloglog Benchmark script: https://github.com/Nugine/redis-hyperloglog/blob/main/scripts/memtier.sh Algorithm: https://github.com/Nugine/redis-hyperloglog/blob/main/cpp/bench.cpp resolves #13551 --------- Co-authored-by: Yuan Wang Co-authored-by: debing.sun --- src/config.h | 13 +++ src/hyperloglog.c | 264 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 265 insertions(+), 12 deletions(-) diff --git a/src/config.h b/src/config.h index ae072c9dfb..c7fadf5ff1 100644 --- a/src/config.h +++ b/src/config.h @@ -318,4 +318,17 @@ void setcpuaffinity(const char *cpulist); #define ATTRIBUTE_TARGET_POPCNT #endif +/* Check if we can compile AVX2 code */ +#if defined (__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4)) +#if defined(__has_attribute) && __has_attribute(target) +#define HAVE_AVX2 +#endif +#endif + +#if defined (HAVE_AVX2) +#define ATTRIBUTE_TARGET_AVX2 __attribute__((target("avx2"))) +#else +#define ATTRIBUTE_TARGET_AVX2 +#endif + #endif diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 6ed88c13ba..aa51d4eab9 100644 --- 
a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -13,6 +13,10 @@ #include #include +#ifdef HAVE_AVX2 +#include +#endif + /* The Redis HyperLogLog implementation is based on the following ideas: * * * The use of a 64 bit hash function as proposed in [1], in order to estimate @@ -1041,6 +1045,132 @@ int hllAdd(robj *o, unsigned char *ele, size_t elesize) { } } +#ifdef HAVE_AVX2 +/* A specialized version of hllMergeDense, optimized for default configurations. + * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllMergeDense) + * + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) { + const __m256i shuffle = _mm256_setr_epi8( // + 4, 5, 6, -1, // + 7, 8, 9, -1, // + 10, 11, 12, -1, // + 13, 14, 15, -1, // + 0, 1, 2, -1, // + 3, 4, 5, -1, // + 6, 7, 8, -1, // + 9, 10, 11, -1 // + ); + + /* Merge the first 8 registers (6 bytes) normally + * as the AVX2 algorithm needs 4 padding bytes at the start */ + uint8_t val; + for (int i = 0; i < 8; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } + + /* Dense to Raw: + * + * 4 registers in 3 bytes: + * {bbaaaaaa|ccccbbbb|ddddddcc} + * + * LOAD 32 bytes (32 registers) per iteration: + * 4(padding) + 12(16 registers) + 12(16 registers) + 4(padding) + * {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX} + * + * SHUFFLE to: + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 3 valid bytes (4 registers) and a zero byte. 
+ * + * extract registers in each group with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (<<0) + * {00000000|00bbbbbb|00000000|00000000} x8 (<<2) + * {00000000|00000000|00cccccc|00000000} x8 (<<4) + * {00000000|00000000|00000000|00dddddd} x8 (<<6) + * + * merge the extracted registers with OR: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * Finally, compute MAX(reg_raw, merged) and STORE it back to reg_raw + */ + + /* Skip 8 registers (6 bytes) */ + const uint8_t *r = reg_dense + 6 - 4; + uint8_t *t = reg_raw + 8; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x0, x; + x0 = _mm256_loadu_si256((__m256i *)r); + x = _mm256_shuffle_epi8(x0, shuffle); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00000fc0)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x0003f000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x00fc0000)); + + a2 = _mm256_slli_epi32(a2, 2); + a3 = _mm256_slli_epi32(a3, 4); + a4 = _mm256_slli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + + __m256i z = _mm256_loadu_si256((__m256i *)t); + + z = _mm256_max_epu8(z, y); + + _mm256_storeu_si256((__m256i *)t, z); + + r += 24; + t += 32; + } + + /* Merge the last 24 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 24; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} +#endif + +/* Merge dense-encoded registers to raw registers array. 
*/ +void hllMergeDense(uint8_t* reg_raw, const uint8_t* reg_dense) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (__builtin_cpu_supports("avx2")) { + hllMergeDenseAVX2(reg_raw, reg_dense); + return; + } + } +#endif + + uint8_t val; + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} + /* Merge by computing MAX(registers[i],hll[i]) the HyperLogLog 'hll' * with an array of uint8_t HLL_REGISTERS registers pointed by 'max'. * @@ -1054,12 +1184,7 @@ int hllMerge(uint8_t *max, robj *hll) { int i; if (hdr->encoding == HLL_DENSE) { - uint8_t val; - - for (i = 0; i < HLL_REGISTERS; i++) { - HLL_DENSE_GET_REGISTER(val,hdr->registers,i); - if (val > max[i]) max[i] = val; - } + hllMergeDense(max, hdr->registers); } else { uint8_t *p = hll->ptr, *end = p + sdslen(hll->ptr); long runlen, regval; @@ -1091,6 +1216,117 @@ int hllMerge(uint8_t *max, robj *hll) { return C_OK; } +#ifdef HAVE_AVX2 +/* A specialized version of hllDenseCompress, optimized for default configurations. + * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllDenseCompress) + * + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) { + const __m256i shuffle = _mm256_setr_epi8( // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1, // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1 // + ); + + /* Raw to Dense: + * + * LOAD 32 bytes (32 registers) per iteration: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 4 registers. 
+ * + * move the registers to correct positions with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (>>0) + * {bb000000|0000bbbb|00000000|00000000} x8 (>>2) + * {00000000|cccc0000|000000cc|00000000} x8 (>>4) + * {00000000|00000000|dddddd00|00000000} x8 (>>6) + * + * merge the registers with OR: + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * + * SHUFFLE to: + * {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000} + * + * STORE the lower half and higher half respectively: + * AAABBBCCCDDD0000 + * EEEFFFGGGHHH0000 + * AAABBBCCCDDDEEEFFFGGGHHH0000 + * + * Note that the last 4 bytes are padding bytes. + */ + + const uint8_t *r = reg_raw; + uint8_t *t = reg_dense; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x = _mm256_loadu_si256((__m256i *)r); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00003f00)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x003f0000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x3f000000)); + + a2 = _mm256_srli_epi32(a2, 2); + a3 = _mm256_srli_epi32(a3, 4); + a4 = _mm256_srli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + y = _mm256_shuffle_epi8(y, shuffle); + + __m128i lower, higher; + lower = _mm256_castsi256_si128(y); + higher = _mm256_extracti128_si256(y, 1); + + _mm_storeu_si128((__m128i *)t, lower); + _mm_storeu_si128((__m128i *)(t + 12), higher); + + r += 32; + t += 24; + } + + /* Merge the last 32 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 32; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} +#endif + +/* Compress raw registers to dense representation. 
*/ +void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (__builtin_cpu_supports("avx2")) { + hllDenseCompressAVX2(reg_dense, reg_raw); + return; + } + } +#endif + + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} + /* ========================== HyperLogLog commands ========================== */ /* Create an HLL object. We always create the HLL using sparse encoding. @@ -1350,12 +1586,16 @@ void pfmergeCommand(client *c) { /* Write the resulting HLL to the destination HLL registers and * invalidate the cached value. */ - for (j = 0; j < HLL_REGISTERS; j++) { - if (max[j] == 0) continue; - hdr = o->ptr; - switch(hdr->encoding) { - case HLL_DENSE: hllDenseSet(hdr->registers,j,max[j]); break; - case HLL_SPARSE: hllSparseSet(o,j,max[j]); break; + if (use_dense) { + hllDenseCompress(hdr->registers, max); + } else { + for (j = 0; j < HLL_REGISTERS; j++) { + if (max[j] == 0) continue; + hdr = o->ptr; + switch (hdr->encoding) { + case HLL_DENSE: hllDenseSet(hdr->registers,j,max[j]); break; + case HLL_SPARSE: hllSparseSet(o,j,max[j]); break; + } } } hdr = o->ptr; /* o->ptr may be different now, as a side effect of