mirror of
https://github.com/data61/MP-SPDZ.git
synced 2026-05-13 03:00:24 -04:00
329 lines
15 KiB
C
329 lines
15 KiB
C
// This file is reduced to the functionality necessary for AES in order to
// avoid conflicts with simde.
|
|
|
|
/*
|
|
* sse2neon is freely redistributable under the MIT License.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
* in the Software without restriction, including without limitation the rights
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
* furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
* SOFTWARE.
|
|
*/
|
|
|
|
/* Helper macros used by every definition below.
 *
 * FORCE_INLINE  - storage class / inlining qualifiers for the intrinsics.
 * ALIGN_STRUCT  - alignment attribute for aggregate types.
 *
 * On GCC and clang any existing definitions of these two names are saved with
 * push_macro before being replaced. Other compilers are rejected outright by
 * the #error; the definitions that follow it only document what such a port
 * would need.
 *
 * NOTE(review): no matching "#pragma pop_macro" is visible in this file --
 * confirm whether the saved definitions are meant to be restored.
 */
#if !defined(__GNUC__) && !defined(__clang__)
#error "Macro name collisions may happen with unsupported compiler."
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
#else
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#endif
|
|
|
|
/* Reinterpret casts between the __m128i representation and NEON vectors.
 * The targets used here (vreinterpretq_s64_* / vreinterpretq_*_s64) imply
 * that __m128i is a vector of signed 64-bit lanes -- its typedef is not
 * part of this file.
 */

// __m128i -> uint8x16_t
#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)

// uint8x16_t -> __m128i
#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)

// uint32x4_t -> __m128i
#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
|
|
|
|
// A struct is defined in this header file called 'SIMDVec' which can be used
|
|
// by applications which attempt to access the contents of an _m128 struct
|
|
// directly. It is important to note that accessing the __m128 struct directly
|
|
// is bad coding practice by Microsoft: @see:
|
|
// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
|
|
//
|
|
// However, some legacy source code may try to access the contents of an __m128
|
|
// struct directly so the developer can use the SIMDVec as an alias for it. Any
|
|
// casting must be done manually by the developer, as you cannot cast or
|
|
// otherwise alias the base NEON data type for intrinsic operations.
|
|
//
|
|
// union intended to allow direct access to an __m128 variable using the names
|
|
// that the MSVC compiler provides. This union should really only be used when
|
|
// trying to access the members of the vector as integer values. GCC/clang
|
|
// allow native access to the float members through a simple array access
|
|
// operator (in C since 4.6, in C++ since 4.8).
|
|
//
|
|
// Ideally direct accesses to SIMD vectors should not be used since it can cause
|
|
// a performance hit. If it really is needed however, the original __m128
|
|
// variable can be aliased with a pointer to this union and used to access
|
|
// individual components. The use of this union should be hidden behind a macro
|
|
// that is used throughout the codebase to access the members instead of always
|
|
// declaring this type of variable.
|
|
typedef union ALIGN_STRUCT(16) SIMDVec {
|
|
float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
|
|
int8_t m128_i8[16]; // as signed 8-bit integers.
|
|
int16_t m128_i16[8]; // as signed 16-bit integers.
|
|
int32_t m128_i32[4]; // as signed 32-bit integers.
|
|
int64_t m128_i64[2]; // as signed 64-bit integers.
|
|
uint8_t m128_u8[16]; // as unsigned 8-bit integers.
|
|
uint16_t m128_u16[8]; // as unsigned 16-bit integers.
|
|
uint32_t m128_u32[4]; // as unsigned 32-bit integers.
|
|
uint64_t m128_u64[2]; // as unsigned 64-bit integers.
|
|
} SIMDVec;
|
|
|
|
// Lane accessors built on SIMDVec.
//
// Each macro takes the address of the lvalue `x`, views it as a SIMDVec and
// yields lane `n` of the requested width as an lvalue, so the result can be
// read or assigned. `x` must be an addressable object and is evaluated once.
//
// Both arguments are parenthesized so that compound argument expressions
// (for example a conditional or assignment expression when this header is
// compiled as C++) bind as a unit instead of being split by `&` or `[]`.
#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &(x))->m128_u64[(n)])
#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &(x))->m128_u32[(n)])
#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &(x))->m128_u8[(n)])
|
|
|
|
/* Backwards compatibility for compilers that lack support for specific types */
|
|
|
|
// Older gcc does not define vld1q_u8_x4 type
|
|
#if defined(__GNUC__) && !defined(__clang__) && \
|
|
((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \
|
|
(__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) || \
|
|
(__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7)
|
|
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
|
|
{
|
|
uint8x16x4_t ret;
|
|
ret.val[0] = vld1q_u8(p + 0);
|
|
ret.val[1] = vld1q_u8(p + 16);
|
|
ret.val[2] = vld1q_u8(p + 32);
|
|
ret.val[3] = vld1q_u8(p + 48);
|
|
return ret;
|
|
}
|
|
#else
|
|
// Wraps vld1q_u8_x4
|
|
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
|
|
{
|
|
return vld1q_u8_x4(p);
|
|
}
|
|
#endif
|
|
|
|
#if !defined(__ARM_FEATURE_CRYPTO)
|
|
/* clang-format off */
/* The 256 AES S-box values in index order, each wrapped in the caller-supplied
 * macro `w`. Expands to a brace-enclosed initializer list, so the same data
 * can be used to generate several tables that differ only in how each entry
 * is transformed. Two lines below correspond to one 16-entry S-box row.
 */
#define SSE2NEON_AES_DATA(w)                                           \
    {                                                                  \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5), \
        w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76), \
        w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0), \
        w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0), \
        w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), \
        w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), \
        w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75), \
        w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0), \
        w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84), \
        w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b), \
        w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), \
        w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), \
        w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5), \
        w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2), \
        w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17), \
        w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73), \
        w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), \
        w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), \
        w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79), \
        w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9), \
        w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08), \
        w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6), \
        w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), \
        w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), \
        w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94), \
        w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf), \
        w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68), \
        w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16)  \
    }
/* clang-format on */
|
|
|
|
/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
|
|
#define SSE2NEON_AES_H0(x) (x)
|
|
static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
|
|
#undef SSE2NEON_AES_H0
|
|
|
|
// In the absence of crypto extensions, implement aesenc using regular neon
|
|
// intrinsics instead. See:
|
|
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
|
|
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
|
|
// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
|
|
// for more information Reproduced with permission of the author.
|
|
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
|
|
{
|
|
#if defined(__aarch64__)
|
|
static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
|
|
0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
|
|
0xc, 0x1, 0x6, 0xb};
|
|
static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
|
|
0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
|
|
|
|
uint8x16_t v;
|
|
uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
|
|
|
|
// shift rows
|
|
w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
|
|
|
|
// sub bytes
|
|
v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
|
|
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
|
|
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
|
|
v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
|
|
|
|
// mix columns
|
|
w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
|
|
w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
|
|
w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
|
|
|
|
// add round key
|
|
return vreinterpretq_m128i_u8(w) ^ RoundKey;
|
|
|
|
#else /* ARMv7-A NEON implementation */
|
|
#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
|
|
(((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
|
|
(b0))
|
|
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
|
|
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
|
|
#define SSE2NEON_AES_U0(p) \
|
|
SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
|
|
#define SSE2NEON_AES_U1(p) \
|
|
SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
|
|
#define SSE2NEON_AES_U2(p) \
|
|
SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
|
|
#define SSE2NEON_AES_U3(p) \
|
|
SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
|
|
static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
|
|
SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
|
|
SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
|
|
SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
|
|
SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
|
|
};
|
|
#undef SSE2NEON_AES_B2W
|
|
#undef SSE2NEON_AES_F2
|
|
#undef SSE2NEON_AES_F3
|
|
#undef SSE2NEON_AES_U0
|
|
#undef SSE2NEON_AES_U1
|
|
#undef SSE2NEON_AES_U2
|
|
#undef SSE2NEON_AES_U3
|
|
|
|
uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
|
|
uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
|
|
uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
|
|
uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
|
|
|
|
__m128i out = _mm_set_epi32(
|
|
(aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
|
|
aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
|
|
(aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
|
|
aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
|
|
(aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
|
|
aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
|
|
(aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
|
|
aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
|
|
|
|
return _mm_xor_si128(out, RoundKey);
|
|
#endif
|
|
}
|
|
|
|
// Perform the last round of an AES encryption flow on data (state) in a using
|
|
// the round key in RoundKey, and store the result in dst.
|
|
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
|
|
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
|
|
{
|
|
/* FIXME: optimized for NEON */
|
|
uint8_t v[4][4] = {
|
|
{SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
|
|
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
|
|
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
|
|
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
|
|
{SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
|
|
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
|
|
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
|
|
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
|
|
{SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
|
|
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
|
|
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
|
|
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
|
|
{SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
|
|
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
|
|
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
|
|
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
|
|
};
|
|
for (int i = 0; i < 16; i++)
|
|
vreinterpretq_nth_u8_m128i(a, i) =
|
|
v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
|
|
return a;
|
|
}
|
|
|
|
// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
|
|
// This instruction generates a round key for AES encryption. See
|
|
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
|
|
// for details.
|
|
//
|
|
// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
|
|
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
|
|
{
|
|
uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
|
|
uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
|
|
for (int i = 0; i < 4; ++i) {
|
|
((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
|
|
((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
|
|
}
|
|
return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
|
|
((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
|
|
}
|
|
#undef SSE2NEON_AES_DATA
|
|
|
|
#else /* __ARM_FEATURE_CRYPTO */
|
|
// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
|
|
// AESMC and then manually applying the real key as an xor operation. This
|
|
// unfortunately means an additional xor op; the compiler should be able to
|
|
// optimize this away for repeated calls however. See
|
|
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
|
|
// for more details.
|
|
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
|
|
{
|
|
return vreinterpretq_m128i_u8(
|
|
vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
|
|
vreinterpretq_u8_m128i(b));
|
|
}
|
|
|
|
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
|
|
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
|
|
{
|
|
return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
|
|
vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
|
|
RoundKey);
|
|
}
|
|
|
|
// aeskeygenassist for targets with the ARM crypto extensions.
//
// AESE with an all-zero key performs ShiftRows and SubBytes on a. The bytes
// of the result are then picked so that the row shift is undone, producing
// from the lowest word up:
//   SubBytes(X1), ROT(SubBytes(X1)), SubBytes(X3), ROT(SubBytes(X3))
// rcon is finally xor-ed into the two rotated words.
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    const uint8x16_t zero_key = vdupq_n_u8(0);
    uint8x16_t sb = vaeseq_u8(vreinterpretq_u8_m128i(a), zero_key);

    // Undo ShiftRows step from AESE and extract X1 and X3
    uint8x16_t words = {
        sb[0x4], sb[0x1], sb[0xE], sb[0xB],  // SubBytes(X1)
        sb[0x1], sb[0xE], sb[0xB], sb[0x4],  // ROT(SubBytes(X1))
        sb[0xC], sb[0x9], sb[0x6], sb[0x3],  // SubBytes(X3)
        sb[0x9], sb[0x6], sb[0x3], sb[0xC],  // ROT(SubBytes(X3))
    };

    // rcon only affects words 1 and 3 (the rotated ones).
    uint32x4_t rcon_words = {0, (unsigned) rcon, 0, (unsigned) rcon};

    return vreinterpretq_m128i_u8(words) ^ vreinterpretq_m128i_u32(rcon_words);
}
|
|
#endif
|