diff --git a/GNUmakefile b/GNUmakefile index 49ad24ef..ae19f145 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -882,6 +882,8 @@ gcm-simd.o : gcm-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $< # AESNI or ARMv7a/ARMv8a available +rijndael.o : rijndael.cpp + $(CXX) $(strip $(CXXFLAGS) $(AES_FLAG) -c) $< rijndael-simd.o : rijndael-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(AES_FLAG) -c) $< diff --git a/config.h b/config.h index 9b69e904..c22be086 100644 --- a/config.h +++ b/config.h @@ -521,9 +521,6 @@ NAMESPACE_END #define CRYPTOPP_AESNI_AVAILABLE 1 #endif -// TODO: -#undef CRYPTOPP_AESNI_AVAILABLE - #if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHA) && \ (defined(__SHA__) || (CRYPTOPP_MSC_VERSION >= 1900) || \ (CRYPTOPP_GCC_VERSION >= 40900) || (__INTEL_COMPILER >= 1300) || \ diff --git a/gcm.cpp b/gcm.cpp index a3228103..404eaea7 100644 --- a/gcm.cpp +++ b/gcm.cpp @@ -29,8 +29,7 @@ NAMESPACE_BEGIN(CryptoPP) #if (CRYPTOPP_CLMUL_AVAILABLE) -# include "wmmintrin.h" -# include "tmmintrin.h" +# include "emmintrin.h" #endif #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) @@ -94,17 +93,8 @@ extern void GCM_ReverseHashBufferIfNeeded_SSSE3(byte *hashBuffer); #endif #if CRYPTOPP_CLMUL_AVAILABLE -extern __m128i GCM_Multiply_CLMUL(const __m128i &x, const __m128i &h, const __m128i &r); extern void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned int tableSize); extern size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer); - -CRYPTOPP_ALIGN_DATA(16) -const word64 s_clmulConstants64[] = { - W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), - W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), - W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)}; - -const __m128i *s_clmulConstants = (const __m128i *)(const void *)s_clmulConstants64; const unsigned int s_cltableSizeInBlocks = 8; #endif // CRYPTOPP_CLMUL_AVAILABLE @@ -113,18 +103,8 @@ extern void GCM_ReverseHashBufferIfNeeded_NEON(byte *hashBuffer); #endif #if CRYPTOPP_ARM_PMULL_AVAILABLE -extern uint64x2_t GCM_Multiply_PMULL(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r); extern void GCM_SetKeyWithoutResync_PMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize); extern size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer); - -CRYPTOPP_ALIGN_DATA(16) -const word64 s_clmulConstants64[] = { - W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), // Used for ARM and x86; polynomial coefficients - W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), // Unused for ARM; used for x86 _mm_shuffle_epi8 - W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f) // Unused for ARM; used for x86 _mm_shuffle_epi8 -}; - -const uint64x2_t *s_clmulConstants = (const uint64x2_t *)s_clmulConstants64; const unsigned int s_cltableSizeInBlocks = 8; #endif // CRYPTOPP_ARM_PMULL_AVAILABLE diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp new file mode 100644 index 00000000..a6575c34 --- /dev/null +++ b/rijndael-simd.cpp @@ -0,0 +1,195 @@ +// rijndael-simd.cpp - written and placed in the public domain by +// Jeffrey Walton, Uri Blumenthal and Marcel Raad. +// +// This source file uses intrinsics to gain access to AES-NI and +// ARMv8a AES instructions. A separate source file is needed +// because additional CXXFLAGS are required to enable the +// appropriate instructions sets in some build configurations. + +#include "pch.h" +#include "config.h" +#include "misc.h" + +// Clang and GCC hoops... +#if !(defined(__ARM_FEATURE_CRYPTO) || defined(_MSC_VER)) +# undef CRYPTOPP_ARM_AES_AVAILABLE +#endif + +#if (CRYPTOPP_SSE42_AVAILABLE) +# include "nmmintrin.h" +#endif + +#if (CRYPTOPP_AESNI_AVAILABLE) +# include "wmmintrin.h" +#endif + +#if (CRYPTOPP_ARM_AES_AVAILABLE) +# include "arm_neon.h" +# include "arm_acle.h" +#endif + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +# include +# include +#endif + +#ifndef EXCEPTION_EXECUTE_HANDLER +# define EXCEPTION_EXECUTE_HANDLER 1 +#endif + +NAMESPACE_BEGIN(CryptoPP) + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +extern "C" { + typedef void (*SigHandler)(int); + + static jmp_buf s_jmpSIGILL; + static void SigIllHandler(int) + { + longjmp(s_jmpSIGILL, 1); + } +}; +#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY + +#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) +bool CPU_TryAES_ARMV8() +{ +#if (CRYPTOPP_ARM_AES_AVAILABLE) +# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) + volatile bool result = true; + __try + { + // AES encrypt and decrypt + uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0); + uint8x16_t r1 = vaeseq_u8(data, key); + uint8x16_t r2 = vaesdq_u8(data, key); + + result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7)); + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + return result; +# else + // longjmp and clobber warnings. Volatile is required. + // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerAES); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpNoAES)) + result = false; + else + { + uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0); + uint8x16_t r1 = vaeseq_u8(data, key); + uint8x16_t r2 = vaesdq_u8(data, key); + + // Hack... GCC optimizes away the code and returns true + result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7)); + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +# endif +#else + return false; +#endif // CRYPTOPP_ARM_AES_AVAILABLE +} +#endif // ARM32 or ARM64 + +#if (CRYPTOPP_ARM_AES_AVAILABLE) +#endif // CRYPTOPP_ARM_AES_AVAILABLE + +#if (CRYPTOPP_AESNI_AVAILABLE) +void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk) +{ + const unsigned rounds = keyLen/4 + 6; + static const word32 rcLE[] = { + 0x01, 0x02, 0x04, 0x08, + 0x10, 0x20, 0x40, 0x80, + 0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ + }; + + const word32 *ro = rcLE, *rc = rcLE; + CRYPTOPP_UNUSED(ro); + + __m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keyLen-16)); + std::memcpy(rk, userKey, keyLen); + + // keySize: m_key allocates 4*(rounds+1 word32's. + const size_t keySize = 4*(rounds+1); + const word32* end = rk + keySize; + while (true) + { + CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE)); + rk[keyLen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++); + rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4]; + rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1]; + rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2]; + + if (rk + keyLen/4 + 4 == end) + break; + + if (keyLen == 24) + { + rk[10] = rk[ 4] ^ rk[ 9]; + rk[11] = rk[ 5] ^ rk[10]; + + CRYPTOPP_ASSERT(keySize >= 12); + temp = _mm_insert_epi32(temp, rk[11], 3); + } + else if (keyLen == 32) + { + CRYPTOPP_ASSERT(keySize >= 12); + temp = _mm_insert_epi32(temp, rk[11], 3); + rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2); + rk[13] = rk[ 5] ^ rk[12]; + rk[14] = rk[ 6] ^ rk[13]; + rk[15] = rk[ 7] ^ rk[14]; + + CRYPTOPP_ASSERT(keySize >= 16); + temp = _mm_insert_epi32(temp, rk[15], 3); + } + else + { + CRYPTOPP_ASSERT(keySize >= 8); + temp = _mm_insert_epi32(temp, rk[7], 3); + } + + rk += keyLen/4; + } +} + +void Rijndael_UncheckedSetKeyRev_SSE4_AESNI(word32 *key, unsigned int rounds) +{ + unsigned int i, j; + __m128i temp; + +#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120) + // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11. + // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11. + vec_swap(*(__m128i *)(key), *(__m128i *)(key+4*rounds)); +#else + std::swap(*(__m128i *)(void *)(key), *(__m128i *)(void *)(key+4*rounds)); +#endif + for (i = 4, j = 4*rounds-4; i < j; i += 4, j -= 4) + { + temp = _mm_aesimc_si128(*(__m128i *)(void *)(key+i)); + *(__m128i *)(void *)(key+i) = _mm_aesimc_si128(*(__m128i *)(void *)(key+j)); + *(__m128i *)(void *)(key+j) = temp; + } + + *(__m128i *)(void *)(key+i) = _mm_aesimc_si128(*(__m128i *)(void *)(key+i)); +} +#endif // CRYPTOPP_AESNI_AVAILABLE + +NAMESPACE_END diff --git a/rijndael.cpp b/rijndael.cpp index b3045b30..f54fec60 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -74,6 +74,17 @@ being unloaded from L1 cache, until that round is finished. #include "misc.h" #include "cpu.h" +// TODO: remove... +#if (CRYPTOPP_AESNI_AVAILABLE) +# include "wmmintrin.h" +#endif + +// TODO: remove... +#if (CRYPTOPP_ARM_AES_AVAILABLE) +# include "arm_neon.h" +# include "arm_acle.h" +#endif + NAMESPACE_BEGIN(CryptoPP) // Hack for http://github.com/weidai11/cryptopp/issues/42 and http://github.com/weidai11/cryptopp/issues/132 @@ -214,11 +225,16 @@ void Rijndael::Base::FillDecTable() s_TdFilled = true; } -void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &) -{ - AssertValidKeyLength(keylen); +#if (CRYPTOPP_AESNI_AVAILABLE) +extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk); +extern void Rijndael_UncheckedSetKeyRev_SSE4_AESNI(word32 *key, unsigned int rounds); +#endif - m_rounds = keylen/4 + 6; +void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &) +{ + AssertValidKeyLength(keyLen); + + m_rounds = keyLen/4 + 6; m_key.New(4*(m_rounds+1)); word32 *rk = m_key; @@ -227,110 +243,36 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64 if (HasAESNI() && HasSSE4()) { - static const word32 rcLE[] = { - 0x01, 0x02, 0x04, 0x08, - 0x10, 0x20, 0x40, 0x80, - 0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ - }; - - // Coverity finding, appears to be false positive. Assert the condition. - const word32 *ro = rcLE, *rc = rcLE; - CRYPTOPP_UNUSED(ro); - - __m128i temp = _mm_loadu_si128((__m128i *)(void *)(userKey+keylen-16)); - memcpy(rk, userKey, keylen); - - while (true) - { - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE)); - rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++); - rk[keylen/4+1] = rk[1] ^ rk[keylen/4]; - rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1]; - rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2]; - - if (rk + keylen/4 + 4 == m_key.end()) - break; - - if (keylen == 24) - { - rk[10] = rk[ 4] ^ rk[ 9]; - rk[11] = rk[ 5] ^ rk[10]; - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(m_key.size() >= 12); - temp = _mm_insert_epi32(temp, rk[11], 3); - } - else if (keylen == 32) - { - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(m_key.size() >= 12); - temp = _mm_insert_epi32(temp, rk[11], 3); - rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2); - rk[13] = rk[ 5] ^ rk[12]; - rk[14] = rk[ 6] ^ rk[13]; - rk[15] = rk[ 7] ^ rk[14]; - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(m_key.size() >= 16); - temp = _mm_insert_epi32(temp, rk[15], 3); - } - else - { - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(m_key.size() >= 8); - temp = _mm_insert_epi32(temp, rk[7], 3); - } - - rk += keylen/4; - } - + Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk); if (!IsForwardTransformation()) - { - rk = m_key; - unsigned int i, j; - -#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120) - // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11. - // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11. - vec_swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds)); -#else - std::swap(*(__m128i *)(void *)(rk), *(__m128i *)(void *)(rk+4*m_rounds)); -#endif - for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4) - { - temp = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i)); - *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+j)); - *(__m128i *)(void *)(rk+j) = temp; - } - - *(__m128i *)(void *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(void *)(rk+i)); - } + Rijndael_UncheckedSetKeyRev_SSE4_AESNI(m_key, m_rounds); return; } #endif - GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen); + GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen); const word32 *rc = rcon; word32 temp; while (true) { - temp = rk[keylen/4-1]; + temp = rk[keyLen/4-1]; word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)]; - rk[keylen/4] = rk[0] ^ x ^ *(rc++); - rk[keylen/4+1] = rk[1] ^ rk[keylen/4]; - rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1]; - rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2]; + rk[keyLen/4] = rk[0] ^ x ^ *(rc++); + rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4]; + rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1]; + rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2]; - if (rk + keylen/4 + 4 == m_key.end()) + if (rk + keyLen/4 + 4 == m_key.end()) break; - if (keylen == 24) + if (keyLen == 24) { rk[10] = rk[ 4] ^ rk[ 9]; rk[11] = rk[ 5] ^ rk[10]; } - else if (keylen == 32) + else if (keyLen == 32) { temp = rk[11]; rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)]; @@ -338,7 +280,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c rk[14] = rk[ 6] ^ rk[13]; rk[15] = rk[ 7] ^ rk[14]; } - rk += keylen/4; + rk += keyLen/4; } rk = m_key;