diff --git a/gcm.cpp b/gcm.cpp index 404eaea7..e75c43a4 100644 --- a/gcm.cpp +++ b/gcm.cpp @@ -269,7 +269,7 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const inline void GCM_Base::ReverseHashBufferIfNeeded() { -#if CRYPTOPP_CLMUL_AVAILABLE +#if CRYPTOPP_SSSE3_AVAILABLE if (HasCLMUL()) { GCM_ReverseHashBufferIfNeeded_SSSE3(HashBuffer()); diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp index 39a4dd96..b2ff8edb 100644 --- a/rijndael-simd.cpp +++ b/rijndael-simd.cpp @@ -16,8 +16,16 @@ #endif #if (CRYPTOPP_SSE41_AVAILABLE) +// Hack... Apple conflates SSE4.1 and SSE4.2. Without __SSE4_2__, +// Apple fails the compile with "SSE4.2 instruction set not enabled" +// when "nmmintrin.h" is included. Its non-trivial for us to +// automatically add -msse4.2 for Apple Clang. We also want to +// avoid problems on low-end Atoms which have AES but lack SSE4.2. +# if (CRYPTOPP_APPLE_CLANG_VERSION) +# define __SSE4_2__ 1 +# endif # include "nmmintrin.h" -#endif +#endif // CRYPTOPP_SSE41_AVAILABLE #if (CRYPTOPP_AESNI_AVAILABLE) # include "wmmintrin.h" @@ -129,7 +137,8 @@ void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int block = _mm_aesenclast_si128(block, subkeys[rounds]); } -inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds) +inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, + MAYBE_CONST __m128i *subkeys, unsigned int rounds) { __m128i rk = subkeys[0]; block0 = _mm_xor_si128(block0, rk); @@ -163,7 +172,8 @@ void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int block = _mm_aesdeclast_si128(block, subkeys[rounds]); } -void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds) +void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, + MAYBE_CONST __m128i *subkeys, unsigned int rounds) { __m128i rk = subkeys[0]; block0 = _mm_xor_si128(block0, rk); @@ -298,16 +308,18 @@ inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4, MAYBE_CON return length; } -size_t Rijndael_AdvancedProcessBlocks_Enc_AESNI(MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +size_t Rijndael_AdvancedProcessBlocks_Enc_AESNI(MAYBE_CONST word32 *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { + MAYBE_CONST __m128i* keys = reinterpret_cast(subkeys); return Rijndael_AdvancedProcessBlocks_AESNI(AESNI_Enc_Block, AESNI_Enc_4_Blocks, - subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); + keys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } -size_t Rijndael_AdvancedProcessBlocks_Dec_AESNI(MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +size_t Rijndael_AdvancedProcessBlocks_Dec_AESNI(MAYBE_CONST word32 *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { + MAYBE_CONST __m128i* keys = reinterpret_cast(subkeys); return Rijndael_AdvancedProcessBlocks_AESNI(AESNI_Dec_Block, AESNI_Dec_4_Blocks, - subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); + keys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk) diff --git a/rijndael.cpp b/rijndael.cpp index bea3b45e..4d803001 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -74,11 +74,6 @@ being unloaded from L1 cache, until that round is finished. #include "misc.h" #include "cpu.h" -// TODO: remove... -#if (CRYPTOPP_AESNI_AVAILABLE) -# include "wmmintrin.h" -#endif - // TODO: remove... #if (CRYPTOPP_ARM_AES_AVAILABLE) # include "arm_neon.h" @@ -229,9 +224,9 @@ void Rijndael::Base::FillDecTable() extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk); extern void Rijndael_UncheckedSetKeyRev_SSE4_AESNI(word32 *key, unsigned int rounds); -extern size_t Rijndael_AdvancedProcessBlocks_Enc_AESNI(MAYBE_CONST __m128i *subkeys, unsigned int rounds, +extern size_t Rijndael_AdvancedProcessBlocks_Enc_AESNI(const word32 *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); -extern size_t Rijndael_AdvancedProcessBlocks_Dec_AESNI(MAYBE_CONST __m128i *subkeys, unsigned int rounds, +extern size_t Rijndael_AdvancedProcessBlocks_Dec_AESNI(const word32 *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); #endif @@ -1053,8 +1048,7 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo { #if CRYPTOPP_AESNI_AVAILABLE if (HasAESNI()) - return Rijndael_AdvancedProcessBlocks_Enc_AESNI((MAYBE_CONST __m128i *)(const void *)m_key.begin(), - m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); + return Rijndael_AdvancedProcessBlocks_Enc_AESNI(m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif @@ -1117,8 +1111,7 @@ size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo { #if CRYPTOPP_AESNI_AVAILABLE if (HasAESNI()) - return Rijndael_AdvancedProcessBlocks_Dec_AESNI((MAYBE_CONST __m128i *)(const void *)m_key.begin(), - m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); + return Rijndael_AdvancedProcessBlocks_Dec_AESNI(m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);