Finish AESNI cut-in

Remove specialized MMX and ISSE support. SSE2 is now the floor
pull/461/head
Jeffrey Walton 2017-08-01 01:20:30 -04:00
parent 142fe88ae1
commit 1356456f05
9 changed files with 248 additions and 256 deletions

GNUmakefile

@@ -882,8 +882,6 @@ gcm-simd.o : gcm-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $<
# AESNI or ARMv7a/ARMv8a available
rijndael.o : rijndael.cpp
$(CXX) $(strip $(CXXFLAGS) $(AES_FLAG) -c) $<
rijndael-simd.o : rijndael-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(AES_FLAG) -c) $<

blake2.cpp

@@ -114,7 +114,7 @@ typedef void (*pfnCompress64)(const byte*, BLAKE2_State<word64, true>&);
pfnCompress64 InitializeCompress64Fn()
{
#if CRYPTOPP_SSE42_AVAILABLE
if (HasSSE4())
if (HasSSE42())
return &BLAKE2_Compress64_SSE4;
else
#endif
@@ -136,7 +136,7 @@ pfnCompress64 InitializeCompress64Fn()
pfnCompress32 InitializeCompress32Fn()
{
#if CRYPTOPP_SSE42_AVAILABLE
if (HasSSE4())
if (HasSSE42())
return &BLAKE2_Compress32_SSE4;
else
#endif
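
The two blake2.cpp hunks above show the selection pattern this commit standardizes on: the CRYPTOPP_SSE42_AVAILABLE macro decides at compile time whether the SSE4.2 unit was built at all, and HasSSE42() checks CPUID at run time before handing out the SIMD routine. A minimal self-contained sketch of that dual gate (Compress_SSE42 and Compress_CXX are hypothetical stand-ins for the BLAKE2 compressors):

#include "cpu.h"     // Crypto++: CRYPTOPP_SSE42_AVAILABLE, CryptoPP::HasSSE42()
#include <cstdio>

// Hypothetical stand-ins for BLAKE2_Compress64_SSE4 / BLAKE2_Compress64_CXX.
static void Compress_SSE42(const unsigned char*, void*) { std::puts("SSE4.2 path"); }
static void Compress_CXX  (const unsigned char*, void*) { std::puts("portable path"); }

typedef void (*CompressFn)(const unsigned char*, void*);

static CompressFn SelectCompressFn()
{
#if CRYPTOPP_SSE42_AVAILABLE      // compile time: was the SSE4.2 unit built?
    if (CryptoPP::HasSSE42())     // run time: does the executing CPU report SSE4.2?
        return &Compress_SSE42;
#endif
    return &Compress_CXX;         // portable fallback
}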

cpu.cpp

@@ -185,8 +185,9 @@ static bool TrySSE2()
}
bool CRYPTOPP_SECTION_INIT g_x86DetectionDone = false;
bool CRYPTOPP_SECTION_INIT g_hasMMX = false, CRYPTOPP_SECTION_INIT g_hasISSE = false, CRYPTOPP_SECTION_INIT g_hasSSE2 = false, CRYPTOPP_SECTION_INIT g_hasSSSE3 = false;
bool CRYPTOPP_SECTION_INIT g_hasSSE4 = false, CRYPTOPP_SECTION_INIT g_hasAESNI = false, CRYPTOPP_SECTION_INIT g_hasCLMUL = false, CRYPTOPP_SECTION_INIT g_hasSHA = false;
bool CRYPTOPP_SECTION_INIT g_hasSSE2 = false, CRYPTOPP_SECTION_INIT g_hasSSSE3 = false;
bool CRYPTOPP_SECTION_INIT g_hasSSE41 = false, CRYPTOPP_SECTION_INIT g_hasSSE42 = false;
bool CRYPTOPP_SECTION_INIT g_hasAESNI = false, CRYPTOPP_SECTION_INIT g_hasCLMUL = false, CRYPTOPP_SECTION_INIT g_hasSHA = false;
bool CRYPTOPP_SECTION_INIT g_hasRDRAND = false, CRYPTOPP_SECTION_INIT g_hasRDSEED = false, CRYPTOPP_SECTION_INIT g_isP4 = false;
bool CRYPTOPP_SECTION_INIT g_hasPadlockRNG = false, CRYPTOPP_SECTION_INIT g_hasPadlockACE = false, CRYPTOPP_SECTION_INIT g_hasPadlockACE2 = false;
bool CRYPTOPP_SECTION_INIT g_hasPadlockPHE = false, CRYPTOPP_SECTION_INIT g_hasPadlockPMM = false;
@@ -225,26 +226,14 @@ void DetectX86Features()
if (!CpuId(1, cpuid2))
return;
g_hasMMX = (cpuid2[3] & (1 << 23)) != 0;
if ((cpuid2[3] & (1 << 26)) != 0)
g_hasSSE2 = TrySSE2();
g_hasSSSE3 = g_hasSSE2 && (cpuid2[2] & (1<<9));
g_hasSSE4 = g_hasSSE2 && ((cpuid2[2] & (1<<19)) && (cpuid2[2] & (1<<20)));
g_hasSSE41 = g_hasSSE2 && (cpuid2[2] & (1<<19));
g_hasSSE42 = g_hasSSE2 && (cpuid2[2] & (1<<20));
g_hasAESNI = g_hasSSE2 && (cpuid2[2] & (1<<25));
g_hasCLMUL = g_hasSSE2 && (cpuid2[2] & (1<<1));
if ((cpuid2[3] & (1 << 25)) != 0)
g_hasISSE = true;
else
{
CpuId(0x080000000, cpuid3);
if (cpuid3[0] >= 0x080000001)
{
CpuId(0x080000001, cpuid3);
g_hasISSE = (cpuid3[3] & (1 << 22)) != 0;
}
}
if (IsIntel(cpuid1))
{
static const unsigned int RDRAND_FLAG = (1 << 30);
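
For reference, the feature bits tested above come from CPUID leaf 1 (cpuid2[2] is ECX, cpuid2[3] is EDX). A small standalone sketch using GCC/Clang's <cpuid.h> that reads the same bits the detection code keys off (bit numbers per Intel's manual):

#include <cpuid.h>
#include <cstdio>

int main()
{
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 1;                        // CPUID leaf 1 not available

    const bool sse2  = (edx >> 26) & 1;  // EDX bit 26
    const bool ssse3 = (ecx >>  9) & 1;  // ECX bit 9
    const bool sse41 = (ecx >> 19) & 1;  // ECX bit 19
    const bool sse42 = (ecx >> 20) & 1;  // ECX bit 20
    const bool aesni = (ecx >> 25) & 1;  // ECX bit 25
    const bool clmul = (ecx >>  1) & 1;  // ECX bit 1 (PCLMULQDQ)

    std::printf("SSE2=%d SSSE3=%d SSE4.1=%d SSE4.2=%d AESNI=%d CLMUL=%d\n",
                sse2, ssse3, sse41, sse42, aesni, clmul);
    return 0;
}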

cpu.h

@@ -57,11 +57,10 @@ NAMESPACE_BEGIN(CryptoPP)
#ifndef CRYPTOPP_DOXYGEN_PROCESSING
// These should not be used directly
extern CRYPTOPP_DLL bool g_x86DetectionDone;
extern CRYPTOPP_DLL bool g_hasMMX;
extern CRYPTOPP_DLL bool g_hasISSE;
extern CRYPTOPP_DLL bool g_hasSSE2;
extern CRYPTOPP_DLL bool g_hasSSSE3;
extern CRYPTOPP_DLL bool g_hasSSE4;
extern CRYPTOPP_DLL bool g_hasSSE41;
extern CRYPTOPP_DLL bool g_hasSSE42;
extern CRYPTOPP_DLL bool g_hasAESNI;
extern CRYPTOPP_DLL bool g_hasCLMUL;
extern CRYPTOPP_DLL bool g_hasSHA;
@@ -79,36 +78,6 @@ CRYPTOPP_DLL void CRYPTOPP_API DetectX86Features();
CRYPTOPP_DLL bool CRYPTOPP_API CpuId(word32 input, word32 output[4]);
#endif // CRYPTOPP_DOXYGEN_PROCESSING
//! \brief Determines MMX availability
//! \returns true if MMX is determined to be available, false otherwise
//! \details MMX, SSE and SSE2 are core processor features for x86_64, and
//! the function always returns true for the platform.
inline bool HasMMX()
{
#if CRYPTOPP_BOOL_X64
return true;
#else
if (!g_x86DetectionDone)
DetectX86Features();
return g_hasMMX;
#endif
}
//! \brief Determines SSE availability
//! \returns true if SSE is determined to be available, false otherwise
//! \details MMX, SSE and SSE2 are core processor features for x86_64, and
//! the function always returns true for the platform.
inline bool HasISSE()
{
#if CRYPTOPP_BOOL_X64
return true;
#else
if (!g_x86DetectionDone)
DetectX86Features();
return g_hasISSE;
#endif
}
//! \brief Determines SSE2 availability
//! \returns true if SSE2 is determined to be available, false otherwise
//! \details MMX, SSE and SSE2 are core processor features for x86_64, and
@@ -135,14 +104,24 @@ inline bool HasSSSE3()
return g_hasSSSE3;
}
//! \brief Determines SSE4 availability
//! \returns true if SSE4.1 and SSE4.2 are determined to be available, false otherwise
//! \details HasSSE4() is a runtime check performed using CPUID which requires both SSE4.1 and SSE4.2
inline bool HasSSE4()
//! \brief Determines SSE4.1 availability
//! \returns true if SSE4.1 is determined to be available, false otherwise
//! \details HasSSE41() is a runtime check performed using CPUID
inline bool HasSSE41()
{
if (!g_x86DetectionDone)
DetectX86Features();
return g_hasSSE4;
return g_hasSSE41;
}
//! \brief Determines SSE4.2 availability
//! \returns true if SSE4.2 is determined to be available, false otherwise
//! \details HasSSE42() is a runtime check performed using CPUID
inline bool HasSSE42()
{
if (!g_x86DetectionDone)
DetectX86Features();
return g_hasSSE42;
}
//! \brief Determines AES-NI availability

crc.cpp

@@ -298,7 +298,7 @@ CRC32C::CRC32C()
void CRC32C::Update(const byte *s, size_t n)
{
#if (CRYPTOPP_SSE42_AVAILABLE)
if (HasSSE4())
if (HasSSE42())
{
CRC32C_Update_SSE42(s, n, m_crc);
return;
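
CRC32C_Update_SSE42 itself is not shown in this diff. As a rough illustration of what the SSE4.2 path provides, here is a standalone CRC-32C over a buffer using the CRC32 instruction from <nmmintrin.h> (compile with -msse4.2); this is not the library's exact state handling, which keeps the CRC pre-inverted between calls:

#include <nmmintrin.h>
#include <cstddef>

unsigned int Crc32C(const unsigned char *s, std::size_t n)
{
    unsigned int crc = 0xFFFFFFFFu;        // standard CRC-32C initial value
    while (n--)
        crc = _mm_crc32_u8(crc, *s++);     // hardware Castagnoli polynomial
    return ~crc;                           // final inversion
}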

rijndael-simd.cpp

@@ -15,7 +15,7 @@
# undef CRYPTOPP_ARM_AES_AVAILABLE
#endif
#if (CRYPTOPP_SSE42_AVAILABLE)
#if (CRYPTOPP_SSE41_AVAILABLE)
# include "nmmintrin.h"
#endif
@@ -37,6 +37,13 @@
# define EXCEPTION_EXECUTE_HANDLER 1
#endif
// Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
#if (__SUNPRO_CC >= 0x5130)
# define MAYBE_CONST
#else
# define MAYBE_CONST const
#endif
NAMESPACE_BEGIN(CryptoPP)
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
@@ -110,6 +117,199 @@ bool CPU_TryAES_ARMV8()
#endif // CRYPTOPP_ARM_AES_AVAILABLE
#if (CRYPTOPP_AESNI_AVAILABLE)
void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
block = _mm_xor_si128(block, subkeys[0]);
for (unsigned int i=1; i<rounds-1; i+=2)
{
block = _mm_aesenc_si128(block, subkeys[i]);
block = _mm_aesenc_si128(block, subkeys[i+1]);
}
block = _mm_aesenc_si128(block, subkeys[rounds-1]);
block = _mm_aesenclast_si128(block, subkeys[rounds]);
}
inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
__m128i rk = subkeys[0];
block0 = _mm_xor_si128(block0, rk);
block1 = _mm_xor_si128(block1, rk);
block2 = _mm_xor_si128(block2, rk);
block3 = _mm_xor_si128(block3, rk);
for (unsigned int i=1; i<rounds; i++)
{
rk = subkeys[i];
block0 = _mm_aesenc_si128(block0, rk);
block1 = _mm_aesenc_si128(block1, rk);
block2 = _mm_aesenc_si128(block2, rk);
block3 = _mm_aesenc_si128(block3, rk);
}
rk = subkeys[rounds];
block0 = _mm_aesenclast_si128(block0, rk);
block1 = _mm_aesenclast_si128(block1, rk);
block2 = _mm_aesenclast_si128(block2, rk);
block3 = _mm_aesenclast_si128(block3, rk);
}
void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
block = _mm_xor_si128(block, subkeys[0]);
for (unsigned int i=1; i<rounds-1; i+=2)
{
block = _mm_aesdec_si128(block, subkeys[i]);
block = _mm_aesdec_si128(block, subkeys[i+1]);
}
block = _mm_aesdec_si128(block, subkeys[rounds-1]);
block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}
void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
__m128i rk = subkeys[0];
block0 = _mm_xor_si128(block0, rk);
block1 = _mm_xor_si128(block1, rk);
block2 = _mm_xor_si128(block2, rk);
block3 = _mm_xor_si128(block3, rk);
for (unsigned int i=1; i<rounds; i++)
{
rk = subkeys[i];
block0 = _mm_aesdec_si128(block0, rk);
block1 = _mm_aesdec_si128(block1, rk);
block2 = _mm_aesdec_si128(block2, rk);
block3 = _mm_aesdec_si128(block3, rk);
}
rk = subkeys[rounds];
block0 = _mm_aesdeclast_si128(block0, rk);
block1 = _mm_aesdeclast_si128(block1, rk);
block2 = _mm_aesdeclast_si128(block2, rk);
block3 = _mm_aesdeclast_si128(block3, rk);
}
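// s_one below is a big-endian 128-bit "1": only lane 3 is non-zero, and
// 1<<24 stored little-endian sets byte 15, so the _mm_add_epi32 in the
// counter path below increments the low 32 bits of a big-endian counter
// block (the single-block path increments inBlocks[15] directly).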
CRYPTOPP_ALIGN_DATA(16)
static const word32 s_one[] = {0, 0, 0, 1<<24};
template <typename F1, typename F4>
inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4, MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
size_t blockSize = 16;
size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
size_t xorIncrement = xorBlocks ? blockSize : 0;
size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
if (flags & BlockTransformation::BT_ReverseDirection)
{
CRYPTOPP_ASSERT(length % blockSize == 0);
inBlocks += length - blockSize;
xorBlocks += length - blockSize;
outBlocks += length - blockSize;
inIncrement = 0-inIncrement;
xorIncrement = 0-xorIncrement;
outIncrement = 0-outIncrement;
}
if (flags & BlockTransformation::BT_AllowParallel)
{
while (length >= 4*blockSize)
{
__m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
const __m128i be1 = *(const __m128i *)(const void *)s_one;
block1 = _mm_add_epi32(block0, be1);
block2 = _mm_add_epi32(block1, be1);
block3 = _mm_add_epi32(block2, be1);
_mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
}
else
{
inBlocks += inIncrement;
block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
inBlocks += inIncrement;
block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
inBlocks += inIncrement;
block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
inBlocks += inIncrement;
}
if (flags & BlockTransformation::BT_XorInput)
{
// Coverity finding, appears to be false positive. Assert the condition.
CRYPTOPP_ASSERT(xorBlocks);
block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
}
func4(block0, block1, block2, block3, subkeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
}
_mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
outBlocks += outIncrement;
_mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
outBlocks += outIncrement;
_mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
outBlocks += outIncrement;
_mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
outBlocks += outIncrement;
length -= 4*blockSize;
}
}
while (length >= blockSize)
{
__m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
if (flags & BlockTransformation::BT_XorInput)
block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter)
const_cast<byte *>(inBlocks)[15]++;
func1(block, subkeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
_mm_storeu_si128((__m128i *)(void *)outBlocks, block);
inBlocks += inIncrement;
outBlocks += outIncrement;
xorBlocks += xorIncrement;
length -= blockSize;
}
return length;
}
size_t Rijndael_AdvancedProcessBlocks_Enc_AESNI(MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return Rijndael_AdvancedProcessBlocks_AESNI(AESNI_Enc_Block, AESNI_Enc_4_Blocks,
subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t Rijndael_AdvancedProcessBlocks_Dec_AESNI(MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return Rijndael_AdvancedProcessBlocks_AESNI(AESNI_Dec_Block, AESNI_Dec_4_Blocks,
subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk)
{
const unsigned rounds = keyLen/4 + 6;
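
Rijndael_UncheckedSetKey_SSE4_AESNI is truncated here. For orientation only (this is not the committed routine, which handles 128/192/256-bit keys and relies on SSE4.1 extracts), a self-contained AES-128 key expansion with AESKEYGENASSIST plus a one-block encrypt using the same enc/enclast sequence as AESNI_Enc_Block above (compile with -maes):

#include <emmintrin.h>
#include <wmmintrin.h>   // AES-NI intrinsics

// One AES-128 key-schedule step; 'assist' is _mm_aeskeygenassist_si128(key, rcon).
static inline __m128i ExpandStep(__m128i key, __m128i assist)
{
    assist = _mm_shuffle_epi32(assist, _MM_SHUFFLE(3,3,3,3)); // broadcast RotWord(SubWord(w3)) ^ rcon
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, assist);
}

// Expand a 16-byte key into the 11 round keys expected by the AESNI_Enc_* routines.
static void ExpandKey128(const unsigned char *userKey, __m128i rk[11])
{
    rk[0]  = _mm_loadu_si128((const __m128i *)(const void *)userKey);
    rk[1]  = ExpandStep(rk[0], _mm_aeskeygenassist_si128(rk[0], 0x01));
    rk[2]  = ExpandStep(rk[1], _mm_aeskeygenassist_si128(rk[1], 0x02));
    rk[3]  = ExpandStep(rk[2], _mm_aeskeygenassist_si128(rk[2], 0x04));
    rk[4]  = ExpandStep(rk[3], _mm_aeskeygenassist_si128(rk[3], 0x08));
    rk[5]  = ExpandStep(rk[4], _mm_aeskeygenassist_si128(rk[4], 0x10));
    rk[6]  = ExpandStep(rk[5], _mm_aeskeygenassist_si128(rk[5], 0x20));
    rk[7]  = ExpandStep(rk[6], _mm_aeskeygenassist_si128(rk[6], 0x40));
    rk[8]  = ExpandStep(rk[7], _mm_aeskeygenassist_si128(rk[7], 0x80));
    rk[9]  = ExpandStep(rk[8], _mm_aeskeygenassist_si128(rk[8], 0x1B));
    rk[10] = ExpandStep(rk[9], _mm_aeskeygenassist_si128(rk[9], 0x36));
}

// Encrypt one 16-byte block: whiten, nine full rounds, one final round.
static void EncryptBlock128(const __m128i rk[11], const unsigned char *in, unsigned char *out)
{
    __m128i block = _mm_loadu_si128((const __m128i *)(const void *)in);
    block = _mm_xor_si128(block, rk[0]);
    for (int i = 1; i < 10; ++i)
        block = _mm_aesenc_si128(block, rk[i]);
    block = _mm_aesenclast_si128(block, rk[10]);
    _mm_storeu_si128((__m128i *)(void *)out, block);
}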

rijndael.cpp

@@ -228,6 +228,11 @@ void Rijndael::Base::FillDecTable()
#if (CRYPTOPP_AESNI_AVAILABLE)
extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
extern void Rijndael_UncheckedSetKeyRev_SSE4_AESNI(word32 *key, unsigned int rounds);
extern size_t Rijndael_AdvancedProcessBlocks_Enc_AESNI(MAYBE_CONST __m128i *subkeys, unsigned int rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t Rijndael_AdvancedProcessBlocks_Dec_AESNI(MAYBE_CONST __m128i *subkeys, unsigned int rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &)
@@ -239,10 +244,12 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
word32 *rk = m_key;
#if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE42_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
#if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
// MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
if (HasAESNI() && HasSSE4())
if (HasAESNI() && HasSSE41())
{
// TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
// Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
if (!IsForwardTransformation())
Rijndael_UncheckedSetKeyRev_SSE4_AESNI(m_key, m_rounds);
@@ -336,7 +343,8 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
if (HasAESNI())
#endif
{
return (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
(void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
return;
}
#endif
@@ -1024,190 +1032,6 @@ static inline bool AliasedWithTable(const byte *begin, const byte *end)
return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}
#if CRYPTOPP_AESNI_AVAILABLE
inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
block = _mm_xor_si128(block, subkeys[0]);
for (unsigned int i=1; i<rounds-1; i+=2)
{
block = _mm_aesenc_si128(block, subkeys[i]);
block = _mm_aesenc_si128(block, subkeys[i+1]);
}
block = _mm_aesenc_si128(block, subkeys[rounds-1]);
block = _mm_aesenclast_si128(block, subkeys[rounds]);
}
inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
__m128i rk = subkeys[0];
block0 = _mm_xor_si128(block0, rk);
block1 = _mm_xor_si128(block1, rk);
block2 = _mm_xor_si128(block2, rk);
block3 = _mm_xor_si128(block3, rk);
for (unsigned int i=1; i<rounds; i++)
{
rk = subkeys[i];
block0 = _mm_aesenc_si128(block0, rk);
block1 = _mm_aesenc_si128(block1, rk);
block2 = _mm_aesenc_si128(block2, rk);
block3 = _mm_aesenc_si128(block3, rk);
}
rk = subkeys[rounds];
block0 = _mm_aesenclast_si128(block0, rk);
block1 = _mm_aesenclast_si128(block1, rk);
block2 = _mm_aesenclast_si128(block2, rk);
block3 = _mm_aesenclast_si128(block3, rk);
}
inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
block = _mm_xor_si128(block, subkeys[0]);
for (unsigned int i=1; i<rounds-1; i+=2)
{
block = _mm_aesdec_si128(block, subkeys[i]);
block = _mm_aesdec_si128(block, subkeys[i+1]);
}
block = _mm_aesdec_si128(block, subkeys[rounds-1]);
block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}
inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
__m128i rk = subkeys[0];
block0 = _mm_xor_si128(block0, rk);
block1 = _mm_xor_si128(block1, rk);
block2 = _mm_xor_si128(block2, rk);
block3 = _mm_xor_si128(block3, rk);
for (unsigned int i=1; i<rounds; i++)
{
rk = subkeys[i];
block0 = _mm_aesdec_si128(block0, rk);
block1 = _mm_aesdec_si128(block1, rk);
block2 = _mm_aesdec_si128(block2, rk);
block3 = _mm_aesdec_si128(block3, rk);
}
rk = subkeys[rounds];
block0 = _mm_aesdeclast_si128(block0, rk);
block1 = _mm_aesdeclast_si128(block1, rk);
block2 = _mm_aesdeclast_si128(block2, rk);
block3 = _mm_aesdeclast_si128(block3, rk);
}
CRYPTOPP_ALIGN_DATA(16)
static const word32 s_one[] = {0, 0, 0, 1<<24};
template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
size_t blockSize = 16;
size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
size_t xorIncrement = xorBlocks ? blockSize : 0;
size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
if (flags & BlockTransformation::BT_ReverseDirection)
{
CRYPTOPP_ASSERT(length % blockSize == 0);
inBlocks += length - blockSize;
xorBlocks += length - blockSize;
outBlocks += length - blockSize;
inIncrement = 0-inIncrement;
xorIncrement = 0-xorIncrement;
outIncrement = 0-outIncrement;
}
if (flags & BlockTransformation::BT_AllowParallel)
{
while (length >= 4*blockSize)
{
__m128i block0 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks), block1, block2, block3;
if (flags & BlockTransformation::BT_InBlockIsCounter)
{
const __m128i be1 = *(const __m128i *)(const void *)s_one;
block1 = _mm_add_epi32(block0, be1);
block2 = _mm_add_epi32(block1, be1);
block3 = _mm_add_epi32(block2, be1);
_mm_storeu_si128((__m128i *)(void *)inBlocks, _mm_add_epi32(block3, be1));
}
else
{
inBlocks += inIncrement;
block1 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
inBlocks += inIncrement;
block2 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
inBlocks += inIncrement;
block3 = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
inBlocks += inIncrement;
}
if (flags & BlockTransformation::BT_XorInput)
{
// Coverity finding, appears to be false positive. Assert the condition.
CRYPTOPP_ASSERT(xorBlocks);
block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
}
func4(block0, block1, block2, block3, subkeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
xorBlocks += xorIncrement;
}
_mm_storeu_si128((__m128i *)(void *)outBlocks, block0);
outBlocks += outIncrement;
_mm_storeu_si128((__m128i *)(void *)outBlocks, block1);
outBlocks += outIncrement;
_mm_storeu_si128((__m128i *)(void *)outBlocks, block2);
outBlocks += outIncrement;
_mm_storeu_si128((__m128i *)(void *)outBlocks, block3);
outBlocks += outIncrement;
length -= 4*blockSize;
}
}
while (length >= blockSize)
{
__m128i block = _mm_loadu_si128((const __m128i *)(const void *)inBlocks);
if (flags & BlockTransformation::BT_XorInput)
block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
if (flags & BlockTransformation::BT_InBlockIsCounter)
const_cast<byte *>(inBlocks)[15]++;
func1(block, subkeys, rounds);
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)(const void *)xorBlocks));
_mm_storeu_si128((__m128i *)(void *)outBlocks, block);
inBlocks += inIncrement;
outBlocks += outIncrement;
xorBlocks += xorIncrement;
length -= blockSize;
}
return length;
}
#endif
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
struct Locals
{
@@ -1229,7 +1053,9 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
{
#if CRYPTOPP_AESNI_AVAILABLE
if (HasAESNI())
return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
return Rijndael_AdvancedProcessBlocks_Enc_AESNI((MAYBE_CONST __m128i *)(const void *)m_key.begin(),
m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
@@ -1291,7 +1117,8 @@ size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
{
#if CRYPTOPP_AESNI_AVAILABLE
if (HasAESNI())
return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
return Rijndael_AdvancedProcessBlocks_Dec_AESNI((MAYBE_CONST __m128i *)(const void *)m_key.begin(),
m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
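
A short usage sketch (standard Crypto++ API, not part of this commit): CTR mode drives the cipher through AdvancedProcessBlocks with the counter and parallel flags, so requests of four or more blocks can take the 4-block AES-NI path above on CPUs where HasAESNI() holds.

#include "aes.h"
#include "modes.h"
#include "osrng.h"

int main()
{
    using namespace CryptoPP;

    AutoSeededRandomPool prng;
    SecByteBlock key(AES::DEFAULT_KEYLENGTH), iv(AES::BLOCKSIZE);
    prng.GenerateBlock(key, key.size());
    prng.GenerateBlock(iv, iv.size());

    unsigned char plain[64] = {0}, cipher[64];
    CTR_Mode<AES>::Encryption enc;
    enc.SetKeyWithIV(key, key.size(), iv);
    enc.ProcessData(cipher, plain, sizeof(plain));   // four blocks in one call
    return 0;
}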

validat1.cpp

@@ -339,15 +339,14 @@ bool TestSettings()
std::cout << std::endl;
#ifdef CRYPTOPP_CPUID_AVAILABLE
bool hasMMX = HasMMX();
bool hasISSE = HasISSE();
bool hasSSE2 = HasSSE2();
bool hasSSSE3 = HasSSSE3();
bool hasSSE4 = HasSSE4();
bool hasSSE41 = HasSSE41();
bool hasSSE42 = HasSSE42();
bool isP4 = IsP4();
int cacheLineSize = GetCacheLineSize();
if ((isP4 && (!hasMMX || !hasSSE2)) || (hasSSE2 && !hasMMX) || (cacheLineSize < 16 || cacheLineSize > 256 || !IsPowerOf2(cacheLineSize)))
if (cacheLineSize < 16 || cacheLineSize > 256 || !IsPowerOf2(cacheLineSize))
{
std::cout << "FAILED: ";
pass = false;
@@ -355,7 +354,7 @@ bool TestSettings()
else
std::cout << "passed: ";
std::cout << "hasMMX == " << hasMMX << ", hasISSE == " << hasISSE << ", hasSSE2 == " << hasSSE2 << ", hasSSSE3 == " << hasSSSE3 << ", hasSSE4 == " << hasSSE4;
std::cout << "hasSSE2 == " << hasSSE2 << ", hasSSSE3 == " << hasSSSE3 << ", hasSSE4.1 == " << hasSSE41 << ", hasSSE4.2 == " << hasSSE42;
std::cout << ", hasAESNI == " << HasAESNI() << ", hasCLMUL == " << HasCLMUL() << ", hasRDRAND == " << HasRDRAND() << ", hasRDSEED == " << HasRDSEED();
std::cout << ", hasSHA == " << HasSHA() << ", isP4 == " << isP4 << ", cacheLineSize == " << cacheLineSize << std::endl;

whrlpool.cpp

@@ -409,7 +409,7 @@ static const word64 Whirlpool_C[4*256+R] = {
void Whirlpool::Transform(word64 *digest, const word64 *block)
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasISSE())
if (HasSSE2())
{
// MMX version has the same structure as C version below
#ifdef __GNUC__