diff --git a/Readme.txt b/Readme.txt index 178c5f7a..ff995a6f 100644 --- a/Readme.txt +++ b/Readme.txt @@ -414,19 +414,30 @@ the mailing list. - ported to MSVC 2008, GCC 4.2, Sun CC 5.9, Intel C++ Compiler 10.0, and Borland C++Builder 2007 -5.6 - added AuthenticatedSymmetricCipher interface class and Filter wrappers - - added CCM, GCM (with SSE2 assembly), EAX, CMAC, XSalsa20, and SEED - - added support for variable length IVs - - added OIDs for Brainpool elliptic curve parameters - - improved AES and SHA-256 speed on x86 and x64 - - fixed incorrect VMAC computation on message lengths - that are >64 mod 128 (x86 assembly version is not affected) - - fixed compiler error in vmac.cpp on x86 with GCC -fPIC - - fixed run-time validation error on x86-64 with GCC 4.3.2 -O2 - - fixed HashFilter bug when putMessage=true - - removed WORD64_AVAILABLE; compiler support for 64-bit int is now required - - ported to GCC 4.3, C++Builder 2009, Sun CC 5.10, Intel C++ Compiler 11 - -5.6.1 - switched to a public domain implementation of MARS +5.6.0 - added AuthenticatedSymmetricCipher interface class and Filter wrappers + - added CCM, GCM (with SSE2 assembly), EAX, CMAC, XSalsa20, and SEED + - added support for variable length IVs + - added OIDs for Brainpool elliptic curve parameters + - improved AES and SHA-256 speed on x86 and x64 + - changed BlockTransformation interface to no longer assume data alignment + - fixed incorrect VMAC computation on message lengths + that are >64 mod 128 (x86 assembly version is not affected) + - fixed compiler error in vmac.cpp on x86 with GCC -fPIC + - fixed run-time validation error on x86-64 with GCC 4.3.2 -O2 + - fixed HashFilter bug when putMessage=true + - fixed AES-CTR data alignment bug that causes incorrect encryption on ARM + - removed WORD64_AVAILABLE; compiler support for 64-bit int is now required + - ported to GCC 4.3, C++Builder 2009, Sun CC 5.10, Intel C++ Compiler 11 + +5.6.1 - added support for AES-NI and CLMUL instruction sets in 
AES and GMAC/GCM + - removed WAKE-CFB + - fixed several bugs in the SHA-256 x86/x64 assembly code: + * incorrect hash on non-SSE2 x86 machines on non-aligned input + * incorrect hash on x86 machines when input crosses 0x80000000 + * incorrect hash on x64 when compiled with GCC with optimizations enabled + - fixed bugs in AES x86 and x64 assembly causing crashes in some MSVC build configurations + - switched to a public domain implementation of MARS + - ported to MSVC 2010, Sun Studio 12u1 + - renamed the MSVC DLL project to "cryptopp" for compatibility with MSVC 2010 Written by Wei Dai diff --git a/bench.cpp b/bench.cpp index cee316cd..8521a5a3 100644 --- a/bench.cpp +++ b/bench.cpp @@ -10,6 +10,7 @@ #include "hex.h" #include "modes.h" #include "factory.h" +#include "cpu.h" #include #include @@ -242,14 +243,24 @@ void BenchmarkAll(double t, double hertz) cout << "AlgorithmMiB/Second" << cpb << "Microseconds to
Setup Key and IV" << cpk << endl; cout << "\n"; - BenchMarkByName2("AES/GCM", 0, "AES/GCM (2K tables)", MakeParameters(Name::TableSize(), 2048)); - BenchMarkByName2("AES/GCM", 0, "AES/GCM (64K tables)", MakeParameters(Name::TableSize(), 64*1024)); + if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && HasCLMUL()) + BenchMarkByName2("AES/GCM", 0, "AES/GCM"); + else + { + BenchMarkByName2("AES/GCM", 0, "AES/GCM (2K tables)", MakeParameters(Name::TableSize(), 2048)); + BenchMarkByName2("AES/GCM", 0, "AES/GCM (64K tables)", MakeParameters(Name::TableSize(), 64*1024)); + } BenchMarkByName2("AES/CCM"); BenchMarkByName2("AES/EAX"); cout << "\n"; - BenchMarkByName2("AES/GCM", 0, "GMAC(AES) (2K tables)", MakeParameters(Name::TableSize(), 2048)); - BenchMarkByName2("AES/GCM", 0, "GMAC(AES) (64K tables)", MakeParameters(Name::TableSize(), 64*1024)); + if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && HasCLMUL()) + BenchMarkByName2("AES/GCM", 0, "GMAC(AES)"); + else + { + BenchMarkByName2("AES/GCM", 0, "GMAC(AES) (2K tables)", MakeParameters(Name::TableSize(), 2048)); + BenchMarkByName2("AES/GCM", 0, "GMAC(AES) (64K tables)", MakeParameters(Name::TableSize(), 64*1024)); + } BenchMarkByName("VMAC(AES)-64"); BenchMarkByName("VMAC(AES)-128"); BenchMarkByName("HMAC(SHA-1)"); diff --git a/config.h b/config.h index f53cbce9..3a038446 100644 --- a/config.h +++ b/config.h @@ -257,6 +257,7 @@ NAMESPACE_END #endif #if !defined(CRYPTOPP_DISABLE_ASM) && ((defined(_MSC_VER) && defined(_M_IX86)) || (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)))) + // C++Builder 2010 does not allow "call label" where label is defined within inline assembly #define CRYPTOPP_X86_ASM_AVAILABLE #if !defined(CRYPTOPP_DISABLE_SSE2) && (defined(CRYPTOPP_MSVC6PP_OR_LATER) || CRYPTOPP_GCC_VERSION >= 30300) @@ -288,6 +289,12 @@ NAMESPACE_END #define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0 #endif +#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && (CRYPTOPP_GCC_VERSION >= 40400 || _MSC_FULL_VER >= 
150030729 || __INTEL_COMPILER >= 1110) /* test the value, not defined(): the SSE2 macro is always defined, as 0 or 1 */ + #define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 1 +#else + #define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0 +#endif + #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) #define CRYPTOPP_BOOL_ALIGN16_ENABLED 1 #else diff --git a/cpu.cpp b/cpu.cpp index 11c27b8d..c1a1d956 100755 --- a/cpu.cpp +++ b/cpu.cpp @@ -8,7 +8,7 @@ #include "misc.h" #include -#ifdef __GNUC__ +#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY #include #include #endif @@ -19,9 +19,19 @@ NAMESPACE_BEGIN(CryptoPP) -#ifdef CRYPTOPP_X86_ASM_AVAILABLE +#ifdef CRYPTOPP_CPUID_AVAILABLE -#ifndef _MSC_VER +#if _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64 + +bool CpuId(word32 input, word32 *output) +{ + __cpuid((int *)output, input); + return true; +} + +#else + +#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY typedef void (*SigHandler)(int); static jmp_buf s_jmpNoCPUID; @@ -29,11 +39,17 @@ static void SigIllHandlerCPUID(int) { longjmp(s_jmpNoCPUID, 1); } + +static jmp_buf s_jmpNoSSE2; +static void SigIllHandlerSSE2(int) +{ + longjmp(s_jmpNoSSE2, 1); +} #endif bool CpuId(word32 input, word32 *output) { -#ifdef _MSC_VER +#ifdef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY __try { __asm @@ -80,31 +96,13 @@ bool CpuId(word32 input, word32 *output) #endif } -#ifndef _MSC_VER -static jmp_buf s_jmpNoSSE2; -static void SigIllHandlerSSE2(int) -{ - longjmp(s_jmpNoSSE2, 1); -} #endif -#elif _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64 - -bool CpuId(word32 input, word32 *output) -{ - __cpuid((int *)output, input); - return true; -} - -#endif - -#ifdef CRYPTOPP_CPUID_AVAILABLE - static bool TrySSE2() { #if CRYPTOPP_BOOL_X64 return true; -#elif defined(_MSC_VER) +#elif defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) __try { #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE @@ -119,7 +117,7 @@ static bool TrySSE2() return false; } return true; -#elif defined(__GNUC__) +#else SigHandler oldHandler = signal(SIGILL, SigIllHandlerSSE2); if (oldHandler == SIG_ERR) 
return false; @@ -139,8 +137,6 @@ static bool TrySSE2() signal(SIGILL, oldHandler); return result; -#else - return false; #endif } @@ -160,8 +156,8 @@ void DetectX86Features() if ((cpuid1[3] & (1 << 26)) != 0) g_hasSSE2 = TrySSE2(); g_hasSSSE3 = g_hasSSE2 && (cpuid1[2] & (1<<9)); - g_hasAESNI = (cpuid1[2] & (1<<25)) != 0; - g_hasCLMUL = (cpuid1[2] & (1<<1)) != 0; + g_hasAESNI = g_hasSSE2 && (cpuid1[2] & (1<<25)); + g_hasCLMUL = g_hasSSE2 && (cpuid1[2] & (1<<1)); if ((cpuid1[3] & (1 << 25)) != 0) g_hasISSE = true; diff --git a/cpu.h b/cpu.h index 79e5ea86..9a6ee223 100755 --- a/cpu.h +++ b/cpu.h @@ -18,22 +18,18 @@ NAMESPACE_BEGIN(CryptoPP) -#if defined(CRYPTOPP_X86_ASM_AVAILABLE) || (_MSC_VER >= 1400 && CRYPTOPP_BOOL_X64) +#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64 #define CRYPTOPP_CPUID_AVAILABLE // these should not be used directly extern CRYPTOPP_DLL bool g_x86DetectionDone; -extern CRYPTOPP_DLL bool g_hasSSE2; -extern CRYPTOPP_DLL bool g_hasISSE; -extern CRYPTOPP_DLL bool g_hasMMX; extern CRYPTOPP_DLL bool g_hasSSSE3; extern CRYPTOPP_DLL bool g_hasAESNI; extern CRYPTOPP_DLL bool g_hasCLMUL; extern CRYPTOPP_DLL bool g_isP4; extern CRYPTOPP_DLL word32 g_cacheLineSize; CRYPTOPP_DLL void CRYPTOPP_API DetectX86Features(); - CRYPTOPP_DLL bool CRYPTOPP_API CpuId(word32 input, word32 *output); #if CRYPTOPP_BOOL_X64 @@ -42,6 +38,10 @@ inline bool HasISSE() {return true;} inline bool HasMMX() {return true;} #else +extern CRYPTOPP_DLL bool g_hasSSE2; +extern CRYPTOPP_DLL bool g_hasISSE; +extern CRYPTOPP_DLL bool g_hasMMX; + inline bool HasSSE2() { if (!g_x86DetectionDone) @@ -107,22 +107,8 @@ inline int GetCacheLineSize() return CRYPTOPP_L1_CACHE_LINE_SIZE; } -inline bool HasSSSE3() {return false;} -inline bool IsP4() {return false;} - -// assume MMX and SSE2 if intrinsics are enabled -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_X64 -inline bool HasSSE2() {return true;} -inline bool HasISSE() {return true;} -inline bool HasMMX() {return true;} -#else 
-inline bool HasSSE2() {return false;} -inline bool HasISSE() {return false;} -inline bool HasMMX() {return false;} #endif -#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE || _MSC_VER >= 1400 - #endif #ifdef CRYPTOPP_GENERATE_X64_MASM @@ -134,7 +120,19 @@ inline bool HasMMX() {return false;} #define ASJ(x, y, z) x label##y*newline* #define ASC(x, y) x label##y*newline* #define AS_HEX(y) 0##y##h -#elif defined(__GNUC__) +#elif defined(_MSC_VER) || defined(__BORLANDC__) + #define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY + #define AS1(x) __asm {x} + #define AS2(x, y) __asm {x, y} + #define AS3(x, y, z) __asm {x, y, z} + #define ASS(x, y, a, b, c, d) __asm {x, y, (a)*64+(b)*16+(c)*4+(d)} + #define ASL(x) __asm {label##x:} + #define ASJ(x, y, z) __asm {x label##y} + #define ASC(x, y) __asm {x label##y} + #define CRYPTOPP_NAKED __declspec(naked) + #define AS_HEX(y) 0x##y +#else + #define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY // define these in two steps to allow arguments to be expanded #define GNU_AS1(x) #x ";" #define GNU_AS2(x, y) #x ", " #y ";" @@ -150,16 +148,6 @@ inline bool HasMMX() {return false;} #define ASC(x, y) #x " " #y ";" #define CRYPTOPP_NAKED #define AS_HEX(y) 0x##y -#else - #define AS1(x) __asm {x} - #define AS2(x, y) __asm {x, y} - #define AS3(x, y, z) __asm {x, y, z} - #define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)} - #define ASL(x) __asm {label##x:} - #define ASJ(x, y, z) __asm {x label##y} - #define ASC(x, y) __asm {x label##y} - #define CRYPTOPP_NAKED __declspec(naked) - #define AS_HEX(y) 0x##y #endif #define IF0(y) diff --git a/cryptlib.h b/cryptlib.h index fbd76311..c6f4c423 100644 --- a/cryptlib.h +++ b/cryptlib.h @@ -456,7 +456,7 @@ public: //! 
return number of blocks that can be processed in parallel, for bit-slicing implementations virtual unsigned int OptimalNumberOfParallelBlocks() const {return 1;} - enum {BT_InBlockIsCounter=1, BT_DontIncrementInOutPointers=2, BT_XorInput=4, BT_ReverseDirection=8} FlagsForAdvancedProcessBlocks; + enum {BT_InBlockIsCounter=1, BT_DontIncrementInOutPointers=2, BT_XorInput=4, BT_ReverseDirection=8, BT_AllowParallel=16} FlagsForAdvancedProcessBlocks; //! encrypt and xor blocks according to flags (see FlagsForAdvancedProcessBlocks) /*! /note If BT_InBlockIsCounter is set, last byte of inBlocks may be modified. */ diff --git a/datatest.cpp b/datatest.cpp index cfac74a0..121e0c55 100644 --- a/datatest.cpp +++ b/datatest.cpp @@ -57,15 +57,15 @@ const std::string & GetRequiredDatum(const TestData &data, const char *name) return i->second; } -void RandomizedTransfer(BufferedTransformation &source, BufferedTransformation &target, bool finish) +void RandomizedTransfer(BufferedTransformation &source, BufferedTransformation &target, bool finish, const std::string &channel=DEFAULT_CHANNEL) { while (source.MaxRetrievable() > (finish ? 
0 : 4096)) { byte buf[4096+64]; - word32 start = GlobalRNG().GenerateWord32(0, 63); - word32 len = GlobalRNG().GenerateWord32(1, UnsignedMin(4096U, source.MaxRetrievable())); - source.Get(buf+start, len); - target.Put(buf+start, len); + size_t start = GlobalRNG().GenerateWord32(0, 63); + size_t len = GlobalRNG().GenerateWord32(1, UnsignedMin(4096U, 3*source.MaxRetrievable()/2)); + len = source.Get(buf+start, len); + target.ChannelPut(channel, buf+start, len); } } @@ -397,9 +397,9 @@ void TestSymmetricCipher(TestData &v, const NameValuePairs &overrideParameters) return; } - StringSource ss(plaintext, false, new StreamTransformationFilter(*encryptor, new StringSink(encrypted), StreamTransformationFilter::NO_PADDING)); - ss.Pump(plaintext.size()/2 + 1); - ss.PumpAll(); + StreamTransformationFilter encFilter(*encryptor, new StringSink(encrypted), StreamTransformationFilter::NO_PADDING); + RandomizedTransfer(StringStore(plaintext).Ref(), encFilter, true); + encFilter.MessageEnd(); /*{ std::string z; encryptor->Seek(seek); @@ -422,14 +422,14 @@ void TestSymmetricCipher(TestData &v, const NameValuePairs &overrideParameters) { std::cout << "incorrectly encrypted: "; StringSource xx(encrypted, false, new HexEncoder(new FileSink(std::cout))); - xx.Pump(256); xx.Flush(false); + xx.Pump(2048); xx.Flush(false); std::cout << "\n"; SignalTestFailure(); } std::string decrypted; - StringSource dd(encrypted, false, new StreamTransformationFilter(*decryptor, new StringSink(decrypted), StreamTransformationFilter::NO_PADDING)); - dd.Pump(plaintext.size()/2 + 1); - dd.PumpAll(); + StreamTransformationFilter decFilter(*decryptor, new StringSink(decrypted), StreamTransformationFilter::NO_PADDING); + RandomizedTransfer(StringStore(encrypted).Ref(), decFilter, true); + decFilter.MessageEnd(); if (decrypted != plaintext) { std::cout << "incorrectly decrypted: "; @@ -484,27 +484,24 @@ void TestAuthenticatedSymmetricCipher(TestData &v, const NameValuePairs &overrid StringStore sh(header), 
sp(plaintext), sc(ciphertext), sf(footer), sm(mac); if (macAtBegin) - sm.TransferTo(df); + RandomizedTransfer(sm, df, true); sh.CopyTo(df, LWORD_MAX, AAD_CHANNEL); - sc.TransferTo(df); + RandomizedTransfer(sc, df, true); sf.CopyTo(df, LWORD_MAX, AAD_CHANNEL); if (!macAtBegin) - sm.TransferTo(df); + RandomizedTransfer(sm, df, true); df.MessageEnd(); - sh.TransferTo(ef, sh.MaxRetrievable()/2+1, AAD_CHANNEL); - sh.TransferTo(ef, LWORD_MAX, AAD_CHANNEL); - sp.TransferTo(ef, sp.MaxRetrievable()/2+1); - sp.TransferTo(ef); - sf.TransferTo(ef, sf.MaxRetrievable()/2+1, AAD_CHANNEL); - sf.TransferTo(ef, LWORD_MAX, AAD_CHANNEL); + RandomizedTransfer(sh, ef, true, AAD_CHANNEL); + RandomizedTransfer(sp, ef, true); + RandomizedTransfer(sf, ef, true, AAD_CHANNEL); ef.MessageEnd(); if (test == "Encrypt" && encrypted != ciphertext+mac) { std::cout << "incorrectly encrypted: "; StringSource xx(encrypted, false, new HexEncoder(new FileSink(std::cout))); - xx.Pump(256); xx.Flush(false); + xx.Pump(2048); xx.Flush(false); std::cout << "\n"; SignalTestFailure(); } diff --git a/files.cpp b/files.cpp index 6a29ed55..453b5624 100644 --- a/files.cpp +++ b/files.cpp @@ -95,7 +95,7 @@ size_t FileStore::TransferTo2(BufferedTransformation &target, lword &transferByt m_stream->read((char *)m_space, (unsigned int)STDMIN(size, (lword)spaceSize)); } - m_len = m_stream->gcount(); + m_len = (size_t)m_stream->gcount(); size_t blockedBytes; output: blockedBytes = target.ChannelPutModifiable2(channel, m_space, m_len, 0, blocking); @@ -242,7 +242,7 @@ size_t FileSink::Put2(const byte *inString, size_t length, int messageEnd, bool size = numeric_limits::max(); m_stream->write((const char *)inString, size); inString += size; - length -= size; + length -= (size_t)size; } if (messageEnd) diff --git a/gcm.cpp b/gcm.cpp index 83672270..610db97f 100644 --- a/gcm.cpp +++ b/gcm.cpp @@ -14,6 +14,11 @@ #include #endif +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE +#include +#include +#endif + 
NAMESPACE_BEGIN(CryptoPP) word16 GCM_Base::s_reductionTable[256]; @@ -47,6 +52,21 @@ void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char * } Block::Put(NULL, c)(Z0)(Z1); } + +__m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i) +{ + word64 A[1] = {ByteReverse(((word64*)&a)[i&1])}; + word64 B[1] = {ByteReverse(((word64*)&b)[i>>4])}; + + PolynomialMod2 pa((byte *)A, 8); + PolynomialMod2 pb((byte *)B, 8); + PolynomialMod2 c = pa*pb; + + __m128i output; + for (int i=0; i<16; i++) + ((byte *)&output)[i] = c.GetByte(i); + return output; +} #endif #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE @@ -66,6 +86,56 @@ inline static void Xor16(byte *a, const byte *b, const byte *c) ((word64 *)a)[1] = ((word64 *)b)[1] ^ ((word64 *)c)[1]; } +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE +static CRYPTOPP_ALIGN_DATA(16) const word64 s_clmulConstants64[] = { + 0xe100000000000000, 0xc200000000000000, + 0x08090a0b0c0d0e0f, 0x0001020304050607, + 0x0001020304050607, 0x08090a0b0c0d0e0f}; +static const __m128i *s_clmulConstants = (const __m128i *)s_clmulConstants64; +static const unsigned int s_clmulTableSizeInBlocks = 8; + +inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r) +{ + /* + The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. c0t below refers to the most + significant half of c0 as a polynomial, which, due to GCM's bit reflection, are in the + rightmost bit positions, and the lowest byte addresses. 
+ + c1 ^= c0t * 0xc200000000000000 + c2t ^= c0t + t = shift (c1t ^ c0b) left 1 bit + c2 ^= t * 0xe100000000000000 + c2t ^= c1b + shift c2 left 1 bit and xor in lowest bit of c1t + */ +#if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301 + c2 = _mm_xor_si128(c2, _mm_move_epi64(c0)); +#else + c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8)); +#endif + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10)); + c0 = _mm_srli_si128(c0, 8); + c0 = _mm_xor_si128(c0, c1); + c0 = _mm_slli_epi64(c0, 1); + c0 = _mm_clmulepi64_si128(c0, r, 0); + c2 = _mm_xor_si128(c2, c0); + c2 = _mm_xor_si128(c2, _mm_srli_si128(c1, 8)); + c1 = _mm_unpacklo_epi64(c1, c2); + c1 = _mm_srli_epi64(c1, 63); + c2 = _mm_slli_epi64(c2, 1); + return _mm_xor_si128(c2, c1); +} + +inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r) +{ + __m128i c0 = _mm_clmulepi64_si128(x,h,0); + __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10)); + __m128i c2 = _mm_clmulepi64_si128(x,h,0x11); + + return CLMUL_Reduce(c0, c1, c2, r); +} +#endif + void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs ¶ms) { BlockCipher &blockCipher = AccessBlockCipher(); @@ -74,26 +144,56 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE) throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16"); - int tableSize; - if (params.GetIntValue(Name::TableSize(), tableSize)) - tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024; + int tableSize, i, j, k; + +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE + if (HasCLMUL()) + { + params.GetIntValue(Name::TableSize(), tableSize); // avoid "parameter not used" error + tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE; + } else - tableSize = (GetTablesOption() == GCM_64K_Tables) ? 
64*1024 : 2*1024; +#endif + { + if (params.GetIntValue(Name::TableSize(), tableSize)) + tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024; + else + tableSize = (GetTablesOption() == GCM_64K_Tables) ? 64*1024 : 2*1024; #if defined(_MSC_VER) && (_MSC_VER >= 1300 && _MSC_VER < 1400) - // VC 2003 workaround: compiler generates bad code for 64K tables - tableSize = 2*1024; + // VC 2003 workaround: compiler generates bad code for 64K tables + tableSize = 2*1024; #endif + } m_buffer.resize(3*REQUIRED_BLOCKSIZE + tableSize); + byte *table = MulTable(); byte *hashKey = HashKey(); memset(hashKey, 0, REQUIRED_BLOCKSIZE); blockCipher.ProcessBlock(hashKey); - byte *table = MulTable(); - int i, j, k; - word64 V0, V1; +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE + if (HasCLMUL()) + { + const __m128i r = s_clmulConstants[0]; + __m128i h0 = _mm_shuffle_epi8(_mm_load_si128((__m128i *)hashKey), s_clmulConstants[1]); + __m128i h = h0; + for (i=0; i Block; Block::Get(hashKey)(V0)(V1); @@ -178,6 +278,17 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const } } +inline void GCM_Base::ReverseHashBufferIfNeeded() +{ +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE + if (HasCLMUL()) + { + __m128i &x = *(__m128i *)HashBuffer(); + x = _mm_shuffle_epi8(x, s_clmulConstants[1]); + } +#endif +} + void GCM_Base::Resync(const byte *iv, size_t len) { BlockCipher &cipher = AccessBlockCipher(); @@ -209,6 +320,8 @@ void GCM_Base::Resync(const byte *iv, size_t len) PutBlock(NULL, m_buffer)(0)(origLen*8); GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE); + + ReverseHashBufferIfNeeded(); } if (m_state >= State_IVSet) @@ -241,6 +354,73 @@ void GCM_AuthenticateBlocks_64K(const byte *data, size_t blocks, word64 *hashBuf size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len) { +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE + if (HasCLMUL()) + { + const __m128i *table = (const __m128i *)MulTable(); + __m128i x = _mm_load_si128((__m128i *)HashBuffer()); + const 
__m128i r = s_clmulConstants[0], bswapMask = s_clmulConstants[1], bswapMask2 = s_clmulConstants[2]; + + while (len >= 16) + { + size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0; + __m128i d, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-1)*16)), bswapMask2);; + __m128i c0 = _mm_setzero_si128(); + __m128i c1 = _mm_setzero_si128(); + __m128i c2 = _mm_setzero_si128(); + + while (true) + { + __m128i h0 = _mm_load_si128(table+i); + __m128i h1 = _mm_load_si128(table+i+1); + __m128i h01 = _mm_xor_si128(h0, h1); + + if (++i == s) + { + d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)data), bswapMask); + d = _mm_xor_si128(d, x); + c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0)); + c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1)); + d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2))); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0)); + break; + } + + d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-i)*16-8)), bswapMask2); + c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1)); + c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1)); + d2 = _mm_xor_si128(d2, d); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h01, 1)); + + if (++i == s) + { + d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)data), bswapMask); + d = _mm_xor_si128(d, x); + c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10)); + c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 0x11)); + d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2))); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10)); + break; + } + + d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-i)*16-8)), bswapMask); + c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10)); + c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10)); + d = _mm_xor_si128(d, d2); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10)); + } + data += s*16; + len -= s*16; + + c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), 
c2); + x = CLMUL_Reduce(c0, c1, c2, r); + } + + _mm_store_si128((__m128i *)HashBuffer(), x); + return len; + } +#endif + typedef BlockGetAndPut Block; word64 *hashBuffer = (word64 *)HashBuffer(); @@ -414,9 +594,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len) AS2( shr WORD_REG(dx), 4 ) #endif - #if !defined(_MSC_VER) || (_MSC_VER < 1400) - AS_PUSH_IF86( bx) - #endif + AS_PUSH_IF86( bx) AS_PUSH_IF86( bp) #ifdef __GNUC__ @@ -524,9 +702,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len) AS2( movdqa [WORD_REG(si)], xmm0 ) AS_POP_IF86( bp) - #if !defined(_MSC_VER) || (_MSC_VER < 1400) - AS_POP_IF86( bx) - #endif + AS_POP_IF86( bx) #ifdef __GNUC__ ".att_syntax prefix;" @@ -647,6 +823,7 @@ void GCM_Base::AuthenticateLastConfidentialBlock() void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize) { m_ctr.Seek(0); + ReverseHashBufferIfNeeded(); m_ctr.ProcessData(mac, HashBuffer(), macSize); } diff --git a/gcm.h b/gcm.h index 0133ffef..0b32524f 100644 --- a/gcm.h +++ b/gcm.h @@ -63,6 +63,7 @@ protected: byte *HashBuffer() {return m_buffer+REQUIRED_BLOCKSIZE;} byte *HashKey() {return m_buffer+2*REQUIRED_BLOCKSIZE;} byte *MulTable() {return m_buffer+3*REQUIRED_BLOCKSIZE;} + inline void ReverseHashBufferIfNeeded(); class CRYPTOPP_DLL GCTR : public CTR_Mode_ExternalCipher::Encryption { diff --git a/modes.cpp b/modes.cpp index 81bf4de3..789fafb2 100644 --- a/modes.cpp +++ b/modes.cpp @@ -115,7 +115,7 @@ void CTR_ModePolicy::OperateKeystream(KeystreamOperation operation, byte *output { byte lsb = m_counterArray[s-1]; size_t blocks = UnsignedMin(iterationCount, 256U-lsb); - m_cipher->AdvancedProcessBlocks(m_counterArray, input, output, blocks*s, BlockTransformation::BT_InBlockIsCounter); + m_cipher->AdvancedProcessBlocks(m_counterArray, input, output, blocks*s, BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_AllowParallel); if ((m_counterArray[s-1] = lsb + (byte)blocks) == 0) IncrementCounterBy256(); @@ -147,7 
+147,7 @@ void BlockOrientedCipherModeBase::UncheckedSetKey(const byte *key, unsigned int void ECB_OneWay::ProcessData(byte *outString, const byte *inString, size_t length) { assert(length%BlockSize()==0); - m_cipher->AdvancedProcessBlocks(inString, NULL, outString, length, 0); + m_cipher->AdvancedProcessBlocks(inString, NULL, outString, length, BlockTransformation::BT_AllowParallel); } void CBC_Encryption::ProcessData(byte *outString, const byte *inString, size_t length) @@ -199,7 +199,7 @@ void CBC_Decryption::ProcessData(byte *outString, const byte *inString, size_t l unsigned int blockSize = BlockSize(); memcpy(m_temp, inString+length-blockSize, blockSize); // save copy now in case of in-place decryption if (length > blockSize) - m_cipher->AdvancedProcessBlocks(inString+blockSize, inString, outString+blockSize, length-blockSize, BlockTransformation::BT_ReverseDirection); + m_cipher->AdvancedProcessBlocks(inString+blockSize, inString, outString+blockSize, length-blockSize, BlockTransformation::BT_ReverseDirection|BlockTransformation::BT_AllowParallel); m_cipher->ProcessAndXorBlock(inString, m_register, outString); m_register.swap(m_temp); } diff --git a/modes.h b/modes.h index ff88d310..c0c30c47 100644 --- a/modes.h +++ b/modes.h @@ -340,6 +340,7 @@ struct OFB_Mode_ExternalCipher : public CipherModeDocumentation }; CRYPTOPP_DLL_TEMPLATE_CLASS AdditiveCipherTemplate >; +CRYPTOPP_DLL_TEMPLATE_CLASS CipherModeFinalTemplate_ExternalCipher > > >; //! CTR mode template diff --git a/rijndael.cpp b/rijndael.cpp index a39b65d0..fbc7dccb 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -4,6 +4,10 @@ // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code +/* +July 2010: Added support for AES-NI instructions via compiler intrinsics. +*/ + /* Feb 2009: The x86/x64 assembly code was rewritten in by Wei Dai to do counter mode caching, which was invented by Hongjun Wu and popularized by Daniel J. 
Bernstein @@ -69,6 +73,10 @@ being unloaded from L1 cache, until that round is finished. #include "misc.h" #include "cpu.h" +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE +#include +#endif + NAMESPACE_BEGIN(CryptoPP) #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS @@ -198,20 +206,83 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c m_rounds = keylen/4 + 6; m_key.New(4*(m_rounds+1)); - word32 temp, *rk = m_key; - const word32 *rc = rcon; + word32 *rk = m_key; + +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86) + // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64 + if (HasAESNI()) + { + static const word32 rcLE[] = { + 0x01, 0x02, 0x04, 0x08, + 0x10, 0x20, 0x40, 0x80, + 0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ + }; + const word32 *rc = rcLE; + + __m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16)); + memcpy(rk, userKey, keylen); + + while (true) + { + rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++); + rk[keylen/4+1] = rk[1] ^ rk[keylen/4]; + rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1]; + rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2]; + + if (rk + keylen/4 + 4 == m_key.end()) + break; + + if (keylen == 24) + { + rk[10] = rk[ 4] ^ rk[ 9]; + rk[11] = rk[ 5] ^ rk[10]; + temp = _mm_insert_epi32(temp, rk[11], 3); + } + else if (keylen == 32) + { + temp = _mm_insert_epi32(temp, rk[11], 3); + rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2); + rk[13] = rk[ 5] ^ rk[12]; + rk[14] = rk[ 6] ^ rk[13]; + rk[15] = rk[ 7] ^ rk[14]; + temp = _mm_insert_epi32(temp, rk[15], 3); + } + else + temp = _mm_insert_epi32(temp, rk[7], 3); + + rk += keylen/4; + } + + if (!IsForwardTransformation()) + { + rk = m_key; + unsigned int i, j; + + std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds)); + + for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4) + { + temp = 
_mm_aesimc_si128(*(__m128i *)(rk+i)); + *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j)); + *(__m128i *)(rk+j) = temp; + } + + *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i)); + } + + return; + } +#endif GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen); + const word32 *rc = rcon; + word32 temp; while (true) { temp = rk[keylen/4-1]; - rk[keylen/4] = rk[0] ^ - (word32(Se[GETBYTE(temp, 2)]) << 24) ^ - (word32(Se[GETBYTE(temp, 1)]) << 16) ^ - (word32(Se[GETBYTE(temp, 0)]) << 8) ^ - Se[GETBYTE(temp, 3)] ^ - *(rc++); + word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)]; + rk[keylen/4] = rk[0] ^ x ^ *(rc++); rk[keylen/4+1] = rk[1] ^ rk[keylen/4]; rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1]; rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2]; @@ -227,11 +298,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c else if (keylen == 32) { temp = rk[11]; - rk[12] = rk[ 4] ^ - (word32(Se[GETBYTE(temp, 3)]) << 24) ^ - (word32(Se[GETBYTE(temp, 2)]) << 16) ^ - (word32(Se[GETBYTE(temp, 1)]) << 8) ^ - Se[GETBYTE(temp, 0)]; + rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)]; rk[13] = rk[ 5] ^ rk[12]; rk[14] = rk[ 6] ^ rk[13]; rk[15] = rk[ 7] ^ rk[14]; @@ -239,10 +306,15 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c rk += keylen/4; } + rk = m_key; + if (IsForwardTransformation()) { if (!s_TeFilled) FillEncTable(); + + ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16); + ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16); } else { @@ -250,35 +322,37 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c FillDecTable(); unsigned int i, j; - rk = m_key; - /* invert the order of the round keys: */ - for (i = 0, j = 4*m_rounds; i < j; i += 4, j -= 
4) { - temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp; - temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; - temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; - temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp; +#define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)]) + + for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4) + { + temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp; + temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp; + temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp; + temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp; } -#define InverseMixColumn(x) x = TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)]) + rk[i+0] = InverseMixColumn(rk[i+0]); + rk[i+1] = InverseMixColumn(rk[i+1]); + rk[i+2] = InverseMixColumn(rk[i+2]); + rk[i+3] = InverseMixColumn(rk[i+3]); - /* apply the inverse MixColumn transform to all round keys but the first and the last: */ - for (i = 1; i < m_rounds; i++) { - rk += 4; - InverseMixColumn(rk[0]); - InverseMixColumn(rk[1]); - InverseMixColumn(rk[2]); - InverseMixColumn(rk[3]); - } + temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp; + temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp; + temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp; + temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] 
= temp; } - ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key.begin(), m_key.begin(), 16); - ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16); +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE + if (HasAESNI()) + ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16); +#endif } void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE if (HasSSE2()) { Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); @@ -354,6 +428,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE + if (HasAESNI()) + { + Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); + return; + } +#endif + typedef BlockGetAndPut Block; word32 s0, s1, s2, s3, t0, t1, t2, t3; @@ -913,14 +995,200 @@ static inline bool AliasedWithTable(const byte *begin, const byte *end) return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0); } +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE + +inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds) +{ + block = _mm_xor_si128(block, subkeys[0]); + for (unsigned int i=1; i +inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + size_t blockSize = 16; + size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; + size_t xorIncrement = xorBlocks ? 
blockSize : 0; + size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize; + + if (flags & BlockTransformation::BT_ReverseDirection) + { + assert(length % blockSize == 0); + inBlocks += length - blockSize; + xorBlocks += length - blockSize; + outBlocks += length - blockSize; + inIncrement = 0-inIncrement; + xorIncrement = 0-xorIncrement; + outIncrement = 0-outIncrement; + } + + if (flags & BlockTransformation::BT_AllowParallel) + { + while (length >= 4*blockSize) + { + __m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3; + if (flags & BlockTransformation::BT_InBlockIsCounter) + { + const __m128i be1 = *(const __m128i *)s_one; + block1 = _mm_add_epi32(block0, be1); + block2 = _mm_add_epi32(block1, be1); + block3 = _mm_add_epi32(block2, be1); + _mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1)); + } + else + { + inBlocks += inIncrement; + block1 = _mm_loadu_si128((const __m128i *)inBlocks); + inBlocks += inIncrement; + block2 = _mm_loadu_si128((const __m128i *)inBlocks); + inBlocks += inIncrement; + block3 = _mm_loadu_si128((const __m128i *)inBlocks); + inBlocks += inIncrement; + } + + if (flags & BlockTransformation::BT_XorInput) + { + block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks)); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks)); + xorBlocks += xorIncrement; + block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks)); + xorBlocks += xorIncrement; + block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks)); + xorBlocks += xorIncrement; + } + + func4(block0, block1, block2, block3, subkeys, rounds); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + { + block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks)); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks)); + 
xorBlocks += xorIncrement; + block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks)); + xorBlocks += xorIncrement; + block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks)); + xorBlocks += xorIncrement; + } + + _mm_storeu_si128((__m128i *)outBlocks, block0); + outBlocks += outIncrement; + _mm_storeu_si128((__m128i *)outBlocks, block1); + outBlocks += outIncrement; + _mm_storeu_si128((__m128i *)outBlocks, block2); + outBlocks += outIncrement; + _mm_storeu_si128((__m128i *)outBlocks, block3); + outBlocks += outIncrement; + + length -= 4*blockSize; + } + } + + while (length >= blockSize) + { + __m128i block = _mm_loadu_si128((const __m128i *)inBlocks); + + if (flags & BlockTransformation::BT_XorInput) + block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks)); + + if (flags & BlockTransformation::BT_InBlockIsCounter) + const_cast(inBlocks)[15]++; + + func1(block, subkeys, rounds); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks)); + + _mm_storeu_si128((__m128i *)outBlocks, block); + + inBlocks += inIncrement; + outBlocks += outIncrement; + xorBlocks += xorIncrement; + length -= blockSize; + } + + return length; +} +#endif + size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const { +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE + if (HasAESNI()) + return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); +#endif + #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) - if (length < BLOCKSIZE) - return length; - if (HasSSE2()) { + if (length < BLOCKSIZE) + return length; + struct Locals { word32 subkeys[4*12], workspace[8]; @@ -966,15 +1234,27 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const 
byte *xo locals.keysBegin = (12-keysToCopy)*16; Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key); - return length%16; + return length % BLOCKSIZE; } - else #endif - return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); + + return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } #endif +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE + +size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const +{ + if (HasAESNI()) + return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); + + return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); +} + +#endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE + NAMESPACE_END #endif diff --git a/rijndael.h b/rijndael.h index d6021868..64c784b0 100644 --- a/rijndael.h +++ b/rijndael.h @@ -50,6 +50,9 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat { public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; +#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE + size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; +#endif }; public: diff --git a/validat1.cpp b/validat1.cpp index dafeb52c..696327d1 100644 --- a/validat1.cpp +++ b/validat1.cpp @@ -252,6 +252,7 @@ bool TestSettings() cout << "passed: "; cout << "hasMMX == " << hasMMX << ", hasISSE == " << hasISSE << ", hasSSE2 == " << hasSSE2 << ", hasSSSE3 == " << hasSSSE3 << ", hasAESNI == " << HasAESNI() << ", hasCLMUL == " << HasCLMUL() << ", isP4 == " << isP4 << ", cacheLineSize == " << cacheLineSize; + cout << ", AESNI_INTRINSICS == " << CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE << endl; if (!pass) {