add support for AES-NI and CLMUL instruction sets in AES and GMAC/GCM

pull/2/head
weidai 2010-07-24 05:55:22 +00:00
parent a070ff16ae
commit 7adbf89681
15 changed files with 630 additions and 157 deletions

View File

@ -414,19 +414,30 @@ the mailing list.
- ported to MSVC 2008, GCC 4.2, Sun CC 5.9, Intel C++ Compiler 10.0, - ported to MSVC 2008, GCC 4.2, Sun CC 5.9, Intel C++ Compiler 10.0,
and Borland C++Builder 2007 and Borland C++Builder 2007
5.6 - added AuthenticatedSymmetricCipher interface class and Filter wrappers 5.6.0 - added AuthenticatedSymmetricCipher interface class and Filter wrappers
- added CCM, GCM (with SSE2 assembly), EAX, CMAC, XSalsa20, and SEED - added CCM, GCM (with SSE2 assembly), EAX, CMAC, XSalsa20, and SEED
- added support for variable length IVs - added support for variable length IVs
- added OIDs for Brainpool elliptic curve parameters - added OIDs for Brainpool elliptic curve parameters
- improved AES and SHA-256 speed on x86 and x64 - improved AES and SHA-256 speed on x86 and x64
- fixed incorrect VMAC computation on message lengths - changed BlockTransformation interface to no longer assume data alignment
that are >64 mod 128 (x86 assembly version is not affected) - fixed incorrect VMAC computation on message lengths
- fixed compiler error in vmac.cpp on x86 with GCC -fPIC that are >64 mod 128 (x86 assembly version is not affected)
- fixed run-time validation error on x86-64 with GCC 4.3.2 -O2 - fixed compiler error in vmac.cpp on x86 with GCC -fPIC
- fixed HashFilter bug when putMessage=true - fixed run-time validation error on x86-64 with GCC 4.3.2 -O2
- removed WORD64_AVAILABLE; compiler support for 64-bit int is now required - fixed HashFilter bug when putMessage=true
- ported to GCC 4.3, C++Builder 2009, Sun CC 5.10, Intel C++ Compiler 11 - fixed AES-CTR data alignment bug that causes incorrect encryption on ARM
- removed WORD64_AVAILABLE; compiler support for 64-bit int is now required
5.6.1 - switched to a public domain implementation of MARS - ported to GCC 4.3, C++Builder 2009, Sun CC 5.10, Intel C++ Compiler 11
5.6.1 - added support for AES-NI and CLMUL instruction sets in AES and GMAC/GCM
- removed WAKE-CFB
- fixed several bugs in the SHA-256 x86/x64 assembly code:
* incorrect hash on non-SSE2 x86 machines on non-aligned input
* incorrect hash on x86 machines when input crosses 0x80000000
* incorrect hash on x64 when compiled with GCC with optimizations enabled
- fixed bugs in AES x86 and x64 assembly causing crashes in some MSVC build configurations
- switched to a public domain implementation of MARS
- ported to MSVC 2010, Sun Studio 12u1
- renamed the MSVC DLL project to "cryptopp" for compatibility with MSVC 2010
Written by Wei Dai Written by Wei Dai

View File

@ -10,6 +10,7 @@
#include "hex.h" #include "hex.h"
#include "modes.h" #include "modes.h"
#include "factory.h" #include "factory.h"
#include "cpu.h"
#include <time.h> #include <time.h>
#include <math.h> #include <math.h>
@ -242,14 +243,24 @@ void BenchmarkAll(double t, double hertz)
cout << "<THEAD><TR><TH>Algorithm<TH>MiB/Second" << cpb << "<TH>Microseconds to<br>Setup Key and IV" << cpk << endl; cout << "<THEAD><TR><TH>Algorithm<TH>MiB/Second" << cpb << "<TH>Microseconds to<br>Setup Key and IV" << cpk << endl;
cout << "\n<TBODY style=\"background: yellow\">"; cout << "\n<TBODY style=\"background: yellow\">";
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM (2K tables)", MakeParameters(Name::TableSize(), 2048)); if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && HasCLMUL())
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM (64K tables)", MakeParameters(Name::TableSize(), 64*1024)); BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM");
else
{
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM (2K tables)", MakeParameters(Name::TableSize(), 2048));
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM (64K tables)", MakeParameters(Name::TableSize(), 64*1024));
}
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/CCM"); BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/CCM");
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/EAX"); BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/EAX");
cout << "\n<TBODY style=\"background: white\">"; cout << "\n<TBODY style=\"background: white\">";
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (2K tables)", MakeParameters(Name::TableSize(), 2048)); if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && HasCLMUL())
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (64K tables)", MakeParameters(Name::TableSize(), 64*1024)); BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES)");
else
{
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (2K tables)", MakeParameters(Name::TableSize(), 2048));
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (64K tables)", MakeParameters(Name::TableSize(), 64*1024));
}
BenchMarkByName<MessageAuthenticationCode>("VMAC(AES)-64"); BenchMarkByName<MessageAuthenticationCode>("VMAC(AES)-64");
BenchMarkByName<MessageAuthenticationCode>("VMAC(AES)-128"); BenchMarkByName<MessageAuthenticationCode>("VMAC(AES)-128");
BenchMarkByName<MessageAuthenticationCode>("HMAC(SHA-1)"); BenchMarkByName<MessageAuthenticationCode>("HMAC(SHA-1)");

View File

@ -257,6 +257,7 @@ NAMESPACE_END
#endif #endif
#if !defined(CRYPTOPP_DISABLE_ASM) && ((defined(_MSC_VER) && defined(_M_IX86)) || (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)))) #if !defined(CRYPTOPP_DISABLE_ASM) && ((defined(_MSC_VER) && defined(_M_IX86)) || (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))))
// C++Builder 2010 does not allow "call label" where label is defined within inline assembly
#define CRYPTOPP_X86_ASM_AVAILABLE #define CRYPTOPP_X86_ASM_AVAILABLE
#if !defined(CRYPTOPP_DISABLE_SSE2) && (defined(CRYPTOPP_MSVC6PP_OR_LATER) || CRYPTOPP_GCC_VERSION >= 30300) #if !defined(CRYPTOPP_DISABLE_SSE2) && (defined(CRYPTOPP_MSVC6PP_OR_LATER) || CRYPTOPP_GCC_VERSION >= 30300)
@ -288,6 +289,12 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0 #define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0
#endif #endif
// AES-NI/CLMUL intrinsics are enabled only when SSE2 intrinsics are enabled AND the compiler
// passes the version check below. CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE is defined as 0 when
// SSE2 intrinsics are unavailable, so its VALUE must be tested: a defined() test would also
// succeed for the value 0 and wrongly switch the AES-NI code paths on.
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && (CRYPTOPP_GCC_VERSION >= 40400 || _MSC_FULL_VER >= 150030729 || __INTEL_COMPILER >= 1110)
#define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 1
#else
#define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0
#endif
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
#define CRYPTOPP_BOOL_ALIGN16_ENABLED 1 #define CRYPTOPP_BOOL_ALIGN16_ENABLED 1
#else #else

52
cpu.cpp
View File

@ -8,7 +8,7 @@
#include "misc.h" #include "misc.h"
#include <algorithm> #include <algorithm>
#ifdef __GNUC__ #ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
#include <signal.h> #include <signal.h>
#include <setjmp.h> #include <setjmp.h>
#endif #endif
@ -19,9 +19,19 @@
NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(CryptoPP)
#ifdef CRYPTOPP_X86_ASM_AVAILABLE #ifdef CRYPTOPP_CPUID_AVAILABLE
#ifndef _MSC_VER #if _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64
// CPUID query for the build selected by the enclosing "#if _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64":
// hands the request to the compiler's __cpuid intrinsic, which stores its results through
// output. This variant has no failure path and always returns true.
bool CpuId(word32 input, word32 *output)
{
	int *const regs = reinterpret_cast<int *>(output);
	__cpuid(regs, input);
	return true;
}
#else
#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
typedef void (*SigHandler)(int); typedef void (*SigHandler)(int);
static jmp_buf s_jmpNoCPUID; static jmp_buf s_jmpNoCPUID;
@ -29,11 +39,17 @@ static void SigIllHandlerCPUID(int)
{ {
longjmp(s_jmpNoCPUID, 1); longjmp(s_jmpNoCPUID, 1);
} }
// Saved execution context that SigIllHandlerSSE2 jumps back to.
// NOTE(review): presumably armed with setjmp() inside TrySSE2() before the SSE2 probe; the
// setjmp call is not visible here, so confirm against the full function.
static jmp_buf s_jmpNoSSE2;
// SIGILL handler that TrySSE2() installs via signal(): instead of returning to the faulting
// instruction it unwinds to the s_jmpNoSSE2 context, delivering the value 1.
static void SigIllHandlerSSE2(int)
{
longjmp(s_jmpNoSSE2, 1);
}
#endif #endif
bool CpuId(word32 input, word32 *output) bool CpuId(word32 input, word32 *output)
{ {
#ifdef _MSC_VER #ifdef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
__try __try
{ {
__asm __asm
@ -80,31 +96,13 @@ bool CpuId(word32 input, word32 *output)
#endif #endif
} }
#ifndef _MSC_VER
static jmp_buf s_jmpNoSSE2;
static void SigIllHandlerSSE2(int)
{
longjmp(s_jmpNoSSE2, 1);
}
#endif #endif
#elif _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64
bool CpuId(word32 input, word32 *output)
{
__cpuid((int *)output, input);
return true;
}
#endif
#ifdef CRYPTOPP_CPUID_AVAILABLE
static bool TrySSE2() static bool TrySSE2()
{ {
#if CRYPTOPP_BOOL_X64 #if CRYPTOPP_BOOL_X64
return true; return true;
#elif defined(_MSC_VER) #elif defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
__try __try
{ {
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
@ -119,7 +117,7 @@ static bool TrySSE2()
return false; return false;
} }
return true; return true;
#elif defined(__GNUC__) #else
SigHandler oldHandler = signal(SIGILL, SigIllHandlerSSE2); SigHandler oldHandler = signal(SIGILL, SigIllHandlerSSE2);
if (oldHandler == SIG_ERR) if (oldHandler == SIG_ERR)
return false; return false;
@ -139,8 +137,6 @@ static bool TrySSE2()
signal(SIGILL, oldHandler); signal(SIGILL, oldHandler);
return result; return result;
#else
return false;
#endif #endif
} }
@ -160,8 +156,8 @@ void DetectX86Features()
if ((cpuid1[3] & (1 << 26)) != 0) if ((cpuid1[3] & (1 << 26)) != 0)
g_hasSSE2 = TrySSE2(); g_hasSSE2 = TrySSE2();
g_hasSSSE3 = g_hasSSE2 && (cpuid1[2] & (1<<9)); g_hasSSSE3 = g_hasSSE2 && (cpuid1[2] & (1<<9));
g_hasAESNI = (cpuid1[2] & (1<<25)) != 0; g_hasAESNI = g_hasSSE2 && (cpuid1[2] & (1<<25));
g_hasCLMUL = (cpuid1[2] & (1<<1)) != 0; g_hasCLMUL = g_hasSSE2 && (cpuid1[2] & (1<<1));
if ((cpuid1[3] & (1 << 25)) != 0) if ((cpuid1[3] & (1 << 25)) != 0)
g_hasISSE = true; g_hasISSE = true;

48
cpu.h
View File

@ -18,22 +18,18 @@
NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(CryptoPP)
#if defined(CRYPTOPP_X86_ASM_AVAILABLE) || (_MSC_VER >= 1400 && CRYPTOPP_BOOL_X64) #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
#define CRYPTOPP_CPUID_AVAILABLE #define CRYPTOPP_CPUID_AVAILABLE
// these should not be used directly // these should not be used directly
extern CRYPTOPP_DLL bool g_x86DetectionDone; extern CRYPTOPP_DLL bool g_x86DetectionDone;
extern CRYPTOPP_DLL bool g_hasSSE2;
extern CRYPTOPP_DLL bool g_hasISSE;
extern CRYPTOPP_DLL bool g_hasMMX;
extern CRYPTOPP_DLL bool g_hasSSSE3; extern CRYPTOPP_DLL bool g_hasSSSE3;
extern CRYPTOPP_DLL bool g_hasAESNI; extern CRYPTOPP_DLL bool g_hasAESNI;
extern CRYPTOPP_DLL bool g_hasCLMUL; extern CRYPTOPP_DLL bool g_hasCLMUL;
extern CRYPTOPP_DLL bool g_isP4; extern CRYPTOPP_DLL bool g_isP4;
extern CRYPTOPP_DLL word32 g_cacheLineSize; extern CRYPTOPP_DLL word32 g_cacheLineSize;
CRYPTOPP_DLL void CRYPTOPP_API DetectX86Features(); CRYPTOPP_DLL void CRYPTOPP_API DetectX86Features();
CRYPTOPP_DLL bool CRYPTOPP_API CpuId(word32 input, word32 *output); CRYPTOPP_DLL bool CRYPTOPP_API CpuId(word32 input, word32 *output);
#if CRYPTOPP_BOOL_X64 #if CRYPTOPP_BOOL_X64
@ -42,6 +38,10 @@ inline bool HasISSE() {return true;}
inline bool HasMMX() {return true;} inline bool HasMMX() {return true;}
#else #else
extern CRYPTOPP_DLL bool g_hasSSE2;
extern CRYPTOPP_DLL bool g_hasISSE;
extern CRYPTOPP_DLL bool g_hasMMX;
inline bool HasSSE2() inline bool HasSSE2()
{ {
if (!g_x86DetectionDone) if (!g_x86DetectionDone)
@ -107,22 +107,8 @@ inline int GetCacheLineSize()
return CRYPTOPP_L1_CACHE_LINE_SIZE; return CRYPTOPP_L1_CACHE_LINE_SIZE;
} }
inline bool HasSSSE3() {return false;}
inline bool IsP4() {return false;}
// assume MMX and SSE2 if intrinsics are enabled
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_X64
inline bool HasSSE2() {return true;}
inline bool HasISSE() {return true;}
inline bool HasMMX() {return true;}
#else
inline bool HasSSE2() {return false;}
inline bool HasISSE() {return false;}
inline bool HasMMX() {return false;}
#endif #endif
#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE || _MSC_VER >= 1400
#endif #endif
#ifdef CRYPTOPP_GENERATE_X64_MASM #ifdef CRYPTOPP_GENERATE_X64_MASM
@ -134,7 +120,19 @@ inline bool HasMMX() {return false;}
#define ASJ(x, y, z) x label##y*newline* #define ASJ(x, y, z) x label##y*newline*
#define ASC(x, y) x label##y*newline* #define ASC(x, y) x label##y*newline*
#define AS_HEX(y) 0##y##h #define AS_HEX(y) 0##y##h
#elif defined(__GNUC__) #elif defined(_MSC_VER) || defined(__BORLANDC__)
#define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
#define AS1(x) __asm {x}
#define AS2(x, y) __asm {x, y}
#define AS3(x, y, z) __asm {x, y, z}
#define ASS(x, y, a, b, c, d) __asm {x, y, (a)*64+(b)*16+(c)*4+(d)}
#define ASL(x) __asm {label##x:}
#define ASJ(x, y, z) __asm {x label##y}
#define ASC(x, y) __asm {x label##y}
#define CRYPTOPP_NAKED __declspec(naked)
#define AS_HEX(y) 0x##y
#else
#define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
// define these in two steps to allow arguments to be expanded // define these in two steps to allow arguments to be expanded
#define GNU_AS1(x) #x ";" #define GNU_AS1(x) #x ";"
#define GNU_AS2(x, y) #x ", " #y ";" #define GNU_AS2(x, y) #x ", " #y ";"
@ -150,16 +148,6 @@ inline bool HasMMX() {return false;}
#define ASC(x, y) #x " " #y ";" #define ASC(x, y) #x " " #y ";"
#define CRYPTOPP_NAKED #define CRYPTOPP_NAKED
#define AS_HEX(y) 0x##y #define AS_HEX(y) 0x##y
#else
#define AS1(x) __asm {x}
#define AS2(x, y) __asm {x, y}
#define AS3(x, y, z) __asm {x, y, z}
#define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)}
#define ASL(x) __asm {label##x:}
#define ASJ(x, y, z) __asm {x label##y}
#define ASC(x, y) __asm {x label##y}
#define CRYPTOPP_NAKED __declspec(naked)
#define AS_HEX(y) 0x##y
#endif #endif
#define IF0(y) #define IF0(y)

View File

@ -456,7 +456,7 @@ public:
//! return number of blocks that can be processed in parallel, for bit-slicing implementations //! return number of blocks that can be processed in parallel, for bit-slicing implementations
virtual unsigned int OptimalNumberOfParallelBlocks() const {return 1;} virtual unsigned int OptimalNumberOfParallelBlocks() const {return 1;}
enum {BT_InBlockIsCounter=1, BT_DontIncrementInOutPointers=2, BT_XorInput=4, BT_ReverseDirection=8} FlagsForAdvancedProcessBlocks; enum {BT_InBlockIsCounter=1, BT_DontIncrementInOutPointers=2, BT_XorInput=4, BT_ReverseDirection=8, BT_AllowParallel=16} FlagsForAdvancedProcessBlocks;
//! encrypt and xor blocks according to flags (see FlagsForAdvancedProcessBlocks) //! encrypt and xor blocks according to flags (see FlagsForAdvancedProcessBlocks)
/*! \note If BT_InBlockIsCounter is set, last byte of inBlocks may be modified. */ /*! \note If BT_InBlockIsCounter is set, last byte of inBlocks may be modified. */

View File

@ -57,15 +57,15 @@ const std::string & GetRequiredDatum(const TestData &data, const char *name)
return i->second; return i->second;
} }
void RandomizedTransfer(BufferedTransformation &source, BufferedTransformation &target, bool finish) void RandomizedTransfer(BufferedTransformation &source, BufferedTransformation &target, bool finish, const std::string &channel=DEFAULT_CHANNEL)
{ {
while (source.MaxRetrievable() > (finish ? 0 : 4096)) while (source.MaxRetrievable() > (finish ? 0 : 4096))
{ {
byte buf[4096+64]; byte buf[4096+64];
word32 start = GlobalRNG().GenerateWord32(0, 63); size_t start = GlobalRNG().GenerateWord32(0, 63);
word32 len = GlobalRNG().GenerateWord32(1, UnsignedMin(4096U, source.MaxRetrievable())); size_t len = GlobalRNG().GenerateWord32(1, UnsignedMin(4096U, 3*source.MaxRetrievable()/2));
source.Get(buf+start, len); len = source.Get(buf+start, len);
target.Put(buf+start, len); target.ChannelPut(channel, buf+start, len);
} }
} }
@ -397,9 +397,9 @@ void TestSymmetricCipher(TestData &v, const NameValuePairs &overrideParameters)
return; return;
} }
StringSource ss(plaintext, false, new StreamTransformationFilter(*encryptor, new StringSink(encrypted), StreamTransformationFilter::NO_PADDING)); StreamTransformationFilter encFilter(*encryptor, new StringSink(encrypted), StreamTransformationFilter::NO_PADDING);
ss.Pump(plaintext.size()/2 + 1); RandomizedTransfer(StringStore(plaintext).Ref(), encFilter, true);
ss.PumpAll(); encFilter.MessageEnd();
/*{ /*{
std::string z; std::string z;
encryptor->Seek(seek); encryptor->Seek(seek);
@ -422,14 +422,14 @@ void TestSymmetricCipher(TestData &v, const NameValuePairs &overrideParameters)
{ {
std::cout << "incorrectly encrypted: "; std::cout << "incorrectly encrypted: ";
StringSource xx(encrypted, false, new HexEncoder(new FileSink(std::cout))); StringSource xx(encrypted, false, new HexEncoder(new FileSink(std::cout)));
xx.Pump(256); xx.Flush(false); xx.Pump(2048); xx.Flush(false);
std::cout << "\n"; std::cout << "\n";
SignalTestFailure(); SignalTestFailure();
} }
std::string decrypted; std::string decrypted;
StringSource dd(encrypted, false, new StreamTransformationFilter(*decryptor, new StringSink(decrypted), StreamTransformationFilter::NO_PADDING)); StreamTransformationFilter decFilter(*decryptor, new StringSink(decrypted), StreamTransformationFilter::NO_PADDING);
dd.Pump(plaintext.size()/2 + 1); RandomizedTransfer(StringStore(encrypted).Ref(), decFilter, true);
dd.PumpAll(); decFilter.MessageEnd();
if (decrypted != plaintext) if (decrypted != plaintext)
{ {
std::cout << "incorrectly decrypted: "; std::cout << "incorrectly decrypted: ";
@ -484,27 +484,24 @@ void TestAuthenticatedSymmetricCipher(TestData &v, const NameValuePairs &overrid
StringStore sh(header), sp(plaintext), sc(ciphertext), sf(footer), sm(mac); StringStore sh(header), sp(plaintext), sc(ciphertext), sf(footer), sm(mac);
if (macAtBegin) if (macAtBegin)
sm.TransferTo(df); RandomizedTransfer(sm, df, true);
sh.CopyTo(df, LWORD_MAX, AAD_CHANNEL); sh.CopyTo(df, LWORD_MAX, AAD_CHANNEL);
sc.TransferTo(df); RandomizedTransfer(sc, df, true);
sf.CopyTo(df, LWORD_MAX, AAD_CHANNEL); sf.CopyTo(df, LWORD_MAX, AAD_CHANNEL);
if (!macAtBegin) if (!macAtBegin)
sm.TransferTo(df); RandomizedTransfer(sm, df, true);
df.MessageEnd(); df.MessageEnd();
sh.TransferTo(ef, sh.MaxRetrievable()/2+1, AAD_CHANNEL); RandomizedTransfer(sh, ef, true, AAD_CHANNEL);
sh.TransferTo(ef, LWORD_MAX, AAD_CHANNEL); RandomizedTransfer(sp, ef, true);
sp.TransferTo(ef, sp.MaxRetrievable()/2+1); RandomizedTransfer(sf, ef, true, AAD_CHANNEL);
sp.TransferTo(ef);
sf.TransferTo(ef, sf.MaxRetrievable()/2+1, AAD_CHANNEL);
sf.TransferTo(ef, LWORD_MAX, AAD_CHANNEL);
ef.MessageEnd(); ef.MessageEnd();
if (test == "Encrypt" && encrypted != ciphertext+mac) if (test == "Encrypt" && encrypted != ciphertext+mac)
{ {
std::cout << "incorrectly encrypted: "; std::cout << "incorrectly encrypted: ";
StringSource xx(encrypted, false, new HexEncoder(new FileSink(std::cout))); StringSource xx(encrypted, false, new HexEncoder(new FileSink(std::cout)));
xx.Pump(256); xx.Flush(false); xx.Pump(2048); xx.Flush(false);
std::cout << "\n"; std::cout << "\n";
SignalTestFailure(); SignalTestFailure();
} }

View File

@ -95,7 +95,7 @@ size_t FileStore::TransferTo2(BufferedTransformation &target, lword &transferByt
m_stream->read((char *)m_space, (unsigned int)STDMIN(size, (lword)spaceSize)); m_stream->read((char *)m_space, (unsigned int)STDMIN(size, (lword)spaceSize));
} }
m_len = m_stream->gcount(); m_len = (size_t)m_stream->gcount();
size_t blockedBytes; size_t blockedBytes;
output: output:
blockedBytes = target.ChannelPutModifiable2(channel, m_space, m_len, 0, blocking); blockedBytes = target.ChannelPutModifiable2(channel, m_space, m_len, 0, blocking);
@ -242,7 +242,7 @@ size_t FileSink::Put2(const byte *inString, size_t length, int messageEnd, bool
size = numeric_limits<std::streamsize>::max(); size = numeric_limits<std::streamsize>::max();
m_stream->write((const char *)inString, size); m_stream->write((const char *)inString, size);
inString += size; inString += size;
length -= size; length -= (size_t)size;
} }
if (messageEnd) if (messageEnd)

207
gcm.cpp
View File

@ -14,6 +14,11 @@
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#include <tmmintrin.h>
#include <wmmintrin.h>
#endif
NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(CryptoPP)
word16 GCM_Base::s_reductionTable[256]; word16 GCM_Base::s_reductionTable[256];
@ -47,6 +52,21 @@ void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *
} }
Block::Put(NULL, c)(Z0)(Z1); Block::Put(NULL, c)(Z0)(Z1);
} }
// Software stand-in for the PCLMULQDQ intrinsic, implemented with PolynomialMod2.
//   a, b - 128-bit operands; one 64-bit half of each takes part in the product
//   i    - selector: bit 0 picks the half of a, bit 4 picks the half of b
// Returns the 128-bit carry-less product of the two selected halves.
// As with the hardware instruction, every other bit of i is ignored.
__m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
{
	// Mask both selectors so a stray bit in i can never index past the two 64-bit halves.
	const int halfA = i & 1;
	const int halfB = (i >> 4) & 1;

	// Each half is byte-reversed before being handed to PolynomialMod2.
	// NOTE(review): presumably because its byte-array constructor expects big-endian input - confirm.
	word64 A[1] = {ByteReverse(((const word64*)&a)[halfA])};
	word64 B[1] = {ByteReverse(((const word64*)&b)[halfB])};
	PolynomialMod2 pa((byte *)A, 8);
	PolynomialMod2 pb((byte *)B, 8);
	PolynomialMod2 c = pa*pb;

	// Copy the 16 product bytes out; the index is named k so it no longer shadows parameter i.
	__m128i output;
	for (int k=0; k<16; k++)
		((byte *)&output)[k] = c.GetByte(k);
	return output;
}
#endif #endif
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
@ -66,6 +86,56 @@ inline static void Xor16(byte *a, const byte *b, const byte *c)
((word64 *)a)[1] = ((word64 *)b)[1] ^ ((word64 *)c)[1]; ((word64 *)a)[1] = ((word64 *)b)[1] ^ ((word64 *)c)[1];
} }
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
static CRYPTOPP_ALIGN_DATA(16) const word64 s_clmulConstants64[] = {
0xe100000000000000, 0xc200000000000000,
0x08090a0b0c0d0e0f, 0x0001020304050607,
0x0001020304050607, 0x08090a0b0c0d0e0f};
static const __m128i *s_clmulConstants = (const __m128i *)s_clmulConstants64;
static const unsigned int s_clmulTableSizeInBlocks = 8;
// Reduces the three partial products of a carry-less multiplication to a single 128-bit value,
// following the scheme spelled out in the comment at the top of the body.
//   c0, c1, c2 - partial products of the polynomial c0 * x^128 + c1 * x^64 + c2
//   r          - the two 64-bit reduction constants; callers in this file pass
//                s_clmulConstants[0] (0xe100000000000000 low, 0xc200000000000000 high)
// Returns the reduced 128-bit result.
inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r)
{
/*
The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. c0t below refers to the most
significant half of c0 as a polynomial, which, due to GCM's bit reflection, is in the
rightmost bit positions, and the lowest byte addresses.
c1 ^= c0t * 0xc200000000000000
c2t ^= c0t
t = shift (c1t ^ c0b) left 1 bit
c2 ^= t * 0xe100000000000000
c2t ^= c1b
shift c2 left 1 bit and xor in lowest bit of c1t
*/
#if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301
c2 = _mm_xor_si128(c2, _mm_move_epi64(c0));
#else
// Active path: instead of xoring the low qword of c0 straight into c2, park it in the high
// qword of c1; it reaches c2 further down through the _mm_srli_si128(c1, 8) term.
c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8));
#endif
// Selector 0x10: low qword of c0 times the high qword of r.
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10));
// Byte shift: bring the high qword of c0 down into the low qword.
c0 = _mm_srli_si128(c0, 8);
c0 = _mm_xor_si128(c0, c1);
// Bit shift: each 64-bit lane moves left by one bit.
c0 = _mm_slli_epi64(c0, 1);
// Selector 0: low qword of c0 times the low qword of r.
c0 = _mm_clmulepi64_si128(c0, r, 0);
c2 = _mm_xor_si128(c2, c0);
// Xor the high qword of c1 into the low qword of c2.
c2 = _mm_xor_si128(c2, _mm_srli_si128(c1, 8));
// Final one-bit left shift of c2 across both lanes: after the unpack and the shift by 63,
// c1 holds the top bit of its own low qword (low lane) and the top bit of c2's low qword
// (high lane); xoring that into the per-lane shifted c2 supplies the carried bits.
c1 = _mm_unpacklo_epi64(c1, c2);
c1 = _mm_srli_epi64(c1, 63);
c2 = _mm_slli_epi64(c2, 1);
return _mm_xor_si128(c2, c1);
}
// Carry-less multiplication of x by h followed by reduction through CLMUL_Reduce.
//   x, h - 128-bit operands
//   r    - reduction constants, forwarded unchanged to CLMUL_Reduce
// The four 64x64-bit partial products are formed first; the two cross terms are xored
// together to give the middle term expected by CLMUL_Reduce.
inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r)
{
	const __m128i lowLow = _mm_clmulepi64_si128(x, h, 0);
	const __m128i crossA = _mm_clmulepi64_si128(x, h, 1);
	const __m128i crossB = _mm_clmulepi64_si128(x, h, 0x10);
	const __m128i highHigh = _mm_clmulepi64_si128(x, h, 0x11);
	return CLMUL_Reduce(lowLow, _mm_xor_si128(crossA, crossB), highHigh, r);
}
#endif
void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs &params) void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs &params)
{ {
BlockCipher &blockCipher = AccessBlockCipher(); BlockCipher &blockCipher = AccessBlockCipher();
@ -74,26 +144,56 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE) if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE)
throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16"); throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
int tableSize; int tableSize, i, j, k;
if (params.GetIntValue(Name::TableSize(), tableSize))
tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024; #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
if (HasCLMUL())
{
params.GetIntValue(Name::TableSize(), tableSize); // avoid "parameter not used" error
tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE;
}
else else
tableSize = (GetTablesOption() == GCM_64K_Tables) ? 64*1024 : 2*1024; #endif
{
if (params.GetIntValue(Name::TableSize(), tableSize))
tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024;
else
tableSize = (GetTablesOption() == GCM_64K_Tables) ? 64*1024 : 2*1024;
#if defined(_MSC_VER) && (_MSC_VER >= 1300 && _MSC_VER < 1400) #if defined(_MSC_VER) && (_MSC_VER >= 1300 && _MSC_VER < 1400)
// VC 2003 workaround: compiler generates bad code for 64K tables // VC 2003 workaround: compiler generates bad code for 64K tables
tableSize = 2*1024; tableSize = 2*1024;
#endif #endif
}
m_buffer.resize(3*REQUIRED_BLOCKSIZE + tableSize); m_buffer.resize(3*REQUIRED_BLOCKSIZE + tableSize);
byte *table = MulTable();
byte *hashKey = HashKey(); byte *hashKey = HashKey();
memset(hashKey, 0, REQUIRED_BLOCKSIZE); memset(hashKey, 0, REQUIRED_BLOCKSIZE);
blockCipher.ProcessBlock(hashKey); blockCipher.ProcessBlock(hashKey);
byte *table = MulTable(); #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
int i, j, k; if (HasCLMUL())
word64 V0, V1; {
const __m128i r = s_clmulConstants[0];
__m128i h0 = _mm_shuffle_epi8(_mm_load_si128((__m128i *)hashKey), s_clmulConstants[1]);
__m128i h = h0;
for (i=0; i<tableSize; i+=32)
{
__m128i h1 = CLMUL_GF_Mul(h, h0, r);
_mm_storel_epi64((__m128i *)(table+i), h);
_mm_storeu_si128((__m128i *)(table+i+16), h1);
_mm_storeu_si128((__m128i *)(table+i+8), h);
_mm_storel_epi64((__m128i *)(table+i+8), h1);
h = CLMUL_GF_Mul(h1, h0, r);
}
return;
}
#endif
word64 V0, V1;
typedef BlockGetAndPut<word64, BigEndian> Block; typedef BlockGetAndPut<word64, BigEndian> Block;
Block::Get(hashKey)(V0)(V1); Block::Get(hashKey)(V0)(V1);
@ -178,6 +278,17 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
} }
} }
// When AES-NI intrinsics are compiled in and the CPU reports CLMUL, byte-shuffles the
// 16-byte hash buffer in place with the mask stored in s_clmulConstants[1] (a full
// 16-byte reversal). In every other configuration this function does nothing.
inline void GCM_Base::ReverseHashBufferIfNeeded()
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
	if (!HasCLMUL())
		return;

	__m128i *const hash = (__m128i *)HashBuffer();
	*hash = _mm_shuffle_epi8(*hash, s_clmulConstants[1]);
#endif
}
void GCM_Base::Resync(const byte *iv, size_t len) void GCM_Base::Resync(const byte *iv, size_t len)
{ {
BlockCipher &cipher = AccessBlockCipher(); BlockCipher &cipher = AccessBlockCipher();
@ -209,6 +320,8 @@ void GCM_Base::Resync(const byte *iv, size_t len)
PutBlock<word64, BigEndian, true>(NULL, m_buffer)(0)(origLen*8); PutBlock<word64, BigEndian, true>(NULL, m_buffer)(0)(origLen*8);
GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE); GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
ReverseHashBufferIfNeeded();
} }
if (m_state >= State_IVSet) if (m_state >= State_IVSet)
@ -241,6 +354,73 @@ void GCM_AuthenticateBlocks_64K(const byte *data, size_t blocks, word64 *hashBuf
size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len) size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
{ {
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
if (HasCLMUL())
{
const __m128i *table = (const __m128i *)MulTable();
__m128i x = _mm_load_si128((__m128i *)HashBuffer());
const __m128i r = s_clmulConstants[0], bswapMask = s_clmulConstants[1], bswapMask2 = s_clmulConstants[2];
while (len >= 16)
{
size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0;
__m128i d, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-1)*16)), bswapMask2);;
__m128i c0 = _mm_setzero_si128();
__m128i c1 = _mm_setzero_si128();
__m128i c2 = _mm_setzero_si128();
while (true)
{
__m128i h0 = _mm_load_si128(table+i);
__m128i h1 = _mm_load_si128(table+i+1);
__m128i h01 = _mm_xor_si128(h0, h1);
if (++i == s)
{
d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)data), bswapMask);
d = _mm_xor_si128(d, x);
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0));
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1));
d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2)));
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0));
break;
}
d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-i)*16-8)), bswapMask2);
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1));
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1));
d2 = _mm_xor_si128(d2, d);
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h01, 1));
if (++i == s)
{
d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)data), bswapMask);
d = _mm_xor_si128(d, x);
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10));
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 0x11));
d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2)));
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10));
break;
}
d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-i)*16-8)), bswapMask);
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10));
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10));
d = _mm_xor_si128(d, d2);
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10));
}
data += s*16;
len -= s*16;
c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), c2);
x = CLMUL_Reduce(c0, c1, c2, r);
}
_mm_store_si128((__m128i *)HashBuffer(), x);
return len;
}
#endif
typedef BlockGetAndPut<word64, NativeByteOrder> Block; typedef BlockGetAndPut<word64, NativeByteOrder> Block;
word64 *hashBuffer = (word64 *)HashBuffer(); word64 *hashBuffer = (word64 *)HashBuffer();
@ -414,9 +594,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
AS2( shr WORD_REG(dx), 4 ) AS2( shr WORD_REG(dx), 4 )
#endif #endif
#if !defined(_MSC_VER) || (_MSC_VER < 1400) AS_PUSH_IF86( bx)
AS_PUSH_IF86( bx)
#endif
AS_PUSH_IF86( bp) AS_PUSH_IF86( bp)
#ifdef __GNUC__ #ifdef __GNUC__
@ -524,9 +702,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
AS2( movdqa [WORD_REG(si)], xmm0 ) AS2( movdqa [WORD_REG(si)], xmm0 )
AS_POP_IF86( bp) AS_POP_IF86( bp)
#if !defined(_MSC_VER) || (_MSC_VER < 1400) AS_POP_IF86( bx)
AS_POP_IF86( bx)
#endif
#ifdef __GNUC__ #ifdef __GNUC__
".att_syntax prefix;" ".att_syntax prefix;"
@ -647,6 +823,7 @@ void GCM_Base::AuthenticateLastConfidentialBlock()
void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize) void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize)
{ {
m_ctr.Seek(0); m_ctr.Seek(0);
ReverseHashBufferIfNeeded();
m_ctr.ProcessData(mac, HashBuffer(), macSize); m_ctr.ProcessData(mac, HashBuffer(), macSize);
} }

1
gcm.h
View File

@ -63,6 +63,7 @@ protected:
byte *HashBuffer() {return m_buffer+REQUIRED_BLOCKSIZE;} byte *HashBuffer() {return m_buffer+REQUIRED_BLOCKSIZE;}
byte *HashKey() {return m_buffer+2*REQUIRED_BLOCKSIZE;} byte *HashKey() {return m_buffer+2*REQUIRED_BLOCKSIZE;}
byte *MulTable() {return m_buffer+3*REQUIRED_BLOCKSIZE;} byte *MulTable() {return m_buffer+3*REQUIRED_BLOCKSIZE;}
inline void ReverseHashBufferIfNeeded();
class CRYPTOPP_DLL GCTR : public CTR_Mode_ExternalCipher::Encryption class CRYPTOPP_DLL GCTR : public CTR_Mode_ExternalCipher::Encryption
{ {

View File

@ -115,7 +115,7 @@ void CTR_ModePolicy::OperateKeystream(KeystreamOperation operation, byte *output
{ {
byte lsb = m_counterArray[s-1]; byte lsb = m_counterArray[s-1];
size_t blocks = UnsignedMin(iterationCount, 256U-lsb); size_t blocks = UnsignedMin(iterationCount, 256U-lsb);
m_cipher->AdvancedProcessBlocks(m_counterArray, input, output, blocks*s, BlockTransformation::BT_InBlockIsCounter); m_cipher->AdvancedProcessBlocks(m_counterArray, input, output, blocks*s, BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_AllowParallel);
if ((m_counterArray[s-1] = lsb + (byte)blocks) == 0) if ((m_counterArray[s-1] = lsb + (byte)blocks) == 0)
IncrementCounterBy256(); IncrementCounterBy256();
@ -147,7 +147,7 @@ void BlockOrientedCipherModeBase::UncheckedSetKey(const byte *key, unsigned int
void ECB_OneWay::ProcessData(byte *outString, const byte *inString, size_t length) void ECB_OneWay::ProcessData(byte *outString, const byte *inString, size_t length)
{ {
assert(length%BlockSize()==0); assert(length%BlockSize()==0);
m_cipher->AdvancedProcessBlocks(inString, NULL, outString, length, 0); m_cipher->AdvancedProcessBlocks(inString, NULL, outString, length, BlockTransformation::BT_AllowParallel);
} }
void CBC_Encryption::ProcessData(byte *outString, const byte *inString, size_t length) void CBC_Encryption::ProcessData(byte *outString, const byte *inString, size_t length)
@ -199,7 +199,7 @@ void CBC_Decryption::ProcessData(byte *outString, const byte *inString, size_t l
unsigned int blockSize = BlockSize(); unsigned int blockSize = BlockSize();
memcpy(m_temp, inString+length-blockSize, blockSize); // save copy now in case of in-place decryption memcpy(m_temp, inString+length-blockSize, blockSize); // save copy now in case of in-place decryption
if (length > blockSize) if (length > blockSize)
m_cipher->AdvancedProcessBlocks(inString+blockSize, inString, outString+blockSize, length-blockSize, BlockTransformation::BT_ReverseDirection); m_cipher->AdvancedProcessBlocks(inString+blockSize, inString, outString+blockSize, length-blockSize, BlockTransformation::BT_ReverseDirection|BlockTransformation::BT_AllowParallel);
m_cipher->ProcessAndXorBlock(inString, m_register, outString); m_cipher->ProcessAndXorBlock(inString, m_register, outString);
m_register.swap(m_temp); m_register.swap(m_temp);
} }

View File

@ -340,6 +340,7 @@ struct OFB_Mode_ExternalCipher : public CipherModeDocumentation
}; };
CRYPTOPP_DLL_TEMPLATE_CLASS AdditiveCipherTemplate<AbstractPolicyHolder<AdditiveCipherAbstractPolicy, CTR_ModePolicy> >; CRYPTOPP_DLL_TEMPLATE_CLASS AdditiveCipherTemplate<AbstractPolicyHolder<AdditiveCipherAbstractPolicy, CTR_ModePolicy> >;
CRYPTOPP_DLL_TEMPLATE_CLASS CipherModeFinalTemplate_ExternalCipher<ConcretePolicyHolder<Empty, AdditiveCipherTemplate<AbstractPolicyHolder<AdditiveCipherAbstractPolicy, CTR_ModePolicy> > > >;
//! CTR mode //! CTR mode
template <class CIPHER> template <class CIPHER>

View File

@ -4,6 +4,10 @@
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
/*
July 2010: Added support for AES-NI instructions via compiler intrinsics.
*/
/* /*
Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
@ -69,6 +73,10 @@ being unloaded from L1 cache, until that round is finished.
#include "misc.h" #include "misc.h"
#include "cpu.h" #include "cpu.h"
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#include <wmmintrin.h>
#endif
NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(CryptoPP)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
@ -198,20 +206,83 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
m_rounds = keylen/4 + 6; m_rounds = keylen/4 + 6;
m_key.New(4*(m_rounds+1)); m_key.New(4*(m_rounds+1));
word32 temp, *rk = m_key; word32 *rk = m_key;
const word32 *rc = rcon;
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86)
// MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
if (HasAESNI())
{
static const word32 rcLE[] = {
0x01, 0x02, 0x04, 0x08,
0x10, 0x20, 0x40, 0x80,
0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
};
const word32 *rc = rcLE;
__m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
memcpy(rk, userKey, keylen);
while (true)
{
rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
if (rk + keylen/4 + 4 == m_key.end())
break;
if (keylen == 24)
{
rk[10] = rk[ 4] ^ rk[ 9];
rk[11] = rk[ 5] ^ rk[10];
temp = _mm_insert_epi32(temp, rk[11], 3);
}
else if (keylen == 32)
{
temp = _mm_insert_epi32(temp, rk[11], 3);
rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
rk[13] = rk[ 5] ^ rk[12];
rk[14] = rk[ 6] ^ rk[13];
rk[15] = rk[ 7] ^ rk[14];
temp = _mm_insert_epi32(temp, rk[15], 3);
}
else
temp = _mm_insert_epi32(temp, rk[7], 3);
rk += keylen/4;
}
if (!IsForwardTransformation())
{
rk = m_key;
unsigned int i, j;
std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
{
temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
*(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
*(__m128i *)(rk+j) = temp;
}
*(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
}
return;
}
#endif
GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen); GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
const word32 *rc = rcon;
word32 temp;
while (true) while (true)
{ {
temp = rk[keylen/4-1]; temp = rk[keylen/4-1];
rk[keylen/4] = rk[0] ^ word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
(word32(Se[GETBYTE(temp, 2)]) << 24) ^ rk[keylen/4] = rk[0] ^ x ^ *(rc++);
(word32(Se[GETBYTE(temp, 1)]) << 16) ^
(word32(Se[GETBYTE(temp, 0)]) << 8) ^
Se[GETBYTE(temp, 3)] ^
*(rc++);
rk[keylen/4+1] = rk[1] ^ rk[keylen/4]; rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1]; rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2]; rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
@ -227,11 +298,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
else if (keylen == 32) else if (keylen == 32)
{ {
temp = rk[11]; temp = rk[11];
rk[12] = rk[ 4] ^ rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
(word32(Se[GETBYTE(temp, 3)]) << 24) ^
(word32(Se[GETBYTE(temp, 2)]) << 16) ^
(word32(Se[GETBYTE(temp, 1)]) << 8) ^
Se[GETBYTE(temp, 0)];
rk[13] = rk[ 5] ^ rk[12]; rk[13] = rk[ 5] ^ rk[12];
rk[14] = rk[ 6] ^ rk[13]; rk[14] = rk[ 6] ^ rk[13];
rk[15] = rk[ 7] ^ rk[14]; rk[15] = rk[ 7] ^ rk[14];
@ -239,10 +306,15 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
rk += keylen/4; rk += keylen/4;
} }
rk = m_key;
if (IsForwardTransformation()) if (IsForwardTransformation())
{ {
if (!s_TeFilled) if (!s_TeFilled)
FillEncTable(); FillEncTable();
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
} }
else else
{ {
@ -250,35 +322,37 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
FillDecTable(); FillDecTable();
unsigned int i, j; unsigned int i, j;
rk = m_key;
/* invert the order of the round keys: */ #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
for (i = 0, j = 4*m_rounds; i < j; i += 4, j -= 4) {
temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp; for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; {
temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp; temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
} }
#define InverseMixColumn(x) x = TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)]) rk[i+0] = InverseMixColumn(rk[i+0]);
rk[i+1] = InverseMixColumn(rk[i+1]);
rk[i+2] = InverseMixColumn(rk[i+2]);
rk[i+3] = InverseMixColumn(rk[i+3]);
/* apply the inverse MixColumn transform to all round keys but the first and the last: */ temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
for (i = 1; i < m_rounds; i++) { temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
rk += 4; temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
InverseMixColumn(rk[0]); temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
InverseMixColumn(rk[1]);
InverseMixColumn(rk[2]);
InverseMixColumn(rk[3]);
}
} }
ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key.begin(), m_key.begin(), 16); #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16); if (HasAESNI())
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
} }
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{ {
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
if (HasSSE2()) if (HasSSE2())
{ {
Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
@ -354,6 +428,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{ {
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
if (HasAESNI())
{
Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
return;
}
#endif
typedef BlockGetAndPut<word32, NativeByteOrder> Block; typedef BlockGetAndPut<word32, NativeByteOrder> Block;
word32 s0, s1, s2, s3, t0, t1, t2, t3; word32 s0, s1, s2, s3, t0, t1, t2, t3;
@ -913,14 +995,200 @@ static inline bool AliasedWithTable(const byte *begin, const byte *end)
return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0); return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
} }
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
// Encrypts a single 16-byte block in place using the AES-NI round instructions.
//   block   - state to encrypt; overwritten with the result
//   subkeys - round keys; indices 0..rounds are read (rounds+1 entries)
//   rounds  - number of AES rounds
// The middle round keys are consumed two per loop iteration, so each of
// subkeys[1..rounds-1] is applied exactly once only when rounds is even.
inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
	// initial round-key XOR
	__m128i state = _mm_xor_si128(block, subkeys[0]);

	unsigned int round = 1;
	while (round < rounds-1)
	{
		state = _mm_aesenc_si128(state, subkeys[round]);
		state = _mm_aesenc_si128(state, subkeys[round+1]);
		round += 2;
	}

	// last full round, then the final round
	state = _mm_aesenc_si128(state, subkeys[rounds-1]);
	block = _mm_aesenclast_si128(state, subkeys[rounds]);
}
// Encrypts four 16-byte blocks in place under the same key schedule.
// Every round key is loaded once and then applied to block0..block3 in turn
// before the next key is fetched.
//   block0..block3 - states to encrypt; each is overwritten with its result
//   subkeys        - round keys; indices 0..rounds are read (rounds+1 entries)
//   rounds         - number of AES rounds
inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
	// initial round-key XOR
	const __m128i firstKey = subkeys[0];
	block0 = _mm_xor_si128(block0, firstKey);
	block1 = _mm_xor_si128(block1, firstKey);
	block2 = _mm_xor_si128(block2, firstKey);
	block3 = _mm_xor_si128(block3, firstKey);

	// full rounds 1 .. rounds-1
	unsigned int round = 1;
	while (round < rounds)
	{
		const __m128i roundKey = subkeys[round];
		block0 = _mm_aesenc_si128(block0, roundKey);
		block1 = _mm_aesenc_si128(block1, roundKey);
		block2 = _mm_aesenc_si128(block2, roundKey);
		block3 = _mm_aesenc_si128(block3, roundKey);
		++round;
	}

	// final round
	const __m128i lastKey = subkeys[rounds];
	block0 = _mm_aesenclast_si128(block0, lastKey);
	block1 = _mm_aesenclast_si128(block1, lastKey);
	block2 = _mm_aesenclast_si128(block2, lastKey);
	block3 = _mm_aesenclast_si128(block3, lastKey);
}
// Decrypts a single 16-byte block in place using the AES-NI round instructions.
//   block   - state to decrypt; overwritten with the result
//   subkeys - decryption round keys; indices 0..rounds are read (rounds+1 entries)
//   rounds  - number of AES rounds
// The middle round keys are consumed two per loop iteration, so each of
// subkeys[1..rounds-1] is applied exactly once only when rounds is even.
inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
	// initial round-key XOR
	__m128i state = _mm_xor_si128(block, subkeys[0]);

	unsigned int round = 1;
	while (round < rounds-1)
	{
		state = _mm_aesdec_si128(state, subkeys[round]);
		state = _mm_aesdec_si128(state, subkeys[round+1]);
		round += 2;
	}

	// last full round, then the final round
	state = _mm_aesdec_si128(state, subkeys[rounds-1]);
	block = _mm_aesdeclast_si128(state, subkeys[rounds]);
}
// Decrypts four 16-byte blocks in place under the same key schedule.
// Every round key is loaded once and then applied to block0..block3 in turn
// before the next key is fetched.
//   block0..block3 - states to decrypt; each is overwritten with its result
//   subkeys        - decryption round keys; indices 0..rounds are read (rounds+1 entries)
//   rounds         - number of AES rounds
inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
	// initial round-key XOR
	const __m128i firstKey = subkeys[0];
	block0 = _mm_xor_si128(block0, firstKey);
	block1 = _mm_xor_si128(block1, firstKey);
	block2 = _mm_xor_si128(block2, firstKey);
	block3 = _mm_xor_si128(block3, firstKey);

	// full rounds 1 .. rounds-1
	unsigned int round = 1;
	while (round < rounds)
	{
		const __m128i roundKey = subkeys[round];
		block0 = _mm_aesdec_si128(block0, roundKey);
		block1 = _mm_aesdec_si128(block1, roundKey);
		block2 = _mm_aesdec_si128(block2, roundKey);
		block3 = _mm_aesdec_si128(block3, roundKey);
		++round;
	}

	// final round
	const __m128i lastKey = subkeys[rounds];
	block0 = _mm_aesdeclast_si128(block0, lastKey);
	block1 = _mm_aesdeclast_si128(block1, lastKey);
	block2 = _mm_aesdeclast_si128(block2, lastKey);
	block3 = _mm_aesdeclast_si128(block3, lastKey);
}
// Per-block counter step for the 4-block path of AESNI_AdvancedProcessBlocks.
// It is added to the counter block with _mm_add_epi32; on little-endian x86 the
// value 1<<24 in the last 32-bit lane is a +1 on byte 15 of the block, the same
// byte the single-block path increments directly. A carry out of that byte is
// dropped by the lane-wise add.
static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
// Shared AES-NI driver used by the Rijndael AdvancedProcessBlocks() overrides.
//
//   func1     - callable (block, subkeys, rounds) that transforms one block in place
//   func4     - callable (b0, b1, b2, b3, subkeys, rounds) that transforms four blocks
//   subkeys   - round keys, passed through to func1/func4
//   rounds    - number of rounds, passed through to func1/func4
//   inBlocks  - input data, or the counter block when BT_InBlockIsCounter is set
//   xorBlocks - optional data XORed into the input (BT_XorInput) or into the output
//               (flag clear); may be NULL when BT_XorInput is not set
//   outBlocks - destination buffer
//   length    - number of bytes to process
//   flags     - combination of BlockTransformation::BT_* flags
//
// Returns the number of trailing bytes that were not processed (always < 16).
//
// All loads and stores are unaligned, so the buffers need no particular alignment.
// With BT_InBlockIsCounter the counter at inBlocks is updated in place (const is
// cast away); only byte 15 is advanced and no carry is propagated here, so the
// caller is expected to handle counter wrap-around.
template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
	size_t blockSize = 16;
	size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
	size_t xorIncrement = xorBlocks ? blockSize : 0;
	size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;

	if (flags & BlockTransformation::BT_ReverseDirection)
	{
		// Start at the last block and walk backwards; the negated increments
		// rely on unsigned wrap-around when added to the pointers.
		assert(length % blockSize == 0);
		inBlocks += length - blockSize;
		// A NULL xorBlocks must stay NULL: offsetting it would make the
		// "xorBlocks != NULL" tests below succeed and load from a bogus address.
		if (xorBlocks)
			xorBlocks += length - blockSize;
		outBlocks += length - blockSize;
		inIncrement = 0-inIncrement;
		xorIncrement = 0-xorIncrement;
		outIncrement = 0-outIncrement;
	}

	if (flags & BlockTransformation::BT_AllowParallel)
	{
		// Four blocks per iteration so that func4 can interleave them.
		while (length >= 4*blockSize)
		{
			__m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
			if (flags & BlockTransformation::BT_InBlockIsCounter)
			{
				// Derive the next three counter values and write the fifth
				// back so the following iteration continues from there.
				const __m128i be1 = *(const __m128i *)s_one;
				block1 = _mm_add_epi32(block0, be1);
				block2 = _mm_add_epi32(block1, be1);
				block3 = _mm_add_epi32(block2, be1);
				_mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
			}
			else
			{
				inBlocks += inIncrement;
				block1 = _mm_loadu_si128((const __m128i *)inBlocks);
				inBlocks += inIncrement;
				block2 = _mm_loadu_si128((const __m128i *)inBlocks);
				inBlocks += inIncrement;
				block3 = _mm_loadu_si128((const __m128i *)inBlocks);
				inBlocks += inIncrement;
			}

			if (flags & BlockTransformation::BT_XorInput)
			{
				// XOR before the cipher
				block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
			}

			func4(block0, block1, block2, block3, subkeys, rounds);

			if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
			{
				// XOR after the cipher
				block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
			}

			_mm_storeu_si128((__m128i *)outBlocks, block0);
			outBlocks += outIncrement;
			_mm_storeu_si128((__m128i *)outBlocks, block1);
			outBlocks += outIncrement;
			_mm_storeu_si128((__m128i *)outBlocks, block2);
			outBlocks += outIncrement;
			_mm_storeu_si128((__m128i *)outBlocks, block3);
			outBlocks += outIncrement;

			length -= 4*blockSize;
		}
	}

	// Remaining blocks (or all of them without BT_AllowParallel), one at a time.
	while (length >= blockSize)
	{
		__m128i block = _mm_loadu_si128((const __m128i *)inBlocks);

		if (flags & BlockTransformation::BT_XorInput)
			block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

		// The counter value was already loaded above; advance it for the next block.
		if (flags & BlockTransformation::BT_InBlockIsCounter)
			const_cast<byte *>(inBlocks)[15]++;

		func1(block, subkeys, rounds);

		if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
			block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

		_mm_storeu_si128((__m128i *)outBlocks, block);

		inBlocks += inIncrement;
		outBlocks += outIncrement;
		xorBlocks += xorIncrement;
		length -= blockSize;
	}

	return length;
}
#endif
size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{ {
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
if (HasAESNI())
return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
if (length < BLOCKSIZE)
return length;
if (HasSSE2()) if (HasSSE2())
{ {
if (length < BLOCKSIZE)
return length;
struct Locals struct Locals
{ {
word32 subkeys[4*12], workspace[8]; word32 subkeys[4*12], workspace[8];
@ -966,15 +1234,27 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
locals.keysBegin = (12-keysToCopy)*16; locals.keysBegin = (12-keysToCopy)*16;
Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key); Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
return length%16; return length % BLOCKSIZE;
} }
else
#endif #endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
} }
#endif #endif
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
// Bulk decryption entry point. When the CPU reports AES-NI the blocks go through
// AESNI_AdvancedProcessBlocks with the decryption round functions; otherwise the
// generic BlockTransformation implementation is used. Returns whatever the
// selected implementation returns.
size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
	if (!HasAESNI())
		return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);

	// m_key is handed to the AES-NI helpers as an array of 128-bit round keys
	const __m128i *roundKeys = (const __m128i *)m_key.begin();
	return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, roundKeys, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
NAMESPACE_END NAMESPACE_END
#endif #endif

View File

@ -50,6 +50,9 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat
{ {
public: public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
}; };
public: public:

View File

@ -252,6 +252,7 @@ bool TestSettings()
cout << "passed: "; cout << "passed: ";
cout << "hasMMX == " << hasMMX << ", hasISSE == " << hasISSE << ", hasSSE2 == " << hasSSE2 << ", hasSSSE3 == " << hasSSSE3 << ", hasAESNI == " << HasAESNI() << ", hasCLMUL == " << HasCLMUL() << ", isP4 == " << isP4 << ", cacheLineSize == " << cacheLineSize; cout << "hasMMX == " << hasMMX << ", hasISSE == " << hasISSE << ", hasSSE2 == " << hasSSE2 << ", hasSSSE3 == " << hasSSSE3 << ", hasAESNI == " << HasAESNI() << ", hasCLMUL == " << HasCLMUL() << ", isP4 == " << isP4 << ", cacheLineSize == " << cacheLineSize;
cout << ", AESNI_INTRINSICS == " << CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE << endl;
if (!pass) if (!pass)
{ {