add support for AES-NI and CLMUL instruction sets in AES and GMAC/GCM
parent
a070ff16ae
commit
7adbf89681
15
Readme.txt
15
Readme.txt
|
|
@ -414,19 +414,30 @@ the mailing list.
|
||||||
- ported to MSVC 2008, GCC 4.2, Sun CC 5.9, Intel C++ Compiler 10.0,
|
- ported to MSVC 2008, GCC 4.2, Sun CC 5.9, Intel C++ Compiler 10.0,
|
||||||
and Borland C++Builder 2007
|
and Borland C++Builder 2007
|
||||||
|
|
||||||
5.6 - added AuthenticatedSymmetricCipher interface class and Filter wrappers
|
5.6.0 - added AuthenticatedSymmetricCipher interface class and Filter wrappers
|
||||||
- added CCM, GCM (with SSE2 assembly), EAX, CMAC, XSalsa20, and SEED
|
- added CCM, GCM (with SSE2 assembly), EAX, CMAC, XSalsa20, and SEED
|
||||||
- added support for variable length IVs
|
- added support for variable length IVs
|
||||||
- added OIDs for Brainpool elliptic curve parameters
|
- added OIDs for Brainpool elliptic curve parameters
|
||||||
- improved AES and SHA-256 speed on x86 and x64
|
- improved AES and SHA-256 speed on x86 and x64
|
||||||
|
- changed BlockTransformation interface to no longer assume data alignment
|
||||||
- fixed incorrect VMAC computation on message lengths
|
- fixed incorrect VMAC computation on message lengths
|
||||||
that are >64 mod 128 (x86 assembly version is not affected)
|
that are >64 mod 128 (x86 assembly version is not affected)
|
||||||
- fixed compiler error in vmac.cpp on x86 with GCC -fPIC
|
- fixed compiler error in vmac.cpp on x86 with GCC -fPIC
|
||||||
- fixed run-time validation error on x86-64 with GCC 4.3.2 -O2
|
- fixed run-time validation error on x86-64 with GCC 4.3.2 -O2
|
||||||
- fixed HashFilter bug when putMessage=true
|
- fixed HashFilter bug when putMessage=true
|
||||||
|
- fixed AES-CTR data alignment bug that causes incorrect encryption on ARM
|
||||||
- removed WORD64_AVAILABLE; compiler support for 64-bit int is now required
|
- removed WORD64_AVAILABLE; compiler support for 64-bit int is now required
|
||||||
- ported to GCC 4.3, C++Builder 2009, Sun CC 5.10, Intel C++ Compiler 11
|
- ported to GCC 4.3, C++Builder 2009, Sun CC 5.10, Intel C++ Compiler 11
|
||||||
|
|
||||||
5.6.1 - switched to a public domain implementation of MARS
|
5.6.1 - added support for AES-NI and CLMUL instruction sets in AES and GMAC/GCM
|
||||||
|
- removed WAKE-CFB
|
||||||
|
- fixed several bugs in the SHA-256 x86/x64 assembly code:
|
||||||
|
* incorrect hash on non-SSE2 x86 machines on non-aligned input
|
||||||
|
* incorrect hash on x86 machines when input crosses 0x80000000
|
||||||
|
* incorrect hash on x64 when compiled with GCC with optimizations enabled
|
||||||
|
- fixed bugs in AES x86 and x64 assembly causing crashes in some MSVC build configurations
|
||||||
|
- switched to a public domain implementation of MARS
|
||||||
|
- ported to MSVC 2010, Sun Studio 12u1
|
||||||
|
- renamed the MSVC DLL project to "cryptopp" for compatibility with MSVC 2010
|
||||||
|
|
||||||
Written by Wei Dai
|
Written by Wei Dai
|
||||||
|
|
|
||||||
11
bench.cpp
11
bench.cpp
|
|
@ -10,6 +10,7 @@
|
||||||
#include "hex.h"
|
#include "hex.h"
|
||||||
#include "modes.h"
|
#include "modes.h"
|
||||||
#include "factory.h"
|
#include "factory.h"
|
||||||
|
#include "cpu.h"
|
||||||
|
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
@ -242,14 +243,24 @@ void BenchmarkAll(double t, double hertz)
|
||||||
cout << "<THEAD><TR><TH>Algorithm<TH>MiB/Second" << cpb << "<TH>Microseconds to<br>Setup Key and IV" << cpk << endl;
|
cout << "<THEAD><TR><TH>Algorithm<TH>MiB/Second" << cpb << "<TH>Microseconds to<br>Setup Key and IV" << cpk << endl;
|
||||||
|
|
||||||
cout << "\n<TBODY style=\"background: yellow\">";
|
cout << "\n<TBODY style=\"background: yellow\">";
|
||||||
|
if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && HasCLMUL())
|
||||||
|
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM");
|
||||||
|
else
|
||||||
|
{
|
||||||
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM (2K tables)", MakeParameters(Name::TableSize(), 2048));
|
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM (2K tables)", MakeParameters(Name::TableSize(), 2048));
|
||||||
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM (64K tables)", MakeParameters(Name::TableSize(), 64*1024));
|
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/GCM", 0, "AES/GCM (64K tables)", MakeParameters(Name::TableSize(), 64*1024));
|
||||||
|
}
|
||||||
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/CCM");
|
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/CCM");
|
||||||
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/EAX");
|
BenchMarkByName2<AuthenticatedSymmetricCipher, AuthenticatedSymmetricCipher>("AES/EAX");
|
||||||
|
|
||||||
cout << "\n<TBODY style=\"background: white\">";
|
cout << "\n<TBODY style=\"background: white\">";
|
||||||
|
if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && HasCLMUL())
|
||||||
|
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES)");
|
||||||
|
else
|
||||||
|
{
|
||||||
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (2K tables)", MakeParameters(Name::TableSize(), 2048));
|
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (2K tables)", MakeParameters(Name::TableSize(), 2048));
|
||||||
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (64K tables)", MakeParameters(Name::TableSize(), 64*1024));
|
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (64K tables)", MakeParameters(Name::TableSize(), 64*1024));
|
||||||
|
}
|
||||||
BenchMarkByName<MessageAuthenticationCode>("VMAC(AES)-64");
|
BenchMarkByName<MessageAuthenticationCode>("VMAC(AES)-64");
|
||||||
BenchMarkByName<MessageAuthenticationCode>("VMAC(AES)-128");
|
BenchMarkByName<MessageAuthenticationCode>("VMAC(AES)-128");
|
||||||
BenchMarkByName<MessageAuthenticationCode>("HMAC(SHA-1)");
|
BenchMarkByName<MessageAuthenticationCode>("HMAC(SHA-1)");
|
||||||
|
|
|
||||||
7
config.h
7
config.h
|
|
@ -257,6 +257,7 @@ NAMESPACE_END
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !defined(CRYPTOPP_DISABLE_ASM) && ((defined(_MSC_VER) && defined(_M_IX86)) || (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))))
|
#if !defined(CRYPTOPP_DISABLE_ASM) && ((defined(_MSC_VER) && defined(_M_IX86)) || (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))))
|
||||||
|
// C++Builder 2010 does not allow "call label" where label is defined within inline assembly
|
||||||
#define CRYPTOPP_X86_ASM_AVAILABLE
|
#define CRYPTOPP_X86_ASM_AVAILABLE
|
||||||
|
|
||||||
#if !defined(CRYPTOPP_DISABLE_SSE2) && (defined(CRYPTOPP_MSVC6PP_OR_LATER) || CRYPTOPP_GCC_VERSION >= 30300)
|
#if !defined(CRYPTOPP_DISABLE_SSE2) && (defined(CRYPTOPP_MSVC6PP_OR_LATER) || CRYPTOPP_GCC_VERSION >= 30300)
|
||||||
|
|
@ -288,6 +289,12 @@ NAMESPACE_END
|
||||||
#define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0
|
#define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// AES-NI / CLMUL intrinsics are enabled only when SSE2 intrinsics are enabled AND the
// compiler is new enough (GCC version >= 40400, MSVC full version >= 150030729,
// or Intel C++ >= 1110).
// CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE is always defined (as either 0 or 1), so its
// VALUE has to be tested here; defined() would be true even when SSE2 intrinsics are
// disabled and would switch AES-NI code on for builds that cannot compile it.
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && (CRYPTOPP_GCC_VERSION >= 40400 || _MSC_FULL_VER >= 150030729 || __INTEL_COMPILER >= 1110)
	#define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 1
#else
	#define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0
#endif
|
||||||
|
|
||||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
|
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
|
||||||
#define CRYPTOPP_BOOL_ALIGN16_ENABLED 1
|
#define CRYPTOPP_BOOL_ALIGN16_ENABLED 1
|
||||||
#else
|
#else
|
||||||
|
|
|
||||||
52
cpu.cpp
52
cpu.cpp
|
|
@ -8,7 +8,7 @@
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
#ifdef __GNUC__
|
#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
#include <setjmp.h>
|
#include <setjmp.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -19,9 +19,19 @@
|
||||||
|
|
||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
#ifdef CRYPTOPP_X86_ASM_AVAILABLE
|
#ifdef CRYPTOPP_CPUID_AVAILABLE
|
||||||
|
|
||||||
#ifndef _MSC_VER
|
#if _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64
|
||||||
|
|
||||||
|
// CPUID wrapper for the branch guarded by _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64.
// input:  the CPUID function number to query.
// output: receives the values written by the __cpuid intrinsic
//         (assumes room for four word32 values - TODO confirm against callers).
// Always returns true in this variant.
bool CpuId(word32 input, word32 *output)
{
	int *const regs = (int *)output;	// __cpuid writes through an int pointer
	__cpuid(regs, input);
	return true;
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
|
||||||
typedef void (*SigHandler)(int);
|
typedef void (*SigHandler)(int);
|
||||||
|
|
||||||
static jmp_buf s_jmpNoCPUID;
|
static jmp_buf s_jmpNoCPUID;
|
||||||
|
|
@ -29,11 +39,17 @@ static void SigIllHandlerCPUID(int)
|
||||||
{
|
{
|
||||||
longjmp(s_jmpNoCPUID, 1);
|
longjmp(s_jmpNoCPUID, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Jump buffer used to get back out of SigIllHandlerSSE2.
// NOTE(review): presumably armed with setjmp() inside TrySSE2() before the SSE2 probe - confirm.
static jmp_buf s_jmpNoSSE2;

// SIGILL handler installed by TrySSE2() through signal(). It never returns normally:
// control resumes at the setjmp() point for s_jmpNoSSE2, which then yields 1.
static void SigIllHandlerSSE2(int /*signalNumber*/)
{
	enum {RESUME_VALUE = 1};
	longjmp(s_jmpNoSSE2, RESUME_VALUE);
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool CpuId(word32 input, word32 *output)
|
bool CpuId(word32 input, word32 *output)
|
||||||
{
|
{
|
||||||
#ifdef _MSC_VER
|
#ifdef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
|
||||||
__try
|
__try
|
||||||
{
|
{
|
||||||
__asm
|
__asm
|
||||||
|
|
@ -80,31 +96,13 @@ bool CpuId(word32 input, word32 *output)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef _MSC_VER
|
|
||||||
static jmp_buf s_jmpNoSSE2;
|
|
||||||
static void SigIllHandlerSSE2(int)
|
|
||||||
{
|
|
||||||
longjmp(s_jmpNoSSE2, 1);
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#elif _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64
|
|
||||||
|
|
||||||
bool CpuId(word32 input, word32 *output)
|
|
||||||
{
|
|
||||||
__cpuid((int *)output, input);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef CRYPTOPP_CPUID_AVAILABLE
|
|
||||||
|
|
||||||
static bool TrySSE2()
|
static bool TrySSE2()
|
||||||
{
|
{
|
||||||
#if CRYPTOPP_BOOL_X64
|
#if CRYPTOPP_BOOL_X64
|
||||||
return true;
|
return true;
|
||||||
#elif defined(_MSC_VER)
|
#elif defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
|
||||||
__try
|
__try
|
||||||
{
|
{
|
||||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||||
|
|
@ -119,7 +117,7 @@ static bool TrySSE2()
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
#elif defined(__GNUC__)
|
#else
|
||||||
SigHandler oldHandler = signal(SIGILL, SigIllHandlerSSE2);
|
SigHandler oldHandler = signal(SIGILL, SigIllHandlerSSE2);
|
||||||
if (oldHandler == SIG_ERR)
|
if (oldHandler == SIG_ERR)
|
||||||
return false;
|
return false;
|
||||||
|
|
@ -139,8 +137,6 @@ static bool TrySSE2()
|
||||||
|
|
||||||
signal(SIGILL, oldHandler);
|
signal(SIGILL, oldHandler);
|
||||||
return result;
|
return result;
|
||||||
#else
|
|
||||||
return false;
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -160,8 +156,8 @@ void DetectX86Features()
|
||||||
if ((cpuid1[3] & (1 << 26)) != 0)
|
if ((cpuid1[3] & (1 << 26)) != 0)
|
||||||
g_hasSSE2 = TrySSE2();
|
g_hasSSE2 = TrySSE2();
|
||||||
g_hasSSSE3 = g_hasSSE2 && (cpuid1[2] & (1<<9));
|
g_hasSSSE3 = g_hasSSE2 && (cpuid1[2] & (1<<9));
|
||||||
g_hasAESNI = (cpuid1[2] & (1<<25)) != 0;
|
g_hasAESNI = g_hasSSE2 && (cpuid1[2] & (1<<25));
|
||||||
g_hasCLMUL = (cpuid1[2] & (1<<1)) != 0;
|
g_hasCLMUL = g_hasSSE2 && (cpuid1[2] & (1<<1));
|
||||||
|
|
||||||
if ((cpuid1[3] & (1 << 25)) != 0)
|
if ((cpuid1[3] & (1 << 25)) != 0)
|
||||||
g_hasISSE = true;
|
g_hasISSE = true;
|
||||||
|
|
|
||||||
48
cpu.h
48
cpu.h
|
|
@ -18,22 +18,18 @@
|
||||||
|
|
||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
#if defined(CRYPTOPP_X86_ASM_AVAILABLE) || (_MSC_VER >= 1400 && CRYPTOPP_BOOL_X64)
|
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
|
||||||
|
|
||||||
#define CRYPTOPP_CPUID_AVAILABLE
|
#define CRYPTOPP_CPUID_AVAILABLE
|
||||||
|
|
||||||
// these should not be used directly
|
// these should not be used directly
|
||||||
extern CRYPTOPP_DLL bool g_x86DetectionDone;
|
extern CRYPTOPP_DLL bool g_x86DetectionDone;
|
||||||
extern CRYPTOPP_DLL bool g_hasSSE2;
|
|
||||||
extern CRYPTOPP_DLL bool g_hasISSE;
|
|
||||||
extern CRYPTOPP_DLL bool g_hasMMX;
|
|
||||||
extern CRYPTOPP_DLL bool g_hasSSSE3;
|
extern CRYPTOPP_DLL bool g_hasSSSE3;
|
||||||
extern CRYPTOPP_DLL bool g_hasAESNI;
|
extern CRYPTOPP_DLL bool g_hasAESNI;
|
||||||
extern CRYPTOPP_DLL bool g_hasCLMUL;
|
extern CRYPTOPP_DLL bool g_hasCLMUL;
|
||||||
extern CRYPTOPP_DLL bool g_isP4;
|
extern CRYPTOPP_DLL bool g_isP4;
|
||||||
extern CRYPTOPP_DLL word32 g_cacheLineSize;
|
extern CRYPTOPP_DLL word32 g_cacheLineSize;
|
||||||
CRYPTOPP_DLL void CRYPTOPP_API DetectX86Features();
|
CRYPTOPP_DLL void CRYPTOPP_API DetectX86Features();
|
||||||
|
|
||||||
CRYPTOPP_DLL bool CRYPTOPP_API CpuId(word32 input, word32 *output);
|
CRYPTOPP_DLL bool CRYPTOPP_API CpuId(word32 input, word32 *output);
|
||||||
|
|
||||||
#if CRYPTOPP_BOOL_X64
|
#if CRYPTOPP_BOOL_X64
|
||||||
|
|
@ -42,6 +38,10 @@ inline bool HasISSE() {return true;}
|
||||||
inline bool HasMMX() {return true;}
|
inline bool HasMMX() {return true;}
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
extern CRYPTOPP_DLL bool g_hasSSE2;
|
||||||
|
extern CRYPTOPP_DLL bool g_hasISSE;
|
||||||
|
extern CRYPTOPP_DLL bool g_hasMMX;
|
||||||
|
|
||||||
inline bool HasSSE2()
|
inline bool HasSSE2()
|
||||||
{
|
{
|
||||||
if (!g_x86DetectionDone)
|
if (!g_x86DetectionDone)
|
||||||
|
|
@ -107,22 +107,8 @@ inline int GetCacheLineSize()
|
||||||
return CRYPTOPP_L1_CACHE_LINE_SIZE;
|
return CRYPTOPP_L1_CACHE_LINE_SIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool HasSSSE3() {return false;}
|
|
||||||
inline bool IsP4() {return false;}
|
|
||||||
|
|
||||||
// assume MMX and SSE2 if intrinsics are enabled
|
|
||||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_X64
|
|
||||||
inline bool HasSSE2() {return true;}
|
|
||||||
inline bool HasISSE() {return true;}
|
|
||||||
inline bool HasMMX() {return true;}
|
|
||||||
#else
|
|
||||||
inline bool HasSSE2() {return false;}
|
|
||||||
inline bool HasISSE() {return false;}
|
|
||||||
inline bool HasMMX() {return false;}
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE || _MSC_VER >= 1400
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef CRYPTOPP_GENERATE_X64_MASM
|
#ifdef CRYPTOPP_GENERATE_X64_MASM
|
||||||
|
|
@ -134,7 +120,19 @@ inline bool HasMMX() {return false;}
|
||||||
#define ASJ(x, y, z) x label##y*newline*
|
#define ASJ(x, y, z) x label##y*newline*
|
||||||
#define ASC(x, y) x label##y*newline*
|
#define ASC(x, y) x label##y*newline*
|
||||||
#define AS_HEX(y) 0##y##h
|
#define AS_HEX(y) 0##y##h
|
||||||
#elif defined(__GNUC__)
|
#elif defined(_MSC_VER) || defined(__BORLANDC__)
|
||||||
|
#define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
|
||||||
|
#define AS1(x) __asm {x}
|
||||||
|
#define AS2(x, y) __asm {x, y}
|
||||||
|
#define AS3(x, y, z) __asm {x, y, z}
|
||||||
|
#define ASS(x, y, a, b, c, d) __asm {x, y, (a)*64+(b)*16+(c)*4+(d)}
|
||||||
|
#define ASL(x) __asm {label##x:}
|
||||||
|
#define ASJ(x, y, z) __asm {x label##y}
|
||||||
|
#define ASC(x, y) __asm {x label##y}
|
||||||
|
#define CRYPTOPP_NAKED __declspec(naked)
|
||||||
|
#define AS_HEX(y) 0x##y
|
||||||
|
#else
|
||||||
|
#define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
|
||||||
// define these in two steps to allow arguments to be expanded
|
// define these in two steps to allow arguments to be expanded
|
||||||
#define GNU_AS1(x) #x ";"
|
#define GNU_AS1(x) #x ";"
|
||||||
#define GNU_AS2(x, y) #x ", " #y ";"
|
#define GNU_AS2(x, y) #x ", " #y ";"
|
||||||
|
|
@ -150,16 +148,6 @@ inline bool HasMMX() {return false;}
|
||||||
#define ASC(x, y) #x " " #y ";"
|
#define ASC(x, y) #x " " #y ";"
|
||||||
#define CRYPTOPP_NAKED
|
#define CRYPTOPP_NAKED
|
||||||
#define AS_HEX(y) 0x##y
|
#define AS_HEX(y) 0x##y
|
||||||
#else
|
|
||||||
#define AS1(x) __asm {x}
|
|
||||||
#define AS2(x, y) __asm {x, y}
|
|
||||||
#define AS3(x, y, z) __asm {x, y, z}
|
|
||||||
#define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)}
|
|
||||||
#define ASL(x) __asm {label##x:}
|
|
||||||
#define ASJ(x, y, z) __asm {x label##y}
|
|
||||||
#define ASC(x, y) __asm {x label##y}
|
|
||||||
#define CRYPTOPP_NAKED __declspec(naked)
|
|
||||||
#define AS_HEX(y) 0x##y
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define IF0(y)
|
#define IF0(y)
|
||||||
|
|
|
||||||
|
|
@ -456,7 +456,7 @@ public:
|
||||||
//! return number of blocks that can be processed in parallel, for bit-slicing implementations
|
//! return number of blocks that can be processed in parallel, for bit-slicing implementations
|
||||||
virtual unsigned int OptimalNumberOfParallelBlocks() const {return 1;}
|
virtual unsigned int OptimalNumberOfParallelBlocks() const {return 1;}
|
||||||
|
|
||||||
enum {BT_InBlockIsCounter=1, BT_DontIncrementInOutPointers=2, BT_XorInput=4, BT_ReverseDirection=8} FlagsForAdvancedProcessBlocks;
|
enum {BT_InBlockIsCounter=1, BT_DontIncrementInOutPointers=2, BT_XorInput=4, BT_ReverseDirection=8, BT_AllowParallel=16} FlagsForAdvancedProcessBlocks;
|
||||||
|
|
||||||
//! encrypt and xor blocks according to flags (see FlagsForAdvancedProcessBlocks)
|
//! encrypt and xor blocks according to flags (see FlagsForAdvancedProcessBlocks)
|
||||||
/*! /note If BT_InBlockIsCounter is set, last byte of inBlocks may be modified. */
|
/*! /note If BT_InBlockIsCounter is set, last byte of inBlocks may be modified. */
|
||||||
|
|
|
||||||
41
datatest.cpp
41
datatest.cpp
|
|
@ -57,15 +57,15 @@ const std::string & GetRequiredDatum(const TestData &data, const char *name)
|
||||||
return i->second;
|
return i->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
void RandomizedTransfer(BufferedTransformation &source, BufferedTransformation &target, bool finish)
|
void RandomizedTransfer(BufferedTransformation &source, BufferedTransformation &target, bool finish, const std::string &channel=DEFAULT_CHANNEL)
|
||||||
{
|
{
|
||||||
while (source.MaxRetrievable() > (finish ? 0 : 4096))
|
while (source.MaxRetrievable() > (finish ? 0 : 4096))
|
||||||
{
|
{
|
||||||
byte buf[4096+64];
|
byte buf[4096+64];
|
||||||
word32 start = GlobalRNG().GenerateWord32(0, 63);
|
size_t start = GlobalRNG().GenerateWord32(0, 63);
|
||||||
word32 len = GlobalRNG().GenerateWord32(1, UnsignedMin(4096U, source.MaxRetrievable()));
|
size_t len = GlobalRNG().GenerateWord32(1, UnsignedMin(4096U, 3*source.MaxRetrievable()/2));
|
||||||
source.Get(buf+start, len);
|
len = source.Get(buf+start, len);
|
||||||
target.Put(buf+start, len);
|
target.ChannelPut(channel, buf+start, len);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -397,9 +397,9 @@ void TestSymmetricCipher(TestData &v, const NameValuePairs &overrideParameters)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
StringSource ss(plaintext, false, new StreamTransformationFilter(*encryptor, new StringSink(encrypted), StreamTransformationFilter::NO_PADDING));
|
StreamTransformationFilter encFilter(*encryptor, new StringSink(encrypted), StreamTransformationFilter::NO_PADDING);
|
||||||
ss.Pump(plaintext.size()/2 + 1);
|
RandomizedTransfer(StringStore(plaintext).Ref(), encFilter, true);
|
||||||
ss.PumpAll();
|
encFilter.MessageEnd();
|
||||||
/*{
|
/*{
|
||||||
std::string z;
|
std::string z;
|
||||||
encryptor->Seek(seek);
|
encryptor->Seek(seek);
|
||||||
|
|
@ -422,14 +422,14 @@ void TestSymmetricCipher(TestData &v, const NameValuePairs &overrideParameters)
|
||||||
{
|
{
|
||||||
std::cout << "incorrectly encrypted: ";
|
std::cout << "incorrectly encrypted: ";
|
||||||
StringSource xx(encrypted, false, new HexEncoder(new FileSink(std::cout)));
|
StringSource xx(encrypted, false, new HexEncoder(new FileSink(std::cout)));
|
||||||
xx.Pump(256); xx.Flush(false);
|
xx.Pump(2048); xx.Flush(false);
|
||||||
std::cout << "\n";
|
std::cout << "\n";
|
||||||
SignalTestFailure();
|
SignalTestFailure();
|
||||||
}
|
}
|
||||||
std::string decrypted;
|
std::string decrypted;
|
||||||
StringSource dd(encrypted, false, new StreamTransformationFilter(*decryptor, new StringSink(decrypted), StreamTransformationFilter::NO_PADDING));
|
StreamTransformationFilter decFilter(*decryptor, new StringSink(decrypted), StreamTransformationFilter::NO_PADDING);
|
||||||
dd.Pump(plaintext.size()/2 + 1);
|
RandomizedTransfer(StringStore(encrypted).Ref(), decFilter, true);
|
||||||
dd.PumpAll();
|
decFilter.MessageEnd();
|
||||||
if (decrypted != plaintext)
|
if (decrypted != plaintext)
|
||||||
{
|
{
|
||||||
std::cout << "incorrectly decrypted: ";
|
std::cout << "incorrectly decrypted: ";
|
||||||
|
|
@ -484,27 +484,24 @@ void TestAuthenticatedSymmetricCipher(TestData &v, const NameValuePairs &overrid
|
||||||
StringStore sh(header), sp(plaintext), sc(ciphertext), sf(footer), sm(mac);
|
StringStore sh(header), sp(plaintext), sc(ciphertext), sf(footer), sm(mac);
|
||||||
|
|
||||||
if (macAtBegin)
|
if (macAtBegin)
|
||||||
sm.TransferTo(df);
|
RandomizedTransfer(sm, df, true);
|
||||||
sh.CopyTo(df, LWORD_MAX, AAD_CHANNEL);
|
sh.CopyTo(df, LWORD_MAX, AAD_CHANNEL);
|
||||||
sc.TransferTo(df);
|
RandomizedTransfer(sc, df, true);
|
||||||
sf.CopyTo(df, LWORD_MAX, AAD_CHANNEL);
|
sf.CopyTo(df, LWORD_MAX, AAD_CHANNEL);
|
||||||
if (!macAtBegin)
|
if (!macAtBegin)
|
||||||
sm.TransferTo(df);
|
RandomizedTransfer(sm, df, true);
|
||||||
df.MessageEnd();
|
df.MessageEnd();
|
||||||
|
|
||||||
sh.TransferTo(ef, sh.MaxRetrievable()/2+1, AAD_CHANNEL);
|
RandomizedTransfer(sh, ef, true, AAD_CHANNEL);
|
||||||
sh.TransferTo(ef, LWORD_MAX, AAD_CHANNEL);
|
RandomizedTransfer(sp, ef, true);
|
||||||
sp.TransferTo(ef, sp.MaxRetrievable()/2+1);
|
RandomizedTransfer(sf, ef, true, AAD_CHANNEL);
|
||||||
sp.TransferTo(ef);
|
|
||||||
sf.TransferTo(ef, sf.MaxRetrievable()/2+1, AAD_CHANNEL);
|
|
||||||
sf.TransferTo(ef, LWORD_MAX, AAD_CHANNEL);
|
|
||||||
ef.MessageEnd();
|
ef.MessageEnd();
|
||||||
|
|
||||||
if (test == "Encrypt" && encrypted != ciphertext+mac)
|
if (test == "Encrypt" && encrypted != ciphertext+mac)
|
||||||
{
|
{
|
||||||
std::cout << "incorrectly encrypted: ";
|
std::cout << "incorrectly encrypted: ";
|
||||||
StringSource xx(encrypted, false, new HexEncoder(new FileSink(std::cout)));
|
StringSource xx(encrypted, false, new HexEncoder(new FileSink(std::cout)));
|
||||||
xx.Pump(256); xx.Flush(false);
|
xx.Pump(2048); xx.Flush(false);
|
||||||
std::cout << "\n";
|
std::cout << "\n";
|
||||||
SignalTestFailure();
|
SignalTestFailure();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -95,7 +95,7 @@ size_t FileStore::TransferTo2(BufferedTransformation &target, lword &transferByt
|
||||||
|
|
||||||
m_stream->read((char *)m_space, (unsigned int)STDMIN(size, (lword)spaceSize));
|
m_stream->read((char *)m_space, (unsigned int)STDMIN(size, (lword)spaceSize));
|
||||||
}
|
}
|
||||||
m_len = m_stream->gcount();
|
m_len = (size_t)m_stream->gcount();
|
||||||
size_t blockedBytes;
|
size_t blockedBytes;
|
||||||
output:
|
output:
|
||||||
blockedBytes = target.ChannelPutModifiable2(channel, m_space, m_len, 0, blocking);
|
blockedBytes = target.ChannelPutModifiable2(channel, m_space, m_len, 0, blocking);
|
||||||
|
|
@ -242,7 +242,7 @@ size_t FileSink::Put2(const byte *inString, size_t length, int messageEnd, bool
|
||||||
size = numeric_limits<std::streamsize>::max();
|
size = numeric_limits<std::streamsize>::max();
|
||||||
m_stream->write((const char *)inString, size);
|
m_stream->write((const char *)inString, size);
|
||||||
inString += size;
|
inString += size;
|
||||||
length -= size;
|
length -= (size_t)size;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (messageEnd)
|
if (messageEnd)
|
||||||
|
|
|
||||||
193
gcm.cpp
193
gcm.cpp
|
|
@ -14,6 +14,11 @@
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
|
#include <tmmintrin.h>
|
||||||
|
#include <wmmintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
word16 GCM_Base::s_reductionTable[256];
|
word16 GCM_Base::s_reductionTable[256];
|
||||||
|
|
@ -47,6 +52,21 @@ void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *
|
||||||
}
|
}
|
||||||
Block::Put(NULL, c)(Z0)(Z1);
|
Block::Put(NULL, c)(Z0)(Z1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Software version of _mm_clmulepi64_si128 built on PolynomialMod2 multiplication.
//   a, b - 128-bit operands, each viewed as two 64-bit words
//   i    - selector: bit 0 picks the word of a, (i >> 4) picks the word of b
// Returns the 16 bytes of the product as reported by PolynomialMod2::GetByte(0..15).
// NOTE(review): the ByteReverse calls presumably adapt the word to the byte order
// expected by the PolynomialMod2 byte-array constructor - confirm.
__m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
{
	const word64 *aWords = (const word64 *)&a;
	const word64 *bWords = (const word64 *)&b;
	word64 A[1] = {ByteReverse(aWords[i&1])};
	word64 B[1] = {ByteReverse(bWords[i>>4])};

	PolynomialMod2 lhs((byte *)A, 8);
	PolynomialMod2 rhs((byte *)B, 8);
	PolynomialMod2 product = lhs*rhs;

	__m128i output;
	byte *outBytes = (byte *)&output;
	for (int n=0; n<16; n++)
		outBytes[n] = product.GetByte(n);
	return output;
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||||
|
|
@ -66,6 +86,56 @@ inline static void Xor16(byte *a, const byte *b, const byte *c)
|
||||||
((word64 *)a)[1] = ((word64 *)b)[1] ^ ((word64 *)c)[1];
|
((word64 *)a)[1] = ((word64 *)b)[1] ^ ((word64 *)c)[1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
|
static CRYPTOPP_ALIGN_DATA(16) const word64 s_clmulConstants64[] = {
|
||||||
|
0xe100000000000000, 0xc200000000000000,
|
||||||
|
0x08090a0b0c0d0e0f, 0x0001020304050607,
|
||||||
|
0x0001020304050607, 0x08090a0b0c0d0e0f};
|
||||||
|
static const __m128i *s_clmulConstants = (const __m128i *)s_clmulConstants64;
|
||||||
|
static const unsigned int s_clmulTableSizeInBlocks = 8;
|
||||||
|
|
||||||
|
// Reduces the three partial products c0, c1, c2 of a carry-less multiplication
// to a single 128-bit value, using the multipliers held in r.
inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r)
{
	/*
	The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. c0t below refers to the most
	significant half of c0 as a polynomial, which, due to GCM's bit reflection, are in the
	rightmost bit positions, and the lowest byte addresses.

	c1 ^= c0t * 0xc200000000000000
	c2t ^= c0t
	t = shift (c1t ^ c0b) left 1 bit
	c2 ^= t * 0xe100000000000000
	c2t ^= c1b
	shift c2 left 1 bit and xor in lowest bit of c1t
	*/
#if 0	// MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301
	c2 = _mm_xor_si128(c2, _mm_move_epi64(c0));
#else
	c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8));
#endif
	// fold the low word of c0, times the high word of r, into c1
	const __m128i mid = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10));

	// t = (c0 shifted down 8 bytes) ^ mid, with each 64-bit lane shifted left one bit
	__m128i t = _mm_xor_si128(_mm_srli_si128(c0, 8), mid);
	t = _mm_slli_epi64(t, 1);

	// fold the low word of t, times the low word of r, and the upper half of mid into c2
	__m128i acc = _mm_xor_si128(c2, _mm_clmulepi64_si128(t, r, 0));
	acc = _mm_xor_si128(acc, _mm_srli_si128(mid, 8));

	// shift acc left one bit per lane; the top bits of mid's and acc's low words
	// are brought in as bit 0 of the low and high lanes respectively
	const __m128i carryIn = _mm_srli_epi64(_mm_unpacklo_epi64(mid, acc), 63);
	return _mm_xor_si128(_mm_slli_epi64(acc, 1), carryIn);
}
|
||||||
|
|
||||||
|
// Multiplies x by h with four 64x64 carry-less multiplications and passes the
// three partial products to CLMUL_Reduce together with the constants in r.
inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r)
{
	// selector 0x00: low word of x times low word of h
	const __m128i lowByLow = _mm_clmulepi64_si128(x,h,0);
	// selectors 0x01 and 0x10: the two cross products, combined with XOR
	const __m128i crossTerms = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10));
	// selector 0x11: high word of x times high word of h
	const __m128i highByHigh = _mm_clmulepi64_si128(x,h,0x11);

	return CLMUL_Reduce(lowByLow, crossTerms, highByHigh, r);
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs ¶ms)
|
void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs ¶ms)
|
||||||
{
|
{
|
||||||
BlockCipher &blockCipher = AccessBlockCipher();
|
BlockCipher &blockCipher = AccessBlockCipher();
|
||||||
|
|
@ -74,7 +144,17 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
|
||||||
if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE)
|
if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE)
|
||||||
throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
|
throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16");
|
||||||
|
|
||||||
int tableSize;
|
int tableSize, i, j, k;
|
||||||
|
|
||||||
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
|
if (HasCLMUL())
|
||||||
|
{
|
||||||
|
params.GetIntValue(Name::TableSize(), tableSize); // avoid "parameter not used" error
|
||||||
|
tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
if (params.GetIntValue(Name::TableSize(), tableSize))
|
if (params.GetIntValue(Name::TableSize(), tableSize))
|
||||||
tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024;
|
tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024;
|
||||||
else
|
else
|
||||||
|
|
@ -84,16 +164,36 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
|
||||||
// VC 2003 workaround: compiler generates bad code for 64K tables
|
// VC 2003 workaround: compiler generates bad code for 64K tables
|
||||||
tableSize = 2*1024;
|
tableSize = 2*1024;
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
m_buffer.resize(3*REQUIRED_BLOCKSIZE + tableSize);
|
m_buffer.resize(3*REQUIRED_BLOCKSIZE + tableSize);
|
||||||
|
byte *table = MulTable();
|
||||||
byte *hashKey = HashKey();
|
byte *hashKey = HashKey();
|
||||||
memset(hashKey, 0, REQUIRED_BLOCKSIZE);
|
memset(hashKey, 0, REQUIRED_BLOCKSIZE);
|
||||||
blockCipher.ProcessBlock(hashKey);
|
blockCipher.ProcessBlock(hashKey);
|
||||||
|
|
||||||
byte *table = MulTable();
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
int i, j, k;
|
if (HasCLMUL())
|
||||||
word64 V0, V1;
|
{
|
||||||
|
const __m128i r = s_clmulConstants[0];
|
||||||
|
__m128i h0 = _mm_shuffle_epi8(_mm_load_si128((__m128i *)hashKey), s_clmulConstants[1]);
|
||||||
|
__m128i h = h0;
|
||||||
|
|
||||||
|
for (i=0; i<tableSize; i+=32)
|
||||||
|
{
|
||||||
|
__m128i h1 = CLMUL_GF_Mul(h, h0, r);
|
||||||
|
_mm_storel_epi64((__m128i *)(table+i), h);
|
||||||
|
_mm_storeu_si128((__m128i *)(table+i+16), h1);
|
||||||
|
_mm_storeu_si128((__m128i *)(table+i+8), h);
|
||||||
|
_mm_storel_epi64((__m128i *)(table+i+8), h1);
|
||||||
|
h = CLMUL_GF_Mul(h1, h0, r);
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
word64 V0, V1;
|
||||||
typedef BlockGetAndPut<word64, BigEndian> Block;
|
typedef BlockGetAndPut<word64, BigEndian> Block;
|
||||||
Block::Get(hashKey)(V0)(V1);
|
Block::Get(hashKey)(V0)(V1);
|
||||||
|
|
||||||
|
|
@ -178,6 +278,17 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// When the CLMUL code path is active (compiled in and HasCLMUL() is true), byte-reverses
// the 16-byte hash buffer in place using the shuffle mask in s_clmulConstants[1].
// Does nothing otherwise.
inline void GCM_Base::ReverseHashBufferIfNeeded()
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
	if (!HasCLMUL())
		return;

	__m128i *const hashState = (__m128i *)HashBuffer();
	*hashState = _mm_shuffle_epi8(*hashState, s_clmulConstants[1]);
#endif
}
|
||||||
|
|
||||||
void GCM_Base::Resync(const byte *iv, size_t len)
|
void GCM_Base::Resync(const byte *iv, size_t len)
|
||||||
{
|
{
|
||||||
BlockCipher &cipher = AccessBlockCipher();
|
BlockCipher &cipher = AccessBlockCipher();
|
||||||
|
|
@ -209,6 +320,8 @@ void GCM_Base::Resync(const byte *iv, size_t len)
|
||||||
|
|
||||||
PutBlock<word64, BigEndian, true>(NULL, m_buffer)(0)(origLen*8);
|
PutBlock<word64, BigEndian, true>(NULL, m_buffer)(0)(origLen*8);
|
||||||
GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
|
GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE);
|
||||||
|
|
||||||
|
ReverseHashBufferIfNeeded();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m_state >= State_IVSet)
|
if (m_state >= State_IVSet)
|
||||||
|
|
@ -241,6 +354,73 @@ void GCM_AuthenticateBlocks_64K(const byte *data, size_t blocks, word64 *hashBuf
|
||||||
|
|
||||||
size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
|
size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
|
||||||
{
|
{
|
||||||
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
|
if (HasCLMUL())
|
||||||
|
{
|
||||||
|
const __m128i *table = (const __m128i *)MulTable();
|
||||||
|
__m128i x = _mm_load_si128((__m128i *)HashBuffer());
|
||||||
|
const __m128i r = s_clmulConstants[0], bswapMask = s_clmulConstants[1], bswapMask2 = s_clmulConstants[2];
|
||||||
|
|
||||||
|
while (len >= 16)
|
||||||
|
{
|
||||||
|
size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0;
|
||||||
|
__m128i d, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-1)*16)), bswapMask2);;
|
||||||
|
__m128i c0 = _mm_setzero_si128();
|
||||||
|
__m128i c1 = _mm_setzero_si128();
|
||||||
|
__m128i c2 = _mm_setzero_si128();
|
||||||
|
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
__m128i h0 = _mm_load_si128(table+i);
|
||||||
|
__m128i h1 = _mm_load_si128(table+i+1);
|
||||||
|
__m128i h01 = _mm_xor_si128(h0, h1);
|
||||||
|
|
||||||
|
if (++i == s)
|
||||||
|
{
|
||||||
|
d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)data), bswapMask);
|
||||||
|
d = _mm_xor_si128(d, x);
|
||||||
|
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0));
|
||||||
|
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1));
|
||||||
|
d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||||
|
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-i)*16-8)), bswapMask2);
|
||||||
|
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1));
|
||||||
|
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1));
|
||||||
|
d2 = _mm_xor_si128(d2, d);
|
||||||
|
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h01, 1));
|
||||||
|
|
||||||
|
if (++i == s)
|
||||||
|
{
|
||||||
|
d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)data), bswapMask);
|
||||||
|
d = _mm_xor_si128(d, x);
|
||||||
|
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10));
|
||||||
|
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 0x11));
|
||||||
|
d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||||
|
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-i)*16-8)), bswapMask);
|
||||||
|
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10));
|
||||||
|
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10));
|
||||||
|
d = _mm_xor_si128(d, d2);
|
||||||
|
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10));
|
||||||
|
}
|
||||||
|
data += s*16;
|
||||||
|
len -= s*16;
|
||||||
|
|
||||||
|
c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), c2);
|
||||||
|
x = CLMUL_Reduce(c0, c1, c2, r);
|
||||||
|
}
|
||||||
|
|
||||||
|
_mm_store_si128((__m128i *)HashBuffer(), x);
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
typedef BlockGetAndPut<word64, NativeByteOrder> Block;
|
typedef BlockGetAndPut<word64, NativeByteOrder> Block;
|
||||||
word64 *hashBuffer = (word64 *)HashBuffer();
|
word64 *hashBuffer = (word64 *)HashBuffer();
|
||||||
|
|
||||||
|
|
@ -414,9 +594,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
|
||||||
AS2( shr WORD_REG(dx), 4 )
|
AS2( shr WORD_REG(dx), 4 )
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !defined(_MSC_VER) || (_MSC_VER < 1400)
|
|
||||||
AS_PUSH_IF86( bx)
|
AS_PUSH_IF86( bx)
|
||||||
#endif
|
|
||||||
AS_PUSH_IF86( bp)
|
AS_PUSH_IF86( bp)
|
||||||
|
|
||||||
#ifdef __GNUC__
|
#ifdef __GNUC__
|
||||||
|
|
@ -524,9 +702,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
|
||||||
AS2( movdqa [WORD_REG(si)], xmm0 )
|
AS2( movdqa [WORD_REG(si)], xmm0 )
|
||||||
|
|
||||||
AS_POP_IF86( bp)
|
AS_POP_IF86( bp)
|
||||||
#if !defined(_MSC_VER) || (_MSC_VER < 1400)
|
|
||||||
AS_POP_IF86( bx)
|
AS_POP_IF86( bx)
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __GNUC__
|
#ifdef __GNUC__
|
||||||
".att_syntax prefix;"
|
".att_syntax prefix;"
|
||||||
|
|
@ -647,6 +823,7 @@ void GCM_Base::AuthenticateLastConfidentialBlock()
|
||||||
void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize)
|
void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize)
|
||||||
{
|
{
|
||||||
m_ctr.Seek(0);
|
m_ctr.Seek(0);
|
||||||
|
ReverseHashBufferIfNeeded();
|
||||||
m_ctr.ProcessData(mac, HashBuffer(), macSize);
|
m_ctr.ProcessData(mac, HashBuffer(), macSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
1
gcm.h
1
gcm.h
|
|
@ -63,6 +63,7 @@ protected:
|
||||||
byte *HashBuffer() {return m_buffer+REQUIRED_BLOCKSIZE;}
|
byte *HashBuffer() {return m_buffer+REQUIRED_BLOCKSIZE;}
|
||||||
byte *HashKey() {return m_buffer+2*REQUIRED_BLOCKSIZE;}
|
byte *HashKey() {return m_buffer+2*REQUIRED_BLOCKSIZE;}
|
||||||
byte *MulTable() {return m_buffer+3*REQUIRED_BLOCKSIZE;}
|
byte *MulTable() {return m_buffer+3*REQUIRED_BLOCKSIZE;}
|
||||||
|
inline void ReverseHashBufferIfNeeded();
|
||||||
|
|
||||||
class CRYPTOPP_DLL GCTR : public CTR_Mode_ExternalCipher::Encryption
|
class CRYPTOPP_DLL GCTR : public CTR_Mode_ExternalCipher::Encryption
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -115,7 +115,7 @@ void CTR_ModePolicy::OperateKeystream(KeystreamOperation operation, byte *output
|
||||||
{
|
{
|
||||||
byte lsb = m_counterArray[s-1];
|
byte lsb = m_counterArray[s-1];
|
||||||
size_t blocks = UnsignedMin(iterationCount, 256U-lsb);
|
size_t blocks = UnsignedMin(iterationCount, 256U-lsb);
|
||||||
m_cipher->AdvancedProcessBlocks(m_counterArray, input, output, blocks*s, BlockTransformation::BT_InBlockIsCounter);
|
m_cipher->AdvancedProcessBlocks(m_counterArray, input, output, blocks*s, BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_AllowParallel);
|
||||||
if ((m_counterArray[s-1] = lsb + (byte)blocks) == 0)
|
if ((m_counterArray[s-1] = lsb + (byte)blocks) == 0)
|
||||||
IncrementCounterBy256();
|
IncrementCounterBy256();
|
||||||
|
|
||||||
|
|
@ -147,7 +147,7 @@ void BlockOrientedCipherModeBase::UncheckedSetKey(const byte *key, unsigned int
|
||||||
void ECB_OneWay::ProcessData(byte *outString, const byte *inString, size_t length)
|
void ECB_OneWay::ProcessData(byte *outString, const byte *inString, size_t length)
|
||||||
{
|
{
|
||||||
assert(length%BlockSize()==0);
|
assert(length%BlockSize()==0);
|
||||||
m_cipher->AdvancedProcessBlocks(inString, NULL, outString, length, 0);
|
m_cipher->AdvancedProcessBlocks(inString, NULL, outString, length, BlockTransformation::BT_AllowParallel);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CBC_Encryption::ProcessData(byte *outString, const byte *inString, size_t length)
|
void CBC_Encryption::ProcessData(byte *outString, const byte *inString, size_t length)
|
||||||
|
|
@ -199,7 +199,7 @@ void CBC_Decryption::ProcessData(byte *outString, const byte *inString, size_t l
|
||||||
unsigned int blockSize = BlockSize();
|
unsigned int blockSize = BlockSize();
|
||||||
memcpy(m_temp, inString+length-blockSize, blockSize); // save copy now in case of in-place decryption
|
memcpy(m_temp, inString+length-blockSize, blockSize); // save copy now in case of in-place decryption
|
||||||
if (length > blockSize)
|
if (length > blockSize)
|
||||||
m_cipher->AdvancedProcessBlocks(inString+blockSize, inString, outString+blockSize, length-blockSize, BlockTransformation::BT_ReverseDirection);
|
m_cipher->AdvancedProcessBlocks(inString+blockSize, inString, outString+blockSize, length-blockSize, BlockTransformation::BT_ReverseDirection|BlockTransformation::BT_AllowParallel);
|
||||||
m_cipher->ProcessAndXorBlock(inString, m_register, outString);
|
m_cipher->ProcessAndXorBlock(inString, m_register, outString);
|
||||||
m_register.swap(m_temp);
|
m_register.swap(m_temp);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
1
modes.h
1
modes.h
|
|
@ -340,6 +340,7 @@ struct OFB_Mode_ExternalCipher : public CipherModeDocumentation
|
||||||
};
|
};
|
||||||
|
|
||||||
CRYPTOPP_DLL_TEMPLATE_CLASS AdditiveCipherTemplate<AbstractPolicyHolder<AdditiveCipherAbstractPolicy, CTR_ModePolicy> >;
|
CRYPTOPP_DLL_TEMPLATE_CLASS AdditiveCipherTemplate<AbstractPolicyHolder<AdditiveCipherAbstractPolicy, CTR_ModePolicy> >;
|
||||||
|
CRYPTOPP_DLL_TEMPLATE_CLASS CipherModeFinalTemplate_ExternalCipher<ConcretePolicyHolder<Empty, AdditiveCipherTemplate<AbstractPolicyHolder<AdditiveCipherAbstractPolicy, CTR_ModePolicy> > > >;
|
||||||
|
|
||||||
//! CTR mode
|
//! CTR mode
|
||||||
template <class CIPHER>
|
template <class CIPHER>
|
||||||
|
|
|
||||||
352
rijndael.cpp
352
rijndael.cpp
|
|
@ -4,6 +4,10 @@
|
||||||
|
|
||||||
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
|
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
|
||||||
|
|
||||||
|
/*
|
||||||
|
July 2010: Added support for AES-NI instructions via compiler intrinsics.
|
||||||
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Feb 2009: The x86/x64 assembly code was rewritten in by Wei Dai to do counter mode
|
Feb 2009: The x86/x64 assembly code was rewritten in by Wei Dai to do counter mode
|
||||||
caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
|
caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
|
||||||
|
|
@ -69,6 +73,10 @@ being unloaded from L1 cache, until that round is finished.
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
#include "cpu.h"
|
#include "cpu.h"
|
||||||
|
|
||||||
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
|
#include <wmmintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
|
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
|
||||||
|
|
@ -198,20 +206,83 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
|
||||||
m_rounds = keylen/4 + 6;
|
m_rounds = keylen/4 + 6;
|
||||||
m_key.New(4*(m_rounds+1));
|
m_key.New(4*(m_rounds+1));
|
||||||
|
|
||||||
word32 temp, *rk = m_key;
|
word32 *rk = m_key;
|
||||||
const word32 *rc = rcon;
|
|
||||||
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86)
|
||||||
|
// MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
|
||||||
|
if (HasAESNI())
|
||||||
|
{
|
||||||
|
static const word32 rcLE[] = {
|
||||||
|
0x01, 0x02, 0x04, 0x08,
|
||||||
|
0x10, 0x20, 0x40, 0x80,
|
||||||
|
0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
|
||||||
|
};
|
||||||
|
const word32 *rc = rcLE;
|
||||||
|
|
||||||
|
__m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
|
||||||
|
memcpy(rk, userKey, keylen);
|
||||||
|
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
|
||||||
|
rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
|
||||||
|
rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
|
||||||
|
rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
|
||||||
|
|
||||||
|
if (rk + keylen/4 + 4 == m_key.end())
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (keylen == 24)
|
||||||
|
{
|
||||||
|
rk[10] = rk[ 4] ^ rk[ 9];
|
||||||
|
rk[11] = rk[ 5] ^ rk[10];
|
||||||
|
temp = _mm_insert_epi32(temp, rk[11], 3);
|
||||||
|
}
|
||||||
|
else if (keylen == 32)
|
||||||
|
{
|
||||||
|
temp = _mm_insert_epi32(temp, rk[11], 3);
|
||||||
|
rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
|
||||||
|
rk[13] = rk[ 5] ^ rk[12];
|
||||||
|
rk[14] = rk[ 6] ^ rk[13];
|
||||||
|
rk[15] = rk[ 7] ^ rk[14];
|
||||||
|
temp = _mm_insert_epi32(temp, rk[15], 3);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
temp = _mm_insert_epi32(temp, rk[7], 3);
|
||||||
|
|
||||||
|
rk += keylen/4;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!IsForwardTransformation())
|
||||||
|
{
|
||||||
|
rk = m_key;
|
||||||
|
unsigned int i, j;
|
||||||
|
|
||||||
|
std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
|
||||||
|
|
||||||
|
for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
|
||||||
|
{
|
||||||
|
temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
|
||||||
|
*(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
|
||||||
|
*(__m128i *)(rk+j) = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
*(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
|
GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
|
||||||
|
const word32 *rc = rcon;
|
||||||
|
word32 temp;
|
||||||
|
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
temp = rk[keylen/4-1];
|
temp = rk[keylen/4-1];
|
||||||
rk[keylen/4] = rk[0] ^
|
word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
|
||||||
(word32(Se[GETBYTE(temp, 2)]) << 24) ^
|
rk[keylen/4] = rk[0] ^ x ^ *(rc++);
|
||||||
(word32(Se[GETBYTE(temp, 1)]) << 16) ^
|
|
||||||
(word32(Se[GETBYTE(temp, 0)]) << 8) ^
|
|
||||||
Se[GETBYTE(temp, 3)] ^
|
|
||||||
*(rc++);
|
|
||||||
rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
|
rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
|
||||||
rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
|
rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
|
||||||
rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
|
rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
|
||||||
|
|
@ -227,11 +298,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
|
||||||
else if (keylen == 32)
|
else if (keylen == 32)
|
||||||
{
|
{
|
||||||
temp = rk[11];
|
temp = rk[11];
|
||||||
rk[12] = rk[ 4] ^
|
rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
|
||||||
(word32(Se[GETBYTE(temp, 3)]) << 24) ^
|
|
||||||
(word32(Se[GETBYTE(temp, 2)]) << 16) ^
|
|
||||||
(word32(Se[GETBYTE(temp, 1)]) << 8) ^
|
|
||||||
Se[GETBYTE(temp, 0)];
|
|
||||||
rk[13] = rk[ 5] ^ rk[12];
|
rk[13] = rk[ 5] ^ rk[12];
|
||||||
rk[14] = rk[ 6] ^ rk[13];
|
rk[14] = rk[ 6] ^ rk[13];
|
||||||
rk[15] = rk[ 7] ^ rk[14];
|
rk[15] = rk[ 7] ^ rk[14];
|
||||||
|
|
@ -239,10 +306,15 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
|
||||||
rk += keylen/4;
|
rk += keylen/4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
rk = m_key;
|
||||||
|
|
||||||
if (IsForwardTransformation())
|
if (IsForwardTransformation())
|
||||||
{
|
{
|
||||||
if (!s_TeFilled)
|
if (!s_TeFilled)
|
||||||
FillEncTable();
|
FillEncTable();
|
||||||
|
|
||||||
|
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
|
||||||
|
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
@ -250,35 +322,37 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c
|
||||||
FillDecTable();
|
FillDecTable();
|
||||||
|
|
||||||
unsigned int i, j;
|
unsigned int i, j;
|
||||||
rk = m_key;
|
|
||||||
|
|
||||||
/* invert the order of the round keys: */
|
#define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
|
||||||
for (i = 0, j = 4*m_rounds; i < j; i += 4, j -= 4) {
|
|
||||||
temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
|
for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
|
||||||
temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
|
{
|
||||||
temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
|
temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
|
||||||
temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
|
temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
|
||||||
|
temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
|
||||||
|
temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define InverseMixColumn(x) x = TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
|
rk[i+0] = InverseMixColumn(rk[i+0]);
|
||||||
|
rk[i+1] = InverseMixColumn(rk[i+1]);
|
||||||
|
rk[i+2] = InverseMixColumn(rk[i+2]);
|
||||||
|
rk[i+3] = InverseMixColumn(rk[i+3]);
|
||||||
|
|
||||||
/* apply the inverse MixColumn transform to all round keys but the first and the last: */
|
temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
|
||||||
for (i = 1; i < m_rounds; i++) {
|
temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
|
||||||
rk += 4;
|
temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
|
||||||
InverseMixColumn(rk[0]);
|
temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
|
||||||
InverseMixColumn(rk[1]);
|
|
||||||
InverseMixColumn(rk[2]);
|
|
||||||
InverseMixColumn(rk[3]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key.begin(), m_key.begin(), 16);
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16);
|
if (HasAESNI())
|
||||||
|
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
||||||
{
|
{
|
||||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
|
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
if (HasSSE2())
|
if (HasSSE2())
|
||||||
{
|
{
|
||||||
Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
|
Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
|
||||||
|
|
@ -354,6 +428,14 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
||||||
|
|
||||||
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
||||||
{
|
{
|
||||||
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
|
if (HasAESNI())
|
||||||
|
{
|
||||||
|
Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
typedef BlockGetAndPut<word32, NativeByteOrder> Block;
|
typedef BlockGetAndPut<word32, NativeByteOrder> Block;
|
||||||
|
|
||||||
word32 s0, s1, s2, s3, t0, t1, t2, t3;
|
word32 s0, s1, s2, s3, t0, t1, t2, t3;
|
||||||
|
|
@ -913,14 +995,200 @@ static inline bool AliasedWithTable(const byte *begin, const byte *end)
|
||||||
return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
|
return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
|
|
||||||
|
inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
|
||||||
|
{
|
||||||
|
block = _mm_xor_si128(block, subkeys[0]);
|
||||||
|
for (unsigned int i=1; i<rounds-1; i+=2)
|
||||||
|
{
|
||||||
|
block = _mm_aesenc_si128(block, subkeys[i]);
|
||||||
|
block = _mm_aesenc_si128(block, subkeys[i+1]);
|
||||||
|
}
|
||||||
|
block = _mm_aesenc_si128(block, subkeys[rounds-1]);
|
||||||
|
block = _mm_aesenclast_si128(block, subkeys[rounds]);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
|
||||||
|
{
|
||||||
|
__m128i rk = subkeys[0];
|
||||||
|
block0 = _mm_xor_si128(block0, rk);
|
||||||
|
block1 = _mm_xor_si128(block1, rk);
|
||||||
|
block2 = _mm_xor_si128(block2, rk);
|
||||||
|
block3 = _mm_xor_si128(block3, rk);
|
||||||
|
for (unsigned int i=1; i<rounds; i++)
|
||||||
|
{
|
||||||
|
rk = subkeys[i];
|
||||||
|
block0 = _mm_aesenc_si128(block0, rk);
|
||||||
|
block1 = _mm_aesenc_si128(block1, rk);
|
||||||
|
block2 = _mm_aesenc_si128(block2, rk);
|
||||||
|
block3 = _mm_aesenc_si128(block3, rk);
|
||||||
|
}
|
||||||
|
rk = subkeys[rounds];
|
||||||
|
block0 = _mm_aesenclast_si128(block0, rk);
|
||||||
|
block1 = _mm_aesenclast_si128(block1, rk);
|
||||||
|
block2 = _mm_aesenclast_si128(block2, rk);
|
||||||
|
block3 = _mm_aesenclast_si128(block3, rk);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
|
||||||
|
{
|
||||||
|
block = _mm_xor_si128(block, subkeys[0]);
|
||||||
|
for (unsigned int i=1; i<rounds-1; i+=2)
|
||||||
|
{
|
||||||
|
block = _mm_aesdec_si128(block, subkeys[i]);
|
||||||
|
block = _mm_aesdec_si128(block, subkeys[i+1]);
|
||||||
|
}
|
||||||
|
block = _mm_aesdec_si128(block, subkeys[rounds-1]);
|
||||||
|
block = _mm_aesdeclast_si128(block, subkeys[rounds]);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
|
||||||
|
{
|
||||||
|
__m128i rk = subkeys[0];
|
||||||
|
block0 = _mm_xor_si128(block0, rk);
|
||||||
|
block1 = _mm_xor_si128(block1, rk);
|
||||||
|
block2 = _mm_xor_si128(block2, rk);
|
||||||
|
block3 = _mm_xor_si128(block3, rk);
|
||||||
|
for (unsigned int i=1; i<rounds; i++)
|
||||||
|
{
|
||||||
|
rk = subkeys[i];
|
||||||
|
block0 = _mm_aesdec_si128(block0, rk);
|
||||||
|
block1 = _mm_aesdec_si128(block1, rk);
|
||||||
|
block2 = _mm_aesdec_si128(block2, rk);
|
||||||
|
block3 = _mm_aesdec_si128(block3, rk);
|
||||||
|
}
|
||||||
|
rk = subkeys[rounds];
|
||||||
|
block0 = _mm_aesdeclast_si128(block0, rk);
|
||||||
|
block1 = _mm_aesdeclast_si128(block1, rk);
|
||||||
|
block2 = _mm_aesdeclast_si128(block2, rk);
|
||||||
|
block3 = _mm_aesdeclast_si128(block3, rk);
|
||||||
|
}
|
||||||
|
|
||||||
|
static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
|
||||||
|
|
||||||
|
template <typename F1, typename F4>
|
||||||
|
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||||
|
{
|
||||||
|
size_t blockSize = 16;
|
||||||
|
size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
|
||||||
|
size_t xorIncrement = xorBlocks ? blockSize : 0;
|
||||||
|
size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
|
||||||
|
|
||||||
|
if (flags & BlockTransformation::BT_ReverseDirection)
|
||||||
|
{
|
||||||
|
assert(length % blockSize == 0);
|
||||||
|
inBlocks += length - blockSize;
|
||||||
|
xorBlocks += length - blockSize;
|
||||||
|
outBlocks += length - blockSize;
|
||||||
|
inIncrement = 0-inIncrement;
|
||||||
|
xorIncrement = 0-xorIncrement;
|
||||||
|
outIncrement = 0-outIncrement;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (flags & BlockTransformation::BT_AllowParallel)
|
||||||
|
{
|
||||||
|
while (length >= 4*blockSize)
|
||||||
|
{
|
||||||
|
__m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
|
||||||
|
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||||
|
{
|
||||||
|
const __m128i be1 = *(const __m128i *)s_one;
|
||||||
|
block1 = _mm_add_epi32(block0, be1);
|
||||||
|
block2 = _mm_add_epi32(block1, be1);
|
||||||
|
block3 = _mm_add_epi32(block2, be1);
|
||||||
|
_mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
inBlocks += inIncrement;
|
||||||
|
block1 = _mm_loadu_si128((const __m128i *)inBlocks);
|
||||||
|
inBlocks += inIncrement;
|
||||||
|
block2 = _mm_loadu_si128((const __m128i *)inBlocks);
|
||||||
|
inBlocks += inIncrement;
|
||||||
|
block3 = _mm_loadu_si128((const __m128i *)inBlocks);
|
||||||
|
inBlocks += inIncrement;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (flags & BlockTransformation::BT_XorInput)
|
||||||
|
{
|
||||||
|
block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
|
||||||
|
xorBlocks += xorIncrement;
|
||||||
|
block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
|
||||||
|
xorBlocks += xorIncrement;
|
||||||
|
block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
|
||||||
|
xorBlocks += xorIncrement;
|
||||||
|
block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
|
||||||
|
xorBlocks += xorIncrement;
|
||||||
|
}
|
||||||
|
|
||||||
|
func4(block0, block1, block2, block3, subkeys, rounds);
|
||||||
|
|
||||||
|
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
|
||||||
|
{
|
||||||
|
block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
|
||||||
|
xorBlocks += xorIncrement;
|
||||||
|
block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
|
||||||
|
xorBlocks += xorIncrement;
|
||||||
|
block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
|
||||||
|
xorBlocks += xorIncrement;
|
||||||
|
block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
|
||||||
|
xorBlocks += xorIncrement;
|
||||||
|
}
|
||||||
|
|
||||||
|
_mm_storeu_si128((__m128i *)outBlocks, block0);
|
||||||
|
outBlocks += outIncrement;
|
||||||
|
_mm_storeu_si128((__m128i *)outBlocks, block1);
|
||||||
|
outBlocks += outIncrement;
|
||||||
|
_mm_storeu_si128((__m128i *)outBlocks, block2);
|
||||||
|
outBlocks += outIncrement;
|
||||||
|
_mm_storeu_si128((__m128i *)outBlocks, block3);
|
||||||
|
outBlocks += outIncrement;
|
||||||
|
|
||||||
|
length -= 4*blockSize;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
while (length >= blockSize)
|
||||||
|
{
|
||||||
|
__m128i block = _mm_loadu_si128((const __m128i *)inBlocks);
|
||||||
|
|
||||||
|
if (flags & BlockTransformation::BT_XorInput)
|
||||||
|
block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));
|
||||||
|
|
||||||
|
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||||
|
const_cast<byte *>(inBlocks)[15]++;
|
||||||
|
|
||||||
|
func1(block, subkeys, rounds);
|
||||||
|
|
||||||
|
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
|
||||||
|
block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));
|
||||||
|
|
||||||
|
_mm_storeu_si128((__m128i *)outBlocks, block);
|
||||||
|
|
||||||
|
inBlocks += inIncrement;
|
||||||
|
outBlocks += outIncrement;
|
||||||
|
xorBlocks += xorIncrement;
|
||||||
|
length -= blockSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
|
size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
|
||||||
{
|
{
|
||||||
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
|
if (HasAESNI())
|
||||||
|
return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
|
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
|
||||||
|
if (HasSSE2())
|
||||||
|
{
|
||||||
if (length < BLOCKSIZE)
|
if (length < BLOCKSIZE)
|
||||||
return length;
|
return length;
|
||||||
|
|
||||||
if (HasSSE2())
|
|
||||||
{
|
|
||||||
struct Locals
|
struct Locals
|
||||||
{
|
{
|
||||||
word32 subkeys[4*12], workspace[8];
|
word32 subkeys[4*12], workspace[8];
|
||||||
|
|
@ -966,15 +1234,27 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo
|
||||||
locals.keysBegin = (12-keysToCopy)*16;
|
locals.keysBegin = (12-keysToCopy)*16;
|
||||||
|
|
||||||
Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
|
Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
|
||||||
return length%16;
|
return length % BLOCKSIZE;
|
||||||
}
|
}
|
||||||
else
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
|
|
||||||
|
size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
|
||||||
|
{
|
||||||
|
if (HasAESNI())
|
||||||
|
return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
|
|
||||||
|
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
|
|
||||||
NAMESPACE_END
|
NAMESPACE_END
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -50,6 +50,9 @@ class CRYPTOPP_DLL Rijndael : public Rijndael_Info, public BlockCipherDocumentat
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||||
|
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||||
|
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
|
||||||
|
|
@ -252,6 +252,7 @@ bool TestSettings()
|
||||||
cout << "passed: ";
|
cout << "passed: ";
|
||||||
|
|
||||||
cout << "hasMMX == " << hasMMX << ", hasISSE == " << hasISSE << ", hasSSE2 == " << hasSSE2 << ", hasSSSE3 == " << hasSSSE3 << ", hasAESNI == " << HasAESNI() << ", hasCLMUL == " << HasCLMUL() << ", isP4 == " << isP4 << ", cacheLineSize == " << cacheLineSize;
|
cout << "hasMMX == " << hasMMX << ", hasISSE == " << hasISSE << ", hasSSE2 == " << hasSSE2 << ", hasSSSE3 == " << hasSSSE3 << ", hasAESNI == " << HasAESNI() << ", hasCLMUL == " << HasCLMUL() << ", isP4 == " << isP4 << ", cacheLineSize == " << cacheLineSize;
|
||||||
|
cout << ", AESNI_INTRINSICS == " << CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE << endl;
|
||||||
|
|
||||||
if (!pass)
|
if (!pass)
|
||||||
{
|
{
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue