Add Intel SHA1 extension support

pull/347/head
Jeffrey Walton 2016-12-01 00:49:59 -05:00
parent 6970ef702d
commit 7ab9b00f90
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
5 changed files with 233 additions and 30 deletions

View File

@ -502,11 +502,10 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0 #define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0
#endif #endif
// AVX2 in MSC 18.00 #if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHA) && !defined(_M_ARM) && ((_MSC_VER >= 1900) || (CRYPTOPP_GCC_VERSION >= 50000) || defined(__SHA__))
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_AVX) && !defined(_M_ARM) && ((_MSC_VER >= 1600) || (defined(__RDRND__) || defined(__RDSEED__) || defined(__AVX__))) #define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 1
#define CRYPTOPP_BOOL_AVX_AVAILABLE 1
#else #else
#define CRYPTOPP_BOOL_AVX_AVAILABLE 0 #define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 0
#endif #endif
// Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains. // Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.

View File

@ -502,11 +502,10 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0 #define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0
#endif #endif
// AVX2 in MSC 18.00 #if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHA) && !defined(_M_ARM) && ((_MSC_VER >= 1900) || (CRYPTOPP_GCC_VERSION >= 50000) || defined(__SHA__))
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_AVX) && !defined(_M_ARM) && ((_MSC_VER >= 1600) || (defined(__RDRND__) || defined(__RDSEED__) || defined(__AVX__))) #define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 1
#define CRYPTOPP_BOOL_AVX_AVAILABLE 1
#else #else
#define CRYPTOPP_BOOL_AVX_AVAILABLE 0 #define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 0
#endif #endif
// Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains. // Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.

9
cpu.h
View File

@ -47,12 +47,9 @@
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
# include <wmmintrin.h> // aesenc, aesdec, etc # include <wmmintrin.h> // aesenc, aesdec, etc
#endif // wmmintrin.h #endif // wmmintrin.h
#if CRYPTOPP_BOOL_AVX_INTRINSICS_AVAILABLE #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
# include <immintrin.h> // RDRAND, RDSEED and AVX # include <immintrin.h> // RDRAND, RDSEED, AVX, SHA
#endif #endif // immintrin.h
#if CRYPTOPP_BOOL_AVX2_INTRINSICS_AVAILABLE
# include <zmmintrin.h> // AVX 512-bit extensions
#endif
#endif // X86/X64/X32 Headers #endif // X86/X64/X32 Headers
// Applies to both X86/X32/X64 and ARM32/ARM64. And we've got MIPS devices on the way. // Applies to both X86/X32/X64 and ARM32/ARM64. And we've got MIPS devices on the way.

237
sha.cpp
View File

@ -1,7 +1,7 @@
// sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c // sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c
// Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. // Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton
// Both are in the public domain. // implemented Intel SHA extensions. All are in the public domain.
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
@ -29,20 +29,13 @@
NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(CryptoPP)
// start of Steve Reid's code ////////////////////////////////
// start of Steve Reid's code //
////////////////////////////////
#define blk0(i) (W[i] = data[i]) #define blk0(i) (W[i] = data[i])
#define blk1(i) (W[i&15] = rotlFixed(W[(i+13)&15]^W[(i+8)&15]^W[(i+2)&15]^W[i&15],1)) #define blk1(i) (W[i&15] = rotlFixed(W[(i+13)&15]^W[(i+8)&15]^W[(i+2)&15]^W[i&15],1))
void SHA1::InitState(HashWordType *state)
{
state[0] = 0x67452301L;
state[1] = 0xEFCDAB89L;
state[2] = 0x98BADCFEL;
state[3] = 0x10325476L;
state[4] = 0xC3D2E1F0L;
}
#define f1(x,y,z) (z^(x&(y^z))) #define f1(x,y,z) (z^(x&(y^z)))
#define f2(x,y,z) (x^y^z) #define f2(x,y,z) (x^y^z)
#define f3(x,y,z) ((x&y)|(z&(x|y))) #define f3(x,y,z) ((x&y)|(z&(x|y)))
@ -55,7 +48,7 @@ void SHA1::InitState(HashWordType *state)
#define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDC+rotlFixed(v,5);w=rotlFixed(w,30); #define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDC+rotlFixed(v,5);w=rotlFixed(w,30);
#define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6+rotlFixed(v,5);w=rotlFixed(w,30); #define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6+rotlFixed(v,5);w=rotlFixed(w,30);
void SHA1::Transform(word32 *state, const word32 *data) static void SHA1_CXX_Transform(word32 *state, const word32 *data)
{ {
word32 W[16]; word32 W[16];
/* Copy context->state[] to working vars */ /* Copy context->state[] to working vars */
@ -93,7 +86,223 @@ void SHA1::Transform(word32 *state, const word32 *data)
state[4] += e; state[4] += e;
} }
// end of Steve Reid's code //////////////////////////////
// end of Steve Reid's code //
//////////////////////////////
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
static void SHA1_SHAEXT_Transform(word32 *state, const word32 *data)
{
__m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
__m128i MASK, MSG0, MSG1, MSG2, MSG3;
word32 T[16];
ByteReverse(T, data, 64);
// Load initial values
ABCD = _mm_loadu_si128((__m128i*) state);
E0 = _mm_set_epi32(state[4], 0, 0, 0);
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
MASK = _mm_set_epi64x(W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f));
// Save current hash
ABCD_SAVE = ABCD;
E0_SAVE = E0;
// Rounds 0-3
MSG0 = _mm_loadu_si128((__m128i*) T+0);
MSG0 = _mm_shuffle_epi8(MSG0, MASK);
E0 = _mm_add_epi32(E0, MSG0);
E1 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
// Rounds 4-7
MSG1 = _mm_loadu_si128((__m128i*) (T+4));
MSG1 = _mm_shuffle_epi8(MSG1, MASK);
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
// Rounds 8-11
MSG2 = _mm_loadu_si128((__m128i*) (T+8));
MSG2 = _mm_shuffle_epi8(MSG2, MASK);
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
// Rounds 12-15
MSG3 = _mm_loadu_si128((__m128i*) (T+12));
MSG3 = _mm_shuffle_epi8(MSG3, MASK);
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
// Rounds 16-19
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
// Rounds 20-23
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
// Rounds 24-27
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
// Rounds 28-31
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
// Rounds 32-35
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
// Rounds 36-39
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
// Rounds 40-43
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
// Rounds 44-47
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
// Rounds 48-51
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
// Rounds 52-55
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
// Rounds 56-59
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
// Rounds 60-63
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
// Rounds 64-67
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
// Rounds 68-71
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
MSG3 = _mm_xor_si128(MSG3, MSG1);
// Rounds 72-75
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
// Rounds 76-79
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
// Add values back to state
E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
// Save state
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
_mm_storeu_si128((__m128i*) state, ABCD);
*(state+4) = _mm_extract_epi32(E0, 3);
}
#endif
typedef void (*pfnSHA1Transform)(word32 *state, const word32 *data);
pfnSHA1Transform InitializeSHA1Transform()
{
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
return &SHA1_SHAEXT_Transform;
else
#endif
return &SHA1_CXX_Transform;
}
void SHA1::InitState(HashWordType *state)
{
state[0] = 0x67452301L;
state[1] = 0xEFCDAB89L;
state[2] = 0x98BADCFEL;
state[3] = 0x10325476L;
state[4] = 0xC3D2E1F0L;
}
void SHA1::Transform(word32 *state, const word32 *data)
{
static const pfnSHA1Transform s_pfn = InitializeSHA1Transform();
s_pfn(state, data);
}
// ************************************************************* // *************************************************************

3
sha.h
View File

@ -1,7 +1,6 @@
// sha.h - written and placed in the public domain by Wei Dai // sha.h - written and placed in the public domain by Wei Dai
//! \file //! \file sha.h
//! \headerfile sha.h
//! \brief Classes for SHA-1 and SHA-2 family of message digests //! \brief Classes for SHA-1 and SHA-2 family of message digests
#ifndef CRYPTOPP_SHA_H #ifndef CRYPTOPP_SHA_H