Fix bad SHA::Transform calculation (Issue 455)

Reworked SHA class internals to align all the implementations. Formerly all hashes were software based, IterHashBase handled endian conversions, IterHashBase repeatedly called the single block SHA{N}::Transform. The rework added SHA{N}::HashMultipleBlocks, and the SHA classes attempt to always use it.

Now SHA{N}::Transform calls into SHA{N}_HashMultipleBlocks, which is a free standing function. An added wrinkle is hardware wants little endian data and software presents big endian data, so HashMultipleBlocks accepts a ByteOrder for the incoming data. Hardware based SHA{N}_HashMultipleBlocks can often perform the endian swap much easier by setting an EPI mask so it was profitable to defer to hardware when available.

The rework also removed the hacked-in pointers to implementations. The class now looks more like AES, GCM, etc.
pull/461/head
Jeffrey Walton 2017-08-13 16:05:39 -04:00
parent 863bf9133c
commit 2aff92ddb6
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
5 changed files with 589 additions and 561 deletions

View File

@ -95,7 +95,7 @@ static void Rijndael_Dec_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byt
# define MAYBE_CONST const
#endif
// Clang casts
// Clang __m128i casts
#define M128I_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x))

View File

@ -38,12 +38,8 @@ word32 SEAL_Gamma::Apply(word32 i)
word32 shaIndex = i/5;
if (shaIndex != lastIndex)
{
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
D[0] = ConditionalByteReverse(HasSHA() ? BIG_ENDIAN_ORDER : LITTLE_ENDIAN_ORDER, shaIndex);
#else
D[0] = shaIndex;
#endif
memcpy(Z, H, 20);
D[0] = shaIndex;
SHA1::Transform(Z, D);
lastIndex = shaIndex;
}

553
sha.cpp
View File

@ -6,6 +6,16 @@
// code from Johannes Schneiders, Skip Hovsmith and Barry O'Rourke.
// All code is in the public domain.
// In August 2017 Walton reworked the internals to align all the implementations.
// Formerly all hashes were software based, IterHashBase handled endian conversions,
// IterHashBase repeatedly called the single block SHA{N}::Transform. The rework
// added SHA{N}::HashMultipleBlocks, and the SHA classes attempt to always use it.
// Now SHA{N}::Transform calls into SHA{N}::HashMultipleBlocks. An added wrinkle is
// hardware is little endian and software is big endian, so HashMultipleBlocks
// accepts a ByteOrder for the incoming data. Hardware based SHA{N}::HashMultipleBlocks
// can often perform the endian swap much easier by setting an EPI mask. The rework
// also removed the hacked-in pointers to implementations.
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
#include "pch.h"
@ -30,11 +40,11 @@
# undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#endif
NAMESPACE_BEGIN(CryptoPP)
// Clang __m128i casts
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
// Function pointer for specific SHA1 or SHA256 Transform function
typedef void (*pfnSHATransform)(word32 *state, const word32 *data);
typedef void (CRYPTOPP_FASTCALL *pfnSHAHashBlocks)(word32 *state, const word32 *data, size_t length);
NAMESPACE_BEGIN(CryptoPP)
////////////////////////////////
// start of Steve Reid's code //
@ -55,8 +65,11 @@ typedef void (CRYPTOPP_FASTCALL *pfnSHAHashBlocks)(word32 *state, const word32 *
#define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDC+rotlFixed(v,5);w=rotlFixed(w,30);
#define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6+rotlFixed(v,5);w=rotlFixed(w,30);
static void SHA1_CXX_Transform(word32 *state, const word32 *data)
static void SHA1_CXX_HashBlock(word32 *state, const word32 *data)
{
CRYPTOPP_ASSERT(state);
CRYPTOPP_ASSERT(data);
word32 W[16];
/* Copy context->state[] to working vars */
word32 a = state[0];
@ -103,30 +116,42 @@ static void SHA1_CXX_Transform(word32 *state, const word32 *data)
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
static void SHA1_SHANI_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
CRYPTOPP_ASSERT(state);
CRYPTOPP_ASSERT(data);
CRYPTOPP_ASSERT(length >= 64);
__m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
__m128i MASK, MSG0, MSG1, MSG2, MSG3;
// Load initial values
ABCD = _mm_loadu_si128((__m128i*) state);
ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
E0 = _mm_set_epi32(state[4], 0, 0, 0);
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
MASK = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);
// IA-32 SHA is little endian, SHA::Transform is big endian,
// and SHA::HashMultipleBlocks can be either. ByteOrder
// allows us to avoid extra endian reversals. It saves 1.0 cpb.
MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
_mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
_mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
while (length >= 64)
{
// Save current hash
ABCD_SAVE = ABCD;
E0_SAVE = E0;
// Rounds 0-3
MSG0 = _mm_loadu_si128((__m128i*) data+0);
MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
MSG0 = _mm_shuffle_epi8(MSG0, MASK);
E0 = _mm_add_epi32(E0, MSG0);
E1 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
// Rounds 4-7
MSG1 = _mm_loadu_si128((__m128i*) (data+4));
MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
MSG1 = _mm_shuffle_epi8(MSG1, MASK);
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
@ -134,7 +159,7 @@ static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
// Rounds 8-11
MSG2 = _mm_loadu_si128((__m128i*) (data+8));
MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
MSG2 = _mm_shuffle_epi8(MSG2, MASK);
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
@ -143,7 +168,7 @@ static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
MSG0 = _mm_xor_si128(MSG0, MSG2);
// Rounds 12-15
MSG3 = _mm_loadu_si128((__m128i*) (data+12));
MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
MSG3 = _mm_shuffle_epi8(MSG3, MASK);
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
@ -278,9 +303,13 @@ static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
data += 16;
length -= 64;
}
// Save state
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
_mm_storeu_si128((__m128i*) state, ABCD);
_mm_storeu_si128(M128_CAST(state), ABCD);
state[4] = _mm_extract_epi32(E0, 3);
}
#endif
@ -294,8 +323,12 @@ static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
//////////////////////////////////////////////////////////////
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
static void SHA1_ARM_SHA_Transform(word32 *state, const word32 *data)
static void SHA1_ARM_SHA_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
CRYPTOPP_ASSERT(state);
CRYPTOPP_ASSERT(data);
CRYPTOPP_ASSERT(length >= 64);
uint32x4_t C0, C1, C2, C3;
uint32x4_t ABCD, ABCD_SAVED;
uint32x4_t MSG0, MSG1, MSG2, MSG3;
@ -311,6 +344,8 @@ static void SHA1_ARM_SHA_Transform(word32 *state, const word32 *data)
ABCD = vld1q_u32(&state[0]);
E0 = state[4];
while (length >= 64)
{
// Save current hash
ABCD_SAVED = ABCD;
E0_SAVED = E0;
@ -320,6 +355,14 @@ static void SHA1_ARM_SHA_Transform(word32 *state, const word32 *data)
MSG2 = vld1q_u32(data + 8);
MSG3 = vld1q_u32(data + 12);
if (order == BIG_ENDIAN_ORDER) // Data arrangement
{
MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
}
TMP0 = vaddq_u32(MSG0, C0);
TMP1 = vaddq_u32(MSG1, C0);
@ -458,6 +501,10 @@ static void SHA1_ARM_SHA_Transform(word32 *state, const word32 *data)
E0 += E0_SAVED;
ABCD = vaddq_u32(ABCD_SAVED, ABCD);
data += 16;
length -= 64;
}
// Save state
vst1q_u32(&state[0], ABCD);
state[4] = E0;
@ -468,21 +515,6 @@ static void SHA1_ARM_SHA_Transform(word32 *state, const word32 *data)
// end of Walton/Schneiders/O'Rourke/Hovsmith's code //
///////////////////////////////////////////////////////
pfnSHATransform InitializeSHA1Transform()
{
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
return &SHA1_SSE_SHA_Transform;
else
#endif
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
if (HasSHA1())
return &SHA1_ARM_SHA_Transform;
else
#endif
return &SHA1_CXX_Transform;
}
void SHA1::InitState(HashWordType *state)
{
state[0] = 0x67452301L;
@ -494,53 +526,75 @@ void SHA1::InitState(HashWordType *state)
void SHA1::Transform(word32 *state, const word32 *data)
{
static const pfnSHATransform s_pfn = InitializeSHA1Transform();
s_pfn(state, data);
}
CRYPTOPP_ASSERT(state);
CRYPTOPP_ASSERT(data);
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
{
SHA1_SHANI_HashMultipleBlocks(state, data, SHA1::BLOCKSIZE, LITTLE_ENDIAN_ORDER);
return;
}
#endif
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
if (HasSHA1())
{
SHA1_ARM_SHA_HashMultipleBlocks(state, data, SHA1::BLOCKSIZE, LITTLE_ENDIAN_ORDER);
return;
}
#endif
SHA1_CXX_HashBlock(state, data);
}
size_t SHA1::HashMultipleBlocks(const word32 *input, size_t length)
{
static const bool noReverse = HasSHA() || NativeByteOrderIs(this->GetByteOrder());
const unsigned int blockSize = this->BlockSize();
CRYPTOPP_ASSERT(input);
CRYPTOPP_ASSERT(length >= 64);
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
{
SHA1_SHANI_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
return length & (SHA1::BLOCKSIZE - 1);
}
#endif
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
if (HasSHA1())
{
SHA1_ARM_SHA_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
return length & (SHA1::BLOCKSIZE - 1);
}
#endif
const bool noReverse = NativeByteOrderIs(this->GetByteOrder());
word32 *dataBuf = this->DataBuf();
do
{
if (noReverse)
this->HashEndianCorrectedBlock(input);
{
// this->HashEndianCorrectedBlock(input);
SHA1_CXX_HashBlock(m_state, input);
}
else
{
ByteReverse(dataBuf, input, this->BlockSize());
this->HashEndianCorrectedBlock(dataBuf);
ByteReverse(dataBuf, input, 64);
// this->HashEndianCorrectedBlock(dataBuf);
SHA1_CXX_HashBlock(m_state, dataBuf);
}
input += blockSize/sizeof(word32);
length -= blockSize;
input += 16;
length -= 64;
}
while (length >= blockSize);
while (length >= 64);
return length;
}
#endif
// *************************************************************
void SHA224::InitState(HashWordType *state)
{
static const word32 s[8] = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4};
memcpy(state, s, sizeof(s));
}
CRYPTOPP_ALIGN_DATA(16)
extern const word32 SHA256_K[64] CRYPTOPP_SECTION_ALIGN16 = {
void SHA256::InitState(HashWordType *state)
{
static const word32 s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
memcpy(state, s, sizeof(s));
}
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
CRYPTOPP_ALIGN_DATA(16) extern const word32 SHA256_K[64] CRYPTOPP_SECTION_ALIGN16 = {
#else
extern const word32 SHA256_K[64] = {
#endif
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@ -559,11 +613,75 @@ extern const word32 SHA256_K[64] = {
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
#define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) (y^((x^y)&(y^z)))
#define a(i) T[(0-i)&7]
#define b(i) T[(1-i)&7]
#define c(i) T[(2-i)&7]
#define d(i) T[(3-i)&7]
#define e(i) T[(4-i)&7]
#define f(i) T[(5-i)&7]
#define g(i) T[(6-i)&7]
#define h(i) T[(7-i)&7]
#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\
d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
// for SHA256
#define S0(x) (rotrFixed(x,2)^rotrFixed(x,13)^rotrFixed(x,22))
#define S1(x) (rotrFixed(x,6)^rotrFixed(x,11)^rotrFixed(x,25))
#define s0(x) (rotrFixed(x,7)^rotrFixed(x,18)^(x>>3))
#define s1(x) (rotrFixed(x,17)^rotrFixed(x,19)^(x>>10))
static void SHA256_CXX_HashBlock(word32 *state, const word32 *data)
{
word32 W[16], T[8];
/* Copy context->state[] to working vars */
memcpy(T, state, sizeof(T));
/* 64 operations, partially loop unrolled */
for (unsigned int j=0; j<64; j+=16)
{
R( 0); R( 1); R( 2); R( 3);
R( 4); R( 5); R( 6); R( 7);
R( 8); R( 9); R(10); R(11);
R(12); R(13); R(14); R(15);
}
/* Add the working vars back into context.state[] */
state[0] += a(0);
state[1] += b(0);
state[2] += c(0);
state[3] += d(0);
state[4] += e(0);
state[5] += f(0);
state[6] += g(0);
state[7] += h(0);
}
#undef S0
#undef S1
#undef s0
#undef s1
#undef R
void SHA224::InitState(HashWordType *state)
{
static const word32 s[8] = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4};
memcpy(state, s, sizeof(s));
}
void SHA256::InitState(HashWordType *state)
{
static const word32 s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
memcpy(state, s, sizeof(s));
}
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_GENERATE_X64_MASM))
static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len)
static void CRYPTOPP_FASTCALL SHA256_SSE_HashMultipleBlocks(word32 *state, const word32 *data, size_t len)
{
#define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ
#define H(i) [BASE+ASM_MOD(1024+7-(i),8)*4]
@ -685,7 +803,7 @@ static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32
INTEL_NOPREFIX
#elif defined(CRYPTOPP_GENERATE_X64_MASM)
ALIGN 8
X86_SHA256_HashBlocks PROC FRAME
SHA256_SSE_HashMultipleBlocks PROC FRAME
rex_push_reg rsi
push_reg rdi
push_reg rbx
@ -864,7 +982,7 @@ INTEL_NOPREFIX
pop rdi
pop rsi
ret
X86_SHA256_HashBlocks ENDP
SHA256_SSE_HashMultipleBlocks ENDP
#endif
#ifdef __GNUC__
@ -888,200 +1006,109 @@ INTEL_NOPREFIX
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len);
void CRYPTOPP_FASTCALL SHA256_SSE_HashMultipleBlocks(word32 *state, const word32 *data, size_t len);
}
#endif
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length);
static void SHA256_SHANI_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order);
#elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
static void CRYPTOPP_FASTCALL SHA256_ARM_SHA_HashBlocks(word32 *state, const word32 *data, size_t length);
static void SHA256_ARM_SHA_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order);
#endif
#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
pfnSHAHashBlocks InitializeSHA256HashBlocks()
{
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
return &SHA256_SSE_SHA_HashBlocks;
else
#endif
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
if (HasSHA2())
return &SHA256_ARM_SHA_HashBlocks;
else
#endif
return &X86_SHA256_HashBlocks;
}
size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
{
static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
return length % BLOCKSIZE;
CRYPTOPP_ASSERT(input);
CRYPTOPP_ASSERT(length >= 64);
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
{
SHA256_SHANI_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
return length & (SHA256::BLOCKSIZE - 1);
}
#endif
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasSSE2())
{
const size_t res = length & (SHA256::BLOCKSIZE - 1);
SHA256_SSE_HashMultipleBlocks(m_state, input, length-res);
return res;
}
#endif
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
if (HasSHA2())
{
SHA256_ARM_SHA_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
return length & (SHA256::BLOCKSIZE - 1);
}
#endif
const bool noReverse = NativeByteOrderIs(this->GetByteOrder());
word32 *dataBuf = this->DataBuf();
do
{
if (noReverse)
{
// this->HashEndianCorrectedBlock(input);
SHA256_CXX_HashBlock(m_state, input);
}
else
{
ByteReverse(dataBuf, input, SHA256::BLOCKSIZE);
// this->HashEndianCorrectedBlock(dataBuf);
SHA256_CXX_HashBlock(m_state, dataBuf);
}
input += SHA256::BLOCKSIZE/sizeof(word32);
length -= SHA256::BLOCKSIZE;
}
while (length >= SHA256::BLOCKSIZE);
return length;
}
size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
{
static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
return length % BLOCKSIZE;
CRYPTOPP_ASSERT(input);
CRYPTOPP_ASSERT(length >= 64);
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
{
SHA256_SHANI_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
return length & (SHA256::BLOCKSIZE - 1);
}
#endif
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
if (HasSHA2())
{
SHA256_ARM_SHA_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
return length & (SHA256::BLOCKSIZE - 1);
}
#endif
#define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) (y^((x^y)&(y^z)))
#define a(i) T[(0-i)&7]
#define b(i) T[(1-i)&7]
#define c(i) T[(2-i)&7]
#define d(i) T[(3-i)&7]
#define e(i) T[(4-i)&7]
#define f(i) T[(5-i)&7]
#define g(i) T[(6-i)&7]
#define h(i) T[(7-i)&7]
#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\
d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
// for SHA256
#define S0(x) (rotrFixed(x,2)^rotrFixed(x,13)^rotrFixed(x,22))
#define S1(x) (rotrFixed(x,6)^rotrFixed(x,11)^rotrFixed(x,25))
#define s0(x) (rotrFixed(x,7)^rotrFixed(x,18)^(x>>3))
#define s1(x) (rotrFixed(x,17)^rotrFixed(x,19)^(x>>10))
#if defined(__OPTIMIZE_SIZE__)
// Smaller but slower
void SHA256_CXX_Transform(word32 *state, const word32 *data)
{
word32 W[32], T[20];
unsigned int i = 0, j = 0;
word32 *t = T+8;
memcpy(t, state, 8*4);
word32 e = t[4], a = t[0];
const bool noReverse = NativeByteOrderIs(this->GetByteOrder());
word32 *dataBuf = this->DataBuf();
do
{
word32 w = data[j];
W[j] = w;
w += SHA256_K[j];
w += t[7];
w += S1(e);
w += Ch(e, t[5], t[6]);
e = t[3] + w;
t[3] = t[3+8] = e;
w += S0(t[0]);
a = w + Maj(a, t[1], t[2]);
t[-1] = t[7] = a;
--t;
++j;
if (j%8 == 0)
t += 8;
} while (j<16);
do
if (noReverse)
{
i = j&0xf;
word32 w = s1(W[i+16-2]) + s0(W[i+16-15]) + W[i] + W[i+16-7];
W[i+16] = W[i] = w;
w += SHA256_K[j];
w += t[7];
w += S1(e);
w += Ch(e, t[5], t[6]);
e = t[3] + w;
t[3] = t[3+8] = e;
w += S0(t[0]);
a = w + Maj(a, t[1], t[2]);
t[-1] = t[7] = a;
w = s1(W[(i+1)+16-2]) + s0(W[(i+1)+16-15]) + W[(i+1)] + W[(i+1)+16-7];
W[(i+1)+16] = W[(i+1)] = w;
w += SHA256_K[j+1];
w += (t-1)[7];
w += S1(e);
w += Ch(e, (t-1)[5], (t-1)[6]);
e = (t-1)[3] + w;
(t-1)[3] = (t-1)[3+8] = e;
w += S0((t-1)[0]);
a = w + Maj(a, (t-1)[1], (t-1)[2]);
(t-1)[-1] = (t-1)[7] = a;
t-=2;
j+=2;
if (j%8 == 0)
t += 8;
} while (j<64);
state[0] += a;
state[1] += t[1];
state[2] += t[2];
state[3] += t[3];
state[4] += e;
state[5] += t[5];
state[6] += t[6];
state[7] += t[7];
// this->HashEndianCorrectedBlock(input);
SHA256_CXX_HashBlock(m_state, input);
}
#else
// Bigger but faster
void SHA256_CXX_Transform(word32 *state, const word32 *data)
else
{
word32 W[16], T[8];
/* Copy context->state[] to working vars */
memcpy(T, state, sizeof(T));
/* 64 operations, partially loop unrolled */
for (unsigned int j=0; j<64; j+=16)
{
R( 0); R( 1); R( 2); R( 3);
R( 4); R( 5); R( 6); R( 7);
R( 8); R( 9); R(10); R(11);
R(12); R(13); R(14); R(15);
ByteReverse(dataBuf, input, SHA256::BLOCKSIZE);
// this->HashEndianCorrectedBlock(dataBuf);
SHA256_CXX_HashBlock(m_state, dataBuf);
}
/* Add the working vars back into context.state[] */
state[0] += a(0);
state[1] += b(0);
state[2] += c(0);
state[3] += d(0);
state[4] += e(0);
state[5] += f(0);
state[6] += g(0);
state[7] += h(0);
}
#endif // __OPTIMIZE_SIZE__
#undef S0
#undef S1
#undef s0
#undef s1
#undef R
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
static void SHA256_SSE2_Transform(word32 *state, const word32 *data)
{
// this byte reverse is a waste of time, but this function is only called by MDC
word32 W[16];
ByteReverse(W, data, SHA256::BLOCKSIZE);
X86_SHA256_HashBlocks(state, W, SHA256::BLOCKSIZE - !HasSSE2());
input += SHA256::BLOCKSIZE/sizeof(word32);
length -= SHA256::BLOCKSIZE;
}
#endif // CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
static void SHA256_SSE_SHA_Transform(word32 *state, const word32 *data)
{
return SHA256_SSE_SHA_HashBlocks(state, data, SHA256::BLOCKSIZE);
while (length >= SHA256::BLOCKSIZE);
return length;
}
#endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
static void SHA256_ARM_SHA_Transform(word32 *state, const word32 *data)
{
return SHA256_ARM_SHA_HashBlocks(state, data, SHA256::BLOCKSIZE);
}
#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
///////////////////////////////////
// start of Walton/Gulley's code //
@ -1089,10 +1116,11 @@ static void SHA256_ARM_SHA_Transform(word32 *state, const word32 *data)
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length)
static void SHA256_SHANI_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
CRYPTOPP_ASSERT(length % SHA256::BLOCKSIZE == 0);
CRYPTOPP_ASSERT(state);
CRYPTOPP_ASSERT(data);
CRYPTOPP_ASSERT(length >= 64);
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
@ -1100,9 +1128,15 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_loadu_si128((__m128i*) &state[0]);
STATE1 = _mm_loadu_si128((__m128i*) &state[4]);
MASK = _mm_set_epi64x(W64LIT(0x0c0d0e0f08090a0b), W64LIT(0x0405060700010203));
TMP = _mm_loadu_si128(M128_CAST(&state[0]));
STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
// IA-32 SHA is little endian, SHA::Transform is big endian,
// and SHA::HashMultipleBlocks can be either. ByteOrder
// allows us to avoid extra endian reversals. It saves 1.0 cpb.
MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
_mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
_mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
@ -1116,7 +1150,7 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
CDGH_SAVE = STATE1;
// Rounds 0-3
MSG = _mm_loadu_si128((__m128i*) data+0);
MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
@ -1124,7 +1158,7 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_loadu_si128((__m128i*) (data+4));
TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
@ -1133,7 +1167,7 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_loadu_si128((__m128i*) (data+8));
TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
@ -1142,7 +1176,7 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_loadu_si128((__m128i*) (data+12));
TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
@ -1281,8 +1315,8 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_storeu_si128((__m128i*) &state[0], STATE0);
_mm_storeu_si128((__m128i*) &state[4], STATE1);
_mm_storeu_si128(M128_CAST(&state[0]), STATE0);
_mm_storeu_si128(M128_CAST(&state[4]), STATE1);
}
#endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
@ -1295,7 +1329,7 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
/////////////////////////////////////////////////////////
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
static void CRYPTOPP_FASTCALL SHA256_ARM_SHA_HashBlocks(word32 *state, const word32 *data, size_t length)
static void SHA256_ARM_SHA_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
uint32x4_t MSG0, MSG1, MSG2, MSG3;
@ -1317,6 +1351,14 @@ static void CRYPTOPP_FASTCALL SHA256_ARM_SHA_HashBlocks(word32 *state, const wor
MSG2 = vld1q_u32(data + 8);
MSG3 = vld1q_u32(data + 12);
if (order == BIG_ENDIAN_ORDER) // Data arrangement
{
MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
}
TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
// Rounds 0-3
@ -1456,31 +1498,24 @@ static void CRYPTOPP_FASTCALL SHA256_ARM_SHA_HashBlocks(word32 *state, const wor
// end of Walton/Schneiders/O'Rourke/Hovsmith's code //
///////////////////////////////////////////////////////
pfnSHATransform InitializeSHA256Transform()
void SHA256::Transform(word32 *state, const word32 *data)
{
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
return &SHA256_SSE_SHA_Transform;
else
#endif
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
if (HasSSE2())
return &SHA256_SSE2_Transform;
else
{
SHA256_SHANI_HashMultipleBlocks(state, data, SHA256::BLOCKSIZE, LITTLE_ENDIAN_ORDER);
return;
}
#endif
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
if (HasSHA2())
return &SHA256_ARM_SHA_Transform;
else
{
SHA256_ARM_SHA_HashMultipleBlocks(state, data, SHA256::BLOCKSIZE, LITTLE_ENDIAN_ORDER);
return;
}
#endif
return &SHA256_CXX_Transform;
}
void SHA256::Transform(word32 *state, const word32 *data)
{
static const pfnSHATransform s_pfn = InitializeSHA256Transform();
s_pfn(state, data);
SHA256_CXX_HashBlock(state, data);
}
// *************************************************************

45
sha.h
View File

@ -38,21 +38,20 @@ public:
//! \param digest the state of the hash
//! \param data the data to be digested
//! \details Transform operates the hash on <tt>data</tt>. When the call is invoked
//! <tt>digest</tt> holds initial state. Upon return <tt>digest</tt> holds the hash or
//! updated state.
//! <tt>digest</tt> holds initial state. Upon return <tt>digest</tt> holds the hash
//! or updated state.
//! \details Hashes which derive from IteratedHashWithStaticTransform provide static
//! member functions InitState and Transform. External classes, like SEAL and MDC,
//! can initialize state with a user provided key and operate the hash on the data
//! with the user supplied state.
//! \note On Intel platforms the state array and data must be 16-byte aligned for SSE2.
static void CRYPTOPP_API Transform(word32 *digest, const word32 *data);
static void CRYPTOPP_API Transform(HashWordType *digest, const HashWordType *data);
//! \brief The algorithm name
//! \returns C-style string "SHA-1"
CRYPTOPP_STATIC_CONSTEXPR const char* CRYPTOPP_API StaticAlgorithmName() {return "SHA-1";}
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
size_t HashMultipleBlocks(const word32 *input, size_t length);
#endif
protected:
size_t HashMultipleBlocks(const HashWordType *input, size_t length);
};
//! \class SHA256
@ -75,21 +74,20 @@ public:
//! \param digest the state of the hash
//! \param data the data to be digested
//! \details Transform operates the hash on <tt>data</tt>. When the call is invoked
//! <tt>digest</tt> holds initial state. Upon return <tt>digest</tt> holds the hash or
//! updated state.
//! <tt>digest</tt> holds initial state. Upon return <tt>digest</tt> holds the hash
//! or updated state.
//! \details Hashes which derive from IteratedHashWithStaticTransform provide static
//! member functions InitState and Transform. External classes, like SEAL and MDC,
//! can initialize state with a user provided key and operate the hash on the data
//! with the user supplied state.
//! \note On Intel platforms the state array and data must be 16-byte aligned for SSE2.
static void CRYPTOPP_API Transform(word32 *digest, const word32 *data);
static void CRYPTOPP_API Transform(HashWordType *digest, const HashWordType *data);
//! \brief The algorithm name
//! \returns C-style string "SHA-256"
CRYPTOPP_STATIC_CONSTEXPR const char* CRYPTOPP_API StaticAlgorithmName() {return "SHA-256";}
#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
size_t HashMultipleBlocks(const word32 *input, size_t length);
#endif
protected:
size_t HashMultipleBlocks(const HashWordType *input, size_t length);
};
//! \class SHA224
@ -112,21 +110,20 @@ public:
//! \param digest the state of the hash
//! \param data the data to be digested
//! \details Transform operates the hash on <tt>data</tt>. When the call is invoked
//! <tt>digest</tt> holds initial state. Upon return <tt>digest</tt> holds the hash or
//! updated state.
//! <tt>digest</tt> holds initial state. Upon return <tt>digest</tt> holds the hash
//! or updated state.
//! \details Hashes which derive from IteratedHashWithStaticTransform provide static
//! member functions InitState and Transform. External classes, like SEAL and MDC,
//! can initialize state with a user provided key and operate the hash on the data
//! with the user supplied state.
//! \note On Intel platforms the state array and data must be 16-byte aligned for SSE2.
static void CRYPTOPP_API Transform(word32 *digest, const word32 *data) {SHA256::Transform(digest, data);}
static void CRYPTOPP_API Transform(HashWordType *digest, const HashWordType *data) {SHA256::Transform(digest, data);}
//! \brief The algorithm name
//! \returns C-style string "SHA-224"
CRYPTOPP_STATIC_CONSTEXPR const char* CRYPTOPP_API StaticAlgorithmName() {return "SHA-224";}
#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
size_t HashMultipleBlocks(const word32 *input, size_t length);
#endif
protected:
size_t HashMultipleBlocks(const HashWordType *input, size_t length);
};
//! \class SHA512
@ -149,14 +146,14 @@ public:
//! \param digest the state of the hash
//! \param data the data to be digested
//! \details Transform operates the hash on <tt>data</tt>. When the call is invoked
//! <tt>digest</tt> holds initial state. Upon return <tt>digest</tt> holds the hash or
//! updated state.
//! <tt>digest</tt> holds initial state. Upon return <tt>digest</tt> holds the hash
//! or updated state.
//! \details Hashes which derive from IteratedHashWithStaticTransform provide static
//! member functions InitState and Transform. External classes, like SEAL and MDC,
//! can initialize state with a user provided key and operate the hash on the data
//! with the user supplied state.
//! \note On Intel platforms the state array and data must be 16-byte aligned for SSE2.
static void CRYPTOPP_API Transform(word64 *digest, const word64 *data);
static void CRYPTOPP_API Transform(HashWordType *digest, const HashWordType *data);
//! \brief The algorithm name
//! \returns C-style string "SHA-512"
CRYPTOPP_STATIC_CONSTEXPR const char* CRYPTOPP_API StaticAlgorithmName() {return "SHA-512";}
@ -182,14 +179,14 @@ public:
//! \param digest the state of the hash
//! \param data the data to be digested
//! \details Transform operates the hash on <tt>data</tt>. When the call is invoked
//! <tt>digest</tt> holds initial state. Upon return <tt>digest</tt> holds the hash or
//! updated state.
//! <tt>digest</tt> holds initial state. Upon return <tt>digest</tt> holds the hash
//! or updated state.
//! \details Hashes which derive from IteratedHashWithStaticTransform provide static
//! member functions InitState and Transform. External classes, like SEAL and MDC,
//! can initialize state with a user provided key and operate the hash on the data
//! with the user supplied state.
//! \note On Intel platforms the state array and data must be 16-byte aligned for SSE2.
static void CRYPTOPP_API Transform(word64 *digest, const word64 *data) {SHA512::Transform(digest, data);}
static void CRYPTOPP_API Transform(HashWordType *digest, const HashWordType *data) {SHA512::Transform(digest, data);}
//! \brief The algorithm name
//! \returns C-style string "SHA-384"
CRYPTOPP_STATIC_CONSTEXPR const char* CRYPTOPP_API StaticAlgorithmName() {return "SHA-384";}

View File

@ -676,7 +676,7 @@ ret
GCM_AuthenticateBlocks_64K ENDP
ALIGN 8
X86_SHA256_HashBlocks PROC FRAME
SHA256_SSE_HashMultipleBlocks PROC FRAME
rex_push_reg rsi
push_reg rdi
push_reg rbx
@ -1962,7 +1962,7 @@ pop rbx
pop rdi
pop rsi
ret
X86_SHA256_HashBlocks ENDP
SHA256_SSE_HashMultipleBlocks ENDP
_TEXT ENDS
END