From a25e63fcb746193ee280e79026e3f0d388db1964 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Mon, 14 Aug 2017 03:19:20 -0400 Subject: [PATCH] Fix sources after sync with upstream --- gcm.cpp | 4 +- sha-simd.cpp | 828 ++++++++++++++++++++++++---------------------- sha.cpp | 917 +++------------------------------------------------ 3 files changed, 495 insertions(+), 1254 deletions(-) diff --git a/gcm.cpp b/gcm.cpp index a097ce11..167dba78 100644 --- a/gcm.cpp +++ b/gcm.cpp @@ -134,7 +134,7 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const // Avoid "parameter not used" error and suppress Coverity finding (void)params.GetIntValue(Name::TableSize(), tableSize); tableSize = s_cltableSizeInBlocks * blockSize; - CRYPTOPP_ASSERT(tableSize > blockSize); + CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize)); } else #elif CRYPTOPP_ARM_PMULL_AVAILABLE @@ -143,7 +143,7 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const // Avoid "parameter not used" error and suppress Coverity finding (void)params.GetIntValue(Name::TableSize(), tableSize); tableSize = s_cltableSizeInBlocks * blockSize; - CRYPTOPP_ASSERT(tableSize > blockSize); + CRYPTOPP_ASSERT(tableSize > static_cast<int>(blockSize)); } else #endif diff --git a/sha-simd.cpp b/sha-simd.cpp index 019153e8..0081cac8 100644 --- a/sha-simd.cpp +++ b/sha-simd.cpp @@ -8,6 +8,7 @@ #include "pch.h" #include "config.h" +#include "sha.h" #include "misc.h" // Clang and GCC hoops... @@ -56,17 +57,21 @@ # define EXCEPTION_EXECUTE_HANDLER 1 #endif +// Clang __m128i casts +#define M128_CAST(x) ((__m128i *)(void *)(x)) +#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) + NAMESPACE_BEGIN(CryptoPP) #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY extern "C" { typedef void (*SigHandler)(int); - static jmp_buf s_jmpSIGILL; - static void SigIllHandler(int) - { - longjmp(s_jmpSIGILL, 1); - } + static jmp_buf s_jmpSIGILL; + static void SigIllHandler(int) + { + longjmp(s_jmpSIGILL, 1); + } }; #endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY @@ -75,70 +80,70 @@ bool CPU_TrySHA1_ARMV8() { #if (CRYPTOPP_ARM_SHA_AVAILABLE) # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) - volatile bool result = true; - __try - { - uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; + volatile bool result = true; + __try + { + uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; - uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2); - uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2); - uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2); - uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3); - uint32x4_t r5 = vsha1su1q_u32 (data1, data2); + uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2); + uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2); + uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2); + uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3); + uint32x4_t r5 = vsha1su1q_u32 (data1, data2); - result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0)); - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } - return result; + result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0)); + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + return result; # else # if defined(__ANDROID__) && (defined(__aarch64__) || defined(__aarch32__)) if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_SHA1) - return true; + return true; //
https://sourceware.org/ml/libc-help/2017-08/msg00012.html # elif defined(__linux__) && defined(__aarch64__) - if (getauxval(AT_HWCAP) & HWCAP_SHA1) - return true; + if (getauxval(AT_HWCAP) & HWCAP_SHA1) + return true; # elif defined(__linux__) && defined(__aarch32__) - if (getauxval(AT_HWCAP2) & HWCAP2_SHA1) - return true; + if (getauxval(AT_HWCAP2) & HWCAP2_SHA1) + return true; # endif - // longjmp and clobber warnings. Volatile is required. - // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile bool result = true; + // longjmp and clobber warnings. Volatile is required. + // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); - if (oldHandler == SIG_ERR) - return false; + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; - if (setjmp(s_jmpSIGILL)) - result = false; - else - { - uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; - uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2); - uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2); - uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2); - uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3); - uint32x4_t r5 = vsha1su1q_u32 (data1, data2); + uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2); + uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2); + uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2); + uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3); + uint32x4_t r5 = vsha1su1q_u32 (data1, data2); - result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0)); - } + result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0)); + } - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; # endif #else - return false; + return false; #endif // CRYPTOPP_ARM_SHA_AVAILABLE } @@ -146,68 +151,68 @@ bool CPU_TrySHA2_ARMV8() { #if (CRYPTOPP_ARM_SHA_AVAILABLE) # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) - volatile bool result = true; - __try - { - uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; + volatile bool result = true; + __try + { + uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; - uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3); - uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3); - uint32x4_t r3 = vsha256su0q_u32 (data1, data2); - uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3); + uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3); + uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3); + uint32x4_t r3 = vsha256su0q_u32 (data1, data2); + uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3); - result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3)); - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } - return result; + result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | 
vgetq_lane_u32(r4,3)); + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + return result; #else # if defined(__ANDROID__) && (defined(__aarch64__) || defined(__aarch32__)) if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_SHA2) - return true; + return true; // https://sourceware.org/ml/libc-help/2017-08/msg00012.html # elif defined(__linux__) && defined(__aarch64__) - if (getauxval(AT_HWCAP) & HWCAP_SHA2) - return true; + if (getauxval(AT_HWCAP) & HWCAP_SHA2) + return true; # elif defined(__linux__) && defined(__aarch32__) - if (getauxval(AT_HWCAP2) & HWCAP2_SHA2) - return true; + if (getauxval(AT_HWCAP2) & HWCAP2_SHA2) + return true; # endif - // longjmp and clobber warnings. Volatile is required. - // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile bool result = true; + // longjmp and clobber warnings. Volatile is required. + // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); - if (oldHandler == SIG_ERR) - return false; + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; - if (setjmp(s_jmpSIGILL)) - result = false; - else - { - uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; - uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3); - uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3); - uint32x4_t r3 = vsha256su0q_u32 (data1, data2); - uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3); + uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3); + uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3); + uint32x4_t r3 = vsha256su0q_u32 (data1, data2); + uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3); - result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3)); - } + result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3)); + } - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; # endif #else - return false; + return false; #endif // CRYPTOPP_ARM_SHA_AVAILABLE } #endif // ARM32 or ARM64 @@ -220,192 +225,209 @@ extern const word32 SHA256_K[64]; #if CRYPTOPP_SHANI_AVAILABLE // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley. 
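// [Illustrative sketch, not part of the patch] The kernels below gain a
// ByteOrder parameter so one implementation can serve both callers:
// SHA1::Transform already holds little-endian words, while
// SHA1::HashMultipleBlocks feeds the raw big-endian message. Choosing a
// PSHUFB mask once up front makes the optional byte reversal free. The idea
// in isolation, using the mask values the SHA-256 kernel below uses:
//
//   __m128i x = _mm_loadu_si128(CONST_M128_CAST(data));
//   const __m128i bswap = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
//   const __m128i ident = _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0);
//   x = _mm_shuffle_epi8(x, order == BIG_ENDIAN_ORDER ? bswap : ident);
//
// The SHA-1 kernel's masks additionally reverse the dword order that its
// rounds consume.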
-void SHA1_Transform_SHANI(word32 *state, const word32 *data) +void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order) { + CRYPTOPP_ASSERT(state); + CRYPTOPP_ASSERT(data); + CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE); + __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; __m128i MASK, MSG0, MSG1, MSG2, MSG3; // Load initial values - ABCD = _mm_loadu_si128((__m128i*) state); + ABCD = _mm_loadu_si128(CONST_M128_CAST(state)); E0 = _mm_set_epi32(state[4], 0, 0, 0); ABCD = _mm_shuffle_epi32(ABCD, 0x1B); - MASK = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15); - // Save current hash - ABCD_SAVE = ABCD; - E0_SAVE = E0; + // IA-32 SHA is little endian, SHA::Transform is big endian, + // and SHA::HashMultipleBlocks can be either. ByteOrder + // allows us to avoid extra endian reversals. It saves 1.0 cpb. + MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement + _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) : + _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ; - // Rounds 0-3 - MSG0 = _mm_loadu_si128((__m128i*) data+0); - MSG0 = _mm_shuffle_epi8(MSG0, MASK); - E0 = _mm_add_epi32(E0, MSG0); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + while (length >= SHA1::BLOCKSIZE) + { + // Save current hash + ABCD_SAVE = ABCD; + E0_SAVE = E0; - // Rounds 4-7 - MSG1 = _mm_loadu_si128((__m128i*) (data+4)); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + // Rounds 0-3 + MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0)); + MSG0 = _mm_shuffle_epi8(MSG0, MASK); + E0 = _mm_add_epi32(E0, MSG0); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - // Rounds 8-11 - MSG2 = _mm_loadu_si128((__m128i*) (data+8)); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); + // Rounds 4-7 + MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4)); + MSG1 = _mm_shuffle_epi8(MSG1, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - // Rounds 12-15 - MSG3 = _mm_loadu_si128((__m128i*) (data+12)); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); + // Rounds 8-11 + MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8)); + MSG2 = _mm_shuffle_epi8(MSG2, MASK); + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); - // Rounds 16-19 - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); + // Rounds 12-15 + MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12)); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); - // Rounds 20-23 - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 
1); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); + // Rounds 16-19 + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); - // Rounds 24-27 - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); + // Rounds 20-23 + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); - // Rounds 28-31 - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); + // Rounds 24-27 + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); - // Rounds 32-35 - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); + // Rounds 28-31 + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); - // Rounds 36-39 - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); + // Rounds 32-35 + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); - // Rounds 40-43 - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); + // Rounds 36-39 + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); - // Rounds 44-47 - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); + // Rounds 40-43 + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); - // Rounds 48-51 - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); + // Rounds 44-47 + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); - // Rounds 52-55 - E1 
= _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); + // Rounds 48-51 + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); - // Rounds 56-59 - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); + // Rounds 52-55 + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); - // Rounds 60-63 - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); + // Rounds 56-59 + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); - // Rounds 64-67 - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); + // Rounds 60-63 + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); - // Rounds 68-71 - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - MSG3 = _mm_xor_si128(MSG3, MSG1); + // Rounds 64-67 + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); - // Rounds 72-75 - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); + // Rounds 68-71 + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + MSG3 = _mm_xor_si128(MSG3, MSG1); - // Rounds 76-79 - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + // Rounds 72-75 + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); - // Add values back to state - E0 = _mm_sha1nexte_epu32(E0, E0_SAVE); - ABCD = _mm_add_epi32(ABCD, ABCD_SAVE); + // Rounds 76-79 + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + + // Add values back to state + E0 = _mm_sha1nexte_epu32(E0, E0_SAVE); + ABCD = _mm_add_epi32(ABCD, ABCD_SAVE); + + data += SHA1::BLOCKSIZE/sizeof(word32); + length -= SHA1::BLOCKSIZE; + } // Save state ABCD = _mm_shuffle_epi32(ABCD, 0x1B); - _mm_storeu_si128((__m128i*) state, ABCD); + _mm_storeu_si128(M128_CAST(state), ABCD); state[4] = _mm_extract_epi32(E0, 3); } // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley. 
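// [Illustrative sketch, not part of the patch] Converting the one-shot
// Transform kernels into HashMultipleBlocks kernels hoists the state
// load/store and mask setup out of the per-block work. Every kernel in this
// patch follows the same outline:
//
//   // load hash state into registers once
//   while (length >= BLOCKSIZE)
//   {
//       // save a working copy of the state
//       // run all rounds for one block
//       // add the saved copy back in (Davies-Meyer feed-forward)
//       data += BLOCKSIZE/sizeof(word32);   // data is a word32 pointer
//       length -= BLOCKSIZE;
//   }
//   // store hash state once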
-void CRYPTOPP_FASTCALL SHA256_HashBlocks_SHANI(word32 *state, const word32 *data, size_t length) +void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order) { - CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data); - CRYPTOPP_ASSERT(length % 64 == 0); + CRYPTOPP_ASSERT(state); + CRYPTOPP_ASSERT(data); + CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE); __m128i STATE0, STATE1; __m128i MSG, TMP, MASK; @@ -413,24 +435,29 @@ void CRYPTOPP_FASTCALL SHA256_HashBlocks_SHANI(word32 *state, const word32 *data __m128i ABEF_SAVE, CDGH_SAVE; // Load initial values - TMP = _mm_loadu_si128((__m128i*) &state[0]); - STATE1 = _mm_loadu_si128((__m128i*) &state[4]); - MASK = _mm_set_epi64x(W64LIT(0x0c0d0e0f08090a0b), W64LIT(0x0405060700010203)); + TMP = _mm_loadu_si128(M128_CAST(&state[0])); + STATE1 = _mm_loadu_si128(M128_CAST(&state[4])); + + // IA-32 SHA is little endian, SHA::Transform is big endian, + // and SHA::HashMultipleBlocks can be either. ByteOrder + // allows us to avoid extra endian reversals. It saves 1.0 cpb. + MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement + _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) : + _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ; TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH - const size_t BLOCKSIZE = 64; - while (length >= BLOCKSIZE) + while (length >= SHA256::BLOCKSIZE) { // Save current hash ABEF_SAVE = STATE0; CDGH_SAVE = STATE1; // Rounds 0-3 - MSG = _mm_loadu_si128((__m128i*) data+0); + MSG = _mm_loadu_si128(CONST_M128_CAST(data+0)); TMSG0 = _mm_shuffle_epi8(MSG, MASK); MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98))); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); @@ -438,7 +465,7 @@ void CRYPTOPP_FASTCALL SHA256_HashBlocks_SHANI(word32 *state, const word32 *data STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); // Rounds 4-7 - TMSG1 = _mm_loadu_si128((__m128i*) (data+4)); + TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4)); TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B))); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); @@ -447,7 +474,7 @@ void CRYPTOPP_FASTCALL SHA256_HashBlocks_SHANI(word32 *state, const word32 *data TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); // Rounds 8-11 - TMSG2 = _mm_loadu_si128((__m128i*) (data+8)); + TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8)); TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98))); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); @@ -456,7 +483,7 @@ void CRYPTOPP_FASTCALL SHA256_HashBlocks_SHANI(word32 *state, const word32 *data TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); // Rounds 12-15 - TMSG3 = _mm_loadu_si128((__m128i*) (data+12)); + TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12)); TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74))); STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); @@ -585,8 +612,8 @@ void CRYPTOPP_FASTCALL SHA256_HashBlocks_SHANI(word32 *state, const word32 *data STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); - data += BLOCKSIZE/sizeof(word32); - length -= BLOCKSIZE; + data += SHA256::BLOCKSIZE/sizeof(word32); + 
length -= SHA256::BLOCKSIZE; } TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA @@ -595,8 +622,8 @@ void CRYPTOPP_FASTCALL SHA256_HashBlocks_SHANI(word32 *state, const word32 *data STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF // Save state - _mm_storeu_si128((__m128i*) &state[0], STATE0); - _mm_storeu_si128((__m128i*) &state[4], STATE1); + _mm_storeu_si128(M128_CAST(&state[0]), STATE0); + _mm_storeu_si128(M128_CAST(&state[4]), STATE1); } #endif // CRYPTOPP_SHANI_AVAILABLE @@ -609,8 +636,12 @@ void CRYPTOPP_FASTCALL SHA256_HashBlocks_SHANI(word32 *state, const word32 *data ///////////////////////////////////////////////////////// #if CRYPTOPP_ARM_SHA_AVAILABLE -void SHA1_Transform_ARMV8(word32 *state, const word32 *data) +void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order) { + CRYPTOPP_ASSERT(state); + CRYPTOPP_ASSERT(data); + CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE); + uint32x4_t C0, C1, C2, C3; uint32x4_t ABCD, ABCD_SAVED; uint32x4_t MSG0, MSG1, MSG2, MSG3; @@ -626,160 +657,178 @@ void SHA1_Transform_ARMV8(word32 *state, const word32 *data) ABCD = vld1q_u32(&state[0]); E0 = state[4]; - // Save current hash - ABCD_SAVED = ABCD; - E0_SAVED = E0; + while (length >= SHA1::BLOCKSIZE) + { + // Save current hash + ABCD_SAVED = ABCD; + E0_SAVED = E0; - MSG0 = vld1q_u32(data + 0); - MSG1 = vld1q_u32(data + 4); - MSG2 = vld1q_u32(data + 8); - MSG3 = vld1q_u32(data + 12); + MSG0 = vld1q_u32(data + 0); + MSG1 = vld1q_u32(data + 4); + MSG2 = vld1q_u32(data + 8); + MSG3 = vld1q_u32(data + 12); - TMP0 = vaddq_u32(MSG0, C0); - TMP1 = vaddq_u32(MSG1, C0); + if (order == BIG_ENDIAN_ORDER) // Data arrangement + { + MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); + MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); + MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); + MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); + } - // Rounds 0-3 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C0); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + TMP0 = vaddq_u32(MSG0, C0); + TMP1 = vaddq_u32(MSG1, C0); - // Rounds 4-7 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C0); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); + // Rounds 0-3 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, C0); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - // Rounds 8-11 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, C0); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); + // Rounds 4-7 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, C0); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - // Rounds 12-15 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, C1); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); + // Rounds 8-11 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG0, C0); + MSG1 = vsha1su1q_u32(MSG1, MSG0); + MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - // Rounds 16-19 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C1); - MSG3 = 
vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + // Rounds 12-15 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG1, C1); + MSG2 = vsha1su1q_u32(MSG2, MSG1); + MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - // Rounds 20-23 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C1); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); + // Rounds 16-19 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, C1); + MSG3 = vsha1su1q_u32(MSG3, MSG2); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - // Rounds 24-27 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, C1); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); + // Rounds 20-23 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, C1); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - // Rounds 28-31 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, C1); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); + // Rounds 24-27 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG0, C1); + MSG1 = vsha1su1q_u32(MSG1, MSG0); + MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - // Rounds 32-35 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C2); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + // Rounds 28-31 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG1, C1); + MSG2 = vsha1su1q_u32(MSG2, MSG1); + MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - // Rounds 36-39 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C2); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); + // Rounds 32-35 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, C2); + MSG3 = vsha1su1q_u32(MSG3, MSG2); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - // Rounds 40-43 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, C2); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); + // Rounds 36-39 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, C2); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - // Rounds 44-47 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, C2); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); + // Rounds 40-43 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG0, C2); + MSG1 = vsha1su1q_u32(MSG1, MSG0); + MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - // Rounds 48-51 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C2); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + // Rounds 44-47 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG1, C2); + MSG2 = 
vsha1su1q_u32(MSG2, MSG1); + MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - // Rounds 52-55 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C3); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); + // Rounds 48-51 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, C2); + MSG3 = vsha1su1q_u32(MSG3, MSG2); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - // Rounds 56-59 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, C3); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); + // Rounds 52-55 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, C3); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - // Rounds 60-63 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, C3); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); + // Rounds 56-59 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG0, C3); + MSG1 = vsha1su1q_u32(MSG1, MSG0); + MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - // Rounds 64-67 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C3); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + // Rounds 60-63 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG1, C3); + MSG2 = vsha1su1q_u32(MSG2, MSG1); + MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - // Rounds 68-71 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C3); - MSG0 = vsha1su1q_u32(MSG0, MSG3); + // Rounds 64-67 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, C3); + MSG3 = vsha1su1q_u32(MSG3, MSG2); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - // Rounds 72-75 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); + // Rounds 68-71 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, C3); + MSG0 = vsha1su1q_u32(MSG0, MSG3); - // Rounds 76-79 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); + // Rounds 72-75 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E0, TMP0); - E0 += E0_SAVED; - ABCD = vaddq_u32(ABCD_SAVED, ABCD); + // Rounds 76-79 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + + E0 += E0_SAVED; + ABCD = vaddq_u32(ABCD_SAVED, ABCD); + + data += SHA1::BLOCKSIZE/sizeof(word32); + length -= SHA1::BLOCKSIZE; + } // Save state vst1q_u32(&state[0], ABCD); state[4] = E0; } -void CRYPTOPP_FASTCALL SHA256_HashBlocks_ARMV8(word32 *state, const word32 *data, size_t length) +void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order) { + CRYPTOPP_ASSERT(state); + CRYPTOPP_ASSERT(data); + CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE); + uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; uint32x4_t MSG0, MSG1, MSG2, MSG3; uint32x4_t TMP0, TMP1, TMP2; @@ -788,8 +837,7 @@ void CRYPTOPP_FASTCALL SHA256_HashBlocks_ARMV8(word32 *state, const word32 *data STATE0 = vld1q_u32(&state[0]); STATE1 = vld1q_u32(&state[4]); - const size_t BLOCKSIZE = 64; - while (length >= 
BLOCKSIZE) + while (length >= SHA256::BLOCKSIZE) { // Save current hash ABEF_SAVE = STATE0; @@ -801,6 +849,14 @@ void CRYPTOPP_FASTCALL SHA256_HashBlocks_ARMV8(word32 *state, const word32 *data MSG2 = vld1q_u32(data + 8); MSG3 = vld1q_u32(data + 12); + if (order == BIG_ENDIAN_ORDER) // Data arrangement + { + MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); + MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); + MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); + MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); + } + TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00])); // Rounds 0-3 @@ -926,8 +982,8 @@ void CRYPTOPP_FASTCALL SHA256_HashBlocks_ARMV8(word32 *state, const word32 *data STATE0 = vaddq_u32(STATE0, ABEF_SAVE); STATE1 = vaddq_u32(STATE1, CDGH_SAVE); - data += BLOCKSIZE/sizeof(word32); - length -= BLOCKSIZE; + data += SHA256::BLOCKSIZE/sizeof(word32); + length -= SHA256::BLOCKSIZE; } // Save state diff --git a/sha.cpp b/sha.cpp index 4f126c0f..494fa981 100644 --- a/sha.cpp +++ b/sha.cpp @@ -48,15 +48,21 @@ # undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #endif -// Clang __m128i casts -#define M128_CAST(x) ((__m128i *)(void *)(x)) -#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) - // C++ makes const internal linkage #define EXPORT_TABLE extern NAMESPACE_BEGIN(CryptoPP) +#if CRYPTOPP_SHANI_AVAILABLE +extern void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order); +extern void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order); +#endif + +#if CRYPTOPP_ARM_SHA_AVAILABLE +extern void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order); +extern void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order); +#endif + //////////////////////////////// // start of Steve Reid's code // //////////////////////////////// @@ -78,7 +84,7 @@ ANONYMOUS_NAMESPACE_BEGIN #define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDC+rotlFixed(v,5);w=rotlFixed(w,30); #define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6+rotlFixed(v,5);w=rotlFixed(w,30); -void SHA1_CXX_HashBlock(word32 *state, const word32 *data) +void SHA1_HashBlock_CXX(word32 *state, const word32 *data) { CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data); @@ -125,423 +131,6 @@ ANONYMOUS_NAMESPACE_END // end of Steve Reid's code // ////////////////////////////// -/////////////////////////////////// -// start of Walton/Gulley's code // -/////////////////////////////////// - -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE - -ANONYMOUS_NAMESPACE_BEGIN - -// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley. -void SHA1_SHANI_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order) -{ - CRYPTOPP_ASSERT(state); - CRYPTOPP_ASSERT(data); - CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE); - - __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; - __m128i MASK, MSG0, MSG1, MSG2, MSG3; - - // Load initial values - ABCD = _mm_loadu_si128(CONST_M128_CAST(state)); - E0 = _mm_set_epi32(state[4], 0, 0, 0); - ABCD = _mm_shuffle_epi32(ABCD, 0x1B); - - // IA-32 SHA is little endian, SHA::Transform is big endian, - // and SHA::HashMultipleBlocks can be either. ByteOrder - // allows us to avoid extra endian reversals. It saves 1.0 cpb. - MASK = order == BIG_ENDIAN_ORDER ? 
// Data arrangement - _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) : - _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ; - - while (length >= SHA1::BLOCKSIZE) - { - // Save current hash - ABCD_SAVE = ABCD; - E0_SAVE = E0; - - // Rounds 0-3 - MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0)); - MSG0 = _mm_shuffle_epi8(MSG0, MASK); - E0 = _mm_add_epi32(E0, MSG0); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - - // Rounds 4-7 - MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4)); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - - // Rounds 8-11 - MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8)); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - // Rounds 12-15 - MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12)); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - // Rounds 16-19 - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - // Rounds 20-23 - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - // Rounds 24-27 - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - // Rounds 28-31 - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - // Rounds 32-35 - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - // Rounds 36-39 - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - // Rounds 40-43 - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - // Rounds 44-47 - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - // Rounds 48-51 - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - // Rounds 52-55 - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); - MSG0 = 
_mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - // Rounds 56-59 - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - // Rounds 60-63 - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - // Rounds 64-67 - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - // Rounds 68-71 - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - // Rounds 72-75 - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); - - // Rounds 76-79 - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - - // Add values back to state - E0 = _mm_sha1nexte_epu32(E0, E0_SAVE); - ABCD = _mm_add_epi32(ABCD, ABCD_SAVE); - - data += SHA1::BLOCKSIZE/sizeof(word32); - length -= SHA1::BLOCKSIZE; - } - - // Save state - ABCD = _mm_shuffle_epi32(ABCD, 0x1B); - _mm_storeu_si128(M128_CAST(state), ABCD); - state[4] = _mm_extract_epi32(E0, 3); -} - -ANONYMOUS_NAMESPACE_END - -#endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE - -///////////////////////////////// -// end of Walton/Gulley's code // -///////////////////////////////// - -////////////////////////////////////////////////////////////// -// start of Walton/Schneiders/O'Rourke/Skip Hovsmith's code // -////////////////////////////////////////////////////////////// - -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE - -ANONYMOUS_NAMESPACE_BEGIN - -void SHA1_ARM_SHA_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order) -{ - CRYPTOPP_ASSERT(state); - CRYPTOPP_ASSERT(data); - CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE); - - uint32x4_t C0, C1, C2, C3; - uint32x4_t ABCD, ABCD_SAVED; - uint32x4_t MSG0, MSG1, MSG2, MSG3; - uint32x4_t TMP0, TMP1; - uint32_t E0, E0_SAVED, E1; - - // Load initial values - C0 = vdupq_n_u32(0x5A827999); - C1 = vdupq_n_u32(0x6ED9EBA1); - C2 = vdupq_n_u32(0x8F1BBCDC); - C3 = vdupq_n_u32(0xCA62C1D6); - - ABCD = vld1q_u32(&state[0]); - E0 = state[4]; - - while (length >= SHA1::BLOCKSIZE) - { - // Save current hash - ABCD_SAVED = ABCD; - E0_SAVED = E0; - - MSG0 = vld1q_u32(data + 0); - MSG1 = vld1q_u32(data + 4); - MSG2 = vld1q_u32(data + 8); - MSG3 = vld1q_u32(data + 12); - - if (order == BIG_ENDIAN_ORDER) // Data arrangement - { - MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); - MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); - MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); - MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); - } - - TMP0 = vaddq_u32(MSG0, C0); - TMP1 = vaddq_u32(MSG1, C0); - - // Rounds 0-3 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C0); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - // Rounds 4-7 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C0); - MSG0 = 
vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - - // Rounds 8-11 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, C0); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - - // Rounds 12-15 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, C1); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - - // Rounds 16-19 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C1); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - // Rounds 20-23 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C1); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - - // Rounds 24-27 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, C1); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - - // Rounds 28-31 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, C1); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - - // Rounds 32-35 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C2); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - // Rounds 36-39 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C2); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - - // Rounds 40-43 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, C2); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - - // Rounds 44-47 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, C2); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - - // Rounds 48-51 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C2); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - // Rounds 52-55 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C3); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - - // Rounds 56-59 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, C3); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - - // Rounds 60-63 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, C3); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - - // Rounds 64-67 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C3); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - // Rounds 68-71 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C3); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - - // Rounds 72-75 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - - // Rounds 76-79 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - 
ABCD = vsha1pq_u32(ABCD, E1, TMP1); - - E0 += E0_SAVED; - ABCD = vaddq_u32(ABCD_SAVED, ABCD); - - data += SHA1::BLOCKSIZE/sizeof(word32); - length -= SHA1::BLOCKSIZE; - } - - // Save state - vst1q_u32(&state[0], ABCD); - state[4] = E0; -} - -ANONYMOUS_NAMESPACE_END - -#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE - -/////////////////////////////////////////////////////// -// end of Walton/Schneiders/O'Rourke/Hovsmith's code // -/////////////////////////////////////////////////////// - void SHA1::InitState(HashWordType *state) { state[0] = 0x67452301; @@ -556,22 +145,22 @@ void SHA1::Transform(word32 *state, const word32 *data) CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data); -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE if (HasSHA()) { - SHA1_SHANI_HashMultipleBlocks(state, data, SHA1::BLOCKSIZE, LITTLE_ENDIAN_ORDER); + SHA1_HashMultipleBlocks_SHANI(state, data, SHA1::BLOCKSIZE, LITTLE_ENDIAN_ORDER); return; } #endif -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE +#if CRYPTOPP_ARM_SHA_AVAILABLE if (HasSHA1()) { - SHA1_ARM_SHA_HashMultipleBlocks(state, data, SHA1::BLOCKSIZE, LITTLE_ENDIAN_ORDER); + SHA1_HashMultipleBlocks_ARMV8(state, data, SHA1::BLOCKSIZE, LITTLE_ENDIAN_ORDER); return; } #endif - SHA1_CXX_HashBlock(state, data); + SHA1_HashBlock_CXX(state, data); } size_t SHA1::HashMultipleBlocks(const word32 *input, size_t length) @@ -579,17 +168,17 @@ size_t SHA1::HashMultipleBlocks(const word32 *input, size_t length) CRYPTOPP_ASSERT(input); CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE); -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE if (HasSHA()) { - SHA1_SHANI_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER); + SHA1_HashMultipleBlocks_SHANI(m_state, input, length, BIG_ENDIAN_ORDER); return length & (SHA1::BLOCKSIZE - 1); } #endif -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE +#if CRYPTOPP_ARM_SHA_AVAILABLE if (HasSHA1()) { - SHA1_ARM_SHA_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER); + SHA1_HashMultipleBlocks_ARMV8(m_state, input, length, BIG_ENDIAN_ORDER); return length & (SHA1::BLOCKSIZE - 1); } #endif @@ -600,12 +189,12 @@ size_t SHA1::HashMultipleBlocks(const word32 *input, size_t length) { if (noReverse) { - SHA1_CXX_HashBlock(m_state, input); + SHA1_HashBlock_CXX(m_state, input); } else { ByteReverse(dataBuf, input, SHA1::BLOCKSIZE); - SHA1_CXX_HashBlock(m_state, dataBuf); + SHA1_HashBlock_CXX(m_state, dataBuf); } input += SHA1::BLOCKSIZE/sizeof(word32); @@ -663,7 +252,7 @@ ANONYMOUS_NAMESPACE_BEGIN #define s0(x) (rotrFixed(x,7)^rotrFixed(x,18)^(x>>3)) #define s1(x) (rotrFixed(x,17)^rotrFixed(x,19)^(x>>10)) -void SHA256_CXX_HashBlock(word32 *state, const word32 *data) +void SHA256_HashBlock_CXX(word32 *state, const word32 *data) { word32 W[16], T[8]; /* Copy context->state[] to working vars */ @@ -712,7 +301,7 @@ void SHA256::InitState(HashWordType *state) ANONYMOUS_NAMESPACE_BEGIN -void CRYPTOPP_FASTCALL SHA256_SSE_HashMultipleBlocks(word32 *state, const word32 *data, size_t len) +void CRYPTOPP_FASTCALL SHA256_HashMultipleBlocks_SSE2(word32 *state, const word32 *data, size_t len) { #define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ #define H(i) [BASE+ASM_MOD(1024+7-(i),8)*4] @@ -834,7 +423,7 @@ void CRYPTOPP_FASTCALL SHA256_SSE_HashMultipleBlocks(word32 *state, const word32 INTEL_NOPREFIX #elif defined(CRYPTOPP_GENERATE_X64_MASM) ALIGN 8 - SHA256_SSE_HashMultipleBlocks PROC FRAME + SHA256_HashMultipleBlocks_SSE2 PROC FRAME rex_push_reg rsi push_reg rdi push_reg rbx @@ -1013,7 +602,7 
@@ INTEL_NOPREFIX pop rdi pop rsi ret - SHA256_SSE_HashMultipleBlocks ENDP + SHA256_HashMultipleBlocks_SSE2 ENDP #endif #ifdef __GNUC__ @@ -1039,435 +628,31 @@ ANONYMOUS_NAMESPACE_END #ifdef CRYPTOPP_X64_MASM_AVAILABLE EXPORT_TABLE "C" { -void CRYPTOPP_FASTCALL SHA256_SSE_HashMultipleBlocks(word32 *state, const word32 *data, size_t len); +void CRYPTOPP_FASTCALL SHA256_HashMultipleBlocks_SSE2(word32 *state, const word32 *data, size_t len); } #endif -/////////////////////////////////// -// start of Walton/Gulley's code // -/////////////////////////////////// - -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE - -ANONYMOUS_NAMESPACE_BEGIN - -// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley. -void SHA256_SHANI_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order) -{ - CRYPTOPP_ASSERT(state); - CRYPTOPP_ASSERT(data); - CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE); - - __m128i STATE0, STATE1; - __m128i MSG, TMP, MASK; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_loadu_si128(M128_CAST(&state[0])); - STATE1 = _mm_loadu_si128(M128_CAST(&state[4])); - - // IA-32 SHA is little endian, SHA::Transform is big endian, - // and SHA::HashMultipleBlocks can be either. ByteOrder - // allows us to avoid extra endian reversals. It saves 1.0 cpb. - MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement - _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) : - _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ; - - TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH - - while (length >= SHA256::BLOCKSIZE) - { - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - MSG = _mm_loadu_si128(CONST_M128_CAST(data+0)); - TMSG0 = _mm_shuffle_epi8(MSG, MASK); - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 4-7 - TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4)); - TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 8-11 - TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8)); - TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 12-15 - TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12)); - TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = 
_mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 16-19 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 20-23 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 24-27 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 28-31 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 32-35 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 36-39 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 40-43 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 44-47 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = 
-
-        // Rounds 48-51
-        MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
-        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-        TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
-        TMSG1 = _mm_add_epi32(TMSG1, TMP);
-        TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
-        MSG = _mm_shuffle_epi32(MSG, 0x0E);
-        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-        TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
-
-        // Rounds 52-55
-        MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
-        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-        TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
-        TMSG2 = _mm_add_epi32(TMSG2, TMP);
-        TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
-        MSG = _mm_shuffle_epi32(MSG, 0x0E);
-        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-
-        // Rounds 56-59
-        MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
-        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-        TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
-        TMSG3 = _mm_add_epi32(TMSG3, TMP);
-        TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
-        MSG = _mm_shuffle_epi32(MSG, 0x0E);
-        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-
-        // Rounds 60-63
-        MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
-        STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-        MSG = _mm_shuffle_epi32(MSG, 0x0E);
-        STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-
-        // Add values back to state
-        STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
-        STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
-
-        data += SHA256::BLOCKSIZE/sizeof(word32);
-        length -= SHA256::BLOCKSIZE;
-    }
-
-    TMP = _mm_shuffle_epi32(STATE0, 0x1B);       // FEBA
-    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);    // DCHG
-    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
-    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    // ABEF
-
-    // Save state
-    _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
-    _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
-}
-
-ANONYMOUS_NAMESPACE_END
-
-#endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
-
-/////////////////////////////////
-// end of Walton/Gulley's code //
-/////////////////////////////////
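Each "Rounds N..N+3" group in the removed SHA-NI code above folds four round constants into one vector with _mm_set_epi64x; the two 64-bit literals are simply K[i..i+3] in little-endian word order. A standalone sketch (not from the patch) verifying the packing for K[0..3], assuming an SSE2 x86 host:

    #include <emmintrin.h>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        // First four SHA-256 round constants, in array order.
        const unsigned int K[4] = { 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5 };

        // The packing used above: low quadword = K[1]:K[0], high quadword = K[3]:K[2].
        const __m128i packed = _mm_set_epi64x(
            (long long)0xE9B5DBA5B5C0FBCFULL, (long long)0x71374491428A2F98ULL);
        const __m128i loaded = _mm_loadu_si128((const __m128i*)K);

        // Prints "same": the two literals are just K[0..3] laid out in memory order.
        std::printf("%s\n", std::memcmp(&packed, &loaded, 16) == 0 ? "same" : "different");
        return 0;
    }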
-
-/////////////////////////////////////////////////////////
-// start of Walton/Schneiders/O'Rourke/Hovsmith's code //
-/////////////////////////////////////////////////////////
-
-#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
-
-ANONYMOUS_NAMESPACE_BEGIN
-
-void SHA256_ARM_SHA_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order)
-{
-    CRYPTOPP_ASSERT(state);
-    CRYPTOPP_ASSERT(data);
-    CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
-
-    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
-    uint32x4_t MSG0, MSG1, MSG2, MSG3;
-    uint32x4_t TMP0, TMP1, TMP2;
-
-    // Load initial values
-    STATE0 = vld1q_u32(&state[0]);
-    STATE1 = vld1q_u32(&state[4]);
-
-    while (length >= SHA256::BLOCKSIZE)
-    {
-        // Save current hash
-        ABEF_SAVE = STATE0;
-        CDGH_SAVE = STATE1;
-
-        // Load message
-        MSG0 = vld1q_u32(data +  0);
-        MSG1 = vld1q_u32(data +  4);
-        MSG2 = vld1q_u32(data +  8);
-        MSG3 = vld1q_u32(data + 12);
-
-        if (order == BIG_ENDIAN_ORDER)  // Data arrangement
-        {
-            MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
-            MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
-            MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
-            MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
-        }
-
-        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
-
-        // Rounds 0-3
-        MSG0 = vsha256su0q_u32(MSG0, MSG1);
-        TMP2 = STATE0;
-        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
-        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);;
-
-        // Rounds 4-7
-        MSG1 = vsha256su0q_u32(MSG1, MSG2);
-        TMP2 = STATE0;
-        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
-        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);;
-
-        // Rounds 8-11
-        MSG2 = vsha256su0q_u32(MSG2, MSG3);
-        TMP2 = STATE0;
-        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
-        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);;
-
-        // Rounds 12-15
-        MSG3 = vsha256su0q_u32(MSG3, MSG0);
-        TMP2 = STATE0;
-        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
-        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);;
-
-        // Rounds 16-19
-        MSG0 = vsha256su0q_u32(MSG0, MSG1);
-        TMP2 = STATE0;
-        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
-        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);;
-
-        // Rounds 20-23
-        MSG1 = vsha256su0q_u32(MSG1, MSG2);
-        TMP2 = STATE0;
-        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
-        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);;
-
-        // Rounds 24-27
-        MSG2 = vsha256su0q_u32(MSG2, MSG3);
-        TMP2 = STATE0;
-        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
-        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);;
-
-        // Rounds 28-31
-        MSG3 = vsha256su0q_u32(MSG3, MSG0);
-        TMP2 = STATE0;
-        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
-        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);;
-
-        // Rounds 32-35
-        MSG0 = vsha256su0q_u32(MSG0, MSG1);
-        TMP2 = STATE0;
-        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
-        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);;
-
-        // Rounds 36-39
-        MSG1 = vsha256su0q_u32(MSG1, MSG2);
-        TMP2 = STATE0;
-        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
-        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);;
-
-        // Rounds 40-43
-        MSG2 = vsha256su0q_u32(MSG2, MSG3);
-        TMP2 = STATE0;
-        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
-        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);;
-
-        // Rounds 44-47
-        MSG3 = vsha256su0q_u32(MSG3, MSG0);
-        TMP2 = STATE0;
-        TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
-        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);;
-
-        // Rounds 48-51
-        TMP2 = STATE0;
-        TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);;
-
-        // Rounds 52-55
-        TMP2 = STATE0;
-        TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);;
-
-        // Rounds 56-59
-        TMP2 = STATE0;
-        TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);;
-
-        // Rounds 60-63
-        TMP2 = STATE0;
-        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
-        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);;
-
-        // Add back to state
-        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
-        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
-
-        data += SHA256::BLOCKSIZE/sizeof(word32);
-        length -= SHA256::BLOCKSIZE;
-    }
-
-    // Save state
-    vst1q_u32(&state[0], STATE0);
-    vst1q_u32(&state[4], STATE1);
-}
-
-ANONYMOUS_NAMESPACE_END
-
-#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
-
-///////////////////////////////////////////////////////
-// end of Walton/Schneiders/O'Rourke/Hovsmith's code //
-///////////////////////////////////////////////////////
-
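The SHA256::Transform and HashMultipleBlocks changes below all keep the same shape: probe the CPU once, take the first available hardware kernel, and fall back to portable C++. A minimal standalone sketch of that dispatch pattern, using hypothetical stand-ins (stub probes and kernels, not the Crypto++ names; the real probes are cached CPUID/HWCAP tests):

    #include <cstdio>

    typedef unsigned int word32;

    static bool HasSHA()  { return false; }  // stub: pretend no x86 SHA extensions
    static bool HasSHA2() { return false; }  // stub: pretend no ARMv8 SHA extensions

    static void HashBlock_SHANI(word32*, const word32*) { /* hardware path */ }
    static void HashBlock_ARMV8(word32*, const word32*) { /* hardware path */ }
    static void HashBlock_CXX(word32*, const word32*)   { std::puts("C++ fallback"); }

    static void Transform(word32* state, const word32* data)
    {
        // Try the hardware kernels first; fall back to portable C++.
        if (HasSHA())  { HashBlock_SHANI(state, data); return; }
        if (HasSHA2()) { HashBlock_ARMV8(state, data); return; }
        HashBlock_CXX(state, data);
    }

    int main()
    {
        word32 state[8] = {0}, data[16] = {0};
        Transform(state, data);  // prints "C++ fallback" with the stub probes
        return 0;
    }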
 void SHA256::Transform(word32 *state, const word32 *data)
 {
     CRYPTOPP_ASSERT(state);
     CRYPTOPP_ASSERT(data);
 
-#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+#if CRYPTOPP_SHANI_AVAILABLE
     if (HasSHA())
     {
-        SHA256_SHANI_HashMultipleBlocks(state, data, SHA256::BLOCKSIZE, LITTLE_ENDIAN_ORDER);
+        SHA256_HashMultipleBlocks_SHANI(state, data, SHA256::BLOCKSIZE, LITTLE_ENDIAN_ORDER);
         return;
     }
 #endif
-#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
+#if CRYPTOPP_ARM_SHA_AVAILABLE
     if (HasSHA2())
     {
-        SHA256_ARM_SHA_HashMultipleBlocks(state, data, SHA256::BLOCKSIZE, LITTLE_ENDIAN_ORDER);
+        SHA256_HashMultipleBlocks_ARMV8(state, data, SHA256::BLOCKSIZE, LITTLE_ENDIAN_ORDER);
         return;
     }
 #endif
 
-    SHA256_CXX_HashBlock(state, data);
+    SHA256_HashBlock_CXX(state, data);
 }
 
 size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
@@ -1475,10 +660,10 @@ size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
     CRYPTOPP_ASSERT(input);
     CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
 
-#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+#if CRYPTOPP_SHANI_AVAILABLE
     if (HasSHA())
     {
-        SHA256_SHANI_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
+        SHA256_HashMultipleBlocks_SHANI(m_state, input, length, BIG_ENDIAN_ORDER);
         return length & (SHA256::BLOCKSIZE - 1);
     }
 #endif
@@ -1486,14 +671,14 @@ size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
     if (HasSSE2())
     {
         const size_t res = length & (SHA256::BLOCKSIZE - 1);
-        SHA256_SSE_HashMultipleBlocks(m_state, input, length-res);
+        SHA256_HashMultipleBlocks_SSE2(m_state, input, length-res);
         return res;
     }
 #endif
 #if CRYPTOPP_ARM_SHA_AVAILABLE
     if (HasSHA2())
     {
-        SHA256_ARM_SHA_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
+        SHA256_HashMultipleBlocks_ARMV8(m_state, input, length, BIG_ENDIAN_ORDER);
         return length & (SHA256::BLOCKSIZE - 1);
     }
 #endif
@@ -1504,12 +689,12 @@ size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
     {
         if (noReverse)
         {
-            SHA256_CXX_HashBlock(m_state, input);
+            SHA256_HashBlock_CXX(m_state, input);
         }
         else
         {
             ByteReverse(dataBuf, input, SHA256::BLOCKSIZE);
-            SHA256_CXX_HashBlock(m_state, dataBuf);
+            SHA256_HashBlock_CXX(m_state, dataBuf);
         }
 
         input += SHA256::BLOCKSIZE/sizeof(word32);
@@ -1524,10 +709,10 @@ size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
     CRYPTOPP_ASSERT(input);
     CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
 
-#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+#if CRYPTOPP_SHANI_AVAILABLE
     if (HasSHA())
     {
-        SHA256_SHANI_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
+        SHA256_HashMultipleBlocks_SHANI(m_state, input, length, BIG_ENDIAN_ORDER);
         return length & (SHA256::BLOCKSIZE - 1);
     }
 #endif
@@ -1535,14 +720,14 @@ size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
     if (HasSSE2())
     {
         const size_t res = length & (SHA256::BLOCKSIZE - 1);
-        SHA256_SSE_HashMultipleBlocks(m_state, input, length-res);
+        SHA256_HashMultipleBlocks_SSE2(m_state, input, length-res);
         return res;
     }
 #endif
-#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
+#if CRYPTOPP_ARM_SHA_AVAILABLE
     if (HasSHA2())
     {
-        SHA256_ARM_SHA_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
+        SHA256_HashMultipleBlocks_ARMV8(m_state, input, length, BIG_ENDIAN_ORDER);
         return length & (SHA256::BLOCKSIZE - 1);
     }
 #endif
@@ -1553,12 +738,12 @@ size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
     {
         if (noReverse)
         {
-            SHA256_CXX_HashBlock(m_state, input);
+            SHA256_HashBlock_CXX(m_state, input);
         }
         else
        {
             ByteReverse(dataBuf, input, SHA256::BLOCKSIZE);
-            SHA256_CXX_HashBlock(m_state, dataBuf);
+            SHA256_HashBlock_CXX(m_state, dataBuf);
         }
 
         input += SHA256::BLOCKSIZE/sizeof(word32);
@@ -1591,7 +776,7 @@ void SHA512::InitState(HashWordType *state)
 }
 
 CRYPTOPP_ALIGN_DATA(16)
-static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = {
+const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = {
     W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
     W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
     W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
@@ -1638,7 +823,7 @@ static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = {
 
 ANONYMOUS_NAMESPACE_BEGIN
 
-CRYPTOPP_NAKED void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data)
+CRYPTOPP_NAKED void CRYPTOPP_FASTCALL SHA512_HashBlock_SSE2(word64 *state, const word64 *data)
 {
 #ifdef __GNUC__
     __asm__ __volatile__
@@ -1844,9 +1029,9 @@ ANONYMOUS_NAMESPACE_BEGIN
 #define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6))
 
 #define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA512_K[i+j]+\
-	(j?blk2(i):blk0(i));d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
+    (j?blk2(i):blk0(i));d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
 
-void SHA512_CXX_HashBlock(word64 *state, const word64 *data)
+void SHA512_HashBlock_CXX(word64 *state, const word64 *data)
 {
     CRYPTOPP_ASSERT(state);
     CRYPTOPP_ASSERT(data);
@@ -1884,12 +1069,12 @@ void SHA512::Transform(word64 *state, const word64 *data)
 
 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)
     if (HasSSE2())
     {
-        SHA512_SSE2_Transform(state, data);
+        SHA512_HashBlock_SSE2(state, data);
         return;
     }
 #endif
 
-    SHA512_CXX_HashBlock(state, data);
+    SHA512_HashBlock_CXX(state, data);
 }
 
 NAMESPACE_END
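A minimal smoke test (not part of the patch) that exercises whichever SHA-256 kernel the dispatchers above select at run time. It assumes the library is built and linked, and uses the standard Crypto++ Source/Filter/Sink pipeline:

    #include "sha.h"
    #include "hex.h"
    #include "filters.h"
    #include <iostream>
    #include <string>

    int main()
    {
        CryptoPP::SHA256 hash;
        std::string digest;

        // FIPS 180 "abc" test vector; the digest must be identical
        // whether the SHANI, ARMv8, SSE2 or C++ path runs.
        CryptoPP::StringSource ss("abc", true /*pumpAll*/,
            new CryptoPP::HashFilter(hash,
                new CryptoPP::HexEncoder(
                    new CryptoPP::StringSink(digest))));

        // Expected: BA7816BF8F01CFEA414140DE5DAE2223B00361A396177A9CB410FF61F20015AD
        std::cout << digest << std::endl;
        return 0;
    }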