From b9abd7141e0fc262ad336833cf77f75f283c0e84 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton <noloader@gmail.com>
Date: Sat, 22 Apr 2017 12:19:55 -0400
Subject: [PATCH] Fix endian-reversal and loading of MSG0-MSG3

Initially we performed a 32-bit word-size ByteReverse() on the entire 64-byte buffer being hashed. Then we performed another fix-up when loading each 16-byte portion of the buffer into the SSE2 registers for SHA processing. The [undesired] consequence was that byte swapping and reversal happened twice. Worse, the call to ByteReverse() produced 16 bswap instructions instead of a single pshufb, so it was orders of magnitude slower than it needed to be.

This check-in takes the sane approach to byte reversal and swapping: it is performed once, when the message is loaded for SSE processing. As a result, SHA1 calculations drop from about 3.0 cpb (cycles per byte) to about 2.5 cpb.
---
 sha.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/sha.cpp b/sha.cpp
index 8344ffc5..843b7233 100644
--- a/sha.cpp
+++ b/sha.cpp
@@ -122,17 +122,11 @@ static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
     __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
     __m128i MASK, MSG0, MSG1, MSG2, MSG3;
 
-    // IteratedHashBase<T> has code to perform this step before HashEndianCorrectedBlock()
-    //  is called, but the design does not lend itself to optional hardware components
-    //  where SHA1 needs reversing, but SHA256 does not.
-    word32* dataBuf = const_cast<word32*>(data);
-    ByteReverse(dataBuf, dataBuf, 64);
-
     // Load initial values
     ABCD = _mm_loadu_si128((__m128i*) state);
     E0 = _mm_set_epi32(state[4], 0, 0, 0);
     ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
-    MASK = _mm_set_epi64x(W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f));
+    MASK = _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
 
     // Save current hash
     ABCD_SAVE = ABCD;