From 6e829cebeed19d5ec55655ee73de4e9d1e312736 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton <noloader@gmail.com>
Date: Wed, 29 Nov 2017 08:53:48 -0500
Subject: [PATCH] Use EPI8 Shuffle rather than Shifts and Or for rotate when
 R=8. Louis Wingers and Bryan Weeks from the Simon and Speck team offered the
 suggestion. The change saves 0.7 cpb for Speck, and 5 cpb for Simon on x86_64.
 Speck is now running very close to the Team's time for SSE4. Simon is still
 off, but we know the root cause. For Simon, the Team used a fast bit-sliced
 implementation.

---
 simon-simd.cpp | 18 ++++++++++++++++++
 speck-simd.cpp | 18 ++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/simon-simd.cpp b/simon-simd.cpp
index d39ec9cf..5cb3ca8e 100644
--- a/simon-simd.cpp
+++ b/simon-simd.cpp
@@ -471,6 +471,24 @@ inline __m128i RotateRight64(const __m128i& val)
     return _mm_or_si128(a, b);
 }
 
+// Rotate each 64-bit lane left by 8 bits with one byte shuffle; faster than two Shifts and an Or
+template <>
+inline __m128i RotateLeft64<8>(const __m128i& val)
+{
+    // _mm_set_epi8 lists bytes 15..0; result byte i of each lane reads source byte (i+7)%8
+    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
+    return _mm_shuffle_epi8(val, mask);
+}
+
+// Rotate each 64-bit lane right by 8 bits with one byte shuffle; faster than two Shifts and an Or
+template <>
+inline __m128i RotateRight64<8>(const __m128i& val)
+{
+    // _mm_set_epi8 lists bytes 15..0; result byte i of each lane reads source byte (i+1)%8
+    const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
+    return _mm_shuffle_epi8(val, mask);
+}
+
 inline __m128i SIMON128_f(const __m128i& v)
 {
     return _mm_xor_si128(RotateLeft64<2>(v),
diff --git a/speck-simd.cpp b/speck-simd.cpp
index d0b9cb1f..a04f39da 100644
--- a/speck-simd.cpp
+++ b/speck-simd.cpp
@@ -418,6 +418,24 @@ inline __m128i RotateRight64(const __m128i& val)
     return _mm_or_si128(a, b);
 }
 
+// Rotate each 64-bit lane left by 8 bits with one byte shuffle; faster than two Shifts and an Or
+template <>
+inline __m128i RotateLeft64<8>(const __m128i& val)
+{
+    // _mm_set_epi8 lists bytes 15..0; result byte i of each lane reads source byte (i+7)%8
+    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
+    return _mm_shuffle_epi8(val, mask);
+}
+
+// Rotate each 64-bit lane right by 8 bits with one byte shuffle; faster than two Shifts and an Or
+template <>
+inline __m128i RotateRight64<8>(const __m128i& val)
+{
+    // _mm_set_epi8 lists bytes 15..0; result byte i of each lane reads source byte (i+1)%8
+    const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
+    return _mm_shuffle_epi8(val, mask);
+}
+
 inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds)
 {
     // Hack ahead... Rearrange the data for vectorization. It is easier to permute