From 6e829cebeed19d5ec55655ee73de4e9d1e312736 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Wed, 29 Nov 2017 08:53:48 -0500 Subject: [PATCH] Use EPI8 Shuffle rather than Shifts and Or for rotate when R=8 Louis Wingers and Bryan Weeks from the Simon and Speck team offered the suggestion. The change saves 0.7 cpb for Speck, and 5 cpb for Simon on x86_64. Speck is now running very close to the Team's time for SSE4. Simon is still off, but we know the root cause. For Simon, the Team used a fast bit-sliced implementation. --- simon-simd.cpp | 18 ++++++++++++++++++ speck-simd.cpp | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/simon-simd.cpp b/simon-simd.cpp index d39ec9cf..5cb3ca8e 100644 --- a/simon-simd.cpp +++ b/simon-simd.cpp @@ -471,6 +471,24 @@ inline __m128i RotateRight64(const __m128i& val) return _mm_or_si128(a, b); } +// Faster than two Shifts and an Or +template <> +inline __m128i RotateLeft64<8>(const __m128i& val) +{ + CRYPTOPP_ASSERT(R < 64); + const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7); + return _mm_shuffle_epi8(val, mask); +} + +// Faster than two Shifts and an Or +template <> +inline __m128i RotateRight64<8>(const __m128i& val) +{ + CRYPTOPP_ASSERT(R < 64); + const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1); + return _mm_shuffle_epi8(val, mask); +} + inline __m128i SIMON128_f(const __m128i& v) { return _mm_xor_si128(RotateLeft64<2>(v), diff --git a/speck-simd.cpp b/speck-simd.cpp index d0b9cb1f..a04f39da 100644 --- a/speck-simd.cpp +++ b/speck-simd.cpp @@ -418,6 +418,24 @@ inline __m128i RotateRight64(const __m128i& val) return _mm_or_si128(a, b); } +// Faster than two Shifts and an Or +template <> +inline __m128i RotateLeft64<8>(const __m128i& val) +{ + CRYPTOPP_ASSERT(R < 64); + const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7); + return _mm_shuffle_epi8(val, mask); +} + +// Faster than two Shifts and an Or +template <> +inline __m128i 
RotateRight64<8>(const __m128i& val) +{ + CRYPTOPP_ASSERT(R < 64); + const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1); + return _mm_shuffle_epi8(val, mask); +} + inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds) { // Hack ahead... Rearrange the data for vectorization. It is easier to permute