From 80ae9f4f0a66596a164fee67e3fdd09628d3871f Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Fri, 22 Jun 2018 17:44:16 -0400
Subject: [PATCH] Add AVX512 rotates to RotateLeft and RotateRight templates

---
 cham-simd.cpp  | 13 +++++++++++++
 lea-simd.cpp   | 12 ++++++++++++
 simon-simd.cpp | 30 ++++++++----------------------
 speck-simd.cpp | 31 ++++++++-----------------------
 4 files changed, 41 insertions(+), 45 deletions(-)

diff --git a/cham-simd.cpp b/cham-simd.cpp
index a4eb706e..a503aa4a 100644
--- a/cham-simd.cpp
+++ b/cham-simd.cpp
@@ -22,6 +22,11 @@
 # include
 #endif
 
+#if defined(__AVX512F__) && defined(__AVX512VL__)
+# define CRYPTOPP_AVX512_ROTATE 1
+# include <immintrin.h>
+#endif
+
 ANONYMOUS_NAMESPACE_BEGIN
 
 using CryptoPP::word16;
@@ -775,15 +780,23 @@ NAMESPACE_BEGIN(W32) // CHAM128, 32-bit word size
 template <unsigned int R>
 inline __m128i RotateLeft32(const __m128i& val)
 {
+#if defined(CRYPTOPP_AVX512_ROTATE)
+    return _mm_rol_epi32(val, R);
+#else
     return _mm_or_si128(
         _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
+#endif
 }
 
 template <unsigned int R>
 inline __m128i RotateRight32(const __m128i& val)
 {
+#if defined(CRYPTOPP_AVX512_ROTATE)
+    return _mm_ror_epi32(val, R);
+#else
     return _mm_or_si128(
         _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
+#endif
 }
 
 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
diff --git a/lea-simd.cpp b/lea-simd.cpp
index 0076926d..f91702c9 100644
--- a/lea-simd.cpp
+++ b/lea-simd.cpp
@@ -22,6 +22,18 @@
 # include
 #endif
 
+#if defined(__AVX512F__) && defined(__AVX512VL__)
+# define CRYPTOPP_AVX512_ROTATE 1
+# include <immintrin.h>
+#endif
+
+// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
+// compilers don't follow ACLE conventions for the include.
+#if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
+# include <stdint.h>
+# include <arm_acle.h>
+#endif
+
 ANONYMOUS_NAMESPACE_BEGIN
 
 using CryptoPP::word32;
diff --git a/simon-simd.cpp b/simon-simd.cpp
index 91f10779..b947cdd3 100644
--- a/simon-simd.cpp
+++ b/simon-simd.cpp
@@ -43,14 +43,6 @@
 # include
 #endif
 
-// https://www.spinics.net/lists/gcchelp/msg47735.html and
-// https://www.spinics.net/lists/gcchelp/msg47749.html
-#if (CRYPTOPP_GCC_VERSION >= 40900)
-# define GCC_NO_UBSAN __attribute__ ((no_sanitize_undefined))
-#else
-# define GCC_NO_UBSAN
-#endif
-
 ANONYMOUS_NAMESPACE_BEGIN
 
 using CryptoPP::byte;
@@ -571,31 +563,26 @@ inline void Swap128(__m128i& a,__m128i& b)
 #endif
 }
 
+template <unsigned int R>
+inline __m128i RotateLeft64(const __m128i& val)
+{
 #if defined(CRYPTOPP_AVX512_ROTATE)
-template <unsigned int R>
-inline __m128i RotateLeft64(const __m128i& val)
-{
     return _mm_rol_epi64(val, R);
-}
-
-template <unsigned int R>
-inline __m128i RotateRight64(const __m128i& val)
-{
-    return _mm_ror_epi64(val, R);
-}
 #else
-template <unsigned int R>
-inline __m128i RotateLeft64(const __m128i& val)
-{
     return _mm_or_si128(
         _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
+#endif
 }
 
 template <unsigned int R>
 inline __m128i RotateRight64(const __m128i& val)
 {
+#if defined(CRYPTOPP_AVX512_ROTATE)
+    return _mm_ror_epi64(val, R);
+#else
     return _mm_or_si128(
         _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
+#endif
 }
 
 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
@@ -613,7 +600,6 @@ inline __m128i RotateRight64<8>(const __m128i& val)
     const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
     return _mm_shuffle_epi8(val, mask);
 }
-#endif // CRYPTOPP_AVX512_ROTATE
 
 inline __m128i SIMON128_f(const __m128i& v)
 {
diff --git a/speck-simd.cpp b/speck-simd.cpp
index 3666f240..f9be52e4 100644
--- a/speck-simd.cpp
+++ b/speck-simd.cpp
@@ -43,14 +43,6 @@
 # include
 #endif
 
-// https://www.spinics.net/lists/gcchelp/msg47735.html and
-// https://www.spinics.net/lists/gcchelp/msg47749.html
-#if (CRYPTOPP_GCC_VERSION >= 40900)
-# define GCC_NO_UBSAN __attribute__ ((no_sanitize_undefined))
-#else
-# define GCC_NO_UBSAN
-#endif
-
 ANONYMOUS_NAMESPACE_BEGIN
 
 using CryptoPP::byte;
@@ -507,31 +499,26 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
 # define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
 #endif
 
+template <unsigned int R>
+inline __m128i RotateLeft64(const __m128i& val)
+{
 #if defined(CRYPTOPP_AVX512_ROTATE)
-template <unsigned int R>
-inline __m128i RotateLeft64(const __m128i& val)
-{
     return _mm_rol_epi64(val, R);
-}
-
-template <unsigned int R>
-inline __m128i RotateRight64(const __m128i& val)
-{
-    return _mm_ror_epi64(val, R);
-}
 #else
-template <unsigned int R>
-inline __m128i RotateLeft64(const __m128i& val)
-{
     return _mm_or_si128(
         _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
+#endif
 }
 
 template <unsigned int R>
 inline __m128i RotateRight64(const __m128i& val)
 {
+#if defined(CRYPTOPP_AVX512_ROTATE)
+    return _mm_ror_epi64(val, R);
+#else
     return _mm_or_si128(
         _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
+#endif
 }
 
 // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
@@ -550,8 +537,6 @@ inline __m128i RotateRight64<8>(const __m128i& val)
     return _mm_shuffle_epi8(val, mask);
 }
 
-#endif // CRYPTOPP_AVX512_ROTATE
-
 inline void GCC_NO_UBSAN SPECK128_Enc_Block(__m128i &block0, __m128i &block1,
     const word64 *subkeys, unsigned int rounds)
 {
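
The dispatch the patch applies can be exercised in isolation. The sketch
below is not repository code -- the main() driver, test values, and expected
output are illustrative assumptions -- but the guard macros and intrinsics
are the ones the diff uses: built with -mavx512f -mavx512vl the rotate
compiles to a single VPROLQ via _mm_rol_epi64, while a plain SSE2 build
falls back to the two-shift-and-OR emulation. Because R is a template
parameter, it is a compile-time constant, which is what the AVX-512 rotate
intrinsics require for their immediate argument.

    // Minimal standalone sketch of the patch's compile-time dispatch.
    // Build: g++ -O2 -msse2 rotate.cpp            (shift-or fallback)
    //        g++ -O2 -mavx512f -mavx512vl rotate.cpp   (VPROLQ path)
    #include <immintrin.h>   // SSE2 and AVX-512 intrinsics
    #include <cstdint>
    #include <cstdio>

    #if defined(__AVX512F__) && defined(__AVX512VL__)
    # define CRYPTOPP_AVX512_ROTATE 1
    #endif

    template <unsigned int R>
    inline __m128i RotateLeft64(const __m128i& val)
    {
        // The shift-or fallback is only well defined for 0 < R < 64.
        static_assert(R > 0 && R < 64, "rotate amount out of range");
    #if defined(CRYPTOPP_AVX512_ROTATE)
        // AVX-512VL: one instruction rotates both 64-bit lanes.
        return _mm_rol_epi64(val, R);
    #else
        // SSE2 baseline: two shifts and an OR emulate the rotate.
        return _mm_or_si128(
            _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
    #endif
    }

    int main()
    {
        // _mm_set_epi64x(e1, e0): e1 is the high lane, e0 the low lane.
        const __m128i x = _mm_set_epi64x(0x0123456789abcdefULL,
                                         0xfedcba9876543210ULL);
        const __m128i r = RotateLeft64<8>(x);

        uint64_t lanes[2];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(lanes), r);

        // Both build modes print: 23456789abcdef01 dcba9876543210fe
        std::printf("%016llx %016llx\n",
            (unsigned long long)lanes[1], (unsigned long long)lanes[0]);
        return 0;
    }

Folding the #if inside a single pair of templates, instead of keeping two
complete sets of function definitions, is also what lets the patch drop the
trailing "#endif // CRYPTOPP_AVX512_ROTATE" in simon-simd.cpp and
speck-simd.cpp: the _mm_shuffle_epi8 specializations for 8-bit rotates (a
rotate by a multiple of 8 is just a fixed byte permutation) now remain in
effect for both the AVX-512 and the SSE builds.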