diff --git a/simon-simd.cpp b/simon-simd.cpp
index 9d60cbd8..0b963b46 100644
--- a/simon-simd.cpp
+++ b/simon-simd.cpp
@@ -96,6 +96,28 @@ inline uint64x2_t RotateRight64(const uint64x2_t& val)
     return vorrq_u64(a, b);
 }
 
+#if defined(__aarch32__) || defined(__aarch64__)
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
+{
+    const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 };
+    const uint8x16_t mask = vld1q_u8(maskb);
+    return vreinterpretq_u64_u8(
+        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
+{
+    const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 };
+    const uint8x16_t mask = vld1q_u8(maskb);
+    return vreinterpretq_u64_u8(
+        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
+}
+#endif
+
 inline uint64x2_t Shuffle64(const uint64x2_t& val)
 {
 #if defined(CRYPTOPP_LITTLE_ENDIAN)
@@ -475,7 +497,7 @@ inline __m128i RotateRight64(const __m128i& val)
     return _mm_or_si128(a, b);
 }
 
-// Faster than two Shifts and an Or
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
 template <>
 inline __m128i RotateLeft64<8>(const __m128i& val)
 {
@@ -484,7 +506,7 @@ inline __m128i RotateLeft64<8>(const __m128i& val)
     return _mm_shuffle_epi8(val, mask);
 }
 
-// Faster than two Shifts and an Or
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
 template <>
 inline __m128i RotateRight64<8>(const __m128i& val)
 {
diff --git a/speck-simd.cpp b/speck-simd.cpp
index 776f53fd..fcd91869 100644
--- a/speck-simd.cpp
+++ b/speck-simd.cpp
@@ -94,6 +94,28 @@ inline uint64x2_t RotateRight64(const uint64x2_t& val)
     return vorrq_u64(a, b);
 }
 
+#if defined(__aarch32__) || defined(__aarch64__)
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
+{
+    const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 };
+    const uint8x16_t mask = vld1q_u8(maskb);
+    return vreinterpretq_u64_u8(
+        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
+{
+    const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 };
+    const uint8x16_t mask = vld1q_u8(maskb);
+    return vreinterpretq_u64_u8(
+        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
+}
+#endif
+
 inline uint64x2_t Shuffle64(const uint64x2_t& val)
 {
 #if defined(CRYPTOPP_LITTLE_ENDIAN)
@@ -422,7 +444,7 @@ inline __m128i RotateRight64(const __m128i& val)
     return _mm_or_si128(a, b);
 }
 
-// Faster than two Shifts and an Or
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
 template <>
 inline __m128i RotateLeft64<8>(const __m128i& val)
 {
@@ -431,7 +453,7 @@ inline __m128i RotateLeft64<8>(const __m128i& val)
     return _mm_shuffle_epi8(val, mask);
 }
 
-// Faster than two Shifts and an Or
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
 template <>
 inline __m128i RotateRight64<8>(const __m128i& val)
 {
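
Note on the technique: both files specialize the rotate-by-8 case because an 8-bit rotate of a 64-bit lane is a pure byte permutation, so a single table-lookup/shuffle instruction (TBL via `vqtbl1q_u8` on NEON, PSHUFB via `_mm_shuffle_epi8` on SSSE3) replaces the generic two-shifts-and-an-or sequence. The standalone sketch below is not part of the patch; it only illustrates the equivalence on the SSSE3 path, and the `_mm_set_epi8` mask constant is my own example for a plain little-endian lane layout, which may differ from the mask values the ciphers use at that point in their data flow.

```cpp
// rot8_demo.cpp -- illustrative only; build with: g++ -O2 -mssse3 rot8_demo.cpp
#include <cstdint>
#include <cstdio>
#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8

// Reference rotate: two shifts and an or per 64-bit lane.
static inline __m128i RotateLeft64_Shifts(const __m128i v)
{
    return _mm_or_si128(_mm_slli_epi64(v, 8), _mm_srli_epi64(v, 56));
}

// Shuffle rotate: a left rotate by 8 bits moves every byte of a 64-bit lane
// up one position (byte 7 wraps to byte 0), so one PSHUFB does the whole job.
static inline __m128i RotateLeft64_Shuffle(const __m128i v)
{
    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
    return _mm_shuffle_epi8(v, mask);
}

int main()
{
    const __m128i x = _mm_set_epi64x(0x0123456789ABCDEFull, 0xFEDCBA9876543210ull);
    uint64_t a[2], b[2];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(a), RotateLeft64_Shifts(x));
    _mm_storeu_si128(reinterpret_cast<__m128i*>(b), RotateLeft64_Shuffle(x));
    // Both lines should print dcba9876543210fe 23456789abcdef01.
    std::printf("shifts : %016llx %016llx\n",
                (unsigned long long)a[0], (unsigned long long)a[1]);
    std::printf("shuffle: %016llx %016llx\n",
                (unsigned long long)b[0], (unsigned long long)b[1]);
    return 0;
}
```

One detail worth keeping in mind when comparing the two paths: `_mm_set_epi8` takes its arguments from the highest byte down, while `vld1q_u8` loads a table in memory order, so the same permutation is written with its indices reversed between the two APIs and the mask layouts are not interchangeable even when they implement the same rotate.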