Add fast rotate-by-8 for AArch32 and AArch64

pull/548/head
Jeffrey Walton 2017-11-29 12:33:34 -05:00
parent 532f13fe53
commit 39594a53b0
GPG Key ID: B36AB348921B1838
2 changed files with 48 additions and 4 deletions
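
The specializations below replace the generic two-shifts-and-an-OR rotate (visible in the RotateRight64 context lines) with a single byte-table lookup, since a rotate by 8 bits only moves whole bytes. A minimal standalone sketch of the idea on NEON, not part of the commit: the names RotLeft64 and RotLeft64By8 are illustrative, and the mask assumes the plain little-endian lane layout; the commit's own masks differ, presumably matching the byte order the surrounding SIMD code works in (note the Shuffle64 helper in the context).

// Illustrative only -- not part of the commit. Build on AArch64, e.g. g++ -O2 rot8_demo.cpp
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

// Generic rotate-left of each 64-bit lane: two shifts and an OR.
template <unsigned int R>
inline uint64x2_t RotLeft64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, R));
    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
    return vorrq_u64(a, b);
}

// Rotate-left by 8 as a single TBL byte shuffle. The mask assumes the
// plain little-endian lane layout: result byte i = source byte mask[i].
inline uint64x2_t RotLeft64By8(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);
    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}

int main()
{
    const uint64_t in[2] = { 0x0123456789ABCDEFULL, 0xFEDCBA9876543210ULL };
    const uint64x2_t v = vld1q_u64(in);

    uint64_t a[2], b[2];
    vst1q_u64(a, RotLeft64<8>(v));
    vst1q_u64(b, RotLeft64By8(v));

    // Both lines should print: 23456789abcdef01 dcba9876543210fe
    printf("%016llx %016llx\n", (unsigned long long)a[0], (unsigned long long)a[1]);
    printf("%016llx %016llx\n", (unsigned long long)b[0], (unsigned long long)b[1]);
    return 0;
}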


@@ -96,6 +96,28 @@ inline uint64x2_t RotateRight64(const uint64x2_t& val)
    return vorrq_u64(a, b);
}
#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 };
    const uint8x16_t mask = vld1q_u8(maskb);
    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 };
    const uint8x16_t mask = vld1q_u8(maskb);
    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
#endif

inline uint64x2_t Shuffle64(const uint64x2_t& val)
{
#if defined(CRYPTOPP_LITTLE_ENDIAN)
@@ -475,7 +497,7 @@ inline __m128i RotateRight64(const __m128i& val)
    return _mm_or_si128(a, b);
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline __m128i RotateLeft64<8>(const __m128i& val)
{
@@ -484,7 +506,7 @@ inline __m128i RotateLeft64<8>(const __m128i& val)
    return _mm_shuffle_epi8(val, mask);
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline __m128i RotateRight64<8>(const __m128i& val)
{


@@ -94,6 +94,28 @@ inline uint64x2_t RotateRight64(const uint64x2_t& val)
    return vorrq_u64(a, b);
}
#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 };
    const uint8x16_t mask = vld1q_u8(maskb);
    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}

// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 };
    const uint8x16_t mask = vld1q_u8(maskb);
    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
#endif

inline uint64x2_t Shuffle64(const uint64x2_t& val)
{
#if defined(CRYPTOPP_LITTLE_ENDIAN)
@@ -422,7 +444,7 @@ inline __m128i RotateRight64(const __m128i& val)
    return _mm_or_si128(a, b);
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline __m128i RotateLeft64<8>(const __m128i& val)
{
@@ -431,7 +453,7 @@ inline __m128i RotateLeft64<8>(const __m128i& val)
    return _mm_shuffle_epi8(val, mask);
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <>
inline __m128i RotateRight64<8>(const __m128i& val)
{