Add XOP-aware SIMON and SPECK

pull/730/head
Jeffrey Walton 2018-10-24 16:55:59 -04:00
parent ed4d57cecb
commit 210995b867
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
4 changed files with 84 additions and 0 deletions

View File

@ -22,6 +22,10 @@
# include <tmmintrin.h> # include <tmmintrin.h>
#endif #endif
#if defined(__XOP__)
# include <ammintrin.h>
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) #if defined(__AVX512F__) && defined(__AVX512VL__)
# define CRYPTOPP_AVX512_ROTATE 1 # define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h> # include <immintrin.h>
@ -316,6 +320,8 @@ inline __m128i RotateLeft64(const __m128i& val)
{ {
#if defined(CRYPTOPP_AVX512_ROTATE) #if defined(CRYPTOPP_AVX512_ROTATE)
return _mm_rol_epi64(val, R); return _mm_rol_epi64(val, R);
#elif defined(__XOP__)
return _mm_roti_epi64(val, R);
#else #else
return _mm_or_si128( return _mm_or_si128(
_mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R)); _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
@ -327,6 +333,8 @@ inline __m128i RotateRight64(const __m128i& val)
{ {
#if defined(CRYPTOPP_AVX512_ROTATE) #if defined(CRYPTOPP_AVX512_ROTATE)
return _mm_ror_epi64(val, R); return _mm_ror_epi64(val, R);
#elif defined(__XOP__)
return _mm_roti_epi64(val, 64-R);
#else #else
return _mm_or_si128( return _mm_or_si128(
_mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R)); _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
@ -337,16 +345,24 @@ inline __m128i RotateRight64(const __m128i& val)
template <> template <>
inline __m128i RotateLeft64<8>(const __m128i& val) inline __m128i RotateLeft64<8>(const __m128i& val)
{ {
#if defined(__XOP__)
return _mm_roti_epi64(val, 8);
#else
const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7); const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
return _mm_shuffle_epi8(val, mask); return _mm_shuffle_epi8(val, mask);
#endif
} }
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <> template <>
inline __m128i RotateRight64<8>(const __m128i& val) inline __m128i RotateRight64<8>(const __m128i& val)
{ {
#if defined(__XOP__)
return _mm_roti_epi64(val, 64-8);
#else
const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1); const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
return _mm_shuffle_epi8(val, mask); return _mm_shuffle_epi8(val, mask);
#endif
} }
inline __m128i SIMON128_f(const __m128i& v) inline __m128i SIMON128_f(const __m128i& v)

View File

@ -26,6 +26,10 @@
# include <smmintrin.h> # include <smmintrin.h>
#endif #endif
#if defined(__XOP__)
# include <ammintrin.h>
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) #if defined(__AVX512F__) && defined(__AVX512VL__)
# define CRYPTOPP_AVX512_ROTATE 1 # define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h> # include <immintrin.h>
@ -322,6 +326,8 @@ inline __m128i RotateLeft64(const __m128i& val)
{ {
#if defined(CRYPTOPP_AVX512_ROTATE) #if defined(CRYPTOPP_AVX512_ROTATE)
return _mm_rol_epi64(val, R); return _mm_rol_epi64(val, R);
#elif defined(__XOP__)
return _mm_roti_epi64(val, R);
#else #else
return _mm_or_si128( return _mm_or_si128(
_mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R)); _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
@ -333,6 +339,8 @@ inline __m128i RotateRight64(const __m128i& val)
{ {
#if defined(CRYPTOPP_AVX512_ROTATE) #if defined(CRYPTOPP_AVX512_ROTATE)
return _mm_ror_epi64(val, R); return _mm_ror_epi64(val, R);
#elif defined(__XOP__)
return _mm_roti_epi64(val, 64-R);
#else #else
return _mm_or_si128( return _mm_or_si128(
_mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R)); _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
@ -343,16 +351,24 @@ inline __m128i RotateRight64(const __m128i& val)
template <> template <>
inline __m128i RotateLeft64<8>(const __m128i& val) inline __m128i RotateLeft64<8>(const __m128i& val)
{ {
#if defined(__XOP__)
return _mm_roti_epi64(val, 8);
#else
const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7); const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
return _mm_shuffle_epi8(val, mask); return _mm_shuffle_epi8(val, mask);
#endif
} }
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <> template <>
inline __m128i RotateRight64<8>(const __m128i& val) inline __m128i RotateRight64<8>(const __m128i& val)
{ {
#if defined(__XOP__)
return _mm_roti_epi64(val, 64-8);
#else
const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1); const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
return _mm_shuffle_epi8(val, mask); return _mm_shuffle_epi8(val, mask);
#endif
} }
inline __m128i SIMON128_f(const __m128i& v) inline __m128i SIMON128_f(const __m128i& v)
@ -527,31 +543,47 @@ inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
// Rotate each 32-bit lane of 'val' left by R bits.
// R is a compile-time constant; callers use 0 < R < 32.
template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    // XOP VPROTD is a true rotate instruction.
    return _mm_roti_epi32(val, R);
#else
    // Emulate the rotate with two shifts and an OR.
    const __m128i hi = _mm_slli_epi32(val, R);
    const __m128i lo = _mm_srli_epi32(val, 32-R);
    return _mm_or_si128(lo, hi);
#endif
}
// Rotate each 32-bit lane of 'val' right by R bits.
// R is a compile-time constant; callers use 0 < R < 32.
template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    // VPROTD rotates left; a left rotate of 32-R is a right rotate of R.
    return _mm_roti_epi32(val, 32-R);
#else
    // Emulate the rotate with two shifts and an OR.
    const __m128i lo = _mm_srli_epi32(val, R);
    const __m128i hi = _mm_slli_epi32(val, 32-R);
    return _mm_or_si128(hi, lo);
#endif
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <> template <>
inline __m128i RotateLeft32<8>(const __m128i& val) inline __m128i RotateLeft32<8>(const __m128i& val)
{ {
#if defined(__XOP__)
return _mm_roti_epi32(val, 8);
#else
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
return _mm_shuffle_epi8(val, mask); return _mm_shuffle_epi8(val, mask);
#endif
} }
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <> template <>
inline __m128i RotateRight32<8>(const __m128i& val) inline __m128i RotateRight32<8>(const __m128i& val)
{ {
#if defined(__XOP__)
return _mm_roti_epi32(val, 32-8);
#else
const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1); const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
return _mm_shuffle_epi8(val, mask); return _mm_shuffle_epi8(val, mask);
#endif
} }
inline __m128i SIMON64_f(const __m128i& v) inline __m128i SIMON64_f(const __m128i& v)

View File

@ -22,6 +22,10 @@
# include <tmmintrin.h> # include <tmmintrin.h>
#endif #endif
#if defined(__XOP__)
# include <ammintrin.h>
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) #if defined(__AVX512F__) && defined(__AVX512VL__)
# define CRYPTOPP_AVX512_ROTATE 1 # define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h> # include <immintrin.h>
@ -278,6 +282,8 @@ inline __m128i RotateLeft64(const __m128i& val)
{ {
#if defined(CRYPTOPP_AVX512_ROTATE) #if defined(CRYPTOPP_AVX512_ROTATE)
return _mm_rol_epi64(val, R); return _mm_rol_epi64(val, R);
#elif defined(__XOP__)
return _mm_roti_epi64(val, R);
#else #else
return _mm_or_si128( return _mm_or_si128(
_mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R)); _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
@ -289,6 +295,8 @@ inline __m128i RotateRight64(const __m128i& val)
{ {
#if defined(CRYPTOPP_AVX512_ROTATE) #if defined(CRYPTOPP_AVX512_ROTATE)
return _mm_ror_epi64(val, R); return _mm_ror_epi64(val, R);
#elif defined(__XOP__)
return _mm_roti_epi64(val, 64-R);
#else #else
return _mm_or_si128( return _mm_or_si128(
_mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R)); _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
@ -299,16 +307,24 @@ inline __m128i RotateRight64(const __m128i& val)
template <> template <>
inline __m128i RotateLeft64<8>(const __m128i& val) inline __m128i RotateLeft64<8>(const __m128i& val)
{ {
#if defined(__XOP__)
return _mm_roti_epi64(val, 8);
#else
const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7); const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
return _mm_shuffle_epi8(val, mask); return _mm_shuffle_epi8(val, mask);
#endif
} }
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <> template <>
inline __m128i RotateRight64<8>(const __m128i& val) inline __m128i RotateRight64<8>(const __m128i& val)
{ {
#if defined(__XOP__)
return _mm_roti_epi64(val, 64-8);
#else
const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1); const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
return _mm_shuffle_epi8(val, mask); return _mm_shuffle_epi8(val, mask);
#endif
} }
inline void SPECK128_Enc_Block(__m128i &block0, __m128i &block1, inline void SPECK128_Enc_Block(__m128i &block0, __m128i &block1,

View File

@ -26,6 +26,10 @@
# include <smmintrin.h> # include <smmintrin.h>
#endif #endif
#if defined(__XOP__)
# include <ammintrin.h>
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) #if defined(__AVX512F__) && defined(__AVX512VL__)
# define CRYPTOPP_AVX512_ROTATE 1 # define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h> # include <immintrin.h>
@ -266,31 +270,47 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
// Rotate each 32-bit lane of 'val' left by R bits (compile-time R,
// callers use 0 < R < 32).
template <unsigned int R>
inline __m128i RotateLeft32(const __m128i& val)
{
#if defined(__XOP__)
    // XOP VPROTD is a hardware rotate.
    return _mm_roti_epi32(val, R);
#else
    // No SSE rotate instruction: combine a left and a right shift.
    const __m128i upper = _mm_slli_epi32(val, R);
    const __m128i lower = _mm_srli_epi32(val, 32-R);
    return _mm_or_si128(lower, upper);
#endif
}
// Rotate each 32-bit lane of 'val' right by R bits (compile-time R,
// callers use 0 < R < 32).
template <unsigned int R>
inline __m128i RotateRight32(const __m128i& val)
{
#if defined(__XOP__)
    // VPROTD rotates left; a left rotate of 32-R is a right rotate of R.
    return _mm_roti_epi32(val, 32-R);
#else
    // No SSE rotate instruction: combine a right and a left shift.
    const __m128i lower = _mm_srli_epi32(val, R);
    const __m128i upper = _mm_slli_epi32(val, 32-R);
    return _mm_or_si128(upper, lower);
#endif
}
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <> template <>
inline __m128i RotateLeft32<8>(const __m128i& val) inline __m128i RotateLeft32<8>(const __m128i& val)
{ {
#if defined(__XOP__)
return _mm_roti_epi32(val, 8);
#else
const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
return _mm_shuffle_epi8(val, mask); return _mm_shuffle_epi8(val, mask);
#endif
} }
// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. // Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
template <> template <>
inline __m128i RotateRight32<8>(const __m128i& val) inline __m128i RotateRight32<8>(const __m128i& val)
{ {
#if defined(__XOP__)
return _mm_roti_epi32(val, 32-8);
#else
const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1); const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
return _mm_shuffle_epi8(val, mask); return _mm_shuffle_epi8(val, mask);
#endif
} }
inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1, inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1,