Switch to _mm_unpacklo_epi32 and _mm_unpackhi_epi32

The manual _mm_extract_epi32 and _mm_insert_epi32 are required during setup, but we can use SSE on teardown
pull/548/head
Jeffrey Walton 2017-12-04 05:01:27 -05:00
parent cd31fa29dc
commit e9714b40d2
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
1 changed files with 14 additions and 22 deletions

View File

@ -1109,6 +1109,7 @@ inline void SPECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned i
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK64_Enc_Blocks than SPECK64_AdvancedProcessBlocks_SSSE3.
// The zero block below is a "don't care". It is present so we can vectorize.
// We really want an SSE equivalent to NEON's vunzp, but SSE does not have one.
__m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0);
__m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0);
@ -1130,9 +1131,8 @@ inline void SPECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned i
x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask);
block0 =_mm_setzero_si128();
block0 = _mm_insert_epi32(block0, _mm_extract_epi32(x1, 0), 0);
block0 = _mm_insert_epi32(block0, _mm_extract_epi32(y1, 0), 1);
// This is the SSE equivalent to ARM vzp32
block0 = _mm_unpacklo_epi32(x1, y1);
}
inline void SPECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned int rounds)
@ -1140,6 +1140,7 @@ inline void SPECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned i
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK64_Dec_Blocks than SPECK64_AdvancedProcessBlocks_SSSE3.
// The zero block below is a "don't care". It is present so we can vectorize.
// We really want an SSE equivalent to NEON's vunzp, but SSE does not have one.
__m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0);
__m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0);
@ -1161,15 +1162,15 @@ inline void SPECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned i
x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask);
block0 =_mm_setzero_si128();
block0 = _mm_insert_epi32(block0, _mm_extract_epi32(x1, 0), 0);
block0 = _mm_insert_epi32(block0, _mm_extract_epi32(y1, 0), 1);
// This is the SSE equivalent to ARM vzp32
block0 = _mm_unpacklo_epi32(x1, y1);
}
inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, const word32 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK64_Enc_Blocks than SPECK64_AdvancedProcessBlocks_SSSE3.
// We really want an SSE equivalent to NEON's vunzp, but SSE does not have one.
__m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0);
__m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0);
x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block0, 2), 1);
@ -1197,20 +1198,16 @@ inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, const word32
x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask);
block0 = _mm_insert_epi32(block0, _mm_extract_epi32(x1, 0), 0);
block0 = _mm_insert_epi32(block0, _mm_extract_epi32(y1, 0), 1);
block0 = _mm_insert_epi32(block0, _mm_extract_epi32(x1, 1), 2);
block0 = _mm_insert_epi32(block0, _mm_extract_epi32(y1, 1), 3);
block1 = _mm_insert_epi32(block1, _mm_extract_epi32(x1, 2), 0);
block1 = _mm_insert_epi32(block1, _mm_extract_epi32(y1, 2), 1);
block1 = _mm_insert_epi32(block1, _mm_extract_epi32(x1, 3), 2);
block1 = _mm_insert_epi32(block1, _mm_extract_epi32(y1, 3), 3);
// This is the SSE equivalent to ARM vzp32
block0 = _mm_unpacklo_epi32(x1, y1);
block1 = _mm_unpackhi_epi32(x1, y1);
}
inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, const word32 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK64_Dec_Blocks than SPECK64_AdvancedProcessBlocks_SSSE3.
// We really want an SSE equivalent to NEON's vunzp, but SSE does not have one.
__m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0);
__m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0);
x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block0, 2), 1);
@ -1238,14 +1235,9 @@ inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, const word32
x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask);
block0 = _mm_insert_epi32(block0, _mm_extract_epi32(x1, 0), 0);
block0 = _mm_insert_epi32(block0, _mm_extract_epi32(y1, 0), 1);
block0 = _mm_insert_epi32(block0, _mm_extract_epi32(x1, 1), 2);
block0 = _mm_insert_epi32(block0, _mm_extract_epi32(y1, 1), 3);
block1 = _mm_insert_epi32(block1, _mm_extract_epi32(x1, 2), 0);
block1 = _mm_insert_epi32(block1, _mm_extract_epi32(y1, 2), 1);
block1 = _mm_insert_epi32(block1, _mm_extract_epi32(x1, 3), 2);
block1 = _mm_insert_epi32(block1, _mm_extract_epi32(y1, 3), 3);
// This is the SSE equivalent to ARM vzp32
block0 = _mm_unpacklo_epi32(x1, y1);
block1 = _mm_unpackhi_epi32(x1, y1);
}
template <typename F1, typename F4>