Fix SPECK64 vector permutes

Thanks to Peter Cordes for the suggestion on handling the case
pull/548/head
Jeffrey Walton 2017-12-04 09:47:26 -05:00
parent 46271660a1
commit 25709d2597
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 122 additions and 62 deletions

View File

@ -288,6 +288,8 @@ ifeq ($(SUN_COMPILER),1)
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
ifeq ($(COUNT),0)
BLAKE2_FLAG = -xarch=sse4_1 -D__SSE4_1__=1
SIMON_FLAG = -xarch=sse4_1 -D__SSE4_1__=1
SPECK_FLAG = -xarch=sse4_1 -D__SSE4_1__=1
LDFLAGS += -xarch=sse4_1
endif
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")

View File

@ -137,8 +137,11 @@ inline const word64* Ptr64(const T* ptr)
inline void SPECK64_Enc_Block(uint32x4_t &block0, const word32 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK64_Enc_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3.
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const uint32x4_t zero = {0, 0, 0, 0};
const uint32x4x2_t t1 = vuzpq_u32(block0, zero);
uint32x4_t x1 = t1.val[0];
@ -161,6 +164,7 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, const word32 *subkeys, unsigne
x1 = Shuffle32(x1);
y1 = Shuffle32(y1);
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
const uint32x4x2_t t2 = vzipq_u32(x1, y1);
block0 = t2.val[0];
// block1 = t2.val[1];
@ -168,8 +172,11 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, const word32 *subkeys, unsigne
inline void SPECK64_Dec_Block(uint32x4_t &block0, const word32 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK64_Dec_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3.
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const uint32x4_t zero = {0, 0, 0, 0};
const uint32x4x2_t t1 = vuzpq_u32(block0, zero);
uint32x4_t x1 = t1.val[0];
@ -192,6 +199,7 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, const word32 *subkeys, unsigne
x1 = Shuffle32(x1);
y1 = Shuffle32(y1);
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
const uint32x4x2_t t2 = vzipq_u32(x1, y1);
block0 = t2.val[0];
// block1 = t2.val[1];
@ -199,8 +207,11 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, const word32 *subkeys, unsigne
inline void SPECK64_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const word32 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK64_Enc_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3.
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const uint32x4x2_t t1 = vuzpq_u32(block0, block1);
uint32x4_t x1 = t1.val[0];
uint32x4_t y1 = t1.val[1];
@ -222,6 +233,7 @@ inline void SPECK64_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const w
x1 = Shuffle32(x1);
y1 = Shuffle32(y1);
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
const uint32x4x2_t t2 = vzipq_u32(x1, y1);
block0 = t2.val[0];
block1 = t2.val[1];
@ -229,8 +241,11 @@ inline void SPECK64_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const w
inline void SPECK64_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const word32 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK64_Dec_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3.
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const uint32x4x2_t t1 = vuzpq_u32(block0, block1);
uint32x4_t x1 = t1.val[0];
uint32x4_t y1 = t1.val[1];
@ -252,6 +267,7 @@ inline void SPECK64_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const w
x1 = Shuffle32(x1);
y1 = Shuffle32(y1);
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
const uint32x4x2_t t2 = vzipq_u32(x1, y1);
block0 = t2.val[0];
block1 = t2.val[1];
@ -456,9 +472,11 @@ inline uint64x2_t Shuffle64(const uint64x2_t& val)
inline void SPECK128_Enc_Block(uint64x2_t &block0, const word64 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_NEON.
// The zero block below is a "don't care". It is present so we can vectorize.
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_t block1 = {0};
uint64x2_t x1 = UnpackLow64(block0, block1);
uint64x2_t y1 = UnpackHigh64(block0, block1);
@ -480,6 +498,7 @@ inline void SPECK128_Enc_Block(uint64x2_t &block0, const word64 *subkeys, unsign
x1 = Shuffle64(x1);
y1 = Shuffle64(y1);
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = UnpackLow64(x1, y1);
// block1 = UnpackHigh64(x1, y1);
}
@ -488,8 +507,11 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4,
uint64x2_t &block5, const word64 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_NEON.
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_t x1 = UnpackLow64(block0, block1);
uint64x2_t y1 = UnpackHigh64(block0, block1);
uint64x2_t x2 = UnpackLow64(block2, block3);
@ -532,6 +554,7 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
x3 = Shuffle64(x3);
y3 = Shuffle64(y3);
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = UnpackLow64(x1, y1);
block1 = UnpackHigh64(x1, y1);
block2 = UnpackLow64(x2, y2);
@ -542,9 +565,11 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
inline void SPECK128_Dec_Block(uint64x2_t &block0, const word64 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_NEON.
// The zero block below is a "don't care". It is present so we can vectorize.
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_t block1 = {0};
uint64x2_t x1 = UnpackLow64(block0, block1);
uint64x2_t y1 = UnpackHigh64(block0, block1);
@ -566,6 +591,7 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, const word64 *subkeys, unsign
x1 = Shuffle64(x1);
y1 = Shuffle64(y1);
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = UnpackLow64(x1, y1);
// block1 = UnpackHigh64(x1, y1);
}
@ -574,8 +600,11 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4,
uint64x2_t &block5, const word64 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_NEON.
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_t x1 = UnpackLow64(block0, block1);
uint64x2_t y1 = UnpackHigh64(block0, block1);
uint64x2_t x2 = UnpackLow64(block2, block3);
@ -618,6 +647,7 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
x3 = Shuffle64(x3);
y3 = Shuffle64(y3);
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = UnpackLow64(x1, y1);
block1 = UnpackHigh64(x1, y1);
block2 = UnpackLow64(x2, y2);
@ -804,9 +834,11 @@ inline __m128i RotateRight64<8>(const __m128i& val)
inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_SSSE3.
// The zero block below is a "don't care". It is present so we can vectorize.
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
__m128i block1 = _mm_setzero_si128();
__m128i x1 = _mm_unpacklo_epi64(block0, block1);
__m128i y1 = _mm_unpackhi_epi64(block0, block1);
@ -830,6 +862,7 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask);
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = _mm_unpacklo_epi64(x1, y1);
// block1 = _mm_unpackhi_epi64(x1, y1);
}
@ -837,8 +870,11 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
__m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_SSSE3.
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
__m128i x1 = _mm_unpacklo_epi64(block0, block1);
__m128i y1 = _mm_unpackhi_epi64(block0, block1);
__m128i x2 = _mm_unpacklo_epi64(block2, block3);
@ -872,6 +908,7 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask);
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = _mm_unpacklo_epi64(x1, y1);
block1 = _mm_unpackhi_epi64(x1, y1);
block2 = _mm_unpacklo_epi64(x2, y2);
@ -880,9 +917,11 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_SSSE3.
// The zero block below is a "don't care". It is present so we can vectorize.
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
__m128i block1 = _mm_setzero_si128();
__m128i x1 = _mm_unpacklo_epi64(block0, block1);
__m128i y1 = _mm_unpackhi_epi64(block0, block1);
@ -906,6 +945,7 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask);
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = _mm_unpacklo_epi64(x1, y1);
// block1 = _mm_unpackhi_epi64(x1, y1);
}
@ -913,8 +953,11 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
__m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_SSSE3.
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
__m128i x1 = _mm_unpacklo_epi64(block0, block1);
__m128i y1 = _mm_unpackhi_epi64(block0, block1);
__m128i x2 = _mm_unpacklo_epi64(block2, block3);
@ -948,6 +991,7 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask);
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = _mm_unpacklo_epi64(x1, y1);
block1 = _mm_unpackhi_epi64(x1, y1);
block2 = _mm_unpacklo_epi64(x2, y2);
@ -1106,12 +1150,17 @@ inline __m128i RotateRight32<8>(const __m128i& val)
inline void SPECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK64_Enc_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3.
// The zero block below is a "don't care". It is present so we can vectorize.
// We really want an SSE equivalent to NEON's vuzp, but SSE does not have one.
__m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0);
__m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0);
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations. Thanks to Peter
// Cordes for help with the SSE permutes below.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const __m128i zero = _mm_setzero_si128();
const __m128 t0 = _mm_castsi128_ps(block0);
const __m128 t1 = _mm_castsi128_ps(zero);
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
x1 = _mm_shuffle_epi8(x1, mask);
@ -1132,17 +1181,24 @@ inline void SPECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned i
y1 = _mm_shuffle_epi8(y1, mask);
// This is roughly the SSE equivalent to ARM NEON's vzip (vzipq_u32)
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = _mm_unpacklo_epi32(x1, y1);
// block1 = _mm_unpackhi_epi32(x1, y1);
}
inline void SPECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK64_Dec_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3.
// The zero block below is a "don't care". It is present so we can vectorize.
// We really want an SSE equivalent to NEON's vuzp, but SSE does not have one.
__m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0);
__m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0);
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations. Thanks to Peter
// Cordes for help with the SSE permutes below.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const __m128i zero = _mm_setzero_si128();
const __m128 t0 = _mm_castsi128_ps(block0);
const __m128 t1 = _mm_castsi128_ps(zero);
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
x1 = _mm_shuffle_epi8(x1, mask);
@ -1163,22 +1219,23 @@ inline void SPECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned i
y1 = _mm_shuffle_epi8(y1, mask);
// This is roughly the SSE equivalent to ARM NEON's vzip (vzipq_u32)
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = _mm_unpacklo_epi32(x1, y1);
// block1 = _mm_unpackhi_epi32(x1, y1);
}
inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, const word32 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK64_Enc_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3.
// We really want an SSE equivalent to NEON's vuzp, but SSE does not have one.
__m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0);
__m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0);
x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block0, 2), 1);
y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block0, 3), 1);
x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block1, 0), 2);
y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block1, 1), 2);
x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block1, 2), 3);
y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block1, 3), 3);
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations. Thanks to Peter
// Cordes for help with the SSE permutes below.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const __m128 t0 = _mm_castsi128_ps(block0);
const __m128 t1 = _mm_castsi128_ps(block1);
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
x1 = _mm_shuffle_epi8(x1, mask);
@ -1199,23 +1256,23 @@ inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, const word32
y1 = _mm_shuffle_epi8(y1, mask);
// This is roughly the SSE equivalent to ARM NEON's vzip (vzipq_u32)
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = _mm_unpacklo_epi32(x1, y1);
block1 = _mm_unpackhi_epi32(x1, y1);
}
inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, const word32 *subkeys, unsigned int rounds)
{
// Hack ahead... Rearrange the data for vectorization. It is easier to permute
// the data in SPECK64_Dec_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3.
// We really want an SSE equivalent to NEON's vuzp, but SSE does not have one.
__m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0);
__m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0);
x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block0, 2), 1);
y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block0, 3), 1);
x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block1, 0), 2);
y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block1, 1), 2);
x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block1, 2), 3);
y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block1, 3), 3);
// Rearrange the data for vectorization. The incoming data was read from
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations. Thanks to Peter
// Cordes for help with the SSE permutes below.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
const __m128 t0 = _mm_castsi128_ps(block0);
const __m128 t1 = _mm_castsi128_ps(block1);
__m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
__m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
x1 = _mm_shuffle_epi8(x1, mask);
@ -1236,6 +1293,7 @@ inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, const word32
y1 = _mm_shuffle_epi8(y1, mask);
// This is roughly the SSE equivalent to ARM NEON's vzip (vzipq_u32)
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
block0 = _mm_unpacklo_epi32(x1, y1);
block1 = _mm_unpackhi_epi32(x1, y1);
}