From 25709d25975d945ba7d91937724f7845aa9a0e8b Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Mon, 4 Dec 2017 09:47:26 -0500 Subject: [PATCH] Fix SPECK64 vector permutes Thanks to Peter Cordes for the suggestion on handling the case --- GNUmakefile | 2 + speck-simd.cpp | 182 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 122 insertions(+), 62 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 16f132de..9fe82502 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -288,6 +288,8 @@ ifeq ($(SUN_COMPILER),1) COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) BLAKE2_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 + SIMON_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 + SPECK_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 LDFLAGS += -xarch=sse4_1 endif COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") diff --git a/speck-simd.cpp b/speck-simd.cpp index 2195b771..f0d3ee87 100644 --- a/speck-simd.cpp +++ b/speck-simd.cpp @@ -137,8 +137,11 @@ inline const word64* Ptr64(const T* ptr) inline void SPECK64_Enc_Block(uint32x4_t &block0, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Enc_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const uint32x4_t zero = {0, 0, 0, 0}; const uint32x4x2_t t1 = vuzpq_u32(block0, zero); uint32x4_t x1 = t1.val[0]; @@ -161,6 +164,7 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, const word32 *subkeys, unsigne x1 = Shuffle32(x1); y1 = Shuffle32(y1); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] const uint32x4x2_t t2 = vzipq_u32(x1, y1); block0 = t2.val[0]; // block1 = t2.val[1]; @@ -168,8 +172,11 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, const word32 *subkeys, unsigne inline void SPECK64_Dec_Block(uint32x4_t &block0, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Dec_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const uint32x4_t zero = {0, 0, 0, 0}; const uint32x4x2_t t1 = vuzpq_u32(block0, zero); uint32x4_t x1 = t1.val[0]; @@ -192,6 +199,7 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, const word32 *subkeys, unsigne x1 = Shuffle32(x1); y1 = Shuffle32(y1); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] const uint32x4x2_t t2 = vzipq_u32(x1, y1); block0 = t2.val[0]; // block1 = t2.val[1]; @@ -199,8 +207,11 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, const word32 *subkeys, unsigne inline void SPECK64_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. 
It is easier to permute - // the data in SPECK64_Enc_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const uint32x4x2_t t1 = vuzpq_u32(block0, block1); uint32x4_t x1 = t1.val[0]; uint32x4_t y1 = t1.val[1]; @@ -222,6 +233,7 @@ inline void SPECK64_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const w x1 = Shuffle32(x1); y1 = Shuffle32(y1); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] const uint32x4x2_t t2 = vzipq_u32(x1, y1); block0 = t2.val[0]; block1 = t2.val[1]; @@ -229,8 +241,11 @@ inline void SPECK64_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const w inline void SPECK64_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Dec_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const uint32x4x2_t t1 = vuzpq_u32(block0, block1); uint32x4_t x1 = t1.val[0]; uint32x4_t y1 = t1.val[1]; @@ -252,6 +267,7 @@ inline void SPECK64_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const w x1 = Shuffle32(x1); y1 = Shuffle32(y1); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] const uint32x4x2_t t2 = vzipq_u32(x1, y1); block0 = t2.val[0]; block1 = t2.val[1]; @@ -456,9 +472,11 @@ inline uint64x2_t Shuffle64(const uint64x2_t& val) inline void SPECK128_Enc_Block(uint64x2_t &block0, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_NEON. - // The zero block below is a "don't care". It is present so we can vectorize. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... uint64x2_t block1 = {0}; uint64x2_t x1 = UnpackLow64(block0, block1); uint64x2_t y1 = UnpackHigh64(block0, block1); @@ -480,6 +498,7 @@ inline void SPECK128_Enc_Block(uint64x2_t &block0, const word64 *subkeys, unsign x1 = Shuffle64(x1); y1 = Shuffle64(y1); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = UnpackLow64(x1, y1); // block1 = UnpackHigh64(x1, y1); } @@ -488,8 +507,11 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_NEON. + // Rearrange the data for vectorization. 
The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... uint64x2_t x1 = UnpackLow64(block0, block1); uint64x2_t y1 = UnpackHigh64(block0, block1); uint64x2_t x2 = UnpackLow64(block2, block3); @@ -532,6 +554,7 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, x3 = Shuffle64(x3); y3 = Shuffle64(y3); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = UnpackLow64(x1, y1); block1 = UnpackHigh64(x1, y1); block2 = UnpackLow64(x2, y2); @@ -542,9 +565,11 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, inline void SPECK128_Dec_Block(uint64x2_t &block0, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_NEON. - // The zero block below is a "don't care". It is present so we can vectorize. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... uint64x2_t block1 = {0}; uint64x2_t x1 = UnpackLow64(block0, block1); uint64x2_t y1 = UnpackHigh64(block0, block1); @@ -566,6 +591,7 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, const word64 *subkeys, unsign x1 = Shuffle64(x1); y1 = Shuffle64(y1); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = UnpackLow64(x1, y1); // block1 = UnpackHigh64(x1, y1); } @@ -574,8 +600,11 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_NEON. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... uint64x2_t x1 = UnpackLow64(block0, block1); uint64x2_t y1 = UnpackHigh64(block0, block1); uint64x2_t x2 = UnpackLow64(block2, block3); @@ -618,6 +647,7 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, x3 = Shuffle64(x3); y3 = Shuffle64(y3); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = UnpackLow64(x1, y1); block1 = UnpackHigh64(x1, y1); block2 = UnpackLow64(x2, y2); @@ -804,9 +834,11 @@ inline __m128i RotateRight64<8>(const __m128i& val) inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_SSSE3. - // The zero block below is a "don't care". It is present so we can vectorize. + // Rearrange the data for vectorization. 
The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... __m128i block1 = _mm_setzero_si128(); __m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1); @@ -830,6 +862,7 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi64(x1, y1); // block1 = _mm_unpackhi_epi64(x1, y1); } @@ -837,8 +870,11 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_SSSE3. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... __m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i x2 = _mm_unpacklo_epi64(block2, block3); @@ -872,6 +908,7 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1, x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1); block2 = _mm_unpacklo_epi64(x2, y2); @@ -880,9 +917,11 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1, inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_SSSE3. - // The zero block below is a "don't care". It is present so we can vectorize. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... __m128i block1 = _mm_setzero_si128(); __m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1); @@ -906,6 +945,7 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi64(x1, y1); // block1 = _mm_unpackhi_epi64(x1, y1); } @@ -913,8 +953,11 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. 
It is easier to permute - // the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_SSSE3. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... __m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i x2 = _mm_unpacklo_epi64(block2, block3); __m128i y2 = _mm_unpackhi_epi64(block2, block3); @@ -948,6 +991,7 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1, x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1); block2 = _mm_unpacklo_epi64(x2, y2); @@ -1106,12 +1150,17 @@ inline __m128i RotateRight32<8>(const __m128i& val) inline void SPECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Enc_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. - // The zero block below is a "don't care". It is present so we can vectorize. - // We really want an SSE equivalent to NEON's vuzp, but SSE does not have one. - __m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0); - __m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0); + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. Thanks to Peter + // Cordes for help with the SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128i zero = _mm_setzero_si128(); + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(zero); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); x1 = _mm_shuffle_epi8(x1, mask); @@ -1132,17 +1181,24 @@ inline void SPECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned i y1 = _mm_shuffle_epi8(y1, mask); // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi32(x1, y1); + // block1 = _mm_unpackhi_epi32(x1, y1); } inline void SPECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Dec_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. - // The zero block below is a "don't care". It is present so we can vectorize. - // We really want an SSE equivalent to NEON's vuzp, but SSE does not have one. - __m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0); - __m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0); + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following.
If only a single block is available then + // a Zero block is provided to promote vectorizations. Thanks to Peter + // Cordes for help with the SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128i zero = _mm_setzero_si128(); + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(zero); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); x1 = _mm_shuffle_epi8(x1, mask); @@ -1163,22 +1219,23 @@ inline void SPECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned i y1 = _mm_shuffle_epi8(y1, mask); // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi32(x1, y1); + // block1 = _mm_unpackhi_epi32(x1, y1); } inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Enc_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. - // We really want an SSE equivalent to NEON's vuzp, but SSE does not have one. - __m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0); - __m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0); - x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block0, 2), 1); - y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block0, 3), 1); - x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block1, 0), 2); - y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block1, 1), 2); - x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block1, 2), 3); - y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block1, 3), 3); + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. Thanks to Peter + // Cordes for help with the SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); x1 = _mm_shuffle_epi8(x1, mask); @@ -1199,23 +1256,23 @@ inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, const word32 y1 = _mm_shuffle_epi8(y1, mask); // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi32(x1, y1); block1 = _mm_unpackhi_epi32(x1, y1); } inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Dec_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. - // We really want an SSE equivalent to NEON's vuzp, but SSE does not have one.
- __m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0); - __m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0); - x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block0, 2), 1); - y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block0, 3), 1); - x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block1, 0), 2); - y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block1, 1), 2); - x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block1, 2), 3); - y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block1, 3), 3); + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. Thanks to Peter + // Cordes for help with the SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); x1 = _mm_shuffle_epi8(x1, mask); @@ -1236,6 +1293,7 @@ inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, const word32 y1 = _mm_shuffle_epi8(y1, mask); // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi32(x1, y1); block1 = _mm_unpackhi_epi32(x1, y1); }
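For reference, the heart of the SSE change above is replacing the per-word _mm_extract_epi32/_mm_insert_epi32 sequences with two _mm_shuffle_ps permutes over cast registers (a rough SSE counterpart of NEON's vuzp), undone afterwards with _mm_unpacklo_epi32/_mm_unpackhi_epi32 (a rough counterpart of vzip). The standalone sketch below is not part of the patch; the function names and the small test harness are illustrative only. It shows the round trip on two blocks of four 32-bit words and should build with any SSE2-capable compiler:

// Illustrative only -- not part of the patch. Demonstrates the
// _mm_castsi128_ps/_mm_shuffle_ps deinterleave and the _mm_unpack*_epi32
// re-interleave used in the SPECK64 SSE code paths above.
#include <emmintrin.h>   // SSE2 integer intrinsics
#include <xmmintrin.h>   // _mm_shuffle_ps, _MM_SHUFFLE
#include <cstdint>
#include <cstdio>

// [A1 A2 A3 A4][B1 B2 B3 B4] => x = [A1 A3 B1 B3], y = [A2 A4 B2 B4]
static void Deinterleave(__m128i blockA, __m128i blockB, __m128i& x, __m128i& y)
{
    const __m128 tA = _mm_castsi128_ps(blockA);
    const __m128 tB = _mm_castsi128_ps(blockB);
    x = _mm_castps_si128(_mm_shuffle_ps(tA, tB, _MM_SHUFFLE(2,0,2,0)));
    y = _mm_castps_si128(_mm_shuffle_ps(tA, tB, _MM_SHUFFLE(3,1,3,1)));
}

// x = [A1 A3 B1 B3], y = [A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
static void Interleave(__m128i x, __m128i y, __m128i& blockA, __m128i& blockB)
{
    blockA = _mm_unpacklo_epi32(x, y);
    blockB = _mm_unpackhi_epi32(x, y);
}

int main()
{
    // A1..A4 = 1..4 and B1..B4 = 5..8, with A1 and B1 in the low lanes
    const __m128i a = _mm_set_epi32(4, 3, 2, 1);
    const __m128i b = _mm_set_epi32(8, 7, 6, 5);

    __m128i x, y, a2, b2;
    Deinterleave(a, b, x, y);
    Interleave(x, y, a2, b2);

    std::uint32_t out[8];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out + 0), a2);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out + 4), b2);
    for (unsigned int i = 0; i < 8; ++i)
        std::printf("%u ", static_cast<unsigned int>(out[i]));   // expect: 1 2 3 4 5 6 7 8
    std::printf("\n");
    return 0;
}

The appeal of the cast-and-shuffle approach is that a single _mm_shuffle_ps does the work of four extract/insert pairs; any floating-point domain-crossing penalty it may incur on some microarchitectures is small compared to the chain of _mm_insert_epi32/_mm_extract_epi32 instructions it replaces.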