From 25709d25975d945ba7d91937724f7845aa9a0e8b Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Mon, 4 Dec 2017 09:47:26 -0500 Subject: [PATCH] Fix SPECK64 vector permutes Thanks to Peter Cordes for the suggestion on handling the case --- GNUmakefile | 2 + speck-simd.cpp | 182 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 122 insertions(+), 62 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 16f132de..9fe82502 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -288,6 +288,8 @@ ifeq ($(SUN_COMPILER),1) COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) BLAKE2_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 + SIMON_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 + SPECK_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 LDFLAGS += -xarch=sse4_1 endif COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") diff --git a/speck-simd.cpp b/speck-simd.cpp index 2195b771..f0d3ee87 100644 --- a/speck-simd.cpp +++ b/speck-simd.cpp @@ -137,8 +137,11 @@ inline const word64* Ptr64(const T* ptr) inline void SPECK64_Enc_Block(uint32x4_t &block0, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Enc_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const uint32x4_t zero = {0, 0, 0, 0}; const uint32x4x2_t t1 = vuzpq_u32(block0, zero); uint32x4_t x1 = t1.val[0]; @@ -161,6 +164,7 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, const word32 *subkeys, unsigne x1 = Shuffle32(x1); y1 = Shuffle32(y1); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] const uint32x4x2_t t2 = vzipq_u32(x1, y1); block0 = t2.val[0]; // block1 = t2.val[1]; @@ -168,8 +172,11 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, const word32 *subkeys, unsigne inline void SPECK64_Dec_Block(uint32x4_t &block0, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Dec_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const uint32x4_t zero = {0, 0, 0, 0}; const uint32x4x2_t t1 = vuzpq_u32(block0, zero); uint32x4_t x1 = t1.val[0]; @@ -192,6 +199,7 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, const word32 *subkeys, unsigne x1 = Shuffle32(x1); y1 = Shuffle32(y1); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] const uint32x4x2_t t2 = vzipq_u32(x1, y1); block0 = t2.val[0]; // block1 = t2.val[1]; @@ -199,8 +207,11 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, const word32 *subkeys, unsigne inline void SPECK64_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. 
It is easier to permute - // the data in SPECK64_Enc_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const uint32x4x2_t t1 = vuzpq_u32(block0, block1); uint32x4_t x1 = t1.val[0]; uint32x4_t y1 = t1.val[1]; @@ -222,6 +233,7 @@ inline void SPECK64_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const w x1 = Shuffle32(x1); y1 = Shuffle32(y1); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] const uint32x4x2_t t2 = vzipq_u32(x1, y1); block0 = t2.val[0]; block1 = t2.val[1]; @@ -229,8 +241,11 @@ inline void SPECK64_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const w inline void SPECK64_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Dec_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const uint32x4x2_t t1 = vuzpq_u32(block0, block1); uint32x4_t x1 = t1.val[0]; uint32x4_t y1 = t1.val[1]; @@ -252,6 +267,7 @@ inline void SPECK64_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, const w x1 = Shuffle32(x1); y1 = Shuffle32(y1); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] const uint32x4x2_t t2 = vzipq_u32(x1, y1); block0 = t2.val[0]; block1 = t2.val[1]; @@ -456,9 +472,11 @@ inline uint64x2_t Shuffle64(const uint64x2_t& val) inline void SPECK128_Enc_Block(uint64x2_t &block0, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_NEON. - // The zero block below is a "don't care". It is present so we can vectorize. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... uint64x2_t block1 = {0}; uint64x2_t x1 = UnpackLow64(block0, block1); uint64x2_t y1 = UnpackHigh64(block0, block1); @@ -480,6 +498,7 @@ inline void SPECK128_Enc_Block(uint64x2_t &block0, const word64 *subkeys, unsign x1 = Shuffle64(x1); y1 = Shuffle64(y1); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = UnpackLow64(x1, y1); // block1 = UnpackHigh64(x1, y1); } @@ -488,8 +507,11 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_NEON. + // Rearrange the data for vectorization. 
The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... uint64x2_t x1 = UnpackLow64(block0, block1); uint64x2_t y1 = UnpackHigh64(block0, block1); uint64x2_t x2 = UnpackLow64(block2, block3); @@ -532,6 +554,7 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, x3 = Shuffle64(x3); y3 = Shuffle64(y3); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = UnpackLow64(x1, y1); block1 = UnpackHigh64(x1, y1); block2 = UnpackLow64(x2, y2); @@ -542,9 +565,11 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, inline void SPECK128_Dec_Block(uint64x2_t &block0, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_NEON. - // The zero block below is a "don't care". It is present so we can vectorize. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... uint64x2_t block1 = {0}; uint64x2_t x1 = UnpackLow64(block0, block1); uint64x2_t y1 = UnpackHigh64(block0, block1); @@ -566,6 +591,7 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, const word64 *subkeys, unsign x1 = Shuffle64(x1); y1 = Shuffle64(y1); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = UnpackLow64(x1, y1); // block1 = UnpackHigh64(x1, y1); } @@ -574,8 +600,11 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_NEON. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... uint64x2_t x1 = UnpackLow64(block0, block1); uint64x2_t y1 = UnpackHigh64(block0, block1); uint64x2_t x2 = UnpackLow64(block2, block3); @@ -618,6 +647,7 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, x3 = Shuffle64(x3); y3 = Shuffle64(y3); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = UnpackLow64(x1, y1); block1 = UnpackHigh64(x1, y1); block2 = UnpackLow64(x2, y2); @@ -804,9 +834,11 @@ inline __m128i RotateRight64<8>(const __m128i& val) inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_SSSE3. - // The zero block below is a "don't care". It is present so we can vectorize. + // Rearrange the data for vectorization. 
The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... __m128i block1 = _mm_setzero_si128(); __m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1); @@ -830,6 +862,7 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi64(x1, y1); // block1 = _mm_unpackhi_epi64(x1, y1); } @@ -837,8 +870,11 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_SSSE3. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... __m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i x2 = _mm_unpacklo_epi64(block2, block3); @@ -872,6 +908,7 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1, x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1); block2 = _mm_unpacklo_epi64(x2, y2); @@ -880,9 +917,11 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1, inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_SSSE3. - // The zero block below is a "don't care". It is present so we can vectorize. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... __m128i block1 = _mm_setzero_si128(); __m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1); @@ -906,6 +945,7 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi64(x1, y1); // block1 = _mm_unpackhi_epi64(x1, y1); } @@ -913,8 +953,11 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. 
It is easier to permute - // the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_SSSE3. + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... __m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i x2 = _mm_unpacklo_epi64(block2, block3); __m128i y2 = _mm_unpackhi_epi64(block2, block3); @@ -948,6 +991,7 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1, x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1); block2 = _mm_unpacklo_epi64(x2, y2); @@ -1106,12 +1150,17 @@ inline __m128i RotateRight32<8>(const __m128i& val) inline void SPECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Enc_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. - // The zero block below is a "don't care". It is present so we can vectorize. - // We really want an SSE equivalent to NEON's vuzp, but SSE does not have one. - __m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0); - __m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0); + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. Thanks to Peter + // Cordes for help with the SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128i zero = _mm_setzero_si128(); + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(zero); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); x1 = _mm_shuffle_epi8(x1, mask); @@ -1132,17 +1181,24 @@ inline void SPECK64_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned i y1 = _mm_shuffle_epi8(y1, mask); // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi32(x1, y1); + // block1 = _mm_unpackhi_epi32(x1, y1); } inline void SPECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Dec_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. - // The zero block below is a "don't care". It is present so we can vectorize. - // We really want an SSE equivalent to NEON's vuzp, but SSE does not have one. - __m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0); - __m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0); + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following.
If only a single block is available then + // a Zero block is provided to promote vectorizations. Thanks to Peter + // Cordes for help with the SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128i zero = _mm_setzero_si128(); + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(zero); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); x1 = _mm_shuffle_epi8(x1, mask); @@ -1163,22 +1219,23 @@ inline void SPECK64_Dec_Block(__m128i &block0, const word32 *subkeys, unsigned i y1 = _mm_shuffle_epi8(y1, mask); // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi32(x1, y1); + // block1 = _mm_unpackhi_epi32(x1, y1); } inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Enc_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. - // We really want an SSE equivalent to NEON's vuzp, but SSE does not have one. - __m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0); - __m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0); - x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block0, 2), 1); - y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block0, 3), 1); - x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block1, 0), 2); - y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block1, 1), 2); - x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block1, 2), 3); - y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block1, 3), 3); + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. Thanks to Peter + // Cordes for help with the SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); x1 = _mm_shuffle_epi8(x1, mask); @@ -1199,23 +1256,23 @@ inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, const word32 y1 = _mm_shuffle_epi8(y1, mask); // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi32(x1, y1); block1 = _mm_unpackhi_epi32(x1, y1); } inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, const word32 *subkeys, unsigned int rounds) { - // Hack ahead... Rearrange the data for vectorization. It is easier to permute - // the data in SPECK64_Dec_Blocks then SPECK64_AdvancedProcessBlocks_SSSE3. - // We really want an SSE equivalent to NEON's vuzp, but SSE does not have one.
- __m128i x1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 0), 0); - __m128i y1 = _mm_insert_epi32(_mm_setzero_si128(), _mm_extract_epi32(block0, 1), 0); - x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block0, 2), 1); - y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block0, 3), 1); - x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block1, 0), 2); - y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block1, 1), 2); - x1 = _mm_insert_epi32(x1, _mm_extract_epi32(block1, 2), 3); - y1 = _mm_insert_epi32(y1, _mm_extract_epi32(block1, 3), 3); + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. Thanks to Peter + // Cordes for help with the SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); x1 = _mm_shuffle_epi8(x1, mask); @@ -1236,6 +1293,7 @@ inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, const word32 y1 = _mm_shuffle_epi8(y1, mask); // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] block0 = _mm_unpacklo_epi32(x1, y1); block1 = _mm_unpackhi_epi32(x1, y1); }
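For reference, the heart of the SSE change above is replacing the per-word _mm_extract_epi32/_mm_insert_epi32 sequences with two _mm_shuffle_ps permutes over cast registers (a rough SSE counterpart of NEON's vuzp), undone afterwards with _mm_unpacklo_epi32/_mm_unpackhi_epi32 (a rough counterpart of vzip). The standalone sketch below is not part of the patch; the function names and the small test harness are illustrative only. It shows the round trip on two blocks of four 32-bit words and should build with any SSE2-capable compiler:

// Illustrative only -- not part of the patch. Demonstrates the
// _mm_castsi128_ps/_mm_shuffle_ps deinterleave and the _mm_unpack*_epi32
// re-interleave used in the SPECK64 SSE code paths above.
#include <emmintrin.h>   // SSE2 integer intrinsics
#include <xmmintrin.h>   // _mm_shuffle_ps, _MM_SHUFFLE
#include <cstdint>
#include <cstdio>

// [A1 A2 A3 A4][B1 B2 B3 B4] => x = [A1 A3 B1 B3], y = [A2 A4 B2 B4]
static void Deinterleave(__m128i blockA, __m128i blockB, __m128i& x, __m128i& y)
{
    const __m128 tA = _mm_castsi128_ps(blockA);
    const __m128 tB = _mm_castsi128_ps(blockB);
    x = _mm_castps_si128(_mm_shuffle_ps(tA, tB, _MM_SHUFFLE(2,0,2,0)));
    y = _mm_castps_si128(_mm_shuffle_ps(tA, tB, _MM_SHUFFLE(3,1,3,1)));
}

// x = [A1 A3 B1 B3], y = [A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
static void Interleave(__m128i x, __m128i y, __m128i& blockA, __m128i& blockB)
{
    blockA = _mm_unpacklo_epi32(x, y);
    blockB = _mm_unpackhi_epi32(x, y);
}

int main()
{
    // A1..A4 = 1..4 and B1..B4 = 5..8, with A1 and B1 in the low lanes
    const __m128i a = _mm_set_epi32(4, 3, 2, 1);
    const __m128i b = _mm_set_epi32(8, 7, 6, 5);

    __m128i x, y, a2, b2;
    Deinterleave(a, b, x, y);
    Interleave(x, y, a2, b2);

    std::uint32_t out[8];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out + 0), a2);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out + 4), b2);
    for (unsigned int i = 0; i < 8; ++i)
        std::printf("%u ", static_cast<unsigned int>(out[i]));   // expect: 1 2 3 4 5 6 7 8
    std::printf("\n");
    return 0;
}

The appeal of the cast-and-shuffle approach is that a single _mm_shuffle_ps does the work of four extract/insert pairs; any floating-point domain-crossing penalty it may incur on some microarchitectures is small compared to the chain of _mm_insert_epi32/_mm_extract_epi32 instructions it replaces.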