From 076937eb81b8bf1a7703894efee6fb90247874cc Mon Sep 17 00:00:00 2001
From: Jeffrey Walton <noloader@gmail.com>
Date: Mon, 4 Dec 2017 12:31:32 -0500
Subject: [PATCH] Update comments for vector permutes in SPECK-128; store SPECK-64 output via memcpy

---
 speck-simd.cpp | 47 ++++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/speck-simd.cpp b/speck-simd.cpp
index f0d3ee87..75907f47 100644
--- a/speck-simd.cpp
+++ b/speck-simd.cpp
@@ -380,10 +380,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F1 func1, F4 func4,
             block = veorq_u32(block, x);
         }
 
-        const word32 t0 = vgetq_lane_u32(block, 0);
-        std::memcpy(Ptr32(outBlocks)+0, &t0, 4);
-        const word32 t1 = vgetq_lane_u32(block, 1);
-        std::memcpy(Ptr32(outBlocks)+1, &t1, 4);
+        word32 t[2];
+        t[0] = vgetq_lane_u32(block, 0);
+        t[1] = vgetq_lane_u32(block, 1);
+        std::memcpy(outBlocks, t, sizeof(t));
 
         inBlocks += inIncrement;
         outBlocks += outIncrement;
@@ -476,7 +476,7 @@ inline void SPECK128_Enc_Block(uint64x2_t &block0, const word64 *subkeys, unsign
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
+    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     uint64x2_t block1 = {0};
     uint64x2_t x1 = UnpackLow64(block0, block1);
     uint64x2_t y1 = UnpackHigh64(block0, block1);
@@ -498,7 +498,7 @@ inline void SPECK128_Enc_Block(uint64x2_t &block0, const word64 *subkeys, unsign
     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);
 
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
     block0 = UnpackLow64(x1, y1);
     // block1 = UnpackHigh64(x1, y1);
 }
@@ -511,7 +511,7 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
+    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     uint64x2_t x1 = UnpackLow64(block0, block1);
     uint64x2_t y1 = UnpackHigh64(block0, block1);
     uint64x2_t x2 = UnpackLow64(block2, block3);
@@ -554,7 +554,7 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     x3 = Shuffle64(x3);
     y3 = Shuffle64(y3);
 
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
     block0 = UnpackLow64(x1, y1);
     block1 = UnpackHigh64(x1, y1);
     block2 = UnpackLow64(x2, y2);
@@ -569,7 +569,7 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, const word64 *subkeys, unsign
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
+    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     uint64x2_t block1 = {0};
     uint64x2_t x1 = UnpackLow64(block0, block1);
     uint64x2_t y1 = UnpackHigh64(block0, block1);
@@ -591,7 +591,7 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, const word64 *subkeys, unsign
     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);
 
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
     block0 = UnpackLow64(x1, y1);
     // block1 = UnpackHigh64(x1, y1);
 }
@@ -604,7 +604,7 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
+    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     uint64x2_t x1 = UnpackLow64(block0, block1);
     uint64x2_t y1 = UnpackHigh64(block0, block1);
     uint64x2_t x2 = UnpackLow64(block2, block3);
@@ -647,7 +647,7 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     x3 = Shuffle64(x3);
     y3 = Shuffle64(y3);
 
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
     block0 = UnpackLow64(x1, y1);
     block1 = UnpackHigh64(x1, y1);
     block2 = UnpackLow64(x2, y2);
@@ -838,7 +838,7 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
+    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i block1 = _mm_setzero_si128();
     __m128i x1 = _mm_unpacklo_epi64(block0, block1);
     __m128i y1 = _mm_unpackhi_epi64(block0, block1);
@@ -862,7 +862,7 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
     x1 = _mm_shuffle_epi8(x1, mask);
     y1 = _mm_shuffle_epi8(y1, mask);
 
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
     block0 = _mm_unpacklo_epi64(x1, y1);
     // block1 = _mm_unpackhi_epi64(x1, y1);
 }
@@ -874,7 +874,7 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
+    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpacklo_epi64(block0, block1);
     __m128i y1 = _mm_unpackhi_epi64(block0, block1);
     __m128i x2 = _mm_unpacklo_epi64(block2, block3);
@@ -908,7 +908,7 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
     x2 = _mm_shuffle_epi8(x2, mask);
     y2 = _mm_shuffle_epi8(y2, mask);
 
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
     block0 = _mm_unpacklo_epi64(x1, y1);
     block1 = _mm_unpackhi_epi64(x1, y1);
     block2 = _mm_unpacklo_epi64(x2, y2);
@@ -921,7 +921,7 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
+    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i block1 = _mm_setzero_si128();
     __m128i x1 = _mm_unpacklo_epi64(block0, block1);
     __m128i y1 = _mm_unpackhi_epi64(block0, block1);
@@ -945,7 +945,7 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
     x1 = _mm_shuffle_epi8(x1, mask);
     y1 = _mm_shuffle_epi8(y1, mask);
 
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
     block0 = _mm_unpacklo_epi64(x1, y1);
     // block1 = _mm_unpackhi_epi64(x1, y1);
 }
@@ -957,7 +957,7 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
-    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
+    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpacklo_epi64(block0, block1);
     __m128i y1 = _mm_unpackhi_epi64(block0, block1);
     __m128i x2 = _mm_unpacklo_epi64(block2, block3);
@@ -991,7 +991,7 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
     x2 = _mm_shuffle_epi8(x2, mask);
     y2 = _mm_shuffle_epi8(y2, mask);
 
-    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
+    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
     block0 = _mm_unpacklo_epi64(x1, y1);
     block1 = _mm_unpackhi_epi64(x1, y1);
     block2 = _mm_unpacklo_epi64(x2, y2);
@@ -1402,9 +1402,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F1 func1, F4 func4,
             block = _mm_xor_si128(block, _mm_insert_epi32(x, xorPtr[1], 1));
         }
 
-        word32* outPtr = reinterpret_cast<word32*>(outBlocks);
-        outPtr[0] = _mm_extract_epi32(block, 0);
-        outPtr[1] = _mm_extract_epi32(block, 1);
+        word32 t[2];
+        t[0] = _mm_extract_epi32(block, 0);
+        t[1] = _mm_extract_epi32(block, 1);
+        std::memcpy(outBlocks, t, sizeof(t));
 
         inBlocks += inIncrement;
         outBlocks += outIncrement;