diff --git a/simon-simd.cpp b/simon-simd.cpp
index 99a6acf9..997cfaab 100644
--- a/simon-simd.cpp
+++ b/simon-simd.cpp
@@ -140,10 +140,6 @@ inline uint32x4_t SIMON64_f(const uint32x4_t& val)
 inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following. If only a single block is available then
-    // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
     uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
@@ -173,10 +169,6 @@ inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
 inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following. If only a single block is available then
-    // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
     uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
@@ -208,10 +200,6 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following. If only a single block is available then
-    // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
     uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
@@ -256,10 +244,6 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following. If only a single block is available then
-    // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     uint32x4_t x1 = vuzpq_u32(block0, block1).val[1];
     uint32x4_t y1 = vuzpq_u32(block0, block1).val[0];
@@ -380,9 +364,6 @@ inline uint64x2_t SIMON128_f(const uint64x2_t& val)
 inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     uint64x2_t x1 = UnpackHigh64(block0, block1);
     uint64x2_t y1 = UnpackLow64(block0, block1);
@@ -413,9 +394,6 @@ inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     uint64x2_t x1 = UnpackHigh64(block0, block1);
     uint64x2_t y1 = UnpackLow64(block0, block1);
@@ -459,9 +437,6 @@ inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
 inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     uint64x2_t x1 = UnpackHigh64(block0, block1);
     uint64x2_t y1 = UnpackLow64(block0, block1);
@@ -493,9 +468,6 @@ inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
     uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     uint64x2_t x1 = UnpackHigh64(block0, block1);
     uint64x2_t y1 = UnpackLow64(block0, block1);
@@ -617,9 +589,6 @@ inline __m128i SIMON128_f(const __m128i& v)
 inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpackhi_epi64(block0, block1);
     __m128i y1 = _mm_unpacklo_epi64(block0, block1);
@@ -653,9 +622,6 @@ inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpackhi_epi64(block0, block1);
     __m128i y1 = _mm_unpacklo_epi64(block0, block1);
@@ -701,9 +667,6 @@ inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
 inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpackhi_epi64(block0, block1);
     __m128i y1 = _mm_unpacklo_epi64(block0, block1);
@@ -738,9 +701,6 @@ inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
     const word64 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following.
     // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
     __m128i x1 = _mm_unpackhi_epi64(block0, block1);
     __m128i y1 = _mm_unpacklo_epi64(block0, block1);
@@ -828,10 +788,6 @@ inline __m128i SIMON64_f(const __m128i& v)
 inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following. Thanks to Peter Cordes for help with the
-    // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     const __m128 t0 = _mm_castsi128_ps(block0);
     const __m128 t1 = _mm_castsi128_ps(block1);
@@ -854,7 +810,6 @@ inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
         Swap128(x1, y1);
     }
 
-    // The is roughly the SSE equivalent to ARM vzp32
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
     block0 = _mm_unpacklo_epi32(y1, x1);
     block1 = _mm_unpackhi_epi32(y1, x1);
@@ -863,10 +818,6 @@ inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
 inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following. Thanks to Peter Cordes for help with the
-    // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     const __m128 t0 = _mm_castsi128_ps(block0);
     const __m128 t1 = _mm_castsi128_ps(block1);
@@ -890,7 +841,6 @@ inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
         y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2);
     }
 
-    // The is roughly the SSE equivalent to ARM vzp32
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
     block0 = _mm_unpacklo_epi32(y1, x1);
     block1 = _mm_unpackhi_epi32(y1, x1);
@@ -900,10 +850,6 @@ inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
     __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following. Thanks to Peter Cordes for help with the
-    // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     const __m128 t0 = _mm_castsi128_ps(block0);
     const __m128 t1 = _mm_castsi128_ps(block1);
@@ -942,7 +888,6 @@ inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1,
         Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
     }
 
-    // The is roughly the SSE equivalent to ARM vzp32
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
     block0 = _mm_unpacklo_epi32(y1, x1);
     block1 = _mm_unpackhi_epi32(y1, x1);
@@ -956,10 +901,6 @@ inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
     __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
     const word32 *subkeys, unsigned int rounds)
 {
-    // Rearrange the data for vectorization. The incoming data was read into
-    // a little-endian word array. Depending on the number of blocks it needs to
-    // be permuted to the following. Thanks to Peter Cordes for help with the
-    // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     const __m128 t0 = _mm_castsi128_ps(block0);
     const __m128 t1 = _mm_castsi128_ps(block1);
@@ -999,7 +940,6 @@ inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
         y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2);
     }
 
-    // The is roughly the SSE equivalent to ARM vzp32
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
     block0 = _mm_unpacklo_epi32(y1, x1);
     block1 = _mm_unpackhi_epi32(y1, x1);