Fix Speck-64 CTR mode
It looks like the delay was due to some GCC 7 issue. We had to disable parallel blocks on Aarch64 with GCC 7. We may be running out of registers, and that could be causing problems. It looks like GCC uses up to v30.
pull/548/head
parent
02037b5ce6
commit
148202369b
|
|
@ -383,10 +383,10 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||||
{
|
{
|
||||||
// For 64-bit block ciphers we need to load the initial single CTR block.
|
// For 64-bit block ciphers we need to load the initial single CTR block.
|
||||||
// After the dup load we have two counters in the XMM word. Then we need
|
// After the dup load we have two counters in the NEON word. Then we need
|
||||||
// to increment the low ctr by 0 and the high ctr by 1.
|
// to increment the low ctr by 0 and the high ctr by 1.
|
||||||
block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
|
const uint8x8_t c = vld1_u8(inBlocks);
|
||||||
vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
|
block0 = vaddq_u32(be1, vreinterpretq_u32_u8(vcombine_u8(c,c)));
|
||||||
|
|
||||||
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
||||||
block1 = vaddq_u32(be2, block0);
|
block1 = vaddq_u32(be2, block0);
|
||||||
|
|
@ -454,10 +454,10 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||||
{
|
{
|
||||||
// For 64-bit block ciphers we need to load the initial single CTR block.
|
// For 64-bit block ciphers we need to load the initial single CTR block.
|
||||||
// After the dup load we have two counters in the XMM word. Then we need
|
// After the dup load we have two counters in the NEON word. Then we need
|
||||||
// to increment the low ctr by 0 and the high ctr by 1.
|
// to increment the low ctr by 0 and the high ctr by 1.
|
||||||
block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
|
const uint8x8_t c = vld1_u8(inBlocks);
|
||||||
vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
|
block0 = vaddq_u32(be1, vreinterpretq_u32_u8(vcombine_u8(c,c)));
|
||||||
|
|
||||||
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
||||||
block1 = vaddq_u32(be2, block0);
|
block1 = vaddq_u32(be2, block0);
|
||||||
|
|
@ -522,14 +522,15 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||||
|
|
||||||
while (length >= blockSize)
|
while (length >= blockSize)
|
||||||
{
|
{
|
||||||
uint32x4_t zero = vld1q_u32(s_zero);
|
uint32x4_t block, zero = vld1q_u32(s_zero);
|
||||||
uint32x4_t block = vreinterpretq_u32_u64(vld1q_dup_u64(
|
|
||||||
reinterpret_cast<const word64*>(inBlocks)));
|
const uint8x8_t v = vld1_u8(inBlocks);
|
||||||
|
block = vreinterpretq_u32_u8(vcombine_u8(v,v));
|
||||||
|
|
||||||
if (flags & BlockTransformation::BT_XorInput)
|
if (flags & BlockTransformation::BT_XorInput)
|
||||||
{
|
{
|
||||||
block = veorq_u32(block, vreinterpretq_u32_u64(
|
const uint8x8_t x = vld1_u8(xorBlocks);
|
||||||
vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
|
block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||||
|
|
@ -539,8 +540,8 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||||
|
|
||||||
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
|
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
|
||||||
{
|
{
|
||||||
block = veorq_u32(block, vreinterpretq_u32_u64(
|
const uint8x8_t x = vld1_u8(xorBlocks);
|
||||||
vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
|
block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
|
||||||
}
|
}
|
||||||
|
|
||||||
vst1_u8(const_cast<byte*>(outBlocks),
|
vst1_u8(const_cast<byte*>(outBlocks),
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,15 @@
|
||||||
# include <immintrin.h>
|
# include <immintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Weird GCC 7.0 issue on GCC118 which is Aarch64. The 2x blocks produce
|
||||||
|
// a bad result. The same code works fine with Speck (it was copied/pasted).
|
||||||
|
// It may affect more versions, but we can only test GCC 7.2, 4.8 and 4.9.
|
||||||
|
#if defined(__aarch32__) || defined(__aarch64__)
|
||||||
|
# if defined(__GNUC__) && (__GNUC__ >= 7)
|
||||||
|
# define WORKAROUND_GCC_7_ISSUE 1
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
|
// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
|
||||||
#define M128_CAST(x) ((__m128i *)(void *)(x))
|
#define M128_CAST(x) ((__m128i *)(void *)(x))
|
||||||
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
|
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
|
||||||
|
|
@ -338,6 +347,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||||
outIncrement = 0-outIncrement;
|
outIncrement = 0-outIncrement;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(WORKAROUND_GCC_7_ISSUE)
|
||||||
|
flags &= ~BlockTransformation::BT_AllowParallel;
|
||||||
|
#endif
|
||||||
|
|
||||||
if (flags & BlockTransformation::BT_AllowParallel)
|
if (flags & BlockTransformation::BT_AllowParallel)
|
||||||
{
|
{
|
||||||
// Load these magic values once. Analysis claims be1 and be2
|
// Load these magic values once. Analysis claims be1 and be2
|
||||||
|
|
@ -355,10 +368,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||||
{
|
{
|
||||||
// For 64-bit block ciphers we need to load the initial single CTR block.
|
// For 64-bit block ciphers we need to load the initial single CTR block.
|
||||||
// After the dup load we have two counters in the XMM word. Then we need
|
// After the dup load we have two counters in the NEON word. Then we need
|
||||||
// to increment the low ctr by 0 and the high ctr by 1.
|
// to increment the low ctr by 0 and the high ctr by 1.
|
||||||
block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
|
const uint8x8_t c = vld1_u8(inBlocks);
|
||||||
vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
|
block0 = vaddq_u32(be1, vreinterpretq_u32_u8(vcombine_u8(c,c)));
|
||||||
|
|
||||||
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
||||||
block1 = vaddq_u32(be2, block0);
|
block1 = vaddq_u32(be2, block0);
|
||||||
|
|
@ -426,10 +439,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||||
{
|
{
|
||||||
// For 64-bit block ciphers we need to load the initial single CTR block.
|
// For 64-bit block ciphers we need to load the initial single CTR block.
|
||||||
// After the dup load we have two counters in the XMM word. Then we need
|
// After the dup load we have two counters in the NEON word. Then we need
|
||||||
// to increment the low ctr by 0 and the high ctr by 1.
|
// to increment the low ctr by 0 and the high ctr by 1.
|
||||||
block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
|
const uint8x8_t c = vld1_u8(inBlocks);
|
||||||
vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
|
block0 = vaddq_u32(be1, vreinterpretq_u32_u8(vcombine_u8(c,c)));
|
||||||
|
|
||||||
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
// After initial increment of {0,1} remaining counters increment by {1,1}.
|
||||||
block1 = vaddq_u32(be2, block0);
|
block1 = vaddq_u32(be2, block0);
|
||||||
|
|
@ -494,14 +507,15 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||||
|
|
||||||
while (length >= blockSize)
|
while (length >= blockSize)
|
||||||
{
|
{
|
||||||
uint32x4_t zero = vld1q_u32(s_zero);
|
uint32x4_t block, zero = vld1q_u32(s_zero);
|
||||||
uint32x4_t block = vreinterpretq_u32_u64(vld1q_dup_u64(
|
|
||||||
reinterpret_cast<const word64*>(inBlocks)));
|
const uint8x8_t v = vld1_u8(inBlocks);
|
||||||
|
block = vreinterpretq_u32_u8(vcombine_u8(v,v));
|
||||||
|
|
||||||
if (flags & BlockTransformation::BT_XorInput)
|
if (flags & BlockTransformation::BT_XorInput)
|
||||||
{
|
{
|
||||||
block = veorq_u32(block, vreinterpretq_u32_u64(
|
const uint8x8_t x = vld1_u8(xorBlocks);
|
||||||
vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
|
block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
if (flags & BlockTransformation::BT_InBlockIsCounter)
|
||||||
|
|
@ -511,8 +525,8 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
|
||||||
|
|
||||||
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
|
if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
|
||||||
{
|
{
|
||||||
block = veorq_u32(block, vreinterpretq_u32_u64(
|
const uint8x8_t x = vld1_u8(xorBlocks);
|
||||||
vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
|
block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
|
||||||
}
|
}
|
||||||
|
|
||||||
vst1_u8(const_cast<byte*>(outBlocks),
|
vst1_u8(const_cast<byte*>(outBlocks),
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue