From 81da61fe7b32ab771d121cc8c889c61dfb2e60a0 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sun, 26 May 2019 22:10:26 -0400 Subject: [PATCH] Breakout sha_block_data_order and sha_block_data_order_neon (GH #847) --- cpu.cpp | 14 -------------- sha.cpp | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ sha1_armv4.S | 21 ++++++--------------- sha256_armv4.S | 19 ++++--------------- sha512_armv4.S | 21 ++++++--------------- 5 files changed, 67 insertions(+), 59 deletions(-) diff --git a/cpu.cpp b/cpu.cpp index 28792d3e..25b50c71 100644 --- a/cpu.cpp +++ b/cpu.cpp @@ -61,16 +61,6 @@ unsigned long int getauxval(unsigned long int) { return 0; } extern "C" unsigned long long __fastcall XGETBV64(unsigned int); #endif -#if CRYPTOGAMS_ARM_SHA1 || CRYPTOGAMS_ARM_SHA256 || CRYPTOGAMS_ARM_SHA512 -// The Cryptogams code uses a global variable named CRYPTOGAMS_armcap_P -// for capabilities like ARMv7 and NEON. We allocate storage for -// CRYPTOGAMS_armcap_P, and the Cryptogams object files use our symbol. -// The Cryptogams code defines ARMV7_NEON as 1<<0, so we need to set -// the bits accordingly in CRYPTOGAMS_armcap_P. -extern "C" unsigned int CRYPTOGAMS_armcap_P; -unsigned int CRYPTOGAMS_armcap_P = 0; -#endif - ANONYMOUS_NAMESPACE_BEGIN #if defined(__APPLE__) @@ -860,10 +850,6 @@ void DetectArmFeatures() g_hasSM3 = CPU_QuerySM3(); // || CPU_ProbeSM3(); g_hasSM4 = CPU_QuerySM4(); // || CPU_ProbeSM4(); -# if CRYPTOGAMS_ARM_SHA1 || CRYPTOGAMS_ARM_SHA256 || CRYPTOGAMS_ARM_SHA512 - CRYPTOGAMS_armcap_P = g_hasNEON ? (1<<0) : 0; -# endif - #if defined(_SC_LEVEL1_DCACHE_LINESIZE) // Glibc does not implement on some platforms. The runtime returns 0 instead of error. 
// https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/posix/sysconf.c diff --git a/sha.cpp b/sha.cpp index ce742b2a..4fc80520 100644 --- a/sha.cpp +++ b/sha.cpp @@ -68,6 +68,7 @@ extern void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, s #if CRYPTOGAMS_ARM_SHA1 extern "C" void sha1_block_data_order(word32* state, const word32 *data, size_t blocks); +extern "C" void sha1_block_data_order_neon(word32* state, const word32 *data, size_t blocks); #endif #if CRYPTOPP_ARM_SHA1_AVAILABLE @@ -80,6 +81,7 @@ extern void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, s #if CRYPTOGAMS_ARM_SHA256 extern "C" void sha256_block_data_order(word32* state, const word32 *data, size_t blocks); +extern "C" void sha256_block_data_order_neon(word32* state, const word32 *data, size_t blocks); #endif #if CRYPTOPP_ARM_SHA512_AVAILABLE @@ -93,6 +95,7 @@ extern void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, #if CRYPTOGAMS_ARM_SHA512 extern "C" void sha512_block_data_order(word64* state, const word64 *data, size_t blocks); +extern "C" void sha512_block_data_order_neon(word64* state, const word64 *data, size_t blocks); #endif // We add extern to export table to sha_simd.cpp, but it @@ -289,6 +292,17 @@ void SHA1::Transform(word32 *state, const word32 *data) } #endif #if CRYPTOGAMS_ARM_SHA1 && 0 + if (HasNEON()) + { +# if defined(CRYPTOPP_LITTLE_ENDIAN) + word32 dataBuf[16]; + ByteReverse(dataBuf, data, SHA1::BLOCKSIZE); + sha1_block_data_order_neon(state, dataBuf, 1); +# else + sha1_block_data_order_neon(state, data, 1); +# endif + return; + } if (HasARMv7()) { # if defined(CRYPTOPP_LITTLE_ENDIAN) @@ -325,6 +339,11 @@ size_t SHA1::HashMultipleBlocks(const word32 *input, size_t length) } #endif #if CRYPTOGAMS_ARM_SHA1 + if (HasNEON()) + { + sha1_block_data_order_neon(m_state, input, length / SHA1::BLOCKSIZE); + return length & (SHA1::BLOCKSIZE - 1); + } if (HasARMv7()) { sha1_block_data_order(m_state, input, length / 
SHA1::BLOCKSIZE); @@ -834,6 +853,17 @@ void SHA256::Transform(word32 *state, const word32 *data) } #endif #if CRYPTOGAMS_ARM_SHA256 && 0 + if (HasNEON()) + { +# if defined(CRYPTOPP_LITTLE_ENDIAN) + word32 dataBuf[16]; + ByteReverse(dataBuf, data, SHA256::BLOCKSIZE); + sha256_block_data_order_neon(state, dataBuf, 1); +# else + sha256_block_data_order_neon(state, data, 1); +# endif + return; + } if (HasARMv7()) { # if defined(CRYPTOPP_LITTLE_ENDIAN) @@ -885,6 +915,11 @@ size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length) } #endif #if CRYPTOGAMS_ARM_SHA256 + if (HasNEON()) + { + sha256_block_data_order_neon(m_state, input, length / SHA256::BLOCKSIZE); + return length & (SHA256::BLOCKSIZE - 1); + } if (HasARMv7()) { sha256_block_data_order(m_state, input, length / SHA256::BLOCKSIZE); @@ -948,6 +983,11 @@ size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length) } #endif #if CRYPTOGAMS_ARM_SHA256 + if (HasNEON()) + { + sha256_block_data_order_neon(m_state, input, length / SHA256::BLOCKSIZE); + return length & (SHA256::BLOCKSIZE - 1); + } if (HasARMv7()) { sha256_block_data_order(m_state, input, length / SHA256::BLOCKSIZE); @@ -1311,6 +1351,17 @@ void SHA512::Transform(word64 *state, const word64 *data) } #endif #if CRYPTOGAMS_ARM_SHA512 + if (HasNEON()) + { +# if (CRYPTOPP_LITTLE_ENDIAN) + word64 dataBuf[16]; + ByteReverse(dataBuf, data, SHA512::BLOCKSIZE); + sha512_block_data_order_neon(state, dataBuf, 1); +# else + sha512_block_data_order_neon(state, data, 1); +# endif + return; + } if (HasARMv7()) { # if (CRYPTOPP_LITTLE_ENDIAN) diff --git a/sha1_armv4.S b/sha1_armv4.S index 189d7e21..4a493f3d 100644 --- a/sha1_armv4.S +++ b/sha1_armv4.S @@ -76,33 +76,25 @@ .code 32 #endif -.extern CRYPTOGAMS_armcap_P - .text +.align 5 .globl sha1_block_data_order .type sha1_block_data_order,%function -.align 5 sha1_block_data_order: .Lsha1_block_data_order: + #if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha1_block_data_order #else adr 
r3,.Lsha1_block_data_order #endif -#if __ARM_MAX_ARCH__>=7 - ldr r12,=CRYPTOGAMS_armcap_P - ldr r12,[r12] @ CRYPTOGAMS_armcap_P - - tst r12,#ARMV7_NEON - bne .LNEON -#endif - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 ldmia r0,{r3,r4,r5,r6,r7} + .Lloop: ldr r8,.LK_00_19 mov r14,sp @@ -552,9 +544,6 @@ sha1_block_data_order: #endif .size sha1_block_data_order,.-sha1_block_data_order -@ CRYPTOGAMS_armcap_P -.ltorg - .align 5 .LK_00_19:.word 0x5a827999 .LK_20_39:.word 0x6ed9eba1 @@ -566,10 +555,12 @@ sha1_block_data_order: .arch armv7-a .fpu neon +.globl sha1_block_data_order_neon .type sha1_block_data_order_neon,%function + .align 4 sha1_block_data_order_neon: -.LNEON: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 @ dmb @ errata #451034 on early Cortex A8 diff --git a/sha256_armv4.S b/sha256_armv4.S index ef1f4854..2cccdfd1 100644 --- a/sha256_armv4.S +++ b/sha256_armv4.S @@ -76,8 +76,6 @@ .code 32 #endif -.extern CRYPTOGAMS_armcap_P - .text .type K256,%object @@ -103,30 +101,24 @@ K256: .word 0 @ terminator .align 5 - .globl sha256_block_data_order .type sha256_block_data_order,%function + sha256_block_data_order: .Lsha256_block_data_order: + #if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha256_block_data_order #else adr r3,.Lsha256_block_data_order #endif -#if __ARM_MAX_ARCH__>=7 - ldr r12,=CRYPTOGAMS_armcap_P - ldr r12,[r12] @ CRYPTOGAMS_armcap_P - - tst r12,#ARMV7_NEON - bne .LNEON -#endif - add r2,r1,r2,lsl#6 @ len to point at the end of inp stmdb sp!,{r0,r1,r2,r4-r11,lr} ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} sub r14,r3,#256+32 @ K256 sub sp,sp,#16*4 @ alloca(X[16]) + .Loop: # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @@ -1889,9 +1881,6 @@ sha256_block_data_order: #endif .size sha256_block_data_order,.-sha256_block_data_order -@ CRYPTOGAMS_armcap_P -.ltorg - #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon @@ -1901,7 +1890,7 @@ sha256_block_data_order: .align 
5 .skip 16 sha256_block_data_order_neon: -.LNEON: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} sub r11,sp,#16*4+16 diff --git a/sha512_armv4.S b/sha512_armv4.S index 7a9e2ac2..d7181a0e 100644 --- a/sha512_armv4.S +++ b/sha512_armv4.S @@ -87,8 +87,6 @@ .code 32 #endif -.extern CRYPTOGAMS_armcap_P - .text .type K512,%object @@ -138,24 +136,19 @@ K512: .skip 32 +.align 5 .globl sha512_block_data_order .type sha512_block_data_order,%function + sha512_block_data_order: .Lsha512_block_data_order: + #if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha512_block_data_order #else adr r3,.Lsha512_block_data_order #endif -#if __ARM_MAX_ARCH__>=7 - ldr r12,=CRYPTOGAMS_armcap_P - ldr r12,[r12] @ CRYPTOGAMS_armcap_P - - tst r12,#ARMV7_NEON - bne .LNEON -#endif - add r2,r1,r2,lsl#7 @ len to point at the end of inp stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} sub r14,r3,#672 @ K512 @@ -535,18 +528,16 @@ sha512_block_data_order: #endif .size sha512_block_data_order,.-sha512_block_data_order -@ CRYPTOGAMS_armcap_P -.ltorg - #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon +.align 4 .globl sha512_block_data_order_neon .type sha512_block_data_order_neon,%function -.align 4 + sha512_block_data_order_neon: -.LNEON: + dmb @ errata #451034 on early Cortex A8 add r2,r1,r2,lsl#7 @ len to point at the end of inp adr r3,K512