Fix SHA512 benchmarks on ARM

This was a mistake made when porting from Cryptogams to Crypto++. The macros VFP_ABI_PUSH and VFP_ABI_POP save and restore SIMD register state (d8-d15), but they were never defined during the port and their uses in sha512_block_data_order_neon were commented out. As a result the benchmarks would hang, because the doubles used for benchmarking were clobbered inside sha512_block_data_order_neon.
pull/853/head
Jeffrey Walton 2019-05-25 06:23:19 -04:00
parent 92df2a685f
commit fc10a7f1ea
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
4 changed files with 23 additions and 28 deletions

View File

@ -998,7 +998,7 @@ std::string SHA512_AlgorithmProvider()
if (HasSSE2()) if (HasSSE2())
return "SSE2"; return "SSE2";
#endif #endif
#if CRYPTOGAMS_ARM_SHA512 && 0 #if CRYPTOGAMS_ARM_SHA512
if (HasNEON()) if (HasNEON())
return "NEON"; return "NEON";
if (HasARMv7()) if (HasARMv7())
@ -1310,7 +1310,7 @@ void SHA512::Transform(word64 *state, const word64 *data)
return; return;
} }
#endif #endif
#if CRYPTOGAMS_ARM_SHA512 && 0 #if CRYPTOGAMS_ARM_SHA512
if (HasARMv7()) if (HasARMv7())
{ {
# if (CRYPTOPP_LITTLE_ENDIAN) # if (CRYPTOPP_LITTLE_ENDIAN)

View File

@ -85,21 +85,21 @@
.align 5 .align 5
sha1_block_data_order: sha1_block_data_order:
.Lsha1_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r3,pc,#8 @ sha1_block_data_order
#else
adr r3,.Lsha1_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
.Lsha1_block:
@ldr r12,.LCRYPTOGAMS_armcap
ldr r12,=CRYPTOGAMS_armcap_P ldr r12,=CRYPTOGAMS_armcap_P
# if !defined(_WIN32) ldr r12,[r12] @ CRYPTOGAMS_armcap_P
adr r3,.Lsha1_block
@ldr r12,[r3,r12] @ CRYPTOGAMS_armcap_P
ldr r12,[r12]
# endif
# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
# endif
tst r12,#ARMV7_NEON tst r12,#ARMV7_NEON
bne .LNEON bne .LNEON
#endif #endif
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
ldmia r0,{r3,r4,r5,r6,r7} ldmia r0,{r3,r4,r5,r6,r7}

View File

@ -113,19 +113,15 @@ sha256_block_data_order:
#else #else
adr r3,.Lsha256_block_data_order adr r3,.Lsha256_block_data_order
#endif #endif
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
@ldr r12,.LCRYPTOGAMS_armcap
ldr r12,=CRYPTOGAMS_armcap_P ldr r12,=CRYPTOGAMS_armcap_P
# if !defined(_WIN32)
@ldr r12,[r3,r12] @ CRYPTOGAMS_armcap_P
ldr r12,[r12] @ CRYPTOGAMS_armcap_P ldr r12,[r12] @ CRYPTOGAMS_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
# endif
tst r12,#ARMV7_NEON tst r12,#ARMV7_NEON
bne .LNEON bne .LNEON
#endif #endif
add r2,r1,r2,lsl#6 @ len to point at the end of inp add r2,r1,r2,lsl#6 @ len to point at the end of inp
stmdb sp!,{r0,r1,r2,r4-r11,lr} stmdb sp!,{r0,r1,r2,r4-r11,lr}
ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}

View File

@ -79,6 +79,9 @@
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif #endif
@ VFP_ABI_PUSH / VFP_ABI_POP: spill d8-d15 to the stack (pre-decrementing sp)
@ and reload them (post-incrementing sp). Meant to bracket NEON code so that
@ the caller sees d8-d15 unchanged on return; under the ARM procedure call
@ standard these are the callee-saved VFP/NEON registers.
#define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
#define VFP_ABI_POP vldmia sp!,{d8-d15}
#if defined(__thumb2__) #if defined(__thumb2__)
.syntax unified .syntax unified
.thumb .thumb
@ -147,19 +150,15 @@ sha512_block_data_order:
#else #else
adr r3,.Lsha512_block_data_order adr r3,.Lsha512_block_data_order
#endif #endif
#if __ARM_MAX_ARCH__>=7 #if __ARM_MAX_ARCH__>=7
@ldr r12,.LCRYPTOGAMS_armcap
ldr r12,=CRYPTOGAMS_armcap_P ldr r12,=CRYPTOGAMS_armcap_P
# if !defined(_WIN32)
@ldr r12,[r3,r12] @ CRYPTOGAMS_armcap_P
ldr r12,[r12] @ CRYPTOGAMS_armcap_P ldr r12,[r12] @ CRYPTOGAMS_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
# endif
tst r12,#ARMV7_NEON tst r12,#ARMV7_NEON
bne .LNEON bne .LNEON
#endif #endif
add r2,r1,r2,lsl#7 @ len to point at the end of inp add r2,r1,r2,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
sub r14,r3,#672 @ K512 sub r14,r3,#672 @ K512
@ -554,7 +553,7 @@ sha512_block_data_order_neon:
dmb @ errata #451034 on early Cortex A8 dmb @ errata #451034 on early Cortex A8
add r2,r1,r2,lsl#7 @ len to point at the end of inp add r2,r1,r2,lsl#7 @ len to point at the end of inp
adr r3,K512 adr r3,K512
@VFP_ABI_PUSH VFP_ABI_PUSH
vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context
.Loop_neon: .Loop_neon:
vshr.u64 d24,d20,#14 @ 0 vshr.u64 d24,d20,#14 @ 0
@ -1868,7 +1867,7 @@ sha512_block_data_order_neon:
teq r1,r2 teq r1,r2
sub r3,#640 @ rewind K512 sub r3,#640 @ rewind K512
bne .Loop_neon bne .Loop_neon
@VFP_ABI_POP VFP_ABI_POP
bx lr @ .word 0xe12fff1e bx lr @ .word 0xe12fff1e
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon .size sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif #endif