Add ARM SHA512 asm implementation from Cryptogams (GH #841, PR #843)

Cryptogams is Andy Polyakov's project used to create high speed crypto algorithms and share them with other developers. Cryptogams has a dual license. The first is the OpenSSL license, because Andy contributes to OpenSSL. The second is a BSD license for those who want a more permissive license.

Andy's implementation runs about 45% faster than C/C++ code. Testing on a 1.8 GHz Cortex-A17 shows Cryptogams at 45 cpb, and C++ at 79 cpb.

The integration instructions are documented at [Cryptogams SHA](https://wiki.openssl.org/index.php/Cryptogams_SHA) on the OpenSSL wiki.
pull/853/head
Jeffrey Walton 2019-05-19 16:29:45 -04:00 committed by GitHub
parent 4c9ca6b723
commit d38e5a954d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 2013 additions and 43 deletions

View File

@ -318,6 +318,8 @@ sha1_armv4.h
sha1_armv4.S sha1_armv4.S
sha256_armv4.h sha256_armv4.h
sha256_armv4.S sha256_armv4.S
sha512_armv4.h
sha512_armv4.S
sha3.cpp sha3.cpp
sha3.h sha3.h
shacal2.cpp shacal2.cpp

View File

@ -1059,7 +1059,7 @@ endif
ifeq ($(IS_ARM32),1) ifeq ($(IS_ARM32),1)
CRYPTOGAMS_ARCH_FLAG = -march=armv7-a CRYPTOGAMS_ARCH_FLAG = -march=armv7-a
CRYPTOGAMS_ARCH_FLAG += -Wa,--noexecstack CRYPTOGAMS_ARCH_FLAG += -Wa,--noexecstack
SRCS += aes_armv4.S sha1_armv4.S sha256_armv4.S SRCS += aes_armv4.S sha1_armv4.S sha256_armv4.S sha512_armv4.S
endif endif
# List cryptlib.cpp first, then cpu.cpp, then integer.cpp to tame C++ static initialization problems. # List cryptlib.cpp first, then cpu.cpp, then integer.cpp to tame C++ static initialization problems.
@ -1513,6 +1513,10 @@ sha1_armv4.o : sha1_armv4.S
sha256_armv4.o : sha256_armv4.S sha256_armv4.o : sha256_armv4.S
$(CC) $(strip $(CXXFLAGS) $(CRYPTOGAMS_ARCH_FLAG) -c) $< $(CC) $(strip $(CXXFLAGS) $(CRYPTOGAMS_ARCH_FLAG) -c) $<
# Cryptogams ARM asm implementation.
sha512_armv4.o : sha512_armv4.S
$(CC) $(strip $(CXXFLAGS) $(CRYPTOGAMS_ARCH_FLAG) -c) $<
sha3_simd.o : sha3_simd.cpp sha3_simd.o : sha3_simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SHA3_FLAG) -c) $< $(CXX) $(strip $(CXXFLAGS) $(SHA3_FLAG) -c) $<

View File

@ -594,7 +594,7 @@ ifeq ($(IS_ARM32),1)
ifneq ($(IS_IOS),1) ifneq ($(IS_IOS),1)
CRYPTOGAMS_ARCH_FLAG = -march=armv7-a CRYPTOGAMS_ARCH_FLAG = -march=armv7-a
CRYPTOGAMS_ARCH_FLAG += -Wa,--noexecstack CRYPTOGAMS_ARCH_FLAG += -Wa,--noexecstack
SRCS += aes_armv4.S sha1_armv4.S sha256_armv4.S SRCS += aes_armv4.S sha1_armv4.S sha256_armv4.S sha512_armv4.S
endif endif
endif endif
@ -873,6 +873,10 @@ sha1_armv4.o : sha1_armv4.S
sha256_armv4.o : sha256_armv4.S sha256_armv4.o : sha256_armv4.S
$(CC) $(strip $(CXXFLAGS) $(CRYPTOGAMS_ARCH_FLAG) -c) $< $(CC) $(strip $(CXXFLAGS) $(CRYPTOGAMS_ARCH_FLAG) -c) $<
# Cryptogams ARM asm implementation.
sha512_armv4.o : sha512_armv4.S
$(CC) $(strip $(CXXFLAGS) $(CRYPTOGAMS_ARCH_FLAG) -c) $<
# SSE4.2/SHA-NI or ARMv8a available # SSE4.2/SHA-NI or ARMv8a available
shacal2_simd.o : shacal2_simd.cpp shacal2_simd.o : shacal2_simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $< $(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $<

View File

@ -366,6 +366,7 @@
# define CRYPTOGAMS_ARM_AES 1 # define CRYPTOGAMS_ARM_AES 1
# define CRYPTOGAMS_ARM_SHA1 1 # define CRYPTOGAMS_ARM_SHA1 1
# define CRYPTOGAMS_ARM_SHA256 1 # define CRYPTOGAMS_ARM_SHA256 1
# define CRYPTOGAMS_ARM_SHA512 1
# endif # endif
#endif #endif

139
sha.cpp
View File

@ -1,29 +1,38 @@
// sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c // sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c
// Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton // Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey
// implemented Intel SHA extensions based on Intel articles and code by // Walton implemented Intel SHA extensions based on Intel articles and code
// Sean Gulley. Jeffrey Walton implemented ARM SHA based on ARM code and // by Sean Gulley. Jeffrey Walton implemented ARM SHA based on ARM code and
// code from Johannes Schneiders, Skip Hovsmith and Barry O'Rourke. // code from Johannes Schneiders, Skip Hovsmith and Barry O'Rourke. All
// All code is in the public domain. // code is in the public domain.
// In August 2017 JW reworked the internals to align all the implementations. // In August 2017 JW reworked the internals to align all the
// Formerly all hashes were software based, IterHashBase handled endian conversions, // implementations. Formerly all hashes were software based, IterHashBase
// and IterHashBase dispatched a single to block SHA{N}::Transform. SHA{N}::Transform // handled endian conversions, and IterHashBase dispatched a single to
// then performed the single block hashing. It was repeated for multiple blocks. // block SHA{N}::Transform. SHA{N}::Transform then performed the single
// block hashing. It was repeated for multiple blocks.
// //
// The rework added SHA{N}::HashMultipleBlocks (class) and SHA{N}_HashMultipleBlocks // The rework added SHA{N}::HashMultipleBlocks (class) and
// (free standing). There are also hardware accelerated variations. Callers enter // SHA{N}_HashMultipleBlocks (free standing). There are also hardware
// SHA{N}::HashMultipleBlocks (class), and the function calls SHA{N}_HashMultipleBlocks // accelerated variations. Callers enter SHA{N}::HashMultipleBlocks (class)
// (free standing) or SHA{N}_HashBlock (free standing) as a fallback. // and the function calls SHA{N}_HashMultipleBlocks (free standing) or
// SHA{N}_HashBlock (free standing) as a fallback.
// //
// An added wrinkle is hardware is little endian, C++ is big endian, and callers use // An added wrinkle is hardware is little endian, C++ is big endian, and
// big endian, so SHA{N}_HashMultipleBlock accepts a ByteOrder for the incoming data // callers use big endian, so SHA{N}_HashMultipleBlock accepts a ByteOrder
// arrangement. Hardware based SHA{N}_HashMultipleBlock can often perform the endian // for the incoming data arrangement. Hardware based SHA{N}_HashMultipleBlock
// swap much easier by setting an EPI mask. Endian swap incurs no penalty on Intel SHA, // can often perform the endian swap much easier by setting an EPI mask.
// and 4-instruction penalty on ARM SHA. Under C++ the full software based swap penalty // Endian swap incurs no penalty on Intel SHA, and 4-instruction penalty on
// is incurred due to use of ReverseBytes(). // ARM SHA. Under C++ the full software based swap penalty is incurred due
// to use of ReverseBytes().
// //
// The rework also removed the hacked-in pointers to implementations. // In May 2019 JW added Cryptogams ARMv7 and NEON implementations for SHA1,
// SHA256 and SHA512. The Cryptogams code closed a performance gap on modern
// 32-bit ARM devices. Cryptogams is Andy Polyakov's project used to create
// high speed crypto algorithms and share them with other developers. Andy's
// code runs 30% to 50% faster than C/C++ code. The Cryptogams code can be
// disabled in config_asm.h. An example of integrating Andy's code is at
// https://wiki.openssl.org/index.php/Cryptogams_SHA.
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
@ -56,7 +65,7 @@ extern void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, siz
extern void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order); extern void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order);
#endif #endif
#if (CRYPTOGAMS_ARM_SHA1) #if CRYPTOGAMS_ARM_SHA1
extern "C" unsigned int CRYPTOGAMS_armcaps; extern "C" unsigned int CRYPTOGAMS_armcaps;
extern "C" int sha1_block_data_order(word32* state, const word32 *data, size_t blocks); extern "C" int sha1_block_data_order(word32* state, const word32 *data, size_t blocks);
#endif #endif
@ -69,7 +78,7 @@ extern void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, siz
extern void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order); extern void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order);
#endif #endif
#if (CRYPTOGAMS_ARM_SHA256) #if CRYPTOGAMS_ARM_SHA256
extern "C" unsigned int CRYPTOGAMS_armcaps; extern "C" unsigned int CRYPTOGAMS_armcaps;
extern "C" int sha256_block_data_order(word32* state, const word32 *data, size_t blocks); extern "C" int sha256_block_data_order(word32* state, const word32 *data, size_t blocks);
#endif #endif
@ -83,6 +92,11 @@ extern void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data,
extern void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order); extern void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order);
#endif #endif
#if CRYPTOGAMS_ARM_SHA512
extern "C" unsigned int CRYPTOGAMS_armcaps;
extern "C" int sha512_block_data_order(word64* state, const word64 *data, size_t blocks);
#endif
// We add extern to export table to sha_simd.cpp, but it // We add extern to export table to sha_simd.cpp, but it
// cleared http://github.com/weidai11/cryptopp/issues/502 // cleared http://github.com/weidai11/cryptopp/issues/502
extern const word32 SHA256_K[64]; extern const word32 SHA256_K[64];
@ -153,6 +167,23 @@ const word32 SHA256_K[64] = {
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
}; };
ANONYMOUS_NAMESPACE_BEGIN
#if CRYPTOGAMS_ARM_SHA1 || CRYPTOGAMS_ARM_SHA256 || CRYPTOGAMS_ARM_SHA512
// Publishes the CPU's NEON capability to the Cryptogams assembly code.
//
// The Cryptogams routines read a global named CRYPTOGAMS_armcaps to learn
// about capabilities such as ARMv7 and NEON. The storage for that global
// lives in the Cryptogams module, but nobody initializes it for us, so we
// fill it in here. Cryptogams uses bit 0 (1<<0) for NEON; see ARMV7_NEON.
//
// Always returns true so the call can be used to initialize a function
// local static and thereby run once.
inline bool CryptogamsArmCaps()
{
    const unsigned int neonCaps = CryptoPP::HasNEON() ? (1<<0) : 0;

    // Store through a volatile-qualified pointer, exactly one write.
    volatile unsigned int* const armCaps =
        const_cast<volatile unsigned int*>(&CRYPTOGAMS_armcaps);
    *armCaps = neonCaps;

    return true;
}
#endif
ANONYMOUS_NAMESPACE_END
//////////////////////////////// ////////////////////////////////
// start of Steve Reid's code // // start of Steve Reid's code //
//////////////////////////////// ////////////////////////////////
@ -276,11 +307,19 @@ void SHA1::Transform(word32 *state, const word32 *data)
return; return;
} }
#endif #endif
#if CRYPTOGAMS_ARM_SHA1
    // Cryptogams ARMv7/NEON path: hash exactly one block.
    if (HasARMv7())
    {
        // Function-local static: CryptogamsArmCaps() runs once and sets
        // CRYPTOGAMS_armcaps before the assembly routine is first called.
        static const bool unused = CryptogamsArmCaps();
        CRYPTOPP_UNUSED(unused);

# if defined(CRYPTOPP_LITTLE_ENDIAN)
        // Byte-swap the block into a scratch buffer and hash the swapped
        // copy. The swapped buffer (not the caller's data) must be passed
        // on, otherwise the reversal has no effect.
        word32 dataBuf[16];
        ByteReverse(dataBuf, data, SHA1::BLOCKSIZE);
        sha1_block_data_order(state, dataBuf, 1);
# else
        sha1_block_data_order(state, data, 1);
# endif
        return;
    }
#endif
@ -310,11 +349,7 @@ size_t SHA1::HashMultipleBlocks(const word32 *input, size_t length)
#if CRYPTOGAMS_ARM_SHA1 #if CRYPTOGAMS_ARM_SHA1
if (HasARMv7()) if (HasARMv7())
{ {
// The Cryptogams code uses a global variable named CRYPTOGAMS_armcaps static const bool unused = CryptogamsArmCaps();
// for capabilities like ARMv7 and NEON. Storage is allocated in the
// module. We still need to set CRYPTOGAMS_armcaps accordingly.
// The Cryptogams code defines NEON as 1<<0; see ARMV7_NEON.
static const unsigned int unused = CRYPTOGAMS_armcaps = HasNEON() ? (1<<0) : 0;
CRYPTOPP_UNUSED(unused); CRYPTOPP_UNUSED(unused);
sha1_block_data_order(m_state, input, length / SHA1::BLOCKSIZE); sha1_block_data_order(m_state, input, length / SHA1::BLOCKSIZE);
@ -823,11 +858,19 @@ void SHA256::Transform(word32 *state, const word32 *data)
return; return;
} }
#endif #endif
#if CRYPTOGAMS_ARM_SHA256
    // Cryptogams ARMv7/NEON path: hash exactly one block.
    if (HasARMv7())
    {
        // Function-local static: CryptogamsArmCaps() runs once and sets
        // CRYPTOGAMS_armcaps before the assembly routine is first called.
        static const bool unused = CryptogamsArmCaps();
        CRYPTOPP_UNUSED(unused);

# if defined(CRYPTOPP_LITTLE_ENDIAN)
        // Byte-swap the block into a scratch buffer and hash the swapped
        // copy. The swapped buffer (not the caller's data) must be passed
        // on, otherwise the reversal has no effect.
        word32 dataBuf[16];
        ByteReverse(dataBuf, data, SHA256::BLOCKSIZE);
        sha256_block_data_order(state, dataBuf, 1);
# else
        sha256_block_data_order(state, data, 1);
# endif
        return;
    }
#endif
@ -872,11 +915,7 @@ size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
#if CRYPTOGAMS_ARM_SHA256 #if CRYPTOGAMS_ARM_SHA256
if (HasARMv7()) if (HasARMv7())
{ {
// The Cryptogams code uses a global variable named CRYPTOGAMS_armcaps static const bool unused = CryptogamsArmCaps();
// for capabilities like ARMv7 and NEON. Storage is allocated in the
// module. We still need to set CRYPTOGAMS_armcaps accordingly.
// The Cryptogams code defines NEON as 1<<0; see ARMV7_NEON.
static const unsigned int unused = CRYPTOGAMS_armcaps = HasNEON() ? (1<<0) : 0;
CRYPTOPP_UNUSED(unused); CRYPTOPP_UNUSED(unused);
sha256_block_data_order(m_state, input, length / SHA256::BLOCKSIZE); sha256_block_data_order(m_state, input, length / SHA256::BLOCKSIZE);
@ -942,11 +981,7 @@ size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
#if CRYPTOGAMS_ARM_SHA256 #if CRYPTOGAMS_ARM_SHA256
if (HasARMv7()) if (HasARMv7())
{ {
// The Cryptogams code uses a global variable named CRYPTOGAMS_armcaps static const bool unused = CryptogamsArmCaps();
// for capabilities like ARMv7 and NEON. Storage is allocated in the
// module. We still need to set CRYPTOGAMS_armcaps accordingly.
// The Cryptogams code defines NEON as 1<<0; see ARMV7_NEON.
static const unsigned int unused = CRYPTOGAMS_armcaps = HasNEON() ? (1<<0) : 0;
CRYPTOPP_UNUSED(unused); CRYPTOPP_UNUSED(unused);
sha256_block_data_order(m_state, input, length / SHA256::BLOCKSIZE); sha256_block_data_order(m_state, input, length / SHA256::BLOCKSIZE);
@ -997,6 +1032,12 @@ std::string SHA512_AlgorithmProvider()
if (HasSSE2()) if (HasSSE2())
return "SSE2"; return "SSE2";
#endif #endif
#if CRYPTOGAMS_ARM_SHA512
if (HasNEON())
return "NEON";
if (HasARMv7())
return "ARMv7";
#endif
#if (CRYPTOPP_POWER8_SHA_AVAILABLE) #if (CRYPTOPP_POWER8_SHA_AVAILABLE)
if (HasSHA512()) if (HasSHA512())
return "Power8"; return "Power8";
@ -1303,6 +1344,22 @@ void SHA512::Transform(word64 *state, const word64 *data)
return; return;
} }
#endif #endif
#if CRYPTOGAMS_ARM_SHA512
// Cryptogams ARMv7/NEON path: hash exactly one block and return.
if (HasARMv7())
{
// Function-local static so CryptogamsArmCaps() runs once; it sets
// CRYPTOGAMS_armcaps for the assembly routine.
static const bool unused = CryptogamsArmCaps();
CRYPTOPP_UNUSED(unused);
# if defined(CRYPTOPP_LITTLE_ENDIAN)
// Byte-swap the block into a scratch buffer and hash the swapped copy.
word64 dataBuf[16];
ByteReverse(dataBuf, data, SHA512::BLOCKSIZE);
sha512_block_data_order(state, dataBuf, 1);
# else
// No swap is performed when CRYPTOPP_LITTLE_ENDIAN is not defined.
sha512_block_data_order(state, data, 1);
# endif
return;
}
#endif
#if CRYPTOPP_POWER8_SHA_AVAILABLE #if CRYPTOPP_POWER8_SHA_AVAILABLE
if (HasSHA512()) if (HasSHA512())
{ {

1881
sha512_armv4.S Normal file

File diff suppressed because it is too large Load Diff

21
sha512_armv4.h Normal file
View File

@ -0,0 +1,21 @@
/* Header file for use with the Cryptogams ARMv4 SHA512 implementation. */
/* Also see http://www.openssl.org/~appro/cryptogams/ */
/* https://wiki.openssl.org/index.php/Cryptogams_SHA. */
#ifndef CRYPTOGAMS_SHA512_ARMV4_H
#define CRYPTOGAMS_SHA512_ARMV4_H

/* size_t is used in the prototype below. Include its definition here */
/* so this header can be included first in any translation unit.      */
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Process `blocks` consecutive SHA-512 input blocks from `data`,       */
/* updating the hash state in `state`. Implemented in sha512_armv4.S.   */
/*   state  - hash state, updated in place                              */
/*   data   - input blocks; callers in sha.cpp byte-swap the words on   */
/*            little-endian targets before calling, so presumably the   */
/*            routine consumes big-endian message data - TODO confirm   */
/*   blocks - number of whole blocks to process (not a byte count)      */
/* NOTE(review): sha.cpp declares this same symbol itself with an int   */
/* return type and word64 pointers; confirm which prototype matches the */
/* assembly and make the two declarations agree.                        */
void sha512_block_data_order(void *state, const void *data, size_t blocks);

/* Cryptogams arm caps */
#define ARMV7_NEON (1<<0)

#ifdef __cplusplus
}
#endif
#endif /* CRYPTOGAMS_SHA512_ARMV4_H */