diff --git a/rijndael.cpp b/rijndael.cpp index 256630c9..9eecc661 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -81,6 +81,13 @@ NAMESPACE_BEGIN(CryptoPP) # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1 #endif +#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE +static void Rijndael_Enc_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byte *xorBlock, byte *outBlock, + const word32 *subKeys, unsigned int rounds); +static void Rijndael_Dec_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byte *xorBlock, byte *outBlock, + const word32 *subKeys, unsigned int rounds); +#endif + // Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224 #if (__SUNPRO_CC >= 0x5130) # define MAYBE_CONST @@ -355,6 +362,13 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock return; } #endif +#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE + if (HasAES()) + { + Rijndael_Enc_ProcessAndXorBlock_ARMV8(inBlock, xorBlock, outBlock, m_key.begin(), m_rounds); + return; + } +#endif typedef BlockGetAndPut Block; @@ -433,6 +447,13 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock return; } #endif +#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE + if (HasAES()) + { + Rijndael_Dec_ProcessAndXorBlock_ARMV8(inBlock, xorBlock, outBlock, m_key.begin(), m_rounds); + return; + } +#endif #if (CRYPTOPP_ARM_AES_AVAILABLE) if (HasAES()) @@ -1138,6 +1159,103 @@ size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo } #endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 +#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE + +void Rijndael_Enc_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byte *xorBlock, byte *outBlock, + const word32 *subKeys, unsigned int rounds) +{ + uint8x16_t data = vld1q_u8(inBlock); + const byte *keys = reinterpret_cast(subKeys); + + // Unroll the loop, profit 0.3 to 0.5 cpb. + data = vaeseq_u8(data, vld1q_u8(keys+0)); + data = vaesmcq_u8(data); + data = vaeseq_u8(data, vld1q_u8(keys+16)); + data = vaesmcq_u8(data); + data = vaeseq_u8(data, vld1q_u8(keys+32)); + data = vaesmcq_u8(data); + data = vaeseq_u8(data, vld1q_u8(keys+48)); + data = vaesmcq_u8(data); + data = vaeseq_u8(data, vld1q_u8(keys+64)); + data = vaesmcq_u8(data); + data = vaeseq_u8(data, vld1q_u8(keys+80)); + data = vaesmcq_u8(data); + data = vaeseq_u8(data, vld1q_u8(keys+96)); + data = vaesmcq_u8(data); + data = vaeseq_u8(data, vld1q_u8(keys+112)); + data = vaesmcq_u8(data); + data = vaeseq_u8(data, vld1q_u8(keys+128)); + data = vaesmcq_u8(data); + + unsigned int i=9; + for ( ; i(subKeys); + + // Unroll the loop, profit 0.3 to 0.5 cpb. + data = vaesdq_u8(data, vld1q_u8(keys+0)); + data = vaesimcq_u8(data); + data = vaesdq_u8(data, vld1q_u8(keys+16)); + data = vaesimcq_u8(data); + data = vaesdq_u8(data, vld1q_u8(keys+32)); + data = vaesimcq_u8(data); + data = vaesdq_u8(data, vld1q_u8(keys+48)); + data = vaesimcq_u8(data); + data = vaesdq_u8(data, vld1q_u8(keys+64)); + data = vaesimcq_u8(data); + data = vaesdq_u8(data, vld1q_u8(keys+80)); + data = vaesimcq_u8(data); + data = vaesdq_u8(data, vld1q_u8(keys+96)); + data = vaesimcq_u8(data); + data = vaesdq_u8(data, vld1q_u8(keys+112)); + data = vaesimcq_u8(data); + data = vaesdq_u8(data, vld1q_u8(keys+128)); + data = vaesimcq_u8(data); + + unsigned int i=9; + for ( ; i