diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp index 25a87eb8..8f202f35 100644 --- a/rijndael-simd.cpp +++ b/rijndael-simd.cpp @@ -1,10 +1,11 @@ // rijndael-simd.cpp - written and placed in the public domain by // Jeffrey Walton, Uri Blumenthal and Marcel Raad. +// AES-NI code originally written by Wei Dai. // -// This source file uses intrinsics to gain access to AES-NI and -// ARMv8a AES instructions. A separate source file is needed -// because additional CXXFLAGS are required to enable the -// appropriate instructions sets in some build configurations. +// This source file uses intrinsics and built-ins to gain access to +// AES-NI, ARMv8a AES and Power8 AES instructions. A separate source +// file is needed because additional CXXFLAGS are required to enable +// the appropriate instructions sets in some build configurations. // // ARMv8a AES code based on CriticalBlue code from Johannes Schneiders, // Skip Hovsmith and Barry O'Rourke for the mbedTLS project. Stepping @@ -13,13 +14,11 @@ // // AltiVec and Power8 code based on http://github.com/noloader/AES-Intrinsics and // http://www.ibm.com/developerworks/library/se-power8-in-core-cryptography/ -// The IBM documentation absolutely sucks. Thanks to Andy Polyakov, Paul R and -// Trudeaun for answering questions and filling the gaps in the IBM documentation. +// For Power8 do not remove the casts, even when const-ness is cast away. It causes +// a 0.3 to 0.6 cpb drop in performance. The IBM documentation absolutely sucks. +// Thanks to Andy Polyakov, Paul R and Trudeaun for answering questions and filling +// the gaps in the IBM documentation. // -// For Power8 do not remove the casts. It causes a 0.3 to 0.6 cpb drop in performance. -// uint8x16_p8 r1 = (uint8x16_p8)VectorLoadKey((const uint8_t*)skptr); -// uint8x16_p8 r4 = (uint8x16_p8)VectorLoadKey((const uint8_t*)s_rcon[0]); -// uint8x16_p8 r5 = (uint8x16_p8)VectorLoadKey((const uint8_t*)s_mask); #include "pch.h" #include "config.h" @@ -891,7 +890,7 @@ static inline void Store64x2(const uint64x2_p8& src, uint8_t dest[16]) // Loads a mis-aligned byte array, performs an endian conversion. static inline VectorType VectorLoad(const byte src[16]) { - return (VectorType)Load8x16((uint8_t*)src); + return (VectorType)Load8x16(0, (uint8_t*)src); } // Loads a mis-aligned byte array, performs an endian conversion. @@ -1092,9 +1091,9 @@ void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32* std::memcpy(rk, userKey, keyLen); uint8_t* skptr = (uint8_t*)rk; - uint8x16_p8 r1 = (uint8x16_p8)VectorLoadKey((const uint8_t*)skptr); - uint8x16_p8 r4 = (uint8x16_p8)VectorLoadKey((const uint8_t*)s_rcon[0]); - uint8x16_p8 r5 = (uint8x16_p8)VectorLoadKey((const uint8_t*)s_mask); + uint8x16_p8 r1 = (uint8x16_p8)VectorLoadKey(skptr); + uint8x16_p8 r4 = (uint8x16_p8)VectorLoadKey(s_rcon[0]); + uint8x16_p8 r5 = (uint8x16_p8)VectorLoadKey(s_mask); #if defined(IS_LITTLE_ENDIAN) // Only the user key requires byte reversing. @@ -1110,12 +1109,12 @@ void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32* } /* Round 9 using rcon=0x1b */ - r4 = (uint8x16_p8)VectorLoadKey((const uint8_t*)s_rcon[1]); + r4 = (uint8x16_p8)VectorLoadKey(s_rcon[1]); r1 = Rijndael_Subkey_POWER8(r1, r4, r5); skptr += 16; VectorStore(r1, skptr); /* Round 10 using rcon=0x36 */ - r4 = (uint8x16_p8)VectorLoadKey((const uint8_t*)s_rcon[2]); + r4 = (uint8x16_p8)VectorLoadKey(s_rcon[2]); r1 = Rijndael_Subkey_POWER8(r1, r4, r5); skptr += 16; VectorStore(r1, skptr);