Fix AdvancedProcessBlocks_{MxN}_NEON compile with MSVC compiler (GH #776)
parent
b70bc4865f
commit
ed349dced1
52
adv_simd.h
52
adv_simd.h
|
|
@ -58,6 +58,17 @@
|
||||||
# include <arm_acle.h>
|
# include <arm_acle.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Thanks to Peter Cordes, https://stackoverflow.com/q/54016821/608639
|
||||||
|
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
# ifndef PACK32x4
|
||||||
|
# if defined(_MSC_VER)
|
||||||
|
# define PACK32x4(w,x,y,z) { ((w) + (word64(x) << 32)), ((y) + (word64(z) << 32)) }
|
||||||
|
# else
|
||||||
|
# define PACK32x4(w,x,y,z) { (w), (x), (y), (z) }
|
||||||
|
# endif
|
||||||
|
# endif // PACK32x4
|
||||||
|
#endif // Microsoft workaround
|
||||||
|
|
||||||
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
|
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE)
|
||||||
# include <emmintrin.h>
|
# include <emmintrin.h>
|
||||||
# include <xmmintrin.h>
|
# include <xmmintrin.h>
|
||||||
|
|
@ -113,12 +124,12 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
|
||||||
CRYPTOPP_ASSERT(length >= 8);
|
CRYPTOPP_ASSERT(length >= 8);
|
||||||
|
|
||||||
#if (CRYPTOPP_LITTLE_ENDIAN)
|
#if (CRYPTOPP_LITTLE_ENDIAN)
|
||||||
const uint32x4_t s_one = {0, 0, 0, 1<<24};
|
const uint32x4_t s_one = PACK32x4(0, 0, 0, 1<<24);
|
||||||
const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
|
const uint32x4_t s_two = PACK32x4(0, 2<<24, 0, 2<<24);
|
||||||
#else
|
#else
|
||||||
// TODO: verify these constants on ARM-BE
|
// TODO: verify these constants on ARM-BE
|
||||||
const uint32x4_t s_one = {0, 0, 0, 1};
|
//const uint32x4_t s_one = PACK32x4(0, 0, 0, 1);
|
||||||
const uint32x4_t s_two = {0, 2, 0, 2};
|
//const uint32x4_t s_two = PACK32x4(0, 2, 0, 2);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const size_t blockSize = 8;
|
const size_t blockSize = 8;
|
||||||
|
|
@ -358,12 +369,12 @@ inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
|
||||||
CRYPTOPP_ASSERT(length >= 16);
|
CRYPTOPP_ASSERT(length >= 16);
|
||||||
|
|
||||||
#if (CRYPTOPP_LITTLE_ENDIAN)
|
#if (CRYPTOPP_LITTLE_ENDIAN)
|
||||||
const uint32x4_t s_one = {0, 0, 0, 1<<24};
|
const uint32x4_t s_one = PACK32x4(0, 0, 0, 1<<24);
|
||||||
//const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
|
//const uint32x4_t s_two = PACK32x4(0, 2<<24, 0, 2<<24);
|
||||||
#else
|
#else
|
||||||
// TODO: verify these constants on ARM-BE
|
// TODO: verify these constants on ARM-BE
|
||||||
const uint32x4_t s_one = {0, 0, 0, 1};
|
//const uint32x4_t s_one = PACK32x4(0, 0, 0, 1);
|
||||||
//const uint32x4_t s_two = {0, 2, 0, 2};
|
//const uint32x4_t s_two = PACK32x4(0, 2, 0, 2);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const size_t blockSize = 16;
|
const size_t blockSize = 16;
|
||||||
|
|
@ -509,24 +520,23 @@ inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6,
|
||||||
/// usually uint32x4_t or uint64x2_t. F1, F4, W and V must use the same word and
|
/// usually uint32x4_t or uint64x2_t. F1, F4, W and V must use the same word and
|
||||||
/// vector type. The V parameter is used to avoid template argument
|
/// vector type. The V parameter is used to avoid template argument
|
||||||
/// deduction/substitution failures.
|
/// deduction/substitution failures.
|
||||||
template <typename F1, typename F4, typename W, typename V>
|
template <typename F1, typename F4, typename W>
|
||||||
inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
|
inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
|
||||||
const V& unused, const W *subKeys, size_t rounds, const byte *inBlocks,
|
const W *subKeys, size_t rounds, const byte *inBlocks,
|
||||||
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||||
{
|
{
|
||||||
CRYPTOPP_ASSERT(subKeys);
|
CRYPTOPP_ASSERT(subKeys);
|
||||||
CRYPTOPP_ASSERT(inBlocks);
|
CRYPTOPP_ASSERT(inBlocks);
|
||||||
CRYPTOPP_ASSERT(outBlocks);
|
CRYPTOPP_ASSERT(outBlocks);
|
||||||
CRYPTOPP_ASSERT(length >= 16);
|
CRYPTOPP_ASSERT(length >= 16);
|
||||||
CRYPTOPP_UNUSED(unused);
|
|
||||||
|
|
||||||
#if (CRYPTOPP_LITTLE_ENDIAN)
|
#if (CRYPTOPP_LITTLE_ENDIAN)
|
||||||
const uint32x4_t s_one = {0, 0, 0, 1<<24};
|
const uint32x4_t s_one = PACK32x4(0, 0, 0, 1<<24);
|
||||||
//const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
|
//const uint32x4_t s_two = PACK32x4(0, 2<<24, 0, 2<<24);
|
||||||
#else
|
#else
|
||||||
// TODO: verify these constants on ARM-BE
|
// TODO: verify these constants on ARM-BE
|
||||||
const uint32x4_t s_one = {0, 0, 0, 1};
|
//const uint32x4_t s_one = PACK32x4(0, 0, 0, 1);
|
||||||
//const uint32x4_t s_two = {0, 2, 0, 2};
|
//const uint32x4_t s_two = PACK32x4(0, 2, 0, 2);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const size_t blockSize = 16;
|
const size_t blockSize = 16;
|
||||||
|
|
@ -589,7 +599,7 @@ inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
|
||||||
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
xorBlocks = PtrAdd(xorBlocks, xorIncrement);
|
||||||
}
|
}
|
||||||
|
|
||||||
func4((V&)block0, (V&)block1, (V&)block2, (V&)block3, subKeys, static_cast<unsigned int>(rounds));
|
func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
|
||||||
|
|
||||||
if (xorOutput)
|
if (xorOutput)
|
||||||
{
|
{
|
||||||
|
|
@ -626,7 +636,7 @@ inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
|
||||||
if (flags & BT_InBlockIsCounter)
|
if (flags & BT_InBlockIsCounter)
|
||||||
const_cast<byte *>(inBlocks)[15]++;
|
const_cast<byte *>(inBlocks)[15]++;
|
||||||
|
|
||||||
func1( (V&)block, subKeys, static_cast<unsigned int>(rounds));
|
func1(block, subKeys, static_cast<unsigned int>(rounds));
|
||||||
|
|
||||||
if (xorOutput)
|
if (xorOutput)
|
||||||
block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
|
block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
|
||||||
|
|
@ -661,12 +671,12 @@ inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
|
||||||
CRYPTOPP_ASSERT(length >= 16);
|
CRYPTOPP_ASSERT(length >= 16);
|
||||||
|
|
||||||
#if (CRYPTOPP_LITTLE_ENDIAN)
|
#if (CRYPTOPP_LITTLE_ENDIAN)
|
||||||
const uint32x4_t s_one = {0, 0, 0, 1<<24};
|
const uint32x4_t s_one = PACK32x4(0, 0, 0, 1<<24);
|
||||||
//const uint32x4_t s_two = {0, 2<<24, 0, 2<<24};
|
//const uint32x4_t s_two = PACK32x4(0, 2<<24, 0, 2<<24);
|
||||||
#else
|
#else
|
||||||
// TODO: verify these constants on ARM-BE
|
// TODO: verify these constants on ARM-BE
|
||||||
const uint32x4_t s_one = {0, 0, 0, 1};
|
//const uint32x4_t s_one = PACK32x4(0, 0, 0, 1);
|
||||||
//const uint32x4_t s_two = {0, 2, 0, 2};
|
//const uint32x4_t s_two = PACK32x4(0, 2, 0, 2);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const size_t blockSize = 16;
|
const size_t blockSize = 16;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue