From 5dca85b81944969a64de072545e371c01ebbe348 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Fri, 2 Nov 2018 19:09:36 -0400 Subject: [PATCH] Split Blake2 SIMD files into blake2s-simd.cpp and blake2b-simd.cpp (GH #729, GH #731) The split was required for Blake2b and Power8; Blake2s only requires Power7 --- Filelist.txt | 3 +- GNUmakefile | 33 +- blake2-simd.cpp => blake2b-simd.cpp | 1007 +------------------------ blake2s-simd.cpp | 1078 +++++++++++++++++++++++++++ cryptest.nmake | 8 +- cryptlib.vcxproj | 3 +- cryptlib.vcxproj.filters | 5 +- 7 files changed, 1117 insertions(+), 1020 deletions(-) rename blake2-simd.cpp => blake2b-simd.cpp (51%) create mode 100644 blake2s-simd.cpp diff --git a/Filelist.txt b/Filelist.txt index 22b7b83f..60d37620 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -35,7 +35,8 @@ bench2.cpp bench3.cpp bfinit.cpp blake2.cpp -blake2-simd.cpp +blake2s-simd.cpp +blake2b-simd.cpp blake2.h blowfish.cpp blowfish.h diff --git a/GNUmakefile b/GNUmakefile index d9558251..b07e68bd 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -257,7 +257,8 @@ ifeq ($(findstring -DCRYPTOPP_DISABLE_SSSE3,$(CXXFLAGS)),) ifeq ($(findstring -DCRYPTOPP_DISABLE_SSE4,$(CXXFLAGS)),) HAVE_SSE4 = $(shell $(CXX) $(CXXFLAGS) -msse4.1 -dM -E pch.cpp 2>&1 | $(GREP) -i -c __SSE4_1__) ifeq ($(HAVE_SSE4),1) - BLAKE2_FLAG = -msse4.1 + BLAKE2B_FLAG = -msse4.1 + BLAKE2S_FLAG = -msse4.1 SIMON64_FLAG = -msse4.1 SPECK64_FLAG = -msse4.1 endif @@ -309,7 +310,8 @@ ifeq ($(SUN_COMPILER),1) endif COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) - BLAKE2_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 + BLAKE2B_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 + BLAKE2S_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 SIMON64_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 SPECK64_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 LDFLAGS += -xarch=sse4_1 @@ -377,7 +379,8 @@ ifeq ($(IS_NEON),1) AES_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon CRC_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon GCM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon - BLAKE2_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon + BLAKE2B_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon + BLAKE2S_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon CHACHA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon CHAM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon LEA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon @@ -395,7 +398,8 @@ ifeq ($(IS_ARMV8),1) HAVE_NEON = $(shell $(CXX) $(CXXFLAGS) -march=armv8-a -dM -E pch.cpp 2>&1 | $(GREP) -i -c __ARM_NEON) ifeq ($(HAVE_NEON),1) ARIA_FLAG = -march=armv8-a - BLAKE2_FLAG = -march=armv8-a + BLAKE2B_FLAG = -march=armv8-a + BLAKE2S_FLAG = -march=armv8-a CHACHA_FLAG = -march=armv8-a CHAM_FLAG = -march=armv8-a LEA_FLAG = -march=armv8-a @@ -432,7 +436,8 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00) ifneq ($(HAVE_POWER8),0) POWER8_FLAG = -mcpu=power8 -maltivec AES_FLAG = $(POWER8_FLAG) - BLAKE2_FLAG = $(POWER8_FLAG) + BLAKE2B_FLAG = $(POWER8_FLAG) + BLAKE2S_FLAG = $(POWER8_FLAG) CHACHA_FLAG = $(POWER8_FLAG) GCM_FLAG = $(POWER8_FLAG) SHA_FLAG = $(POWER8_FLAG) @@ -448,6 +453,7 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00) ifneq ($(HAVE_POWER7),0) POWER7_FLAG = -mcpu=power7 -maltivec ARIA_FLAG = $(POWER7_FLAG) + BLAKE2S_FLAG = $(POWER7_FLAG) CHAM_FLAG = $(POWER7_FLAG) LEA_FLAG = $(POWER7_FLAG) SIMECK_FLAG = $(POWER7_FLAG) @@ -466,7 +472,8 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00) ifneq ($(HAVE_POWER8),0) POWER8_FLAG = -qarch=pwr8 -qaltivec AES_FLAG = $(POWER8_FLAG) - 
BLAKE2_FLAG = $(POWER8_FLAG)
+ BLAKE2B_FLAG = $(POWER8_FLAG)
+ BLAKE2S_FLAG = $(POWER8_FLAG)
 CHACHA_FLAG = $(POWER8_FLAG)
 GCM_FLAG = $(POWER8_FLAG)
 SHA_FLAG = $(POWER8_FLAG)
@@ -482,6 +489,7 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00)
 ifneq ($(HAVE_POWER7),0)
 POWER7_FLAG = -qarch=pwr7 -qaltivec
 ARIA_FLAG = $(POWER7_FLAG)
+ BLAKE2S_FLAG = $(POWER7_FLAG)
 CHAM_FLAG = $(POWER7_FLAG)
 LEA_FLAG = $(POWER7_FLAG)
 SIMECK_FLAG = $(POWER7_FLAG)
@@ -502,7 +510,8 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00)
 ifneq ($(HAVE_LLVM),0)
 POWER7_FLAG = $(POWER8_FLAG)
 ARIA_FLAG = $(POWER8_FLAG)
- BLAKE2_FLAG = $(POWER8_FLAG)
+ BLAKE2B_FLAG = $(POWER8_FLAG)
+ BLAKE2S_FLAG = $(POWER8_FLAG)
 CHACHA_FLAG = $(POWER8_FLAG)
 CHAM_FLAG = $(POWER8_FLAG)
 LEA_FLAG = $(POWER8_FLAG)
@@ -1176,9 +1185,13 @@ aes-armv4.o : aes-armv4.S
 aria-simd.o : aria-simd.cpp
	$(CXX) $(strip $(CXXFLAGS) $(ARIA_FLAG) -c) $<
-# SSE4.1 or ARMv8a available
-blake2-simd.o : blake2-simd.cpp
-	$(CXX) $(strip $(CXXFLAGS) $(BLAKE2_FLAG) -c) $<
+# SSE, NEON or POWER7 available
+blake2s-simd.o : blake2s-simd.cpp
+	$(CXX) $(strip $(CXXFLAGS) $(BLAKE2S_FLAG) -c) $<
+
+# SSE, NEON or POWER8 available
+blake2b-simd.o : blake2b-simd.cpp
+	$(CXX) $(strip $(CXXFLAGS) $(BLAKE2B_FLAG) -c) $<

 # SSE2 or NEON available
 chacha-simd.o : chacha-simd.cpp
diff --git a/blake2-simd.cpp b/blake2b-simd.cpp
similarity index 51%
rename from blake2-simd.cpp
rename to blake2b-simd.cpp
index 145d365d..5a7f65d3 100644
--- a/blake2-simd.cpp
+++ b/blake2b-simd.cpp
@@ -3,9 +3,9 @@
 // Jeffrey Walton, Uri Blumenthal and Marcel Raad.
 //
 // This source file uses intrinsics to gain access to ARMv7a/ARMv8a
-// NEON and SSE4.2 instructions. A separate source file is needed
-// because additional CXXFLAGS are required to enable the appropriate
-// instructions sets in some build configurations.
+// NEON, Power8 and SSE4.1 instructions. A separate source file is
+// needed because additional CXXFLAGS are required to enable the
+// appropriate instruction sets in some build configurations.

 #include "pch.h"
 #include "config.h"
@@ -41,30 +41,15 @@
 # include <arm_acle.h>
 #endif

-#if defined(CRYPTOPP_POWER7_AVAILABLE)
+#if defined(CRYPTOPP_POWER8_AVAILABLE)
 # include "ppc-simd.h"
 #endif

-// Disable POWER7 on PowerPC big-endian machines. BLAKE2s runs slower than C++.
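
The removal hunk resumes just below; it strips the BLAKE2s-only pieces out of what is now blake2b-simd.cpp, and they reappear in the new blake2s-simd.cpp later in this patch. The point of the split is that each translation unit is compiled with exactly the ISA flags it needs (BLAKE2S_FLAG versus BLAKE2B_FLAG above), while the choice between the compiled kernels is still made at runtime in blake2.cpp. A minimal sketch of that dispatch pattern, assuming the cpu.h feature probes (HasSSE41, HasNEON, HasPower7) and the kernel signatures appearing in this patch; blake2.cpp's actual selection code may differ:

    #include "blake2.h"   // BLAKE2_State
    #include "cpu.h"      // HasSSE41(), HasNEON(), HasPower7()

    namespace CryptoPP {

    // Kernels from this patch; each definition lives in its own object
    // file, built with its own ISA flags from the GNUmakefile above.
    #if CRYPTOPP_SSE41_AVAILABLE
    void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State<word32, false>& state);
    #endif
    #if CRYPTOPP_ARM_NEON_AVAILABLE
    void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State<word32, false>& state);
    #endif
    #if CRYPTOPP_POWER7_AVAILABLE
    void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2_State<word32, false>& state);
    #endif

    typedef void (*Compress32Fn)(const byte*, BLAKE2_State<word32, false>&);

    // Probe the CPU once and pick the fastest kernel that was compiled in.
    inline Compress32Fn SelectCompress32()
    {
    #if CRYPTOPP_SSE41_AVAILABLE
        if (HasSSE41()) return &BLAKE2_Compress32_SSE4;
    #endif
    #if CRYPTOPP_ARM_NEON_AVAILABLE
        if (HasNEON()) return &BLAKE2_Compress32_NEON;
    #endif
    #if CRYPTOPP_POWER7_AVAILABLE
        if (HasPower7()) return &BLAKE2_Compress32_POWER7;
    #endif
        return NULLPTR;  // caller falls back to the portable C++ compressor
    }

    }  // namespace CryptoPP

Because the probes run at runtime, a single binary can carry the SSE4.1, NEON or POWER7 path and still start safely on older hardware; the per-file flags only license the compiler to emit those instructions inside the matching object file.
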
-#if defined(__powerpc__) && defined(__BIG_ENDIAN__) -# undef CRYPTOPP_POWER7_AVAILABLE -#endif - ANONYMOUS_NAMESPACE_BEGIN using CryptoPP::word32; using CryptoPP::word64; -#if (CRYPTOPP_SSE41_AVAILABLE || CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_POWER7_AVAILABLE) - -CRYPTOPP_ALIGN_DATA(16) -const word32 BLAKE2S_IV[8] = { - 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, - 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL -}; - -#endif - #if (CRYPTOPP_SSE41_AVAILABLE || CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_POWER8_AVAILABLE) CRYPTOPP_ALIGN_DATA(16) @@ -88,300 +73,6 @@ NAMESPACE_BEGIN(CryptoPP) #define TOF(reg) _mm_castsi128_ps((reg)) #define TOI(reg) _mm_castps_si128((reg)) -void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State& state) -{ - #define BLAKE2S_LOAD_MSG_0_1(buf) \ - buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0))); - - #define BLAKE2S_LOAD_MSG_0_2(buf) \ - buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1))); - - #define BLAKE2S_LOAD_MSG_0_3(buf) \ - buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0))); - - #define BLAKE2S_LOAD_MSG_0_4(buf) \ - buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1))); - - #define BLAKE2S_LOAD_MSG_1_1(buf) \ - t0 = _mm_blend_epi16(m1, m2, 0x0C); \ - t1 = _mm_slli_si128(m3, 4); \ - t2 = _mm_blend_epi16(t0, t1, 0xF0); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); - - #define BLAKE2S_LOAD_MSG_1_2(buf) \ - t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \ - t1 = _mm_blend_epi16(m1,m3,0xC0); \ - t2 = _mm_blend_epi16(t0, t1, 0xF0); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); - - #define BLAKE2S_LOAD_MSG_1_3(buf) \ - t0 = _mm_slli_si128(m1, 4); \ - t1 = _mm_blend_epi16(m2, t0, 0x30); \ - t2 = _mm_blend_epi16(m0, t1, 0xF0); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); - - #define BLAKE2S_LOAD_MSG_1_4(buf) \ - t0 = _mm_unpackhi_epi32(m0,m1); \ - t1 = _mm_slli_si128(m3, 4); \ - t2 = _mm_blend_epi16(t0, t1, 0x0C); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); - - #define BLAKE2S_LOAD_MSG_2_1(buf) \ - t0 = _mm_unpackhi_epi32(m2,m3); \ - t1 = _mm_blend_epi16(m3,m1,0x0C); \ - t2 = _mm_blend_epi16(t0, t1, 0x0F); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); - - #define BLAKE2S_LOAD_MSG_2_2(buf) \ - t0 = _mm_unpacklo_epi32(m2,m0); \ - t1 = _mm_blend_epi16(t0, m0, 0xF0); \ - t2 = _mm_slli_si128(m3, 8); \ - buf = _mm_blend_epi16(t1, t2, 0xC0); - - #define BLAKE2S_LOAD_MSG_2_3(buf) \ - t0 = _mm_blend_epi16(m0, m2, 0x3C); \ - t1 = _mm_srli_si128(m1, 12); \ - t2 = _mm_blend_epi16(t0,t1,0x03); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2)); - - #define BLAKE2S_LOAD_MSG_2_4(buf) \ - t0 = _mm_slli_si128(m3, 4); \ - t1 = _mm_blend_epi16(m0, m1, 0x33); \ - t2 = _mm_blend_epi16(t1, t0, 0xC0); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3)); - - #define BLAKE2S_LOAD_MSG_3_1(buf) \ - t0 = _mm_unpackhi_epi32(m0,m1); \ - t1 = _mm_unpackhi_epi32(t0, m2); \ - t2 = _mm_blend_epi16(t1, m3, 0x0C); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); - - #define BLAKE2S_LOAD_MSG_3_2(buf) \ - t0 = _mm_slli_si128(m2, 8); \ - t1 = _mm_blend_epi16(m3,m0,0x0C); \ - t2 = _mm_blend_epi16(t1, t0, 0xC0); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); - - #define BLAKE2S_LOAD_MSG_3_3(buf) \ - t0 = _mm_blend_epi16(m0,m1,0x0F); \ - t1 = _mm_blend_epi16(t0, m3, 0xC0); \ - buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); - - #define BLAKE2S_LOAD_MSG_3_4(buf) \ - t0 = _mm_unpacklo_epi32(m0,m2); \ - t1 = _mm_unpackhi_epi32(m1,m2); \ - buf = 
_mm_unpacklo_epi64(t1,t0); - - #define BLAKE2S_LOAD_MSG_4_1(buf) \ - t0 = _mm_unpacklo_epi64(m1,m2); \ - t1 = _mm_unpackhi_epi64(m0,m2); \ - t2 = _mm_blend_epi16(t0,t1,0x33); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); - - #define BLAKE2S_LOAD_MSG_4_2(buf) \ - t0 = _mm_unpackhi_epi64(m1,m3); \ - t1 = _mm_unpacklo_epi64(m0,m1); \ - buf = _mm_blend_epi16(t0,t1,0x33); - - #define BLAKE2S_LOAD_MSG_4_3(buf) \ - t0 = _mm_unpackhi_epi64(m3,m1); \ - t1 = _mm_unpackhi_epi64(m2,m0); \ - buf = _mm_blend_epi16(t1,t0,0x33); - - #define BLAKE2S_LOAD_MSG_4_4(buf) \ - t0 = _mm_blend_epi16(m0,m2,0x03); \ - t1 = _mm_slli_si128(t0, 8); \ - t2 = _mm_blend_epi16(t1,m3,0x0F); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3)); - - #define BLAKE2S_LOAD_MSG_5_1(buf) \ - t0 = _mm_unpackhi_epi32(m0,m1); \ - t1 = _mm_unpacklo_epi32(m0,m2); \ - buf = _mm_unpacklo_epi64(t0,t1); - - #define BLAKE2S_LOAD_MSG_5_2(buf) \ - t0 = _mm_srli_si128(m2, 4); \ - t1 = _mm_blend_epi16(m0,m3,0x03); \ - buf = _mm_blend_epi16(t1,t0,0x3C); - - #define BLAKE2S_LOAD_MSG_5_3(buf) \ - t0 = _mm_blend_epi16(m1,m0,0x0C); \ - t1 = _mm_srli_si128(m3, 4); \ - t2 = _mm_blend_epi16(t0,t1,0x30); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0)); - - #define BLAKE2S_LOAD_MSG_5_4(buf) \ - t0 = _mm_unpacklo_epi64(m1,m2); \ - t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \ - buf = _mm_blend_epi16(t0,t1,0x33); - - #define BLAKE2S_LOAD_MSG_6_1(buf) \ - t0 = _mm_slli_si128(m1, 12); \ - t1 = _mm_blend_epi16(m0,m3,0x33); \ - buf = _mm_blend_epi16(t1,t0,0xC0); - - #define BLAKE2S_LOAD_MSG_6_2(buf) \ - t0 = _mm_blend_epi16(m3,m2,0x30); \ - t1 = _mm_srli_si128(m1, 4); \ - t2 = _mm_blend_epi16(t0,t1,0x03); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0)); - - #define BLAKE2S_LOAD_MSG_6_3(buf) \ - t0 = _mm_unpacklo_epi64(m0,m2); \ - t1 = _mm_srli_si128(m1, 4); \ - buf = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0)); - - #define BLAKE2S_LOAD_MSG_6_4(buf) \ - t0 = _mm_unpackhi_epi32(m1,m2); \ - t1 = _mm_unpackhi_epi64(m0,t0); \ - buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); - - #define BLAKE2S_LOAD_MSG_7_1(buf) \ - t0 = _mm_unpackhi_epi32(m0,m1); \ - t1 = _mm_blend_epi16(t0,m3,0x0F); \ - buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1)); - - #define BLAKE2S_LOAD_MSG_7_2(buf) \ - t0 = _mm_blend_epi16(m2,m3,0x30); \ - t1 = _mm_srli_si128(m0,4); \ - t2 = _mm_blend_epi16(t0,t1,0x03); \ - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3)); - - #define BLAKE2S_LOAD_MSG_7_3(buf) \ - t0 = _mm_unpackhi_epi64(m0,m3); \ - t1 = _mm_unpacklo_epi64(m1,m2); \ - t2 = _mm_blend_epi16(t0,t1,0x3C); \ - buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1)); - - #define BLAKE2S_LOAD_MSG_7_4(buf) \ - t0 = _mm_unpacklo_epi32(m0,m1); \ - t1 = _mm_unpackhi_epi32(m1,m2); \ - buf = _mm_unpacklo_epi64(t0,t1); - - #define BLAKE2S_LOAD_MSG_8_1(buf) \ - t0 = _mm_unpackhi_epi32(m1,m3); \ - t1 = _mm_unpacklo_epi64(t0,m0); \ - t2 = _mm_blend_epi16(t1,m2,0xC0); \ - buf = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2)); - - #define BLAKE2S_LOAD_MSG_8_2(buf) \ - t0 = _mm_unpackhi_epi32(m0,m3); \ - t1 = _mm_blend_epi16(m2,t0,0xF0); \ - buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3)); - - #define BLAKE2S_LOAD_MSG_8_3(buf) \ - t0 = _mm_blend_epi16(m2,m0,0x0C); \ - t1 = _mm_slli_si128(t0,4); \ - buf = _mm_blend_epi16(t1,m3,0x0F); - - #define BLAKE2S_LOAD_MSG_8_4(buf) \ - t0 = _mm_blend_epi16(m1,m0,0x30); \ - buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2)); - - #define BLAKE2S_LOAD_MSG_9_1(buf) \ - t0 = _mm_blend_epi16(m0,m2,0x03); \ - t1 = 
_mm_blend_epi16(m1,m2,0x30); \ - t2 = _mm_blend_epi16(t1,t0,0x0F); \ - buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2)); - - #define BLAKE2S_LOAD_MSG_9_2(buf) \ - t0 = _mm_slli_si128(m0,4); \ - t1 = _mm_blend_epi16(m1,t0,0xC0); \ - buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3)); - - #define BLAKE2S_LOAD_MSG_9_3(buf) \ - t0 = _mm_unpackhi_epi32(m0,m3); \ - t1 = _mm_unpacklo_epi32(m2,m3); \ - t2 = _mm_unpackhi_epi64(t0,t1); \ - buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1)); - - #define BLAKE2S_LOAD_MSG_9_4(buf) \ - t0 = _mm_blend_epi16(m3,m2,0xC0); \ - t1 = _mm_unpacklo_epi32(m0,m3); \ - t2 = _mm_blend_epi16(t0,t1,0x0F); \ - buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3)); - -#ifdef __XOP__ -# define MM_ROTI_EPI32(r, c) \ - _mm_roti_epi32(r, c) -#else -# define MM_ROTI_EPI32(r, c) ( \ - (8==-(c)) ? _mm_shuffle_epi8(r,r8) \ - : (16==-(c)) ? _mm_shuffle_epi8(r,r16) \ - : _mm_xor_si128(_mm_srli_epi32( (r), -(c) ), \ - _mm_slli_epi32( (r), 32-(-(c)) )) ) -#endif - -#define BLAKE2S_G1(row1,row2,row3,row4,buf) \ - row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \ - row4 = _mm_xor_si128( row4, row1 ); \ - row4 = MM_ROTI_EPI32(row4, -16); \ - row3 = _mm_add_epi32( row3, row4 ); \ - row2 = _mm_xor_si128( row2, row3 ); \ - row2 = MM_ROTI_EPI32(row2, -12); - -#define BLAKE2S_G2(row1,row2,row3,row4,buf) \ - row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \ - row4 = _mm_xor_si128( row4, row1 ); \ - row4 = MM_ROTI_EPI32(row4, -8); \ - row3 = _mm_add_epi32( row3, row4 ); \ - row2 = _mm_xor_si128( row2, row3 ); \ - row2 = MM_ROTI_EPI32(row2, -7); - -#define DIAGONALIZE(row1,row2,row3,row4) \ - row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \ - row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \ - row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) ); - -#define UNDIAGONALIZE(row1,row2,row3,row4) \ - row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \ - row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \ - row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) ); - -#define BLAKE2S_ROUND(r) \ - BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \ - BLAKE2S_G1(row1,row2,row3,row4,buf1); \ - BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \ - BLAKE2S_G2(row1,row2,row3,row4,buf2); \ - DIAGONALIZE(row1,row2,row3,row4); \ - BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \ - BLAKE2S_G1(row1,row2,row3,row4,buf3); \ - BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \ - BLAKE2S_G2(row1,row2,row3,row4,buf4); \ - UNDIAGONALIZE(row1,row2,row3,row4); - - __m128i row1, row2, row3, row4; - __m128i buf1, buf2, buf3, buf4; - __m128i t0, t1, t2, ff0, ff1; - - const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 ); - const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 ); - - const __m128i m0 = LOADU( input + 00 ); - const __m128i m1 = LOADU( input + 16 ); - const __m128i m2 = LOADU( input + 32 ); - const __m128i m3 = LOADU( input + 48 ); - - row1 = ff0 = LOADU( &state.h[0] ); - row2 = ff1 = LOADU( &state.h[4] ); - row3 = LOADU( &BLAKE2S_IV[0] ); - row4 = _mm_xor_si128( LOADU( &BLAKE2S_IV[4] ), LOADU( &state.t[0] ) ); - - BLAKE2S_ROUND( 0 ); - BLAKE2S_ROUND( 1 ); - BLAKE2S_ROUND( 2 ); - BLAKE2S_ROUND( 3 ); - BLAKE2S_ROUND( 4 ); - BLAKE2S_ROUND( 5 ); - BLAKE2S_ROUND( 6 ); - BLAKE2S_ROUND( 7 ); - BLAKE2S_ROUND( 8 ); - BLAKE2S_ROUND( 9 ); - - STOREU( &state.h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) ); - STOREU( &state.h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) ); -} - void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State& state) { #define 
BLAKE2B_LOAD_MSG_0_1(b0, b1) \ @@ -809,326 +500,6 @@ void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State& state #endif // CRYPTOPP_SSE41_AVAILABLE #if CRYPTOPP_ARM_NEON_AVAILABLE -void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State& state) -{ - #define BLAKE2S_LOAD_MSG_0_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[0]; \ - t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_0_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[1]; \ - t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_0_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[0]; \ - t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_0_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[1]; \ - t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_1_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \ - t1 = vzip_u32(vget_low_u32(m2), vget_low_u32(m3)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_1_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \ - t1 = vext_u32(vget_high_u32(m3), vget_high_u32(m1), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_1_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m0), 1); \ - t1 = vzip_u32(vget_high_u32(m2), vget_low_u32(m1)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_1_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m0)).val[0]; \ - t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_2_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vext_u32(vget_high_u32(m2), vget_low_u32(m3), 1); \ - t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_2_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[0]; \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m3)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_2_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m2), vget_high_u32(m0)); \ - t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m2)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_2_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m3), vget_high_u32(m1)).val[0]; \ - t1 = vext_u32(vget_low_u32(m0), vget_low_u32(m1), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_3_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \ - t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_3_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[1]; \ - t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \ - buf = vcombine_u32(t0, 
t1); } while(0) - - #define BLAKE2S_LOAD_MSG_3_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m1)); \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_3_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \ - t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_4_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m1)).val[1]; \ - t1 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m2)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_4_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m0), vget_high_u32(m1)); \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_4_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_high_u32(m2)); \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_high_u32(m0)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_4_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m3), 1); \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m2), vget_low_u32(m3)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_5_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m1)).val[0]; \ - t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_5_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[0]; \ - t1 = vzip_u32(vget_high_u32(m2), vget_high_u32(m0)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_5_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m1)); \ - t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m0)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_5_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m3), vget_low_u32(m1)).val[1]; \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_low_u32(m2)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_6_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m0)); \ - t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_6_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \ - t1 = vext_u32(vget_low_u32(m3), vget_high_u32(m2), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_6_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m1)).val[0]; \ - t1 = vext_u32(vget_low_u32(m2), vget_low_u32(m2), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_6_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m2)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_7_1(buf) \ - do { uint32x2_t 
t0, t1; \ - t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m1)).val[1]; \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_high_u32(m0)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_7_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \ - t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_7_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \ - t1 = vzip_u32(vget_low_u32(m2), vget_high_u32(m0)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_7_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m0), vget_low_u32(m1)).val[0]; \ - t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_8_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m3)).val[0]; \ - t1 = vext_u32(vget_high_u32(m2), vget_low_u32(m0), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_8_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \ - t1 = vext_u32(vget_high_u32(m0), vget_low_u32(m2), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_8_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m3)); \ - t1 = vext_u32(vget_low_u32(m0), vget_high_u32(m2), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_8_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m1)); \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_low_u32(m1)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_9_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \ - t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m0)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_9_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m1)).val[0]; \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_low_u32(m1)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_9_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \ - t1 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m3)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_9_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \ - t1 = vzip_u32(vget_low_u32(m3), vget_low_u32(m0)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define vrorq_n_u32_16(x) vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x))) - - #define vrorq_n_u32_8(x) vsriq_n_u32(vshlq_n_u32((x), 24), (x), 8) - - #define vrorq_n_u32(x, c) vsriq_n_u32(vshlq_n_u32((x), 32-(c)), (x), (c)) - - #define BLAKE2S_G1(row1,row2,row3,row4,buf) \ - do { \ - row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \ - row4 = vrorq_n_u32_16(row4); row3 = vaddq_u32(row3, row4); \ - row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 12); \ - } while(0) - - #define BLAKE2S_G2(row1,row2,row3,row4,buf) \ - do { \ - row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \ - row4 = vrorq_n_u32_8(row4); row3 = vaddq_u32(row3, row4); 
\ - row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 7); \ - } while(0) - - #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \ - do { \ - row4 = vextq_u32(row4, row4, 3); row3 = vextq_u32(row3, row3, 2); row2 = vextq_u32(row2, row2, 1); \ - } while(0) - - #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \ - do { \ - row4 = vextq_u32(row4, row4, 1); \ - row3 = vextq_u32(row3, row3, 2); \ - row2 = vextq_u32(row2, row2, 3); \ - } while(0) - - #define BLAKE2S_ROUND(r) \ - do { \ - uint32x4_t buf1, buf2, buf3, buf4; \ - BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \ - BLAKE2S_G1(row1,row2,row3,row4,buf1); \ - BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \ - BLAKE2S_G2(row1,row2,row3,row4,buf2); \ - BLAKE2S_DIAGONALIZE(row1,row2,row3,row4); \ - BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \ - BLAKE2S_G1(row1,row2,row3,row4,buf3); \ - BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \ - BLAKE2S_G2(row1,row2,row3,row4,buf4); \ - BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4); \ - } while(0) - - CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf())); - CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf())); - CRYPTOPP_ASSERT(IsAlignedOn(&state.f[0],GetAlignmentOf())); - - const uint32x4_t m0 = vreinterpretq_u32_u8(vld1q_u8((input + 00))); - const uint32x4_t m1 = vreinterpretq_u32_u8(vld1q_u8((input + 16))); - const uint32x4_t m2 = vreinterpretq_u32_u8(vld1q_u8((input + 32))); - const uint32x4_t m3 = vreinterpretq_u32_u8(vld1q_u8((input + 48))); - - uint32x4_t row1, row2, row3, row4; - - const uint32x4_t f0 = row1 = vld1q_u32(&state.h[0]); - const uint32x4_t f1 = row2 = vld1q_u32(&state.h[4]); - row3 = vld1q_u32(&BLAKE2S_IV[0]); - row4 = veorq_u32(vld1q_u32(&BLAKE2S_IV[4]), vld1q_u32(&state.t[0])); - - BLAKE2S_ROUND(0); - BLAKE2S_ROUND(1); - BLAKE2S_ROUND(2); - BLAKE2S_ROUND(3); - BLAKE2S_ROUND(4); - BLAKE2S_ROUND(5); - BLAKE2S_ROUND(6); - BLAKE2S_ROUND(7); - BLAKE2S_ROUND(8); - BLAKE2S_ROUND(9); - - vst1q_u32(&state.h[0], veorq_u32(f0, veorq_u32(row1, row3))); - vst1q_u32(&state.h[4], veorq_u32(f1, veorq_u32(row2, row4))); -} - void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State& state) { #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \ @@ -1384,376 +755,6 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State& state } #endif // CRYPTOPP_ARM_NEON_AVAILABLE -#if (CRYPTOPP_POWER7_AVAILABLE) - -inline uint32x4_p VectorLoad32(const void* p) -{ -#if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - return (uint32x4_p)vec_xl(0, (uint8_t*)p); -#else - return (uint32x4_p)vec_vsx_ld(0, (uint8_t*)p); -#endif -} - -inline uint32x4_p VectorLoad32LE(const void* p) -{ -#if __BIG_ENDIAN__ - const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; - const uint32x4_p v = VectorLoad32(p); - return vec_perm(v, v, m); -#else - return VectorLoad32(p); -#endif -} - -inline void VectorStore32(void* p, const uint32x4_p x) -{ -#if defined(__xlc__) || defined(__xlC__) || defined(__clang__) - vec_xst((uint8x16_p)x,0,(uint8_t*)p); -#else - vec_vsx_st((uint8x16_p)x,0,(uint8_t*)p); -#endif -} - -inline void VectorStore32LE(void* p, const uint32x4_p x) -{ -#if __BIG_ENDIAN__ - const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; - VectorStore32(p, vec_perm(x, x, m)); -#else - VectorStore32(p, x); -#endif -} - -template -inline uint8x16_p VectorShiftLeftOctet(const uint8x16_p a) -{ -#if __BIG_ENDIAN__ - return (uint8x16_p)vec_sld((uint8x16_p)a, (uint8x16_p)a, C); -#else - return (uint8x16_p)vec_sld((uint8x16_p)a, (uint8x16_p)a, 16-C); -#endif -} - -template -inline uint32x4_p VectorShiftLeftOctet(const uint32x4_p a) -{ -#if 
__BIG_ENDIAN__ - return (uint32x4_p)vec_sld((uint8x16_p)a, (uint8x16_p)a, C); -#else - return (uint32x4_p)vec_sld((uint8x16_p)a, (uint8x16_p)a, 16-C); -#endif -} - -template -inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b) -{ - // Re-index - enum {X=E1&3, Y=E2&3}; - - // Don't care element - const unsigned int DC = 31; - - // Element 3 combinations - if (X == 0 && Y == 0) - { - const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, b, mask); - } - else if (X == 0 && Y == 1) - { - const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<4>(b), mask); - } - else if (X == 0 && Y == 2) - { - const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<8>(b), mask); - } - else if (X == 0 && Y == 3) - { - const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<12>(b), mask); - } - - // Element 1 combinations - else if (X == 1 && Y == 0) - { - const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, b, mask); - } - else if (X == 1 && Y == 1) - { - const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<4>(b), mask); - } - else if (X == 1 && Y == 2) - { - const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<8>(b), mask); - } - else if (X == 1 && Y == 3) - { - const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<12>(b), mask); - } - - // Element 2 combinations - else if (X == 2 && Y == 0) - { - const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, b, mask); - } - else if (X == 2 && Y == 1) - { - const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<4>(b), mask); - } - else if (X == 2 && Y == 2) - { - const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<8>(b), mask); - } - else if (X == 2 && Y == 3) - { - const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<12>(b), mask); - } - - // Element 3 combinations - else if (X == 3 && Y == 0) - { - const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, b, mask); - } - else if (X == 3 && Y == 1) - { - const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<4>(b), mask); - } - else if (X == 3 && Y == 2) - { - const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<8>(b), mask); - } - else if (X == 3 && Y == 3) - { - const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC}; - return vec_perm(a, VectorShiftLeftOctet<12>(b), mask); - } -} - -template -inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b, - const uint32x4_p c, const uint32x4_p d) -{ - // Re-index - enum {W=E1&3, X=E2&3, Y=E3&3, Z=E4&3}; - - const uint32x4_p t0 = VectorSet32(a, b); - const uint32x4_p t1 = VectorSet32(c, d); - - // Power7 follows SSE2's implementation, and this is _mm_set_epi32. 
- const uint8x16_p mask = {20,21,22,23, 16,17,18,19, 4,5,6,7, 0,1,2,3}; - const uint32x4_p r = vec_perm(t0, t1, mask); - return r; -} - -template<> -uint32x4_p VectorSet32<2,0,2,0>(const uint32x4_p a, const uint32x4_p b, - const uint32x4_p c, const uint32x4_p d) -{ - // a=b, c=d, mask is {2,0, 2,0} - const uint8x16_p mask = {16,17,18,19, 24,25,26,27, 0,1,2,3, 8,9,10,11}; - const uint32x4_p r = vec_perm(a, c, mask); - return r; -} - -template<> -uint32x4_p VectorSet32<3,1,3,1>(const uint32x4_p a, const uint32x4_p b, - const uint32x4_p c, const uint32x4_p d) -{ - // a=b, c=d, mask is {3,1, 3,1} - const uint8x16_p mask = {20,21,22,23, 28,29,30,31, 4,5,6,7, 12,13,14,15}; - const uint32x4_p r = vec_perm(a, c, mask); - return r; -} - -void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2_State& state) -{ - # define m1 m0 - # define m2 m0 - # define m3 m0 - - # define m5 m4 - # define m6 m4 - # define m7 m4 - - # define m9 m8 - # define m10 m8 - # define m11 m8 - - # define m13 m12 - # define m14 m12 - # define m15 m12 - - // #define BLAKE2S_LOAD_MSG_0_1(buf) buf = VectorSet32<6,4,2,0>(m6,m4,m2,m0); - #define BLAKE2S_LOAD_MSG_0_1(buf) buf = VectorSet32<2,0,2,0>(m6,m4,m2,m0); - // #define BLAKE2S_LOAD_MSG_0_2(buf) buf = VectorSet32<7,5,3,1>(m7,m5,m3,m1); - #define BLAKE2S_LOAD_MSG_0_2(buf) buf = VectorSet32<3,1,3,1>(m7,m5,m3,m1); - // #define BLAKE2S_LOAD_MSG_0_3(buf) buf = VectorSet32<14,12,10,8>(m14,m12,m10,m8); - #define BLAKE2S_LOAD_MSG_0_3(buf) buf = VectorSet32<2,0,2,0>(m14,m12,m10,m8); - // #define BLAKE2S_LOAD_MSG_0_4(buf) buf = VectorSet32<15,13,11,9>(m15,m13,m11,m9); - #define BLAKE2S_LOAD_MSG_0_4(buf) buf = VectorSet32<3,1,3,1>(m15,m13,m11,m9); - - #define BLAKE2S_LOAD_MSG_1_1(buf) buf = VectorSet32<13,9,4,14>(m13,m9,m4,m14); - #define BLAKE2S_LOAD_MSG_1_2(buf) buf = VectorSet32<6,15,8,10>(m6,m15,m8,m10) - #define BLAKE2S_LOAD_MSG_1_3(buf) buf = VectorSet32<5,11,0,1>(m5,m11,m0,m1) - #define BLAKE2S_LOAD_MSG_1_4(buf) buf = VectorSet32<3,7,2,12>(m3,m7,m2,m12) - - #define BLAKE2S_LOAD_MSG_2_1(buf) buf = VectorSet32<15,5,12,11>(m15,m5,m12,m11) - #define BLAKE2S_LOAD_MSG_2_2(buf) buf = VectorSet32<13,2,0,8>(m13,m2,m0,m8) - #define BLAKE2S_LOAD_MSG_2_3(buf) buf = VectorSet32<9,7,3,10>(m9,m7,m3,m10) - #define BLAKE2S_LOAD_MSG_2_4(buf) buf = VectorSet32<4,1,6,14>(m4,m1,m6,m14) - - #define BLAKE2S_LOAD_MSG_3_1(buf) buf = VectorSet32<11,13,3,7>(m11,m13,m3,m7) - #define BLAKE2S_LOAD_MSG_3_2(buf) buf = VectorSet32<14,12,1,9>(m14,m12,m1,m9) - #define BLAKE2S_LOAD_MSG_3_3(buf) buf = VectorSet32<15,4,5,2>(m15,m4,m5,m2) - #define BLAKE2S_LOAD_MSG_3_4(buf) buf = VectorSet32<8,0,10,6>(m8,m0,m10,m6) - - #define BLAKE2S_LOAD_MSG_4_1(buf) buf = VectorSet32<10,2,5,9>(m10,m2,m5,m9) - #define BLAKE2S_LOAD_MSG_4_2(buf) buf = VectorSet32<15,4,7,0>(m15,m4,m7,m0) - #define BLAKE2S_LOAD_MSG_4_3(buf) buf = VectorSet32<3,6,11,14>(m3,m6,m11,m14) - #define BLAKE2S_LOAD_MSG_4_4(buf) buf = VectorSet32<13,8,12,1>(m13,m8,m12,m1) - - #define BLAKE2S_LOAD_MSG_5_1(buf) buf = VectorSet32<8,0,6,2>(m8,m0,m6,m2) - #define BLAKE2S_LOAD_MSG_5_2(buf) buf = VectorSet32<3,11,10,12>(m3,m11,m10,m12) - #define BLAKE2S_LOAD_MSG_5_3(buf) buf = VectorSet32<1,15,7,4>(m1,m15,m7,m4) - #define BLAKE2S_LOAD_MSG_5_4(buf) buf = VectorSet32<9,14,5,13>(m9,m14,m5,m13) - - #define BLAKE2S_LOAD_MSG_6_1(buf) buf = VectorSet32<4,14,1,12>(m4,m14,m1,m12) - #define BLAKE2S_LOAD_MSG_6_2(buf) buf = VectorSet32<10,13,15,5>(m10,m13,m15,m5) - #define BLAKE2S_LOAD_MSG_6_3(buf) buf = VectorSet32<8,9,6,0>(m8,m9,m6,m0) - #define BLAKE2S_LOAD_MSG_6_4(buf) buf = 
VectorSet32<11,2,3,7>(m11,m2,m3,m7) - - #define BLAKE2S_LOAD_MSG_7_1(buf) buf = VectorSet32<3,12,7,13>(m3,m12,m7,m13) - #define BLAKE2S_LOAD_MSG_7_2(buf) buf = VectorSet32<9,1,14,11>(m9,m1,m14,m11) - #define BLAKE2S_LOAD_MSG_7_3(buf) buf = VectorSet32<2,8,15,5>(m2,m8,m15,m5) - #define BLAKE2S_LOAD_MSG_7_4(buf) buf = VectorSet32<10,6,4,0>(m10,m6,m4,m0) - - #define BLAKE2S_LOAD_MSG_8_1(buf) buf = VectorSet32<0,11,14,6>(m0,m11,m14,m6) - #define BLAKE2S_LOAD_MSG_8_2(buf) buf = VectorSet32<8,3,9,15>(m8,m3,m9,m15) - #define BLAKE2S_LOAD_MSG_8_3(buf) buf = VectorSet32<10,1,13,12>(m10,m1,m13,m12) - #define BLAKE2S_LOAD_MSG_8_4(buf) buf = VectorSet32<5,4,7,2>(m5,m4,m7,m2) - - #define BLAKE2S_LOAD_MSG_9_1(buf) buf = VectorSet32<1,7,8,10>(m1,m7,m8,m10) - #define BLAKE2S_LOAD_MSG_9_2(buf) buf = VectorSet32<5,6,4,2>(m5,m6,m4,m2) - #define BLAKE2S_LOAD_MSG_9_3(buf) buf = VectorSet32<13,3,9,15>(m13,m3,m9,m15) - #define BLAKE2S_LOAD_MSG_9_4(buf) buf = VectorSet32<0,12,14,11>(m0,m12,m14,m11) - - // Altivec has packed 32-bit rotate, but in terms of left rotate - const uint32x4_p ROR16_MASK = { 32-16, 32-16, 32-16, 32-16 }; - const uint32x4_p ROR12_MASK = { 32-12, 32-12, 32-12, 32-12 }; - const uint32x4_p ROR8_MASK = { 32-8, 32-8, 32-8, 32-8 }; - const uint32x4_p ROR7_MASK = { 32-7, 32-7, 32-7, 32-7 }; - - #define vec_ror_16(x) vec_rl(x, ROR16_MASK) - #define vec_ror_12(x) vec_rl(x, ROR12_MASK) - #define vec_ror_8(x) vec_rl(x, ROR8_MASK) - #define vec_ror_7(x) vec_rl(x, ROR7_MASK) - - #define BLAKE2S_G1(row1,row2,row3,row4,buf) \ - row1 = vec_add( vec_add( row1, buf), row2 ); \ - row4 = vec_xor( row4, row1 ); \ - row4 = vec_ror_16(row4); \ - row3 = vec_add( row3, row4 ); \ - row2 = vec_xor( row2, row3 ); \ - row2 = vec_ror_12(row2); - - #define BLAKE2S_G2(row1,row2,row3,row4,buf) \ - row1 = vec_add( vec_add( row1, buf), row2 ); \ - row4 = vec_xor( row4, row1 ); \ - row4 = vec_ror_8(row4); \ - row3 = vec_add( row3, row4 ); \ - row2 = vec_xor( row2, row3 ); \ - row2 = vec_ror_7(row2); - - const uint8x16_p D2103_MASK = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11}; - const uint8x16_p D1032_MASK = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7}; - const uint8x16_p D0321_MASK = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3}; - - #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \ - row4 = vec_perm( row4, row4, D2103_MASK ); \ - row3 = vec_perm( row3, row3, D1032_MASK ); \ - row2 = vec_perm( row2, row2, D0321_MASK ); - - #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \ - row4 = vec_perm( row4, row4, D0321_MASK ); \ - row3 = vec_perm( row3, row3, D1032_MASK ); \ - row2 = vec_perm( row2, row2, D2103_MASK ); - - #define BLAKE2S_ROUND(r) \ - BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \ - BLAKE2S_G1(row1,row2,row3,row4,buf1); \ - BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \ - BLAKE2S_G2(row1,row2,row3,row4,buf2); \ - BLAKE2S_DIAGONALIZE(row1,row2,row3,row4); \ - BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \ - BLAKE2S_G1(row1,row2,row3,row4,buf3); \ - BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \ - BLAKE2S_G2(row1,row2,row3,row4,buf4); \ - BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4); - - uint32x4_p row1, row2, row3, row4; - uint32x4_p buf1, buf2, buf3, buf4; - uint32x4_p ff0, ff1; - - const uint32x4_p m0 = VectorLoad32LE(input + 0); - const uint32x4_p m4 = VectorLoad32LE(input + 16); - const uint32x4_p m8 = VectorLoad32LE(input + 32); - const uint32x4_p m12 = VectorLoad32LE(input + 48); - - row1 = ff0 = VectorLoad32LE( &state.h[0] ); - row2 = ff1 = VectorLoad32LE( &state.h[4] ); - row3 = VectorLoad32( &BLAKE2S_IV[0] ); - row4 = vec_xor( VectorLoad32( &BLAKE2S_IV[4] 
), VectorLoad32( &state.t[0] ) );
-
- BLAKE2S_ROUND( 0 );
- BLAKE2S_ROUND( 1 );
- BLAKE2S_ROUND( 2 );
- BLAKE2S_ROUND( 3 );
- BLAKE2S_ROUND( 4 );
- BLAKE2S_ROUND( 5 );
- BLAKE2S_ROUND( 6 );
- BLAKE2S_ROUND( 7 );
- BLAKE2S_ROUND( 8 );
- BLAKE2S_ROUND( 9 );
-
- VectorStore32LE( &state.h[0], vec_xor( ff0, vec_xor( row1, row3 ) ) );
- VectorStore32LE( &state.h[4], vec_xor( ff1, vec_xor( row2, row4 ) ) );
-
- #undef m0
- #undef m1
- #undef m2
- #undef m3
-
- #undef m4
- #undef m5
- #undef m6
- #undef m7
-
- #undef m8
- #undef m9
- #undef m10
- #undef m11
-
- #undef m12
- #undef m13
- #undef m14
- #undef m15
-}
-#endif // CRYPTOPP_POWER7_AVAILABLE
-
 #if (CRYPTOPP_POWER8_AVAILABLE)
 inline uint64x2_p VectorLoad64(const void* p)
diff --git a/blake2s-simd.cpp b/blake2s-simd.cpp
new file mode 100644
index 00000000..ebb737dc
--- /dev/null
+++ b/blake2s-simd.cpp
@@ -0,0 +1,1078 @@
+
+// blake2s-simd.cpp - written and placed in the public domain by
+// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
+//
+// This source file uses intrinsics to gain access to ARMv7a/ARMv8a
+// NEON, Power7 and SSE4.1 instructions. A separate source file is
+// needed because additional CXXFLAGS are required to enable the
+// appropriate instruction sets in some build configurations.
+
+#include "pch.h"
+#include "config.h"
+#include "misc.h"
+#include "blake2.h"
+
+// Uncomment for benchmarking C++ against SSE2 or NEON.
+// Do so in both blake2.cpp and blake2s-simd.cpp.
+// #undef CRYPTOPP_SSE41_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+// #undef CRYPTOPP_POWER7_AVAILABLE
+
+// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
+// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
+#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
+# undef CRYPTOPP_ARM_NEON_AVAILABLE
+#endif
+
+#if (CRYPTOPP_SSE41_AVAILABLE)
+# include <emmintrin.h>
+# include <tmmintrin.h>
+# include <smmintrin.h>
+#endif
+
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+# include <arm_neon.h>
+#endif
+
+// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
+// compilers don't follow ACLE conventions for the include.
+#if (CRYPTOPP_ARM_ACLE_AVAILABLE)
+# include <stdint.h>
+# include <arm_acle.h>
+#endif
+
+#if defined(CRYPTOPP_POWER7_AVAILABLE)
+# include "ppc-simd.h"
+#endif
+
+// Disable POWER7 on PowerPC big-endian machines. BLAKE2s runs slower than C++.
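
The guard implementing that disable follows; after the preamble, the rest of the file is dominated by message-permutation macros. For orientation: every BLAKE2S_G1/BLAKE2S_G2 pair below is a four-lane rendering of the scalar BLAKE2s G function from RFC 7693, sketched here for reference only (this sketch is not part of the patch):

    #include <cstdint>

    // BLAKE2s G function (RFC 7693). The SIMD paths below apply these same
    // steps to four columns or diagonals at once; the rotation amounts
    // 16, 12, 8 and 7 are what MM_ROTI_EPI32, vrorq_n_u32* and the
    // vec_ror_* helpers implement.
    inline uint32_t Rotr32(uint32_t x, unsigned int n)
    {
        return (x >> n) | (x << (32 - n));
    }

    inline void BLAKE2S_G_Ref(uint32_t& a, uint32_t& b, uint32_t& c,
                              uint32_t& d, uint32_t x, uint32_t y)
    {
        a += b + x;  d = Rotr32(d ^ a, 16);
        c += d;      b = Rotr32(b ^ c, 12);
        a += b + y;  d = Rotr32(d ^ a, 8);
        c += d;      b = Rotr32(b ^ c, 7);
    }

BLAKE2S_G1 performs the first half (rotations by 16 and 12) and BLAKE2S_G2 the second (8 and 7); the BLAKE2S_LOAD_MSG_* macros only gather the scheduled message words x and y into each lane.
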
+#if defined(__powerpc__) && defined(__BIG_ENDIAN__) +# undef CRYPTOPP_POWER7_AVAILABLE +#endif + +ANONYMOUS_NAMESPACE_BEGIN + +using CryptoPP::word32; +using CryptoPP::word64; + +#if (CRYPTOPP_SSE41_AVAILABLE || CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_POWER7_AVAILABLE) + +CRYPTOPP_ALIGN_DATA(16) +const word32 BLAKE2S_IV[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL +}; + +#endif + +#if (CRYPTOPP_SSE41_AVAILABLE || CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_POWER8_AVAILABLE) + +CRYPTOPP_ALIGN_DATA(16) +const word64 BLAKE2B_IV[8] = { + W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b), + W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1), + W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f), + W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179) +}; + +#endif // CRYPTOPP_SSE41_AVAILABLE || CRYPTOPP_ARM_NEON_AVAILABLE + +ANONYMOUS_NAMESPACE_END + +NAMESPACE_BEGIN(CryptoPP) + +#if CRYPTOPP_SSE41_AVAILABLE + +#define LOADU(p) _mm_loadu_si128( (const __m128i *)(const void*)(p) ) +#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void*)(p), r) +#define TOF(reg) _mm_castsi128_ps((reg)) +#define TOI(reg) _mm_castps_si128((reg)) + +void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State& state) +{ + #define BLAKE2S_LOAD_MSG_0_1(buf) \ + buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0))); + + #define BLAKE2S_LOAD_MSG_0_2(buf) \ + buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1))); + + #define BLAKE2S_LOAD_MSG_0_3(buf) \ + buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0))); + + #define BLAKE2S_LOAD_MSG_0_4(buf) \ + buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1))); + + #define BLAKE2S_LOAD_MSG_1_1(buf) \ + t0 = _mm_blend_epi16(m1, m2, 0x0C); \ + t1 = _mm_slli_si128(m3, 4); \ + t2 = _mm_blend_epi16(t0, t1, 0xF0); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); + + #define BLAKE2S_LOAD_MSG_1_2(buf) \ + t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \ + t1 = _mm_blend_epi16(m1,m3,0xC0); \ + t2 = _mm_blend_epi16(t0, t1, 0xF0); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); + + #define BLAKE2S_LOAD_MSG_1_3(buf) \ + t0 = _mm_slli_si128(m1, 4); \ + t1 = _mm_blend_epi16(m2, t0, 0x30); \ + t2 = _mm_blend_epi16(m0, t1, 0xF0); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); + + #define BLAKE2S_LOAD_MSG_1_4(buf) \ + t0 = _mm_unpackhi_epi32(m0,m1); \ + t1 = _mm_slli_si128(m3, 4); \ + t2 = _mm_blend_epi16(t0, t1, 0x0C); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); + + #define BLAKE2S_LOAD_MSG_2_1(buf) \ + t0 = _mm_unpackhi_epi32(m2,m3); \ + t1 = _mm_blend_epi16(m3,m1,0x0C); \ + t2 = _mm_blend_epi16(t0, t1, 0x0F); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); + + #define BLAKE2S_LOAD_MSG_2_2(buf) \ + t0 = _mm_unpacklo_epi32(m2,m0); \ + t1 = _mm_blend_epi16(t0, m0, 0xF0); \ + t2 = _mm_slli_si128(m3, 8); \ + buf = _mm_blend_epi16(t1, t2, 0xC0); + + #define BLAKE2S_LOAD_MSG_2_3(buf) \ + t0 = _mm_blend_epi16(m0, m2, 0x3C); \ + t1 = _mm_srli_si128(m1, 12); \ + t2 = _mm_blend_epi16(t0,t1,0x03); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2)); + + #define BLAKE2S_LOAD_MSG_2_4(buf) \ + t0 = _mm_slli_si128(m3, 4); \ + t1 = _mm_blend_epi16(m0, m1, 0x33); \ + t2 = _mm_blend_epi16(t1, t0, 0xC0); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3)); + + #define BLAKE2S_LOAD_MSG_3_1(buf) \ + t0 = _mm_unpackhi_epi32(m0,m1); \ + t1 = _mm_unpackhi_epi32(t0, m2); \ + t2 = _mm_blend_epi16(t1, m3, 0x0C); \ + buf = _mm_shuffle_epi32(t2, 
_MM_SHUFFLE(3,1,0,2)); + + #define BLAKE2S_LOAD_MSG_3_2(buf) \ + t0 = _mm_slli_si128(m2, 8); \ + t1 = _mm_blend_epi16(m3,m0,0x0C); \ + t2 = _mm_blend_epi16(t1, t0, 0xC0); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); + + #define BLAKE2S_LOAD_MSG_3_3(buf) \ + t0 = _mm_blend_epi16(m0,m1,0x0F); \ + t1 = _mm_blend_epi16(t0, m3, 0xC0); \ + buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); + + #define BLAKE2S_LOAD_MSG_3_4(buf) \ + t0 = _mm_unpacklo_epi32(m0,m2); \ + t1 = _mm_unpackhi_epi32(m1,m2); \ + buf = _mm_unpacklo_epi64(t1,t0); + + #define BLAKE2S_LOAD_MSG_4_1(buf) \ + t0 = _mm_unpacklo_epi64(m1,m2); \ + t1 = _mm_unpackhi_epi64(m0,m2); \ + t2 = _mm_blend_epi16(t0,t1,0x33); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); + + #define BLAKE2S_LOAD_MSG_4_2(buf) \ + t0 = _mm_unpackhi_epi64(m1,m3); \ + t1 = _mm_unpacklo_epi64(m0,m1); \ + buf = _mm_blend_epi16(t0,t1,0x33); + + #define BLAKE2S_LOAD_MSG_4_3(buf) \ + t0 = _mm_unpackhi_epi64(m3,m1); \ + t1 = _mm_unpackhi_epi64(m2,m0); \ + buf = _mm_blend_epi16(t1,t0,0x33); + + #define BLAKE2S_LOAD_MSG_4_4(buf) \ + t0 = _mm_blend_epi16(m0,m2,0x03); \ + t1 = _mm_slli_si128(t0, 8); \ + t2 = _mm_blend_epi16(t1,m3,0x0F); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3)); + + #define BLAKE2S_LOAD_MSG_5_1(buf) \ + t0 = _mm_unpackhi_epi32(m0,m1); \ + t1 = _mm_unpacklo_epi32(m0,m2); \ + buf = _mm_unpacklo_epi64(t0,t1); + + #define BLAKE2S_LOAD_MSG_5_2(buf) \ + t0 = _mm_srli_si128(m2, 4); \ + t1 = _mm_blend_epi16(m0,m3,0x03); \ + buf = _mm_blend_epi16(t1,t0,0x3C); + + #define BLAKE2S_LOAD_MSG_5_3(buf) \ + t0 = _mm_blend_epi16(m1,m0,0x0C); \ + t1 = _mm_srli_si128(m3, 4); \ + t2 = _mm_blend_epi16(t0,t1,0x30); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0)); + + #define BLAKE2S_LOAD_MSG_5_4(buf) \ + t0 = _mm_unpacklo_epi64(m1,m2); \ + t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \ + buf = _mm_blend_epi16(t0,t1,0x33); + + #define BLAKE2S_LOAD_MSG_6_1(buf) \ + t0 = _mm_slli_si128(m1, 12); \ + t1 = _mm_blend_epi16(m0,m3,0x33); \ + buf = _mm_blend_epi16(t1,t0,0xC0); + + #define BLAKE2S_LOAD_MSG_6_2(buf) \ + t0 = _mm_blend_epi16(m3,m2,0x30); \ + t1 = _mm_srli_si128(m1, 4); \ + t2 = _mm_blend_epi16(t0,t1,0x03); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0)); + + #define BLAKE2S_LOAD_MSG_6_3(buf) \ + t0 = _mm_unpacklo_epi64(m0,m2); \ + t1 = _mm_srli_si128(m1, 4); \ + buf = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0)); + + #define BLAKE2S_LOAD_MSG_6_4(buf) \ + t0 = _mm_unpackhi_epi32(m1,m2); \ + t1 = _mm_unpackhi_epi64(m0,t0); \ + buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); + + #define BLAKE2S_LOAD_MSG_7_1(buf) \ + t0 = _mm_unpackhi_epi32(m0,m1); \ + t1 = _mm_blend_epi16(t0,m3,0x0F); \ + buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1)); + + #define BLAKE2S_LOAD_MSG_7_2(buf) \ + t0 = _mm_blend_epi16(m2,m3,0x30); \ + t1 = _mm_srli_si128(m0,4); \ + t2 = _mm_blend_epi16(t0,t1,0x03); \ + buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3)); + + #define BLAKE2S_LOAD_MSG_7_3(buf) \ + t0 = _mm_unpackhi_epi64(m0,m3); \ + t1 = _mm_unpacklo_epi64(m1,m2); \ + t2 = _mm_blend_epi16(t0,t1,0x3C); \ + buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1)); + + #define BLAKE2S_LOAD_MSG_7_4(buf) \ + t0 = _mm_unpacklo_epi32(m0,m1); \ + t1 = _mm_unpackhi_epi32(m1,m2); \ + buf = _mm_unpacklo_epi64(t0,t1); + + #define BLAKE2S_LOAD_MSG_8_1(buf) \ + t0 = _mm_unpackhi_epi32(m1,m3); \ + t1 = _mm_unpacklo_epi64(t0,m0); \ + t2 = _mm_blend_epi16(t1,m2,0xC0); \ + buf = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2)); + + #define 
BLAKE2S_LOAD_MSG_8_2(buf) \ + t0 = _mm_unpackhi_epi32(m0,m3); \ + t1 = _mm_blend_epi16(m2,t0,0xF0); \ + buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3)); + + #define BLAKE2S_LOAD_MSG_8_3(buf) \ + t0 = _mm_blend_epi16(m2,m0,0x0C); \ + t1 = _mm_slli_si128(t0,4); \ + buf = _mm_blend_epi16(t1,m3,0x0F); + + #define BLAKE2S_LOAD_MSG_8_4(buf) \ + t0 = _mm_blend_epi16(m1,m0,0x30); \ + buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2)); + + #define BLAKE2S_LOAD_MSG_9_1(buf) \ + t0 = _mm_blend_epi16(m0,m2,0x03); \ + t1 = _mm_blend_epi16(m1,m2,0x30); \ + t2 = _mm_blend_epi16(t1,t0,0x0F); \ + buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2)); + + #define BLAKE2S_LOAD_MSG_9_2(buf) \ + t0 = _mm_slli_si128(m0,4); \ + t1 = _mm_blend_epi16(m1,t0,0xC0); \ + buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3)); + + #define BLAKE2S_LOAD_MSG_9_3(buf) \ + t0 = _mm_unpackhi_epi32(m0,m3); \ + t1 = _mm_unpacklo_epi32(m2,m3); \ + t2 = _mm_unpackhi_epi64(t0,t1); \ + buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1)); + + #define BLAKE2S_LOAD_MSG_9_4(buf) \ + t0 = _mm_blend_epi16(m3,m2,0xC0); \ + t1 = _mm_unpacklo_epi32(m0,m3); \ + t2 = _mm_blend_epi16(t0,t1,0x0F); \ + buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3)); + +#ifdef __XOP__ +# define MM_ROTI_EPI32(r, c) \ + _mm_roti_epi32(r, c) +#else +# define MM_ROTI_EPI32(r, c) ( \ + (8==-(c)) ? _mm_shuffle_epi8(r,r8) \ + : (16==-(c)) ? _mm_shuffle_epi8(r,r16) \ + : _mm_xor_si128(_mm_srli_epi32( (r), -(c) ), \ + _mm_slli_epi32( (r), 32-(-(c)) )) ) +#endif + +#define BLAKE2S_G1(row1,row2,row3,row4,buf) \ + row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \ + row4 = _mm_xor_si128( row4, row1 ); \ + row4 = MM_ROTI_EPI32(row4, -16); \ + row3 = _mm_add_epi32( row3, row4 ); \ + row2 = _mm_xor_si128( row2, row3 ); \ + row2 = MM_ROTI_EPI32(row2, -12); + +#define BLAKE2S_G2(row1,row2,row3,row4,buf) \ + row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \ + row4 = _mm_xor_si128( row4, row1 ); \ + row4 = MM_ROTI_EPI32(row4, -8); \ + row3 = _mm_add_epi32( row3, row4 ); \ + row2 = _mm_xor_si128( row2, row3 ); \ + row2 = MM_ROTI_EPI32(row2, -7); + +#define DIAGONALIZE(row1,row2,row3,row4) \ + row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \ + row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \ + row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) ); + +#define UNDIAGONALIZE(row1,row2,row3,row4) \ + row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \ + row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \ + row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) ); + +#define BLAKE2S_ROUND(r) \ + BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \ + BLAKE2S_G1(row1,row2,row3,row4,buf1); \ + BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \ + BLAKE2S_G2(row1,row2,row3,row4,buf2); \ + DIAGONALIZE(row1,row2,row3,row4); \ + BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \ + BLAKE2S_G1(row1,row2,row3,row4,buf3); \ + BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \ + BLAKE2S_G2(row1,row2,row3,row4,buf4); \ + UNDIAGONALIZE(row1,row2,row3,row4); + + __m128i row1, row2, row3, row4; + __m128i buf1, buf2, buf3, buf4; + __m128i t0, t1, t2, ff0, ff1; + + const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 ); + const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 ); + + const __m128i m0 = LOADU( input + 00 ); + const __m128i m1 = LOADU( input + 16 ); + const __m128i m2 = LOADU( input + 32 ); + const __m128i m3 = LOADU( input + 48 ); + + row1 = ff0 = LOADU( &state.h[0] ); + row2 = ff1 = LOADU( &state.h[4] ); + row3 = LOADU( &BLAKE2S_IV[0] ); + row4 = 
_mm_xor_si128( LOADU( &BLAKE2S_IV[4] ), LOADU( &state.t[0] ) ); + + BLAKE2S_ROUND( 0 ); + BLAKE2S_ROUND( 1 ); + BLAKE2S_ROUND( 2 ); + BLAKE2S_ROUND( 3 ); + BLAKE2S_ROUND( 4 ); + BLAKE2S_ROUND( 5 ); + BLAKE2S_ROUND( 6 ); + BLAKE2S_ROUND( 7 ); + BLAKE2S_ROUND( 8 ); + BLAKE2S_ROUND( 9 ); + + STOREU( &state.h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) ); + STOREU( &state.h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) ); +} +#endif // CRYPTOPP_SSE41_AVAILABLE + +#if CRYPTOPP_ARM_NEON_AVAILABLE +void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State& state) +{ + #define BLAKE2S_LOAD_MSG_0_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[0]; \ + t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[0]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_0_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[1]; \ + t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_0_3(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[0]; \ + t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_0_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[1]; \ + t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_1_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \ + t1 = vzip_u32(vget_low_u32(m2), vget_low_u32(m3)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_1_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \ + t1 = vext_u32(vget_high_u32(m3), vget_high_u32(m1), 1); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_1_3(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m0), 1); \ + t1 = vzip_u32(vget_high_u32(m2), vget_low_u32(m1)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_1_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m0)).val[0]; \ + t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_2_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vext_u32(vget_high_u32(m2), vget_low_u32(m3), 1); \ + t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_2_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[0]; \ + t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m3)); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_2_3(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m2), vget_high_u32(m0)); \ + t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m2)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_2_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m3), vget_high_u32(m1)).val[0]; \ + t1 = vext_u32(vget_low_u32(m0), vget_low_u32(m1), 1); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_3_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; 
+    #define BLAKE2S_LOAD_MSG_0_1(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[0]; \
+    t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[0]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_0_2(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[1]; \
+    t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_0_3(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[0]; \
+    t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_0_4(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[1]; \
+    t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_1_1(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \
+    t1 = vzip_u32(vget_low_u32(m2), vget_low_u32(m3)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_1_2(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \
+    t1 = vext_u32(vget_high_u32(m3), vget_high_u32(m1), 1); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_1_3(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m0), 1); \
+    t1 = vzip_u32(vget_high_u32(m2), vget_low_u32(m1)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_1_4(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m0)).val[0]; \
+    t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_2_1(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vext_u32(vget_high_u32(m2), vget_low_u32(m3), 1); \
+    t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_2_2(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[0]; \
+    t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m3)); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_2_3(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m2), vget_high_u32(m0)); \
+    t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m2)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_2_4(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_high_u32(m3), vget_high_u32(m1)).val[0]; \
+    t1 = vext_u32(vget_low_u32(m0), vget_low_u32(m1), 1); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_3_1(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
+    t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_3_2(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[1]; \
+    t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_3_3(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m1)); \
+    t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_3_4(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \
+    t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_4_1(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m1)).val[1]; \
+    t1 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m2)).val[0]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_4_2(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m0), vget_high_u32(m1)); \
+    t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_4_3(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_high_u32(m2)); \
+    t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_high_u32(m0)); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_4_4(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m3), 1); \
+    t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m2), vget_low_u32(m3)); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_5_1(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m1)).val[0]; \
+    t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_5_2(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[0]; \
+    t1 = vzip_u32(vget_high_u32(m2), vget_high_u32(m0)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_5_3(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m1)); \
+    t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m0)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_5_4(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m3), vget_low_u32(m1)).val[1]; \
+    t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_low_u32(m2)); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_6_1(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m0)); \
+    t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_6_2(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
+    t1 = vext_u32(vget_low_u32(m3), vget_high_u32(m2), 1); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_6_3(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m1)).val[0]; \
+    t1 = vext_u32(vget_low_u32(m2), vget_low_u32(m2), 1); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_6_4(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
+    t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m2)); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_7_1(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m1)).val[1]; \
+    t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_high_u32(m0)); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_7_2(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \
+    t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_7_3(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
+    t1 = vzip_u32(vget_low_u32(m2), vget_high_u32(m0)).val[0]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_7_4(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_low_u32(m0), vget_low_u32(m1)).val[0]; \
+    t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_8_1(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m3)).val[0]; \
+    t1 = vext_u32(vget_high_u32(m2), vget_low_u32(m0), 1); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_8_2(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \
+    t1 = vext_u32(vget_high_u32(m0), vget_low_u32(m2), 1); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_8_3(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m3)); \
+    t1 = vext_u32(vget_low_u32(m0), vget_high_u32(m2), 1); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_8_4(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m1)); \
+    t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_low_u32(m1)); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_9_1(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \
+    t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m0)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_9_2(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m1)).val[0]; \
+    t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_low_u32(m1)); \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_9_3(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \
+    t1 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m3)).val[1]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define BLAKE2S_LOAD_MSG_9_4(buf) \
+    do { uint32x2_t t0, t1; \
+    t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \
+    t1 = vzip_u32(vget_low_u32(m3), vget_low_u32(m0)).val[0]; \
+    buf = vcombine_u32(t0, t1); } while(0)
+
+    #define vrorq_n_u32_16(x) vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)))
+
+    #define vrorq_n_u32_8(x) vsriq_n_u32(vshlq_n_u32((x), 24), (x), 8)
+
+    #define vrorq_n_u32(x, c) vsriq_n_u32(vshlq_n_u32((x), 32-(c)), (x), (c))
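+    // vrorq_n_u32(x, c) is a 32-bit rotate-right by c: the value is shifted
+    // left by 32-c, then vsriq_n_u32 shifts x right by c and inserts it into
+    // the result. The rotate by 16 is cheaper as a vrev32q_u16 half-word swap.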
+
+    #define BLAKE2S_G1(row1,row2,row3,row4,buf) \
+    do { \
+    row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \
+    row4 = vrorq_n_u32_16(row4); row3 = vaddq_u32(row3, row4); \
+    row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 12); \
+    } while(0)
+
+    #define BLAKE2S_G2(row1,row2,row3,row4,buf) \
+    do { \
+    row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \
+    row4 = vrorq_n_u32_8(row4); row3 = vaddq_u32(row3, row4); \
+    row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 7); \
+    } while(0)
+
+    #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \
+    do { \
+    row4 = vextq_u32(row4, row4, 3); row3 = vextq_u32(row3, row3, 2); row2 = vextq_u32(row2, row2, 1); \
+    } while(0)
+
+    #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \
+    do { \
+    row4 = vextq_u32(row4, row4, 1); \
+    row3 = vextq_u32(row3, row3, 2); \
+    row2 = vextq_u32(row2, row2, 3); \
+    } while(0)
+
+    #define BLAKE2S_ROUND(r) \
+    do { \
+    uint32x4_t buf1, buf2, buf3, buf4; \
+    BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
+    BLAKE2S_G1(row1,row2,row3,row4,buf1); \
+    BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \
+    BLAKE2S_G2(row1,row2,row3,row4,buf2); \
+    BLAKE2S_DIAGONALIZE(row1,row2,row3,row4); \
+    BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \
+    BLAKE2S_G1(row1,row2,row3,row4,buf3); \
+    BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \
+    BLAKE2S_G2(row1,row2,row3,row4,buf4); \
+    BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4); \
+    } while(0)
+
+    CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf<word32>()));
+    CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf<word32>()));
+    CRYPTOPP_ASSERT(IsAlignedOn(&state.f[0],GetAlignmentOf<word32>()));
+
+    const uint32x4_t m0 = vreinterpretq_u32_u8(vld1q_u8((input + 00)));
+    const uint32x4_t m1 = vreinterpretq_u32_u8(vld1q_u8((input + 16)));
+    const uint32x4_t m2 = vreinterpretq_u32_u8(vld1q_u8((input + 32)));
+    const uint32x4_t m3 = vreinterpretq_u32_u8(vld1q_u8((input + 48)));
+
+    uint32x4_t row1, row2, row3, row4;
+
+    const uint32x4_t f0 = row1 = vld1q_u32(&state.h[0]);
+    const uint32x4_t f1 = row2 = vld1q_u32(&state.h[4]);
+    row3 = vld1q_u32(&BLAKE2S_IV[0]);
+    row4 = veorq_u32(vld1q_u32(&BLAKE2S_IV[4]), vld1q_u32(&state.t[0]));
+
+    BLAKE2S_ROUND(0);
+    BLAKE2S_ROUND(1);
+    BLAKE2S_ROUND(2);
+    BLAKE2S_ROUND(3);
+    BLAKE2S_ROUND(4);
+    BLAKE2S_ROUND(5);
+    BLAKE2S_ROUND(6);
+    BLAKE2S_ROUND(7);
+    BLAKE2S_ROUND(8);
+    BLAKE2S_ROUND(9);
+
+    vst1q_u32(&state.h[0], veorq_u32(f0, veorq_u32(row1, row3)));
+    vst1q_u32(&state.h[4], veorq_u32(f1, veorq_u32(row2, row4)));
+}
+#endif // CRYPTOPP_ARM_NEON_AVAILABLE
+
+#if (CRYPTOPP_POWER7_AVAILABLE)
+
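+// Unaligned 32-bit word loads and stores over POWER7 VSX. The *LE variants
+// add a byte permute on big-endian targets so the message and state are
+// always accessed little-endian, as BLAKE2 requires.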
+inline uint32x4_p VectorLoad32(const void* p)
+{
+#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
+    return (uint32x4_p)vec_xl(0, (uint8_t*)p);
+#else
+    return (uint32x4_p)vec_vsx_ld(0, (uint8_t*)p);
+#endif
+}
+
+inline uint32x4_p VectorLoad32LE(const void* p)
+{
+#if __BIG_ENDIAN__
+    const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
+    const uint32x4_p v = VectorLoad32(p);
+    return vec_perm(v, v, m);
+#else
+    return VectorLoad32(p);
+#endif
+}
+
+inline void VectorStore32(void* p, const uint32x4_p x)
+{
+#if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
+    vec_xst((uint8x16_p)x,0,(uint8_t*)p);
+#else
+    vec_vsx_st((uint8x16_p)x,0,(uint8_t*)p);
+#endif
+}
+
+inline void VectorStore32LE(void* p, const uint32x4_p x)
+{
+#if __BIG_ENDIAN__
+    const uint8x16_p m = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
+    VectorStore32(p, vec_perm(x, x, m));
+#else
+    VectorStore32(p, x);
+#endif
+}
+
+template <unsigned int C>
+inline uint8x16_p VectorShiftLeftOctet(const uint8x16_p a)
+{
+#if __BIG_ENDIAN__
+    return (uint8x16_p)vec_sld((uint8x16_p)a, (uint8x16_p)a, C);
+#else
+    return (uint8x16_p)vec_sld((uint8x16_p)a, (uint8x16_p)a, 16-C);
+#endif
+}
+
+template <unsigned int C>
+inline uint32x4_p VectorShiftLeftOctet(const uint32x4_p a)
+{
+#if __BIG_ENDIAN__
+    return (uint32x4_p)vec_sld((uint8x16_p)a, (uint8x16_p)a, C);
+#else
+    return (uint32x4_p)vec_sld((uint8x16_p)a, (uint8x16_p)a, 16-C);
+#endif
+}
+
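+// VectorSet32<E1,E2>(a, b) emulates SSE-style element selection: word 0 of
+// the result is a[E1] and word 1 is b[E2]; words 2 and 3 are don't-care
+// values (the DC byte index in each mask).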
+template <unsigned int E1, unsigned int E2>
+inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b)
+{
+    // Re-index
+    enum {X=E1&3, Y=E2&3};
+
+    // Don't care element
+    const unsigned int DC = 31;
+
+    // Element 0 combinations
+    if (X == 0 && Y == 0)
+    {
+        const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, b, mask);
+    }
+    else if (X == 0 && Y == 1)
+    {
+        const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, VectorShiftLeftOctet<4>(b), mask);
+    }
+    else if (X == 0 && Y == 2)
+    {
+        const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, VectorShiftLeftOctet<8>(b), mask);
+    }
+    else if (X == 0 && Y == 3)
+    {
+        const uint8x16_p mask = {0,1,2,3, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, VectorShiftLeftOctet<12>(b), mask);
+    }
+
+    // Element 1 combinations
+    else if (X == 1 && Y == 0)
+    {
+        const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, b, mask);
+    }
+    else if (X == 1 && Y == 1)
+    {
+        const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, VectorShiftLeftOctet<4>(b), mask);
+    }
+    else if (X == 1 && Y == 2)
+    {
+        const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, VectorShiftLeftOctet<8>(b), mask);
+    }
+    else if (X == 1 && Y == 3)
+    {
+        const uint8x16_p mask = {4,5,6,7, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, VectorShiftLeftOctet<12>(b), mask);
+    }
+
+    // Element 2 combinations
+    else if (X == 2 && Y == 0)
+    {
+        const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, b, mask);
+    }
+    else if (X == 2 && Y == 1)
+    {
+        const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, VectorShiftLeftOctet<4>(b), mask);
+    }
+    else if (X == 2 && Y == 2)
+    {
+        const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, VectorShiftLeftOctet<8>(b), mask);
+    }
+    else if (X == 2 && Y == 3)
+    {
+        const uint8x16_p mask = {8,9,10,11, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, VectorShiftLeftOctet<12>(b), mask);
+    }
+
+    // Element 3 combinations
+    else if (X == 3 && Y == 0)
+    {
+        const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, b, mask);
+    }
+    else if (X == 3 && Y == 1)
+    {
+        const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, VectorShiftLeftOctet<4>(b), mask);
+    }
+    else if (X == 3 && Y == 2)
+    {
+        const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, VectorShiftLeftOctet<8>(b), mask);
+    }
+    else if (X == 3 && Y == 3)
+    {
+        const uint8x16_p mask = {12,13,14,15, 16,17,18,19, DC,DC,DC,DC, DC,DC,DC,DC};
+        return vec_perm(a, VectorShiftLeftOctet<12>(b), mask);
+    }
+}
+
+template <unsigned int E1, unsigned int E2, unsigned int E3, unsigned int E4>
+inline uint32x4_p VectorSet32(const uint32x4_p a, const uint32x4_p b,
+                              const uint32x4_p c, const uint32x4_p d)
+{
+    // Re-index
+    enum {W=E1&3, X=E2&3, Y=E3&3, Z=E4&3};
+
+    const uint32x4_p t0 = VectorSet32<W,X>(a, b);
+    const uint32x4_p t1 = VectorSet32<Y,Z>(c, d);
+
+    // Power7 follows SSE2's implementation; this is the equivalent of _mm_set_epi32.
+    const uint8x16_p mask = {20,21,22,23, 16,17,18,19, 4,5,6,7, 0,1,2,3};
+    const uint32x4_p r = vec_perm(t0, t1, mask);
+    return r;
+}
+
+template<>
+uint32x4_p VectorSet32<2,0,2,0>(const uint32x4_p a, const uint32x4_p b,
+                                const uint32x4_p c, const uint32x4_p d)
+{
+    // a=b, c=d, mask is {2,0, 2,0}
+    const uint8x16_p mask = {16,17,18,19, 24,25,26,27, 0,1,2,3, 8,9,10,11};
+    const uint32x4_p r = vec_perm(a, c, mask);
+    return r;
+}
+
+template<>
+uint32x4_p VectorSet32<3,1,3,1>(const uint32x4_p a, const uint32x4_p b,
+                                const uint32x4_p c, const uint32x4_p d)
+{
+    // a=b, c=d, mask is {3,1, 3,1}
+    const uint8x16_p mask = {20,21,22,23, 28,29,30,31, 4,5,6,7, 12,13,14,15};
+    const uint32x4_p r = vec_perm(a, c, mask);
+    return r;
+}
+
+void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2_State<word32, true>& state)
+{
+    # define m1 m0
+    # define m2 m0
+    # define m3 m0
+
+    # define m5 m4
+    # define m6 m4
+    # define m7 m4
+
+    # define m9 m8
+    # define m10 m8
+    # define m11 m8
+
+    # define m13 m12
+    # define m14 m12
+    # define m15 m12
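+    // The sixteen message words live in four vectors: m0 holds words 0..3,
+    // m4 holds 4..7, m8 holds 8..11 and m12 holds 12..15. The aliases above
+    // let the load tables keep the original SSE word numbering while
+    // VectorSet32 selects elements modulo 4 within each vector.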
+
+    // #define BLAKE2S_LOAD_MSG_0_1(buf) buf = VectorSet32<6,4,2,0>(m6,m4,m2,m0);
+    #define BLAKE2S_LOAD_MSG_0_1(buf) buf = VectorSet32<2,0,2,0>(m6,m4,m2,m0);
+    // #define BLAKE2S_LOAD_MSG_0_2(buf) buf = VectorSet32<7,5,3,1>(m7,m5,m3,m1);
+    #define BLAKE2S_LOAD_MSG_0_2(buf) buf = VectorSet32<3,1,3,1>(m7,m5,m3,m1);
+    // #define BLAKE2S_LOAD_MSG_0_3(buf) buf = VectorSet32<14,12,10,8>(m14,m12,m10,m8);
+    #define BLAKE2S_LOAD_MSG_0_3(buf) buf = VectorSet32<2,0,2,0>(m14,m12,m10,m8);
+    // #define BLAKE2S_LOAD_MSG_0_4(buf) buf = VectorSet32<15,13,11,9>(m15,m13,m11,m9);
+    #define BLAKE2S_LOAD_MSG_0_4(buf) buf = VectorSet32<3,1,3,1>(m15,m13,m11,m9);
+
+    #define BLAKE2S_LOAD_MSG_1_1(buf) buf = VectorSet32<13,9,4,14>(m13,m9,m4,m14);
+    #define BLAKE2S_LOAD_MSG_1_2(buf) buf = VectorSet32<6,15,8,10>(m6,m15,m8,m10)
+    #define BLAKE2S_LOAD_MSG_1_3(buf) buf = VectorSet32<5,11,0,1>(m5,m11,m0,m1)
+    #define BLAKE2S_LOAD_MSG_1_4(buf) buf = VectorSet32<3,7,2,12>(m3,m7,m2,m12)
+
+    #define BLAKE2S_LOAD_MSG_2_1(buf) buf = VectorSet32<15,5,12,11>(m15,m5,m12,m11)
+    #define BLAKE2S_LOAD_MSG_2_2(buf) buf = VectorSet32<13,2,0,8>(m13,m2,m0,m8)
+    #define BLAKE2S_LOAD_MSG_2_3(buf) buf = VectorSet32<9,7,3,10>(m9,m7,m3,m10)
+    #define BLAKE2S_LOAD_MSG_2_4(buf) buf = VectorSet32<4,1,6,14>(m4,m1,m6,m14)
+
+    #define BLAKE2S_LOAD_MSG_3_1(buf) buf = VectorSet32<11,13,3,7>(m11,m13,m3,m7)
+    #define BLAKE2S_LOAD_MSG_3_2(buf) buf = VectorSet32<14,12,1,9>(m14,m12,m1,m9)
+    #define BLAKE2S_LOAD_MSG_3_3(buf) buf = VectorSet32<15,4,5,2>(m15,m4,m5,m2)
+    #define BLAKE2S_LOAD_MSG_3_4(buf) buf = VectorSet32<8,0,10,6>(m8,m0,m10,m6)
+
+    #define BLAKE2S_LOAD_MSG_4_1(buf) buf = VectorSet32<10,2,5,9>(m10,m2,m5,m9)
+    #define BLAKE2S_LOAD_MSG_4_2(buf) buf = VectorSet32<15,4,7,0>(m15,m4,m7,m0)
+    #define BLAKE2S_LOAD_MSG_4_3(buf) buf = VectorSet32<3,6,11,14>(m3,m6,m11,m14)
+    #define BLAKE2S_LOAD_MSG_4_4(buf) buf = VectorSet32<13,8,12,1>(m13,m8,m12,m1)
+
+    #define BLAKE2S_LOAD_MSG_5_1(buf) buf = VectorSet32<8,0,6,2>(m8,m0,m6,m2)
+    #define BLAKE2S_LOAD_MSG_5_2(buf) buf = VectorSet32<3,11,10,12>(m3,m11,m10,m12)
+    #define BLAKE2S_LOAD_MSG_5_3(buf) buf = VectorSet32<1,15,7,4>(m1,m15,m7,m4)
+    #define BLAKE2S_LOAD_MSG_5_4(buf) buf = VectorSet32<9,14,5,13>(m9,m14,m5,m13)
+
+    #define BLAKE2S_LOAD_MSG_6_1(buf) buf = VectorSet32<4,14,1,12>(m4,m14,m1,m12)
+    #define BLAKE2S_LOAD_MSG_6_2(buf) buf = VectorSet32<10,13,15,5>(m10,m13,m15,m5)
+    #define BLAKE2S_LOAD_MSG_6_3(buf) buf = VectorSet32<8,9,6,0>(m8,m9,m6,m0)
+    #define BLAKE2S_LOAD_MSG_6_4(buf) buf = VectorSet32<11,2,3,7>(m11,m2,m3,m7)
+
+    #define BLAKE2S_LOAD_MSG_7_1(buf) buf = VectorSet32<3,12,7,13>(m3,m12,m7,m13)
+    #define BLAKE2S_LOAD_MSG_7_2(buf) buf = VectorSet32<9,1,14,11>(m9,m1,m14,m11)
+    #define BLAKE2S_LOAD_MSG_7_3(buf) buf = VectorSet32<2,8,15,5>(m2,m8,m15,m5)
+    #define BLAKE2S_LOAD_MSG_7_4(buf) buf = VectorSet32<10,6,4,0>(m10,m6,m4,m0)
+
+    #define BLAKE2S_LOAD_MSG_8_1(buf) buf = VectorSet32<0,11,14,6>(m0,m11,m14,m6)
+    #define BLAKE2S_LOAD_MSG_8_2(buf) buf = VectorSet32<8,3,9,15>(m8,m3,m9,m15)
+    #define BLAKE2S_LOAD_MSG_8_3(buf) buf = VectorSet32<10,1,13,12>(m10,m1,m13,m12)
+    #define BLAKE2S_LOAD_MSG_8_4(buf) buf = VectorSet32<5,4,7,2>(m5,m4,m7,m2)
+
+    #define BLAKE2S_LOAD_MSG_9_1(buf) buf = VectorSet32<1,7,8,10>(m1,m7,m8,m10)
+    #define BLAKE2S_LOAD_MSG_9_2(buf) buf = VectorSet32<5,6,4,2>(m5,m6,m4,m2)
+    #define BLAKE2S_LOAD_MSG_9_3(buf) buf = VectorSet32<13,3,9,15>(m13,m3,m9,m15)
+    #define BLAKE2S_LOAD_MSG_9_4(buf) buf = VectorSet32<0,12,14,11>(m0,m12,m14,m11)
+
+    // Altivec has packed 32-bit rotate, but in terms of left rotate
+    const uint32x4_p ROR16_MASK = { 32-16, 32-16, 32-16, 32-16 };
+    const uint32x4_p ROR12_MASK = { 32-12, 32-12, 32-12, 32-12 };
+    const uint32x4_p ROR8_MASK = { 32-8, 32-8, 32-8, 32-8 };
+    const uint32x4_p ROR7_MASK = { 32-7, 32-7, 32-7, 32-7 };
+
+    #define vec_ror_16(x) vec_rl(x, ROR16_MASK)
+    #define vec_ror_12(x) vec_rl(x, ROR12_MASK)
+    #define vec_ror_8(x) vec_rl(x, ROR8_MASK)
+    #define vec_ror_7(x) vec_rl(x, ROR7_MASK)
+
+    #define BLAKE2S_G1(row1,row2,row3,row4,buf) \
+    row1 = vec_add( vec_add( row1, buf), row2 ); \
+    row4 = vec_xor( row4, row1 ); \
+    row4 = vec_ror_16(row4); \
+    row3 = vec_add( row3, row4 ); \
+    row2 = vec_xor( row2, row3 ); \
+    row2 = vec_ror_12(row2);
+
+    #define BLAKE2S_G2(row1,row2,row3,row4,buf) \
+    row1 = vec_add( vec_add( row1, buf), row2 ); \
+    row4 = vec_xor( row4, row1 ); \
+    row4 = vec_ror_8(row4); \
+    row3 = vec_add( row3, row4 ); \
+    row2 = vec_xor( row2, row3 ); \
+    row2 = vec_ror_7(row2);
+
+    const uint8x16_p D2103_MASK = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
+    const uint8x16_p D1032_MASK = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
+    const uint8x16_p D0321_MASK = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};
+
+    #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \
+    row4 = vec_perm( row4, row4, D2103_MASK ); \
+    row3 = vec_perm( row3, row3, D1032_MASK ); \
+    row2 = vec_perm( row2, row2, D0321_MASK );
+
+    #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \
+    row4 = vec_perm( row4, row4, D0321_MASK ); \
+    row3 = vec_perm( row3, row3, D1032_MASK ); \
+    row2 = vec_perm( row2, row2, D2103_MASK );
+
+    #define BLAKE2S_ROUND(r) \
+    BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \
+    BLAKE2S_G1(row1,row2,row3,row4,buf1); \
+    BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \
+    BLAKE2S_G2(row1,row2,row3,row4,buf2); \
+    BLAKE2S_DIAGONALIZE(row1,row2,row3,row4); \
+    BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \
+    BLAKE2S_G1(row1,row2,row3,row4,buf3); \
+    BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \
+    BLAKE2S_G2(row1,row2,row3,row4,buf4); \
+    BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4);
+
+    uint32x4_p row1, row2, row3, row4;
+    uint32x4_p buf1, buf2, buf3, buf4;
+    uint32x4_p ff0, ff1;
+
+    const uint32x4_p m0 = VectorLoad32LE(input + 0);
+    const uint32x4_p m4 = VectorLoad32LE(input + 16);
+    const uint32x4_p m8 = VectorLoad32LE(input + 32);
+    const uint32x4_p m12 = VectorLoad32LE(input + 48);
+
+    row1 = ff0 = VectorLoad32LE( &state.h[0] );
+    row2 = ff1 = VectorLoad32LE( &state.h[4] );
+    row3 = VectorLoad32( &BLAKE2S_IV[0] );
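+    // row4 folds the counter t[0..1] and flags f[0..1] into IV[4..7]
+    // (v12..v15 in the BLAKE2s spec); t and f are adjacent in BLAKE2_State,
+    // so a single vector load covers both.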
+    row4 = vec_xor( VectorLoad32( &BLAKE2S_IV[4] ), VectorLoad32( &state.t[0] ) );
+
+    BLAKE2S_ROUND( 0 );
+    BLAKE2S_ROUND( 1 );
+    BLAKE2S_ROUND( 2 );
+    BLAKE2S_ROUND( 3 );
+    BLAKE2S_ROUND( 4 );
+    BLAKE2S_ROUND( 5 );
+    BLAKE2S_ROUND( 6 );
+    BLAKE2S_ROUND( 7 );
+    BLAKE2S_ROUND( 8 );
+    BLAKE2S_ROUND( 9 );
+
+    VectorStore32LE( &state.h[0], vec_xor( ff0, vec_xor( row1, row3 ) ) );
+    VectorStore32LE( &state.h[4], vec_xor( ff1, vec_xor( row2, row4 ) ) );
+
+    #undef m0
+    #undef m1
+    #undef m2
+    #undef m3
+
+    #undef m4
+    #undef m5
+    #undef m6
+    #undef m7
+
+    #undef m8
+    #undef m9
+    #undef m10
+    #undef m11
+
+    #undef m12
+    #undef m13
+    #undef m14
+    #undef m15
+}
+#endif // CRYPTOPP_POWER7_AVAILABLE
+
+NAMESPACE_END
diff --git a/cryptest.nmake b/cryptest.nmake
index 5464f567..0dc946ff 100644
--- a/cryptest.nmake
+++ b/cryptest.nmake
@@ -53,8 +53,8 @@
 LIB_SRCS = \
     cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp \
     algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp \
-    authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp \
-    blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp \
+    authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2s-simd.cpp \
+    blake2b-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp \
     cbcmac.cpp ccm.cpp chacha-simd.cpp chacha.cpp cham-simd.cpp cham.cpp channels.cpp \
     cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp \
     dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp \
@@ -81,8 +81,8 @@ LIB_SRCS = \
 LIB_OBJS = \
     cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj \
     algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj \
-    authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2-simd.obj \
-    blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj \
+    authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2s-simd.obj \
+    blake2b-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj \
     cbcmac.obj ccm.obj chacha-simd.obj chacha.obj cham-simd.obj cham.obj channels.obj \
     cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj \
     dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj \
diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj
index 8794a5bb..b6917bf5 100644
--- a/cryptlib.vcxproj
+++ b/cryptlib.vcxproj
@@ -182,7 +182,8 @@
-    <ClCompile Include="blake2-simd.cpp" />
+    <ClCompile Include="blake2s-simd.cpp" />
+    <ClCompile Include="blake2b-simd.cpp" />
diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters
index 197bbc77..de1bcea1 100644
--- a/cryptlib.vcxproj.filters
+++ b/cryptlib.vcxproj.filters
@@ -59,7 +59,10 @@
     <ClCompile Include="blake2.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
-    <ClCompile Include="blake2-simd.cpp">
+    <ClCompile Include="blake2s-simd.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="blake2b-simd.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>