diff --git a/Filelist.txt b/Filelist.txt index 7c34dc16..81a6aef2 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -18,6 +18,7 @@ aria.cpp aria_simd.cpp aria.h argnames.h +arm_simd.h asn.cpp asn.h authenc.cpp @@ -146,6 +147,7 @@ gf256.cpp gf256.h gf2_32.cpp gf2_32.h +gf2n_simd.cpp gf2n.cpp gf2n.h gfpcrypt.cpp diff --git a/GNUmakefile b/GNUmakefile index 319755b0..c62098fd 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -305,6 +305,7 @@ ifeq ($(DETECT_FEATURES),1) HAVE_OPT = $(shell $(CXX) $(TCXXFLAGS) $(ZOPT) $(TOPT) $(TPROG) -o $(TOUT) 2>&1 | tr ' ' '\n' | wc -l) ifeq ($(strip $(HAVE_OPT)),0) GCM_FLAG = $(SSSE3_FLAG) $(CLMUL_FLAG) + GF2N_FLAG = $(CLMUL_FLAG) SUN_LDFLAGS += $(CLMUL_FLAG) else CLMUL_FLAG = @@ -523,6 +524,7 @@ ifeq ($(IS_ARMV8),1) HAVE_OPT = $(shell $(CXX) $(CXXFLAGS) $(ACLE_FLAG) $(ZOPT) $(TOPT) $(TPROG) -o $(TOUT) 2>&1 | tr ' ' '\n' | wc -l) ifeq ($(strip $(HAVE_OPT)),0) GCM_FLAG = -march=armv8-a+crypto + GF2N_FLAG = -march=armv8-a+crypto else CXXFLAGS += -DCRYPTOPP_ARM_PMULL_AVAILABLE=0 endif @@ -618,6 +620,7 @@ ifeq ($(DETECT_FEATURES),1) BLAKE2B_FLAG = $(POWER8_FLAG) CRC_FLAG = $(POWER8_FLAG) GCM_FLAG = $(POWER8_FLAG) + GF2N_FLAG = $(POWER8_FLAG) AES_FLAG = $(POWER8_FLAG) SHA_FLAG = $(POWER8_FLAG) SHACAL2_FLAG = $(POWER8_FLAG) @@ -1444,6 +1447,10 @@ chacha_avx.o : chacha_avx.cpp cham_simd.o : cham_simd.cpp $(CXX) $(strip $(CXXFLAGS) $(CHAM_FLAG) -c) $< +# SSE4.2 or ARMv8a available +crc_simd.o : crc_simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(CRC_FLAG) -c) $< + # Power9 available darn.o : darn.cpp $(CXX) $(strip $(CXXFLAGS) $(DARN_FLAG) -c) $< @@ -1452,18 +1459,14 @@ darn.o : darn.cpp donna_sse.o : donna_sse.cpp $(CXX) $(strip $(CXXFLAGS) $(SSE2_FLAG) -c) $< -# SSE2 on i686 -sse_simd.o : sse_simd.cpp - $(CXX) $(strip $(CXXFLAGS) $(SSE2_FLAG) -c) $< - -# SSE4.2 or ARMv8a available -crc_simd.o : crc_simd.cpp - $(CXX) $(strip $(CXXFLAGS) $(CRC_FLAG) -c) $< - -# PCLMUL or ARMv7a/ARMv8a available +# Carryless multiply gcm_simd.o : gcm_simd.cpp $(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $< +# Carryless multiply +gf2n_simd.o : gf2n_simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(GF2N_FLAG) -c) $< + # SSSE3 available lea_simd.o : lea_simd.cpp $(CXX) $(strip $(CXXFLAGS) $(LEA_FLAG) -c) $< @@ -1537,6 +1540,10 @@ sm3.o : sm3.cpp $(CXX) $(strip $(subst -O3,-O2,$(CXXFLAGS)) -c) $< endif +# SSE2 on i686 +sse_simd.o : sse_simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SSE2_FLAG) -c) $< + # Don't build Rijndael with UBsan. Too much noise due to unaligned data accesses. 
ifneq ($(findstring -fsanitize=undefined,$(CXXFLAGS)),) rijndael.o : rijndael.cpp diff --git a/adv_simd.h b/adv_simd.h index 517f0c29..37696c2c 100644 --- a/adv_simd.h +++ b/adv_simd.h @@ -59,17 +59,6 @@ # include #endif -// Thanks to Peter Cordes, https://stackoverflow.com/q/54016821/608639 -#if (CRYPTOPP_ARM_NEON_AVAILABLE) -# ifndef PACK32x4 -# if defined(_MSC_VER) -# define PACK32x4(w,x,y,z) { ((w) + (word64(x) << 32)), ((y) + (word64(z) << 32)) } -# else -# define PACK32x4(w,x,y,z) { (w), (x), (y), (z) } -# endif -# endif // PACK32x4 -#endif // Microsoft workaround - #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) # include # include @@ -124,14 +113,10 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6, CRYPTOPP_ASSERT(outBlocks); CRYPTOPP_ASSERT(length >= 8); -#if (CRYPTOPP_LITTLE_ENDIAN) - const uint32x4_t s_one = PACK32x4(0, 0, 0, 1<<24); - const uint32x4_t s_two = PACK32x4(0, 2<<24, 0, 2<<24); -#else - // TODO: verify these constants on ARM-BE - //const uint32x4_t s_one = PACK32x4(0, 0, 0, 1); - //const uint32x4_t s_two = PACK32x4(0, 2, 0, 2); -#endif + const unsigned int w_one[] = {0, 0<<24, 0, 1<<24}; + const unsigned int w_two[] = {0, 2<<24, 0, 2<<24}; + const uint32x4_t s_one = vld1q_u32(w_one); + const uint32x4_t s_two = vld1q_u32(w_two); const size_t blockSize = 8; const size_t neonBlockSize = 16; @@ -369,14 +354,10 @@ inline size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6, CRYPTOPP_ASSERT(outBlocks); CRYPTOPP_ASSERT(length >= 16); -#if (CRYPTOPP_LITTLE_ENDIAN) - const uint32x4_t s_one = PACK32x4(0, 0, 0, 1<<24); - //const uint32x4_t s_two = PACK32x4(0, 2<<24, 0, 2<<24); -#else - // TODO: verify these constants on ARM-BE - //const uint32x4_t s_one = PACK32x4(0, 0, 0, 1); - //const uint32x4_t s_two = PACK32x4(0, 2, 0, 2); -#endif + const unsigned int w_one[] = {0, 0<<24, 0, 1<<24}; + const unsigned int w_two[] = {0, 2<<24, 0, 2<<24}; + const uint32x4_t s_one = vld1q_u32(w_one); + const uint32x4_t s_two = vld1q_u32(w_two); const size_t blockSize = 16; // const size_t neonBlockSize = 16; @@ -529,14 +510,10 @@ inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4, CRYPTOPP_ASSERT(outBlocks); CRYPTOPP_ASSERT(length >= 16); -#if (CRYPTOPP_LITTLE_ENDIAN) - const uint32x4_t s_one = PACK32x4(0, 0, 0, 1<<24); - //const uint32x4_t s_two = PACK32x4(0, 2<<24, 0, 2<<24); -#else - // TODO: verify these constants on ARM-BE - //const uint32x4_t s_one = PACK32x4(0, 0, 0, 1); - //const uint32x4_t s_two = PACK32x4(0, 2, 0, 2); -#endif + const unsigned int w_one[] = {0, 0<<24, 0, 1<<24}; + const unsigned int w_two[] = {0, 2<<24, 0, 2<<24}; + const uint32x4_t s_one = vld1q_u32(w_one); + const uint32x4_t s_two = vld1q_u32(w_two); const size_t blockSize = 16; // const size_t neonBlockSize = 16; @@ -669,14 +646,10 @@ inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6, CRYPTOPP_ASSERT(outBlocks); CRYPTOPP_ASSERT(length >= 16); -#if (CRYPTOPP_LITTLE_ENDIAN) - const uint32x4_t s_one = PACK32x4(0, 0, 0, 1<<24); - //const uint32x4_t s_two = PACK32x4(0, 2<<24, 0, 2<<24); -#else - // TODO: verify these constants on ARM-BE - //const uint32x4_t s_one = PACK32x4(0, 0, 0, 1); - //const uint32x4_t s_two = PACK32x4(0, 2, 0, 2); -#endif + const unsigned int w_one[] = {0, 0<<24, 0, 1<<24}; + const unsigned int w_two[] = {0, 2<<24, 0, 2<<24}; + const uint32x4_t s_one = vld1q_u32(w_one); + const uint32x4_t s_two = vld1q_u32(w_two); const size_t blockSize = 16; // const size_t neonBlockSize = 16; diff --git a/arm_simd.h b/arm_simd.h new file mode 100644 index 
00000000..454f1a8f --- /dev/null +++ b/arm_simd.h @@ -0,0 +1,125 @@ +// arm_simd.h - written and placed in public domain by Jeffrey Walton + +/// \file arm_simd.h +/// \brief Support functions for ARM and vector operations + +#ifndef CRYPTOPP_ARM_SIMD_H +#define CRYPTOPP_ARM_SIMD_H + +#include "config.h" + +// C1189: error: This header is specific to ARM targets +#if (CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(_M_ARM64) +# include <arm_neon.h> +#endif + +#if (CRYPTOPP_ARM_ACLE_AVAILABLE) +# include <stdint.h> +# include <arm_acle.h> +#endif + +#if CRYPTOPP_ARM_PMULL_AVAILABLE + +inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b) +{ +#if defined(_MSC_VER) + const __n64 x = { vgetq_lane_u64(a, 0) }; + const __n64 y = { vgetq_lane_u64(b, 0) }; + return vmull_p64(x, y); +#elif defined(__GNUC__) + uint64x2_t r; + __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" + :"=w" (r) : "w" (a), "w" (b) ); + return r; +#else + return (uint64x2_t)(vmull_p64( + vgetq_lane_u64(vreinterpretq_u64_u8(a),0), + vgetq_lane_u64(vreinterpretq_u64_u8(b),0))); +#endif +} + +inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b) +{ +#if defined(_MSC_VER) + const __n64 x = { vgetq_lane_u64(a, 0) }; + const __n64 y = { vgetq_lane_u64(b, 1) }; + return vmull_p64(x, y); +#elif defined(__GNUC__) + uint64x2_t r; + __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" + :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) ); + return r; +#else + return (uint64x2_t)(vmull_p64( + vgetq_lane_u64(vreinterpretq_u64_u8(a),0), + vgetq_lane_u64(vreinterpretq_u64_u8(b),1))); +#endif +} + +inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b) +{ +#if defined(_MSC_VER) + const __n64 x = { vgetq_lane_u64(a, 1) }; + const __n64 y = { vgetq_lane_u64(b, 0) }; + return vmull_p64(x, y); +#elif defined(__GNUC__) + uint64x2_t r; + __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" + :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) ); + return r; +#else + return (uint64x2_t)(vmull_p64( + vgetq_lane_u64(vreinterpretq_u64_u8(a),1), + vgetq_lane_u64(vreinterpretq_u64_u8(b),0))); +#endif +} + +inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b) +{ +#if defined(_MSC_VER) + const __n64 x = { vgetq_lane_u64(a, 1) }; + const __n64 y = { vgetq_lane_u64(b, 1) }; + return vmull_p64(x, y); +#elif defined(__GNUC__) + uint64x2_t r; + __asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t" + :"=w" (r) : "w" (a), "w" (b) ); + return r; +#else + return (uint64x2_t)(vmull_p64( + vgetq_lane_u64(vreinterpretq_u64_u8(a),1), + vgetq_lane_u64(vreinterpretq_u64_u8(b),1))); +#endif +} + +inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c) +{ +#if defined(_MSC_VER) + return (uint64x2_t)vextq_u8( + vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c); +#else + uint64x2_t r; + __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t" + :"=w" (r) : "w" (a), "w" (b), "I" (c) ); + return r; +#endif +} + +// https://github.com/weidai11/cryptopp/issues/366 +template <unsigned int C> +inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b) +{ +#if defined(_MSC_VER) + return (uint64x2_t)vextq_u8( + vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C); +#else + uint64x2_t r; + __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t" + :"=w" (r) : "w" (a), "w" (b), "I" (C) ); + return r; +#endif +} + +#endif // CRYPTOPP_ARM_PMULL_AVAILABLE + +#endif // CRYPTOPP_ARM_SIMD_H diff --git a/chacha_simd.cpp b/chacha_simd.cpp index 93b16aeb..65316384 100644 --- a/chacha_simd.cpp +++ b/chacha_simd.cpp @@ -69,17 +69,6 @@ ANONYMOUS_NAMESPACE_BEGIN // ***************************** NEON
***************************** // -// Thanks to Peter Cordes, https://stackoverflow.com/q/54016821/608639 -#if (CRYPTOPP_ARM_NEON_AVAILABLE) -# ifndef PACK32x4 -# if defined(_MSC_VER) -# define PACK32x4(w,x,y,z) { ((w) + (word64(x) << 32)), ((y) + (word64(z) << 32)) } -# else -# define PACK32x4(w,x,y,z) { (w), (x), (y), (z) } -# endif -# endif // PACK32x4 -#endif // Microsoft workaround - #if (CRYPTOPP_ARM_NEON_AVAILABLE) template @@ -312,10 +301,9 @@ void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input, byte * const uint32x4_t state2 = vld1q_u32(state + 2*4); const uint32x4_t state3 = vld1q_u32(state + 3*4); + const unsigned int w[] = {1,0,0,0, 2,0,0,0, 3,0,0,0}; const uint32x4_t CTRS[3] = { - PACK32x4(1,0,0,0), - PACK32x4(2,0,0,0), - PACK32x4(3,0,0,0) + vld1q_u32(w+0), vld1q_u32(w+4), vld1q_u32(w+8) }; uint32x4_t r0_0 = state0; diff --git a/cryptest.nmake b/cryptest.nmake index 37a2748f..fa71635a 100644 --- a/cryptest.nmake +++ b/cryptest.nmake @@ -66,8 +66,8 @@ LIB_SRCS = \ dll.cpp donna_32.cpp donna_64.cpp donna_sse.cpp dsa.cpp eax.cpp ec2n.cpp \ eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp \ filters.cpp fips140.cpp fipstest.cpp gcm.cpp gcm_simd.cpp gf256.cpp \ - gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hc128.cpp hc256.cpp \ - hex.cpp hight.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp \ + gf2_32.cpp gf2n.cpp gf2n_simd.cpp gfpcrypt.cpp gost.cpp gzip.cpp hc128.cpp \ + hc256.cpp hex.cpp hight.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp \ kalyna.cpp kalynatab.cpp keccak.cpp keccakc.cpp lea.cpp lea_simd.cpp \ luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp \ mqueue.cpp mqv.cpp nbtheory.cpp neon_simd.cpp oaep.cpp osrng.cpp \ @@ -96,8 +96,8 @@ LIB_OBJS = \ dll.obj donna_32.obj donna_64.obj donna_sse.obj dsa.obj eax.obj ec2n.obj \ eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj \ filters.obj fips140.obj fipstest.obj gcm.obj gcm_simd.obj gf256.obj \ - gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hc128.obj hc256.obj \ - hex.obj hight.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj \ + gf2_32.obj gf2n.obj gf2n_simd.obj gfpcrypt.obj gost.obj gzip.obj hc128.obj \ + hc256.obj hex.obj hight.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj \ kalyna.obj kalynatab.obj keccak.obj keccakc.obj lea.obj lea_simd.obj \ luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj \ mqueue.obj mqv.obj nbtheory.obj neon_simd.obj oaep.obj osrng.obj \ diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj index 9bf9c1ad..50ed5447 100644 --- a/cryptlib.vcxproj +++ b/cryptlib.vcxproj @@ -238,6 +238,7 @@ + <ClCompile Include="gf2n_simd.cpp" /> diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters index 51fff69a..6256e62b 100644 --- a/cryptlib.vcxproj.filters +++ b/cryptlib.vcxproj.filters @@ -203,6 +203,9 @@ <Filter>Source Files</Filter> + <ClCompile Include="gf2n_simd.cpp"> + <Filter>Source Files</Filter> + </ClCompile> <Filter>Source Files</Filter> diff --git a/eccrypto.cpp b/eccrypto.cpp index eed99a17..f76c495a 100644 --- a/eccrypto.cpp +++ b/eccrypto.cpp @@ -99,7 +99,12 @@ template<> struct EcRecommendedParameters<EC2N> StringSource ssA(a, true, new HexDecoder); StringSource ssB(b, true, new HexDecoder); if (t0 == 0) - return new EC2N(GF2NT(t2, t3, t4), EC2N::FieldElement(ssA, (size_t)ssA.MaxRetrievable()), EC2N::FieldElement(ssB, (size_t)ssB.MaxRetrievable())); + { + if (t2 == 233 && t3 == 74 && t4 == 0) + return new EC2N(GF2NT233(233, 74, 0), EC2N::FieldElement(ssA, (size_t)ssA.MaxRetrievable()), EC2N::FieldElement(ssB, (size_t)ssB.MaxRetrievable())); + else + return new EC2N(GF2NT(t2, t3,
t4), EC2N::FieldElement(ssA, (size_t)ssA.MaxRetrievable()), EC2N::FieldElement(ssB, (size_t)ssB.MaxRetrievable())); + } else return new EC2N(GF2NPP(t0, t1, t2, t3, t4), EC2N::FieldElement(ssA, (size_t)ssA.MaxRetrievable()), EC2N::FieldElement(ssB, (size_t)ssB.MaxRetrievable())); }; diff --git a/gcm_simd.cpp b/gcm_simd.cpp index 293f3010..374ab8fa 100644 --- a/gcm_simd.cpp +++ b/gcm_simd.cpp @@ -39,6 +39,10 @@ # include #endif +#if defined(CRYPTOPP_ARM_PMULL_AVAILABLE) +# include "arm_simd.h" +#endif + #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) # include "ppc_simd.h" #endif @@ -52,31 +56,6 @@ # define EXCEPTION_EXECUTE_HANDLER 1 #endif -// Thanks to Peter Cordes, https://stackoverflow.com/q/54016821/608639 -#if (CRYPTOPP_ARM_NEON_AVAILABLE) -# ifndef PACK32x4 -# if defined(_MSC_VER) -# define PACK32x4(w,x,y,z) { ((w) + (word64(x) << 32)), ((y) + (word64(z) << 32)) } -# else -# define PACK32x4(w,x,y,z) { (w), (x), (y), (z) } -# endif -# endif // PACK32x4 - -# ifndef PACK8x16 -# if defined(_MSC_VER) -# define PACK8x16(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p) \ - PACK32x4( (a+(b<<8)+(c<<16)+(word32(d)<<24)), \ - (e+(f<<8)+(g<<16)+(word32(h)<<24)), \ - (i+(j<<8)+(k<<16)+(word32(l)<<24)), \ - (m+(n<<8)+(o<<16)+(word32(p)<<24)) ) -# else -# define PACK8x16(a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p) \ - { (a),(b),(c),(d),(e),(f),(g),(h),(i),(j),(k),(l),(m),(n),(o),(p) } -# endif -# endif // PACK8x16 - -#endif // Microsoft workaround - // Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670 #define M128_CAST(x) ((__m128i *)(void *)(x)) #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) @@ -90,113 +69,7 @@ extern const char GCM_SIMD_FNAME[] = __FILE__; ANONYMOUS_NAMESPACE_BEGIN -// *************************** ARM NEON *************************** // - -#if CRYPTOPP_ARM_PMULL_AVAILABLE - -inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b) -{ -#if defined(_MSC_VER) - const __n64 x = { vgetq_lane_u64(a, 0) }; - const __n64 y = { vgetq_lane_u64(b, 0) }; - return vmull_p64(x, y); -#elif defined(__GNUC__) - uint64x2_t r; - __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" - :"=w" (r) : "w" (a), "w" (b) ); - return r; -#else - return (uint64x2_t)(vmull_p64( - vgetq_lane_u64(vreinterpretq_u64_u8(a),0), - vgetq_lane_u64(vreinterpretq_u64_u8(b),0))); -#endif -} - -inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b) -{ -#if defined(_MSC_VER) - const __n64 x = { vgetq_lane_u64(a, 0) }; - const __n64 y = { vgetq_lane_u64(b, 1) }; - return vmull_p64(x, y); -#elif defined(__GNUC__) - uint64x2_t r; - __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" - :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) ); - return r; -#else - return (uint64x2_t)(vmull_p64( - vgetq_lane_u64(vreinterpretq_u64_u8(a),0), - vgetq_lane_u64(vreinterpretq_u64_u8(b),1))); -#endif -} - -inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b) -{ -#if defined(_MSC_VER) - const __n64 x = { vgetq_lane_u64(a, 1) }; - const __n64 y = { vgetq_lane_u64(b, 0) }; - return vmull_p64(x, y); -#elif defined(__GNUC__) - uint64x2_t r; - __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" - :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) ); - return r; -#else - return (uint64x2_t)(vmull_p64( - vgetq_lane_u64(vreinterpretq_u64_u8(a),1), - vgetq_lane_u64(vreinterpretq_u64_u8(b),0))); -#endif -} - -inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b) -{ -#if defined(_MSC_VER) - const __n64 x = { vgetq_lane_u64(a, 1) }; - const __n64 y = { vgetq_lane_u64(b, 1) }; - return vmull_p64(x, y); -#elif defined(__GNUC__) 
- uint64x2_t r; - __asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t" - :"=w" (r) : "w" (a), "w" (b) ); - return r; -#else - return (uint64x2_t)(vmull_p64( - vgetq_lane_u64(vreinterpretq_u64_u8(a),1), - vgetq_lane_u64(vreinterpretq_u64_u8(b),1))); -#endif -} - -inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c) -{ -#if defined(_MSC_VER) - return (uint64x2_t)vextq_u8( - vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c); -#else - uint64x2_t r; - __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t" - :"=w" (r) : "w" (a), "w" (b), "I" (c) ); - return r; -#endif -} - -// https://github.com/weidai11/cryptopp/issues/366 -template -inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b) -{ -#if defined(_MSC_VER) - return (uint64x2_t)vextq_u8( - vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C); -#else - uint64x2_t r; - __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t" - :"=w" (r) : "w" (a), "w" (b), "I" (C) ); - return r; -#endif -} - -#endif // CRYPTOPP_ARM_PMULL_AVAILABLE - -// ************************** Power 8 Crypto ************************** // +// ************************** Power8 Crypto ************************** // #if CRYPTOPP_POWER8_VMULL_AVAILABLE @@ -316,14 +189,18 @@ bool CPU_ProbePMULL() __try { // Linaro is missing a lot of pmull gear. Also see http://github.com/weidai11/cryptopp/issues/233. - const uint64x2_t a1={0,0x9090909090909090}, b1={0,0xb0b0b0b0b0b0b0b0}; - const uint8x16_t a2=PACK8x16(0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80, - 0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0), - b2=PACK8x16(0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0, - 0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0); + const uint64_t wa1[]={0,0x9090909090909090}, wb1[]={0,0xb0b0b0b0b0b0b0b0}; + const uint64x2_t a1=vld1q_u64(wa1), b1=vld1q_u64(wb1); + + const uint8_t wa2[]={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80, + 0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, + wb2[]={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0, + 0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; + const uint8x16_t a2=vld1q_u8(wa2), b2=vld1q_u8(wb2); const uint64x2_t r1 = PMULL_00(a1, b1); - const uint64x2_t r2 = PMULL_11((uint64x2_t)a2, (uint64x2_t)b2); + const uint64x2_t r2 = PMULL_11(vreinterpretq_u64_u8(a2), + vreinterpretq_u64_u8(b2)); result = !!(vgetq_lane_u64(r1,0) == 0x5300530053005300 && vgetq_lane_u64(r1,1) == 0x5300530053005300 && @@ -353,14 +230,18 @@ bool CPU_ProbePMULL() else { // Linaro is missing a lot of pmull gear. Also see http://github.com/weidai11/cryptopp/issues/233. 
- const uint64x2_t a1={0,0x9090909090909090}, b1={0,0xb0b0b0b0b0b0b0b0}; - const uint8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80, + const uint64_t wa1[]={0,0x9090909090909090}, wb1[]={0,0xb0b0b0b0b0b0b0b0}; + const uint64x2_t a1=vld1q_u64(wa1), b1=vld1q_u64(wb1); + + const uint8_t wa2[]={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80, 0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, - b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0, + wb2[]={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0, 0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; + const uint8x16_t a2=vld1q_u8(wa2), b2=vld1q_u8(wb2); const uint64x2_t r1 = PMULL_00(a1, b1); - const uint64x2_t r2 = PMULL_11((uint64x2_t)a2, (uint64x2_t)b2); + const uint64x2_t r2 = PMULL_11(vreinterpretq_u64_u8(a2), + vreinterpretq_u64_u8(b2)); result = !!(vgetq_lane_u64(r1,0) == 0x5300530053005300 && vgetq_lane_u64(r1,1) == 0x5300530053005300 && diff --git a/gf2n.cpp b/gf2n.cpp index c907e081..e2b0ab9e 100644 --- a/gf2n.cpp +++ b/gf2n.cpp @@ -13,8 +13,9 @@ #include "words.h" #include "misc.h" #include "gf2n.h" -#include "asn.h" #include "oids.h" +#include "asn.h" +#include "cpu.h" #include @@ -41,6 +42,10 @@ ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) +#if defined(CRYPTOPP_CLMUL_AVAILABLE) || defined(CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_POWER8_VMULL_AVAILABLE) +extern void GF2NT_233_Multiply_Reduce(const word* pA, const word* pB, word* pC); +#endif + PolynomialMod2::PolynomialMod2() { } @@ -75,7 +80,7 @@ void PolynomialMod2::Randomize(RandomNumberGenerator &rng, size_t nbits) PolynomialMod2 PolynomialMod2::AllOnes(size_t bitLength) { PolynomialMod2 result((word)0, bitLength); - SetWords(result.reg, word(SIZE_MAX), result.reg.size()); + SetWords(result.reg, ~(word(0)), result.reg.size()); if (bitLength%WORD_BITS) result.reg[result.reg.size()-1] = (word)Crop(result.reg[result.reg.size()-1], bitLength%WORD_BITS); return result; @@ -943,6 +948,112 @@ GF2NP * BERDecodeGF2NP(BufferedTransformation &bt) return result.release(); } +// ******************************************************** + +GF2NT233::GF2NT233(unsigned int c0, unsigned int c1, unsigned int c2) + : GF2NT(c0, c1, c2) +{ + CRYPTOPP_ASSERT(c0 > c1 && c1 > c2 && c2==0); +} + +const GF2NT::Element& GF2NT233::Multiply(const Element &a, const Element &b) const +{ +#if defined(CRYPTOPP_CLMUL_AVAILABLE) + if (HasCLMUL()) + { + CRYPTOPP_ASSERT(a.reg.size()*WORD_BITS == 256); + CRYPTOPP_ASSERT(b.reg.size()*WORD_BITS == 256); + CRYPTOPP_ASSERT(result.reg.size()*WORD_BITS == 256); + + const word* pA = a.reg.begin(); + const word* pB = b.reg.begin(); + word* pR = result.reg.begin(); + + GF2NT_233_Multiply_Reduce(pA, pB, pR); + return result; + } + else +#elif defined(CRYPTOPP_ARM_PMULL_AVAILABLE) + if (HasPMULL()) + { + CRYPTOPP_ASSERT(a.reg.size()*WORD_BITS == 256); + CRYPTOPP_ASSERT(b.reg.size()*WORD_BITS == 256); + CRYPTOPP_ASSERT(result.reg.size()*WORD_BITS == 256); + + const word* pA = a.reg.begin(); + const word* pB = b.reg.begin(); + word* pR = result.reg.begin(); + + GF2NT_233_Multiply_Reduce(pA, pB, pR); + return result; + } + else +#elif defined(CRYPTOPP_POWER8_VMULL_AVAILABLE) + if (HasPMULL()) + { + CRYPTOPP_ASSERT(a.reg.size()*WORD_BITS == 256); + CRYPTOPP_ASSERT(b.reg.size()*WORD_BITS == 256); + CRYPTOPP_ASSERT(result.reg.size()*WORD_BITS == 256); + + const word* pA = a.reg.begin(); + const word* pB = b.reg.begin(); + word* pR = result.reg.begin(); + + GF2NT_233_Multiply_Reduce(pA, pB, pR); + return result; + } + else +#endif + + return GF2NT::Multiply(a, b); +} + +const GF2NT::Element& 
GF2NT233::Square(const Element &a) const +{ +#if defined(CRYPTOPP_CLMUL_AVAILABLE) + if (HasCLMUL()) + { + CRYPTOPP_ASSERT(a.reg.size()*WORD_BITS == 256); + CRYPTOPP_ASSERT(result.reg.size()*WORD_BITS == 256); + + const word* pA = a.reg.begin(); + word* pR = result.reg.begin(); + + GF2NT_233_Multiply_Reduce(pA, pA, pR); + return result; + } + else +#elif defined(CRYPTOPP_ARM_PMULL_AVAILABLE) + if (HasPMULL()) + { + CRYPTOPP_ASSERT(a.reg.size()*WORD_BITS == 256); + CRYPTOPP_ASSERT(result.reg.size()*WORD_BITS == 256); + + const word* pA = a.reg.begin(); + word* pR = result.reg.begin(); + + GF2NT_233_Multiply_Reduce(pA, pA, pR); + return result; + } + else +#elif defined(CRYPTOPP_POWER8_VMULL_AVAILABLE) + if (HasPMULL()) + { + CRYPTOPP_ASSERT(a.reg.size()*WORD_BITS == 256); + CRYPTOPP_ASSERT(result.reg.size()*WORD_BITS == 256); + + const word* pA = a.reg.begin(); + word* pR = result.reg.begin(); + + GF2NT_233_Multiply_Reduce(pA, pA, pR); + return result; + } + else +#endif + + return GF2NT::Square(a); +} + NAMESPACE_END #endif diff --git a/gf2n.h b/gf2n.h index 4bb34b19..94f2c010 100644 --- a/gf2n.h +++ b/gf2n.h @@ -246,6 +246,7 @@ public: private: friend class GF2NT; + friend class GF2NT233; SecWordBlock reg; }; @@ -344,13 +345,29 @@ public: const Element& MultiplicativeInverse(const Element &a) const; -private: +protected: const Element& Reduced(const Element &a) const; unsigned int t0, t1; mutable PolynomialMod2 result; }; +/// \brief GF(2^n) for b233 and k233 +/// \details GF2NT233 is a specialization of GF2NT that provides Multiply() +/// and Square() operations when carryless multiplies is available. +class CRYPTOPP_DLL GF2NT233 : public GF2NT +{ +public: + // polynomial modulus = x^t0 + x^t1 + x^t2, t0 > t1 > t2 + GF2NT233(unsigned int t0, unsigned int t1, unsigned int t2); + + GF2NP * Clone() const {return new GF2NT233(*this);} + + const Element& Multiply(const Element &a, const Element &b) const; + + const Element& Square(const Element &a) const; +}; + /// \brief GF(2^n) with Pentanomial Basis class CRYPTOPP_DLL GF2NPP : public GF2NP { diff --git a/gf2n_simd.cpp b/gf2n_simd.cpp new file mode 100644 index 00000000..8401211e --- /dev/null +++ b/gf2n_simd.cpp @@ -0,0 +1,529 @@ +// gf2n_simd.cpp - written and placed in the public domain by Jeffrey Walton +// Also based on PCLMULQDQ code by Jankowski, Laurent and +// O'Mahony from Intel (see reference below). +// +// This source file uses intrinsics and built-ins to gain access to +// CLMUL, ARMv8a, and Power8 instructions. A separate source file is +// needed because additional CXXFLAGS are required to enable the +// appropriate instructions sets in some build configurations. +// +// Several speedups were taken from Intel Polynomial Multiplication +// Instruction and its Usage for Elliptic Curve Cryptography, by +// Krzysztof Jankowski, Pierre Laurent and Aidan O'Mahony, +// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/polynomial-multiplication-instructions-paper.pdf +// There may be more speedups available, see https://eprint.iacr.org/2011/589.pdf. +// The IACR paper performs some optimizations that the compiler is +// expected to perform, like Common Subexpression Elimination to save +// on variables (among others). Note that the compiler may miss the +// optimization so the IACR paper is useful. However, the code is GPL3 +// and toxic for some users of the library... 
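// [Illustrative aside, not part of the patch.] The multiply routines below all
// share one shape: a single level of Karatsuba turns each 128x128-bit carryless
// multiply into three 64x64-bit products, which is what the trio of PMULL /
// PCLMULQDQ / VPMSUMD calls per F2N_Multiply_128x128_* implements. A minimal
// scalar sketch of that step (assuming <cstdint>; the helper names here are
// hypothetical, chosen only for this example):
//
//   inline void clmul64(uint64_t a, uint64_t b, uint64_t& hi, uint64_t& lo)
//   {
//       hi = lo = 0;  // 64x64 -> 128-bit carryless (GF(2)) multiply
//       for (unsigned int i = 0; i < 64; ++i)
//           if (b & (uint64_t(1) << i)) {
//               lo ^= a << i;
//               if (i) hi ^= a >> (64 - i);
//           }
//   }
//
//   // c3c2c1c0 = a1a0 * b1b0 over GF(2)[x]: Karatsuba, three clmul64 calls
//   inline void f2n_mul_128_reference(const uint64_t a[2], const uint64_t b[2],
//                                     uint64_t c[4])
//   {
//       uint64_t lh, ll, hh, hl, mh, ml;
//       clmul64(a[0], b[0], lh, ll);               // a0*b0
//       clmul64(a[1], b[1], hh, hl);               // a1*b1
//       clmul64(a[0]^a[1], b[0]^b[1], mh, ml);     // (a0+a1)*(b0+b1)
//       ml ^= ll ^ hl;  mh ^= lh ^ hh;             // middle term at x^64
//       c[0] = ll;  c[1] = lh ^ ml;  c[2] = hl ^ mh;  c[3] = hh;
//   }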
+ +#include "pch.h" +#include "config.h" + +#include "gf2n.h" + +#if (CRYPTOPP_CLMUL_AVAILABLE) +# include +# include +#endif + +#if (CRYPTOPP_ARM_PMULL_AVAILABLE) +# include "arm_simd.h" +#endif + +#if defined(CRYPTOPP_ALTIVEC_AVAILABLE) +# include "ppc_simd.h" +#endif + +ANONYMOUS_NAMESPACE_BEGIN + +// ************************** ARMv8 ************************** // + +using CryptoPP::word; + +#if (CRYPTOPP_ARM_PMULL_AVAILABLE) + +// c1c0 = a * b +inline void +F2N_Multiply_128x128_ARMv8(uint64x2_t& c1, uint64x2_t& c0, const uint64x2_t& a, const uint64x2_t& b) +{ + uint64x2_t t1, t2, z0={0}; + + c0 = PMULL_00(a, b); + c1 = PMULL_11(a, b); + t1 = vmovq_n_u64(vgetq_lane_u64(a, 1)); + t1 = veorq_u64(a, t1); + t2 = vmovq_n_u64(vgetq_lane_u64(b, 1)); + t2 = veorq_u64(b, t2); + t1 = PMULL_00(t1, t2); + t1 = veorq_u64(c0, t1); + t1 = veorq_u64(c1, t1); + t2 = t1; + t1 = vextq_u64(z0, t1, 1); + t2 = vextq_u64(t2, z0, 1); + c0 = veorq_u64(c0, t1); + c1 = veorq_u64(c1, t2); +} + +// x = (x << n), z = 0 +template +inline uint64x2_t ShiftLeft128_ARMv8(uint64x2_t x) +{ + uint64x2_t u=x, v, z={0}; + x = vshlq_n_u64(x, N); + u = vshrq_n_u64(u, (64-N)); + v = vcombine_u64(vget_low_u64(z), vget_low_u64(u)); + x = vorrq_u64(x, v); + return x; +} + +// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. Reading at +// Intel paper or https://github.com/antonblanchard/crc32-vpmsum. +inline void +GF2NT_233_Reduce_ARMv8(uint64x2_t& c3, uint64x2_t& c2, uint64x2_t& c1, uint64x2_t& c0) +{ + const unsigned int mask[4] = { + 0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff, + }; + + uint64x2_t b3, b2, b1, /*b0,*/ a1, a0, m0, z0={0}; + m0 = vreinterpretq_u64_u32(vld1q_u32(mask)); + b1 = c1; a1 = c1; + a0 = vcombine_u64(vget_low_u64(c1), vget_low_u64(z0)); + a1 = vshlq_n_u64(a1, 23); + a1 = vshrq_n_u64(a1, 23); + c1 = vorrq_u64(a1, a0); + b2 = vshrq_n_u64(c2, (64-23)); + c3 = ShiftLeft128_ARMv8<23>(c3); + a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0)); + c3 = vorrq_u64(c3, a0); + b1 = vshrq_n_u64(b1, (64-23)); + c2 = ShiftLeft128_ARMv8<23>(c2); + a0 = vcombine_u64(vget_high_u64(b1), vget_high_u64(z0)); + c2 = vorrq_u64(c2, a0); + b3 = c3; + b2 = vshrq_n_u64(c2, (64-10)); + b3 = ShiftLeft128_ARMv8<10>(b3); + a0 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0)); + b3 = vorrq_u64(b3, a0); + a0 = vcombine_u64(vget_high_u64(c3), vget_high_u64(z0)); + b3 = veorq_u64(b3, a0); + b1 = vshrq_n_u64(b3, (64-23)); + b3 = ShiftLeft128_ARMv8<23>(b3); + b3 = vcombine_u64(vget_high_u64(b3), vget_high_u64(z0)); + b3 = vorrq_u64(b3, b1); + c2 = veorq_u64(c2, b3); + b3 = c3; + b2 = vshrq_n_u64(c2, (64-10)); + b3 = ShiftLeft128_ARMv8<10>(b3); + b2 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0)); + b3 = vorrq_u64(b3, b2); + b2 = c2; + b2 = ShiftLeft128_ARMv8<10>(b2); + a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b2)); + c2 = veorq_u64(c2, a0); + a0 = vcombine_u64(vget_low_u64(z0), vget_low_u64(b3)); + a1 = vcombine_u64(vget_high_u64(b2), vget_high_u64(z0)); + a0 = vorrq_u64(a0, a1); + c3 = veorq_u64(c3, a0); + c0 = veorq_u64(c0, c2); + c1 = veorq_u64(c1, c3); + c1 = vandq_u64(c1, m0); +} + +inline void +GF2NT_233_Multiply_Reduce_ARMv8(const word* pA, const word* pB, word* pC) +{ + // word is either 32-bit or 64-bit, depending on the platform. + // Load using a 32-bit pointer to avoid possible alignment issues. 
+ const uint32_t* pAA = reinterpret_cast(pA); + const uint32_t* pBB = reinterpret_cast(pB); + + uint64x2_t a0 = vreinterpretq_u64_u32(vld1q_u32(pAA+0)); + uint64x2_t a1 = vreinterpretq_u64_u32(vld1q_u32(pAA+4)); + uint64x2_t b0 = vreinterpretq_u64_u32(vld1q_u32(pBB+0)); + uint64x2_t b1 = vreinterpretq_u64_u32(vld1q_u32(pBB+4)); + + uint64x2_t c0, c1, c2, c3, c4, c5; + F2N_Multiply_128x128_ARMv8(c1, c0, a0, b0); + F2N_Multiply_128x128_ARMv8(c3, c2, a1, b1); + + a0 = veorq_u64(a0, a1); + b0 = veorq_u64(b0, b1); + + F2N_Multiply_128x128_ARMv8(c5, c4, a0, b0); + + c4 = veorq_u64(c4, c0); + c4 = veorq_u64(c4, c2); + c5 = veorq_u64(c5, c1); + c5 = veorq_u64(c5, c3); + c1 = veorq_u64(c1, c4); + c2 = veorq_u64(c2, c5); + + GF2NT_233_Reduce_ARMv8(c3, c2, c1, c0); + + uint32_t* pCC = reinterpret_cast(pC); + vst1q_u32(pCC+0, vreinterpretq_u32_u64(c0)); + vst1q_u32(pCC+4, vreinterpretq_u32_u64(c1)); +} + +#endif + +// ************************** SSE ************************** // + +#if defined(CRYPTOPP_CLMUL_AVAILABLE) + +using CryptoPP::word; + +// c1c0 = a * b +inline void +F2N_Multiply_128x128_CLMUL(__m128i& c1, __m128i& c0, const __m128i& a, const __m128i& b) +{ + __m128i t1, t2; + + c0 = _mm_clmulepi64_si128(a, b, 0x00); + c1 = _mm_clmulepi64_si128(a, b, 0x11); + t1 = _mm_shuffle_epi32(a, 0xEE); + t1 = _mm_xor_si128(a, t1); + t2 = _mm_shuffle_epi32(b, 0xEE); + t2 = _mm_xor_si128(b, t2); + t1 = _mm_clmulepi64_si128(t1, t2, 0x00); + t1 = _mm_xor_si128(c0, t1); + t1 = _mm_xor_si128(c1, t1); + t2 = t1; + t1 = _mm_slli_si128(t1, 8); + t2 = _mm_srli_si128(t2, 8); + c0 = _mm_xor_si128(c0, t1); + c1 = _mm_xor_si128(c1, t2); +} + +// x = (x << n), z = 0 +template +inline __m128i ShiftLeft128_SSE(__m128i x, const __m128i& z) +{ + __m128i u=x, v; + x = _mm_slli_epi64(x, N); + u = _mm_srli_epi64(u, (64-N)); + v = _mm_unpacklo_epi64(z, u); + x = _mm_or_si128(x, v); + return x; +} + +// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. Reading at +// Intel paper or https://github.com/antonblanchard/crc32-vpmsum. 
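// [Illustrative aside, not part of the patch.] The reduction that follows can
// also be written bit-at-a-time: every set bit at position i >= 233 of the
// 465-bit product is folded down using x^i = x^(i-159) + x^(i-233)
// (mod x^233 + x^74 + 1). A scalar reference, assuming <cstdint> and a
// hypothetical name, equivalent to what the vector code computes with wide
// shifts and XORs:
//
//   inline void gf2n_233_reduce_reference(uint64_t c[8])  // 512-bit buffer
//   {
//       for (int i = 464; i >= 233; --i)
//           if (c[i/64] & (uint64_t(1) << (i%64))) {
//               c[i/64]       ^= uint64_t(1) << (i%64);
//               c[(i-159)/64] ^= uint64_t(1) << ((i-159)%64);
//               c[(i-233)/64] ^= uint64_t(1) << ((i-233)%64);
//           }
//   }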
+inline void +GF2NT_233_Reduce_CLMUL(__m128i& c3, __m128i& c2, __m128i& c1, __m128i& c0) +{ + const unsigned int m[4] = { + 0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff + }; + + __m128i b3, b2, b1, /*b0,*/ a1, a0, m0, z0; + m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]); + z0 = _mm_setzero_si128(); + b1 = c1; a1 = c1; + a0 = _mm_move_epi64(c1); + a1 = _mm_slli_epi64(a1, 23); + a1 = _mm_srli_epi64(a1, 23); + c1 = _mm_or_si128(a1, a0); + b2 = _mm_srli_epi64(c2, (64-23)); + c3 = ShiftLeft128_SSE<23>(c3, z0); + a0 = _mm_unpackhi_epi64(b2, z0); + c3 = _mm_or_si128(c3, a0); + b1 = _mm_srli_epi64(b1, (64-23)); + c2 = ShiftLeft128_SSE<23>(c2, z0); + a0 = _mm_unpackhi_epi64(b1, z0); + c2 = _mm_or_si128(c2, a0); + b3 = c3; + b2 = _mm_srli_epi64(c2, (64-10)); + b3 = ShiftLeft128_SSE<10>(b3, z0); + a0 = _mm_unpackhi_epi64(b2, z0); + b3 = _mm_or_si128(b3, a0); + a0 = _mm_unpackhi_epi64(c3, z0); + b3 = _mm_xor_si128(b3, a0); + b1 = _mm_srli_epi64(b3, (64-23)); + b3 = ShiftLeft128_SSE<23>(b3, z0); + b3 = _mm_unpackhi_epi64(b3, z0); + b3 = _mm_or_si128(b3, b1); + c2 = _mm_xor_si128(c2, b3); + b3 = c3; + b2 = _mm_srli_epi64(c2, (64-10)); + b3 = ShiftLeft128_SSE<10>(b3, z0); + b2 = _mm_unpackhi_epi64(b2, z0); + b3 = _mm_or_si128(b3, b2); + b2 = c2; + b2 = ShiftLeft128_SSE<10>(b2, z0); + a0 = _mm_unpacklo_epi64(z0, b2); + c2 = _mm_xor_si128(c2, a0); + a0 = _mm_unpacklo_epi64(z0, b3); + a1 = _mm_unpackhi_epi64(b2, z0); + a0 = _mm_or_si128(a0, a1); + c3 = _mm_xor_si128(c3, a0); + c0 = _mm_xor_si128(c0, c2); + c1 = _mm_xor_si128(c1, c3); + c1 = _mm_and_si128(c1, m0); +} + +inline void +GF2NT_233_Multiply_Reduce_CLMUL(const word* pA, const word* pB, word* pC) +{ + const __m128i* pAA = reinterpret_cast(pA); + const __m128i* pBB = reinterpret_cast(pB); + __m128i a0 = _mm_loadu_si128(pAA+0); + __m128i a1 = _mm_loadu_si128(pAA+1); + __m128i b0 = _mm_loadu_si128(pBB+0); + __m128i b1 = _mm_loadu_si128(pBB+1); + + __m128i c0, c1, c2, c3, c4, c5; + F2N_Multiply_128x128_CLMUL(c1, c0, a0, b0); + F2N_Multiply_128x128_CLMUL(c3, c2, a1, b1); + + a0 = _mm_xor_si128(a0, a1); + b0 = _mm_xor_si128(b0, b1); + + F2N_Multiply_128x128_CLMUL(c5, c4, a0, b0); + + c4 = _mm_xor_si128(c4, c0); + c4 = _mm_xor_si128(c4, c2); + c5 = _mm_xor_si128(c5, c1); + c5 = _mm_xor_si128(c5, c3); + c1 = _mm_xor_si128(c1, c4); + c2 = _mm_xor_si128(c2, c5); + + GF2NT_233_Reduce_CLMUL(c3, c2, c1, c0); + + __m128i* pCC = reinterpret_cast<__m128i*>(pC); + _mm_storeu_si128(pCC+0, c0); + _mm_storeu_si128(pCC+1, c1); +} + +#endif + +// ************************* Power8 ************************* // + +#if defined(CRYPTOPP_POWER8_VMULL_AVAILABLE) + +using CryptoPP::byte; +using CryptoPP::word; +using CryptoPP::uint8x16_p; +using CryptoPP::uint64x2_p; + +using CryptoPP::VecLoad; +using CryptoPP::VecLoadBE; +using CryptoPP::VecStore; + +using CryptoPP::VecOr; +using CryptoPP::VecXor; +using CryptoPP::VecAnd; + +using CryptoPP::VecGetLow; +using CryptoPP::VecGetHigh; +using CryptoPP::VecPermute; +using CryptoPP::VecShiftLeft; +using CryptoPP::VecShiftRight; +using CryptoPP::VecRotateLeftOctet; + +inline uint64x2_p VMULL2LE(const uint64x2_p& val) +{ +#if (CRYPTOPP_BIG_ENDIAN) + return VecRotateLeftOctet<8>(val); +#else + return val; +#endif +} + +// _mm_clmulepi64_si128(a, b, 0x00) +inline uint64x2_p VMULL_00LE(const uint64x2_p& a, const uint64x2_p& b) +{ +#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__)) + return VMULL2LE(__vpmsumd (VecGetHigh(a), VecGetHigh(b))); +#elif defined(__clang__) + return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetHigh(a), 
VecGetHigh(b))); +#else + return VMULL2LE(__builtin_crypto_vpmsumd (VecGetHigh(a), VecGetHigh(b))); +#endif +} + +// _mm_clmulepi64_si128(a, b, 0x11) +inline uint64x2_p VMULL_11LE(const uint64x2_p& a, const uint64x2_p& b) +{ +#if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__)) + return VMULL2LE(__vpmsumd (VecGetLow(a), b)); +#elif defined(__clang__) + return VMULL2LE(__builtin_altivec_crypto_vpmsumd (VecGetLow(a), b)); +#else + return VMULL2LE(__builtin_crypto_vpmsumd (VecGetLow(a), b)); +#endif +} + +// c1c0 = a * b +inline void +F2N_Multiply_128x128_POWER8(uint64x2_p& c1, uint64x2_p& c0, const uint64x2_p& a, const uint64x2_p& b) +{ + const uint8_t mb1[] = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15}; + const uint8_t mb2[] = {8,9,10,11, 12,13,14,15, 16,17,18,19, 20,21,22,23}; + + const uint8x16_p m1 = (uint8x16_p)VecLoad(mb1); + const uint8x16_p m2 = (uint8x16_p)VecLoad(mb2); + + uint64x2_p t1, t2, z0={0}; + + c0 = VMULL_00LE(a, b); + c1 = VMULL_11LE(a, b); + t1 = VecPermute(a, a, m1); + t1 = VecXor(a, t1); + t2 = VecPermute(b, b, m1); + t2 = VecXor(b, t2); + t1 = VMULL_00LE(t1, t2); + t1 = VecXor(c0, t1); + t1 = VecXor(c1, t1); + t2 = t1; + t1 = VecPermute(z0, t1, m2); + t2 = VecPermute(t2, z0, m2); + c0 = VecXor(c0, t1); + c1 = VecXor(c1, t2); +} + +// x = (x << n), z = 0 +template +inline uint64x2_p ShiftLeft128_POWER8(uint64x2_p x) +{ + const uint8_t mb[] = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; + const uint8x16_p m = (uint8x16_p)VecLoad(mb); + + uint64x2_p u=x, v, z={0}; + x = VecShiftLeft(x); + u = VecShiftRight<64-N>(u); + v = VecPermute(z, u, m); + x = VecOr(x, v); + return x; +} + +// c1c0 = c3c2c1c0 MOD p. This is a Barrett reduction. Reading at +// Intel paper or https://github.com/antonblanchard/crc32-vpmsum. +inline void +GF2NT_233_Reduce_POWER8(uint64x2_p& c3, uint64x2_p& c2, uint64x2_p& c1, uint64x2_p& c0) +{ + const uint64_t mask[] = {0xffffffffffffffff, 0x01ffffffffff}; + const uint8_t lmb[] = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; + const uint8_t hmb[] = {8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; + + const uint64x2_p m0 = (uint64x2_p)VecLoad(mask); + const uint8x16_p lm = (uint8x16_p)VecLoad(lmb); + const uint8x16_p hm = (uint8x16_p)VecLoad(hmb); + + uint64x2_p b3, b2, b1, /*b0,*/ a1, a0, z0={0}; + b1 = c1; a1 = c1; + a0 = VecPermute(c1, z0, lm); + a1 = VecShiftLeft<23>(a1); + a1 = VecShiftRight<23>(a1); + c1 = VecOr(a1, a0); + b2 = VecShiftRight<64-23>(c2); + c3 = ShiftLeft128_POWER8<23>(c3); + a0 = VecPermute(b2, z0, hm); + c3 = VecOr(c3, a0); + b1 = VecShiftRight<64-23>(b1); + c2 = ShiftLeft128_POWER8<23>(c2); + a0 = VecPermute(b1, z0, hm); + c2 = VecOr(c2, a0); + b3 = c3; + b2 = VecShiftRight<64-10>(c2); + b3 = ShiftLeft128_POWER8<10>(b3); + a0 = VecPermute(b2, z0, hm); + b3 = VecOr(b3, a0); + a0 = VecPermute(c3, z0, hm); + b3 = VecXor(b3, a0); + b1 = VecShiftRight<64-23>(b3); + b3 = ShiftLeft128_POWER8<23>(b3); + b3 = VecPermute(b3, z0, hm); + b3 = VecOr(b3, b1); + c2 = VecXor(c2, b3); + b3 = c3; + b2 = VecShiftRight<64-10>(c2); + b3 = ShiftLeft128_POWER8<10>(b3); + b2 = VecPermute(b2, z0, hm); + b3 = VecOr(b3, b2); + b2 = c2; + b2 = ShiftLeft128_POWER8<10>(b2); + a0 = VecPermute(z0, b2, lm); + c2 = VecXor(c2, a0); + a0 = VecPermute(z0, b3, lm); + a1 = VecPermute(b2, z0, hm); + a0 = VecOr(a0, a1); + c3 = VecXor(c3, a0); + c0 = VecXor(c0, c2); + c1 = VecXor(c1, c3); + c1 = VecAnd(c1, m0); +} + +inline void +GF2NT_233_Multiply_Reduce_POWER8(const word* pA, const word* pB, word* pC) +{ + // word is either 32-bit or 64-bit, depending 
on the platform. + // Load using a byte pointer to avoid possible alignment issues. + const byte* pAA = reinterpret_cast<const byte*>(pA); + const byte* pBB = reinterpret_cast<const byte*>(pB); + + uint64x2_p a0 = (uint64x2_p)VecLoad(pAA+0); + uint64x2_p a1 = (uint64x2_p)VecLoad(pAA+16); + uint64x2_p b0 = (uint64x2_p)VecLoad(pBB+0); + uint64x2_p b1 = (uint64x2_p)VecLoad(pBB+16); + +#if (CRYPTOPP_BIG_ENDIAN) + const uint8_t mb[] = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; + const uint8x16_p m = (uint8x16_p)VecLoad(mb); + a0 = VecPermute(a0, m); + a1 = VecPermute(a1, m); + b0 = VecPermute(b0, m); + b1 = VecPermute(b1, m); +#endif + + uint64x2_p c0, c1, c2, c3, c4, c5; + F2N_Multiply_128x128_POWER8(c1, c0, a0, b0); + F2N_Multiply_128x128_POWER8(c3, c2, a1, b1); + + a0 = VecXor(a0, a1); + b0 = VecXor(b0, b1); + + F2N_Multiply_128x128_POWER8(c5, c4, a0, b0); + + c4 = VecXor(c4, c0); + c4 = VecXor(c4, c2); + c5 = VecXor(c5, c1); + c5 = VecXor(c5, c3); + c1 = VecXor(c1, c4); + c2 = VecXor(c2, c5); + + GF2NT_233_Reduce_POWER8(c3, c2, c1, c0); + +#if (CRYPTOPP_BIG_ENDIAN) + c0 = VecPermute(c0, m); + c1 = VecPermute(c1, m); +#endif + + byte* pCC = reinterpret_cast<byte*>(pC); + VecStore(c0, pCC+0); + VecStore(c1, pCC+16); +} + +#endif + +ANONYMOUS_NAMESPACE_END + +NAMESPACE_BEGIN(CryptoPP) + +void GF2NT_233_Multiply_Reduce(const word* pA, const word* pB, word* pC) +{ +#if defined(CRYPTOPP_CLMUL_AVAILABLE) + return GF2NT_233_Multiply_Reduce_CLMUL(pA, pB, pC); +#elif (CRYPTOPP_ARM_PMULL_AVAILABLE) + return GF2NT_233_Multiply_Reduce_ARMv8(pA, pB, pC); +#elif defined(CRYPTOPP_POWER8_VMULL_AVAILABLE) + return GF2NT_233_Multiply_Reduce_POWER8(pA, pB, pC); +#else + CRYPTOPP_ASSERT(0); +#endif +} + +NAMESPACE_END diff --git a/ppc_simd.h b/ppc_simd.h index 08733435..51e5b726 100644 --- a/ppc_simd.h +++ b/ppc_simd.h @@ -1172,6 +1172,21 @@ inline uint32x4_p VecRotateLeft(const uint32x4_p vec) return vec_rl(vec, m); } +/// \brief Shift a packed vector left +/// \tparam C shift bit count +/// \param vec the vector +/// \returns vector +/// \details VecShiftLeft() shifts each element in a packed vector by bit count. +/// \par Wraps +/// vec_sl +/// \since Crypto++ 8.1 +template <unsigned int C> +inline uint32x4_p VecShiftLeft(const uint32x4_p vec) +{ + const uint32x4_p m = {C, C, C, C}; + return vec_sl(vec, m); +} + #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING) /// \brief Rotate a packed vector left @@ -1190,6 +1205,22 @@ inline uint64x2_p VecRotateLeft(const uint64x2_p vec) return vec_rl(vec, m); } +/// \brief Shift a packed vector left +/// \tparam C shift bit count +/// \param vec the vector +/// \returns vector +/// \details VecShiftLeft() shifts each element in a packed vector by bit count. +/// \details VecShiftLeft() with 64-bit elements is available on POWER8 and above. +/// \par Wraps +/// vec_sl +/// \since Crypto++ 8.1 +template <unsigned int C> +inline uint64x2_p VecShiftLeft(const uint64x2_p vec) +{ + const uint64x2_p m = {C, C}; + return vec_sl(vec, m); +} + #endif /// \brief Rotate a packed vector right @@ -1207,6 +1238,21 @@ inline uint32x4_p VecRotateRight(const uint32x4_p vec) return vec_rl(vec, m); } +/// \brief Shift a packed vector right +/// \tparam C shift bit count +/// \param vec the vector +/// \returns vector +/// \details VecShiftRight() shifts each element in a packed vector by bit count.
+/// \par Wraps +/// vec_sr +/// \since Crypto++ 8.1 +template <unsigned int C> +inline uint32x4_p VecShiftRight(const uint32x4_p vec) +{ + const uint32x4_p m = {C, C, C, C}; + return vec_sr(vec, m); +} + #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING) /// \brief Rotate a packed vector right @@ -1225,6 +1271,22 @@ inline uint64x2_p VecRotateRight(const uint64x2_p vec) return vec_rl(vec, m); } +/// \brief Shift a packed vector right +/// \tparam C shift bit count +/// \param vec the vector +/// \returns vector +/// \details VecShiftRight() shifts each element in a packed vector by bit count. +/// \details VecShiftRight() with 64-bit elements is available on POWER8 and above. +/// \par Wraps +/// vec_sr +/// \since Crypto++ 8.1 +template <unsigned int C> +inline uint64x2_p VecShiftRight(const uint64x2_p vec) +{ + const uint64x2_p m = {C, C}; + return vec_sr(vec, m); +} + #endif /// \brief Exchange high and low double words diff --git a/sha_simd.cpp b/sha_simd.cpp index 7184f3e1..0b31ea37 100644 --- a/sha_simd.cpp +++ b/sha_simd.cpp @@ -46,17 +46,6 @@ # define EXCEPTION_EXECUTE_HANDLER 1 #endif -// Thanks to Peter Cordes, https://stackoverflow.com/q/54016821/608639 -#if (CRYPTOPP_ARM_NEON_AVAILABLE) -# ifndef PACK32x4 -# if defined(_MSC_VER) -# define PACK32x4(w,x,y,z) { ((w) + (word64(x) << 32)), ((y) + (word64(z) << 32)) } -# else -# define PACK32x4(w,x,y,z) { (w), (x), (y), (z) } -# endif -# endif // PACK32x4 -#endif // Microsoft workaround - // Clang __m128i casts #define M128_CAST(x) ((__m128i *)(void *)(x)) #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) @@ -95,9 +84,10 @@ bool CPU_ProbeSHA1() volatile bool result = true; __try { - uint32x4_t data1 = PACK32x4(1,2,3,4); - uint32x4_t data2 = PACK32x4(5,6,7,8); - uint32x4_t data3 = PACK32x4(9,10,11,12); + unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12}; + uint32x4_t data1 = vld1q_u32(w+0); + uint32x4_t data2 = vld1q_u32(w+4); + uint32x4_t data3 = vld1q_u32(w+8); uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2); uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2); @@ -130,7 +120,10 @@ bool CPU_ProbeSHA1() result = false; else { - uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; + unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12}; + uint32x4_t data1 = vld1q_u32(w+0); + uint32x4_t data2 = vld1q_u32(w+4); + uint32x4_t data3 = vld1q_u32(w+8); uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2); uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2); @@ -159,9 +152,10 @@ bool CPU_ProbeSHA2() volatile bool result = true; __try { - uint32x4_t data1 = PACK32x4(1,2,3,4); - uint32x4_t data2 = PACK32x4(5,6,7,8); - uint32x4_t data3 = PACK32x4(9,10,11,12); + unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12}; + uint32x4_t data1 = vld1q_u32(w+0); + uint32x4_t data2 = vld1q_u32(w+4); + uint32x4_t data3 = vld1q_u32(w+8); uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3); uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3); @@ -193,7 +187,10 @@ bool CPU_ProbeSHA2() result = false; else { - uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; + unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12}; + uint32x4_t data1 = vld1q_u32(w+0); + uint32x4_t data2 = vld1q_u32(w+4); + uint32x4_t data3 = vld1q_u32(w+8); uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3); uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
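For context, the new GF2NT233 field is wired in through eccrypto.cpp, so existing 233-bit binary-curve code should pick it up without API changes. A minimal usage sketch follows (assuming an installed copy of the library; the header paths and variable names are illustrative and not part of the patch): a scalar multiplication over sect233k1 ends up in GF2NT_233_Multiply_Reduce() whenever CLMUL, ARMv8 PMULL, or POWER8 VMULL is detected at run time.

#include "cryptopp/eccrypto.h"
#include "cryptopp/oids.h"
#include "cryptopp/osrng.h"
#include <iostream>

int main()
{
    using namespace CryptoPP;
    AutoSeededRandomPool prng;

    // sect233k1 is defined over GF(2^233) with p(x) = x^233 + x^74 + 1,
    // so the field arithmetic behind this point multiply goes through
    // GF2NT233::Multiply() and, when available, the SIMD reduce path.
    DL_GroupParameters_EC<EC2N> params(ASN1::sect233k1());
    const Integer k(prng, Integer::One(), params.GetSubgroupOrder() - 1);
    const EC2N::Point P = params.ExponentiateBase(k);

    std::cout << (P.identity ? "identity" : "point computed") << std::endl;
    return 0;
}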