diff --git a/adv-simd.h b/adv-simd.h
index 46bad837..cdb0311a 100644
--- a/adv-simd.h
+++ b/adv-simd.h
@@ -12,6 +12,7 @@
 // using two encrypt (or decrypt) functions: one that operates on 4 blocks,
 // and one that operates on 1 block.
 //
+// * AdvancedProcessBlocks64_2x1_SSE
 // * AdvancedProcessBlocks64_4x1_SSE
 // * AdvancedProcessBlocks128_4x1_SSE
 // * AdvancedProcessBlocks64_6x2_SSE
@@ -718,6 +719,155 @@ NAMESPACE_END  // CryptoPP
 
 NAMESPACE_BEGIN(CryptoPP)
 
+// F1 processes 1 64-bit block, F2 processes 2 64-bit blocks packed in one
+// XMM word. W is the subkey word type. Returns the number of bytes that
+// remain unprocessed (always less than one 8-byte block).
+template <typename F1, typename F2, typename W>
+inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2,
+    const W *subKeys, size_t rounds, const byte *inBlocks,
+    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    CRYPTOPP_ASSERT(subKeys);
+    CRYPTOPP_ASSERT(inBlocks);
+    CRYPTOPP_ASSERT(outBlocks);
+    CRYPTOPP_ASSERT(length >= 8);
+
+    CRYPTOPP_ALIGN_DATA(16)
+    const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
+    CRYPTOPP_ALIGN_DATA(16)
+    const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};
+
+    const ptrdiff_t blockSize = 8;
+    const ptrdiff_t xmmBlockSize = 16;
+
+    ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
+    ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
+    ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
+
+    // Clang and Coverity are generating findings using xorBlocks as a flag.
+    const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
+    const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
+
+    if (flags & BT_ReverseDirection)
+    {
+        inBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
+        xorBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
+        outBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
+        inIncrement = 0-inIncrement;
+        xorIncrement = 0-xorIncrement;
+        outIncrement = 0-outIncrement;
+    }
+
+    if (flags & BT_AllowParallel)
+    {
+        while (length >= 2*static_cast<size_t>(xmmBlockSize))
+        {
+            __m128i block0, block1;
+            if (flags & BT_InBlockIsCounter)
+            {
+                // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
+                // After the dup load we have two counters in the XMM word. Then we need
+                // to increment the low ctr by 0 and the high ctr by 1.
+                block0 = _mm_add_epi32(*CONST_M128_CAST(s_one32x4_1b), _mm_castpd_si128(
+                    _mm_loaddup_pd(CONST_DOUBLE_CAST(inBlocks))));
+
+                // After initial increment of {0,1} remaining counters increment by {2,2}.
+                const __m128i be2 = *CONST_M128_CAST(s_one32x4_2b);
+                block1 = _mm_add_epi32(be2, block0);
+
+                // Store the next counter. UBsan false positive; mem_addr can be unaligned.
+                _mm_store_sd(DOUBLE_CAST(inBlocks),
+                    _mm_castsi128_pd(_mm_add_epi64(be2, block1)));
+            }
+            else
+            {
+                block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
+                inBlocks += inIncrement;
+                block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
+                inBlocks += inIncrement;
+            }
+
+            if (xorInput)
+            {
+                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+            }
+
+            func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
+
+            if (xorOutput)
+            {
+                block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+                block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
+                xorBlocks += xorIncrement;
+            }
+
+            _mm_storeu_si128(M128_CAST(outBlocks), block0);
+            outBlocks += outIncrement;
+            _mm_storeu_si128(M128_CAST(outBlocks), block1);
+            outBlocks += outIncrement;
+
+            length -= 2*xmmBlockSize;
+        }
+    }
+
+    if (length)
+    {
+        // Adjust to real block size
+        if (flags & BT_ReverseDirection)
+        {
+            inIncrement += inIncrement ? blockSize : 0;
+            xorIncrement += xorIncrement ? blockSize : 0;
+            outIncrement += outIncrement ? blockSize : 0;
+            inBlocks -= inIncrement;
+            xorBlocks -= xorIncrement;
+            outBlocks -= outIncrement;
+        }
+        else
+        {
+            inIncrement -= inIncrement ? blockSize : 0;
+            xorIncrement -= xorIncrement ? blockSize : 0;
+            outIncrement -= outIncrement ? blockSize : 0;
+        }
+
+        while (length >= static_cast<size_t>(blockSize))
+        {
+            __m128i block = _mm_castpd_si128(
+                // UBsan false positive; mem_addr can be unaligned.
+                _mm_load_sd(CONST_DOUBLE_CAST(inBlocks)));
+
+            if (xorInput)
+            {
+                block = _mm_xor_si128(block, _mm_castpd_si128(
+                    // UBsan false positive; mem_addr can be unaligned.
+                    _mm_load_sd(CONST_DOUBLE_CAST(xorBlocks))));
+            }
+
+            if (flags & BT_InBlockIsCounter)
+                const_cast<byte *>(inBlocks)[7]++;
+
+            func1(block, subKeys, static_cast<unsigned int>(rounds));
+
+            if (xorOutput)
+            {
+                block = _mm_xor_si128(block, _mm_castpd_si128(
+                    // UBsan false positive; mem_addr can be unaligned.
+                    _mm_load_sd(CONST_DOUBLE_CAST(xorBlocks))));
+            }
+
+            // UBsan false positive; mem_addr can be unaligned.
+            _mm_store_sd(DOUBLE_CAST(outBlocks), _mm_castsi128_pd(block));
+
+            inBlocks += inIncrement;
+            outBlocks += outIncrement;
+            xorBlocks += xorIncrement;
+            length -= blockSize;
+        }
+    }
+
+    return length;
+}
+
 /// \brief AdvancedProcessBlocks for 2 and 6 blocks
 /// \tparam F2 function to process 2 64-bit blocks
 /// \tparam F6 function to process 6 64-bit blocks
diff --git a/cham-simd.cpp b/cham-simd.cpp
index 0a4ed668..9d10adf1 100644
--- a/cham-simd.cpp
+++ b/cham-simd.cpp
@@ -24,10 +24,754 @@
 
 ANONYMOUS_NAMESPACE_BEGIN
 
+using CryptoPP::word16;
 using CryptoPP::word32;
 
 #if (CRYPTOPP_SSSE3_AVAILABLE)
 
+//////////////////////////////////////////////////////////////////////////
+
+NAMESPACE_BEGIN(W16)  // CHAM64, 16-bit word size
+
+template <unsigned int R>
+inline __m128i RotateLeft16(const __m128i& val)
+{
+    return _mm_or_si128(
+        _mm_slli_epi16(val, R), _mm_srli_epi16(val, 16-R));
+}
+
+template <unsigned int R>
+inline __m128i RotateRight16(const __m128i& val)
+{
+    return _mm_or_si128(
+        _mm_slli_epi16(val, 16-R), _mm_srli_epi16(val, R));
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline __m128i RotateLeft16<8>(const __m128i& val)
+{
+    const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
+    return _mm_shuffle_epi8(val, mask);
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline __m128i RotateRight16<8>(const __m128i& val)
+{
+    const __m128i mask = _mm_set_epi8(14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1);
+    return _mm_shuffle_epi8(val, mask);
+}
+
+template <unsigned int IDX>
+inline __m128i UnpackXMM(__m128i a, __m128i b, __m128i c, __m128i d,
+    __m128i e, __m128i f, __m128i g, __m128i h)
+{
+    // Should not be instantiated
+    CRYPTOPP_ASSERT(0);
+    return _mm_setzero_si128();
+}
+
+template <>
+inline __m128i UnpackXMM<0>(__m128i a, __m128i b, __m128i c, __m128i d,
+    __m128i e, __m128i f, __m128i g, __m128i h)
+{
+    // The shuffle converts to and from little-endian for SSE. A specialized
+    // CHAM implementation can avoid the shuffle by framing the data for
+    // encryption, decryption and benchmarks. The library cannot take the
+    // speed-up because of the byte oriented API.
+    const __m128i r1 = _mm_unpacklo_epi16(a, b);
+    const __m128i r2 = _mm_unpacklo_epi16(c, d);
+    const __m128i r3 = _mm_unpacklo_epi16(e, f);
+    const __m128i r4 = _mm_unpacklo_epi16(g, h);
+
+    const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
+    const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
+    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
+        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
+}
+
+template <>
+inline __m128i UnpackXMM<1>(__m128i a, __m128i b, __m128i c, __m128i d,
+    __m128i e, __m128i f, __m128i g, __m128i h)
+{
+    // The shuffle converts to and from little-endian for SSE. A specialized
+    // CHAM implementation can avoid the shuffle by framing the data for
+    // encryption, decryption and benchmarks. The library cannot take the
+    // speed-up because of the byte oriented API.
+    const __m128i r1 = _mm_unpacklo_epi16(a, b);
+    const __m128i r2 = _mm_unpacklo_epi16(c, d);
+    const __m128i r3 = _mm_unpacklo_epi16(e, f);
+    const __m128i r4 = _mm_unpacklo_epi16(g, h);
+
+    const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
+    const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
+    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
+        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
+}
+
+template <>
+inline __m128i UnpackXMM<2>(__m128i a, __m128i b, __m128i c, __m128i d,
+    __m128i e, __m128i f, __m128i g, __m128i h)
+{
+    // The shuffle converts to and from little-endian for SSE. A specialized
+    // CHAM implementation can avoid the shuffle by framing the data for
+    // encryption, decryption and benchmarks. The library cannot take the
+    // speed-up because of the byte oriented API.
+    const __m128i r1 = _mm_unpacklo_epi16(a, b);
+    const __m128i r2 = _mm_unpacklo_epi16(c, d);
+    const __m128i r3 = _mm_unpacklo_epi16(e, f);
+    const __m128i r4 = _mm_unpacklo_epi16(g, h);
+
+    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
+    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
+    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
+        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
+}
+
+template <>
+inline __m128i UnpackXMM<3>(__m128i a, __m128i b, __m128i c, __m128i d,
+    __m128i e, __m128i f, __m128i g, __m128i h)
+{
+    // The shuffle converts to and from little-endian for SSE. A specialized
+    // CHAM implementation can avoid the shuffle by framing the data for
+    // encryption, decryption and benchmarks. The library cannot take the
+    // speed-up because of the byte oriented API.
+    const __m128i r1 = _mm_unpacklo_epi16(a, b);
+    const __m128i r2 = _mm_unpacklo_epi16(c, d);
+    const __m128i r3 = _mm_unpacklo_epi16(e, f);
+    const __m128i r4 = _mm_unpacklo_epi16(g, h);
+
+    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
+    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
+    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
+        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
+}
+
+template <>
+inline __m128i UnpackXMM<4>(__m128i a, __m128i b, __m128i c, __m128i d,
+    __m128i e, __m128i f, __m128i g, __m128i h)
+{
+    // The shuffle converts to and from little-endian for SSE. A specialized
+    // CHAM implementation can avoid the shuffle by framing the data for
+    // encryption, decryption and benchmarks. The library cannot take the
+    // speed-up because of the byte oriented API.
+    const __m128i r1 = _mm_unpackhi_epi16(a, b);
+    const __m128i r2 = _mm_unpackhi_epi16(c, d);
+    const __m128i r3 = _mm_unpackhi_epi16(e, f);
+    const __m128i r4 = _mm_unpackhi_epi16(g, h);
+
+    const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
+    const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
+    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
+        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
+}
+
+template <>
+inline __m128i UnpackXMM<5>(__m128i a, __m128i b, __m128i c, __m128i d,
+    __m128i e, __m128i f, __m128i g, __m128i h)
+{
+    // The shuffle converts to and from little-endian for SSE. A specialized
+    // CHAM implementation can avoid the shuffle by framing the data for
+    // encryption, decryption and benchmarks. The library cannot take the
+    // speed-up because of the byte oriented API.
+    const __m128i r1 = _mm_unpackhi_epi16(a, b);
+    const __m128i r2 = _mm_unpackhi_epi16(c, d);
+    const __m128i r3 = _mm_unpackhi_epi16(e, f);
+    const __m128i r4 = _mm_unpackhi_epi16(g, h);
+
+    const __m128i r5 = _mm_unpacklo_epi32(r1, r2);
+    const __m128i r6 = _mm_unpacklo_epi32(r3, r4);
+    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
+        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
+}
+
+template <>
+inline __m128i UnpackXMM<6>(__m128i a, __m128i b, __m128i c, __m128i d,
+    __m128i e, __m128i f, __m128i g, __m128i h)
+{
+    // The shuffle converts to and from little-endian for SSE. A specialized
+    // CHAM implementation can avoid the shuffle by framing the data for
+    // encryption, decryption and benchmarks. The library cannot take the
+    // speed-up because of the byte oriented API.
+    const __m128i r1 = _mm_unpackhi_epi16(a, b);
+    const __m128i r2 = _mm_unpackhi_epi16(c, d);
+    const __m128i r3 = _mm_unpackhi_epi16(e, f);
+    const __m128i r4 = _mm_unpackhi_epi16(g, h);
+
+    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
+    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
+    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r5, r6),
+        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
+}
+
+template <>
+inline __m128i UnpackXMM<7>(__m128i a, __m128i b, __m128i c, __m128i d,
+    __m128i e, __m128i f, __m128i g, __m128i h)
+{
+    // The shuffle converts to and from little-endian for SSE. A specialized
+    // CHAM implementation can avoid the shuffle by framing the data for
+    // encryption, decryption and benchmarks. The library cannot take the
+    // speed-up because of the byte oriented API.
+    const __m128i r1 = _mm_unpackhi_epi16(a, b);
+    const __m128i r2 = _mm_unpackhi_epi16(c, d);
+    const __m128i r3 = _mm_unpackhi_epi16(e, f);
+    const __m128i r4 = _mm_unpackhi_epi16(g, h);
+
+    const __m128i r5 = _mm_unpackhi_epi32(r1, r2);
+    const __m128i r6 = _mm_unpackhi_epi32(r3, r4);
+    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r5, r6),
+        _mm_set_epi8(14,15,12,13, 10,11,8,9, 6,7,4,5, 2,3,0,1));
+}
+
+template <unsigned int IDX>
+inline __m128i UnpackXMM(__m128i v)
+{
+    // Should not be instantiated
+    CRYPTOPP_ASSERT(0);
+    return _mm_setzero_si128();
+}
+
+template <>
+inline __m128i UnpackXMM<0>(__m128i v)
+{
+    return _mm_shuffle_epi8(v, _mm_set_epi8(0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1));
+}
+
+template <>
+inline __m128i UnpackXMM<1>(__m128i v)
+{
+    return _mm_shuffle_epi8(v, _mm_set_epi8(2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3, 2,3));
+}
+
+template <>
+inline __m128i UnpackXMM<2>(__m128i v)
+{
+    return _mm_shuffle_epi8(v, _mm_set_epi8(4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5, 4,5));
+}
+
+template <>
+inline __m128i UnpackXMM<3>(__m128i v)
+{
+    return _mm_shuffle_epi8(v, _mm_set_epi8(6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7));
+}
+
+template <>
+inline __m128i UnpackXMM<4>(__m128i v)
+{
+    return _mm_shuffle_epi8(v, _mm_set_epi8(8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9, 8,9));
+}
+
+template <>
+inline __m128i UnpackXMM<5>(__m128i v)
+{
+    return _mm_shuffle_epi8(v, _mm_set_epi8(10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11));
+}
+
+template <>
+inline __m128i UnpackXMM<6>(__m128i v)
+{
+    return _mm_shuffle_epi8(v, _mm_set_epi8(12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13));
+}
+
+template <>
+inline __m128i UnpackXMM<7>(__m128i v)
+{
+    return _mm_shuffle_epi8(v, _mm_set_epi8(14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15));
+}
+
+template <unsigned int IDX>
+inline __m128i UnpackXMM(__m128i a, __m128i b)
+{
+    const __m128i z = _mm_setzero_si128();
+    return UnpackXMM<IDX>(a, b, z, z, z, z, z, z);
+}
+
+template <unsigned int IDX>
+inline __m128i RepackXMM(__m128i a, __m128i b, __m128i c, __m128i d,
+    __m128i e,
+    __m128i f, __m128i g, __m128i h)
+{
+    return UnpackXMM<IDX>(a, b, c, d, e, f, g, h);
+}
+
+template <unsigned int IDX>
+inline __m128i RepackXMM(__m128i v)
+{
+    return UnpackXMM<IDX>(v);
+}
+
+inline void GCC_NO_UBSAN CHAM64_Enc_Block(__m128i &block0,
+    const word16 *subkeys, unsigned int /*rounds*/)
+{
+    // Rearrange the data for vectorization. UnpackXMM includes a
+    // little-endian swap for SSE. Thanks to Peter Cordes for help
+    // with packing and unpacking.
+    // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
+    __m128i a = UnpackXMM<0>(block0);
+    __m128i b = UnpackXMM<1>(block0);
+    __m128i c = UnpackXMM<2>(block0);
+    __m128i d = UnpackXMM<3>(block0);
+    __m128i e = UnpackXMM<4>(block0);
+    __m128i f = UnpackXMM<5>(block0);
+    __m128i g = UnpackXMM<6>(block0);
+    __m128i h = UnpackXMM<7>(block0);
+
+    const unsigned int rounds = 80;
+    __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
+    __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
+
+    const unsigned int MASK = 15;
+    for (int i=0; i<static_cast<int>(rounds); i+=8)
+    {
+        __m128i k, kr, t1, t2, t3, t4;
+
+        k = _mm_loadu_si128((const __m128i*) &subkeys[i & MASK]);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
+
+        t1 = _mm_xor_si128(a, counter);
+        t3 = _mm_xor_si128(e, counter);
+        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
+        a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
+        e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
+
+        t1 = _mm_xor_si128(b, counter);
+        t3 = _mm_xor_si128(f, counter);
+        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
+        b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
+        f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
+
+        t1 = _mm_xor_si128(c, counter);
+        t3 = _mm_xor_si128(g, counter);
+        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
+        c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
+        g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
+
+        t1 = _mm_xor_si128(d, counter);
+        t3 = _mm_xor_si128(h, counter);
+        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
+        d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
+        h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(9,8,9,8, 9,8,9,8, 9,8,9,8, 9,8,9,8));
+
+        t1 = _mm_xor_si128(a, counter);
+        t3 = _mm_xor_si128(e, counter);
+        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
+        a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
+        e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(11,10,11,10, 11,10,11,10, 11,10,11,10, 11,10,11,10));
+
+        t1 = _mm_xor_si128(b, counter);
+        t3 = _mm_xor_si128(f, counter);
+        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
+        b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
+        f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(13,12,13,12, 13,12,13,12, 13,12,13,12, 13,12,13,12));
+
+        t1 = _mm_xor_si128(c, counter);
+        t3 = _mm_xor_si128(g, counter);
+        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
+        c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
+        g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(15,14,15,14, 15,14,15,14, 15,14,15,14, 15,14,15,14));
+
+        t1 = _mm_xor_si128(d, counter);
+        t3 = _mm_xor_si128(h, counter);
+        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
+        d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
+        h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+    }
+
+    // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
+    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
+}
+
+inline void GCC_NO_UBSAN CHAM64_Dec_Block(__m128i &block0,
+    const word16 *subkeys, unsigned int /*rounds*/)
+{
+    // Rearrange the data for vectorization. UnpackXMM includes a
+    // little-endian swap for SSE. Thanks to Peter Cordes for help
+    // with packing and unpacking.
+    // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
+    __m128i a = UnpackXMM<0>(block0);
+    __m128i b = UnpackXMM<1>(block0);
+    __m128i c = UnpackXMM<2>(block0);
+    __m128i d = UnpackXMM<3>(block0);
+    __m128i e = UnpackXMM<4>(block0);
+    __m128i f = UnpackXMM<5>(block0);
+    __m128i g = UnpackXMM<6>(block0);
+    __m128i h = UnpackXMM<7>(block0);
+
+    const unsigned int rounds = 80;
+    __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
+    __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
+
+    const unsigned int MASK = 15;
+    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=8)
+    {
+        __m128i k, kr, t1, t2, t3, t4;
+
+        k = _mm_loadu_si128((const __m128i*) &subkeys[(i-7) & MASK]);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(15,14,15,14, 15,14,15,14, 15,14,15,14, 15,14,15,14));
+
+        // Odd round
+        t1 = RotateRight16<1>(d);
+        t3 = RotateRight16<1>(h);
+        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
+        d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(13,12,13,12, 13,12,13,12, 13,12,13,12, 13,12,13,12));
+
+        // Even round
+        t1 = RotateRight16<8>(c);
+        t3 = RotateRight16<8>(g);
+        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
+        c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(11,10,11,10, 11,10,11,10, 11,10,11,10, 11,10,11,10));
+
+        // Odd round
+        t1 = RotateRight16<1>(b);
+        t3 = RotateRight16<1>(f);
+        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
+        b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(9,8,9,8, 9,8,9,8, 9,8,9,8, 9,8,9,8));
+
+        // Even round
+        t1 = RotateRight16<8>(a);
+        t3 = RotateRight16<8>(e);
+        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
+        a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
+
+        // Odd round
+        t1 = RotateRight16<1>(d);
+        t3 = RotateRight16<1>(h);
+        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
+        d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
+
+        // Even round
+        t1 = RotateRight16<8>(c);
+        t3 = RotateRight16<8>(g);
+        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
+        c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
+
+        // Odd round
+        t1 = RotateRight16<1>(b);
+        t3 =
+            RotateRight16<1>(f);
+        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
+        b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
+
+        // Even round
+        t1 = RotateRight16<8>(a);
+        t3 = RotateRight16<8>(e);
+        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
+        a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+    }
+
+    // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
+    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
+}
+
+inline void GCC_NO_UBSAN CHAM64_Enc_2_Blocks(__m128i &block0,
+    __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
+{
+    // Rearrange the data for vectorization. UnpackXMM includes a
+    // little-endian swap for SSE. Thanks to Peter Cordes for help
+    // with packing and unpacking.
+    // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
+    __m128i a = UnpackXMM<0>(block0, block1);
+    __m128i b = UnpackXMM<1>(block0, block1);
+    __m128i c = UnpackXMM<2>(block0, block1);
+    __m128i d = UnpackXMM<3>(block0, block1);
+    __m128i e = UnpackXMM<4>(block0, block1);
+    __m128i f = UnpackXMM<5>(block0, block1);
+    __m128i g = UnpackXMM<6>(block0, block1);
+    __m128i h = UnpackXMM<7>(block0, block1);
+
+    const unsigned int rounds = 80;
+    __m128i counter = _mm_set_epi16(0,0,0,0,0,0,0,0);
+    __m128i increment = _mm_set_epi16(1,1,1,1,1,1,1,1);
+
+    const unsigned int MASK = 15;
+    for (int i=0; i<static_cast<int>(rounds); i+=8)
+    {
+        __m128i k, kr, t1, t2, t3, t4;
+
+        k = _mm_loadu_si128((const __m128i*) &subkeys[i & MASK]);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
+
+        t1 = _mm_xor_si128(a, counter);
+        t3 = _mm_xor_si128(e, counter);
+        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
+        a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
+        e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
+
+        t1 = _mm_xor_si128(b, counter);
+        t3 = _mm_xor_si128(f, counter);
+        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
+        b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
+        f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
+
+        t1 = _mm_xor_si128(c, counter);
+        t3 = _mm_xor_si128(g, counter);
+        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
+        c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
+        g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
+
+        t1 = _mm_xor_si128(d, counter);
+        t3 = _mm_xor_si128(h, counter);
+        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
+        d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
+        h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(9,8,9,8, 9,8,9,8, 9,8,9,8, 9,8,9,8));
+
+        t1 = _mm_xor_si128(a, counter);
+        t3 = _mm_xor_si128(e, counter);
+        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
+        a = RotateLeft16<8>(_mm_add_epi16(t1, t2));
+        e = RotateLeft16<8>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(11,10,11,10, 11,10,11,10, 11,10,11,10, 11,10,11,10));
+
+        t1 = _mm_xor_si128(b, counter);
+        t3 = _mm_xor_si128(f, counter);
+        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
+        b = RotateLeft16<1>(_mm_add_epi16(t1, t2));
+        f = RotateLeft16<1>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(13,12,13,12, 13,12,13,12, 13,12,13,12, 13,12,13,12));
+
+        t1 = _mm_xor_si128(c, counter);
+        t3 = _mm_xor_si128(g, counter);
+        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
+        c = RotateLeft16<8>(_mm_add_epi16(t1, t2));
+        g = RotateLeft16<8>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(15,14,15,14, 15,14,15,14, 15,14,15,14, 15,14,15,14));
+
+        t1 = _mm_xor_si128(d, counter);
+        t3 = _mm_xor_si128(h, counter);
+        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
+        d = RotateLeft16<1>(_mm_add_epi16(t1, t2));
+        h = RotateLeft16<1>(_mm_add_epi16(t3, t4));
+
+        counter = _mm_add_epi16(counter, increment);
+    }
+
+    // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
+    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
+    block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
+}
+
+inline void GCC_NO_UBSAN CHAM64_Dec_2_Blocks(__m128i &block0,
+    __m128i &block1, const word16 *subkeys, unsigned int /*rounds*/)
+{
+    // Rearrange the data for vectorization. UnpackXMM includes a
+    // little-endian swap for SSE. Thanks to Peter Cordes for help
+    // with packing and unpacking.
+    // [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ... => [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ...
+    __m128i a = UnpackXMM<0>(block0, block1);
+    __m128i b = UnpackXMM<1>(block0, block1);
+    __m128i c = UnpackXMM<2>(block0, block1);
+    __m128i d = UnpackXMM<3>(block0, block1);
+    __m128i e = UnpackXMM<4>(block0, block1);
+    __m128i f = UnpackXMM<5>(block0, block1);
+    __m128i g = UnpackXMM<6>(block0, block1);
+    __m128i h = UnpackXMM<7>(block0, block1);
+
+    const unsigned int rounds = 80;
+    __m128i counter = _mm_set_epi16(rounds-1,rounds-1,rounds-1,rounds-1, rounds-1,rounds-1,rounds-1,rounds-1);
+    __m128i decrement = _mm_set_epi16(1,1,1,1,1,1,1,1);
+
+    const unsigned int MASK = 15;
+    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=8)
+    {
+        __m128i k, kr, t1, t2, t3, t4;
+
+        k = _mm_loadu_si128((const __m128i*) &subkeys[(i-7) & MASK]);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(15,14,15,14, 15,14,15,14, 15,14,15,14, 15,14,15,14));
+
+        // Odd round
+        t1 = RotateRight16<1>(d);
+        t3 = RotateRight16<1>(h);
+        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
+        d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(13,12,13,12, 13,12,13,12, 13,12,13,12, 13,12,13,12));
+
+        // Even round
+        t1 = RotateRight16<8>(c);
+        t3 = RotateRight16<8>(g);
+        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
+        c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter =
+            _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(11,10,11,10, 11,10,11,10, 11,10,11,10, 11,10,11,10));
+
+        // Odd round
+        t1 = RotateRight16<1>(b);
+        t3 = RotateRight16<1>(f);
+        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
+        b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(9,8,9,8, 9,8,9,8, 9,8,9,8, 9,8,9,8));
+
+        // Even round
+        t1 = RotateRight16<8>(a);
+        t3 = RotateRight16<8>(e);
+        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
+        a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,7,6, 7,6,7,6, 7,6,7,6, 7,6,7,6));
+
+        // Odd round
+        t1 = RotateRight16<1>(d);
+        t3 = RotateRight16<1>(h);
+        t2 = _mm_xor_si128(RotateLeft16<8>(a), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(e), kr);
+        d = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        h = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(5,4,5,4, 5,4,5,4, 5,4,5,4, 5,4,5,4));
+
+        // Even round
+        t1 = RotateRight16<8>(c);
+        t3 = RotateRight16<8>(g);
+        t2 = _mm_xor_si128(RotateLeft16<1>(d), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(h), kr);
+        c = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        g = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,3,2, 3,2,3,2, 3,2,3,2, 3,2,3,2));
+
+        // Odd round
+        t1 = RotateRight16<1>(b);
+        t3 = RotateRight16<1>(f);
+        t2 = _mm_xor_si128(RotateLeft16<8>(c), kr);
+        t4 = _mm_xor_si128(RotateLeft16<8>(g), kr);
+        b = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        f = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+        kr = _mm_shuffle_epi8(k, _mm_set_epi8(1,0,1,0, 1,0,1,0, 1,0,1,0, 1,0,1,0));
+
+        // Even round
+        t1 = RotateRight16<8>(a);
+        t3 = RotateRight16<8>(e);
+        t2 = _mm_xor_si128(RotateLeft16<1>(b), kr);
+        t4 = _mm_xor_si128(RotateLeft16<1>(f), kr);
+        a = _mm_xor_si128(_mm_sub_epi16(t1, t2), counter);
+        e = _mm_xor_si128(_mm_sub_epi16(t3, t4), counter);
+
+        counter = _mm_sub_epi16(counter, decrement);
+    }
+
+    // [A1 B1 .. G1 H1][A2 B2 .. G2 H2] ... => [A1 A2 .. A6 A7][B1 B2 .. B6 B7] ...
+    block0 = RepackXMM<0>(a,b,c,d,e,f,g,h);
+    block1 = RepackXMM<1>(a,b,c,d,e,f,g,h);
+}
+
+NAMESPACE_END  // W16
+
+//////////////////////////////////////////////////////////////////////////
+
+NAMESPACE_BEGIN(W32)  // CHAM128, 32-bit word size
+
 template <unsigned int R>
 inline __m128i RotateLeft32(const __m128i& val)
 {
@@ -408,24 +1152,42 @@ inline void GCC_NO_UBSAN CHAM128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
     block3 = RepackXMM<3>(a,b,c,d);
 }
 
-#endif
+//////////////////////////////////////////////////////////////////////////
+
+NAMESPACE_END  // W32
+
+#endif  // CRYPTOPP_SSSE3_AVAILABLE
 
 ANONYMOUS_NAMESPACE_END
 
 NAMESPACE_BEGIN(CryptoPP)
 
 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
 
+size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Enc_Block, W16::CHAM64_Enc_2_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t CHAM64_Dec_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks64_2x1_SSE(W16::CHAM64_Dec_Block, W16::CHAM64_Dec_2_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
 size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
-    return AdvancedProcessBlocks128_4x1_SSE(CHAM128_Enc_Block, CHAM128_Enc_4_Blocks,
+    return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Enc_Block, W32::CHAM128_Enc_4_Blocks,
         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 }
 
 size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
 {
-    return AdvancedProcessBlocks128_4x1_SSE(CHAM128_Dec_Block, CHAM128_Dec_4_Blocks,
+    return AdvancedProcessBlocks128_4x1_SSE(W32::CHAM128_Dec_Block, W32::CHAM128_Dec_4_Blocks,
         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 }
 
 #endif  // CRYPTOPP_SSSE3_AVAILABLE
diff --git a/cham.cpp b/cham.cpp
index efd869af..9ae866d6 100644
--- a/cham.cpp
+++ b/cham.cpp
@@ -96,7 +96,13 @@ ANONYMOUS_NAMESPACE_END
 
 NAMESPACE_BEGIN(CryptoPP)
 
-#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
+#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
+extern size_t CHAM64_Enc_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t CHAM64_Dec_AdvancedProcessBlocks_SSSE3(const word16* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
 extern size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
 
@@ -308,7 +314,27 @@ void CHAM128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock,
     oblock(m_x[0])(m_x[1])(m_x[2])(m_x[3]);
 }
 
-#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
+#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
+size_t CHAM64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+    if (HasSSSE3()) {
+        return CHAM64_Enc_AdvancedProcessBlocks_SSSE3(m_rk, 80,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+    }
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t CHAM64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+    if (HasSSSE3()) {
+        return CHAM64_Dec_AdvancedProcessBlocks_SSSE3(m_rk, 80,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+    }
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
 size_t CHAM128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
     byte *outBlocks, size_t length, word32 flags) const
 {
@@ -330,6 +356,6 @@ size_t CHAM128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xor
     }
     return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
 }
-#endif  // CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
+#endif  // CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
 
 NAMESPACE_END
diff --git a/cham.h b/cham.h
index 1c8b02b1..0d0da501 100644
--- a/cham.h
+++ b/cham.h
@@ -16,7 +16,7 @@
 #include "algparam.h"
 
 #if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
-# define CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS 1
+# define CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS 1
 #endif
 
 NAMESPACE_BEGIN(CryptoPP)
@@ -74,6 +74,10 @@ public:
     {
     public:
         void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+
+#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
     };
 
     /// \brief Provides implementation for encryption transformation
@@ -84,6 +88,10 @@ public:
     {
     public:
         void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+
+#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte
*xorBlocks, byte *outBlocks, size_t length, word32 flags) const; +#endif }; typedef BlockCipherFinal Encryption; @@ -125,7 +133,7 @@ public: public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; -#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS +#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; #endif }; @@ -139,7 +147,7 @@ public: public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; -#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS +#if CRYPTOPP_CHAM_ADVANCED_PROCESS_BLOCKS size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; #endif };