Add SSSE3 intrinsics for SPECK-128 (GH #538)

Performance increased by about 100% on a 3.1 GHz Core i5 Skylake. Throughput went from about 7.3 cpb to about 3.5 cpb. Not bad for a software-based implementation of a block cipher
pull/548/head
Jeffrey Walton 2017-11-22 07:57:39 -05:00
parent c447918815
commit 39697d92bf
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
3 changed files with 107 additions and 49 deletions

View File

@ -56,7 +56,13 @@ GCC_COMPILER := $(shell $(CXX) --version 2>/dev/null | $(GREP) -v -E '(llvm|clan
XLC_COMPILER := $(shell $(CXX) $(CXX) -qversion 2>/dev/null |$(GREP) -i -c "IBM XL")
CLANG_COMPILER := $(shell $(CXX) --version 2>/dev/null | $(GREP) -i -c -E '(llvm|clang)')
INTEL_COMPILER := $(shell $(CXX) --version 2>/dev/null | $(GREP) -i -c '\(icc\)')
# Various Port compilers on OS X
MACPORTS_COMPILER := $(shell $(CXX) --version 2>/dev/null | $(GREP) -i -c "macports")
HOMEBREW_COMPILER := $(shell $(CXX) --version 2>/dev/null | $(GREP) -i -c "homebrew")
ifneq ($(MACPORTS_COMPILER)$(HOMEBREW_COMPILER),00)
OSXPORT_COMPILER := 1
endif
# Sun Studio 12.0 provides SunCC 0x0510; and Sun Studio 12.3 provides SunCC 0x0512
SUNCC_510_OR_LATER := $(shell $(CXX) -V 2>&1 | $(GREP) -i -c -E "CC: (Sun|Studio) .* (5\.1[0-9]|5\.[2-9]|6\.)")
@ -175,7 +181,7 @@ endif # IS_MINGW32
ifneq ($(IS_X86)$(IS_X32)$(IS_X64),000)
# Fixup. Clang reports an error rather than "LLVM assembler" or similar.
ifneq ($(MACPORTS_COMPILER),1)
ifneq ($(OSXPORT_COMPILER),1)
HAVE_GAS := $(shell $(CXX) -xc -c /dev/null -Wa,-v -o/dev/null 2>&1 | $(GREP) -c "GNU assembler")
endif
@ -237,6 +243,7 @@ ifeq ($(findstring -DCRYPTOPP_DISABLE_SSSE3,$(CXXFLAGS)),)
ifeq ($(HAVE_SSSE3),1)
ARIA_FLAG = -mssse3
SSSE3_FLAG = -mssse3
SPECK_FLAG = -mssse3
endif
ifeq ($(findstring -DCRYPTOPP_DISABLE_SSE4,$(CXXFLAGS)),)
HAVE_SSE4 = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.1 -dM -E - 2>/dev/null | $(GREP) -i -c __SSE4_1__)
@ -272,6 +279,7 @@ ifeq ($(SUN_COMPILER),1)
ifeq ($(COUNT),0)
SSSE3_FLAG = -xarch=ssse3 -D__SSSE3__=1
ARIA_FLAG = -xarch=ssse3 -D__SSSE3__=1
SPECK_FLAG = -xarch=ssse3 -D__SSSE3__=1
LDFLAGS += -xarch=ssse3
endif
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
@ -307,9 +315,9 @@ ifneq ($(INTEL_COMPILER),0)
endif
endif
# Tell MacPorts GCC to use Clang integrated assembler
# Tell MacPorts and Homebrew GCC to use Clang integrated assembler
# http://github.com/weidai11/cryptopp/issues/190
ifeq ($(GCC_COMPILER)$(MACPORTS_COMPILER),11)
ifeq ($(GCC_COMPILER)$(OSXPORT_COMPILER),11)
ifeq ($(findstring -Wa,-q,$(CXXFLAGS)),)
CXXFLAGS += -Wa,-q
endif
@ -1033,6 +1041,10 @@ sha-simd.o : sha-simd.cpp
shacal2-simd.o : shacal2-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $<
# SSE4.1 or ARMv8a available
speck-simd.o : speck-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SPECK_FLAG) -c) $<
# Don't build Threefish with UBsan on Travis CI. Timeouts cause the build to fail.
# Also see http://stackoverflow.com/q/12983137/608639.
ifeq ($(findstring true,$(CI)),true)

127
speck.cpp
View File

@ -1,10 +1,14 @@
// speck.h - written and placed in the public domain by Jeffrey Walton
// speck.cpp - written and placed in the public domain by Jeffrey Walton
#include "pch.h"
#include "config.h"
#include "speck.h"
#include "misc.h"
#include "cpu.h"
// Uncomment to benchmark C/C++, and to isolate SSE code.
// #undef CRYPTOPP_SSSE3_AVAILABLE
ANONYMOUS_NAMESPACE_BEGIN
@ -21,7 +25,7 @@ using CryptoPP::rotrFixed;
//! additional template parameters also made calling SPECK_Encrypt and SPECK_Decrypt
//! kind of messy.
template <class W>
inline void TF83(W& x, W& y, const W& k)
inline void TF83(W& x, W& y, const W k)
{
x = rotrFixed(x, 8);
x += y; x ^= k;
@ -37,12 +41,12 @@ inline void TF83(W& x, W& y, const W& k)
//! additional template parameters also made calling SPECK_Encrypt and SPECK_Decrypt
//! kind of messy.
template <class W>
inline void TR83(W& x, W& y, const W& k)
inline void TR83(W& x, W& y, const W k)
{
y^=x;
y=rotrFixed(y,3);
x^=k; x-=y;
x=rotlFixed(x,8);
y ^= x;
y = rotrFixed(y,3);
x ^= k; x -= y;
x = rotlFixed(x,8);
}
//! \brief Forward transformation
@ -89,7 +93,7 @@ inline void SPECK_ExpandKey_2W(W key[R], const W k[2])
CRYPTOPP_ASSERT(R==32);
W i=0, B=k[0], A=k[1];
while(i<R-1)
while (i<R-1)
{
key[i]=A; TF83(B, A, i);
i++;
@ -110,7 +114,7 @@ inline void SPECK_ExpandKey_3W(W key[R], const W k[3])
W i=0, C=k[0], B=k[1], A=k[2];
unsigned int blocks = R/2;
while(blocks--)
while (blocks--)
{
key[i+0]=A; TF83(B, A, i+0);
key[i+1]=A; TF83(C, A, i+1);
@ -137,7 +141,7 @@ inline void SPECK_ExpandKey_4W(W key[R], const W k[4])
W i=0, D=k[0], C=k[1], B=k[2], A=k[3];
unsigned int blocks = R/3;
while(blocks--)
while (blocks--)
{
key[i+0]=A; TF83(B, A, i+0);
key[i+1]=A; TF83(C, A, i+1);
@ -163,6 +167,14 @@ ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
extern size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
void SPECK64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params)
{
CRYPTOPP_ASSERT(keyLength == 12 || keyLength == 16);
@ -177,12 +189,14 @@ void SPECK64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
switch (m_kwords)
{
case 3:
m_rkey.New(26);
SPECK_ExpandKey_3W<word32, 26>(m_rkey, m_wspace);
m_rkeys.New(26);
m_rounds = 26;
SPECK_ExpandKey_3W<word32, 26>(m_rkeys, m_wspace);
break;
case 4:
m_rkey.New(27);
SPECK_ExpandKey_4W<word32, 27>(m_rkey, m_wspace);
m_rkeys.New(27);
m_rounds = 27;
SPECK_ExpandKey_4W<word32, 27>(m_rkeys, m_wspace);
break;
default:
CRYPTOPP_ASSERT(0);;
@ -195,13 +209,13 @@ void SPECK64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock,
typedef GetBlock<word32, BigEndian, false> InBlock;
InBlock iblk(inBlock); iblk(m_wspace[0])(m_wspace[1]);
switch (m_kwords)
switch (m_rounds)
{
case 3:
SPECK_Encrypt<word32, 26>(m_wspace+2, m_wspace+0, m_rkey);
case 26:
SPECK_Encrypt<word32, 26>(m_wspace+2, m_wspace+0, m_rkeys);
break;
case 4:
SPECK_Encrypt<word32, 27>(m_wspace+2, m_wspace+0, m_rkey);
case 27:
SPECK_Encrypt<word32, 27>(m_wspace+2, m_wspace+0, m_rkeys);
break;
default:
CRYPTOPP_ASSERT(0);;
@ -218,13 +232,13 @@ void SPECK64::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock,
typedef GetBlock<word32, BigEndian, false> InBlock;
InBlock iblk(inBlock); iblk(m_wspace[0])(m_wspace[1]);
switch (m_kwords)
switch (m_rounds)
{
case 3:
SPECK_Decrypt<word32, 26>(m_wspace+2, m_wspace+0, m_rkey);
case 26:
SPECK_Decrypt<word32, 26>(m_wspace+2, m_wspace+0, m_rkeys);
break;
case 4:
SPECK_Decrypt<word32, 27>(m_wspace+2, m_wspace+0, m_rkey);
case 27:
SPECK_Decrypt<word32, 27>(m_wspace+2, m_wspace+0, m_rkeys);
break;
default:
CRYPTOPP_ASSERT(0);;
@ -251,16 +265,19 @@ void SPECK128::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength
switch (m_kwords)
{
case 2:
m_rkey.New(32);
SPECK_ExpandKey_2W<word64, 32>(m_rkey, m_wspace);
m_rkeys.New(32);
m_rounds = 32;
SPECK_ExpandKey_2W<word64, 32>(m_rkeys, m_wspace);
break;
case 3:
m_rkey.New(33);
SPECK_ExpandKey_3W<word64, 33>(m_rkey, m_wspace);
m_rkeys.New(33);
m_rounds = 33;
SPECK_ExpandKey_3W<word64, 33>(m_rkeys, m_wspace);
break;
case 4:
m_rkey.New(34);
SPECK_ExpandKey_4W<word64, 34>(m_rkey, m_wspace);
m_rkeys.New(34);
m_rounds = 34;
SPECK_ExpandKey_4W<word64, 34>(m_rkeys, m_wspace);
break;
default:
CRYPTOPP_ASSERT(0);;
@ -273,16 +290,16 @@ void SPECK128::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
typedef GetBlock<word64, BigEndian, false> InBlock;
InBlock iblk(inBlock); iblk(m_wspace[0])(m_wspace[1]);
switch (m_kwords)
switch (m_rounds)
{
case 2:
SPECK_Encrypt<word64, 32>(m_wspace+2, m_wspace+0, m_rkey);
case 32:
SPECK_Encrypt<word64, 32>(m_wspace+2, m_wspace+0, m_rkeys);
break;
case 3:
SPECK_Encrypt<word64, 33>(m_wspace+2, m_wspace+0, m_rkey);
case 33:
SPECK_Encrypt<word64, 33>(m_wspace+2, m_wspace+0, m_rkeys);
break;
case 4:
SPECK_Encrypt<word64, 34>(m_wspace+2, m_wspace+0, m_rkey);
case 34:
SPECK_Encrypt<word64, 34>(m_wspace+2, m_wspace+0, m_rkeys);
break;
default:
CRYPTOPP_ASSERT(0);;
@ -299,16 +316,16 @@ void SPECK128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
typedef GetBlock<word64, BigEndian, false> InBlock;
InBlock iblk(inBlock); iblk(m_wspace[0])(m_wspace[1]);
switch (m_kwords)
switch (m_rounds)
{
case 2:
SPECK_Decrypt<word64, 32>(m_wspace+2, m_wspace+0, m_rkey);
case 32:
SPECK_Decrypt<word64, 32>(m_wspace+2, m_wspace+0, m_rkeys);
break;
case 3:
SPECK_Decrypt<word64, 33>(m_wspace+2, m_wspace+0, m_rkey);
case 33:
SPECK_Decrypt<word64, 33>(m_wspace+2, m_wspace+0, m_rkeys);
break;
case 4:
SPECK_Decrypt<word64, 34>(m_wspace+2, m_wspace+0, m_rkey);
case 34:
SPECK_Decrypt<word64, 34>(m_wspace+2, m_wspace+0, m_rkeys);
break;
default:
CRYPTOPP_ASSERT(0);;
@ -319,4 +336,26 @@ void SPECK128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[2])(m_wspace[3]);
}
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
if (HasSSSE3())
return SPECK128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SPECK128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
if (HasSSSE3())
return SPECK128_Dec_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif
NAMESPACE_END

11
speck.h
View File

@ -45,12 +45,13 @@ template <class W>
struct SPECK_Base
{
virtual ~SPECK_Base() {}
SPECK_Base() : m_kwords(0) {}
SPECK_Base() : m_kwords(0), m_rounds(0) {}
typedef SecBlock<W, AllocatorWithCleanup<W, true> > AlignedSecBlock;
mutable AlignedSecBlock m_wspace; // workspace
AlignedSecBlock m_rkey; // round keys
AlignedSecBlock m_rkeys; // round keys
unsigned int m_kwords; // number of key words
unsigned int m_rounds; // number of rounds
};
//! \class SPECK64
@ -141,6 +142,9 @@ public:
{
protected:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
};
//! \brief Provides implementation for encryption transformation
@ -151,6 +155,9 @@ public:
{
protected:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
#endif
};
typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;