Port SIMON64 to Altivec

The Altivec implementation of SIMON64 runs about 4x faster than the C++ implementation on POWER4 and similar processors. If POWER7 is available, it goes back to full speed due to efficient unaligned loads.
pull/739/head
Jeffrey Walton 2018-11-12 21:51:11 -05:00
parent a0608a6b80
commit 9550ccc9a3
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
3 changed files with 107 additions and 29 deletions

View File

@ -668,8 +668,8 @@ ifeq ($(DETECT_FEATURES),1)
# Drop SIMON64 and SPECK64 to Power4 if Power7 not available
ifeq ($(SIMON64_FLAG)$(SPECK64_FLAG),)
SPECK64_FLAG = $(ALTIVEC_FLAG)
SIMON64_FLAG = $(ALTIVEC_FLAG)
SPECK64_FLAG = $(ALTIVEC_FLAG)
endif
# IBM XL C/C++
@ -751,8 +751,8 @@ ifeq ($(DETECT_FEATURES),1)
# Drop SIMON64 and SPECK64 to Power4 if Power7 not available
ifeq ($(SIMON64_FLAG)$(SPECK64_FLAG),)
SPECK64_FLAG = $(ALTIVEC_FLAG)
SIMON64_FLAG = $(ALTIVEC_FLAG)
SPECK64_FLAG = $(ALTIVEC_FLAG)
endif
ifeq ($(ALTIVEC_FLAG),)

View File

@ -7,6 +7,14 @@
#include "misc.h"
#include "cpu.h"
// CRYPTOPP_INLINE is the specifier applied to file-local helper functions.
// It is only defined here when nothing earlier has already defined it.
// When CRYPTOPP_DEBUG is defined it expands to 'static'; otherwise it
// expands to 'inline'.
// NOTE(review): presumably 'static' is used in debug builds so the helpers
// remain distinct functions for debugging - confirm.
#ifndef CRYPTOPP_INLINE
# if defined(CRYPTOPP_DEBUG)
# define CRYPTOPP_INLINE static
# else
# define CRYPTOPP_INLINE inline
# endif
#endif
// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both simon.cpp and simon-simd.cpp.
// #undef CRYPTOPP_SSSE3_AVAILABLE
@ -228,11 +236,11 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
#if (CRYPTOPP_POWER7_AVAILABLE)
extern size_t SIMON64_Enc_AdvancedProcessBlocks_POWER7(const word32* subKeys, size_t rounds,
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
extern size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
extern size_t SIMON64_Dec_AdvancedProcessBlocks_POWER7(const word32* subKeys, size_t rounds,
extern size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
#endif
@ -255,9 +263,9 @@ std::string SIMON64::Base::AlgorithmProvider() const
if (HasNEON())
return "NEON";
# endif
# if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower7())
return "Power7";
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return "Altivec";
# endif
#endif
return "C++";
@ -292,6 +300,12 @@ void SIMON64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
default:
CRYPTOPP_ASSERT(0);;
}
// Altivec loads the current subkey as a 16-byte vector.
// The four extra word32 elements ensure the 16-byte load that starts at
// the last subkey stays within memory owned by m_rkeys.
#if CRYPTOPP_ALTIVEC_AVAILABLE
m_rkeys.Grow(m_rkeys.size()+4);
#endif
}
void SIMON64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
@ -463,9 +477,9 @@ size_t SIMON64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xor
return SIMON64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower7())
return SIMON64_Enc_AdvancedProcessBlocks_POWER7(m_rkeys, (size_t)m_rounds,
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
@ -484,9 +498,9 @@ size_t SIMON64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xor
return SIMON64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
#if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower7())
return SIMON64_Dec_AdvancedProcessBlocks_POWER7(m_rkeys, (size_t)m_rounds,
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
if (HasAltivec())
return SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);

View File

@ -1,7 +1,7 @@
// simon-simd.cpp - written and placed in the public domain by Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instructions sets in some build configurations.
@ -46,10 +46,18 @@
# include <arm_acle.h>
#endif
#if defined(CRYPTOPP_POWER7_AVAILABLE)
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
# include "ppc_simd.h"
#endif
// CRYPTOPP_INLINE is the specifier applied to file-local helper functions.
// It is only defined here when nothing earlier has already defined it.
// When CRYPTOPP_DEBUG is defined it expands to 'static'; otherwise it
// expands to 'inline'.
// NOTE(review): presumably 'static' is used in debug builds so the helpers
// remain distinct functions for debugging - confirm.
#ifndef CRYPTOPP_INLINE
# if defined(CRYPTOPP_DEBUG)
# define CRYPTOPP_INLINE static
# else
# define CRYPTOPP_INLINE inline
# endif
#endif
// Squash MS LNK4221 and libtool warnings
extern const char SIMON64_SIMD_FNAME[] = __FILE__;
@ -523,15 +531,16 @@ inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
#endif // CRYPTOPP_SSE41_AVAILABLE
// ***************************** Power7 ***************************** //
// ***************************** Altivec ***************************** //
#if defined(CRYPTOPP_POWER7_AVAILABLE)
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::VectorAnd;
using CryptoPP::VectorXor;
using CryptoPP::VectorLoad;
using CryptoPP::VectorLoadBE;
// Rotate left by bit count
@ -573,16 +582,29 @@ inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
{
#if CRYPTOPP_POWER7_AVAILABLE
const uint32x4_p rk1 = vec_splats(subkeys[i]);
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1);
const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk1 = VectorLoad(0, subkeys+i);
uint32x4_p rk2 = VectorLoad(0, subkeys+i+1);
rk1 = vec_perm(rk1, rk1, m);
rk2 = vec_perm(rk2, rk2, m);
#endif
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1);
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk2);
}
if (rounds & 1)
{
#if CRYPTOPP_POWER7_AVAILABLE
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VectorLoad(0, subkeys+rounds-1);
rk = vec_perm(rk, rk, m);
#endif
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk);
std::swap(x1, y1);
}
@ -618,17 +640,30 @@ inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
if (rounds & 1)
{
std::swap(x1, y1);
#if CRYPTOPP_POWER7_AVAILABLE
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VectorLoad(0, subkeys+rounds-1);
rk = vec_perm(rk, rk, m);
#endif
y1 = VectorXor(VectorXor(y1, rk), SIMON64_f(x1));
rounds--;
}
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
{
#if CRYPTOPP_POWER7_AVAILABLE
const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1);
const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk1 = VectorLoad(0, subkeys+i+1);
uint32x4_p rk2 = VectorLoad(0, subkeys+i);
rk1 = vec_perm(rk1, rk1, m);
rk2 = vec_perm(rk2, rk2, m);
#endif
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1);
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk2);
}
@ -667,12 +702,20 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
{
#if CRYPTOPP_POWER7_AVAILABLE
const uint32x4_p rk1 = vec_splats(subkeys[i]);
const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk1 = VectorLoad(0, subkeys+i);
uint32x4_p rk2 = VectorLoad(0, subkeys+i+1);
rk1 = vec_perm(rk1, rk1, m);
rk2 = vec_perm(rk2, rk2, m);
#endif
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1);
y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk1);
y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk1);
const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk2);
x2 = VectorXor(VectorXor(x2, SIMON64_f(y2)), rk2);
x3 = VectorXor(VectorXor(x3, SIMON64_f(y3)), rk2);
@ -680,7 +723,13 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
if (rounds & 1)
{
#if CRYPTOPP_POWER7_AVAILABLE
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VectorLoad(0, subkeys+rounds-1);
rk = vec_perm(rk, rk, m);
#endif
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk);
y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk);
y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk);
@ -727,7 +776,14 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
if (rounds & 1)
{
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
#if CRYPTOPP_POWER7_AVAILABLE
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk = VectorLoad(0, subkeys+rounds-1);
rk = vec_perm(rk, rk, m);
#endif
y1 = VectorXor(VectorXor(y1, rk), SIMON64_f(x1));
y2 = VectorXor(VectorXor(y2, rk), SIMON64_f(x2));
y3 = VectorXor(VectorXor(y3, rk), SIMON64_f(x3));
@ -736,12 +792,20 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
{
#if CRYPTOPP_POWER7_AVAILABLE
const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
const uint32x4_p rk2 = vec_splats(subkeys[i]);
#else
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
uint32x4_p rk1 = VectorLoad(0, subkeys+i+1);
uint32x4_p rk2 = VectorLoad(0, subkeys+i);
rk1 = vec_perm(rk1, rk1, m);
rk2 = vec_perm(rk2, rk2, m);
#endif
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1);
x2 = VectorXor(VectorXor(x2, SIMON64_f(y2)), rk1);
x3 = VectorXor(VectorXor(x3, SIMON64_f(y3)), rk1);
const uint32x4_p rk2 = vec_splats(subkeys[i]);
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk2);
y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk2);
y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk2);
@ -764,7 +828,7 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
}
#endif // CRYPTOPP_POWER7_AVAILABLE
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
ANONYMOUS_NAMESPACE_END
@ -808,17 +872,17 @@ size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rou
}
#endif
// ***************************** Power7 ***************************** //
// ***************************** Altivec ***************************** //
#if defined(CRYPTOPP_POWER7_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_POWER7(const word32* subKeys, size_t rounds,
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t SIMON64_Dec_AdvancedProcessBlocks_POWER7(const word32* subKeys, size_t rounds,
size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,