Port SIMON64 to Altivec
SIMON64 runs about 4x faster than C++ for POWER4 and friends. If POWER7 is available it goes back to full speed due to efficient unaligned loadspull/739/head
parent
a0608a6b80
commit
9550ccc9a3
|
|
@ -668,8 +668,8 @@ ifeq ($(DETECT_FEATURES),1)
|
|||
|
||||
# Drop SIMON64 and SPECK64 to Power4 if Power7 not available
|
||||
ifeq ($(SIMON64_FLAG)$(SPECK64_FLAG),)
|
||||
SPECK64_FLAG = $(ALTIVEC_FLAG)
|
||||
SIMON64_FLAG = $(ALTIVEC_FLAG)
|
||||
SPECK64_FLAG = $(ALTIVEC_FLAG)
|
||||
endif
|
||||
|
||||
# IBM XL C/C++
|
||||
|
|
@ -751,8 +751,8 @@ ifeq ($(DETECT_FEATURES),1)
|
|||
|
||||
# Drop SIMON64 and SPECK64 to Power4 if Power7 not available
|
||||
ifeq ($(SIMON64_FLAG)$(SPECK64_FLAG),)
|
||||
SPECK64_FLAG = $(ALTIVEC_FLAG)
|
||||
SIMON64_FLAG = $(ALTIVEC_FLAG)
|
||||
SPECK64_FLAG = $(ALTIVEC_FLAG)
|
||||
endif
|
||||
|
||||
ifeq ($(ALTIVEC_FLAG),)
|
||||
|
|
|
|||
38
simon.cpp
38
simon.cpp
|
|
@ -7,6 +7,14 @@
|
|||
#include "misc.h"
|
||||
#include "cpu.h"
|
||||
|
||||
#ifndef CRYPTOPP_INLINE
|
||||
# if defined(CRYPTOPP_DEBUG)
|
||||
# define CRYPTOPP_INLINE static
|
||||
# else
|
||||
# define CRYPTOPP_INLINE inline
|
||||
# endif
|
||||
#endif
|
||||
|
||||
// Uncomment for benchmarking C++ against SSE or NEON.
|
||||
// Do so in both simon.cpp and simon-simd.cpp.
|
||||
// #undef CRYPTOPP_SSSE3_AVAILABLE
|
||||
|
|
@ -228,11 +236,11 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si
|
|||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||
extern size_t SIMON64_Enc_AdvancedProcessBlocks_POWER7(const word32* subKeys, size_t rounds,
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
extern size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t SIMON64_Dec_AdvancedProcessBlocks_POWER7(const word32* subKeys, size_t rounds,
|
||||
extern size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
#endif
|
||||
|
||||
|
|
@ -255,9 +263,9 @@ std::string SIMON64::Base::AlgorithmProvider() const
|
|||
if (HasNEON())
|
||||
return "NEON";
|
||||
# endif
|
||||
# if (CRYPTOPP_POWER7_AVAILABLE)
|
||||
if (HasPower7())
|
||||
return "Power7";
|
||||
# if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return "Altivec";
|
||||
# endif
|
||||
#endif
|
||||
return "C++";
|
||||
|
|
@ -292,6 +300,12 @@ void SIMON64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength,
|
|||
default:
|
||||
CRYPTOPP_ASSERT(0);;
|
||||
}
|
||||
|
||||
// Altivec loads the current subkey as a 16-byte vector
|
||||
// The extra elements ensure memory backs the last subkey.
|
||||
#if CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
m_rkeys.Grow(m_rkeys.size()+4);
|
||||
#endif
|
||||
}
|
||||
|
||||
void SIMON64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
|
||||
|
|
@ -463,9 +477,9 @@ size_t SIMON64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xor
|
|||
return SIMON64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||
if (HasPower7())
|
||||
return SIMON64_Enc_AdvancedProcessBlocks_POWER7(m_rkeys, (size_t)m_rounds,
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
|
|
@ -484,9 +498,9 @@ size_t SIMON64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xor
|
|||
return SIMON64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||
if (HasPower7())
|
||||
return SIMON64_Dec_AdvancedProcessBlocks_POWER7(m_rkeys, (size_t)m_rounds,
|
||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
if (HasAltivec())
|
||||
return SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
// simon-simd.cpp - written and placed in the public domain by Jeffrey Walton
|
||||
//
|
||||
// This source file uses intrinsics and built-ins to gain access to
|
||||
// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
|
||||
// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. A separate
|
||||
// source file is needed because additional CXXFLAGS are required to enable
|
||||
// the appropriate instructions sets in some build configurations.
|
||||
|
||||
|
|
@ -46,10 +46,18 @@
|
|||
# include <arm_acle.h>
|
||||
#endif
|
||||
|
||||
#if defined(CRYPTOPP_POWER7_AVAILABLE)
|
||||
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
# include "ppc_simd.h"
|
||||
#endif
|
||||
|
||||
#ifndef CRYPTOPP_INLINE
|
||||
# if defined(CRYPTOPP_DEBUG)
|
||||
# define CRYPTOPP_INLINE static
|
||||
# else
|
||||
# define CRYPTOPP_INLINE inline
|
||||
# endif
|
||||
#endif
|
||||
|
||||
// Squash MS LNK4221 and libtool warnings
|
||||
extern const char SIMON64_SIMD_FNAME[] = __FILE__;
|
||||
|
||||
|
|
@ -523,15 +531,16 @@ inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1,
|
|||
|
||||
#endif // CRYPTOPP_SSE41_AVAILABLE
|
||||
|
||||
// ***************************** Power7 ***************************** //
|
||||
// ***************************** Altivec ***************************** //
|
||||
|
||||
#if defined(CRYPTOPP_POWER7_AVAILABLE)
|
||||
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
|
||||
using CryptoPP::uint8x16_p;
|
||||
using CryptoPP::uint32x4_p;
|
||||
|
||||
using CryptoPP::VectorAnd;
|
||||
using CryptoPP::VectorXor;
|
||||
using CryptoPP::VectorLoad;
|
||||
using CryptoPP::VectorLoadBE;
|
||||
|
||||
// Rotate left by bit count
|
||||
|
|
@ -573,16 +582,29 @@ inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1,
|
|||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
#if CRYPTOPP_POWER7_AVAILABLE
|
||||
const uint32x4_p rk1 = vec_splats(subkeys[i]);
|
||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1);
|
||||
|
||||
const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk1 = VectorLoad(0, subkeys+i);
|
||||
uint32x4_p rk2 = VectorLoad(0, subkeys+i+1);
|
||||
rk1 = vec_perm(rk1, rk1, m);
|
||||
rk2 = vec_perm(rk2, rk2, m);
|
||||
#endif
|
||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1);
|
||||
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk2);
|
||||
}
|
||||
|
||||
if (rounds & 1)
|
||||
{
|
||||
#if CRYPTOPP_POWER7_AVAILABLE
|
||||
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk = VectorLoad(0, subkeys+rounds-1);
|
||||
rk = vec_perm(rk, rk, m);
|
||||
#endif
|
||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk);
|
||||
std::swap(x1, y1);
|
||||
}
|
||||
|
|
@ -618,17 +640,30 @@ inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1,
|
|||
if (rounds & 1)
|
||||
{
|
||||
std::swap(x1, y1);
|
||||
#if CRYPTOPP_POWER7_AVAILABLE
|
||||
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk = VectorLoad(0, subkeys+rounds-1);
|
||||
rk = vec_perm(rk, rk, m);
|
||||
#endif
|
||||
y1 = VectorXor(VectorXor(y1, rk), SIMON64_f(x1));
|
||||
rounds--;
|
||||
}
|
||||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
#if CRYPTOPP_POWER7_AVAILABLE
|
||||
const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
|
||||
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1);
|
||||
|
||||
const uint32x4_p rk2 = vec_splats(subkeys[i]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk1 = VectorLoad(0, subkeys+i+1);
|
||||
uint32x4_p rk2 = VectorLoad(0, subkeys+i);
|
||||
rk1 = vec_perm(rk1, rk1, m);
|
||||
rk2 = vec_perm(rk2, rk2, m);
|
||||
#endif
|
||||
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1);
|
||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk2);
|
||||
}
|
||||
|
||||
|
|
@ -667,12 +702,20 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
|||
|
||||
for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
|
||||
{
|
||||
#if CRYPTOPP_POWER7_AVAILABLE
|
||||
const uint32x4_p rk1 = vec_splats(subkeys[i]);
|
||||
const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk1 = VectorLoad(0, subkeys+i);
|
||||
uint32x4_p rk2 = VectorLoad(0, subkeys+i+1);
|
||||
rk1 = vec_perm(rk1, rk1, m);
|
||||
rk2 = vec_perm(rk2, rk2, m);
|
||||
#endif
|
||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1);
|
||||
y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk1);
|
||||
y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk1);
|
||||
|
||||
const uint32x4_p rk2 = vec_splats(subkeys[i+1]);
|
||||
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk2);
|
||||
x2 = VectorXor(VectorXor(x2, SIMON64_f(y2)), rk2);
|
||||
x3 = VectorXor(VectorXor(x3, SIMON64_f(y3)), rk2);
|
||||
|
|
@ -680,7 +723,13 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
|||
|
||||
if (rounds & 1)
|
||||
{
|
||||
#if CRYPTOPP_POWER7_AVAILABLE
|
||||
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk = VectorLoad(0, subkeys+rounds-1);
|
||||
rk = vec_perm(rk, rk, m);
|
||||
#endif
|
||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk);
|
||||
y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk);
|
||||
y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk);
|
||||
|
|
@ -727,7 +776,14 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
|||
if (rounds & 1)
|
||||
{
|
||||
std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
|
||||
|
||||
#if CRYPTOPP_POWER7_AVAILABLE
|
||||
const uint32x4_p rk = vec_splats(subkeys[rounds-1]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk = VectorLoad(0, subkeys+rounds-1);
|
||||
rk = vec_perm(rk, rk, m);
|
||||
#endif
|
||||
y1 = VectorXor(VectorXor(y1, rk), SIMON64_f(x1));
|
||||
y2 = VectorXor(VectorXor(y2, rk), SIMON64_f(x2));
|
||||
y3 = VectorXor(VectorXor(y3, rk), SIMON64_f(x3));
|
||||
|
|
@ -736,12 +792,20 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
|||
|
||||
for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
|
||||
{
|
||||
#if CRYPTOPP_POWER7_AVAILABLE
|
||||
const uint32x4_p rk1 = vec_splats(subkeys[i+1]);
|
||||
const uint32x4_p rk2 = vec_splats(subkeys[i]);
|
||||
#else
|
||||
const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3};
|
||||
uint32x4_p rk1 = VectorLoad(0, subkeys+i+1);
|
||||
uint32x4_p rk2 = VectorLoad(0, subkeys+i);
|
||||
rk1 = vec_perm(rk1, rk1, m);
|
||||
rk2 = vec_perm(rk2, rk2, m);
|
||||
#endif
|
||||
x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1);
|
||||
x2 = VectorXor(VectorXor(x2, SIMON64_f(y2)), rk1);
|
||||
x3 = VectorXor(VectorXor(x3, SIMON64_f(y3)), rk1);
|
||||
|
||||
const uint32x4_p rk2 = vec_splats(subkeys[i]);
|
||||
y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk2);
|
||||
y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk2);
|
||||
y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk2);
|
||||
|
|
@ -764,7 +828,7 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
|
|||
block5 = (uint32x4_p)vec_perm(x3, y3, m4);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_POWER7_AVAILABLE
|
||||
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
|
||||
|
||||
ANONYMOUS_NAMESPACE_END
|
||||
|
||||
|
|
@ -808,17 +872,17 @@ size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rou
|
|||
}
|
||||
#endif
|
||||
|
||||
// ***************************** Power7 ***************************** //
|
||||
// ***************************** Altivec ***************************** //
|
||||
|
||||
#if defined(CRYPTOPP_POWER7_AVAILABLE)
|
||||
size_t SIMON64_Enc_AdvancedProcessBlocks_POWER7(const word32* subKeys, size_t rounds,
|
||||
#if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||
size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
|
||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t SIMON64_Dec_AdvancedProcessBlocks_POWER7(const word32* subKeys, size_t rounds,
|
||||
size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
|
||||
|
|
|
|||
Loading…
Reference in New Issue