From 1d1a1507372b7be7cbccdfec818a9f9985be03c6 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sun, 16 Apr 2017 13:00:45 -0400 Subject: [PATCH] Avoid extra loads of workspace variables --- aria.cpp | 24 +++++++++++------------- aria.h | 2 +- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/aria.cpp b/aria.cpp index 10bbb2e5..3f5b9058 100644 --- a/aria.cpp +++ b/aria.cpp @@ -248,21 +248,16 @@ inline void ARIA_GSRK(const word32 X[4], const word32 Y[4], byte RK[16]) #if CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS template -inline void ARIA_GSRK_NEON(const word32 X[4], const word32 Y[4], byte RK[16]) +inline void ARIA_GSRK_NEON(const uint32x4_t X, const uint32x4_t Y, byte RK[16]) { static const unsigned int Q1 = (4-(N/32)) % 4; static const unsigned int Q2 = (3-(N/32)) % 4; static const unsigned int R = N % 32; - const uint32x4_t a = vld1q_u32((const uint32_t*)X); - const uint32x4_t t = vld1q_u32((const uint32_t*)Y); - const uint32x4_t b = vextq_u32(t, t, Q1); - const uint32x4_t c = vextq_u32(t, t, Q2); - vst1q_u32(reinterpret_cast(RK), - veorq_u32(a, veorq_u32( - vshrq_n_u32(b, R), - vshlq_n_u32(c, 32-R)))); + veorq_u32(X, veorq_u32( + vshrq_n_u32(vextq_u32(Y, Y, Q1), R), + vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R)))); } #endif @@ -401,6 +396,11 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam #if CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS if (HasNEON()) { + const uint32x4_t w0 = vld1q_u32((const uint32_t*)(m_w.data()+0)); + const uint32x4_t w1 = vld1q_u32((const uint32_t*)(m_w.data()+8)); + const uint32x4_t w2 = vld1q_u32((const uint32_t*)(m_w.data()+12)); + const uint32x4_t w3 = vld1q_u32((const uint32_t*)(m_w.data()+16)); + ARIA_GSRK_NEON<19>(w0, w1, rk + 0); ARIA_GSRK_NEON<19>(w1, w2, rk + 16); ARIA_GSRK_NEON<19>(w2, w3, rk + 32); @@ -467,7 +467,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam #if CRYPTOPP_ENABLE_ARIA_SSE2_INTRINSICS if (HasSSE2()) { - a=reinterpret_cast(rk); s=m_w.data()+24; z=a+r*4; + a=reinterpret_cast(rk); s=m_w.data()+24; z=a+r*4; _mm_store_si128((__m128i*)t, _mm_load_si128((const __m128i*)a)); _mm_store_si128((__m128i*)a, _mm_load_si128((const __m128i*)z)); _mm_store_si128((__m128i*)z, _mm_load_si128((const __m128i*)t)); @@ -492,7 +492,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam else #endif { - a=reinterpret_cast(rk); s=m_w.data()+24; z=a+r*4; + a=reinterpret_cast(rk); s=m_w.data()+24; z=a+r*4; ::memcpy(t, a, 16); ::memcpy(a, z, 16); ::memcpy(z, t, 16); a+=4; z-=4; @@ -585,7 +585,6 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b _mm_loadu_si128((const __m128i*)(outBlock)), _mm_loadu_si128((const __m128i*)(xorBlock)))); } - return; } else @@ -641,7 +640,6 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b vld1q_u32((const uint32_t*)outBlock), vld1q_u32((const uint32_t*)xorBlock))); } - return; } else #endif // CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS diff --git a/aria.h b/aria.h index 1102509e..86ccf6c9 100644 --- a/aria.h +++ b/aria.h @@ -48,7 +48,7 @@ public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; private: - // Reference implementation allocates a table of 17 sub-keys. + // Reference implementation allocates a table of 17 round keys. FixedSizeAlignedSecBlock m_rk; // round keys FixedSizeAlignedSecBlock m_w; // w0, w1, w2, w3, t and u unsigned int m_rounds;