diff --git a/blake2-simd.cpp b/blake2-simd.cpp
index 8525e8c7..dbb5cd38 100644
--- a/blake2-simd.cpp
+++ b/blake2-simd.cpp
@@ -1430,7 +1430,12 @@ inline uint64x2_p VectorShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
 
 void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2_State& state)
 {
-    // Permute masks. High is element 0 (most significant), low is element 1 (least significant).
+    // Permute masks. High is element 0 (most significant), low is
+    // element 1 (least significant). We can use vec_mergeh(a,b) for
+    // vec_perm(a,b,HH_MASK) and vec_mergel(a,b) for vec_perm(a,b,LL_MASK).
+    // Benchmarks don't show a material difference. However, the code that
+    // uses vec_mergeh and vec_mergel is about 880 bytes shorter.
+
     const uint8x16_p HH_MASK = { 0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23 };
     const uint8x16_p HL_MASK = { 0,1,2,3,4,5,6,7, 24,25,26,27,28,29,30,31 };
     const uint8x16_p LH_MASK = { 8,9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23 };
diff --git a/ppc-simd.h b/ppc-simd.h
index 7da9886c..1a42c636 100644
--- a/ppc-simd.h
+++ b/ppc-simd.h
@@ -337,7 +337,7 @@ inline T VectorGetLow(const T& val)
 {
     //const T zero = {0};
     //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
-    //return (T)vec_perm(val, zero, mask);
+    //return (T)vec_perm(zero, val, mask);
     return VectorShiftRight<8>(VectorShiftLeft<8>(val));
 }
 
@@ -354,7 +354,7 @@ inline T VectorGetHigh(const T& val)
 {
     //const T zero = {0};
     //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
-    //return (T)vec_perm(val, zero, mask);
+    //return (T)vec_perm(zero, val, mask);
     return VectorShiftRight<8>(val);
 }
 
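
Note (not part of the patch): below is a minimal standalone sketch of the equivalence the new comment records. In vec_perm(a, b, mask), selector indices 0-15 pick bytes from a and 16-31 pick bytes from b; the ppc-simd.h hunks swap the operand order in the commented-out reference code for the same reason, so the mask indices land on the intended operand. For 64-bit lanes, vec_mergeh(a, b) yields {a[0], b[0]} (the same selection as HH_MASK) and vec_mergel(a, b) yields {a[1], b[1]} (an LL counterpart, presumably defined just past this hunk's context). The typedefs mirror ppc-simd.h; this assumes GCC's endian-adjusted AltiVec builtins with VSX enabled, e.g. g++ -mcpu=power8.

#include <altivec.h>
#include <cassert>
#include <cstring>

// Same vector typedefs as ppc-simd.h
typedef __vector unsigned char uint8x16_p;
typedef __vector unsigned long long uint64x2_p;

int main()
{
    const uint64x2_p a = {0x0001020304050607ull, 0x08090a0b0c0d0e0full};
    const uint64x2_p b = {0x1011121314151617ull, 0x18191a1b1c1d1e1full};

    // Masks in the style of BLAKE2_Compress64_POWER8. HH selects the
    // high (element 0) dword of each input, LL the low (element 1) dword.
    const uint8x16_p HH_MASK = {0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23};
    const uint8x16_p LL_MASK = {8,9,10,11,12,13,14,15, 24,25,26,27,28,29,30,31};

    // vec_mergeh(a,b) == vec_perm(a,b,HH_MASK): both produce {a[0], b[0]}
    const uint64x2_p h1 = vec_perm(a, b, HH_MASK);
    const uint64x2_p h2 = vec_mergeh(a, b);
    assert(std::memcmp(&h1, &h2, 16) == 0);

    // vec_mergel(a,b) == vec_perm(a,b,LL_MASK): both produce {a[1], b[1]}
    const uint64x2_p l1 = vec_perm(a, b, LL_MASK);
    const uint64x2_p l2 = vec_mergel(a, b);
    assert(std::memcmp(&l1, &l2, 16) == 0);

    return 0;
}

The patch itself keeps the vec_perm spelling; the comment only records that the vec_mergeh/vec_mergel form is interchangeable and produces smaller code.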