diff --git a/blake2.cpp b/blake2.cpp
index cc1743df..790144b4 100644
--- a/blake2.cpp
+++ b/blake2.cpp
@@ -53,6 +53,7 @@ static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
 #endif
 
+// Disable NEON for Cortex-A53 and A57. Also see http://github.com/weidai11/cryptopp/issues/367
 #if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
 static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
 static void BLAKE2_NEON_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
@@ -3461,341 +3462,341 @@ static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
 }
 #endif // CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
 
+// Disable NEON for Cortex-A53 and A57. Also see http://github.com/weidai11/cryptopp/issues/367
 #if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
-
 static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
 {
-    #undef LOAD_MSG_0_1
+    #undef LOAD_MSG_0_1
     #define LOAD_MSG_0_1(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[0]; \
     t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[0]; \
    buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_0_2
+    #undef LOAD_MSG_0_2
     #define LOAD_MSG_0_2(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[1]; \
     t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_0_3
+    #undef LOAD_MSG_0_3
     #define LOAD_MSG_0_3(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[0]; \
     t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_0_4
+    #undef LOAD_MSG_0_4
     #define LOAD_MSG_0_4(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[1]; \
     t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_1_1
+    #undef LOAD_MSG_1_1
     #define LOAD_MSG_1_1(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \
     t1 = vzip_u32(vget_low_u32(m2), vget_low_u32(m3)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_1_2
+    #undef LOAD_MSG_1_2
     #define LOAD_MSG_1_2(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \
     t1 = vext_u32(vget_high_u32(m3), vget_high_u32(m1), 1); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_1_3
+    #undef LOAD_MSG_1_3
     #define LOAD_MSG_1_3(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m0), 1); \
     t1 = vzip_u32(vget_high_u32(m2), vget_low_u32(m1)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_1_4
+    #undef LOAD_MSG_1_4
     #define LOAD_MSG_1_4(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m0)).val[0]; \
     t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_2_1
+    #undef LOAD_MSG_2_1
     #define LOAD_MSG_2_1(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vext_u32(vget_high_u32(m2), vget_low_u32(m3), 1); \
     t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_2_2
+    #undef LOAD_MSG_2_2
     #define LOAD_MSG_2_2(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[0]; \
     t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m3)); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_2_3
+    #undef LOAD_MSG_2_3
     #define LOAD_MSG_2_3(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m2), vget_high_u32(m0)); \
     t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m2)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_2_4
+    #undef LOAD_MSG_2_4
     #define LOAD_MSG_2_4(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_high_u32(m3), vget_high_u32(m1)).val[0]; \
     t1 = vext_u32(vget_low_u32(m0), vget_low_u32(m1), 1); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_3_1
+    #undef LOAD_MSG_3_1
     #define LOAD_MSG_3_1(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
     t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_3_2
+    #undef LOAD_MSG_3_2
     #define LOAD_MSG_3_2(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[1]; \
     t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_3_3
+    #undef LOAD_MSG_3_3
     #define LOAD_MSG_3_3(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m1)); \
     t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_3_4
+    #undef LOAD_MSG_3_4
     #define LOAD_MSG_3_4(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \
     t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_4_1
+    #undef LOAD_MSG_4_1
     #define LOAD_MSG_4_1(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m1)).val[1]; \
     t1 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m2)).val[0]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_4_2
+    #undef LOAD_MSG_4_2
     #define LOAD_MSG_4_2(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m0), vget_high_u32(m1)); \
     t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_4_3
+    #undef LOAD_MSG_4_3
     #define LOAD_MSG_4_3(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_high_u32(m2)); \
     t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_high_u32(m0)); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_4_4
+    #undef LOAD_MSG_4_4
     #define LOAD_MSG_4_4(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m3), 1); \
     t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m2), vget_low_u32(m3)); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_5_1
+    #undef LOAD_MSG_5_1
     #define LOAD_MSG_5_1(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m1)).val[0]; \
     t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_5_2
+    #undef LOAD_MSG_5_2
     #define LOAD_MSG_5_2(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[0]; \
     t1 = vzip_u32(vget_high_u32(m2), vget_high_u32(m0)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_5_3
+    #undef LOAD_MSG_5_3
     #define LOAD_MSG_5_3(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m1)); \
     t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m0)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_5_4
+    #undef LOAD_MSG_5_4
     #define LOAD_MSG_5_4(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m3), vget_low_u32(m1)).val[1]; \
     t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_low_u32(m2)); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_6_1
+    #undef LOAD_MSG_6_1
     #define LOAD_MSG_6_1(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m0)); \
     t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_6_2
+    #undef LOAD_MSG_6_2
     #define LOAD_MSG_6_2(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
     t1 = vext_u32(vget_low_u32(m3), vget_high_u32(m2), 1); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_6_3
+    #undef LOAD_MSG_6_3
     #define LOAD_MSG_6_3(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m1)).val[0]; \
     t1 = vext_u32(vget_low_u32(m2), vget_low_u32(m2), 1); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_6_4
+    #undef LOAD_MSG_6_4
     #define LOAD_MSG_6_4(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \
     t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m2)); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_7_1
+    #undef LOAD_MSG_7_1
     #define LOAD_MSG_7_1(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m1)).val[1]; \
     t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_high_u32(m0)); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_7_2
+    #undef LOAD_MSG_7_2
     #define LOAD_MSG_7_2(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \
     t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_7_3
+    #undef LOAD_MSG_7_3
     #define LOAD_MSG_7_3(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \
     t1 = vzip_u32(vget_low_u32(m2), vget_high_u32(m0)).val[0]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_7_4
+    #undef LOAD_MSG_7_4
     #define LOAD_MSG_7_4(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_low_u32(m0), vget_low_u32(m1)).val[0]; \
     t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_8_1
+    #undef LOAD_MSG_8_1
     #define LOAD_MSG_8_1(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m3)).val[0]; \
     t1 = vext_u32(vget_high_u32(m2), vget_low_u32(m0), 1); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_8_2
+    #undef LOAD_MSG_8_2
     #define LOAD_MSG_8_2(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \
     t1 = vext_u32(vget_high_u32(m0), vget_low_u32(m2), 1); \
     buf = vcombine_u32(t0, t1); } while(0)
-
-    #undef LOAD_MSG_8_3
+
+    #undef LOAD_MSG_8_3
     #define LOAD_MSG_8_3(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m3)); \
     t1 = vext_u32(vget_low_u32(m0), vget_high_u32(m2), 1); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_8_4
+    #undef LOAD_MSG_8_4
     #define LOAD_MSG_8_4(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m1)); \
     t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_low_u32(m1)); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_9_1
+    #undef LOAD_MSG_9_1
     #define LOAD_MSG_9_1(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \
     t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m0)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_9_2
+    #undef LOAD_MSG_9_2
     #define LOAD_MSG_9_2(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m1)).val[0]; \
     t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_low_u32(m1)); \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_9_3
+    #undef LOAD_MSG_9_3
     #define LOAD_MSG_9_3(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \
     t1 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m3)).val[1]; \
     buf = vcombine_u32(t0, t1); } while(0)
-    #undef LOAD_MSG_9_4
+    #undef LOAD_MSG_9_4
     #define LOAD_MSG_9_4(buf) \
     do { uint32x2_t t0, t1; \
     t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \
     t1 = vzip_u32(vget_low_u32(m3), vget_low_u32(m0)).val[0]; \
     buf = vcombine_u32(t0, t1); } while(0)
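The LOAD_MSG_r_i macros above implement the BLAKE2s message permutation (SIGMA) entirely with register shuffles instead of table lookups. As a reading aid, here is a minimal scalar model of the three NEON primitives they combine; the function names are illustrative only and are not part of the patch:

```cpp
#include <cstdint>

// vzip_u32(a, b) interleaves two 2-lane vectors: .val[0] holds the low
// lanes {a0, b0}, .val[1] holds the high lanes {a1, b1}.
static void zip_u32(const uint32_t a[2], const uint32_t b[2],
                    uint32_t lo[2], uint32_t hi[2])
{
    lo[0] = a[0]; lo[1] = b[0];
    hi[0] = a[1]; hi[1] = b[1];
}

// vext_u32(a, b, 1) extracts two lanes starting at index 1 of the
// concatenation {a0, a1, b0, b1}, yielding {a1, b0}.
static void ext1_u32(const uint32_t a[2], const uint32_t b[2], uint32_t r[2])
{
    r[0] = a[1]; r[1] = b[0];
}

// vbsl_u32(vcreate_u32(0xFFFFFFFF), a, b) selects through a bit mask whose
// low lane is all ones: lane 0 comes from a, lane 1 from b, giving {a0, b1}.
static void bsl_lo_u32(const uint32_t a[2], const uint32_t b[2], uint32_t r[2])
{
    r[0] = a[0]; r[1] = b[1];
}
```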
-
-    #define vrorq_n_u32_16(x) vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)))
+
+    #define vrorq_n_u32_16(x) vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)))
-    #define vrorq_n_u32_8(x) vsriq_n_u32(vshlq_n_u32((x), 24), (x), 8)
+    #define vrorq_n_u32_8(x) vsriq_n_u32(vshlq_n_u32((x), 24), (x), 8)
-    #define vrorq_n_u32(x, c) vsriq_n_u32(vshlq_n_u32((x), 32-(c)), (x), (c))
-
-    #undef G1
-    #define G1(row1,row2,row3,row4,buf) \
-    do { \
-    row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \
-    row4 = vrorq_n_u32_16(row4); row3 = vaddq_u32(row3, row4); \
-    row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 12); \
-    } while(0)
+    #define vrorq_n_u32(x, c) vsriq_n_u32(vshlq_n_u32((x), 32-(c)), (x), (c))
+
+    #undef G1
+    #define G1(row1,row2,row3,row4,buf) \
+    do { \
+    row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \
+    row4 = vrorq_n_u32_16(row4); row3 = vaddq_u32(row3, row4); \
+    row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 12); \
+    } while(0)
-    #undef G2
-    #define G2(row1,row2,row3,row4,buf) \
-    do { \
-    row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \
-    row4 = vrorq_n_u32_8(row4); row3 = vaddq_u32(row3, row4); \
-    row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 7); \
-    } while(0)
+    #undef G2
+    #define G2(row1,row2,row3,row4,buf) \
+    do { \
+    row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \
+    row4 = vrorq_n_u32_8(row4); row3 = vaddq_u32(row3, row4); \
+    row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 7); \
+    } while(0)
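G1 and G2 are the two halves of the BLAKE2s quarter-round, vectorized four columns (or diagonals) at a time; the vrorq_n_u32* macros synthesize the missing NEON rotate-right from a left shift plus VSRI, with the 16-bit case reduced to a halfword reversal. A scalar sketch of the same quarter-round for reference (plain C++, not the code path the patch touches):

```cpp
#include <cstdint>

// Rotate right by c bits (1 <= c <= 31); per 32-bit lane this is what the
// vrorq_n_u32* macros compute.
static inline uint32_t ror32(uint32_t x, unsigned c)
{
    return (x >> c) | (x << (32 - c));
}

// One BLAKE2s quarter-round on four state words and two message words:
// G1 uses rotations 16 and 12, G2 uses rotations 8 and 7.
static void G(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d,
              uint32_t m0, uint32_t m1)
{
    a += b + m0; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12); // G1
    a += b + m1; d = ror32(d ^ a, 8);  c += d; b = ror32(b ^ c, 7);  // G2
}
```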
-    #undef DIAGONALIZE
-    #define DIAGONALIZE(row1,row2,row3,row4) \
-    do { \
-    row4 = vextq_u32(row4, row4, 3); row3 = vextq_u32(row3, row3, 2); row2 = vextq_u32(row2, row2, 1); \
-    } while(0)
+    #undef DIAGONALIZE
+    #define DIAGONALIZE(row1,row2,row3,row4) \
+    do { \
+    row4 = vextq_u32(row4, row4, 3); row3 = vextq_u32(row3, row3, 2); row2 = vextq_u32(row2, row2, 1); \
+    } while(0)
-    #undef UNDIAGONALIZE
-    #define UNDIAGONALIZE(row1,row2,row3,row4) \
-    do { \
-    row4 = vextq_u32(row4, row4, 1); \
-    row3 = vextq_u32(row3, row3, 2); \
-    row2 = vextq_u32(row2, row2, 3); \
-    } while(0)
+    #undef UNDIAGONALIZE
+    #define UNDIAGONALIZE(row1,row2,row3,row4) \
+    do { \
+    row4 = vextq_u32(row4, row4, 1); \
+    row3 = vextq_u32(row3, row3, 2); \
+    row2 = vextq_u32(row2, row2, 3); \
+    } while(0)
-    #undef ROUND
-    #define ROUND(r) \
-    do { \
-    uint32x4_t buf1, buf2, buf3, buf4; \
-    LOAD_MSG_ ##r ##_1(buf1); \
-    G1(row1,row2,row3,row4,buf1); \
-    LOAD_MSG_ ##r ##_2(buf2); \
-    G2(row1,row2,row3,row4,buf2); \
-    DIAGONALIZE(row1,row2,row3,row4); \
-    LOAD_MSG_ ##r ##_3(buf3); \
-    G1(row1,row2,row3,row4,buf3); \
-    LOAD_MSG_ ##r ##_4(buf4); \
-    G2(row1,row2,row3,row4,buf4); \
-    UNDIAGONALIZE(row1,row2,row3,row4); \
-    } while(0)
+    #undef ROUND
+    #define ROUND(r) \
+    do { \
+    uint32x4_t buf1, buf2, buf3, buf4; \
+    LOAD_MSG_ ##r ##_1(buf1); \
+    G1(row1,row2,row3,row4,buf1); \
+    LOAD_MSG_ ##r ##_2(buf2); \
+    G2(row1,row2,row3,row4,buf2); \
+    DIAGONALIZE(row1,row2,row3,row4); \
+    LOAD_MSG_ ##r ##_3(buf3); \
+    G1(row1,row2,row3,row4,buf3); \
+    LOAD_MSG_ ##r ##_4(buf4); \
+    G2(row1,row2,row3,row4,buf4); \
+    UNDIAGONALIZE(row1,row2,row3,row4); \
+    } while(0)
 
     CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf<uint32x4_t>()));
     CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf<uint32x4_t>()));
@@ -4058,7 +4059,7 @@ static void BLAKE2_NEON_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
     row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \
     } while(0)
 
-    #undef DIAGONALIZE
+    #undef DIAGONALIZE
     #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
     do { \
     uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \
@@ -4068,7 +4069,7 @@ static void BLAKE2_NEON_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
     row4l = t0; row4h = t1; \
     } while(0)
 
-    #undef UNDIAGONALIZE
+    #undef UNDIAGONALIZE
     #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
     do { \
     uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \
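In the 64-bit (BLAKE2b) routine the four-lane rows are split into low/high uint64x2_t halves, so DIAGONALIZE and UNDIAGONALIZE rotate lanes across the halves with vextq_u64 instead of a single vextq_u32. A sketch of the lane movement for row2, modeled on plain arrays; only the first extract appears in the hunk, the matching second extract is inferred from the mirrored UNDIAGONALIZE context:

```cpp
#include <cstdint>

// Model of the row2 step of the 64-bit DIAGONALIZE above:
//   t0 = vextq_u64(row2l, row2h, 1)  ->  { l1, h0 }
//   t1 = vextq_u64(row2h, row2l, 1)  ->  { h1, l0 }   (inferred)
// so the four-lane row {l0, l1, h0, h1} becomes {l1, h0, h1, l0}, a rotate
// left by one lane; UNDIAGONALIZE swaps the operand order to rotate back.
static void rotate_row2_left1(uint64_t l[2], uint64_t h[2])
{
    const uint64_t t0[2] = { l[1], h[0] }; // vextq_u64(row2l, row2h, 1)
    const uint64_t t1[2] = { h[1], l[0] }; // vextq_u64(row2h, row2l, 1)
    l[0] = t0[0]; l[1] = t0[1];            // row2l = t0
    h[0] = t1[0]; h[1] = t1[1];            // row2h = t1
}
```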