diff --git a/blake2-simd.cpp b/blake2-simd.cpp
index 8525e8c7..dbb5cd38 100644
--- a/blake2-simd.cpp
+++ b/blake2-simd.cpp
@@ -1430,7 +1430,12 @@ inline uint64x2_p VectorShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
 
 void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2_State& state)
 {
-    // Permute masks. High is element 0 (most significant), low is element 1 (least significant).
+    // Permute masks. High is element 0 (most significant), low is
+    // element 1 (least significant). We can use vec_mergeh(a,b) for
+    // vec_perm(a,b,HH_MASK) and vec_mergel(a,b) for vec_perm(a,b,LL_MASK).
+    // Benchmarks don't show a material difference. However, the code that
+    // uses vec_mergeh and vec_mergel is about 880 bytes shorter.
+
     const uint8x16_p HH_MASK = { 0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23 };
     const uint8x16_p HL_MASK = { 0,1,2,3,4,5,6,7, 24,25,26,27,28,29,30,31 };
     const uint8x16_p LH_MASK = { 8,9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23 };
diff --git a/ppc-simd.h b/ppc-simd.h
index 7da9886c..1a42c636 100644
--- a/ppc-simd.h
+++ b/ppc-simd.h
@@ -337,7 +337,7 @@ inline T VectorGetLow(const T& val)
 {
     //const T zero = {0};
     //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 8,9,10,11, 12,13,14,15 };
-    //return (T)vec_perm(val, zero, mask);
+    //return (T)vec_perm(zero, val, mask);
     return VectorShiftRight<8>(VectorShiftLeft<8>(val));
 }
 
@@ -354,7 +354,7 @@ inline T VectorGetHigh(const T& val)
 {
     //const T zero = {0};
     //const uint8x16_p mask = {16,16,16,16, 16,16,16,16, 0,1,2,3, 4,5,6,7 };
-    //return (T)vec_perm(val, zero, mask);
+    //return (T)vec_perm(zero, val, mask);
     return VectorShiftRight<8>(val);
 }
 
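
Note (not part of the patch): below is a minimal standalone sketch of the equivalence the new comment records. In vec_perm(a, b, mask), selector indices 0-15 pick bytes from a and 16-31 pick bytes from b; the ppc-simd.h hunks swap the operand order in the commented-out reference code for the same reason, so the mask indices land on the intended operand. For 64-bit lanes, vec_mergeh(a, b) yields {a[0], b[0]} (the same selection as HH_MASK) and vec_mergel(a, b) yields {a[1], b[1]} (an LL counterpart, presumably defined just past this hunk's context). The typedefs mirror ppc-simd.h; this assumes GCC's endian-adjusted AltiVec builtins with VSX enabled, e.g. g++ -mcpu=power8.

#include <altivec.h>
#include <cassert>
#include <cstring>

// Same vector typedefs as ppc-simd.h
typedef __vector unsigned char uint8x16_p;
typedef __vector unsigned long long uint64x2_p;

int main()
{
    const uint64x2_p a = {0x0001020304050607ull, 0x08090a0b0c0d0e0full};
    const uint64x2_p b = {0x1011121314151617ull, 0x18191a1b1c1d1e1full};

    // Masks in the style of BLAKE2_Compress64_POWER8. HH selects the
    // high (element 0) dword of each input, LL the low (element 1) dword.
    const uint8x16_p HH_MASK = {0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23};
    const uint8x16_p LL_MASK = {8,9,10,11,12,13,14,15, 24,25,26,27,28,29,30,31};

    // vec_mergeh(a,b) == vec_perm(a,b,HH_MASK): both produce {a[0], b[0]}
    const uint64x2_p h1 = vec_perm(a, b, HH_MASK);
    const uint64x2_p h2 = vec_mergeh(a, b);
    assert(std::memcmp(&h1, &h2, 16) == 0);

    // vec_mergel(a,b) == vec_perm(a,b,LL_MASK): both produce {a[1], b[1]}
    const uint64x2_p l1 = vec_perm(a, b, LL_MASK);
    const uint64x2_p l2 = vec_mergel(a, b);
    assert(std::memcmp(&l1, &l2, 16) == 0);

    return 0;
}

The patch itself keeps the vec_perm spelling; the comment only records that the vec_mergeh/vec_mergel form is interchangeable and produces smaller code.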