From 07970ea82f03fa88b90431ad04ab3eba02d0311e Mon Sep 17 00:00:00 2001 From: Pauli Oikkonen Date: Wed, 29 May 2019 19:29:07 +0300 Subject: [PATCH] Streamline by-the-book 8x8 matrix multiplication Also chop up the forward transform into two tailored multiply functions --- src/strategies/avx2/dct-avx2.c | 344 +++++++++++++++++++-------------- 1 file changed, 203 insertions(+), 141 deletions(-) diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c index 82c4c086..3304c923 100644 --- a/src/strategies/avx2/dct-avx2.c +++ b/src/strategies/avx2/dct-avx2.c @@ -95,65 +95,6 @@ static void mul_clip_matrix_4x4_avx2(const int16_t *left, const int16_t *right, _mm256_storeu_si256((__m256i*)dst, result); } -// 8x8 matrix multiplication with value clipping. -// Parameters: Two 8x8 matrices containing 16-bit values in consecutive addresses, -// destination for the result and the shift value for clipping. -// -static void mul_clip_matrix_8x8_avx2(const int16_t *left, const int16_t *right, int16_t *dst, const int32_t shift) -{ - int i, j; - __m256i b[2], accu[8], even[2], odd[2]; - - const int32_t add = 1 << (shift - 1); - - b[0] = _mm256_loadu_si256((__m256i*) right); - - b[1] = _mm256_unpackhi_epi16(b[0], _mm256_castsi128_si256(_mm256_extracti128_si256(b[0], 1))); - b[0] = _mm256_unpacklo_epi16(b[0], _mm256_castsi128_si256(_mm256_extracti128_si256(b[0], 1))); - b[0] = _mm256_inserti128_si256(b[0], _mm256_castsi256_si128(b[1]), 1); - - for (i = 0; i < 8; i += 2) { - - even[0] = _mm256_set1_epi32(((int32_t*)left)[4 * i]); - even[0] = _mm256_madd_epi16(even[0], b[0]); - accu[i] = even[0]; - - odd[0] = _mm256_set1_epi32(((int32_t*)left)[4 * (i + 1)]); - odd[0] = _mm256_madd_epi16(odd[0], b[0]); - accu[i + 1] = odd[0]; - } - - for (j = 1; j < 4; ++j) { - - b[0] = _mm256_loadu_si256((__m256i*)right + j); - - b[1] = _mm256_unpackhi_epi16(b[0], _mm256_castsi128_si256(_mm256_extracti128_si256(b[0], 1))); - b[0] = _mm256_unpacklo_epi16(b[0], _mm256_castsi128_si256(_mm256_extracti128_si256(b[0], 1))); - b[0] = _mm256_inserti128_si256(b[0], _mm256_castsi256_si128(b[1]), 1); - - for (i = 0; i < 8; i += 2) { - - even[0] = _mm256_set1_epi32(((int32_t*)left)[4 * i + j]); - even[0] = _mm256_madd_epi16(even[0], b[0]); - accu[i] = _mm256_add_epi32(accu[i], even[0]); - - odd[0] = _mm256_set1_epi32(((int32_t*)left)[4 * (i + 1) + j]); - odd[0] = _mm256_madd_epi16(odd[0], b[0]); - accu[i + 1] = _mm256_add_epi32(accu[i + 1], odd[0]); - } - } - - for (i = 0; i < 8; i += 2) { - __m256i result, first_half, second_half; - - first_half = _mm256_srai_epi32(_mm256_add_epi32(accu[i], _mm256_set1_epi32(add)), shift); - second_half = _mm256_srai_epi32(_mm256_add_epi32(accu[i + 1], _mm256_set1_epi32(add)), shift); - result = _mm256_permute4x64_epi64(_mm256_packs_epi32(first_half, second_half), 0 + 8 + 16 + 192); - _mm256_storeu_si256((__m256i*)dst + i / 2, result); - - } -} - static INLINE __m256i swap_lanes(__m256i v) { return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(1, 0, 3, 2)); @@ -165,18 +106,83 @@ static INLINE __m256i truncate(__m256i v, __m256i debias, int32_t shift) return _mm256_srai_epi32(truncable, shift); } -static void matrix_dct_8x8_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) +static void mul_clip_matrix_8x8_avx2(const int16_t *left, const int16_t *right, int16_t *dst, const int32_t shift) { - int32_t shift_1st = kvz_g_convert_to_bit[8] + 1 + (bitdepth - 8); - int32_t shift_2nd = kvz_g_convert_to_bit[8] + 8; + const __m256i transp_mask = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 8, 
9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15));
 
-  const int32_t add1 = 1 << (shift_1st - 1);
-  const __m256i debias1 = _mm256_set1_epi32(add1);
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
 
-  const int32_t add2 = 1 << (shift_2nd - 1);
-  const __m256i debias2 = _mm256_set1_epi32(add2);
+  __m256i left_dr[4] = {
+    _mm256_loadu_si256((const __m256i *)left + 0),
+    _mm256_loadu_si256((const __m256i *)left + 1),
+    _mm256_loadu_si256((const __m256i *)left + 2),
+    _mm256_loadu_si256((const __m256i *)left + 3),
+  };
+  __m256i right_dr[4] = {
+    _mm256_loadu_si256((const __m256i *)right + 0),
+    _mm256_loadu_si256((const __m256i *)right + 1),
+    _mm256_loadu_si256((const __m256i *)right + 2),
+    _mm256_loadu_si256((const __m256i *)right + 3),
+  };
 
-  const __m256i *dct = (__m256i *)&(kvz_g_dct_8[0][0]);
+  __m256i rdrs_rearr[8];
+
+  // Rearrange the right matrix: after this, each 32-bit unit of rdrs_rearr
+  // holds a vertically adjacent pair right[2k][x], right[2k + 1][x]
+  // (k = dry), ready to be combined with _mm256_madd_epi16
+  for (int32_t dry = 0; dry < 4; dry++) {
+    __m256i rdr = right_dr[dry];
+    __m256i rdr_los = _mm256_permute4x64_epi64(rdr, _MM_SHUFFLE(2, 0, 2, 0));
+    __m256i rdr_his = _mm256_permute4x64_epi64(rdr, _MM_SHUFFLE(3, 1, 3, 1));
+
+    __m256i rdr_lo_rearr = _mm256_shuffle_epi8(rdr_los, transp_mask);
+    __m256i rdr_hi_rearr = _mm256_shuffle_epi8(rdr_his, transp_mask);
+
+    rdrs_rearr[dry * 2 + 0] = rdr_lo_rearr;
+    rdrs_rearr[dry * 2 + 1] = rdr_hi_rearr;
+  }
+
+  // Compute the destination matrix two rows at a time; dry is the
+  // double-row index
+  for (int32_t dry = 0; dry < 4; dry++) {
+    __m256i ldr = left_dr[dry];
+
+    __m256i ldr_slice12 = _mm256_shuffle_epi32(ldr, _MM_SHUFFLE(0, 0, 0, 0));
+    __m256i ldr_slice34 = _mm256_shuffle_epi32(ldr, _MM_SHUFFLE(1, 1, 1, 1));
+    __m256i ldr_slice56 = _mm256_shuffle_epi32(ldr, _MM_SHUFFLE(2, 2, 2, 2));
+    __m256i ldr_slice78 = _mm256_shuffle_epi32(ldr, _MM_SHUFFLE(3, 3, 3, 3));
+
+    __m256i prod1 = _mm256_madd_epi16(ldr_slice12, rdrs_rearr[0]);
+    __m256i prod2 = _mm256_madd_epi16(ldr_slice12, rdrs_rearr[1]);
+    __m256i prod3 = _mm256_madd_epi16(ldr_slice34, rdrs_rearr[2]);
+    __m256i prod4 = _mm256_madd_epi16(ldr_slice34, rdrs_rearr[3]);
+    __m256i prod5 = _mm256_madd_epi16(ldr_slice56, rdrs_rearr[4]);
+    __m256i prod6 = _mm256_madd_epi16(ldr_slice56, rdrs_rearr[5]);
+    __m256i prod7 = _mm256_madd_epi16(ldr_slice78, rdrs_rearr[6]);
+    __m256i prod8 = _mm256_madd_epi16(ldr_slice78, rdrs_rearr[7]);
+
+    __m256i lo_1 = _mm256_add_epi32(prod1, prod3);
+    __m256i hi_1 = _mm256_add_epi32(prod2, prod4);
+    __m256i lo_2 = _mm256_add_epi32(prod5, prod7);
+    __m256i hi_2 = _mm256_add_epi32(prod6, prod8);
+
+    __m256i lo = _mm256_add_epi32(lo_1, lo_2);
+    __m256i hi = _mm256_add_epi32(hi_1, hi_2);
+
+    __m256i lo_tr = truncate(lo, debias, shift);
+    __m256i hi_tr = truncate(hi, debias, shift);
+
+    __m256i final_dr = _mm256_packs_epi32(lo_tr, hi_tr);
+
+    _mm256_storeu_si256((__m256i *)dst + dry, final_dr);
+  }
+}
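+
+/*
+ * For reference, a plain-C sketch of what the routine above computes
+ * (illustrative only, not compiled): a rounded, right-shifted 8x8 matrix
+ * product, saturated to int16_t exactly as _mm256_packs_epi32 does.
+ *
+ *   for (int y = 0; y < 8; y++) {
+ *     for (int x = 0; x < 8; x++) {
+ *       int32_t acc = 0;
+ *       for (int k = 0; k < 8; k++)
+ *         acc += left[8 * y + k] * right[8 * k + x];
+ *       int32_t v = (acc + add) >> shift;
+ *       if (v < -32768) v = -32768;
+ *       if (v >  32767) v =  32767;
+ *       dst[8 * y + x] = (int16_t)v;
+ *     }
+ *   }
+ */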
+
+// Multiplies A by B_T's transpose and stores the result's transpose in
+// output, which should be an array of 4 __m256i's
+static void matmul_8x8_a_bt_t(const int16_t *a, const int16_t *b_t,
+    const int8_t shift, __m256i *output)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
 
   // Keep upper row intact and swap neighboring 16-bit words in lower row
   const __m256i shuf_lorow_mask =
@@ -185,18 +191,113 @@ static void matrix_dct_8x8_avx2(int8_t bitdepth, const int16_t *input, int16_t *
                      18, 19, 16, 17, 22, 23, 20, 21,
                      26, 27, 24, 25, 30, 31, 28, 29);
 
-  __m256i tmpres[4];
+  const __m256i *b_t_256 = (const __m256i *)b_t;
 
   // Dual Rows, because two 8x16b words fit in one YMM
-  __m256i i_dr_0 = _mm256_loadu_si256((__m256i *)input + 0);
-  __m256i i_dr_1 = _mm256_loadu_si256((__m256i *)input + 1);
-  __m256i i_dr_2 = _mm256_loadu_si256((__m256i *)input + 2);
-  __m256i i_dr_3 = _mm256_loadu_si256((__m256i *)input + 3);
+  __m256i a_dr_0 = _mm256_loadu_si256((__m256i *)a + 0);
+  __m256i a_dr_1 = _mm256_loadu_si256((__m256i *)a + 1);
+  __m256i a_dr_2 = _mm256_loadu_si256((__m256i *)a + 2);
+  __m256i a_dr_3 = _mm256_loadu_si256((__m256i *)a + 3);
 
-  __m256i i_dr_0_swp = swap_lanes(i_dr_0);
-  __m256i i_dr_1_swp = swap_lanes(i_dr_1);
-  __m256i i_dr_2_swp = swap_lanes(i_dr_2);
-  __m256i i_dr_3_swp = swap_lanes(i_dr_3);
+  __m256i a_dr_0_swp = swap_lanes(a_dr_0);
+  __m256i a_dr_1_swp = swap_lanes(a_dr_1);
+  __m256i a_dr_2_swp = swap_lanes(a_dr_2);
+  __m256i a_dr_3_swp = swap_lanes(a_dr_3);
+
+  for (int dry = 0; dry < 4; dry++) {
+
+    // Read dual columns of B matrix by reading rows of its transpose
+    __m256i b_dc = _mm256_loadu_si256(b_t_256 + dry);
+
+    __m256i prod0 = _mm256_madd_epi16(b_dc, a_dr_0);
+    __m256i prod0_swp = _mm256_madd_epi16(b_dc, a_dr_0_swp);
+    __m256i prod1 = _mm256_madd_epi16(b_dc, a_dr_1);
+    __m256i prod1_swp = _mm256_madd_epi16(b_dc, a_dr_1_swp);
+    __m256i prod2 = _mm256_madd_epi16(b_dc, a_dr_2);
+    __m256i prod2_swp = _mm256_madd_epi16(b_dc, a_dr_2_swp);
+    __m256i prod3 = _mm256_madd_epi16(b_dc, a_dr_3);
+    __m256i prod3_swp = _mm256_madd_epi16(b_dc, a_dr_3_swp);
+
+    __m256i hsum0 = _mm256_hadd_epi32(prod0, prod0_swp);
+    __m256i hsum1 = _mm256_hadd_epi32(prod1, prod1_swp);
+    __m256i hsum2 = _mm256_hadd_epi32(prod2, prod2_swp);
+    __m256i hsum3 = _mm256_hadd_epi32(prod3, prod3_swp);
+
+    __m256i hsum2c_0 = _mm256_hadd_epi32(hsum0, hsum1);
+    __m256i hsum2c_1 = _mm256_hadd_epi32(hsum2, hsum3);
+
+    __m256i hsum2c_0_tr = truncate(hsum2c_0, debias, shift);
+    __m256i hsum2c_1_tr = truncate(hsum2c_1, debias, shift);
+
+    __m256i tmp_dc = _mm256_packs_epi32(hsum2c_0_tr, hsum2c_1_tr);
+
+    output[dry] = _mm256_shuffle_epi8(tmp_dc, shuf_lorow_mask);
+  }
+}
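+
+/*
+ * A plain-C model of matmul_8x8_a_bt_t (illustrative only, not compiled).
+ * Since b_t holds B's transpose, B is indexed via b_t[8 * x + k], and the
+ * result is stored transposed; here sat16() stands for the int16_t
+ * saturation performed by _mm256_packs_epi32, and out16 views output as
+ * an int16_t[64]:
+ *
+ *   for (int y = 0; y < 8; y++)
+ *     for (int x = 0; x < 8; x++) {
+ *       int32_t acc = 0;
+ *       for (int k = 0; k < 8; k++)
+ *         acc += a[8 * y + k] * b_t[8 * x + k];
+ *       out16[8 * x + y] = sat16((acc + add) >> shift); // transposed store
+ *     }
+ */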
+
+// Multiplies A by B_T's transpose and stores the result in output. Unlike
+// the function above, here it's b_t that must be an array of 4 __m256i's;
+// output is an ordinary row-major int16_t matrix
+static void matmul_8x8_a_bt(const int16_t *a, const __m256i *b_t,
+    const int8_t shift, int16_t *output)
+{
+  const int32_t add = 1 << (shift - 1);
+  const __m256i debias = _mm256_set1_epi32(add);
+
+  const __m256i shuf_lorow_mask =
+    _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
+                     8, 9, 10, 11, 12, 13, 14, 15,
+                     18, 19, 16, 17, 22, 23, 20, 21,
+                     26, 27, 24, 25, 30, 31, 28, 29);
+
+  const __m256i *a_256 = (const __m256i *)a;
+
+  __m256i b_dc_0 = b_t[0];
+  __m256i b_dc_1 = b_t[1];
+  __m256i b_dc_2 = b_t[2];
+  __m256i b_dc_3 = b_t[3];
+
+  __m256i b_dc_0_swp = swap_lanes(b_dc_0);
+  __m256i b_dc_1_swp = swap_lanes(b_dc_1);
+  __m256i b_dc_2_swp = swap_lanes(b_dc_2);
+  __m256i b_dc_3_swp = swap_lanes(b_dc_3);
+
+  for (int dry = 0; dry < 4; dry++) {
+    __m256i a_dr = _mm256_loadu_si256(a_256 + dry);
+
+    __m256i prod0 = _mm256_madd_epi16(a_dr, b_dc_0);
+    __m256i prod0_swp = _mm256_madd_epi16(a_dr, b_dc_0_swp);
+    __m256i prod1 = _mm256_madd_epi16(a_dr, b_dc_1);
+    __m256i prod1_swp = _mm256_madd_epi16(a_dr, b_dc_1_swp);
+    __m256i prod2 = _mm256_madd_epi16(a_dr, b_dc_2);
+    __m256i prod2_swp = _mm256_madd_epi16(a_dr, b_dc_2_swp);
+    __m256i prod3 = _mm256_madd_epi16(a_dr, b_dc_3);
+    __m256i prod3_swp = _mm256_madd_epi16(a_dr, b_dc_3_swp);
+
+    __m256i hsum0 = _mm256_hadd_epi32(prod0, prod0_swp);
+    __m256i hsum1 = _mm256_hadd_epi32(prod1, prod1_swp);
+    __m256i hsum2 = _mm256_hadd_epi32(prod2, prod2_swp);
+    __m256i hsum3 = _mm256_hadd_epi32(prod3, prod3_swp);
+
+    __m256i hsum2c_0 = _mm256_hadd_epi32(hsum0, hsum1);
+    __m256i hsum2c_1 = _mm256_hadd_epi32(hsum2, hsum3);
+
+    __m256i hsum2c_0_tr = truncate(hsum2c_0, debias, shift);
+    __m256i hsum2c_1_tr = truncate(hsum2c_1, debias, shift);
+
+    __m256i tmp_dr = _mm256_packs_epi32(hsum2c_0_tr, hsum2c_1_tr);
+
+    __m256i final_dr = _mm256_shuffle_epi8(tmp_dr, shuf_lorow_mask);
+
+    _mm256_storeu_si256((__m256i *)output + dry, final_dr);
+  }
+}
+
+static void matrix_dct_8x8_avx2(int8_t bitdepth, const int16_t *input, int16_t *output)
+{
+  int32_t shift_1st = kvz_g_convert_to_bit[8] + 1 + (bitdepth - 8);
+  int32_t shift_2nd = kvz_g_convert_to_bit[8] + 8;
+
+  const int16_t *dct = &kvz_g_dct_8[0][0];
 
   /*
    * Multiply input by the transpose of DCT matrix into tmpres, and DCT matrix
@@ -210,75 +311,36 @@ static void matrix_dct_8x8_avx2(int8_t bitdepth, const int16_t *input, int16_t *
    *
    * instead, since it will be used in similar fashion as the right operand
    * in the second multiplication.
    */
-  for (int dry = 0; dry < 4; dry++) {
-    // Read columns of DCT matrix's transpose by reading rows of DCT matrix
-    __m256i d_dr = _mm256_loadu_si256(dct + dry);
+  __m256i tmpres[4];
 
-    __m256i prod0 = _mm256_madd_epi16(d_dr, i_dr_0);
-    __m256i prod0_swp = _mm256_madd_epi16(d_dr, i_dr_0_swp);
-    __m256i prod1 = _mm256_madd_epi16(d_dr, i_dr_1);
-    __m256i prod1_swp = _mm256_madd_epi16(d_dr, i_dr_1_swp);
-    __m256i prod2 = _mm256_madd_epi16(d_dr, i_dr_2);
-    __m256i prod2_swp = _mm256_madd_epi16(d_dr, i_dr_2_swp);
-    __m256i prod3 = _mm256_madd_epi16(d_dr, i_dr_3);
-    __m256i prod3_swp = _mm256_madd_epi16(d_dr, i_dr_3_swp);
+  matmul_8x8_a_bt_t(input, dct, shift_1st, tmpres);
+  matmul_8x8_a_bt (dct, tmpres, shift_2nd, output);
+}
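+
+/*
+ * How the two calls above compose (illustrative summary): with D denoting
+ * kvz_g_dct_8,
+ *
+ *   tmpres = transpose(input * transpose(D))  // matmul_8x8_a_bt_t
+ *   output = D * transpose(tmpres)            // matmul_8x8_a_bt
+ *          = D * input * transpose(D)
+ *
+ * i.e. the usual row-column decomposition of the 2D transform, with each
+ * stage rounded by its own shift (shift_1st, shift_2nd).
+ */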
 
-    __m256i hsum0 = _mm256_hadd_epi32(prod0, prod0_swp);
-    __m256i hsum1 = _mm256_hadd_epi32(prod1, prod1_swp);
-    __m256i hsum2 = _mm256_hadd_epi32(prod2, prod2_swp);
-    __m256i hsum3 = _mm256_hadd_epi32(prod3, prod3_swp);
+static void matrix_idct_8x8_avx2(int8_t bitdepth, const int16_t *input, int16_t *output)
+{
+  int32_t shift_1st = 7;
+  int32_t shift_2nd = 12 - (bitdepth - 8);
+  int16_t tmp[8 * 8];
 
-    __m256i hsum2c_0 = _mm256_hadd_epi32(hsum0, hsum1);
-    __m256i hsum2c_1 = _mm256_hadd_epi32(hsum2, hsum3);
+  const int16_t *tdct = &kvz_g_dct_8_t[0][0];
+  const int16_t *dct  = &kvz_g_dct_8  [0][0];
 
-    __m256i hsum2c_0_tr = truncate(hsum2c_0, debias1, shift_1st);
-    __m256i hsum2c_1_tr = truncate(hsum2c_1, debias1, shift_1st);
+  mul_clip_matrix_8x8_avx2(tdct, input, tmp, shift_1st);
+  mul_clip_matrix_8x8_avx2(tmp, dct, output, shift_2nd);
 
-    __m256i tmp_dr = _mm256_packs_epi32(hsum2c_0_tr, hsum2c_1_tr);
-
-    tmpres[dry] = _mm256_shuffle_epi8(tmp_dr, shuf_lorow_mask);
-  }
-
-  __m256i t_dr_0 = tmpres[0];
-  __m256i t_dr_1 = tmpres[1];
-  __m256i t_dr_2 = tmpres[2];
-  __m256i t_dr_3 = tmpres[3];
-
-  __m256i t_dr_0_swp = swap_lanes(t_dr_0);
-  __m256i t_dr_1_swp = swap_lanes(t_dr_1);
-  __m256i t_dr_2_swp = swap_lanes(t_dr_2);
-  __m256i t_dr_3_swp = swap_lanes(t_dr_3);
-
-  for (int dry = 0; dry < 4; dry++) {
-    __m256i d_dr = _mm256_loadu_si256(dct + dry);
-
-    __m256i prod0 = _mm256_madd_epi16(d_dr, t_dr_0);
-    __m256i prod0_swp = _mm256_madd_epi16(d_dr, t_dr_0_swp);
-    __m256i prod1 = _mm256_madd_epi16(d_dr, t_dr_1);
-    __m256i prod1_swp = _mm256_madd_epi16(d_dr, t_dr_1_swp);
-    __m256i prod2 = _mm256_madd_epi16(d_dr, t_dr_2);
-    __m256i prod2_swp = _mm256_madd_epi16(d_dr, t_dr_2_swp);
-    __m256i prod3 = _mm256_madd_epi16(d_dr, t_dr_3);
-    __m256i prod3_swp = _mm256_madd_epi16(d_dr, t_dr_3_swp);
-
-    __m256i hsum0 = _mm256_hadd_epi32(prod0, prod0_swp);
-    __m256i hsum1 = _mm256_hadd_epi32(prod1, prod1_swp);
-    __m256i hsum2 = _mm256_hadd_epi32(prod2, prod2_swp);
-    __m256i hsum3 = _mm256_hadd_epi32(prod3, prod3_swp);
-
-    __m256i hsum2c_0 = _mm256_hadd_epi32(hsum0, hsum1);
-    __m256i hsum2c_1 = _mm256_hadd_epi32(hsum2, hsum3);
-
-    __m256i hsum2c_0_tr = truncate(hsum2c_0, debias2, shift_2nd);
-    __m256i hsum2c_1_tr = truncate(hsum2c_1, debias2, shift_2nd);
-
-    __m256i tmp_dr = _mm256_packs_epi32(hsum2c_0_tr, hsum2c_1_tr);
-
-    __m256i final_dr = _mm256_shuffle_epi8(tmp_dr, shuf_lorow_mask);
-
-    _mm256_storeu_si256(((__m256i *)output) + dry, final_dr);
-  }
+  /*
+   * Because:
+   * out = tdct * input * dct = tdct * (input * dct) = tdct * (input * transpose(tdct))
+   * this could almost be done with the tailored routines:
+   *
+   * matmul_8x8_a_bt_t(input, tdct, shift_1st, tmp); // tmp as __m256i[4]
+   * matmul_8x8_a_bt (tdct, tmp, shift_2nd, output);
+   *
+   * But not quite: that path occasionally rounds intermediate values
+   * differently, so the result would very rarely differ from the expected
+   * one. Sadly.
+   */
 }
 
 // 16x16 matrix multiplication with value clipping.
@@ -468,15 +530,15 @@ static void matrix_i ## type ## _## n ## x ## n ## _avx2(int8_t bitdepth, const
 
 TRANSFORM(dst, 4);
 TRANSFORM(dct, 4);
 
-// Ha, we've got a tailored implementation for this
+// Ha, we've got a tailored implementation for these
 // TRANSFORM(dct, 8);
+// ITRANSFORM(dct, 8);
 TRANSFORM(dct, 16);
 TRANSFORM(dct, 32);
 
 ITRANSFORM(dst, 4);
 ITRANSFORM(dct, 4);
-ITRANSFORM(dct, 8);
 ITRANSFORM(dct, 16);
 ITRANSFORM(dct, 32);