From b78f9aff1725db2f6dd8fdb74305660174b543e5 Mon Sep 17 00:00:00 2001
From: Joose Sainio
Date: Wed, 26 Jul 2023 10:45:39 +0300
Subject: [PATCH] [avx2] Inverses work when ISP is not enabled

---
 src/strategies/avx2/dct-avx2.c        | 58 +++++++++++++++------------
 src/strategies/avx2/dct_avx2_tables.h | 40 ++++++++++++++----
 2 files changed, 64 insertions(+), 34 deletions(-)
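
Note on the coefficient swap below: each inverse kernel's first pass now
reads the table selected by the vertical transform type and the second
pass the horizontal one. The inverse applies its two 1-D passes in the
opposite order to the forward transform, so the pass that runs first
undoes the vertical transform even though the helper is named *_hor after
its data layout. A scalar sketch of that pairing (illustration only, not
part of the patch; names are hypothetical and the 16-bit clipping of the
intermediate values is omitted):

#include <stdint.h>

/* Separable n x n inverse transform, pass order as in the AVX2 kernels:
 * pass 1 walks columns with the *vertical* type's forward matrix,
 * pass 2 walks rows with the *horizontal* type's forward matrix. */
static void inverse_tr_2d_ref(const int16_t *src, int16_t *dst, int n,
                              const int16_t *ver_table,
                              const int16_t *hor_table,
                              int shift_1st, int shift_2nd)
{
  const int32_t add_1st = shift_1st > 0 ? 1 << (shift_1st - 1) : 0;
  const int32_t add_2nd = shift_2nd > 0 ? 1 << (shift_2nd - 1) : 0;
  int32_t tmp[64 * 64]; // intermediate, big enough for n <= 64

  for (int x = 0; x < n; ++x) {        // pass 1: one column at a time
    for (int y = 0; y < n; ++y) {
      int64_t acc = 0;
      for (int k = 0; k < n; ++k)      // out[y] = sum_k in[k] * T[k][y]
        acc += (int64_t)src[k * n + x] * ver_table[k * n + y];
      tmp[y * n + x] = (int32_t)((acc + add_1st) >> shift_1st);
    }
  }
  for (int y = 0; y < n; ++y) {        // pass 2: one row at a time
    for (int x = 0; x < n; ++x) {
      int64_t acc = 0;
      for (int k = 0; k < n; ++k)
        acc += (int64_t)tmp[y * n + k] * hor_table[k * n + x];
      dst[y * n + x] = (int16_t)((acc + add_2nd) >> shift_2nd);
    }
  }
}
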
diff --git a/src/strategies/avx2/dct-avx2.c b/src/strategies/avx2/dct-avx2.c
index f875a581..71361feb 100644
--- a/src/strategies/avx2/dct-avx2.c
+++ b/src/strategies/avx2/dct-avx2.c
@@ -2752,9 +2752,9 @@ void fast_inverse_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t
   }
 
   __m256i v_hor_pass_out;
-  fast_inverse_tr_4x4_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width);
+  fast_inverse_tr_4x4_avx2_hor(src, &v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width);
 
-  fast_inverse_tr_4x4_avx2_ver(&v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height);
+  fast_inverse_tr_4x4_avx2_ver(&v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height);
 }
 
 
@@ -3568,39 +3568,46 @@ static void fast_inverse_tr_8x2_avx2_ver(const int16_t* src, __m256i* dst, const
 static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
 {
-  const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0
+  const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0;
   const __m256i debias = _mm256_set1_epi32(add);
 
   const __m256i* v_coeff = (const __m256i*)coeff;
 
   const __m256i v_shuffle1 = _mm256_load_si256((const __m256i*)fi_tr_2x8_result_shuffle1_ver);
   const __m256i v_shuffle2 = _mm256_load_si256((const __m256i*)fi_tr_2x8_result_shuffle2_ver);
 
-  __m256i v_madd_0 = _mm256_madd_epi16(src[0], v_coeff[0]);
-  __m256i v_madd_1 = _mm256_madd_epi16(src[0], v_coeff[1]);
-  __m256i v_madd_2 = _mm256_madd_epi16(src[0], v_coeff[2]);
-  __m256i v_madd_3 = _mm256_madd_epi16(src[0], v_coeff[3]);
-  __m256i v_madd_4 = _mm256_madd_epi16(src[0], v_coeff[4]);
-  __m256i v_madd_5 = _mm256_madd_epi16(src[0], v_coeff[5]);
-  __m256i v_madd_6 = _mm256_madd_epi16(src[0], v_coeff[6]);
-  __m256i v_madd_7 = _mm256_madd_epi16(src[0], v_coeff[7]);
+  // Duplicate sources to enable vertical addition
+  __m256i v_src_0 = _mm256_permute4x64_epi64(*src, _MM_SHUFFLE(1, 1, 0, 0));
+  __m256i v_src_1 = _mm256_permute4x64_epi64(*src, _MM_SHUFFLE(3, 3, 2, 2));
 
-  __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1);
-  __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3);
-  __m256i v_add_2 = _mm256_add_epi32(v_madd_4, v_madd_5);
-  __m256i v_add_3 = _mm256_add_epi32(v_madd_6, v_madd_7);
+  __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]);
+  __m256i v_madd_01 = _mm256_madd_epi16(v_src_1, v_coeff[1]);
+
+  __m256i v_madd_10 = _mm256_madd_epi16(v_src_0, v_coeff[2]);
+  __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]);
+
+  __m256i v_madd_20 = _mm256_madd_epi16(v_src_0, v_coeff[4]);
+  __m256i v_madd_21 = _mm256_madd_epi16(v_src_1, v_coeff[5]);
+
+  __m256i v_madd_30 = _mm256_madd_epi16(v_src_0, v_coeff[6]);
+  __m256i v_madd_31 = _mm256_madd_epi16(v_src_1, v_coeff[7]);
+
+  __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_01);
+  __m256i v_add_1 = _mm256_add_epi32(v_madd_10, v_madd_11);
+  __m256i v_add_2 = _mm256_add_epi32(v_madd_20, v_madd_21);
+  __m256i v_add_3 = _mm256_add_epi32(v_madd_30, v_madd_31);
 
   __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_add_0, v_add_1), debias, shift);
   __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_add_2, v_add_3), debias, shift);
 
   __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
-  v_result = _mm256_shuffle_epi8(v_result, v_shuffle1);
-  v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0));
-  v_result = _mm256_shuffle_epi8(v_result, v_shuffle2);
+  //v_result = _mm256_shuffle_epi8(v_result, v_shuffle1);
+  //v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0));
+  //v_result = _mm256_shuffle_epi8(v_result, v_shuffle2);
 
   _mm256_store_si256((__m256i*)dst, v_result);
 }
 
-void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
+void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, const int mts_type)
 {
   const int width = 8;
   const int height = 2;
@@ -3617,11 +3624,10 @@ void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t
 
   __m256i v_ver_pass_out;
   fast_inverse_tr_8x2_avx2_ver(src, &v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height);
-  
+
   fast_inverse_tr_8x2_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width);
 }
 
-
 void fast_forward_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
 {
   const int width = 8;
@@ -4062,9 +4068,9 @@ void fast_inverse_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, t
   }
 
   __m256i v_hor_pass_out[4];
-  fast_inverse_tr_8x8_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width);
+  fast_inverse_tr_8x8_avx2_hor(src, v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width);
 
-  fast_inverse_tr_8x8_avx2_ver(v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height);
+  fast_inverse_tr_8x8_avx2_ver(v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height);
 }
 
 
@@ -5636,9 +5642,9 @@ void fast_inverse_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor,
   }
 
   __m256i v_hor_pass_out[16];
-  fast_inverse_tr_16x16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width);
+  fast_inverse_tr_16x16_avx2_hor(src, v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width);
 
-  fast_inverse_tr_16x16_avx2_ver(v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height);
+  fast_inverse_tr_16x16_avx2_ver(v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height);
 }
 
 
@@ -8152,7 +8158,7 @@ int uvg_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth)
     success &= uvg_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2);
 
     success &= uvg_strategyselector_register(opaque, "mts_dct", "avx2", 40, &mts_dct_avx2);
-    //success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2);
+    success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2);
   }
 #endif // UVG_BIT_DEPTH == 8
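
Note on the 8x2 rewrite above: the two permutes broadcast the 64-bit
quarters of the source register, [l0.lo l1.lo l0.hi l1.hi] becomes
[l0.lo l0.lo l1.lo l1.lo] and [l0.hi l0.hi l1.hi l1.hi], so lane 0 works
on line 0 and lane 1 on line 1. With the reworked tables below, each
madd/add pair accumulates full 8-tap dot products and the hadd leaves
both lines in row-major order, which is why the old v_shuffle1/v_shuffle2
fix-up could be commented out (their loads are now dead as well). A
condensed sketch of the pattern (illustration only; the helper name is
hypothetical and the rounding shift done by truncate_avx2 is left out):

#include <immintrin.h>

/* src: two 8-sample lines quartered as [l0.lo l1.lo l0.hi l1.hi].
 * c:   four registers in the new fi_*_2x8 layout, each holding the
 *      low or high halves of two matrix columns in both 128-bit lanes.
 * Returns the raw sums [l0*col0..col3 | l1*col0..col3] as eight int32;
 * a second call with c[4..7] plus _mm256_packs_epi32 then yields the
 * 2x8 result in row-major order, with no result shuffle needed. */
static inline __m256i madd_2x8_half(__m256i src, const __m256i *c)
{
  __m256i lo = _mm256_permute4x64_epi64(src, _MM_SHUFFLE(1, 1, 0, 0));
  __m256i hi = _mm256_permute4x64_epi64(src, _MM_SHUFFLE(3, 3, 2, 2));
  __m256i sum01 = _mm256_add_epi32(_mm256_madd_epi16(lo, c[0]),
                                   _mm256_madd_epi16(hi, c[1]));
  __m256i sum23 = _mm256_add_epi32(_mm256_madd_epi16(lo, c[2]),
                                   _mm256_madd_epi16(hi, c[3]));
  return _mm256_hadd_epi32(sum01, sum23);
}
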
diff --git a/src/strategies/avx2/dct_avx2_tables.h b/src/strategies/avx2/dct_avx2_tables.h
index 2233916b..f56cb2cc 100644
--- a/src/strategies/avx2/dct_avx2_tables.h
+++ b/src/strategies/avx2/dct_avx2_tables.h
@@ -749,16 +749,40 @@ const int16_t ff_dst7_2x8_coeff_ver[128] = {
 
 
 ALIGNED(32) const int16_t fi_dct2_2x8_coeff_ver[128] = {
- 64, 89, 83, 75, 64, 89, 83, 75, 64, 75, 36, -18, 64, 75, 36, -18,
- 64, 50, 36, 18, 64, 50, 36, 18, -64, -89, -83, -50, -64, -89, -83, -50,
- 64, 50, -36, -89, 64, 50, -36, -89, 64, 18, -83, -50, 64, 18, -83, -50,
--64, 18, 83, 75, -64, 18, 83, 75, 64, 75, -36, -89, 64, 75, -36, -89,
- 64, -18, -83, 50, 64, -18, -83, 50, 64, -50, -36, 89, 64, -50, -36, 89,
- 64, -75, -36, 89, 64, -75, -36, 89, -64, -18, 83, -75, -64, -18, 83, -75,
- 64, -75, 36, 18, 64, -75, 36, 18, 64, -89, 83, -75, 64, -89, 83, -75,
--64, 89, -83, 50, -64, 89, -83, 50, 64, -50, 36, -18, 64, -50, 36, -18,
+ 64, 89, 83, 75, 64, 75, 36, -18, 64, 89, 83, 75, 64, 75, 36, -18,
+ 64, 50, 36, 18, -64, -89, -83, -50, 64, 50, 36, 18, -64, -89, -83, -50,
+ 64, 50, -36, -89, 64, 18, -83, -50, 64, 50, -36, -89, 64, 18, -83, -50,
+-64, 18, 83, 75, 64, 75, -36, -89, -64, 18, 83, 75, 64, 75, -36, -89,
+ 64, -18, -83, 50, 64, -50, -36, 89, 64, -18, -83, 50, 64, -50, -36, 89,
+ 64, -75, -36, 89, -64, -18, 83, -75, 64, -75, -36, 89, -64, -18, 83, -75,
+ 64, -75, 36, 18, 64, -89, 83, -75, 64, -75, 36, 18, 64, -89, 83, -75,
+-64, 89, -83, 50, 64, -50, 36, -18, -64, 89, -83, 50, 64, -50, 36, -18,
 };
 
+ALIGNED(32) const int16_t fi_dst7_2x8_coeff_ver[128] = {
+ 17, 46, 71, 85, 32, 78, 85, 46, 17, 46, 71, 85, 32, 78, 85, 46,
+ 86, 78, 60, 32, -17, -71, -86, -60, 86, 78, 60, 32, -17, -71, -86, -60,
+ 46, 86, 32, -60, 60, 71, -46, -78, 46, 86, 32, -60, 60, 71, -46, -78,
+-85, -17, 71, 78, 32, 85, -17, -86, -85, -17, 71, 78, 32, 85, -17, -86,
+ 71, 32, -86, 17, 78, -17, -60, 86, 71, 32, -86, 17, 78, -17, -60, 86,
+ 78, -60, -46, 85, -46, -32, 85, -71, 78, -60, -46, 85, -46, -32, 85, -71,
+ 85, -60, 17, 32, 86, -85, 78, -71, 85, -60, 17, 32, 86, -85, 78, -71,
+-71, 86, -78, 46, 60, -46, 32, -17, -71, 86, -78, 46, 60, -46, 32, -17,
+};
+
+ALIGNED(32) const int16_t fi_dct8_2x8_coeff_ver[128] = {
+ 86, 85, 78, 71, 85, 60, 17, -32, 86, 85, 78, 71, 85, 60, 17, -32,
+ 60, 46, 32, 17, -71, -86, -78, -46, 60, 46, 32, 17, -71, -86, -78, -46,
+ 78, 17, -60, -86, 71, -32, -86, -17, 78, 17, -60, -86, 71, -32, -86, -17,
+-46, 32, 85, 71, 78, 60, -46, -85, -46, 32, 85, 71, 78, 60, -46, -85,
+ 60, -71, -46, 78, 46, -86, 32, 60, 60, -71, -46, 78, 46, -86, 32, 60,
+ 32, -85, -17, 86, -85, 17, 71, -78, 32, -85, -17, 86, -85, 17, 71, -78,
+ 32, -78, 85, -46, 17, -46, 71, -85, 32, -78, 85, -46, 17, -46, 71, -85,
+-17, 71, -86, 60, 86, -78, 60, -32, -17, 71, -86, 60, 86, -78, 60, -32,
+};
+
+
 
 ALIGNED(32) const int16_t fi_dct2_2x16_coeff_ver[512] = {
  64, 90, 89, 87, 64, 90, 89, 87, 64, 57, 50, 43, 64, 57, 50, 43, // 0
 83, 80, 75, 70, 83, 80, 75, 70, 36, 25, 18, 9, 36, 25, 18, 9,
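
Note on the new table layout: each 256-bit row of the fi_*_2x8_coeff_ver
tables carries the low (or high) four samples of two consecutive
transform-matrix columns, repeated in both 128-bit lanes, so register
pair 2j/2j+1 covers the full 8-tap columns 2j and 2j+1 consumed by the
madd/add/hadd chain above. A generator sketch (illustration only; the
helper is hypothetical and T is the plain row-major 8x8 forward matrix
of the respective transform):

#include <stdint.h>

/* Register 2*j   holds { T[0..3][2j], T[0..3][2j+1] } in both lanes,
 * register 2*j+1 holds { T[4..7][2j], T[4..7][2j+1] } likewise. */
static void build_fi_2x8_ver_table(const int16_t T[8][8], int16_t out[128])
{
  for (int j = 0; j < 4; ++j) {            // column pair 2j, 2j+1
    for (int half = 0; half < 2; ++half) { // rows 0..3, then rows 4..7
      int16_t *reg = out + (2 * j + half) * 16;
      for (int lane = 0; lane < 2; ++lane)
        for (int col = 0; col < 2; ++col)
          for (int r = 0; r < 4; ++r)
            reg[lane * 8 + col * 4 + r] = T[half * 4 + r][2 * j + col];
    }
  }
}

With the 2x8 inverse path consistent again, re-enabling the mts_idct
registration in uvg_strategy_register_dct_avx2 routes inverse MTS through
these AVX2 kernels; previously only the forward mts_dct strategy was
registered.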