Mirror of https://github.com/ultravideo/uvg266.git (synced 2024-11-23 18:14:06 +00:00)
[avx2] Inverses work when ISP is not enabled

parent 4dccbcc30d
commit b78f9aff17
@@ -2752,9 +2752,9 @@ void fast_inverse_tr_4x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
   }
 
   __m256i v_hor_pass_out;
-  fast_inverse_tr_4x4_avx2_hor(src, &v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width);
+  fast_inverse_tr_4x4_avx2_hor(src, &v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width);
 
-  fast_inverse_tr_4x4_avx2_ver(&v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height);
+  fast_inverse_tr_4x4_avx2_ver(&v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height);
 }
 
 
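The substance of the fix is visible in this first hunk: the inverse 4x4 now feeds ver_coeff to its first pass and hor_coeff to its second. A plausible reading (the commit message does not spell it out): a separable 2-D transform applies one 1-D transform per direction, and the inverse must undo the forward stages in reverse order, so the pass that runs first has to use the vertical transform's coefficients. A scalar sketch of one such pass, assuming an orthogonal integer-scaled coefficient matrix; the names are hypothetical and the rounding shift is omitted, so none of this is uvg266 API:

#include <stdint.h>

// One pass of a separable inverse transform over an n x n block: multiply
// each column of src by the transpose of the coefficient matrix C (stored
// row-major), since the inverse of dst = C * src is src = C^T * dst for an
// orthogonal integer-scaled C. Rounding shifts and clipping are omitted.
static void inv_pass_cols_sketch(const int16_t *src, int32_t *dst,
                                 const int16_t *coeff, int n)
{
  for (int x = 0; x < n; ++x) {        // each column of the block
    for (int y = 0; y < n; ++y) {      // each output row
      int32_t acc = 0;
      for (int k = 0; k < n; ++k) {
        acc += (int32_t)coeff[k * n + y] * src[k * n + x];  // row y of C^T
      }
      dst[y * n + x] = acc;
    }
  }
}

With hor == ver the swap has no effect, which would explain why plain DCT2/DCT2 paths looked fine and the mismatch only surfaced with the asymmetric transform combinations that ISP and MTS select.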
@@ -3568,39 +3568,46 @@ static void fast_inverse_tr_8x2_avx2_ver(const int16_t* src, __m256i* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
 
 static void fast_inverse_tr_8x2_avx2_hor(const __m256i* src, int16_t* dst, const int16_t* coeff, int32_t shift, int line, int skip_line, int skip_line2)
 {
-  const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0; // ISP_TODO: optimize (shift > 0) check out if shift is always gt 0
+  const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0;
   const __m256i debias = _mm256_set1_epi32(add);
 
   const __m256i* v_coeff = (const __m256i*)coeff;
   const __m256i v_shuffle1 = _mm256_load_si256((const __m256i*)fi_tr_2x8_result_shuffle1_ver);
   const __m256i v_shuffle2 = _mm256_load_si256((const __m256i*)fi_tr_2x8_result_shuffle2_ver);
 
-  __m256i v_madd_0 = _mm256_madd_epi16(src[0], v_coeff[0]);
-  __m256i v_madd_1 = _mm256_madd_epi16(src[0], v_coeff[1]);
-  __m256i v_madd_2 = _mm256_madd_epi16(src[0], v_coeff[2]);
-  __m256i v_madd_3 = _mm256_madd_epi16(src[0], v_coeff[3]);
-  __m256i v_madd_4 = _mm256_madd_epi16(src[0], v_coeff[4]);
-  __m256i v_madd_5 = _mm256_madd_epi16(src[0], v_coeff[5]);
-  __m256i v_madd_6 = _mm256_madd_epi16(src[0], v_coeff[6]);
-  __m256i v_madd_7 = _mm256_madd_epi16(src[0], v_coeff[7]);
+  // Duplicate sources to enable vertical addition
+  __m256i v_src_0 = _mm256_permute4x64_epi64(*src, _MM_SHUFFLE(1, 1, 0, 0));
+  __m256i v_src_1 = _mm256_permute4x64_epi64(*src, _MM_SHUFFLE(3, 3, 2, 2));
 
-  __m256i v_add_0 = _mm256_add_epi32(v_madd_0, v_madd_1);
-  __m256i v_add_1 = _mm256_add_epi32(v_madd_2, v_madd_3);
-  __m256i v_add_2 = _mm256_add_epi32(v_madd_4, v_madd_5);
-  __m256i v_add_3 = _mm256_add_epi32(v_madd_6, v_madd_7);
+  __m256i v_madd_00 = _mm256_madd_epi16(v_src_0, v_coeff[0]);
+  __m256i v_madd_01 = _mm256_madd_epi16(v_src_1, v_coeff[1]);
+
+  __m256i v_madd_10 = _mm256_madd_epi16(v_src_0, v_coeff[2]);
+  __m256i v_madd_11 = _mm256_madd_epi16(v_src_1, v_coeff[3]);
+
+  __m256i v_madd_20 = _mm256_madd_epi16(v_src_0, v_coeff[4]);
+  __m256i v_madd_21 = _mm256_madd_epi16(v_src_1, v_coeff[5]);
+
+  __m256i v_madd_30 = _mm256_madd_epi16(v_src_0, v_coeff[6]);
+  __m256i v_madd_31 = _mm256_madd_epi16(v_src_1, v_coeff[7]);
+
+  __m256i v_add_0 = _mm256_add_epi32(v_madd_00, v_madd_01);
+  __m256i v_add_1 = _mm256_add_epi32(v_madd_10, v_madd_11);
+  __m256i v_add_2 = _mm256_add_epi32(v_madd_20, v_madd_21);
+  __m256i v_add_3 = _mm256_add_epi32(v_madd_30, v_madd_31);
 
   __m256i v_trunc_0 = truncate_avx2(_mm256_hadd_epi32(v_add_0, v_add_1), debias, shift);
   __m256i v_trunc_1 = truncate_avx2(_mm256_hadd_epi32(v_add_2, v_add_3), debias, shift);
 
   __m256i v_result = _mm256_packs_epi32(v_trunc_0, v_trunc_1);
-  v_result = _mm256_shuffle_epi8(v_result, v_shuffle1);
-  v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0));
-  v_result = _mm256_shuffle_epi8(v_result, v_shuffle2);
+  //v_result = _mm256_shuffle_epi8(v_result, v_shuffle1);
+  //v_result = _mm256_permute4x64_epi64(v_result, _MM_SHUFFLE(3, 1, 2, 0));
+  //v_result = _mm256_shuffle_epi8(v_result, v_shuffle2);
 
   _mm256_store_si256((__m256i*)dst, v_result);
 }
 
-void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
+void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, const int mts_type)
 {
   const int width = 8;
   const int height = 2;
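This hunk rewrites the 8x2 horizontal kernel: instead of eight madds of the same source register against eight coefficient registers followed by cross-lane pairwise adds, it duplicates the source's 64-bit groups into two registers so that partial sums line up vertically and plain _mm256_add_epi32 can combine them; the three result shuffles become unnecessary and are commented out. The duplication relies only on the documented _mm256_permute4x64_epi64 semantics; a self-contained demo (build with -mavx2):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  // Four 64-bit lanes holding 0, 1, 2, 3 (lane 0 is the lowest).
  __m256i v = _mm256_set_epi64x(3, 2, 1, 0);
  // _MM_SHUFFLE(1, 1, 0, 0): result lanes select source lanes 0, 0, 1, 1.
  __m256i lo = _mm256_permute4x64_epi64(v, _MM_SHUFFLE(1, 1, 0, 0));
  // _MM_SHUFFLE(3, 3, 2, 2): result lanes select source lanes 2, 2, 3, 3.
  __m256i hi = _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 3, 2, 2));

  long long a[4], b[4];
  _mm256_storeu_si256((__m256i*)a, lo);
  _mm256_storeu_si256((__m256i*)b, hi);
  printf("lo: %lld %lld %lld %lld\n", a[0], a[1], a[2], a[3]); // 0 0 1 1
  printf("hi: %lld %lld %lld %lld\n", b[0], b[1], b[2], b[3]); // 2 2 3 3
  return 0;
}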
@@ -3617,11 +3624,10 @@ void fast_inverse_tr_8x2_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
 
   __m256i v_ver_pass_out;
   fast_inverse_tr_8x2_avx2_ver(src, &v_ver_pass_out, ver_coeff, shift_1st, width, skip_width, skip_height);
 
   fast_inverse_tr_8x2_avx2_hor(&v_ver_pass_out, dst, hor_coeff, shift_2nd, height, 0, skip_width);
 }
 
 
 void fast_forward_tr_8x4_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
 {
   const int width = 8;
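The add/debias pair in the kernels above is the usual rounding term for a scaling shift: add half of 2^shift before shifting right. The (shift > 0) guard, which the deleted ISP_TODO comment wanted to optimize away, only protects against 1 << -1 when shift is zero. truncate_avx2 itself is not part of this diff, so the following is a sketch of the scalar rule plus a guess at the vector form, consistent with how debias and shift are passed above:

#include <stdint.h>
#include <immintrin.h>

// Scalar form of the rounding shift: (v + 2^(shift-1)) >> shift.
// The guard avoids the undefined 1 << -1 when shift happens to be 0.
static inline int32_t round_shift(int32_t v, int32_t shift)
{
  const int32_t add = (shift > 0) ? (1 << (shift - 1)) : 0;
  return (v + add) >> shift;  // e.g. round_shift(100, 6) == 2
}

// A guess at the vector helper: debias already holds the rounding constant
// in every 32-bit lane (see _mm256_set1_epi32(add) above).
static inline __m256i truncate_avx2_sketch(__m256i v, __m256i debias, int32_t shift)
{
  return _mm256_sra_epi32(_mm256_add_epi32(v, debias), _mm_cvtsi32_si128(shift));
}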
@@ -4062,9 +4068,9 @@ void fast_inverse_tr_8x8_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
   }
 
   __m256i v_hor_pass_out[4];
-  fast_inverse_tr_8x8_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width);
+  fast_inverse_tr_8x8_avx2_hor(src, v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width);
 
-  fast_inverse_tr_8x8_avx2_ver(v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height);
+  fast_inverse_tr_8x8_avx2_ver(v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height);
 }
 
 
@@ -5636,9 +5642,9 @@ void fast_inverse_tr_16x16_avx2(const int16_t* src, int16_t* dst, tr_type_t hor, tr_type_t ver)
   }
 
   __m256i v_hor_pass_out[16];
-  fast_inverse_tr_16x16_avx2_hor(src, v_hor_pass_out, hor_coeff, shift_1st, height, 0, skip_width);
+  fast_inverse_tr_16x16_avx2_hor(src, v_hor_pass_out, ver_coeff, shift_1st, height, 0, skip_width);
 
-  fast_inverse_tr_16x16_avx2_ver(v_hor_pass_out, dst, ver_coeff, shift_2nd, width, skip_width, skip_height);
+  fast_inverse_tr_16x16_avx2_ver(v_hor_pass_out, dst, hor_coeff, shift_2nd, width, skip_width, skip_height);
 }
 
 
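The 8x8 and 16x16 inverses above receive exactly the same coefficient swap as the 4x4 case. The hor_coeff/ver_coeff pointers are presumably selected from the hor/ver transform types before these calls; that lookup is outside this diff, so the following is only a hypothetical illustration using the 2x8 tables this commit adds further down (the enum is a stand-in, not the real tr_type_t definition):

#include <stdint.h>
#include <stddef.h>

// Stand-in for uvg266's transform-type enum (the real tr_type_t definition
// is not part of this diff).
typedef enum { DCT2, DCT8, DST7 } tr_type_sketch_t;

extern const int16_t fi_dct2_2x8_coeff_ver[128];
extern const int16_t fi_dst7_2x8_coeff_ver[128];
extern const int16_t fi_dct8_2x8_coeff_ver[128];

// Hypothetical lookup: pick the vertical-pass table for a 2x8 inverse.
static const int16_t *fi_2x8_ver_coeff_sketch(tr_type_sketch_t type)
{
  switch (type) {
    case DCT2: return fi_dct2_2x8_coeff_ver;
    case DST7: return fi_dst7_2x8_coeff_ver;
    case DCT8: return fi_dct8_2x8_coeff_ver;
    default:   return NULL;
  }
}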
@@ -8152,7 +8158,7 @@ int uvg_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth)
     success &= uvg_strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2);
 
     success &= uvg_strategyselector_register(opaque, "mts_dct", "avx2", 40, &mts_dct_avx2);
-    //success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2);
+    success &= uvg_strategyselector_register(opaque, "mts_idct", "avx2", 40, &mts_idct_avx2);
 
   }
 #endif // UVG_BIT_DEPTH == 8
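Uncommenting the mts_idct registration is what actually puts the fixed inverse kernels into service: uvg_strategyselector_register files the AVX2 implementation under the "mts_idct" strategy name at priority 40, and the selector later picks the highest-priority registered implementation the CPU supports. A minimal model of that selection, hypothetical except for the fields visible in the calls above:

#include <stddef.h>
#include <string.h>

// Hypothetical model; uvg266's real selector lives in strategyselector.c
// and additionally checks runtime CPU feature flags.
typedef struct {
  const char *strategy;   // e.g. "mts_idct"
  const char *impl;       // e.g. "avx2" or "generic"
  int priority;           // e.g. 40 for AVX2
  void *fptr;             // the registered implementation
} strategy_entry_t;

static void *select_best(const strategy_entry_t *entries, size_t n,
                         const char *strategy)
{
  void *best = NULL;
  int best_priority = -1;
  for (size_t i = 0; i < n; ++i) {
    if (strcmp(entries[i].strategy, strategy) == 0 &&
        entries[i].priority > best_priority) {
      best_priority = entries[i].priority;
      best = entries[i].fptr;
    }
  }
  return best;
}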
@@ -749,16 +749,40 @@ const int16_t ff_dst7_2x8_coeff_ver[128] = {
 
 
 ALIGNED(32) const int16_t fi_dct2_2x8_coeff_ver[128] = {
- 64, 89, 83, 75, 64, 89, 83, 75, 64, 75, 36, -18, 64, 75, 36, -18,
- 64, 50, 36, 18, 64, 50, 36, 18, -64, -89, -83, -50, -64, -89, -83, -50,
- 64, 50, -36, -89, 64, 50, -36, -89, 64, 18, -83, -50, 64, 18, -83, -50,
- -64, 18, 83, 75, -64, 18, 83, 75, 64, 75, -36, -89, 64, 75, -36, -89,
- 64, -18, -83, 50, 64, -18, -83, 50, 64, -50, -36, 89, 64, -50, -36, 89,
- 64, -75, -36, 89, 64, -75, -36, 89, -64, -18, 83, -75, -64, -18, 83, -75,
- 64, -75, 36, 18, 64, -75, 36, 18, 64, -89, 83, -75, 64, -89, 83, -75,
- -64, 89, -83, 50, -64, 89, -83, 50, 64, -50, 36, -18, 64, -50, 36, -18,
+ 64, 89, 83, 75, 64, 75, 36, -18, 64, 89, 83, 75, 64, 75, 36, -18,
+ 64, 50, 36, 18, -64, -89, -83, -50, 64, 50, 36, 18, -64, -89, -83, -50,
+ 64, 50, -36, -89, 64, 18, -83, -50, 64, 50, -36, -89, 64, 18, -83, -50,
+ -64, 18, 83, 75, 64, 75, -36, -89, -64, 18, 83, 75, 64, 75, -36, -89,
+ 64, -18, -83, 50, 64, -50, -36, 89, 64, -18, -83, 50, 64, -50, -36, 89,
+ 64, -75, -36, 89, -64, -18, 83, -75, 64, -75, -36, 89, -64, -18, 83, -75,
+ 64, -75, 36, 18, 64, -89, 83, -75, 64, -75, 36, 18, 64, -89, 83, -75,
+ -64, 89, -83, 50, 64, -50, 36, -18, -64, 89, -83, 50, 64, -50, 36, -18,
 };
 
+ALIGNED(32) const int16_t fi_dst7_2x8_coeff_ver[128] = {
+ 17, 46, 71, 85, 32, 78, 85, 46, 17, 46, 71, 85, 32, 78, 85, 46,
+ 86, 78, 60, 32, -17, -71, -86, -60, 86, 78, 60, 32, -17, -71, -86, -60,
+ 46, 86, 32, -60, 60, 71, -46, -78, 46, 86, 32, -60, 60, 71, -46, -78,
+ -85, -17, 71, 78, 32, 85, -17, -86, -85, -17, 71, 78, 32, 85, -17, -86,
+ 71, 32, -86, 17, 78, -17, -60, 86, 71, 32, -86, 17, 78, -17, -60, 86,
+ 78, -60, -46, 85, -46, -32, 85, -71, 78, -60, -46, 85, -46, -32, 85, -71,
+ 85, -60, 17, 32, 86, -85, 78, -71, 85, -60, 17, 32, 86, -85, 78, -71,
+ -71, 86, -78, 46, 60, -46, 32, -17, -71, 86, -78, 46, 60, -46, 32, -17,
+};
+
+ALIGNED(32) const int16_t fi_dct8_2x8_coeff_ver[128] = {
+ 86, 85, 78, 71, 85, 60, 17, -32, 86, 85, 78, 71, 85, 60, 17, -32,
+ 60, 46, 32, 17, -71, -86, -78, -46, 60, 46, 32, 17, -71, -86, -78, -46,
+ 78, 17, -60, -86, 71, -32, -86, -17, 78, 17, -60, -86, 71, -32, -86, -17,
+ -46, 32, 85, 71, 78, 60, -46, -85, -46, 32, 85, 71, 78, 60, -46, -85,
+ 60, -71, -46, 78, 46, -86, 32, 60, 60, -71, -46, 78, 46, -86, 32, 60,
+ 32, -85, -17, 86, -85, 17, 71, -78, 32, -85, -17, 86, -85, 17, 71, -78,
+ 32, -78, 85, -46, 17, -46, 71, -85, 32, -78, 85, -46, 17, -46, 71, -85,
+ -17, 71, -86, 60, 86, -78, 60, -32, -17, 71, -86, 60, 86, -78, 60, -32,
+};
+
+
 ALIGNED(32) const int16_t fi_dct2_2x16_coeff_ver[512] = {
  64, 90, 89, 87, 64, 90, 89, 87, 64, 57, 50, 43, 64, 57, 50, 43, // 0
  83, 80, 75, 70, 83, 80, 75, 70, 36, 25, 18, 9, 36, 25, 18, 9,
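The table hunk completes the picture: each fi_dct2_2x8 row is re-interleaved from an [X X Y Y] to an [X Y X Y] 64-bit-group layout (X and Y being the 4-coefficient groups for two different outputs), and matching interleaved DST-7 and DCT-8 tables are added for MTS. With sources duplicated as [s0 s0 s1 s1], one _mm256_madd_epi16 then produces alternating partial sums for both outputs, which is what lets the rewritten 8x2 kernel drop its result shuffles. A runnable illustration (toy sample values; the coefficient row is the first row of the new fi_dct2_2x8 table; build with -mavx2):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Two 4-sample groups, each duplicated into adjacent 64-bit lanes,
  // mirroring v_src_0 = permute4x64(src, _MM_SHUFFLE(1, 1, 0, 0)).
  __m256i v_src = _mm256_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4,
                                    5, 6, 7, 8, 5, 6, 7, 8);
  // First row of the re-interleaved table: group X = 64,89,83,75 for one
  // output, group Y = 64,75,36,-18 for the next, alternating per lane.
  __m256i v_cf = _mm256_setr_epi16(64, 89, 83, 75, 64, 75, 36, -18,
                                   64, 89, 83, 75, 64, 75, 36, -18);
  // madd multiplies adjacent int16 pairs and adds them: each int32 lane is
  // half of a 4-tap dot product, alternating between group X and group Y.
  __m256i v_madd = _mm256_madd_epi16(v_src, v_cf);

  int32_t out[8];
  _mm256_storeu_si256((__m256i*)out, v_madd);
  // out[0]+out[1] = (1,2,3,4)·X = 791, out[2]+out[3] = (1,2,3,4)·Y = 250, etc.
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]);
  printf("\n");
  return 0;
}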