Added 32x32 IDCT

2024-11-28 03:34:06 +00:00 · 2014-09-25 17:17:47 +03:00 · 2014-09-25 17:17:47 +03:00 · e7bcb58846
parent eacf173b7e
commit e7bcb58846
1 changed file with 14 additions and 75 deletions
--- a/src/strategies/avx2/dct-avx2.c
+++ b/src/strategies/avx2/dct-avx2.c
@ -552,84 +552,16 @@ static void matrix_transform_2d_32x32_avx2(const int16_t *src, int16_t *dst, con
  mul_matrix_32x32_avx2(transform, tmp, dst, shift1);
 }
-
+static void matrix_itransform_2d_32x32_avx2(const int16_t *src, int16_t *dst, const int16_t *transform, const int16_t shift0, const int16_t shift1)
 static void partial_butterfly_inverse_32_avx2(int16_t *src, int16_t *dst,
  int32_t shift)
 {
-  int32_t j, k;
+  int16_t tmp[32 * 32];
-  int32_t e[16], o[16];
+  int16_t transposed[32 * 32];
  int32_t ee[8], eo[8];
  int32_t eee[4], eeo[4];
  int32_t eeee[2], eeeo[2];
  int32_t add = 1 << (shift - 1);
  const int32_t line = 32;
-  for (j = 0; j<line; j++) {
+  transpose_32x32_16bit(transform, transposed);
-    // Utilizing symmetry properties to the maximum to minimize the number of multiplications
+  mul_matrix_32x32_avx2(transposed, src, tmp, shift0);
-    for (k = 0; k < 16; k++) {
+  mul_matrix_32x32_avx2(tmp, transform, dst, shift1);
      o[k] = g_t32[1][k] * src[line] + g_t32[3][k] * src[3 * line] + g_t32[5][k] * src[5 * line] + g_t32[7][k] * src[7 * line] +
        g_t32[9][k] * src[9 * line] + g_t32[11][k] * src[11 * line] + g_t32[13][k] * src[13 * line] + g_t32[15][k] * src[15 * line] +
        g_t32[17][k] * src[17 * line] + g_t32[19][k] * src[19 * line] + g_t32[21][k] * src[21 * line] + g_t32[23][k] * src[23 * line] +
        g_t32[25][k] * src[25 * line] + g_t32[27][k] * src[27 * line] + g_t32[29][k] * src[29 * line] + g_t32[31][k] * src[31 * line];
    }
    for (k = 0; k < 8; k++) {
      eo[k] = g_t32[2][k] * src[2 * line] + g_t32[6][k] * src[6 * line] + g_t32[10][k] * src[10 * line] + g_t32[14][k] * src[14 * line] +
        g_t32[18][k] * src[18 * line] + g_t32[22][k] * src[22 * line] + g_t32[26][k] * src[26 * line] + g_t32[30][k] * src[30 * line];
    }
    for (k = 0; k < 4; k++) {
      eeo[k] = g_t32[4][k] * src[4 * line] + g_t32[12][k] * src[12 * line] + g_t32[20][k] * src[20 * line] + g_t32[28][k] * src[28 * line];
    }
    eeeo[0] = g_t32[8][0] * src[8 * line] + g_t32[24][0] * src[24 * line];
    eeeo[1] = g_t32[8][1] * src[8 * line] + g_t32[24][1] * src[24 * line];
    eeee[0] = g_t32[0][0] * src[0] + g_t32[16][0] * src[16 * line];
    eeee[1] = g_t32[0][1] * src[0] + g_t32[16][1] * src[16 * line];
    // Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector
    eee[0] = eeee[0] + eeeo[0];
    eee[3] = eeee[0] - eeeo[0];
    eee[1] = eeee[1] + eeeo[1];
    eee[2] = eeee[1] - eeeo[1];
    for (k = 0; k < 4; k++) {
      ee[k] = eee[k] + eeo[k];
      ee[k + 4] = eee[3 - k] - eeo[3 - k];
    }
    for (k = 0; k < 8; k++) {
      e[k] = ee[k] + eo[k];
      e[k + 8] = ee[7 - k] - eo[7 - k];
    }
    for (k = 0; k<16; k++) {
      dst[k] = (short)MAX(-32768, MIN(32767, (e[k] + o[k] + add) >> shift));
      dst[k + 16] = (short)MAX(-32768, MIN(32767, (e[15 - k] - o[15 - k] + add) >> shift));
    }
    src++;
    dst += 32;
  }
 }
 // DCT_NXN_AVX2(n) expands to the definition of a static function named
 // dct_<n>x<n>_avx2(bitdepth, block, coeff).
 //
 // The generated function runs partial_butterfly_<n>_avx2 twice:
 //   pass 1: block -> tmp   using shift_1st = g_convert_to_bit[n] + 1 + (bitdepth - 8)
 //   pass 2: tmp   -> coeff using shift_2nd = g_convert_to_bit[n] + 8
 // tmp is an n*n int16_t scratch buffer local to the generated function.
 //
 // NOTE(review): partial_butterfly_<n>_avx2 must exist for every n this macro
 // is instantiated with; those definitions are not visible here -- confirm.
 // Comments are kept outside the macro body on purpose: a // comment on a
 // backslash-continued line would swallow the rest of the definition.
 #define DCT_NXN_AVX2(n) \
 static void dct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_t *coeff) { \
  \
  int16_t tmp[n*n]; \
  int32_t shift_1st = g_convert_to_bit[n] + 1 + (bitdepth - 8); \
  int32_t shift_2nd = g_convert_to_bit[n] + 8; \
  \
  partial_butterfly_ ## n ## _avx2(block, tmp, shift_1st); \
  partial_butterfly_ ## n ## _avx2(tmp, coeff, shift_2nd); \
 }
 // IDCT_NXN_AVX2(n) expands to the definition of a static function named
 // idct_<n>x<n>_avx2(bitdepth, block, coeff).
 //
 // Data flows in the opposite direction to DCT_NXN_AVX2: coeff is the input
 // and block is the output. The generated function runs
 // partial_butterfly_inverse_<n>_avx2 twice:
 //   pass 1: coeff -> tmp   using shift_1st = 7
 //   pass 2: tmp   -> block using shift_2nd = 12 - (bitdepth - 8)
 // so the second-pass shift gets smaller as bitdepth rises above 8.
 // tmp is an n*n int16_t scratch buffer local to the generated function.
 #define IDCT_NXN_AVX2(n) \
 static void idct_ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *block, int16_t *coeff) { \
 \
  int16_t tmp[n*n]; \
  int32_t shift_1st = 7; \
  int32_t shift_2nd = 12 - (bitdepth - 8); \
 \
  partial_butterfly_inverse_ ## n ## _avx2(coeff, tmp, shift_1st); \
  partial_butterfly_inverse_ ## n ## _avx2(tmp, block, shift_2nd); \
 }
 // Instantiates idct_32x32_avx2, which calls partial_butterfly_inverse_32_avx2.
 IDCT_NXN_AVX2(32);
 static void matrix_dst_4x4_avx2(int8_t bitdepth, int16_t *src, int16_t *dst)
 {
  int32_t shift_1st = g_convert_to_bit[4] + 1 + (bitdepth - 8);
@ -692,6 +624,13 @@ static void matrix_dct_32x32_avx2(int8_t bitdepth, int16_t *src, int16_t *dst)
  int32_t shift_2nd = g_convert_to_bit[32] + 8;
  matrix_transform_2d_32x32_avx2(src, dst, (const int16_t*)g_t32, shift_1st, shift_2nd);
 }
 // 32x32 inverse transform entry point: forwards to
 // matrix_itransform_2d_32x32_avx2 with g_t32 as the transform matrix.
 //
 // bitdepth  Only used to derive the second-stage shift.
 // dst       Output buffer, passed on as the destination.
 // src       Input buffer, passed on as the source.
 //
 // Note the parameter order: dst comes BEFORE src here, which is the reverse
 // of matrix_dct_32x32_avx2(bitdepth, src, dst).
 static void matrix_idct_32x32_avx2(int8_t bitdepth, int16_t *dst, int16_t *src)
 {
  // Fixed first-stage shift; the second-stage shift shrinks by one for every
  // bit of depth above 8.
  int32_t shift_1st = 7;
  int32_t shift_2nd = 12 - (bitdepth - 8);
  // NOTE(review): the callee declares its shift parameters as const int16_t,
  // so these int32_t values are implicitly narrowed at the call.
  matrix_itransform_2d_32x32_avx2(src, dst, (const int16_t*)g_t32, shift_1st, shift_2nd);
 }
 #endif //COMPILE_INTEL_AVX2
 int strategy_register_dct_avx2(void* opaque)
@ -710,7 +649,7 @@ int strategy_register_dct_avx2(void* opaque)
  success &= strategyselector_register(opaque, "idct_4x4", "avx2", 40, &matrix_idct_4x4_avx2);
  success &= strategyselector_register(opaque, "idct_8x8", "avx2", 40, &matrix_idct_8x8_avx2);
  success &= strategyselector_register(opaque, "idct_16x16", "avx2", 40, &matrix_idct_16x16_avx2);
-  success &= strategyselector_register(opaque, "idct_32x32", "avx2", 40, &idct_32x32_avx2);
+  success &= strategyselector_register(opaque, "idct_32x32", "avx2", 40, &matrix_idct_32x32_avx2);
 #endif //COMPILE_INTEL_AVX2  
  return success;
 }