Transform functions in dct-avx2.c are now generated with macros. The coefficient tables are renamed to a uniform g_<type>_<size> scheme (g_dst_4, g_dct_8, ...) so the macros can build the identifiers by token pasting.

Ari Lemmetti 2014-10-02 13:11:46 +03:00 committed by Ari Koivula
parent 9407610555
commit 61e1510480
3 changed files with 119 additions and 232 deletions


@@ -30,17 +30,17 @@
#if COMPILE_INTEL_AVX2
#include <immintrin.h>
extern const int16_t g_dst[4][4];
extern const int16_t g_t4[4][4];
extern const int16_t g_t8[8][8];
extern const int16_t g_t16[16][16];
extern const int16_t g_t32[32][32];
extern const int16_t g_dst_4[4][4];
extern const int16_t g_dct_4[4][4];
extern const int16_t g_dct_8[8][8];
extern const int16_t g_dct_16[16][16];
extern const int16_t g_dct_32[32][32];
extern const int16_t g_dst_t[4][4];
extern const int16_t g_t4_t[4][4];
extern const int16_t g_t8_t[8][8];
extern const int16_t g_t16_t[16][16];
extern const int16_t g_t32_t[32][32];
extern const int16_t g_dst_4_t[4][4];
extern const int16_t g_dct_4_t[4][4];
extern const int16_t g_dct_8_t[8][8];
extern const int16_t g_dct_16_t[16][16];
extern const int16_t g_dct_32_t[32][32];
/**
* \brief AVX2 transform functions
@@ -481,155 +481,42 @@ static void mul_clip_matrix_32x32_avx2(const int16_t *first, const int16_t *seco
}
}
static void matrix_dst_2d_4x4_avx2(const int16_t *src, int16_t *dst, const int16_t *transform, const int16_t shift0, const int16_t shift1)
{
int16_t tmp[4 * 4];
#define TRANSFORM(type, n) \
\
static void matrix_ ## type ## _ ## n ## x ## n ## _avx2(int8_t bitdepth, const int16_t *src, int16_t *dst)\
{\
int32_t shift_1st = g_convert_to_bit[n] + 1 + (bitdepth - 8); \
int32_t shift_2nd = g_convert_to_bit[n] + 8; \
int16_t tmp[n * n];\
\
mul_clip_matrix_ ## n ## x ## n ## _avx2(src, (int16_t*)g_ ## type ## _ ## n ## _t, tmp, shift_1st);\
mul_clip_matrix_ ## n ## x ## n ## _avx2((int16_t*)g_ ## type ## _ ## n, tmp, dst, shift_2nd);\
}\
mul_clip_matrix_4x4_avx2(src, (int16_t*)g_dst_t, tmp, shift0);
mul_clip_matrix_4x4_avx2((int16_t*)g_dst, tmp, dst, shift1);
}
#define ITRANSFORM(type, n) \
\
static void matrix_i ## type ## _ ## n ## x ## n ## _avx2(int8_t bitdepth, int16_t *dst, const int16_t *src)\
{\
int32_t shift_1st = 7; \
int32_t shift_2nd = 12 - (bitdepth - 8); \
int16_t tmp[n * n];\
\
mul_clip_matrix_ ## n ## x ## n ## _avx2((int16_t*)g_ ## type ## _ ## n ## _t, src, tmp, shift_1st);\
mul_clip_matrix_ ## n ## x ## n ## _avx2(tmp, (int16_t*)g_ ## type ## _ ## n, dst, shift_2nd);\
}\
static void matrix_idst_2d_4x4_avx2(const int16_t *src, int16_t *dst, const int16_t *transform, const int16_t shift0, const int16_t shift1)
{
int16_t tmp[4 * 4];
TRANSFORM(dst, 4);
TRANSFORM(dct, 4);
TRANSFORM(dct, 8);
TRANSFORM(dct, 16);
TRANSFORM(dct, 32);
mul_clip_matrix_4x4_avx2((int16_t*)g_dst_t, src, tmp, shift0);
mul_clip_matrix_4x4_avx2(tmp, (int16_t*)g_dst, dst, shift1);
}
ITRANSFORM(dst, 4);
ITRANSFORM(dct, 4);
ITRANSFORM(dct, 8);
ITRANSFORM(dct, 16);
ITRANSFORM(dct, 32);
static void matrix_transform_2d_4x4_avx2(const int16_t *src, int16_t *dst, const int16_t *transform, const int16_t shift0, const int16_t shift1)
{
int16_t tmp[4 * 4];
mul_clip_matrix_4x4_avx2(src, (int16_t*)g_t4_t, tmp, shift0);
mul_clip_matrix_4x4_avx2((int16_t*)g_t4, tmp, dst, shift1);
}
static void matrix_itransform_2d_4x4_avx2(const int16_t *src, int16_t *dst, const int16_t *transform, const int16_t shift0, const int16_t shift1)
{
int16_t tmp[4*4];
mul_clip_matrix_4x4_avx2((int16_t*)g_t4_t, src, tmp, shift0);
mul_clip_matrix_4x4_avx2(tmp, (int16_t*)g_t4, dst, shift1);
}
static void matrix_transform_2d_8x8_avx2(const int16_t *src, int16_t *dst, const int16_t *transform, const int16_t shift0, const int16_t shift1)
{
int16_t tmp[8 * 8];
mul_clip_matrix_8x8_avx2(src, (int16_t*)g_t8_t, tmp, shift0);
mul_clip_matrix_8x8_avx2((int16_t*)g_t8, tmp, dst, shift1);
}
static void matrix_itransform_2d_8x8_avx2(const int16_t *src, int16_t *dst, const int16_t *transform, const int16_t shift0, const int16_t shift1)
{
int16_t tmp[8 * 8];
mul_clip_matrix_8x8_avx2((int16_t*)g_t8_t, src, tmp, shift0);
mul_clip_matrix_8x8_avx2(tmp, (int16_t*)g_t8, dst, shift1);
}
static void matrix_transform_2d_16x16_avx2(const int16_t *src, int16_t *dst, const int16_t *transform, const int16_t shift0, const int16_t shift1)
{
int16_t tmp[16 * 16];
mul_clip_matrix_16x16_avx2(src, (int16_t*)g_t16_t, tmp, shift0);
mul_clip_matrix_16x16_avx2((int16_t*)g_t16, tmp, dst, shift1);
}
static void matrix_itransform_2d_16x16_avx2(const int16_t *src, int16_t *dst, const int16_t *transform, const int16_t shift0, const int16_t shift1)
{
int16_t tmp[16 * 16];
mul_clip_matrix_16x16_avx2((int16_t*)g_t16_t, src, tmp, shift0);
mul_clip_matrix_16x16_avx2(tmp, (int16_t*)g_t16, dst, shift1);
}
static void matrix_transform_2d_32x32_avx2(const int16_t *src, int16_t *dst, const int16_t *transform, const int16_t shift0, const int16_t shift1)
{
int16_t tmp[32 * 32];
mul_clip_matrix_32x32_avx2(src, (int16_t*)g_t32_t, tmp, shift0);
mul_clip_matrix_32x32_avx2((int16_t*)g_t32, tmp, dst, shift1);
}
static void matrix_itransform_2d_32x32_avx2(const int16_t *src, int16_t *dst, const int16_t *transform, const int16_t shift0, const int16_t shift1)
{
int16_t tmp[32 * 32];
mul_clip_matrix_32x32_avx2((int16_t*) g_t32_t, src, tmp, shift0);
mul_clip_matrix_32x32_avx2(tmp, (int16_t*)g_t32, dst, shift1);
}
static void matrix_dst_4x4_avx2(int8_t bitdepth, int16_t *src, int16_t *dst)
{
int32_t shift_1st = g_convert_to_bit[4] + 1 + (bitdepth - 8);
int32_t shift_2nd = g_convert_to_bit[4] + 8;
matrix_dst_2d_4x4_avx2(src, dst, (const int16_t*)g_dst, shift_1st, shift_2nd);
}
static void matrix_idst_4x4_avx2(int8_t bitdepth, int16_t *dst, int16_t *src)
{
int32_t shift_1st = 7;
int32_t shift_2nd = 12 - (bitdepth - 8);
matrix_idst_2d_4x4_avx2(src, dst, (const int16_t*)g_dst, shift_1st, shift_2nd);
}
static void matrix_dct_4x4_avx2(int8_t bitdepth, int16_t *src, int16_t *dst)
{
int32_t shift_1st = g_convert_to_bit[4] + 1 + (bitdepth - 8);
int32_t shift_2nd = g_convert_to_bit[4] + 8;
matrix_transform_2d_4x4_avx2(src, dst, (const int16_t*)g_t4, shift_1st, shift_2nd);
}
static void matrix_idct_4x4_avx2(int8_t bitdepth, int16_t *dst, int16_t *src)
{
int32_t shift_1st = 7;
int32_t shift_2nd = 12 - (bitdepth - 8);
matrix_itransform_2d_4x4_avx2(src, dst, (const int16_t*)g_t4, shift_1st, shift_2nd);
}
static void matrix_dct_8x8_avx2(int8_t bitdepth, int16_t *src, int16_t *dst)
{
int32_t shift_1st = g_convert_to_bit[8] + 1 + (bitdepth - 8);
int32_t shift_2nd = g_convert_to_bit[8] + 8;
matrix_transform_2d_8x8_avx2(src, dst, (const int16_t*)g_t8, shift_1st, shift_2nd);
}
static void matrix_idct_8x8_avx2(int8_t bitdepth, int16_t *dst, int16_t *src)
{
int32_t shift_1st = 7;
int32_t shift_2nd = 12 - (bitdepth - 8);
matrix_itransform_2d_8x8_avx2(src, dst, (const int16_t*)g_t8_t, shift_1st, shift_2nd);
}
static void matrix_dct_16x16_avx2(int8_t bitdepth, int16_t *src, int16_t *dst)
{
int32_t shift_1st = g_convert_to_bit[16] + 1 + (bitdepth - 8);
int32_t shift_2nd = g_convert_to_bit[16] + 8;
matrix_transform_2d_16x16_avx2(src, dst, (const int16_t*)g_t16, shift_1st, shift_2nd);
}
static void matrix_idct_16x16_avx2(int8_t bitdepth, int16_t *dst, int16_t *src)
{
int32_t shift_1st = 7;
int32_t shift_2nd = 12 - (bitdepth - 8);
matrix_itransform_2d_16x16_avx2(src, dst, (const int16_t*)g_t16, shift_1st, shift_2nd);
}
static void matrix_dct_32x32_avx2(int8_t bitdepth, int16_t *src, int16_t *dst)
{
int32_t shift_1st = g_convert_to_bit[32] + 1 + (bitdepth - 8);
int32_t shift_2nd = g_convert_to_bit[32] + 8;
matrix_transform_2d_32x32_avx2(src, dst, (const int16_t*)g_t32, shift_1st, shift_2nd);
}
static void matrix_idct_32x32_avx2(int8_t bitdepth, int16_t *dst, int16_t *src)
{
int32_t shift_1st = 7;
int32_t shift_2nd = 12 - (bitdepth - 8);
matrix_itransform_2d_32x32_avx2(src, dst, (const int16_t*)g_t32, shift_1st, shift_2nd);
}
#endif //COMPILE_INTEL_AVX2
int strategy_register_dct_avx2(void* opaque)
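
For reference, here is a sketch of what the preprocessor generates for TRANSFORM(dct, 8); the other instantiations differ only in the pasted names and sizes. The expansion has exactly the shape of the hand-written wrappers the macros replace:

static void matrix_dct_8x8_avx2(int8_t bitdepth, const int16_t *src, int16_t *dst)
{
    /* Same body as the deleted matrix_dct_8x8_avx2 / matrix_transform_2d_8x8_avx2 pair,
     * with the shift computation folded in. */
    int32_t shift_1st = g_convert_to_bit[8] + 1 + (bitdepth - 8);
    int32_t shift_2nd = g_convert_to_bit[8] + 8;
    int16_t tmp[8 * 8];

    mul_clip_matrix_8x8_avx2(src, (int16_t*)g_dct_8_t, tmp, shift_1st);
    mul_clip_matrix_8x8_avx2((int16_t*)g_dct_8, tmp, dst, shift_2nd);
}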


@@ -26,7 +26,7 @@
#include "strategyselector.h"
#include "encoder.h"
const int16_t g_dst[4][4] =
const int16_t g_dst_4[4][4] =
{
{ 29, 55, 74, 84 },
{ 74, 74, 0, -74 },
@@ -34,7 +34,7 @@ const int16_t g_dst[4][4] =
{ 55, -84, 74, -29 }
};
const int16_t g_t4[4][4] =
const int16_t g_dct_4[4][4] =
{
{ 64, 64, 64, 64 },
{ 83, 36, -36, -83 },
@@ -42,7 +42,7 @@ const int16_t g_t4[4][4] =
{ 36, -83, 83, -36 }
};
const int16_t g_t8[8][8] =
const int16_t g_dct_8[8][8] =
{
{ 64, 64, 64, 64, 64, 64, 64, 64 },
{ 89, 75, 50, 18, -18, -50, -75, -89 },
@@ -54,7 +54,7 @@ const int16_t g_t8[8][8] =
{ 18, -50, 75, -89, 89, -75, 50, -18 }
};
const int16_t g_t16[16][16] =
const int16_t g_dct_16[16][16] =
{
{ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
{ 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90 },
@@ -74,7 +74,7 @@ const int16_t g_t16[16][16] =
{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9 }
};
const int16_t g_t32[32][32] =
const int16_t g_dct_32[32][32] =
{
{ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
{ 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
@@ -110,7 +110,7 @@ const int16_t g_t32[32][32] =
{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 }
};
const int16_t g_dst_t[4][4] =
const int16_t g_dst_4_t[4][4] =
{
{ 29, 74, 84, 55 },
{ 55, 74, -29, -84 },
@@ -118,7 +118,7 @@ const int16_t g_dst_t[4][4] =
{ 84, -74, 55, -29 }
};
const int16_t g_t4_t[4][4] =
const int16_t g_dct_4_t[4][4] =
{
{ 64, 83, 64, 36, },
{ 64, 36, -64, -83, },
@@ -126,7 +126,7 @@ const int16_t g_t4_t[4][4] =
{ 64, -83, 64, -36 }
};
const int16_t g_t8_t[8][8] =
const int16_t g_dct_8_t[8][8] =
{
{ 64, 89, 83, 75, 64, 50, 36, 18, },
{ 64, 75, 36, -18, -64, -89, -83, -50, },
@@ -138,7 +138,7 @@ const int16_t g_t8_t[8][8] =
{ 64, -89, 83, -75, 64, -50, 36, -18 }
};
const int16_t g_t16_t[16][16] =
const int16_t g_dct_16_t[16][16] =
{
{ 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9, },
{ 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25, },
@@ -158,7 +158,7 @@ const int16_t g_t16_t[16][16] =
{ 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9 }
};
const int16_t g_t32_t[32][32] =
const int16_t g_dct_32_t[32][32] =
{
{ 64, 90, 90, 90, 89, 88, 87, 85, 83, 82, 80, 78, 75, 73, 70, 67, 64, 61, 57, 54, 50, 46, 43, 38, 36, 31, 25, 22, 18, 13, 9, 4, },
{ 64, 90, 87, 82, 75, 67, 57, 46, 36, 22, 9, -4, -18, -31, -43, -54, -64, -73, -80, -85, -89, -90, -90, -88, -83, -78, -70, -61, -50, -38, -25, -13, },
@@ -258,10 +258,10 @@ static void partial_butterfly_4_generic(short *src, short *dst,
e[1] = src[1] + src[2];
o[1] = src[1] - src[2];
dst[0] = (short)((g_t4[0][0] * e[0] + g_t4[0][1] * e[1] + add) >> shift);
dst[2 * line] = (short)((g_t4[2][0] * e[0] + g_t4[2][1] * e[1] + add) >> shift);
dst[line] = (short)((g_t4[1][0] * o[0] + g_t4[1][1] * o[1] + add) >> shift);
dst[3 * line] = (short)((g_t4[3][0] * o[0] + g_t4[3][1] * o[1] + add) >> shift);
dst[0] = (short)((g_dct_4[0][0] * e[0] + g_dct_4[0][1] * e[1] + add) >> shift);
dst[2 * line] = (short)((g_dct_4[2][0] * e[0] + g_dct_4[2][1] * e[1] + add) >> shift);
dst[line] = (short)((g_dct_4[1][0] * o[0] + g_dct_4[1][1] * o[1] + add) >> shift);
dst[3 * line] = (short)((g_dct_4[3][0] * o[0] + g_dct_4[3][1] * o[1] + add) >> shift);
src += 4;
dst++;
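
The folding above halves the multiply count because rows 0 and 2 of g_dct_4 are symmetric and rows 1 and 3 are antisymmetric. A standalone sketch of that equivalence (row 2 of g_dct_4 is not visible in the diff context; the value used below is the standard HEVC one, so treat it as an assumption):

#include <assert.h>
#include <stdint.h>

static const int16_t dct_4[4][4] = {
    { 64,  64,  64,  64 },
    { 83,  36, -36, -83 },
    { 64, -64, -64,  64 },  /* assumed, not visible in the diff */
    { 36, -83,  83, -36 }
};

int main(void)
{
    const short src[4] = { 10, -3, 7, 22 };
    const int shift = 1, add = 1 << (shift - 1);
    short direct[4], folded[4];
    int e[2], o[2];
    int k, j, sum;

    /* Direct evaluation: dst[k] = (sum_j dct_4[k][j] * src[j] + add) >> shift */
    for (k = 0; k < 4; k++) {
        sum = add;
        for (j = 0; j < 4; j++) sum += dct_4[k][j] * src[j];
        direct[k] = (short)(sum >> shift);
    }

    /* Even/odd folding, as in partial_butterfly_4_generic */
    e[0] = src[0] + src[3];  o[0] = src[0] - src[3];
    e[1] = src[1] + src[2];  o[1] = src[1] - src[2];
    folded[0] = (short)((dct_4[0][0] * e[0] + dct_4[0][1] * e[1] + add) >> shift);
    folded[2] = (short)((dct_4[2][0] * e[0] + dct_4[2][1] * e[1] + add) >> shift);
    folded[1] = (short)((dct_4[1][0] * o[0] + dct_4[1][1] * o[1] + add) >> shift);
    folded[3] = (short)((dct_4[3][0] * o[0] + dct_4[3][1] * o[1] + add) >> shift);

    for (k = 0; k < 4; k++) assert(direct[k] == folded[k]);
    return 0;
}
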
@@ -279,10 +279,10 @@
for (j = 0; j < line; j++) {
// Utilizing symmetry properties to the maximum to minimize the number of multiplications
o[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
o[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
e[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
e[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];
o[0] = g_dct_4[1][0] * src[line] + g_dct_4[3][0] * src[3 * line];
o[1] = g_dct_4[1][1] * src[line] + g_dct_4[3][1] * src[3 * line];
e[0] = g_dct_4[0][0] * src[0] + g_dct_4[2][0] * src[2 * line];
e[1] = g_dct_4[0][1] * src[0] + g_dct_4[2][1] * src[2 * line];
// Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector
dst[0] = (short)CLIP(-32768, 32767, (e[0] + o[0] + add) >> shift);
@@ -317,15 +317,15 @@ static void partial_butterfly_8_generic(short *src, short *dst,
ee[1] = e[1] + e[2];
eo[1] = e[1] - e[2];
dst[0] = (short)((g_t8[0][0] * ee[0] + g_t8[0][1] * ee[1] + add) >> shift);
dst[4 * line] = (short)((g_t8[4][0] * ee[0] + g_t8[4][1] * ee[1] + add) >> shift);
dst[2 * line] = (short)((g_t8[2][0] * eo[0] + g_t8[2][1] * eo[1] + add) >> shift);
dst[6 * line] = (short)((g_t8[6][0] * eo[0] + g_t8[6][1] * eo[1] + add) >> shift);
dst[0] = (short)((g_dct_8[0][0] * ee[0] + g_dct_8[0][1] * ee[1] + add) >> shift);
dst[4 * line] = (short)((g_dct_8[4][0] * ee[0] + g_dct_8[4][1] * ee[1] + add) >> shift);
dst[2 * line] = (short)((g_dct_8[2][0] * eo[0] + g_dct_8[2][1] * eo[1] + add) >> shift);
dst[6 * line] = (short)((g_dct_8[6][0] * eo[0] + g_dct_8[6][1] * eo[1] + add) >> shift);
dst[line] = (short)((g_t8[1][0] * o[0] + g_t8[1][1] * o[1] + g_t8[1][2] * o[2] + g_t8[1][3] * o[3] + add) >> shift);
dst[3 * line] = (short)((g_t8[3][0] * o[0] + g_t8[3][1] * o[1] + g_t8[3][2] * o[2] + g_t8[3][3] * o[3] + add) >> shift);
dst[5 * line] = (short)((g_t8[5][0] * o[0] + g_t8[5][1] * o[1] + g_t8[5][2] * o[2] + g_t8[5][3] * o[3] + add) >> shift);
dst[7 * line] = (short)((g_t8[7][0] * o[0] + g_t8[7][1] * o[1] + g_t8[7][2] * o[2] + g_t8[7][3] * o[3] + add) >> shift);
dst[line] = (short)((g_dct_8[1][0] * o[0] + g_dct_8[1][1] * o[1] + g_dct_8[1][2] * o[2] + g_dct_8[1][3] * o[3] + add) >> shift);
dst[3 * line] = (short)((g_dct_8[3][0] * o[0] + g_dct_8[3][1] * o[1] + g_dct_8[3][2] * o[2] + g_dct_8[3][3] * o[3] + add) >> shift);
dst[5 * line] = (short)((g_dct_8[5][0] * o[0] + g_dct_8[5][1] * o[1] + g_dct_8[5][2] * o[2] + g_dct_8[5][3] * o[3] + add) >> shift);
dst[7 * line] = (short)((g_dct_8[7][0] * o[0] + g_dct_8[7][1] * o[1] + g_dct_8[7][2] * o[2] + g_dct_8[7][3] * o[3] + add) >> shift);
src += 8;
dst++;
@@ -345,13 +345,13 @@ static void partial_butterfly_inverse_8_generic(int16_t *src, int16_t *dst,
for (j = 0; j < line; j++) {
// Utilizing symmetry properties to the maximum to minimize the number of multiplications
for (k = 0; k < 4; k++) {
o[k] = g_t8[1][k] * src[line] + g_t8[3][k] * src[3 * line] + g_t8[5][k] * src[5 * line] + g_t8[7][k] * src[7 * line];
o[k] = g_dct_8[1][k] * src[line] + g_dct_8[3][k] * src[3 * line] + g_dct_8[5][k] * src[5 * line] + g_dct_8[7][k] * src[7 * line];
}
eo[0] = g_t8[2][0] * src[2 * line] + g_t8[6][0] * src[6 * line];
eo[1] = g_t8[2][1] * src[2 * line] + g_t8[6][1] * src[6 * line];
ee[0] = g_t8[0][0] * src[0] + g_t8[4][0] * src[4 * line];
ee[1] = g_t8[0][1] * src[0] + g_t8[4][1] * src[4 * line];
eo[0] = g_dct_8[2][0] * src[2 * line] + g_dct_8[6][0] * src[6 * line];
eo[1] = g_dct_8[2][1] * src[2 * line] + g_dct_8[6][1] * src[6 * line];
ee[0] = g_dct_8[0][0] * src[0] + g_dct_8[4][0] * src[4 * line];
ee[1] = g_dct_8[0][1] * src[0] + g_dct_8[4][1] * src[4 * line];
// Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector
e[0] = ee[0] + eo[0];
@@ -395,18 +395,18 @@ static void partial_butterfly_16_generic(short *src, short *dst,
eee[1] = ee[1] + ee[2];
eeo[1] = ee[1] - ee[2];
dst[0] = (short)((g_t16[0][0] * eee[0] + g_t16[0][1] * eee[1] + add) >> shift);
dst[8 * line] = (short)((g_t16[8][0] * eee[0] + g_t16[8][1] * eee[1] + add) >> shift);
dst[4 * line] = (short)((g_t16[4][0] * eeo[0] + g_t16[4][1] * eeo[1] + add) >> shift);
dst[12 * line] = (short)((g_t16[12][0] * eeo[0] + g_t16[12][1] * eeo[1] + add) >> shift);
dst[0] = (short)((g_dct_16[0][0] * eee[0] + g_dct_16[0][1] * eee[1] + add) >> shift);
dst[8 * line] = (short)((g_dct_16[8][0] * eee[0] + g_dct_16[8][1] * eee[1] + add) >> shift);
dst[4 * line] = (short)((g_dct_16[4][0] * eeo[0] + g_dct_16[4][1] * eeo[1] + add) >> shift);
dst[12 * line] = (short)((g_dct_16[12][0] * eeo[0] + g_dct_16[12][1] * eeo[1] + add) >> shift);
for (k = 2; k < 16; k += 4) {
dst[k*line] = (short)((g_t16[k][0] * eo[0] + g_t16[k][1] * eo[1] + g_t16[k][2] * eo[2] + g_t16[k][3] * eo[3] + add) >> shift);
dst[k*line] = (short)((g_dct_16[k][0] * eo[0] + g_dct_16[k][1] * eo[1] + g_dct_16[k][2] * eo[2] + g_dct_16[k][3] * eo[3] + add) >> shift);
}
for (k = 1; k < 16; k += 2) {
dst[k*line] = (short)((g_t16[k][0] * o[0] + g_t16[k][1] * o[1] + g_t16[k][2] * o[2] + g_t16[k][3] * o[3] +
g_t16[k][4] * o[4] + g_t16[k][5] * o[5] + g_t16[k][6] * o[6] + g_t16[k][7] * o[7] + add) >> shift);
dst[k*line] = (short)((g_dct_16[k][0] * o[0] + g_dct_16[k][1] * o[1] + g_dct_16[k][2] * o[2] + g_dct_16[k][3] * o[3] +
g_dct_16[k][4] * o[4] + g_dct_16[k][5] * o[5] + g_dct_16[k][6] * o[6] + g_dct_16[k][7] * o[7] + add) >> shift);
}
src += 16;
@@ -428,16 +428,16 @@ static void partial_butterfly_inverse_16_generic(int16_t *src, int16_t *dst,
for (j = 0; j < line; j++) {
// Utilizing symmetry properties to the maximum to minimize the number of multiplications
for (k = 0; k < 8; k++) {
o[k] = g_t16[1][k] * src[line] + g_t16[3][k] * src[3 * line] + g_t16[5][k] * src[5 * line] + g_t16[7][k] * src[7 * line] +
g_t16[9][k] * src[9 * line] + g_t16[11][k] * src[11 * line] + g_t16[13][k] * src[13 * line] + g_t16[15][k] * src[15 * line];
o[k] = g_dct_16[1][k] * src[line] + g_dct_16[3][k] * src[3 * line] + g_dct_16[5][k] * src[5 * line] + g_dct_16[7][k] * src[7 * line] +
g_dct_16[9][k] * src[9 * line] + g_dct_16[11][k] * src[11 * line] + g_dct_16[13][k] * src[13 * line] + g_dct_16[15][k] * src[15 * line];
}
for (k = 0; k < 4; k++) {
eo[k] = g_t16[2][k] * src[2 * line] + g_t16[6][k] * src[6 * line] + g_t16[10][k] * src[10 * line] + g_t16[14][k] * src[14 * line];
eo[k] = g_dct_16[2][k] * src[2 * line] + g_dct_16[6][k] * src[6 * line] + g_dct_16[10][k] * src[10 * line] + g_dct_16[14][k] * src[14 * line];
}
eeo[0] = g_t16[4][0] * src[4 * line] + g_t16[12][0] * src[12 * line];
eee[0] = g_t16[0][0] * src[0] + g_t16[8][0] * src[8 * line];
eeo[1] = g_t16[4][1] * src[4 * line] + g_t16[12][1] * src[12 * line];
eee[1] = g_t16[0][1] * src[0] + g_t16[8][1] * src[8 * line];
eeo[0] = g_dct_16[4][0] * src[4 * line] + g_dct_16[12][0] * src[12 * line];
eee[0] = g_dct_16[0][0] * src[0] + g_dct_16[8][0] * src[8 * line];
eeo[1] = g_dct_16[4][1] * src[4 * line] + g_dct_16[12][1] * src[12 * line];
eee[1] = g_dct_16[0][1] * src[0] + g_dct_16[8][1] * src[8 * line];
// Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector
for (k = 0; k < 2; k++) {
@@ -491,22 +491,22 @@ static void partial_butterfly_32_generic(short *src, short *dst,
eeee[1] = eee[1] + eee[2];
eeeo[1] = eee[1] - eee[2];
dst[0] = (short)((g_t32[0][0] * eeee[0] + g_t32[0][1] * eeee[1] + add) >> shift);
dst[16 * line] = (short)((g_t32[16][0] * eeee[0] + g_t32[16][1] * eeee[1] + add) >> shift);
dst[8 * line] = (short)((g_t32[8][0] * eeeo[0] + g_t32[8][1] * eeeo[1] + add) >> shift);
dst[24 * line] = (short)((g_t32[24][0] * eeeo[0] + g_t32[24][1] * eeeo[1] + add) >> shift);
dst[0] = (short)((g_dct_32[0][0] * eeee[0] + g_dct_32[0][1] * eeee[1] + add) >> shift);
dst[16 * line] = (short)((g_dct_32[16][0] * eeee[0] + g_dct_32[16][1] * eeee[1] + add) >> shift);
dst[8 * line] = (short)((g_dct_32[8][0] * eeeo[0] + g_dct_32[8][1] * eeeo[1] + add) >> shift);
dst[24 * line] = (short)((g_dct_32[24][0] * eeeo[0] + g_dct_32[24][1] * eeeo[1] + add) >> shift);
for (k = 4; k < 32; k += 8) {
dst[k*line] = (short)((g_t32[k][0] * eeo[0] + g_t32[k][1] * eeo[1] + g_t32[k][2] * eeo[2] + g_t32[k][3] * eeo[3] + add) >> shift);
dst[k*line] = (short)((g_dct_32[k][0] * eeo[0] + g_dct_32[k][1] * eeo[1] + g_dct_32[k][2] * eeo[2] + g_dct_32[k][3] * eeo[3] + add) >> shift);
}
for (k = 2; k < 32; k += 4) {
dst[k*line] = (short)((g_t32[k][0] * eo[0] + g_t32[k][1] * eo[1] + g_t32[k][2] * eo[2] + g_t32[k][3] * eo[3] +
g_t32[k][4] * eo[4] + g_t32[k][5] * eo[5] + g_t32[k][6] * eo[6] + g_t32[k][7] * eo[7] + add) >> shift);
dst[k*line] = (short)((g_dct_32[k][0] * eo[0] + g_dct_32[k][1] * eo[1] + g_dct_32[k][2] * eo[2] + g_dct_32[k][3] * eo[3] +
g_dct_32[k][4] * eo[4] + g_dct_32[k][5] * eo[5] + g_dct_32[k][6] * eo[6] + g_dct_32[k][7] * eo[7] + add) >> shift);
}
for (k = 1; k < 32; k += 2) {
dst[k*line] = (short)((g_t32[k][0] * o[0] + g_t32[k][1] * o[1] + g_t32[k][2] * o[2] + g_t32[k][3] * o[3] +
g_t32[k][4] * o[4] + g_t32[k][5] * o[5] + g_t32[k][6] * o[6] + g_t32[k][7] * o[7] +
g_t32[k][8] * o[8] + g_t32[k][9] * o[9] + g_t32[k][10] * o[10] + g_t32[k][11] * o[11] +
g_t32[k][12] * o[12] + g_t32[k][13] * o[13] + g_t32[k][14] * o[14] + g_t32[k][15] * o[15] + add) >> shift);
dst[k*line] = (short)((g_dct_32[k][0] * o[0] + g_dct_32[k][1] * o[1] + g_dct_32[k][2] * o[2] + g_dct_32[k][3] * o[3] +
g_dct_32[k][4] * o[4] + g_dct_32[k][5] * o[5] + g_dct_32[k][6] * o[6] + g_dct_32[k][7] * o[7] +
g_dct_32[k][8] * o[8] + g_dct_32[k][9] * o[9] + g_dct_32[k][10] * o[10] + g_dct_32[k][11] * o[11] +
g_dct_32[k][12] * o[12] + g_dct_32[k][13] * o[13] + g_dct_32[k][14] * o[14] + g_dct_32[k][15] * o[15] + add) >> shift);
}
src += 32;
dst++;
@@ -528,22 +528,22 @@ static void partial_butterfly_inverse_32_generic(int16_t *src, int16_t *dst,
for (j = 0; j<line; j++) {
// Utilizing symmetry properties to the maximum to minimize the number of multiplications
for (k = 0; k < 16; k++) {
o[k] = g_t32[1][k] * src[line] + g_t32[3][k] * src[3 * line] + g_t32[5][k] * src[5 * line] + g_t32[7][k] * src[7 * line] +
g_t32[9][k] * src[9 * line] + g_t32[11][k] * src[11 * line] + g_t32[13][k] * src[13 * line] + g_t32[15][k] * src[15 * line] +
g_t32[17][k] * src[17 * line] + g_t32[19][k] * src[19 * line] + g_t32[21][k] * src[21 * line] + g_t32[23][k] * src[23 * line] +
g_t32[25][k] * src[25 * line] + g_t32[27][k] * src[27 * line] + g_t32[29][k] * src[29 * line] + g_t32[31][k] * src[31 * line];
o[k] = g_dct_32[1][k] * src[line] + g_dct_32[3][k] * src[3 * line] + g_dct_32[5][k] * src[5 * line] + g_dct_32[7][k] * src[7 * line] +
g_dct_32[9][k] * src[9 * line] + g_dct_32[11][k] * src[11 * line] + g_dct_32[13][k] * src[13 * line] + g_dct_32[15][k] * src[15 * line] +
g_dct_32[17][k] * src[17 * line] + g_dct_32[19][k] * src[19 * line] + g_dct_32[21][k] * src[21 * line] + g_dct_32[23][k] * src[23 * line] +
g_dct_32[25][k] * src[25 * line] + g_dct_32[27][k] * src[27 * line] + g_dct_32[29][k] * src[29 * line] + g_dct_32[31][k] * src[31 * line];
}
for (k = 0; k < 8; k++) {
eo[k] = g_t32[2][k] * src[2 * line] + g_t32[6][k] * src[6 * line] + g_t32[10][k] * src[10 * line] + g_t32[14][k] * src[14 * line] +
g_t32[18][k] * src[18 * line] + g_t32[22][k] * src[22 * line] + g_t32[26][k] * src[26 * line] + g_t32[30][k] * src[30 * line];
eo[k] = g_dct_32[2][k] * src[2 * line] + g_dct_32[6][k] * src[6 * line] + g_dct_32[10][k] * src[10 * line] + g_dct_32[14][k] * src[14 * line] +
g_dct_32[18][k] * src[18 * line] + g_dct_32[22][k] * src[22 * line] + g_dct_32[26][k] * src[26 * line] + g_dct_32[30][k] * src[30 * line];
}
for (k = 0; k < 4; k++) {
eeo[k] = g_t32[4][k] * src[4 * line] + g_t32[12][k] * src[12 * line] + g_t32[20][k] * src[20 * line] + g_t32[28][k] * src[28 * line];
eeo[k] = g_dct_32[4][k] * src[4 * line] + g_dct_32[12][k] * src[12 * line] + g_dct_32[20][k] * src[20 * line] + g_dct_32[28][k] * src[28 * line];
}
eeeo[0] = g_t32[8][0] * src[8 * line] + g_t32[24][0] * src[24 * line];
eeeo[1] = g_t32[8][1] * src[8 * line] + g_t32[24][1] * src[24 * line];
eeee[0] = g_t32[0][0] * src[0] + g_t32[16][0] * src[16 * line];
eeee[1] = g_t32[0][1] * src[0] + g_t32[16][1] * src[16 * line];
eeeo[0] = g_dct_32[8][0] * src[8 * line] + g_dct_32[24][0] * src[24 * line];
eeeo[1] = g_dct_32[8][1] * src[8 * line] + g_dct_32[24][1] * src[24 * line];
eeee[0] = g_dct_32[0][0] * src[0] + g_dct_32[16][0] * src[16 * line];
eeee[1] = g_dct_32[0][1] * src[0] + g_dct_32[16][1] * src[16 * line];
// Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector
eee[0] = eeee[0] + eeeo[0];
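
One invariant worth noting: the generated functions multiply by g_<type>_<n>_t in one pass and g_<type>_<n> in the other, so each renamed *_t table must remain the exact transpose of its counterpart. A standalone sketch checking this for the 4x4 DST tables (the rows hidden by the diff context are filled in from the transpose relation, so treat those two rows as assumptions):

#include <assert.h>
#include <stdint.h>

static const int16_t dst_4[4][4] = {
    { 29,  55,  74,  84 },
    { 74,  74,   0, -74 },
    { 84, -29, -74,  55 },  /* assumed, not visible in the diff */
    { 55, -84,  74, -29 }
};

static const int16_t dst_4_t[4][4] = {
    { 29,  74,  84,  55 },
    { 55,  74, -29, -84 },
    { 74,   0, -74,  74 },  /* assumed, not visible in the diff */
    { 84, -74,  55, -29 }
};

int main(void)
{
    int i, j;
    for (i = 0; i < 4; i++) {
        for (j = 0; j < 4; j++) {
            assert(dst_4_t[i][j] == dst_4[j][i]);
        }
    }
    return 0;
}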


@@ -19,17 +19,17 @@
* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
****************************************************************************/
extern const int16_t g_dst[4][4];
extern const int16_t g_t4[4][4];
extern const int16_t g_t8[8][8];
extern const int16_t g_t16[16][16];
extern const int16_t g_t32[32][32];
extern const int16_t g_dst_4[4][4];
extern const int16_t g_dct_4[4][4];
extern const int16_t g_dct_8[8][8];
extern const int16_t g_dct_16[16][16];
extern const int16_t g_dct_32[32][32];
extern const int16_t g_dst_t[4][4];
extern const int16_t g_t4_t[4][4];
extern const int16_t g_t8_t[8][8];
extern const int16_t g_t16_t[16][16];
extern const int16_t g_t32_t[32][32];
extern const int16_t g_dst_4_t[4][4];
extern const int16_t g_dct_4_t[4][4];
extern const int16_t g_dct_8_t[8][8];
extern const int16_t g_dct_16_t[16][16];
extern const int16_t g_dct_32_t[32][32];
int strategy_register_dct_generic(void* opaque);