Add intrinsic version of SATD for 8x8 and larger blocks

Ari Lemmetti 2015-09-24 19:10:03 +03:00
parent d68fc4c41e
commit 55da2a9958


@@ -136,6 +136,106 @@ static unsigned sad_8bit_64x64_avx2(const kvz_pixel * buf1, const kvz_pixel * bu
  return m256i_horizontal_sum(sum0);
}

// Horizontal part of the 8x8 Hadamard: three hadd/hsub rounds butterfly all
// eight coefficients of the two difference rows, leaving the results
// interleaved across *row0 and *row1 (the permutation is irrelevant for SATD).
static void hor_add_sub_avx2(__m128i *row0, __m128i *row1){
  __m128i a = _mm_hadd_epi16(*row0, *row1);
  __m128i b = _mm_hsub_epi16(*row0, *row1);
  __m128i c = _mm_hadd_epi16(a, b);
  __m128i d = _mm_hsub_epi16(a, b);
  *row0 = _mm_hadd_epi16(c, d);
  *row1 = _mm_hsub_epi16(c, d);
}

// Vertical part of the 8x8 Hadamard across the eight partially transformed rows.
static INLINE void ver_add_sub_avx2(__m128i temp_hor[8], __m128i temp_ver[8]){
  // First stage: hor_add_sub_avx2 left matching coefficients of each row pair
  // in adjacent lanes, so a horizontal add/sub combines the two rows.
  for (int i = 0; i < 8; i += 2){
    temp_ver[i + 0] = _mm_hadd_epi16(temp_hor[i + 0], temp_hor[i + 1]);
    temp_ver[i + 1] = _mm_hsub_epi16(temp_hor[i + 0], temp_hor[i + 1]);
  }
  // Second stage
  for (int i = 0; i < 8; i += 4){
    temp_hor[i + 0] = _mm_add_epi16(temp_ver[i + 0], temp_ver[i + 2]);
    temp_hor[i + 1] = _mm_add_epi16(temp_ver[i + 1], temp_ver[i + 3]);
    temp_hor[i + 2] = _mm_sub_epi16(temp_ver[i + 0], temp_ver[i + 2]);
    temp_hor[i + 3] = _mm_sub_epi16(temp_ver[i + 1], temp_ver[i + 3]);
  }
  // Third stage
  for (int i = 0; i < 4; ++i){
    temp_ver[i + 0] = _mm_add_epi16(temp_hor[0 + i], temp_hor[4 + i]);
    temp_ver[i + 4] = _mm_sub_epi16(temp_hor[0 + i], temp_hor[4 + i]);
  }
}

static unsigned kvz_satd_8bit_8x8_general_avx2(const kvz_pixel * buf1, unsigned stride1,
                                               const kvz_pixel * buf2, unsigned stride2)
{
  __m128i temp_hor[8];
  __m128i temp_ver[8];

  // Load two 8-pixel rows at a time, widen to 16 bits, take the difference and
  // run the horizontal Hadamard on the row pair.
  for (int row = 0; row < 8; row += 2){
    for (int i = 0; i < 2; ++i){
      __m128i buf1_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(&buf1[(row + i) * stride1])));
      __m128i buf2_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(&buf2[(row + i) * stride2])));
      temp_hor[row + i] = _mm_sub_epi16(buf1_row, buf2_row);
    }
    hor_add_sub_avx2(&temp_hor[row], &temp_hor[row + 1]);
  }

  ver_add_sub_avx2(temp_hor, temp_ver);

  // Sum the absolute values of the transform coefficients. madd against a
  // vector of ones widens the 16-bit absolute values to 32 bits while adding pairs.
  __m128i sad = _mm_setzero_si128();
  for (int row = 0; row < 8; ++row){
    __m128i abs_value = _mm_abs_epi16(temp_ver[row]);
    sad = _mm_add_epi32(sad, _mm_madd_epi16(abs_value, _mm_set1_epi16(1)));
  }
  sad = _mm_hadd_epi32(sad, sad);
  sad = _mm_hadd_epi32(sad, sad);

  // Standard SATD rounding: (sum + 2) >> 2.
  unsigned result = (_mm_cvtsi128_si32(sad) + 2) >> 2;
  return result;
}
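
/*
 * A minimal scalar cross-check (illustrative only, not part of this commit):
 * if the intrinsic path above computes the unnormalized 2-D Hadamard of the
 * 8x8 difference block up to a permutation of coefficients, it should return
 * the same value as this sketch. The names fwht8_scalar and
 * satd_8bit_8x8_scalar_ref are hypothetical.
 */
static void fwht8_scalar(int32_t v[8])
{
  // Unnormalized 8-point Walsh-Hadamard transform, in place.
  for (int len = 4; len >= 1; len >>= 1) {
    for (int i = 0; i < 8; i += 2 * len) {
      for (int j = i; j < i + len; ++j) {
        int32_t a = v[j];
        int32_t b = v[j + len];
        v[j]       = a + b;
        v[j + len] = a - b;
      }
    }
  }
}

static unsigned satd_8bit_8x8_scalar_ref(const kvz_pixel *buf1, unsigned stride1,
                                         const kvz_pixel *buf2, unsigned stride2)
{
  int32_t block[8][8];
  // Difference block.
  for (int y = 0; y < 8; ++y) {
    for (int x = 0; x < 8; ++x) {
      block[y][x] = (int32_t)buf1[y * stride1 + x] - (int32_t)buf2[y * stride2 + x];
    }
  }
  // Horizontal transform on every row.
  for (int y = 0; y < 8; ++y) fwht8_scalar(block[y]);
  // Vertical transform on every column, accumulating absolute coefficients.
  unsigned sum = 0;
  for (int x = 0; x < 8; ++x) {
    int32_t col[8];
    for (int y = 0; y < 8; ++y) col[y] = block[y][x];
    fwht8_scalar(col);
    for (int y = 0; y < 8; ++y) sum += (unsigned)(col[y] < 0 ? -col[y] : col[y]);
  }
  // Same rounding as the intrinsic version.
  return (sum + 2) >> 2;
}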

// Function macro for defining Hadamard-based SATD functions for fixed block
// sizes. Blocks whose width and height are integer multiples of 8 are tiled
// into 8x8 sub-blocks and summed with the 8x8 SATD function above.
#define SATD_NXN_AVX2(n) \
static unsigned satd_8bit_ ## n ## x ## n ## _avx2( \
  const kvz_pixel * const block1, const kvz_pixel * const block2) \
{ \
  unsigned x, y; \
  unsigned sum = 0; \
  for (y = 0; y < (n); y += 8) { \
    unsigned row = y * (n); \
    for (x = 0; x < (n); x += 8) { \
      sum += kvz_satd_8bit_8x8_general_avx2(&block1[row + x], (n), &block2[row + x], (n)); \
    } \
  } \
  return sum >> (KVZ_BIT_DEPTH - 8); \
}

// Hand-expanded equivalent of SATD_NXN_AVX2(8) (see the commented-out
// invocation below).
static unsigned satd_8bit_8x8_avx2(
  const kvz_pixel * const block1, const kvz_pixel * const block2)
{
  unsigned x, y;
  unsigned sum = 0;
  for (y = 0; y < (8); y += 8) {
    unsigned row = y * (8);
    for (x = 0; x < (8); x += 8) {
      sum += kvz_satd_8bit_8x8_general_avx2(&block1[row + x], (8), &block2[row + x], (8));
    }
  }
  return sum >> (KVZ_BIT_DEPTH - 8);
}

//SATD_NXN_AVX2(8)
SATD_NXN_AVX2(16)
SATD_NXN_AVX2(32)
SATD_NXN_AVX2(64)
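
/*
 * For illustration: SATD_NXN_AVX2(16) above effectively expands to the
 * following (shown commented out to avoid redefining the generated function).
 * It tiles the contiguous 16x16 blocks, stored with stride 16, into four 8x8
 * sub-blocks and sums their costs.
 */
/*
static unsigned satd_8bit_16x16_avx2(
  const kvz_pixel * const block1, const kvz_pixel * const block2)
{
  unsigned x, y;
  unsigned sum = 0;
  for (y = 0; y < 16; y += 8) {
    unsigned row = y * 16;
    for (x = 0; x < 16; x += 8) {
      sum += kvz_satd_8bit_8x8_general_avx2(&block1[row + x], 16, &block2[row + x], 16);
    }
  }
  return sum >> (KVZ_BIT_DEPTH - 8);
}
*/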
#endif //COMPILE_INTEL_AVX2
@@ -153,6 +253,11 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
    success &= kvz_strategyselector_register(opaque, "sad_16x16", "avx2", 40, &sad_8bit_16x16_avx2);
    success &= kvz_strategyselector_register(opaque, "sad_32x32", "avx2", 40, &sad_8bit_32x32_avx2);
    success &= kvz_strategyselector_register(opaque, "sad_64x64", "avx2", 40, &sad_8bit_64x64_avx2);
    success &= kvz_strategyselector_register(opaque, "satd_8x8", "avx2", 40, &satd_8bit_8x8_avx2);
    success &= kvz_strategyselector_register(opaque, "satd_16x16", "avx2", 40, &satd_8bit_16x16_avx2);
    success &= kvz_strategyselector_register(opaque, "satd_32x32", "avx2", 40, &satd_8bit_32x32_avx2);
    success &= kvz_strategyselector_register(opaque, "satd_64x64", "avx2", 40, &satd_8bit_64x64_avx2);
  }
#endif
  return success;