mirror of https://github.com/ultravideo/uvg266.git
synced 2024-11-27 19:24:06 +00:00

Add intrinsic version of SATD for 8x8 and larger blocks

This commit is contained in:
parent d68fc4c41e
commit 55da2a9958
@@ -136,6 +136,106 @@ static unsigned sad_8bit_64x64_avx2(const kvz_pixel * buf1, const kvz_pixel * bu
  return m256i_horizontal_sum(sum0);
}
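
// Horizontal part of the 8x8 Hadamard transform: three rounds of hadd/hsub
// butterflies transform two rows at once. The results come out with the
// coefficients of the two rows interleaved pairwise, which the vertical
// pass below takes advantage of.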
static void hor_add_sub_avx2(__m128i *row0, __m128i *row1){

  __m128i a = _mm_hadd_epi16(*row0, *row1);
  __m128i b = _mm_hsub_epi16(*row0, *row1);

  __m128i c = _mm_hadd_epi16(a, b);
  __m128i d = _mm_hsub_epi16(a, b);

  *row0 = _mm_hadd_epi16(c, d);
  *row1 = _mm_hsub_epi16(c, d);
}
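
// Vertical part of the transform. The horizontal pass leaves the
// coefficients of each row pair interleaved inside the vectors, so the
// hadd/hsub pairs of the first stage both de-interleave them and perform
// the first vertical butterfly; the remaining two stages are plain
// element-wise adds and subtracts.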
static INLINE void ver_add_sub_avx2(__m128i temp_hor[8], __m128i temp_ver[8]){

  // First stage
  for (int i = 0; i < 8; i += 2){
    temp_ver[i+0] = _mm_hadd_epi16(temp_hor[i + 0], temp_hor[i + 1]);
    temp_ver[i+1] = _mm_hsub_epi16(temp_hor[i + 0], temp_hor[i + 1]);
  }

  // Second stage
  for (int i = 0; i < 8; i += 4){
    temp_hor[i + 0] = _mm_add_epi16(temp_ver[i + 0], temp_ver[i + 2]);
    temp_hor[i + 1] = _mm_add_epi16(temp_ver[i + 1], temp_ver[i + 3]);
    temp_hor[i + 2] = _mm_sub_epi16(temp_ver[i + 0], temp_ver[i + 2]);
    temp_hor[i + 3] = _mm_sub_epi16(temp_ver[i + 1], temp_ver[i + 3]);
  }

  // Third stage
  for (int i = 0; i < 4; ++i){
    temp_ver[i + 0] = _mm_add_epi16(temp_hor[0 + i], temp_hor[4 + i]);
    temp_ver[i + 4] = _mm_sub_epi16(temp_hor[0 + i], temp_hor[4 + i]);
  }
}
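
// SATD of a single 8x8 block: widen the 8-bit rows to 16 bits, take the
// difference of the two blocks, apply the horizontal and vertical Hadamard
// passes and sum the absolute values of the resulting coefficients.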
static unsigned kvz_satd_8bit_8x8_general_avx2(const kvz_pixel * buf1, unsigned stride1, const kvz_pixel * buf2, unsigned stride2)
{
  __m128i temp_hor[8];
  __m128i temp_ver[8];

  for (int row = 0; row < 8; row += 2){
    for (int i = 0; i < 2; ++i){
      __m128i buf1_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(&buf1[(row + i) * stride1])));
      __m128i buf2_row = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(&buf2[(row + i) * stride2])));
      temp_hor[row + i] = _mm_sub_epi16(buf1_row, buf2_row);
    }
    hor_add_sub_avx2(&temp_hor[row], &temp_hor[row + 1]);
  }

  ver_add_sub_avx2(temp_hor, temp_ver);
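
  // Sum the absolute values of the transform coefficients. _mm_madd_epi16
  // against a vector of ones widens the 16-bit magnitudes to 32-bit lanes
  // before accumulating, so the running sum cannot overflow.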
  __m128i sad = _mm_setzero_si128();
  for (int row = 0; row < 8; ++row){
    __m128i abs_value = _mm_abs_epi16(temp_ver[row]);
    sad = _mm_add_epi32(sad, _mm_madd_epi16(abs_value, _mm_set1_epi16(1)));
  }

  sad = _mm_hadd_epi32(sad, sad);
  sad = _mm_hadd_epi32(sad, sad);

  // Divide by four with rounding, the usual scaling of an 8x8 Hadamard SATD.
  unsigned result = (_mm_cvtsi128_si32(sad) + 2) >> 2;
  return result;
}

// Macro for defining SATD functions for fixed-size blocks. The generated
// functions compute the SATD of a block whose width and height are integer
// multiples of 8 by summing the 8x8 Hadamard function over the block.
#define SATD_NXN_AVX2(n) \
static unsigned satd_8bit_ ## n ## x ## n ## _avx2( \
  const kvz_pixel * const block1, const kvz_pixel * const block2) \
{ \
  unsigned x, y; \
  unsigned sum = 0; \
  for (y = 0; y < (n); y += 8) { \
    unsigned row = y * (n); \
    for (x = 0; x < (n); x += 8) { \
      sum += kvz_satd_8bit_8x8_general_avx2(&block1[row + x], (n), &block2[row + x], (n)); \
    } \
  } \
  return sum >> (KVZ_BIT_DEPTH - 8); \
}
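
// The 8x8 case, written out by hand; it is what SATD_NXN_AVX2(8) below
// would expand to.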
static unsigned satd_8bit_8x8_avx2(
  const kvz_pixel * const block1, const kvz_pixel * const block2)
{
  unsigned x, y;
  unsigned sum = 0;
  for (y = 0; y < (8); y += 8) {
    unsigned row = y * (8);
    for (x = 0; x < (8); x += 8) {
      sum += kvz_satd_8bit_8x8_general_avx2(&block1[row + x], (8), &block2[row + x], (8));
    }
  }
  return sum >> (KVZ_BIT_DEPTH - 8);
}

//SATD_NXN_AVX2(8)
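// Instantiate the macro for the larger block sizes; each line defines a
// satd_8bit_NxN_avx2 function.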
SATD_NXN_AVX2(16)
SATD_NXN_AVX2(32)
SATD_NXN_AVX2(64)

#endif //COMPILE_INTEL_AVX2
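
The second hunk registers the new SATD kernels with the strategy selector, alongside the existing SAD registrations, so the AVX2 versions can be picked up at run time: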

@@ -153,6 +253,11 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
    success &= kvz_strategyselector_register(opaque, "sad_16x16", "avx2", 40, &sad_8bit_16x16_avx2);
    success &= kvz_strategyselector_register(opaque, "sad_32x32", "avx2", 40, &sad_8bit_32x32_avx2);
    success &= kvz_strategyselector_register(opaque, "sad_64x64", "avx2", 40, &sad_8bit_64x64_avx2);

    success &= kvz_strategyselector_register(opaque, "satd_8x8", "avx2", 40, &satd_8bit_8x8_avx2);
    success &= kvz_strategyselector_register(opaque, "satd_16x16", "avx2", 40, &satd_8bit_16x16_avx2);
    success &= kvz_strategyselector_register(opaque, "satd_32x32", "avx2", 40, &satd_8bit_32x32_avx2);
    success &= kvz_strategyselector_register(opaque, "satd_64x64", "avx2", 40, &satd_8bit_64x64_avx2);
  }
#endif
  return success;
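
For reference, the computation that kvz_satd_8bit_8x8_general_avx2 vectorizes can be sketched in plain C roughly as follows. This is an illustration only, not part of this commit: the function name satd_8x8_scalar_sketch is made up, and kvz_pixel is assumed to be the 8-bit pixel type used above.

#include <stdlib.h>  // abs()

// Scalar 8x8 SATD sketch: Hadamard-transform the residual along rows and
// columns with three butterfly stages each, then sum the absolute values
// of the coefficients and apply the same (sum + 2) >> 2 scaling as the
// AVX2 kernel. The coefficient order differs from the vectorized version,
// but the sum of absolute values is identical.
static unsigned satd_8x8_scalar_sketch(const kvz_pixel *buf1, unsigned stride1,
                                       const kvz_pixel *buf2, unsigned stride2)
{
  int d[64];
  for (int y = 0; y < 8; ++y) {
    for (int x = 0; x < 8; ++x) {
      d[y * 8 + x] = (int)buf1[y * stride1 + x] - (int)buf2[y * stride2 + x];
    }
  }

  // dir 0 transforms each row (element step 1, rows 8 apart);
  // dir 1 transforms each column (element step 8, columns 1 apart).
  for (int dir = 0; dir < 2; ++dir) {
    const int step = dir == 0 ? 1 : 8;
    const int line = dir == 0 ? 8 : 1;
    for (int i = 0; i < 8; ++i) {
      int *v = &d[i * line];
      for (int span = 1; span < 8; span <<= 1) {  // three butterfly stages
        for (int j = 0; j < 8; j += 2 * span) {
          for (int k = 0; k < span; ++k) {
            const int a = v[(j + k) * step];
            const int b = v[(j + k + span) * step];
            v[(j + k) * step]        = a + b;
            v[(j + k + span) * step] = a - b;
          }
        }
      }
    }
  }

  unsigned sum = 0;
  for (int i = 0; i < 64; ++i) {
    sum += (unsigned)abs(d[i]);
  }
  return (sum + 2) >> 2;
}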