Modify AVX2 SAD to mask data by byte granularity in AVX registers

Avoids using any SAD calculations narrower than 256 bits and
simplifies the code. Also improves execution speed.
Pauli Oikkonen 2019-01-07 16:48:32 +02:00
parent 7585f79a71
commit 887d7700a8


@@ -53,18 +53,18 @@ uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const
const int width, const int height, const unsigned stride1, const unsigned stride2)
{
int32_t y, x;
uint32_t sad = 0;
__m256i avx_inc = _mm256_setzero_si256();
// 256-bit blocks, bytes after them, 32-bit blocks after the large blocks
const int largeblock_bytes = width & ~31;
const int any_residuals = width & 31;
const int residual_128bs = any_residuals >> 4;
const int residual_dwords = any_residuals >> 2;
// Bytes in block in 256-bit blocks per each scanline, and remainder
const int largeblock_bytes = width & ~31;
const int residual_bytes = width & 31;
const __m256i ns = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
const __m256i rds = _mm256_set1_epi32 (residual_dwords);
const __m256i rdmask = _mm256_cmpgt_epi32(rds, ns);
const __m256i rds = _mm256_set1_epi8(residual_bytes);
const __m256i ns = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31);
const __m256i rdmask = _mm256_cmpgt_epi8(rds, ns);
__m256i avx_inc = _mm256_setzero_si256();
for (y = 0; y < height; ++y) {
@@ -74,22 +74,13 @@ uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const
__m256i curr_sads = _mm256_sad_epu8(a, b);
avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
}
/*
* If there are no residual values, it does not matter what bogus values
* we use here since it will be masked away anyway
*/
if (any_residuals) {
if (residual_bytes) {
__m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y * stride1 + x)));
__m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y * stride2 + x)));
__m256i b_masked = _mm256_blendv_epi8(a, b, rdmask);
__m256i curr_sads = _mm256_sad_epu8 (a, b_masked);
avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
x = width & ~(uint32_t)3;
for (; x < width; x++)
sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
}
}
__m256i avx_inc_2 = _mm256_permute4x64_epi64(avx_inc, _MM_SHUFFLE(1, 0, 3, 2));
@@ -100,9 +91,7 @@ uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const
// 32 bits should always be enough for even the largest blocks with a SAD of
// 255 in each pixel, even though the SAD results themselves are 64 bits
__m128i avx_inc_128 = _mm256_castsi256_si128(avx_inc_5);
sad += _mm_cvtsi128_si32(avx_inc_128);
return sad;
return _mm_cvtsi128_si32(avx_inc_128);
}
/**
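
For reference, a self-contained sketch (assumed names, not necessarily
kvazaar's exact intermediate steps) of how the four 64-bit partial sums that
_mm256_sad_epu8 accumulates into avx_inc can be folded into the final 32-bit
SAD, as in the reduction at the end of the hunk above:

#include <immintrin.h>
#include <stdint.h>

static inline uint32_t hsum_sad_epu8_acc(__m256i avx_inc)
{
  // Swap the 128-bit halves and add: the low half now holds q0+q2 and q1+q3
  __m256i avx_inc_2  = _mm256_permute4x64_epi64(avx_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m256i sum_halves = _mm256_add_epi64(avx_inc, avx_inc_2);

  // Fold the remaining two qwords of the low half into one
  __m128i lo128 = _mm256_castsi256_si128(sum_halves);
  __m128i hi64  = _mm_shuffle_epi32(lo128, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i total = _mm_add_epi64(lo128, hi64);

  // 32 bits suffice: even a 64x64 block of all-255 differences sums to ~1M
  return (uint32_t)_mm_cvtsi128_si32(total);
}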