Use SSE instead of AVX for small widths

Highly dubious if this will help performance at all
Pauli Oikkonen 2019-01-07 20:12:13 +02:00
parent b2176bf72a
commit 9b24d81c6a


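For orientation before the diff (this note and the sketch below are not part of the commit): the patched routine splits each scanline of the block into full 32-byte AVX2 chunks, then at most one 16-byte SSE chunk, and finally a masked tail of fewer than 16 bytes. A minimal C sketch of that width decomposition, using the same bit masks as the patch but with an arbitrary example width:

#include <stdio.h>

int main(void)
{
  const int width = 59;                     /* example only, not from the patch */

  /* Same decomposition as in the diff below. */
  const int largeblock_bytes = width & ~31; /* bytes handled as 32-byte AVX2 loads */
  const int residual_bytes_1 = width & 31;  /* what is left after those (0..31)    */
  const int residual_xmms    = residual_bytes_1 >> 4; /* 0 or 1 full SSE chunk     */
  const int residual_bytes   = residual_bytes_1 & 15; /* masked tail, 0..15 bytes  */

  /* Prints "59 = 32 + 1*16 + 11": one AVX2 chunk, one SSE chunk, 11-byte tail. */
  printf("%d = %d + %d*16 + %d\n",
         width, largeblock_bytes, residual_xmms, residual_bytes);
  return 0;
}
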
@@ -56,42 +56,54 @@ uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const
   // Bytes in block in 256-bit blocks per each scanline, and remainder
   const int largeblock_bytes = width & ~31;
-  const int residual_bytes = width & 31;
+  const int residual_bytes_1 = width & 31;
+  const int residual_xmms = residual_bytes_1 >> 4;
+  const int residual_bytes = residual_bytes_1 & 15;
-  const __m256i rds = _mm256_set1_epi8(residual_bytes);
-  const __m256i ns = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
-                                      8, 9, 10, 11, 12, 13, 14, 15,
-                                      16, 17, 18, 19, 20, 21, 22, 23,
-                                      24, 25, 26, 27, 28, 29, 30, 31);
-  const __m256i rdmask = _mm256_cmpgt_epi8(rds, ns);
+  const __m128i rds = _mm_set1_epi8(residual_bytes);
+  const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
+                                   8, 9, 10, 11, 12, 13, 14, 15);
+  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);
   __m256i avx_inc = _mm256_setzero_si256();
+  __m128i sse_inc = _mm_setzero_si128();
   for (y = 0; y < height; ++y) {
     for (x = 0; x < largeblock_bytes; x += 32) {
       __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y * stride1 + x)));
       __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y * stride2 + x)));
       __m256i curr_sads = _mm256_sad_epu8(a, b);
       avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
     }
+    if (residual_xmms) {
+      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y * stride1 + x)));
+      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y * stride2 + x)));
+      __m128i curr_sads = _mm_sad_epu8 (a, b);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+      x += 16;
+    }
     if (residual_bytes) {
-      __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y * stride1 + x)));
-      __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y * stride2 + x)));
-      __m256i b_masked = _mm256_blendv_epi8(a, b, rdmask);
-      __m256i curr_sads = _mm256_sad_epu8 (a, b_masked);
-      avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
+      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y * stride1 + x)));
+      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y * stride2 + x)));
+      __m128i b_masked = _mm_blendv_epi8(a, b, rdmask);
+      __m128i curr_sads = _mm_sad_epu8(a, b_masked);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
     }
   }
   __m256i avx_inc_2 = _mm256_permute4x64_epi64(avx_inc, _MM_SHUFFLE(1, 0, 3, 2));
   __m256i avx_inc_3 = _mm256_add_epi64 (avx_inc, avx_inc_2);
   __m256i avx_inc_4 = _mm256_shuffle_epi32 (avx_inc_3, _MM_SHUFFLE(1, 0, 3, 2));
   __m256i avx_inc_5 = _mm256_add_epi64 (avx_inc_3, avx_inc_4);
+  __m128i avx_inc_128 = _mm256_castsi256_si128 (avx_inc_5);
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sse_sads = _mm_add_epi64 (sse_inc, sse_inc_2);
+  __m128i sads = _mm_add_epi64 (sse_sads, avx_inc_128);
   // 32 bits should always be enough for even the largest blocks with a SAD of
   // 255 in each pixel, even though the SAD results themselves are 64 bits
-  __m128i avx_inc_128 = _mm256_castsi256_si128(avx_inc_5);
-  return _mm_cvtsi128_si32(avx_inc_128);
+  return _mm_cvtsi128_si32(sads);
 }
 /**
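For reference, the whole routine computes an ordinary sum of absolute differences over a width x height block. A minimal scalar equivalent (not from the repository; it assumes kvz_pixel is the 8-bit pixel type used in 8-bit builds, with stride1/stride2 as the row strides of the two buffers) could look like this:

#include <stdint.h>
#include <stdlib.h>

typedef uint8_t kvz_pixel;   /* assumption: 8-bit pixel type */

/* Scalar reference for the vectorized kvz_reg_sad_avx2 above: sums
 * |data1 - data2| over a width x height block with per-buffer row strides. */
static uint32_t reg_sad_scalar(const kvz_pixel *data1, const kvz_pixel *data2,
                               int width, int height,
                               unsigned stride1, unsigned stride2)
{
  uint32_t sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      sad += abs((int)data1[y * stride1 + x] - (int)data2[y * stride2 + x]);
    }
  }
  return sad;
}

The vector code reaches the same result because _mm_blendv_epi8 copies data1's bytes into b_masked wherever rdmask is zero, so every byte position at or beyond residual_bytes contributes an absolute difference of zero to the tail SAD.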