Mirror of https://github.com/ultravideo/uvg266.git, synced 2024-11-27 19:24:06 +00:00
Use SSE instead of AVX for small widths
Highly dubious if this will help performance at all
parent b2176bf72a
commit 9b24d81c6a
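For reference, the routine this commit touches, kvz_reg_sad_avx2, computes a plain sum of absolute differences over a width x height block with independent row strides. Below is a minimal scalar sketch of the same result; it is illustrative only, not part of the patch, and it assumes 8-bit kvz_pixel, which is what the byte-wise intrinsics in the diff imply.

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar equivalent of the vectorized SAD: sum of |data1 - data2| over a
     * width x height block of 8-bit pixels, with separate row strides. */
    static uint32_t reg_sad_scalar(const uint8_t *data1, const uint8_t *data2,
                                   int width, int height,
                                   unsigned stride1, unsigned stride2)
    {
      uint32_t sad = 0;
      for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
          sad += abs((int)data1[y * stride1 + x] - (int)data2[y * stride2 + x]);
        }
      }
      return sad;
    }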
@@ -56,42 +56,54 @@ uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const
   // Bytes in block in 256-bit blocks per each scanline, and remainder
   const int largeblock_bytes = width & ~31;
-  const int residual_bytes   = width & 31;
+  const int residual_bytes_1 = width & 31;
+  const int residual_xmms    = residual_bytes_1 >> 4;
+  const int residual_bytes   = residual_bytes_1 & 15;
+
+  const __m128i rds    = _mm_set1_epi8(residual_bytes);
+  const __m128i ns     = _mm_setr_epi8(0, 1, 2,  3,  4,  5,  6,  7,
+                                       8, 9, 10, 11, 12, 13, 14, 15);
+  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);
 
-  const __m256i rds    = _mm256_set1_epi8(residual_bytes);
-  const __m256i ns     = _mm256_setr_epi8( 0,  1,  2,  3,  4,  5,  6,  7,
-                                           8,  9, 10, 11, 12, 13, 14, 15,
-                                          16, 17, 18, 19, 20, 21, 22, 23,
-                                          24, 25, 26, 27, 28, 29, 30, 31);
-  const __m256i rdmask = _mm256_cmpgt_epi8(rds, ns);
   __m256i avx_inc = _mm256_setzero_si256();
+  __m128i sse_inc = _mm_setzero_si128();
 
   for (y = 0; y < height; ++y) {
 
     for (x = 0; x < largeblock_bytes; x += 32) {
       __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y * stride1 + x)));
       __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y * stride2 + x)));
       __m256i curr_sads = _mm256_sad_epu8(a, b);
       avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
     }
+    if (residual_xmms) {
+      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y * stride1 + x)));
+      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y * stride2 + x)));
+      __m128i curr_sads = _mm_sad_epu8 (a, b);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+      x += 16;
+    }
     if (residual_bytes) {
-      __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y * stride1 + x)));
-      __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y * stride2 + x)));
+      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y * stride1 + x)));
+      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y * stride2 + x)));
 
-      __m256i b_masked  = _mm256_blendv_epi8(a, b, rdmask);
-      __m256i curr_sads = _mm256_sad_epu8 (a, b_masked);
-      avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
+      __m128i b_masked  = _mm_blendv_epi8(a, b, rdmask);
+      __m128i curr_sads = _mm_sad_epu8(a, b_masked);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
     }
   }
   __m256i avx_inc_2 = _mm256_permute4x64_epi64(avx_inc, _MM_SHUFFLE(1, 0, 3, 2));
   __m256i avx_inc_3 = _mm256_add_epi64       (avx_inc, avx_inc_2);
   __m256i avx_inc_4 = _mm256_shuffle_epi32   (avx_inc_3, _MM_SHUFFLE(1, 0, 3, 2));
   __m256i avx_inc_5 = _mm256_add_epi64       (avx_inc_3, avx_inc_4);
+  __m128i avx_inc_128 = _mm256_castsi256_si128 (avx_inc_5);
+
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sse_sads  = _mm_add_epi64    (sse_inc, sse_inc_2);
+  __m128i sads      = _mm_add_epi64    (sse_sads, avx_inc_128);
 
   // 32 bits should always be enough for even the largest blocks with a SAD of
   // 255 in each pixel, even though the SAD results themselves are 64 bits
-  __m128i avx_inc_128 = _mm256_castsi256_si128(avx_inc_5);
-  return _mm_cvtsi128_si32(avx_inc_128);
+  return _mm_cvtsi128_si32(sads);
 }
 
 /**
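The core of the change is the tail handling: after the patch, residual_bytes is always below 16, so a 16-lane SSE mask suffices, and blending b with a outside the valid lanes makes those lanes contribute |a - a| = 0 to the SAD. A standalone sketch of that trick follows; the function name is mine, and, like the patched code, it assumes a full 16-byte load past the end of the valid bytes is tolerable.

    #include <immintrin.h>
    #include <stdint.h>

    /* SAD of the final residual_bytes (< 16) bytes of a row. Lanes with index
     * < residual_bytes keep b; the rest are overwritten with a, so they add
     * zero to the result. Returns two 64-bit partial sums, as _mm_sad_epu8 does. */
    static __m128i tail_sad_sse(const uint8_t *p1, const uint8_t *p2,
                                int residual_bytes)
    {
      const __m128i rds    = _mm_set1_epi8((char)residual_bytes);
      const __m128i ns     = _mm_setr_epi8(0, 1, 2,  3,  4,  5,  6,  7,
                                           8, 9, 10, 11, 12, 13, 14, 15);
      const __m128i rdmask = _mm_cmpgt_epi8(rds, ns); /* 0xff where lane < residual_bytes */

      __m128i a        = _mm_loadu_si128((const __m128i *)p1);
      __m128i b        = _mm_loadu_si128((const __m128i *)p2);
      __m128i b_masked = _mm_blendv_epi8(a, b, rdmask); /* requires SSE4.1 */
      return _mm_sad_epu8(a, b_masked);
    }

As for the comment retained in the code about 32 bits being enough: even a 128 x 128 block with a difference of 255 in every pixel sums to 128 * 128 * 255 = 4 177 920, comfortably below 2^32, so truncating the 64-bit SAD lanes to 32 bits on return is safe.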