diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c
index b2b39075..e8de7455 100644
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@@ -53,18 +53,18 @@ uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const
                           const int width, const int height, const unsigned stride1, const unsigned stride2)
 {
   int32_t y, x;
-  uint32_t sad = 0;
-  __m256i avx_inc = _mm256_setzero_si256();
-  // 256-bit blocks, bytes after them, 32-bit blocks after the large blocks
-  const int largeblock_bytes = width & ~31;
-  const int any_residuals    = width &  31;
-  const int residual_128bs   = any_residuals >> 4;
-  const int residual_dwords  = any_residuals >> 2;
+  // Bytes in block in 256-bit blocks per each scanline, and remainder
+  const int largeblock_bytes = width & ~31;
+  const int residual_bytes   = width &  31;
 
-  const __m256i ns     = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
-  const __m256i rds    = _mm256_set1_epi32 (residual_dwords);
-  const __m256i rdmask = _mm256_cmpgt_epi32(rds, ns);
+  const __m256i rds    = _mm256_set1_epi8 (residual_bytes);
+  const __m256i ns     = _mm256_setr_epi8 (0,  1,  2,  3,  4,  5,  6,  7,
+                                           8,  9,  10, 11, 12, 13, 14, 15,
+                                           16, 17, 18, 19, 20, 21, 22, 23,
+                                           24, 25, 26, 27, 28, 29, 30, 31);
+  const __m256i rdmask = _mm256_cmpgt_epi8(rds, ns);
+  __m256i avx_inc = _mm256_setzero_si256();
 
   for (y = 0; y < height; ++y) {
@@ -74,22 +74,13 @@ uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const
       __m256i curr_sads = _mm256_sad_epu8(a, b);
       avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
     }
-
-    /*
-     * If there are no residual values, it does not matter what bogus values
-     * we use here since it will be masked away anyway
-     */
-    if (any_residuals) {
+    if (residual_bytes) {
       __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y * stride1 + x)));
       __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y * stride2 + x)));
 
       __m256i b_masked  = _mm256_blendv_epi8(a, b, rdmask);
       __m256i curr_sads = _mm256_sad_epu8   (a, b_masked);
       avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
-      x = width & ~(uint32_t)3;
-
-      for (; x < width; x++)
-        sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
     }
   }
   __m256i avx_inc_2 = _mm256_permute4x64_epi64(avx_inc, _MM_SHUFFLE(1, 0, 3, 2));
@@ -100,9 +91,7 @@ uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const
   // 32 bits should always be enough for even the largest blocks with a SAD of
   // 255 in each pixel, even though the SAD results themselves are 64 bits
   __m128i avx_inc_128 = _mm256_castsi256_si128(avx_inc_5);
-  sad += _mm_cvtsi128_si32(avx_inc_128);
-
-  return sad;
+  return _mm_cvtsi128_si32(avx_inc_128);
 }
 
 /**
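
For reference (not part of the patch): a minimal scalar sketch of the residual-byte masking the diff introduces. The idea is that _mm256_cmpgt_epi8(rds, ns) yields an all-ones lane wherever residual_bytes > lane index, and _mm256_blendv_epi8(a, b, rdmask) then replaces the out-of-range bytes of b with the matching bytes of a, so those lanes contribute |a - a| = 0 to the SAD. The function name, test arrays, and the residual count of 13 below are made up for illustration only.

/* Standalone sketch: models the effect of the per-byte mask on one
 * 32-byte residual chunk. Not the patched code itself. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* SAD of one 32-byte chunk where only the first residual_bytes lanes are
 * valid; invalid lanes of b are taken from a, so they add nothing. */
static uint32_t model_masked_sad_chunk(const uint8_t *a, const uint8_t *b,
                                       int residual_bytes)
{
  uint32_t sad = 0;
  for (int n = 0; n < 32; n++) {
    /* Mirrors _mm256_cmpgt_epi8(rds, ns) + _mm256_blendv_epi8(a, b, rdmask). */
    uint8_t b_masked = (residual_bytes > n) ? b[n] : a[n];
    sad += abs((int)a[n] - (int)b_masked);
  }
  return sad;
}

int main(void)
{
  uint8_t a[32], b[32];
  for (int i = 0; i < 32; i++) {
    a[i] = (uint8_t)(3 * i);
    b[i] = (uint8_t)(5 * i + 7);
  }
  /* e.g. width & 31 == 13: only the first 13 bytes count towards the SAD. */
  printf("masked SAD = %u\n", model_masked_sad_chunk(a, b, 13));
  return 0;
}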