From 768203a2de8ff4e7797c744fd1778917720c2d20 Mon Sep 17 00:00:00 2001 From: Pauli Oikkonen Date: Wed, 30 Jan 2019 22:57:06 +0200 Subject: [PATCH] First version of arbitrary-width SSE4.1 hor_sad --- src/image.c | 15 +++- src/strategies/avx2/picture-avx2.c | 13 +-- src/strategies/sse41/picture-sse41.c | 3 + .../sse41/reg_sad_pow2_widths-sse41.h | 82 ++++++++++++++++++- 4 files changed, 104 insertions(+), 9 deletions(-) diff --git a/src/image.c b/src/image.c index 08e75a0a..d4d645b6 100644 --- a/src/image.c +++ b/src/image.c @@ -413,6 +413,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture &ref_data[(block_height - bottom - 1) * ref->stride], block_width, bottom, pic->stride); } else if (left) { + /* if (block_width == 16 || block_width == 8 || block_width == 4 || block_width == 32) { result += kvz_hor_sad(pic_data, ref_data, block_width, block_height, pic->stride, @@ -425,19 +426,29 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture &ref_data[left], block_width - left, block_height, pic->stride, ref->stride); } + */ + result += kvz_hor_sad(pic_data, ref_data, + block_width, block_height, pic->stride, + ref->stride, left, right); } else if (right) { + /* if (block_width == 32) { result += kvz_hor_sad(pic_data, ref_data, block_width, block_height, pic->stride, ref->stride, left, right); } else { - result += kvz_reg_sad(pic_data, + rulli += kvz_reg_sad(pic_data, ref_data, block_width - right, block_height, pic->stride, ref->stride); - result += hor_sad(&pic_data[block_width - right], + rulli += hor_sad(&pic_data[block_width - right], &ref_data[block_width - right - 1], right, block_height, pic->stride, ref->stride); } + */ + // TODO: create a generic strat from ol' hor_sad + result += kvz_hor_sad(pic_data, ref_data, + block_width, block_height, pic->stride, + ref->stride, left, right); } else { result += reg_sad_maybe_optimized(pic_data, ref_data, block_width, block_height, pic->stride, ref->stride, optimized_sad); diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c index 39d022c8..60dc3df4 100644 --- a/src/strategies/avx2/picture-avx2.c +++ b/src/strategies/avx2/picture-avx2.c @@ -1299,19 +1299,22 @@ static uint32_t hor_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_dat int32_t width, int32_t height, uint32_t pic_stride, uint32_t ref_stride, uint32_t left, uint32_t right) { - if (width == 4) + // TODO TODO: create righty versions from these + if (width == 4 && left) return hor_sad_left_sse41_w4(pic_data, ref_data, width, height, pic_stride, ref_stride, left); - if (width == 8) + if (width == 8 && left) return hor_sad_left_sse41_w8(pic_data, ref_data, width, height, pic_stride, ref_stride, left); - if (width == 16) + if (width == 16 && left) return hor_sad_left_sse41_w16(pic_data, ref_data, width, height, pic_stride, ref_stride, left); - if (width == 32) + if (width == 32 && left) return hor_sad_sse41_w32(pic_data, ref_data, width, height, pic_stride, ref_stride, left, right); - assert(0); + else + return hor_sad_sse41_arbitrary(pic_data, ref_data, width, height, + pic_stride, ref_stride, left, right); } #endif //COMPILE_INTEL_AVX2 diff --git a/src/strategies/sse41/picture-sse41.c b/src/strategies/sse41/picture-sse41.c index 8c903475..d4fb8468 100644 --- a/src/strategies/sse41/picture-sse41.c +++ b/src/strategies/sse41/picture-sse41.c @@ -87,6 +87,7 @@ static uint32_t hor_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_da int32_t width, int32_t height, uint32_t pic_stride, uint32_t ref_stride, uint32_t left, uint32_t right) { + /* if (width == 4) return hor_sad_left_sse41_w4(pic_data, ref_data, width, height, pic_stride, ref_stride, left); @@ -99,7 +100,9 @@ static uint32_t hor_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_da if (width == 32) return hor_sad_sse41_w32(pic_data, ref_data, width, height, pic_stride, ref_stride, left, right); + */ assert(0); + return 0; } #endif //COMPILE_INTEL_SSE41 diff --git a/src/strategies/sse41/reg_sad_pow2_widths-sse41.h b/src/strategies/sse41/reg_sad_pow2_widths-sse41.h index 00e4ce7d..00edeca1 100644 --- a/src/strategies/sse41/reg_sad_pow2_widths-sse41.h +++ b/src/strategies/sse41/reg_sad_pow2_widths-sse41.h @@ -726,8 +726,8 @@ static uint32_t hor_sad_left_sse41_w16(const kvz_pixel *pic_data, const kvz_pixe } static uint32_t hor_sad_sse41_w32(const kvz_pixel *pic_data, const kvz_pixel *ref_data, - int32_t width, int32_t height, uint32_t pic_stride, - uint32_t ref_stride, uint32_t left, uint32_t right) + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right) { const int32_t height_twoline_groups = height & ~1; const int32_t height_residual_lines = height & 1; @@ -804,4 +804,82 @@ static uint32_t hor_sad_sse41_w32(const kvz_pixel *pic_data, const kvz_pixel *re return _mm_cvtsi128_si32(sad); } +static uint32_t hor_sad_sse41_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right) +{ + const size_t xmm_width = 16; + const __m128i xmm_widths = _mm_set1_epi8(xmm_width); + + // Bytes in block in 128-bit blocks per each scanline, and remainder + const int32_t width_xmms = width & ~(xmm_width - 1); + const int32_t width_residual_pixels = width & (xmm_width - 1); + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + + const __m128i rds = _mm_set1_epi8 (width_residual_pixels); + const __m128i rdmask = _mm_cmpgt_epi8(rds, ns); + + int32_t border_idx; + __m128i is_right_border = _mm_setzero_si128(); + if (left) { + border_idx = left; + } else { + border_idx = width - (right + 1); + is_right_border = _mm_cmpeq_epi8(is_right_border, is_right_border); + } + const __m128i epol_src_idx = _mm_set1_epi8(border_idx); + + int32_t x, y; + __m128i sse_inc = _mm_setzero_si128(); + __m128i epol_mask; + for (x = 0; x < width_xmms; x += xmm_width) { + + // This is a dirty hack, but it saves us an easily predicted branch! It + // also marks the first or last valid pixel (the border one) for + // extrapolating, but that makes no difference since the pixels marked + // for extrapolation will always be written over with that exact pixel's + // value. + epol_mask = _mm_cmpgt_epi8(epol_src_idx, ns); + epol_mask = _mm_xor_si128 (epol_mask, is_right_border); + + for (y = 0; y < height; y++) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + x)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + x)); + + __m128i border_px_b = _mm_set1_epi8 (*(uint8_t *)(ref_data + (y + 0) * ref_stride + border_idx)); + __m128i b_epol = _mm_blendv_epi8(b, border_px_b, epol_mask); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + } + ns = _mm_add_epi8(ns, xmm_widths); + } + if (width_residual_pixels) { + epol_mask = _mm_cmpgt_epi8(epol_src_idx, ns); + epol_mask = _mm_xor_si128 (epol_mask, is_right_border); + + for (y = 0; y < height; y++) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + x)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + x)); + + __m128i border_px_b = _mm_set1_epi8 (*(uint8_t *)(ref_data + (y + 0) * ref_stride + border_idx)); + __m128i b_epol_1 = _mm_blendv_epi8(b, border_px_b, epol_mask); + __m128i b_epol_2 = _mm_blendv_epi8(a, b_epol_1, rdmask); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol_2); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + #endif