From f781dc31f0c394dfbfc8e7e2d9bf77f335b29088 Mon Sep 17 00:00:00 2001
From: Pauli Oikkonen
Date: Tue, 22 Jan 2019 15:57:16 +0200
Subject: [PATCH] Create strategy for ver_sad

Easy to vectorize
---
 src/image.c                                  |  38 +--
 src/strategies/avx2/picture-avx2.c           |  19 ++
 src/strategies/generic/picture-generic.c     |  27 +++
 src/strategies/sse41/picture-sse41.c         |  18 ++
 .../sse41/reg_sad_pow2_widths-sse41.h        | 228 ++++++++++++++++++
 src/strategies/strategies-picture.c          |   1 +
 src/strategies/strategies-picture.h          |   6 +-
 7 files changed, 304 insertions(+), 33 deletions(-)

diff --git a/src/image.c b/src/image.c
index 7049f0fb..fe626d72 100644
--- a/src/image.c
+++ b/src/image.c
@@ -260,32 +260,6 @@ static unsigned cor_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
   return sad;
 }
 
-/**
- * \brief Vertically interpolate SAD outside the frame.
- *
- * \param data1 Starting point of the first picture.
- * \param data2 Starting point of the second picture.
- * \param width Width of the region for which SAD is calculated.
- * \param height Height of the region for which SAD is calculated.
- * \param width Width of the pixel array.
- *
- * \returns Sum of Absolute Differences
- */
-static unsigned ver_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
-                        int block_width, int block_height, unsigned pic_stride)
-{
-  int x, y;
-  unsigned sad = 0;
-
-  for (y = 0; y < block_height; ++y) {
-    for (x = 0; x < block_width; ++x) {
-      sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
-    }
-  }
-
-  return sad;
-}
-
 /**
  * \brief Horizontally interpolate SAD outside the frame.
  *
@@ -370,7 +344,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
     result += cor_sad(pic_data,
                       &ref_data[top * ref->stride + left],
                       left, top, pic->stride);
-    result += ver_sad(&pic_data[left],
+    result += kvz_ver_sad(&pic_data[left],
                       &ref_data[top * ref->stride + left],
                       block_width - left, top, pic->stride);
     result += hor_sad(&pic_data[top * pic->stride],
@@ -380,7 +354,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
                       &ref_data[top * ref->stride + left],
                       block_width - left, block_height - top, pic->stride, ref->stride);
   } else if (top && right) {
-    result += ver_sad(pic_data,
+    result += kvz_ver_sad(pic_data,
                       &ref_data[top * ref->stride],
                       block_width - right, top, pic->stride);
     result += cor_sad(&pic_data[block_width - right],
@@ -402,7 +376,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
     result += cor_sad(&pic_data[(block_height - bottom) * pic->stride],
                       &ref_data[(block_height - bottom - 1) * ref->stride + left],
                       left, bottom, pic->stride);
-    result += ver_sad(&pic_data[(block_height - bottom) * pic->stride + left],
+    result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride + left],
                       &ref_data[(block_height - bottom - 1) * ref->stride + left],
                       block_width - left, bottom, pic->stride);
   } else if (bottom && right) {
@@ -412,14 +386,14 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
     result += hor_sad(&pic_data[block_width - right],
                       &ref_data[block_width - right - 1],
                       right, block_height - bottom, pic->stride, ref->stride);
-    result += ver_sad(&pic_data[(block_height - bottom) * pic->stride],
+    result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride],
                       &ref_data[(block_height - bottom - 1) * ref->stride],
                       block_width - right, bottom, pic->stride);
     result += cor_sad(&pic_data[(block_height - bottom) * pic->stride + block_width - right],
                       &ref_data[(block_height - bottom - 1) * ref->stride + block_width - right - 1],
                      right, bottom, pic->stride);
   } else if (top) {
-    result += ver_sad(pic_data,
+    result += kvz_ver_sad(pic_data,
                       &ref_data[top * ref->stride],
                       block_width, top, pic->stride);
     result += reg_sad_maybe_optimized(&pic_data[top * pic->stride],
@@ -431,7 +405,7 @@ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture
                                       ref_data,
                                       block_width, block_height - bottom,
                                       pic->stride, ref->stride, optimized_sad);
-    result += ver_sad(&pic_data[(block_height - bottom) * pic->stride],
+    result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride],
                       &ref_data[(block_height - bottom - 1) * ref->stride],
                       block_width, bottom, pic->stride);
   } else if (left) {
diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c
index 5aa7fe23..c8dec42d 100644
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@@ -1277,6 +1277,24 @@ static optimized_sad_func_ptr_t get_optimized_sad_avx2(int32_t width)
   else
     return NULL;
 }
+
+static uint32_t ver_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                             int32_t width, int32_t height, uint32_t stride)
+{
+  if (width == 0)
+    return 0;
+  if (width == 4)
+    return ver_sad_w4(pic_data, ref_data, height, stride);
+  if (width == 8)
+    return ver_sad_w8(pic_data, ref_data, height, stride);
+  if (width == 12)
+    return ver_sad_w12(pic_data, ref_data, height, stride);
+  if (width == 16)
+    return ver_sad_w16(pic_data, ref_data, height, stride);
+  else
+    return ver_sad_arbitrary(pic_data, ref_data, width, height, stride);
+}
+
 #endif //COMPILE_INTEL_AVX2
 
 int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
@@ -1312,6 +1330,7 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
     success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2);
     success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2);
     success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "avx2", 40, &get_optimized_sad_avx2);
+    success &= kvz_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2);
   }
 #endif
 
diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c
index 8c7cbdfa..a49f099a 100644
--- a/src/strategies/generic/picture-generic.c
+++ b/src/strategies/generic/picture-generic.c
@@ -593,6 +593,32 @@ static optimized_sad_func_ptr_t get_optimized_sad_generic(int32_t width)
   return NULL;
 }
 
+/**
+ * \brief Vertically interpolate SAD outside the frame.
+ *
+ * \param pic_data Starting point of the first picture.
+ * \param ref_data Starting point of the second picture.
+ * \param block_width Width of the region for which SAD is calculated.
+ * \param block_height Height of the region for which SAD is calculated.
+ * \param pic_stride Stride of the first picture.
+ *
+ * \returns Sum of Absolute Differences
+ */
+static uint32_t ver_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                                int block_width, int block_height, unsigned pic_stride)
+{
+  int x, y;
+  unsigned sad = 0;
+
+  for (y = 0; y < block_height; ++y) {
+    for (x = 0; x < block_width; ++x) {
+      sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
+    }
+  }
+
+  return sad;
+}
+
 int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
 {
   bool success = true;
@@ -629,6 +655,7 @@ int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
   success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "generic", 0, &inter_recon_bipred_generic);
   success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "generic", 0, &get_optimized_sad_generic);
+  success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic);
 
   return success;
 }

diff --git a/src/strategies/sse41/picture-sse41.c b/src/strategies/sse41/picture-sse41.c
index d28202ef..65384f18 100644
--- a/src/strategies/sse41/picture-sse41.c
+++ b/src/strategies/sse41/picture-sse41.c
@@ -66,6 +66,23 @@ static optimized_sad_func_ptr_t get_optimized_sad_sse41(int32_t width)
   return NULL;
 }
 
+static uint32_t ver_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                              int32_t width, int32_t height, uint32_t stride)
+{
+  if (width == 0)
+    return 0;
+  if (width == 4)
+    return ver_sad_w4(pic_data, ref_data, height, stride);
+  if (width == 8)
+    return ver_sad_w8(pic_data, ref_data, height, stride);
+  if (width == 12)
+    return ver_sad_w12(pic_data, ref_data, height, stride);
+  if (width == 16)
+    return ver_sad_w16(pic_data, ref_data, height, stride);
+  else
+    return ver_sad_arbitrary(pic_data, ref_data, width, height, stride);
+}
+
 #endif //COMPILE_INTEL_SSE41
 
 
@@ -75,6 +92,7 @@ int kvz_strategy_register_picture_sse41(void* opaque, uint8_t bitdepth) {
   if (bitdepth == 8){
     success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, &kvz_reg_sad_sse41);
     success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "sse41", 20, &get_optimized_sad_sse41);
+    success &= kvz_strategyselector_register(opaque, "ver_sad", "sse41", 20, &ver_sad_sse41);
   }
 #endif
   return success;
diff --git a/src/strategies/sse41/reg_sad_pow2_widths-sse41.h b/src/strategies/sse41/reg_sad_pow2_widths-sse41.h
index a072b16b..d659ee7d 100644
--- a/src/strategies/sse41/reg_sad_pow2_widths-sse41.h
+++ b/src/strategies/sse41/reg_sad_pow2_widths-sse41.h
@@ -314,4 +314,232 @@ static INLINE uint32_t reg_sad_arbitrary(const kvz_pixel * const data1, const kv
   return _mm_cvtsi128_si32(sad);
 }
 
+static uint32_t ver_sad_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                           int32_t height, uint32_t stride)
+{
+  __m128i ref_row = _mm_set1_epi32(*(const uint32_t *)ref_data);
+  __m128i sse_inc = _mm_setzero_si128();
+  int32_t y;
+
+  const int32_t height_fourline_groups = height & ~3;
+  const int32_t height_residual_lines = height & 3;
+
+  for (y = 0; y < height_fourline_groups; y += 4) {
+    __m128i a = _mm_cvtsi32_si128(*(uint32_t *)(pic_data + y * stride));
+
+    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * stride), 1);
+    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * stride), 2);
+    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * stride), 3);
+
+    __m128i curr_sads = _mm_sad_epu8(a, ref_row);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+  }
+  if (height_residual_lines) {
+    // Only pick the last dword, because we're comparing single dwords (lines)
+    ref_row = _mm_bsrli_si128(ref_row, 12);
+
+    for (; y < height; y++) {
+      __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * stride));
+
+      __m128i curr_sads = _mm_sad_epu8(a, ref_row);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+    }
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
+
+  return _mm_cvtsi128_si32(sad);
+}
+
+static uint32_t ver_sad_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                           int32_t height, uint32_t stride)
+{
+  const __m128i ref_row = _mm_set1_epi64x(*(const uint64_t *)ref_data);
+  __m128i sse_inc = _mm_setzero_si128();
+  uint64_t result = 0;
+  int32_t y;
+
+  const int32_t height_fourline_groups = height & ~3;
+  const int32_t height_residual_lines = height & 3;
+
+  for (y = 0; y < height_fourline_groups; y += 4) {
+    __m128d a_d = _mm_setzero_pd();
+    __m128d c_d = _mm_setzero_pd();
+
+    a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * stride));
+    a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * stride));
+
+    c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * stride));
+    c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * stride));
+
+    __m128i a = _mm_castpd_si128(a_d);
+    __m128i c = _mm_castpd_si128(c_d);
+
+    __m128i curr_sads_ab = _mm_sad_epu8(a, ref_row);
+    __m128i curr_sads_cd = _mm_sad_epu8(c, ref_row);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
+  }
+  if (height_residual_lines) {
+    __m64 b = (__m64)_mm_cvtsi128_si64(ref_row);
+
+    for (; y < height; y++) {
+      __m64 a = *(__m64 *)(pic_data + y * stride);
+      __m64 sads = _mm_sad_pu8(a, b);
+      result += (uint64_t)sads;
+    }
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
+
+  result += _mm_cvtsi128_si32(sad);
+  return result;
+}
+
+static uint32_t ver_sad_w12(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                            int32_t height, uint32_t stride)
+{
+  const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
+  __m128i sse_inc = _mm_setzero_si128();
+  int32_t y;
+
+  for (y = 0; y < height; y++) {
+    __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride));
+
+    __m128i a_masked = _mm_blend_epi16(ref_row, a, 0x3f);
+    __m128i curr_sads = _mm_sad_epu8 (ref_row, a_masked);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
+  return _mm_cvtsi128_si32(sad);
+}
+
+static uint32_t ver_sad_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                            int32_t height, uint32_t stride)
+{
+  const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
+  __m128i sse_inc = _mm_setzero_si128();
+  int32_t y;
+
+  const int32_t height_fourline_groups = height & ~3;
+  const int32_t height_residual_lines = height & 3;
+
+  for (y = 0; y < height_fourline_groups; y += 4) {
+    __m128i pic_row_1 = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
+    __m128i pic_row_2 = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * stride));
+    __m128i pic_row_3 = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * stride));
+    __m128i pic_row_4 = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * stride));
+
+    __m128i curr_sads_1 = _mm_sad_epu8 (pic_row_1, ref_row);
+    __m128i curr_sads_2 = _mm_sad_epu8 (pic_row_2, ref_row);
+    __m128i curr_sads_3 = _mm_sad_epu8 (pic_row_3, ref_row);
+    __m128i curr_sads_4 = _mm_sad_epu8 (pic_row_4, ref_row);
+
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_3);
+    sse_inc = _mm_add_epi64(sse_inc, curr_sads_4);
+  }
+  if (height_residual_lines) {
+    for (; y < height; y++) {
+      __m128i pic_row = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
+      __m128i curr_sads = _mm_sad_epu8 (pic_row, ref_row);
+
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+    }
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
+
+  return _mm_cvtsi128_si32(sad);
+}
+
+static uint32_t ver_sad_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                                  int32_t width, int32_t height, uint32_t stride)
+{
+  int32_t y, x;
+  __m128i sse_inc = _mm_setzero_si128();
+
+  // Bytes in block in 128-bit blocks per each scanline, and remainder
+  const int32_t width_xmms = width & ~15;
+  const int32_t width_residual_pixels = width & 15;
+
+  const int32_t height_fourline_groups = height & ~3;
+  const int32_t height_residual_lines = height & 3;
+
+  const __m128i rds = _mm_set1_epi8 (width_residual_pixels);
+  const __m128i ns = _mm_setr_epi8 (0, 1, 2, 3, 4, 5, 6, 7,
+                                    8, 9, 10, 11, 12, 13, 14, 15);
+  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);
+
+  for (x = 0; x < width_xmms; x += 16) {
+    const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
+    for (y = 0; y < height_fourline_groups; y += 4) {
+      __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
+      __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
+      __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
+      __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));
+
+      __m128i curr_sads_ab = _mm_sad_epu8(ref_row, a);
+      __m128i curr_sads_cd = _mm_sad_epu8(ref_row, c);
+      __m128i curr_sads_ef = _mm_sad_epu8(ref_row, e);
+      __m128i curr_sads_gh = _mm_sad_epu8(ref_row, g);
+
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
+    }
+    if (height_residual_lines) {
+      for (; y < height; y++) {
+        __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));
+
+        __m128i curr_sads = _mm_sad_epu8(a, ref_row);
+
+        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+      }
+    }
+  }
+
+  if (width_residual_pixels) {
+    const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
+    for (y = 0; y < height_fourline_groups; y += 4) {
+      __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
+      __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
+      __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
+      __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));
+
+      __m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask);
+      __m128i c_masked = _mm_blendv_epi8(ref_row, c, rdmask);
+      __m128i e_masked = _mm_blendv_epi8(ref_row, e, rdmask);
+      __m128i g_masked = _mm_blendv_epi8(ref_row, g, rdmask);
+
+      __m128i curr_sads_ab = _mm_sad_epu8 (ref_row, a_masked);
+      __m128i curr_sads_cd = _mm_sad_epu8 (ref_row, c_masked);
+      __m128i curr_sads_ef = _mm_sad_epu8 (ref_row, e_masked);
+      __m128i curr_sads_gh = _mm_sad_epu8 (ref_row, g_masked);
+
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
+      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
+    }
+    if (height_residual_lines) {
+      for (; y < height; y++) {
+        __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));
+
+        __m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask);
+        __m128i curr_sads = _mm_sad_epu8 (ref_row, a_masked);
+
+        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
+      }
+    }
+  }
+  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
+  __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2);
+
+  return _mm_cvtsi128_si32(sad);
+}
+
 #endif
diff --git a/src/strategies/strategies-picture.c b/src/strategies/strategies-picture.c
index 58814376..f32a6e0d 100644
--- a/src/strategies/strategies-picture.c
+++ b/src/strategies/strategies-picture.c
@@ -64,6 +64,7 @@
 pixels_calc_ssd_func * kvz_pixels_calc_ssd = 0;
 inter_recon_bipred_func * kvz_inter_recon_bipred_blend = 0;
 
 get_optimized_sad_func *kvz_get_optimized_sad = 0;
+ver_sad_func *kvz_ver_sad = 0;
 
 int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) {
diff --git a/src/strategies/strategies-picture.h b/src/strategies/strategies-picture.h
index 4d6b3c32..d834033b 100644
--- a/src/strategies/strategies-picture.h
+++ b/src/strategies/strategies-picture.h
@@ -113,7 +113,9 @@ typedef void (cost_pixel_any_size_multi_func)(int width, int height, const kvz_p
 typedef unsigned (pixels_calc_ssd_func)(const kvz_pixel *const ref, const kvz_pixel *const rec, const int ref_stride, const int rec_stride, const int width);
 
 typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t);
-
+typedef uint32_t (ver_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
+                                int32_t block_width, int32_t block_height,
+                                uint32_t pic_stride);
 
 
 typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0,
@@ -167,6 +169,7 @@ extern pixels_calc_ssd_func *kvz_pixels_calc_ssd;
 extern inter_recon_bipred_func * kvz_inter_recon_bipred_blend;
 
 extern get_optimized_sad_func *kvz_get_optimized_sad;
+extern ver_sad_func *kvz_ver_sad;
 
 int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth);
 cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n);
@@ -201,6 +204,7 @@ cost_pixel_nxn_multi_func * kvz_pixels_get_sad_dual_func(unsigned n);
   {"pixels_calc_ssd", (void**) &kvz_pixels_calc_ssd}, \
   {"inter_recon_bipred", (void**) &kvz_inter_recon_bipred_blend}, \
   {"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \
+  {"ver_sad", (void**) &kvz_ver_sad}, \
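A note on the tail handling above: ver_sad_w12 and ver_sad_arbitrary rely on the fact that any lane where the reference row is blended over the picture row differs by zero and therefore adds nothing to the _mm_sad_epu8 result, so partial widths need no scalar fallback. The standalone sketch below is not part of the patch; it uses plain uint8_t buffers instead of kvz_pixel, and the helper names scalar_ver_sad and masked_row_sad are invented for illustration. It checks that masked per-row SADs summed over a block match the scalar ver_sad definition that image.c used before this change.

/* Illustrative sketch only -- not part of the patch. Demonstrates the
 * residual-width masking idea used by ver_sad_w12/ver_sad_arbitrary. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <smmintrin.h>  /* SSE4.1, for _mm_blendv_epi8 */

/* Scalar ver_sad: every picture row is compared against the single row at
 * ref, exactly like the function removed from image.c. */
static uint32_t scalar_ver_sad(const uint8_t *pic, const uint8_t *ref,
                               int width, int height, unsigned stride)
{
  uint32_t sad = 0;
  for (int y = 0; y < height; y++)
    for (int x = 0; x < width; x++)
      sad += abs(pic[y * stride + x] - ref[x]);
  return sad;
}

/* SAD of one 16-byte row where only the first `width` (1..15) bytes count. */
static uint32_t masked_row_sad(const uint8_t *pic_row, const uint8_t *ref_row,
                               int width)
{
  const __m128i rds    = _mm_set1_epi8((char)width);
  const __m128i ns     = _mm_setr_epi8(0, 1, 2,  3,  4,  5,  6,  7,
                                       8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);  /* 0xff for lanes < width */

  __m128i ref = _mm_loadu_si128((const __m128i *)ref_row);
  __m128i pic = _mm_loadu_si128((const __m128i *)pic_row);
  /* Lanes >= width take the ref byte, so their absolute difference is 0. */
  __m128i pic_masked = _mm_blendv_epi8(ref, pic, rdmask);
  __m128i sads       = _mm_sad_epu8(ref, pic_masked);

  /* _mm_sad_epu8 leaves one partial sum per 64-bit half; add them together. */
  return (uint32_t)_mm_cvtsi128_si32(sads) + (uint32_t)_mm_extract_epi16(sads, 4);
}

int main(void)
{
  enum { W = 11, H = 5, STRIDE = 16 };
  uint8_t pic[H * STRIDE], ref[16];

  for (int i = 0; i < H * STRIDE; i++) pic[i] = (uint8_t)(i * 7 + 3);
  for (int i = 0; i < 16; i++)         ref[i] = (uint8_t)(i * 13 + 1);

  uint32_t simd = 0;
  for (int y = 0; y < H; y++)
    simd += masked_row_sad(pic + y * STRIDE, ref, W);

  /* The two sums should agree for any width from 1 to 15. */
  printf("scalar: %u  simd: %u\n", scalar_ver_sad(pic, ref, W, H, STRIDE), simd);
  return 0;
}

Built with -msse4.1, the two printed sums should match for any residual width from 1 to 15. The same reasoning covers ver_sad_w12, where _mm_blend_epi16 keeps the top four reference bytes in place so they contribute zero to the SAD.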