From fa4bb86406138ca28eb6888ccf7f3c65fc917e83 Mon Sep 17 00:00:00 2001 From: Pauli Oikkonen Date: Tue, 19 Nov 2019 13:39:02 +0200 Subject: [PATCH] Optimize intra_pred_planar_avx2 for 4x4 blocks --- src/strategies/avx2/intra-avx2.c | 104 ++++++++++--------------------- 1 file changed, 33 insertions(+), 71 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 5ec49712..1a6af0fc 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -453,14 +453,6 @@ static void kvz_angular_pred_avx2( } } -void print_128(const __m128i v) -{ - uint8_t buf[16]; - _mm_storeu_si128((__m128i *)buf, v); - for (int i = 0; i < 16; i++) - printf("%.2x%c", buf[i], i == 15 ? '\n' : i == 7 ? '-' : ' '); -} - /** * \brief Generate planar prediction. * \param log2_width Log2 of width, range 2..5. @@ -510,13 +502,18 @@ static void kvz_intra_pred_planar_avx2( } else { // Only if log2_width == 2 <=> width == 4 assert(width == 4); - - const __m128i ones = _mm_set1_epi8(1); const __m128i rl_shufmask = _mm_setr_epi32(0x04040404, 0x05050505, 0x06060606, 0x07070707); - __m128i rl_v = _mm_cvtsi32_si128(*(const uint32_t *)(ref_left + 1)); - __m128i rt_v = _mm_cvtsi32_si128(*(const uint32_t *)(ref_top + 1)); + const __m128i xp1 = _mm_set1_epi32 (0x04030201); + const __m128i yp1 = _mm_shuffle_epi8(xp1, rl_shufmask); + + const __m128i rdist = _mm_set1_epi32 (0x00010203); + const __m128i bdist = _mm_shuffle_epi8(rdist, rl_shufmask); + + const __m128i wid16 = _mm_set1_epi16 (width); + const __m128i tr = _mm_set1_epi8 (top_right); + const __m128i bl = _mm_set1_epi8 (bottom_left); uint32_t rt14 = *(const uint32_t *)(ref_top + 1); uint32_t rl14 = *(const uint32_t *)(ref_left + 1); @@ -524,73 +521,37 @@ static void kvz_intra_pred_planar_avx2( uint64_t rl14_64 = (uint64_t)rl14; uint64_t rtl14 = rt14_64 | (rl14_64 << 32); - __m128i rtl_v = _mm_cvtsi64_si128 (rtl14); - __m128i rt = _mm_broadcastd_epi32(rtl_v); - __m128i rl = _mm_shuffle_epi8 (rtl_v, rl_shufmask); + __m128i rtl_v = _mm_cvtsi64_si128 (rtl14); + __m128i rt = _mm_broadcastd_epi32(rtl_v); + __m128i rl = _mm_shuffle_epi8 (rtl_v, rl_shufmask); - __m128i xp1 = _mm_set1_epi32 (0x04030201); - __m128i yp1 = _mm_shuffle_epi8(xp1, rl_shufmask); + __m128i rtrl_l = _mm_unpacklo_epi8 (rt, rl); + __m128i rtrl_h = _mm_unpackhi_epi8 (rt, rl); - __m128i rdist = _mm_set1_epi32 (0x00010203); - __m128i bdist = _mm_shuffle_epi8(rdist, rl_shufmask); + __m128i bdrd_l = _mm_unpacklo_epi8 (bdist, rdist); + __m128i bdrd_h = _mm_unpackhi_epi8 (bdist, rdist); - /////////////////////////////////////////////////////////////// + __m128i hvs_lo = _mm_maddubs_epi16 (rtrl_l, bdrd_l); + __m128i hvs_hi = _mm_maddubs_epi16 (rtrl_h, bdrd_h); - __m128i rtrl_l = _mm_unpacklo_epi8(rt, rl); - __m128i rtrl_h = _mm_unpackhi_epi8(rt, rl); + __m128i xp1yp1_l = _mm_unpacklo_epi8 (xp1, yp1); + __m128i xp1yp1_h = _mm_unpackhi_epi8 (xp1, yp1); + __m128i trbl_lh = _mm_unpacklo_epi8 (tr, bl); - __m128i bdrd_l = _mm_unpacklo_epi8(bdist, rdist); - __m128i bdrd_h = _mm_unpackhi_epi8(bdist, rdist); + __m128i addend_l = _mm_maddubs_epi16 (trbl_lh, xp1yp1_l); + __m128i addend_h = _mm_maddubs_epi16 (trbl_lh, xp1yp1_h); - __m128i hvs_lo = _mm_maddubs_epi16(rtrl_l, bdrd_l); - __m128i hvs_hi = _mm_maddubs_epi16(rtrl_h, bdrd_h); + addend_l = _mm_add_epi16 (addend_l, wid16); + addend_h = _mm_add_epi16 (addend_h, wid16); - int asdf = 0; - for (int i = 0; i < 4; i++) { - if (ref_top[width + 1 + i] == 0x80) { - asdf = 1; - break; - } - } - if (!asdf) { - /*printf("hvs_lo: ");*/ print_128(hvs_lo); - /*printf("hvs_hi: "); */print_128(hvs_hi); - } + __m128i sum_l = _mm_add_epi16 (hvs_lo, addend_l); + __m128i sum_h = _mm_add_epi16 (hvs_hi, addend_h); - // rt1 rt2 rt3 rt4 rl1 rl2 rl3 rl4 0 0 0 0 0 0 0 0 - // __m128i rtl_v = _mm_cvtsi64_si128(rtl14); - - for (int y = 0; y < width; ++y) { - - uint8_t yp1 = y + 1; - uint16_t yp1_bl = yp1 * bottom_left; - uint8_t rl_curr = ref_left[y + 1]; // 1...4 - uint8_t bdist_s = width - 1 - y; - - for (int x = 0; x < width; ++x) { - uint8_t xp1 = x + 1; - uint16_t xp1_tr = xp1 * top_right; - uint8_t rt_curr = ref_top[x + 1]; - - uint8_t rdist_s = width - 1 - x; - - int16_t hor = rdist_s * rl_curr; - int16_t ver = bdist_s * rt_curr; - int16_t hv_sum = hor + ver; - if (!asdf) { - int daa = x + y * width; - printf("%.2x %.2x%c", hv_sum & 0xff, hv_sum >> 8, ((daa & 7) == 7) ? '\n' : ((daa & 3) == 3) ? '-' : ' '); - } - hv_sum += width; - hv_sum += xp1_tr + yp1_bl; - - dst[y * width + x] = hv_sum >> (log2_width + 1); - } - } - if (!asdf) { - fflush(stdout); - asm("int $3"); - } + // Shift right by log2_width + 1 + __m128i sum_l_t = _mm_srli_epi16 (sum_l, 3); + __m128i sum_h_t = _mm_srli_epi16 (sum_h, 3); + __m128i result = _mm_packus_epi16 (sum_l_t, sum_h_t); + _mm_storeu_si128((__m128i *)dst, result); } } @@ -608,3 +569,4 @@ int kvz_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth) #endif //COMPILE_INTEL_AVX2 && defined X86_64 return success; } +