Optimize intra_pred_planar_avx2 for 4x4 blocks

2024-11-24 02:24:07 +00:00 · 2019-11-19 13:39:02 +02:00 · 2019-11-19 13:39:02 +02:00 · fa4bb86406
parent 4761d228f9
commit fa4bb86406
1 changed files with 33 additions and 71 deletions
--- a/src/strategies/avx2/intra-avx2.c
+++ b/src/strategies/avx2/intra-avx2.c
@ -453,14 +453,6 @@ static void kvz_angular_pred_avx2(
  }
 }
 void print_128(const __m128i v)
 {
  uint8_t buf[16];
  _mm_storeu_si128((__m128i *)buf, v);
  for (int i = 0; i < 16; i++)
    printf("%.2x%c", buf[i], i == 15 ? '\n' : i == 7 ? '-' : ' ');
 }
 /**
 * \brief Generate planar prediction.
 * \param log2_width    Log2 of width, range 2..5.
@ -510,13 +502,18 @@ static void kvz_intra_pred_planar_avx2(
  } else {
    // Only if log2_width == 2 <=> width == 4
    assert(width == 4);
    const __m128i ones = _mm_set1_epi8(1);
    const __m128i rl_shufmask = _mm_setr_epi32(0x04040404, 0x05050505,
                                               0x06060606, 0x07070707);
-    __m128i rl_v = _mm_cvtsi32_si128(*(const uint32_t *)(ref_left + 1));
+    const __m128i xp1   = _mm_set1_epi32  (0x04030201);
-    __m128i rt_v = _mm_cvtsi32_si128(*(const uint32_t *)(ref_top  + 1));
+    const __m128i yp1   = _mm_shuffle_epi8(xp1,   rl_shufmask);
    const __m128i rdist = _mm_set1_epi32  (0x00010203);
    const __m128i bdist = _mm_shuffle_epi8(rdist, rl_shufmask);
    const __m128i wid16 = _mm_set1_epi16  (width);
    const __m128i tr    = _mm_set1_epi8   (top_right);
    const __m128i bl    = _mm_set1_epi8   (bottom_left);
    uint32_t rt14    = *(const uint32_t *)(ref_top  + 1);
    uint32_t rl14    = *(const uint32_t *)(ref_left + 1);
@ -528,69 +525,33 @@ static void kvz_intra_pred_planar_avx2(
    __m128i rt       = _mm_broadcastd_epi32(rtl_v);
    __m128i rl       = _mm_shuffle_epi8    (rtl_v,    rl_shufmask);
-    __m128i xp1   = _mm_set1_epi32  (0x04030201);
+    __m128i rtrl_l   = _mm_unpacklo_epi8   (rt,       rl);
-    __m128i yp1   = _mm_shuffle_epi8(xp1,   rl_shufmask);
+    __m128i rtrl_h   = _mm_unpackhi_epi8   (rt,       rl);
-    __m128i rdist = _mm_set1_epi32  (0x00010203);
+    __m128i bdrd_l   = _mm_unpacklo_epi8   (bdist,    rdist);
-    __m128i bdist = _mm_shuffle_epi8(rdist, rl_shufmask);
+    __m128i bdrd_h   = _mm_unpackhi_epi8   (bdist,    rdist);
-    ///////////////////////////////////////////////////////////////
+    __m128i hvs_lo   = _mm_maddubs_epi16   (rtrl_l,   bdrd_l);
    __m128i hvs_hi   = _mm_maddubs_epi16   (rtrl_h,   bdrd_h);
-    __m128i rtrl_l = _mm_unpacklo_epi8(rt,     rl);
+    __m128i xp1yp1_l = _mm_unpacklo_epi8   (xp1,      yp1);
-    __m128i rtrl_h = _mm_unpackhi_epi8(rt,     rl);
+    __m128i xp1yp1_h = _mm_unpackhi_epi8   (xp1,      yp1);
    __m128i trbl_lh  = _mm_unpacklo_epi8   (tr,       bl);
-    __m128i bdrd_l = _mm_unpacklo_epi8(bdist,  rdist);
+    __m128i addend_l = _mm_maddubs_epi16   (trbl_lh,  xp1yp1_l);
-    __m128i bdrd_h = _mm_unpackhi_epi8(bdist,  rdist);
+    __m128i addend_h = _mm_maddubs_epi16   (trbl_lh,  xp1yp1_h);
-    __m128i hvs_lo = _mm_maddubs_epi16(rtrl_l, bdrd_l);
+            addend_l = _mm_add_epi16       (addend_l, wid16);
-    __m128i hvs_hi = _mm_maddubs_epi16(rtrl_h, bdrd_h);
+            addend_h = _mm_add_epi16       (addend_h, wid16);
-    int asdf = 0;
+    __m128i sum_l    = _mm_add_epi16       (hvs_lo,   addend_l);
-    for (int i = 0; i < 4; i++) {
+    __m128i sum_h    = _mm_add_epi16       (hvs_hi,   addend_h);
      if (ref_top[width + 1 + i] == 0x80) {
        asdf = 1;
        break;
      }
    }
    if (!asdf) {
      /*printf("hvs_lo: ");*/ print_128(hvs_lo);
      /*printf("hvs_hi: "); */print_128(hvs_hi);
    }
-    // rt1 rt2 rt3 rt4 rl1 rl2 rl3 rl4 0 0 0 0 0 0 0 0
+    // Shift right by log2_width + 1
-    // __m128i  rtl_v   = _mm_cvtsi64_si128(rtl14);
+    __m128i sum_l_t  = _mm_srli_epi16      (sum_l,    3);
-
+    __m128i sum_h_t  = _mm_srli_epi16      (sum_h,    3);
-    for (int y = 0; y < width; ++y) {
+    __m128i result   = _mm_packus_epi16    (sum_l_t,  sum_h_t);
-
+    _mm_storeu_si128((__m128i *)dst, result);
      uint8_t  yp1     = y + 1;
      uint16_t yp1_bl  = yp1 * bottom_left;
      uint8_t  rl_curr = ref_left[y + 1]; // 1...4
      uint8_t  bdist_s = width - 1 - y;
      for (int x = 0; x < width; ++x) {
        uint8_t  xp1     = x + 1;
        uint16_t xp1_tr  = xp1 * top_right;
        uint8_t  rt_curr = ref_top[x + 1];
        uint8_t  rdist_s = width - 1 - x;
        int16_t  hor     = rdist_s * rl_curr;
        int16_t  ver     = bdist_s * rt_curr;
        int16_t  hv_sum  = hor + ver;
        if (!asdf) {
          int daa = x + y * width;
          printf("%.2x %.2x%c", hv_sum & 0xff, hv_sum >> 8, ((daa & 7) == 7) ? '\n' : ((daa & 3) == 3) ? '-' : ' ');
        }
                 hv_sum += width;
                 hv_sum += xp1_tr + yp1_bl;
        dst[y * width + x] = hv_sum >> (log2_width + 1);
      }
    }
    if (!asdf) {
      fflush(stdout);
      asm("int $3");
    }
  }
 }
@ -608,3 +569,4 @@ int kvz_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth)
 #endif //COMPILE_INTEL_AVX2 && defined X86_64
  return success;
 }