mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-24 02:24:07 +00:00
Optimize intra_pred_planar_avx2 for 4x4 blocks
This commit is contained in:
parent
4761d228f9
commit
fa4bb86406
|
@ -453,14 +453,6 @@ static void kvz_angular_pred_avx2(
|
|||
}
|
||||
}
|
||||
|
||||
void print_128(const __m128i v)
|
||||
{
|
||||
uint8_t buf[16];
|
||||
_mm_storeu_si128((__m128i *)buf, v);
|
||||
for (int i = 0; i < 16; i++)
|
||||
printf("%.2x%c", buf[i], i == 15 ? '\n' : i == 7 ? '-' : ' ');
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Generate planar prediction.
|
||||
* \param log2_width Log2 of width, range 2..5.
|
||||
|
@ -510,13 +502,18 @@ static void kvz_intra_pred_planar_avx2(
|
|||
} else {
|
||||
// Only if log2_width == 2 <=> width == 4
|
||||
assert(width == 4);
|
||||
|
||||
const __m128i ones = _mm_set1_epi8(1);
|
||||
const __m128i rl_shufmask = _mm_setr_epi32(0x04040404, 0x05050505,
|
||||
0x06060606, 0x07070707);
|
||||
|
||||
__m128i rl_v = _mm_cvtsi32_si128(*(const uint32_t *)(ref_left + 1));
|
||||
__m128i rt_v = _mm_cvtsi32_si128(*(const uint32_t *)(ref_top + 1));
|
||||
const __m128i xp1 = _mm_set1_epi32 (0x04030201);
|
||||
const __m128i yp1 = _mm_shuffle_epi8(xp1, rl_shufmask);
|
||||
|
||||
const __m128i rdist = _mm_set1_epi32 (0x00010203);
|
||||
const __m128i bdist = _mm_shuffle_epi8(rdist, rl_shufmask);
|
||||
|
||||
const __m128i wid16 = _mm_set1_epi16 (width);
|
||||
const __m128i tr = _mm_set1_epi8 (top_right);
|
||||
const __m128i bl = _mm_set1_epi8 (bottom_left);
|
||||
|
||||
uint32_t rt14 = *(const uint32_t *)(ref_top + 1);
|
||||
uint32_t rl14 = *(const uint32_t *)(ref_left + 1);
|
||||
|
@ -524,73 +521,37 @@ static void kvz_intra_pred_planar_avx2(
|
|||
uint64_t rl14_64 = (uint64_t)rl14;
|
||||
uint64_t rtl14 = rt14_64 | (rl14_64 << 32);
|
||||
|
||||
__m128i rtl_v = _mm_cvtsi64_si128 (rtl14);
|
||||
__m128i rt = _mm_broadcastd_epi32(rtl_v);
|
||||
__m128i rl = _mm_shuffle_epi8 (rtl_v, rl_shufmask);
|
||||
__m128i rtl_v = _mm_cvtsi64_si128 (rtl14);
|
||||
__m128i rt = _mm_broadcastd_epi32(rtl_v);
|
||||
__m128i rl = _mm_shuffle_epi8 (rtl_v, rl_shufmask);
|
||||
|
||||
__m128i xp1 = _mm_set1_epi32 (0x04030201);
|
||||
__m128i yp1 = _mm_shuffle_epi8(xp1, rl_shufmask);
|
||||
__m128i rtrl_l = _mm_unpacklo_epi8 (rt, rl);
|
||||
__m128i rtrl_h = _mm_unpackhi_epi8 (rt, rl);
|
||||
|
||||
__m128i rdist = _mm_set1_epi32 (0x00010203);
|
||||
__m128i bdist = _mm_shuffle_epi8(rdist, rl_shufmask);
|
||||
__m128i bdrd_l = _mm_unpacklo_epi8 (bdist, rdist);
|
||||
__m128i bdrd_h = _mm_unpackhi_epi8 (bdist, rdist);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
__m128i hvs_lo = _mm_maddubs_epi16 (rtrl_l, bdrd_l);
|
||||
__m128i hvs_hi = _mm_maddubs_epi16 (rtrl_h, bdrd_h);
|
||||
|
||||
__m128i rtrl_l = _mm_unpacklo_epi8(rt, rl);
|
||||
__m128i rtrl_h = _mm_unpackhi_epi8(rt, rl);
|
||||
__m128i xp1yp1_l = _mm_unpacklo_epi8 (xp1, yp1);
|
||||
__m128i xp1yp1_h = _mm_unpackhi_epi8 (xp1, yp1);
|
||||
__m128i trbl_lh = _mm_unpacklo_epi8 (tr, bl);
|
||||
|
||||
__m128i bdrd_l = _mm_unpacklo_epi8(bdist, rdist);
|
||||
__m128i bdrd_h = _mm_unpackhi_epi8(bdist, rdist);
|
||||
__m128i addend_l = _mm_maddubs_epi16 (trbl_lh, xp1yp1_l);
|
||||
__m128i addend_h = _mm_maddubs_epi16 (trbl_lh, xp1yp1_h);
|
||||
|
||||
__m128i hvs_lo = _mm_maddubs_epi16(rtrl_l, bdrd_l);
|
||||
__m128i hvs_hi = _mm_maddubs_epi16(rtrl_h, bdrd_h);
|
||||
addend_l = _mm_add_epi16 (addend_l, wid16);
|
||||
addend_h = _mm_add_epi16 (addend_h, wid16);
|
||||
|
||||
int asdf = 0;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (ref_top[width + 1 + i] == 0x80) {
|
||||
asdf = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!asdf) {
|
||||
/*printf("hvs_lo: ");*/ print_128(hvs_lo);
|
||||
/*printf("hvs_hi: "); */print_128(hvs_hi);
|
||||
}
|
||||
__m128i sum_l = _mm_add_epi16 (hvs_lo, addend_l);
|
||||
__m128i sum_h = _mm_add_epi16 (hvs_hi, addend_h);
|
||||
|
||||
// rt1 rt2 rt3 rt4 rl1 rl2 rl3 rl4 0 0 0 0 0 0 0 0
|
||||
// __m128i rtl_v = _mm_cvtsi64_si128(rtl14);
|
||||
|
||||
for (int y = 0; y < width; ++y) {
|
||||
|
||||
uint8_t yp1 = y + 1;
|
||||
uint16_t yp1_bl = yp1 * bottom_left;
|
||||
uint8_t rl_curr = ref_left[y + 1]; // 1...4
|
||||
uint8_t bdist_s = width - 1 - y;
|
||||
|
||||
for (int x = 0; x < width; ++x) {
|
||||
uint8_t xp1 = x + 1;
|
||||
uint16_t xp1_tr = xp1 * top_right;
|
||||
uint8_t rt_curr = ref_top[x + 1];
|
||||
|
||||
uint8_t rdist_s = width - 1 - x;
|
||||
|
||||
int16_t hor = rdist_s * rl_curr;
|
||||
int16_t ver = bdist_s * rt_curr;
|
||||
int16_t hv_sum = hor + ver;
|
||||
if (!asdf) {
|
||||
int daa = x + y * width;
|
||||
printf("%.2x %.2x%c", hv_sum & 0xff, hv_sum >> 8, ((daa & 7) == 7) ? '\n' : ((daa & 3) == 3) ? '-' : ' ');
|
||||
}
|
||||
hv_sum += width;
|
||||
hv_sum += xp1_tr + yp1_bl;
|
||||
|
||||
dst[y * width + x] = hv_sum >> (log2_width + 1);
|
||||
}
|
||||
}
|
||||
if (!asdf) {
|
||||
fflush(stdout);
|
||||
asm("int $3");
|
||||
}
|
||||
// Shift right by log2_width + 1
|
||||
__m128i sum_l_t = _mm_srli_epi16 (sum_l, 3);
|
||||
__m128i sum_h_t = _mm_srli_epi16 (sum_h, 3);
|
||||
__m128i result = _mm_packus_epi16 (sum_l_t, sum_h_t);
|
||||
_mm_storeu_si128((__m128i *)dst, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -608,3 +569,4 @@ int kvz_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth)
|
|||
#endif //COMPILE_INTEL_AVX2 && defined X86_64
|
||||
return success;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue