mirror of
https://github.com/ultravideo/uvg266.git
synced 2024-11-24 02:24:07 +00:00
[SIMD] Initial AVX2 code for PDPC in angular prediction
This commit is contained in:
parent
7ccd1a571c
commit
913573baca
|
@ -362,11 +362,45 @@ static void kvz_angular_pred_avx2(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(PDPC_filter) {
|
if(PDPC_filter) {
|
||||||
for (int yy = 0; yy < 4; ++yy) {
|
|
||||||
|
int16_t wL[4];
|
||||||
|
int16_t left[4][4];
|
||||||
|
for (int x = 0; x + 3 < (MIN(3 << scale, width) & ~0x3); x += 4) {
|
||||||
|
for (int xx = 0; xx < 4; ++xx) {
|
||||||
|
int inv_angle_sum = 256 + (x + xx + 1) * modedisp2invsampledisp[abs(mode_disp)];
|
||||||
|
wL[xx] = 32 >> (2 * (x + xx) >> scale);
|
||||||
|
|
||||||
int inv_angle_sum = 256;
|
for (int yy = 0; yy < 4; ++yy) {
|
||||||
for (int x = 0; x < MIN(3 << scale, width); x++) {
|
left[yy][xx] = ref_side[(y + yy) + (inv_angle_sum >> 9) + 1];
|
||||||
inv_angle_sum += modedisp2invsampledisp[abs(mode_disp)];
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__m128i vseq = _mm_setr_epi32(0, 1, 2, 3);
|
||||||
|
__m128i vidx = _mm_slli_epi32(vseq, log2_width);
|
||||||
|
__m256i vdst = _mm256_cvtepu8_epi16(_mm_i32gather_epi32((uint32_t*)(dst + y * width + x), vidx, 1));
|
||||||
|
__m256i vleft = _mm256_loadu_si256((__m256i*)left);
|
||||||
|
__m256i vwL = _mm256_set1_epi64x(*(uint64_t*)wL);
|
||||||
|
__m256i accu = _mm256_sub_epi16(vleft, vdst);
|
||||||
|
accu = _mm256_mullo_epi16(vwL, accu);
|
||||||
|
accu = _mm256_add_epi16(accu, _mm256_set1_epi16(32));
|
||||||
|
accu = _mm256_srai_epi16(accu, 6);
|
||||||
|
accu = _mm256_add_epi16(vdst, accu);
|
||||||
|
|
||||||
|
__m128i lo = _mm256_castsi256_si128(accu);
|
||||||
|
__m128i hi = _mm256_extracti128_si256(accu, 1);
|
||||||
|
__m128i filtered = _mm_packus_epi16(lo, hi);
|
||||||
|
|
||||||
|
*(uint32_t*)(dst + (y + 0) * width + x) = _mm_extract_epi32(filtered, 0);
|
||||||
|
*(uint32_t*)(dst + (y + 1) * width + x) = _mm_extract_epi32(filtered, 1);
|
||||||
|
*(uint32_t*)(dst + (y + 2) * width + x) = _mm_extract_epi32(filtered, 2);
|
||||||
|
*(uint32_t*)(dst + (y + 3) * width + x) = _mm_extract_epi32(filtered, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remainder
|
||||||
|
int x_rem = MIN(3 << scale, width) & ~0x3;
|
||||||
|
for (int yy = 0; yy < 4; yy++) {
|
||||||
|
for (int x = x_rem; x < MIN(3 << scale, width); x++) {
|
||||||
|
int inv_angle_sum = 256 + (x + 1) * modedisp2invsampledisp[abs(mode_disp)];
|
||||||
|
|
||||||
int wL = 32 >> (2 * x >> scale);
|
int wL = 32 >> (2 * x >> scale);
|
||||||
const kvz_pixel left = ref_side[(y + yy) + (inv_angle_sum >> 9) + 1];
|
const kvz_pixel left = ref_side[(y + yy) + (inv_angle_sum >> 9) + 1];
|
||||||
|
|
Loading…
Reference in a new issue