mirror of https://github.com/ultravideo/uvg266.git (synced 2024-11-24 02:24:07 +00:00)

[SIMD] Copy generic implementation of angular prediction as a skeleton.

parent 450cbd356c
commit 3dfe09e850

@@ -31,342 +31,6 @@
 #include "strategies/missing-intel-intrinsics.h"
 
 
-/**
- * \brief Linear interpolation for 4 pixels. Returns 4 filtered pixels in lowest 32-bits of the register.
- * \param ref_main Reference pixels
- * \param delta_pos Fractional pixel precise position of sample displacement
- * \param x Sample offset in direction x in ref_main array
- */
-static INLINE __m128i filter_4x1_avx2(const uint8_t *ref_main, int16_t delta_pos, int x){
-
-  int8_t delta_int = delta_pos >> 5;
-  int8_t delta_fract = delta_pos & (32-1);
-  __m128i sample0 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int]));
-  __m128i sample1 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int + 1]));
-  __m128i pairs = _mm_unpacklo_epi8(sample0, sample1);
-  __m128i weight = _mm_set1_epi16( (delta_fract << 8) | (32 - delta_fract) );
-  sample0 = _mm_maddubs_epi16(pairs, weight);
-  sample0 = _mm_add_epi16(sample0, _mm_set1_epi16(16));
-  sample0 = _mm_srli_epi16(sample0, 5);
-  sample0 = _mm_packus_epi16(sample0, sample0);
-
-  return sample0;
-}
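
For orientation, the intrinsics above implement ordinary two-tap linear interpolation at 1/32-sample precision. A scalar sketch of the same computation (illustrative only, not part of the patch; the helper name is made up):

    // ((32 - f) * a + f * b + 16) >> 5 for four adjacent reference pixels.
    static inline void filter_4x1_scalar(uint8_t out[4], const uint8_t *ref_main,
                                         int16_t delta_pos, int x)
    {
      int delta_int   = delta_pos >> 5;        // integer part of the displacement
      int delta_fract = delta_pos & (32 - 1);  // fractional part, 1/32-sample units
      for (int i = 0; i < 4; ++i) {
        int a = ref_main[x + delta_int + i];
        int b = ref_main[x + delta_int + i + 1];
        out[i] = (uint8_t)(((32 - delta_fract) * a + delta_fract * b + 16) >> 5);
      }
    }
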
-
-/**
- * \brief Linear interpolation for 4x4 block. Writes filtered 4x4 block to dst.
- * \param dst Destination buffer
- * \param ref_main Reference pixels
- * \param sample_disp Sample displacement per row
- * \param vertical_mode Mode direction, true if vertical
- */
-static void filter_4x4_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode){
-
-  __m128i row0 = filter_4x1_avx2(ref_main, 1 * sample_disp, 0);
-  __m128i row1 = filter_4x1_avx2(ref_main, 2 * sample_disp, 0);
-  __m128i row2 = filter_4x1_avx2(ref_main, 3 * sample_disp, 0);
-  __m128i row3 = filter_4x1_avx2(ref_main, 4 * sample_disp, 0);
-
-  //Transpose if horizontal mode
-  if (!vertical_mode) {
-    __m128i temp = _mm_unpacklo_epi16(_mm_unpacklo_epi8(row0, row1), _mm_unpacklo_epi8(row2, row3));
-    row0 = _mm_cvtsi32_si128(_mm_extract_epi32(temp, 0));
-    row1 = _mm_cvtsi32_si128(_mm_extract_epi32(temp, 1));
-    row2 = _mm_cvtsi32_si128(_mm_extract_epi32(temp, 2));
-    row3 = _mm_cvtsi32_si128(_mm_extract_epi32(temp, 3));
-  }
-
-  *(int32_t*)(dst + 0 * 4) = _mm_cvtsi128_si32(row0);
-  *(int32_t*)(dst + 1 * 4) = _mm_cvtsi128_si32(row1);
-  *(int32_t*)(dst + 2 * 4) = _mm_cvtsi128_si32(row2);
-  *(int32_t*)(dst + 3 * 4) = _mm_cvtsi128_si32(row3);
-}
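
The unpack/extract sequence in filter_4x4_avx2 is simply a 4x4 byte transpose, so a block filtered along the projection direction can also be written out row-major for horizontal modes. In scalar terms (illustrative only, not part of the patch):

    // Transpose a 4x4 block of bytes stored row-major.
    static inline void transpose_4x4_sketch(uint8_t dst[16], const uint8_t src[16])
    {
      for (int y = 0; y < 4; ++y) {
        for (int x = 0; x < 4; ++x) {
          dst[x * 4 + y] = src[y * 4 + x];
        }
      }
    }
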
-
-/**
- * \brief Linear interpolation for 8 pixels. Returns 8 filtered pixels in lower 64-bits of the register.
- * \param ref_main Reference pixels
- * \param delta_pos Fractional pixel precise position of sample displacement
- * \param x Sample offset in direction x in ref_main array
- */
-static INLINE __m128i filter_8x1_avx2(const uint8_t *ref_main, int16_t delta_pos, int x){
-
-  int8_t delta_int = delta_pos >> 5;
-  int8_t delta_fract = delta_pos & (32-1);
-  __m128i sample0 = _mm_cvtsi64_si128(*(uint64_t*)&(ref_main[x + delta_int]));
-  __m128i sample1 = _mm_cvtsi64_si128(*(uint64_t*)&(ref_main[x + delta_int + 1]));
-  __m128i pairs_lo = _mm_unpacklo_epi8(sample0, sample1);
-
-  __m128i weight = _mm_set1_epi16( (delta_fract << 8) | (32 - delta_fract) );
-  __m128i v_temp_lo = _mm_maddubs_epi16(pairs_lo, weight);
-  v_temp_lo = _mm_add_epi16(v_temp_lo, _mm_set1_epi16(16));
-  v_temp_lo = _mm_srli_epi16(v_temp_lo, 5);
-  sample0 = _mm_packus_epi16(v_temp_lo, v_temp_lo);
-
-  return sample0;
-}
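
The weight packing above leans on how _mm_maddubs_epi16 works: unsigned bytes from the first operand are multiplied by signed bytes from the second, and adjacent products are summed into 16-bit lanes. With the reference samples interleaved as (a, b) pairs and each 16-bit weight lane holding (delta_fract << 8) | (32 - delta_fract), every output lane becomes a * (32 - f) + b * f. A scalar emulation of that single step (illustrative only, hypothetical helper):

    static void maddubs_weight_sketch(int16_t *out, const uint8_t *interleaved_ab,
                                      int delta_fract, int n_pixels)
    {
      for (int i = 0; i < n_pixels; ++i) {
        int a = interleaved_ab[2 * i];      // ref_main[x + delta_int + i]
        int b = interleaved_ab[2 * i + 1];  // ref_main[x + delta_int + i + 1]
        out[i] = (int16_t)(a * (32 - delta_fract) + b * delta_fract);
      }
    }
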
-
-/**
- * \brief Linear interpolation for 8x8 block. Writes filtered 8x8 block to dst.
- * \param dst Destination buffer
- * \param ref_main Reference pixels
- * \param sample_disp Sample displacement per row
- * \param vertical_mode Mode direction, true if vertical
- */
-static void filter_8x8_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode){
-  __m128i row0 = filter_8x1_avx2(ref_main, 1 * sample_disp, 0);
-  __m128i row1 = filter_8x1_avx2(ref_main, 2 * sample_disp, 0);
-  __m128i row2 = filter_8x1_avx2(ref_main, 3 * sample_disp, 0);
-  __m128i row3 = filter_8x1_avx2(ref_main, 4 * sample_disp, 0);
-  __m128i row4 = filter_8x1_avx2(ref_main, 5 * sample_disp, 0);
-  __m128i row5 = filter_8x1_avx2(ref_main, 6 * sample_disp, 0);
-  __m128i row6 = filter_8x1_avx2(ref_main, 7 * sample_disp, 0);
-  __m128i row7 = filter_8x1_avx2(ref_main, 8 * sample_disp, 0);
-
-  //Transpose if horizontal mode
-  if (!vertical_mode) {
-    __m128i q0 = _mm_unpacklo_epi8(row0, row1);
-    __m128i q1 = _mm_unpacklo_epi8(row2, row3);
-    __m128i q2 = _mm_unpacklo_epi8(row4, row5);
-    __m128i q3 = _mm_unpacklo_epi8(row6, row7);
-
-    __m128i h0 = _mm_unpacklo_epi16(q0, q1);
-    __m128i h1 = _mm_unpacklo_epi16(q2, q3);
-    __m128i h2 = _mm_unpackhi_epi16(q0, q1);
-    __m128i h3 = _mm_unpackhi_epi16(q2, q3);
-
-    __m128i temp0 = _mm_unpacklo_epi32(h0, h1);
-    __m128i temp1 = _mm_unpackhi_epi32(h0, h1);
-    __m128i temp2 = _mm_unpacklo_epi32(h2, h3);
-    __m128i temp3 = _mm_unpackhi_epi32(h2, h3);
-
-    row0 = _mm_cvtsi64_si128(_mm_extract_epi64(temp0, 0));
-    row1 = _mm_cvtsi64_si128(_mm_extract_epi64(temp0, 1));
-    row2 = _mm_cvtsi64_si128(_mm_extract_epi64(temp1, 0));
-    row3 = _mm_cvtsi64_si128(_mm_extract_epi64(temp1, 1));
-    row4 = _mm_cvtsi64_si128(_mm_extract_epi64(temp2, 0));
-    row5 = _mm_cvtsi64_si128(_mm_extract_epi64(temp2, 1));
-    row6 = _mm_cvtsi64_si128(_mm_extract_epi64(temp3, 0));
-    row7 = _mm_cvtsi64_si128(_mm_extract_epi64(temp3, 1));
-  }
-
-  _mm_storel_epi64((__m128i*)(dst + 0 * 8), row0);
-  _mm_storel_epi64((__m128i*)(dst + 1 * 8), row1);
-  _mm_storel_epi64((__m128i*)(dst + 2 * 8), row2);
-  _mm_storel_epi64((__m128i*)(dst + 3 * 8), row3);
-  _mm_storel_epi64((__m128i*)(dst + 4 * 8), row4);
-  _mm_storel_epi64((__m128i*)(dst + 5 * 8), row5);
-  _mm_storel_epi64((__m128i*)(dst + 6 * 8), row6);
-  _mm_storel_epi64((__m128i*)(dst + 7 * 8), row7);
-}
-
-/**
- * \brief Linear interpolation for two 16 pixels. Returns 8 filtered pixels in lower 64-bits of both lanes of the YMM register.
- * \param ref_main Reference pixels
- * \param delta_pos Fractional pixel precise position of sample displacement
- * \param x Sample offset in direction x in ref_main array
- */
-static INLINE __m256i filter_16x1_avx2(const uint8_t *ref_main, int16_t delta_pos, int x){
-
-  int8_t delta_int = delta_pos >> 5;
-  int8_t delta_fract = delta_pos & (32-1);
-  __m256i sample0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)&(ref_main[x + delta_int])));
-  sample0 = _mm256_packus_epi16(sample0, sample0);
-  __m256i sample1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)&(ref_main[x + delta_int + 1])));
-  sample1 = _mm256_packus_epi16(sample1, sample1);
-  __m256i pairs_lo = _mm256_unpacklo_epi8(sample0, sample1);
-
-  __m256i weight = _mm256_set1_epi16( (delta_fract << 8) | (32 - delta_fract) );
-  __m256i v_temp_lo = _mm256_maddubs_epi16(pairs_lo, weight);
-  v_temp_lo = _mm256_add_epi16(v_temp_lo, _mm256_set1_epi16(16));
-  v_temp_lo = _mm256_srli_epi16(v_temp_lo, 5);
-  sample0 = _mm256_packus_epi16(v_temp_lo, v_temp_lo);
-
-  return sample0;
-}
-
-/**
- * \brief Linear interpolation for 16x16 block. Writes filtered 16x16 block to dst.
- * \param dst Destination buffer
- * \param ref_main Reference pixels
- * \param sample_disp Sample displacement per row
- * \param vertical_mode Mode direction, true if vertical
- */
-static void filter_16x16_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode){
-  for (int y = 0; y < 16; y += 8) {
-    __m256i row0 = filter_16x1_avx2(ref_main, (y + 1) * sample_disp, 0);
-    __m256i row1 = filter_16x1_avx2(ref_main, (y + 2) * sample_disp, 0);
-    __m256i row2 = filter_16x1_avx2(ref_main, (y + 3) * sample_disp, 0);
-    __m256i row3 = filter_16x1_avx2(ref_main, (y + 4) * sample_disp, 0);
-    __m256i row4 = filter_16x1_avx2(ref_main, (y + 5) * sample_disp, 0);
-    __m256i row5 = filter_16x1_avx2(ref_main, (y + 6) * sample_disp, 0);
-    __m256i row6 = filter_16x1_avx2(ref_main, (y + 7) * sample_disp, 0);
-    __m256i row7 = filter_16x1_avx2(ref_main, (y + 8) * sample_disp, 0);
-
-    if (!vertical_mode) {
-      __m256i q0 = _mm256_unpacklo_epi8(row0, row1);
-      __m256i q1 = _mm256_unpacklo_epi8(row2, row3);
-      __m256i q2 = _mm256_unpacklo_epi8(row4, row5);
-      __m256i q3 = _mm256_unpacklo_epi8(row6, row7);
-
-      __m256i h0 = _mm256_unpacklo_epi16(q0, q1);
-      __m256i h1 = _mm256_unpacklo_epi16(q2, q3);
-      __m256i h2 = _mm256_unpackhi_epi16(q0, q1);
-      __m256i h3 = _mm256_unpackhi_epi16(q2, q3);
-
-      __m256i temp0 = _mm256_unpacklo_epi32(h0, h1);
-      __m256i temp1 = _mm256_unpackhi_epi32(h0, h1);
-      __m256i temp2 = _mm256_unpacklo_epi32(h2, h3);
-      __m256i temp3 = _mm256_unpackhi_epi32(h2, h3);
-
-      row0 = _mm256_unpacklo_epi64(temp0, temp0);
-      row1 = _mm256_unpackhi_epi64(temp0, temp0);
-      row2 = _mm256_unpacklo_epi64(temp1, temp1);
-      row3 = _mm256_unpackhi_epi64(temp1, temp1);
-      row4 = _mm256_unpacklo_epi64(temp2, temp2);
-      row5 = _mm256_unpackhi_epi64(temp2, temp2);
-      row6 = _mm256_unpacklo_epi64(temp3, temp3);
-      row7 = _mm256_unpackhi_epi64(temp3, temp3);
-
-      //x and y must be flipped due to transpose
-      int rx = y;
-      int ry = 0;
-
-      *(int64_t*)(dst + (ry + 0) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row0));
-      *(int64_t*)(dst + (ry + 1) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row1));
-      *(int64_t*)(dst + (ry + 2) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row2));
-      *(int64_t*)(dst + (ry + 3) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row3));
-      *(int64_t*)(dst + (ry + 4) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row4));
-      *(int64_t*)(dst + (ry + 5) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row5));
-      *(int64_t*)(dst + (ry + 6) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row6));
-      *(int64_t*)(dst + (ry + 7) * 16 + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row7));
-
-      *(int64_t*)(dst + (ry + 8) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row0, 1));
-      *(int64_t*)(dst + (ry + 9) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row1, 1));
-      *(int64_t*)(dst + (ry + 10) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row2, 1));
-      *(int64_t*)(dst + (ry + 11) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row3, 1));
-      *(int64_t*)(dst + (ry + 12) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row4, 1));
-      *(int64_t*)(dst + (ry + 13) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row5, 1));
-      *(int64_t*)(dst + (ry + 14) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row6, 1));
-      *(int64_t*)(dst + (ry + 15) * 16 + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row7, 1));
-    } else {
-
-      //Set ry for the lower half of the block
-      int rx = 0;
-      int ry = y;
-
-      row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(3,1,2,0));
-      row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(2,0,3,1));
-      row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(3,1,2,0));
-      row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(2,0,3,1));
-      row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(3,1,2,0));
-      row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(2,0,3,1));
-      row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(3,1,2,0));
-      row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(2,0,3,1));
-
-      _mm_storeu_si128((__m128i*)(dst + (ry + 0) * 16 + rx), _mm256_castsi256_si128(row0));
-      _mm_storeu_si128((__m128i*)(dst + (ry + 1) * 16 + rx), _mm256_castsi256_si128(row1));
-      _mm_storeu_si128((__m128i*)(dst + (ry + 2) * 16 + rx), _mm256_castsi256_si128(row2));
-      _mm_storeu_si128((__m128i*)(dst + (ry + 3) * 16 + rx), _mm256_castsi256_si128(row3));
-      _mm_storeu_si128((__m128i*)(dst + (ry + 4) * 16 + rx), _mm256_castsi256_si128(row4));
-      _mm_storeu_si128((__m128i*)(dst + (ry + 5) * 16 + rx), _mm256_castsi256_si128(row5));
-      _mm_storeu_si128((__m128i*)(dst + (ry + 6) * 16 + rx), _mm256_castsi256_si128(row6));
-      _mm_storeu_si128((__m128i*)(dst + (ry + 7) * 16 + rx), _mm256_castsi256_si128(row7));
-    }
-  }
-}
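
A note on the vertical-mode stores above: filter_16x1_avx2 leaves the 16 filtered pixels split across the two 128-bit lanes (pixels 0..7 in the low qword of lane 0, pixels 8..15 in the low qword of lane 1), so each row is permuted before the 16-byte store. Roughly what the _MM_SHUFFLE(3,1,2,0) variant achieves (illustrative sketch, not part of the patch):

    #include <immintrin.h>

    // Gather the two useful qwords into the low 128 bits so that a single
    // _mm_storeu_si128 can write one 16-pixel row.
    static inline __m128i gather_row_sketch(__m256i row)
    {
      __m256i packed = _mm256_permute4x64_epi64(row, _MM_SHUFFLE(3, 1, 2, 0));
      return _mm256_castsi256_si128(packed);
    }
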
-
-/**
- * \brief Linear interpolation for NxN blocks 16x16 and larger. Writes filtered NxN block to dst.
- * \param dst Destination buffer
- * \param ref_main Reference pixels
- * \param sample_disp Sample displacement per row
- * \param vertical_mode Mode direction, true if vertical
- * \param width Block width
- */
-static void filter_NxN_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_disp, bool vertical_mode, int width){
-  for (int y = 0; y < width; y += 8) {
-    for (int x = 0; x < width; x += 16) {
-      __m256i row0 = filter_16x1_avx2(ref_main, (y + 1) * sample_disp, x);
-      __m256i row1 = filter_16x1_avx2(ref_main, (y + 2) * sample_disp, x);
-      __m256i row2 = filter_16x1_avx2(ref_main, (y + 3) * sample_disp, x);
-      __m256i row3 = filter_16x1_avx2(ref_main, (y + 4) * sample_disp, x);
-      __m256i row4 = filter_16x1_avx2(ref_main, (y + 5) * sample_disp, x);
-      __m256i row5 = filter_16x1_avx2(ref_main, (y + 6) * sample_disp, x);
-      __m256i row6 = filter_16x1_avx2(ref_main, (y + 7) * sample_disp, x);
-      __m256i row7 = filter_16x1_avx2(ref_main, (y + 8) * sample_disp, x);
-
-      //Transpose if horizontal mode
-      if (!vertical_mode) {
-        __m256i q0 = _mm256_unpacklo_epi8(row0, row1);
-        __m256i q1 = _mm256_unpacklo_epi8(row2, row3);
-        __m256i q2 = _mm256_unpacklo_epi8(row4, row5);
-        __m256i q3 = _mm256_unpacklo_epi8(row6, row7);
-
-        __m256i h0 = _mm256_unpacklo_epi16(q0, q1);
-        __m256i h1 = _mm256_unpacklo_epi16(q2, q3);
-        __m256i h2 = _mm256_unpackhi_epi16(q0, q1);
-        __m256i h3 = _mm256_unpackhi_epi16(q2, q3);
-
-        __m256i temp0 = _mm256_unpacklo_epi32(h0, h1);
-        __m256i temp1 = _mm256_unpackhi_epi32(h0, h1);
-        __m256i temp2 = _mm256_unpacklo_epi32(h2, h3);
-        __m256i temp3 = _mm256_unpackhi_epi32(h2, h3);
-
-        row0 = _mm256_unpacklo_epi64(temp0, temp0);
-        row1 = _mm256_unpackhi_epi64(temp0, temp0);
-        row2 = _mm256_unpacklo_epi64(temp1, temp1);
-        row3 = _mm256_unpackhi_epi64(temp1, temp1);
-        row4 = _mm256_unpacklo_epi64(temp2, temp2);
-        row5 = _mm256_unpackhi_epi64(temp2, temp2);
-        row6 = _mm256_unpacklo_epi64(temp3, temp3);
-        row7 = _mm256_unpackhi_epi64(temp3, temp3);
-
-        //x and y must be flipped due to transpose
-        int rx = y;
-        int ry = x;
-
-        *(int64_t*)(dst + (ry + 0) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row0));
-        *(int64_t*)(dst + (ry + 1) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row1));
-        *(int64_t*)(dst + (ry + 2) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row2));
-        *(int64_t*)(dst + (ry + 3) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row3));
-        *(int64_t*)(dst + (ry + 4) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row4));
-        *(int64_t*)(dst + (ry + 5) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row5));
-        *(int64_t*)(dst + (ry + 6) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row6));
-        *(int64_t*)(dst + (ry + 7) * width + rx) = _mm_cvtsi128_si64(_mm256_castsi256_si128(row7));
-
-        *(int64_t*)(dst + (ry + 8) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row0, 1));
-        *(int64_t*)(dst + (ry + 9) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row1, 1));
-        *(int64_t*)(dst + (ry + 10) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row2, 1));
-        *(int64_t*)(dst + (ry + 11) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row3, 1));
-        *(int64_t*)(dst + (ry + 12) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row4, 1));
-        *(int64_t*)(dst + (ry + 13) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row5, 1));
-        *(int64_t*)(dst + (ry + 14) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row6, 1));
-        *(int64_t*)(dst + (ry + 15) * width + rx) = _mm_cvtsi128_si64(_mm256_extracti128_si256(row7, 1));
-      } else {
-
-        //Move all filtered pixels to the lower lane to reduce memory accesses
-        row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(3,1,2,0));
-        row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(2,0,3,1));
-        row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(3,1,2,0));
-        row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(2,0,3,1));
-        row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(3,1,2,0));
-        row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(2,0,3,1));
-        row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(3,1,2,0));
-        row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(2,0,3,1));
-
-        _mm_storeu_si128((__m128i*)(dst + (y + 0) * width + x), _mm256_castsi256_si128(row0));
-        _mm_storeu_si128((__m128i*)(dst + (y + 1) * width + x), _mm256_castsi256_si128(row1));
-        _mm_storeu_si128((__m128i*)(dst + (y + 2) * width + x), _mm256_castsi256_si128(row2));
-        _mm_storeu_si128((__m128i*)(dst + (y + 3) * width + x), _mm256_castsi256_si128(row3));
-        _mm_storeu_si128((__m128i*)(dst + (y + 4) * width + x), _mm256_castsi256_si128(row4));
-        _mm_storeu_si128((__m128i*)(dst + (y + 5) * width + x), _mm256_castsi256_si128(row5));
-        _mm_storeu_si128((__m128i*)(dst + (y + 6) * width + x), _mm256_castsi256_si128(row6));
-        _mm_storeu_si128((__m128i*)(dst + (y + 7) * width + x), _mm256_castsi256_si128(row7));
-      }
-    }
-  }
-}
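
All of the removed block filters follow the same pattern: output row y samples the main reference at the fixed-point offset (y + 1) * sample_disp, in 1/32-sample units, and horizontal modes write to the transposed position. An equivalent scalar driver, for orientation only (not part of the patch):

    static void filter_block_sketch(uint8_t *dst, const uint8_t *ref_main,
                                    int sample_disp, bool vertical_mode, int width)
    {
      for (int y = 0; y < width; ++y) {
        int delta_pos   = (y + 1) * sample_disp;  // 1/32-sample units
        int delta_int   = delta_pos >> 5;
        int delta_fract = delta_pos & 31;
        for (int x = 0; x < width; ++x) {
          int a = ref_main[x + delta_int];
          int b = ref_main[x + delta_int + 1];
          uint8_t p = (uint8_t)(((32 - delta_fract) * a + delta_fract * b + 16) >> 5);
          if (vertical_mode) dst[y * width + x] = p;  // rows follow the prediction rows
          else               dst[x * width + y] = p;  // horizontal modes are transposed
        }
      }
    }
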
 
 /**
  * \brief Generate angular predictions.
  * \param log2_width Log2 of width, range 2..5.

@@ -378,81 +42,334 @@ static void filter_NxN_avx2(uint8_t *dst, const uint8_t *ref_main, int sample_di
 static void kvz_angular_pred_avx2(
   const int_fast8_t log2_width,
   const int_fast8_t intra_mode,
-  const uint8_t *const in_ref_above,
-  const uint8_t *const in_ref_left,
-  uint8_t *const dst)
+  const int_fast8_t channel_type,
+  const kvz_pixel *const in_ref_above,
+  const kvz_pixel *const in_ref_left,
+  kvz_pixel *const dst)
 {
-  assert(log2_width >= 2 && log2_width <= 5);
-  assert(intra_mode >= 2 && intra_mode <= 34);
-
-  static const int8_t modedisp2sampledisp[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
-  static const int16_t modedisp2invsampledisp[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / sampledisp
+  assert(log2_width >= 2 && log2_width <= 5);
+  assert(intra_mode >= 2 && intra_mode <= 66);
+
+  static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 };
+  static const int16_t modedisp2invsampledisp[32] = { 0, 16384, 8192, 5461, 4096, 2731, 2048, 1638, 1365, 1170, 1024, 910, 819, 712, 630, 565, 512, 468, 420, 364, 321, 287, 256, 224, 191, 161, 128, 96, 64, 48, 32, 16 }; // (512 * 32) / sampledisp
+  static const int32_t pre_scale[] = { 8, 7, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, -1, -1, -2, -3 };
+  static const int16_t intraGaussFilter[32][4] = {
+    { 16, 32, 16, 0 },
+    { 15, 29, 17, 3 },
+    { 15, 29, 17, 3 },
+    { 14, 29, 18, 3 },
+    { 13, 29, 18, 4 },
+    { 13, 28, 19, 4 },
+    { 13, 28, 19, 4 },
+    { 12, 28, 20, 4 },
+    { 11, 28, 20, 5 },
+    { 11, 27, 21, 5 },
+    { 10, 27, 22, 5 },
+    { 9, 27, 22, 6 },
+    { 9, 26, 23, 6 },
+    { 9, 26, 23, 6 },
+    { 8, 25, 24, 7 },
+    { 8, 25, 24, 7 },
+    { 8, 24, 24, 8 },
+    { 7, 24, 25, 8 },
+    { 7, 24, 25, 8 },
+    { 6, 23, 26, 9 },
+    { 6, 23, 26, 9 },
+    { 6, 22, 27, 9 },
+    { 5, 22, 27, 10 },
+    { 5, 21, 27, 11 },
+    { 5, 20, 28, 11 },
+    { 4, 20, 28, 12 },
+    { 4, 19, 28, 13 },
+    { 4, 19, 28, 13 },
+    { 4, 18, 29, 13 },
+    { 3, 18, 29, 14 },
+    { 3, 17, 29, 15 },
+    { 3, 17, 29, 15 }
+  };
+  static const int16_t cubic_filter[32][4] =
+  {
+    { 0, 64, 0, 0 },
+    { -1, 63, 2, 0 },
+    { -2, 62, 4, 0 },
+    { -2, 60, 7, -1 },
+    { -2, 58, 10, -2 },
+    { -3, 57, 12, -2 },
+    { -4, 56, 14, -2 },
+    { -4, 55, 15, -2 },
+    { -4, 54, 16, -2 },
+    { -5, 53, 18, -2 },
+    { -6, 52, 20, -2 },
+    { -6, 49, 24, -3 },
+    { -6, 46, 28, -4 },
+    { -5, 44, 29, -4 },
+    { -4, 42, 30, -4 },
+    { -4, 39, 33, -4 },
+    { -4, 36, 36, -4 },
+    { -4, 33, 39, -4 },
+    { -4, 30, 42, -4 },
+    { -4, 29, 44, -5 },
+    { -4, 28, 46, -6 },
+    { -3, 24, 49, -6 },
+    { -2, 20, 52, -6 },
+    { -2, 18, 53, -5 },
+    { -2, 16, 54, -4 },
+    { -2, 15, 55, -4 },
+    { -2, 14, 56, -4 },
+    { -2, 12, 57, -3 },
+    { -2, 10, 58, -2 },
+    { -1, 7, 60, -2 },
+    { 0, 4, 62, -2 },
+    { 0, 2, 63, -1 },
+  };
 
   // Temporary buffer for modes 11-25.
   // It only needs to be big enough to hold indices from -width to width-1.
-  uint8_t tmp_ref[2 * 32];
-  const int_fast8_t width = 1 << log2_width;
+  kvz_pixel tmp_ref[2 * 128] = { 0 };
+  kvz_pixel temp_main[2 * 128] = { 0 };
+  kvz_pixel temp_side[2 * 128] = { 0 };
+  const int_fast32_t width = 1 << log2_width;
+
+  uint32_t pred_mode = intra_mode; // ToDo: handle WAIP
 
   // Whether to swap references to always project on the left reference row.
-  const bool vertical_mode = intra_mode >= 18;
+  const bool vertical_mode = intra_mode >= 34;
   // Modes distance to horizontal or vertical mode.
-  const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode;
+  const int_fast8_t mode_disp = vertical_mode ? pred_mode - 50 : -(pred_mode - 18);
+  //const int_fast8_t mode_disp = vertical_mode ? intra_mode - 26 : 10 - intra_mode;
 
   // Sample displacement per column in fractions of 32.
   const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
 
+  // TODO: replace latter width with height
+  int scale = MIN(2, log2_width - pre_scale[abs(mode_disp)]);
+
   // Pointer for the reference we are interpolating from.
-  const uint8_t *ref_main;
+  kvz_pixel *ref_main;
   // Pointer for the other reference.
-  const uint8_t *ref_side;
+  const kvz_pixel *ref_side;
 
   // Set ref_main and ref_side such that, when indexed with 0, they point to
   // index 0 in block coordinates.
   if (sample_disp < 0) {
-    // Negative sample_disp means, we need to use both references.
-    ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
-    ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
-
-    // Move the reference pixels to start from the middle to the later half of
-    // the tmp_ref, so there is room for negative indices.
-    for (int_fast8_t x = -1; x < width; ++x) {
-      tmp_ref[x + width] = ref_main[x];
-    }
-    // Get a pointer to block index 0 in tmp_ref.
-    ref_main = tmp_ref + width;
-
-    // Extend the side reference to the negative indices of main reference.
-    int_fast32_t col_sample_disp = 128; // rounding for the ">> 8"
-    int_fast16_t inv_abs_sample_disp = modedisp2invsampledisp[abs(mode_disp)];
-    int_fast8_t most_negative_index = (width * sample_disp) >> 5;
-    for (int_fast8_t x = -2; x >= most_negative_index; --x) {
-      col_sample_disp += inv_abs_sample_disp;
-      int_fast8_t side_index = col_sample_disp >> 8;
-      tmp_ref[x + width] = ref_side[side_index - 1];
-    }
-  }
-  else {
-    // sample_disp >= 0 means we don't need to refer to negative indices,
-    // which means we can just use the references as is.
-    ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
-    ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
-  }
-
-  // The mode is not horizontal or vertical, we have to do interpolation.
-  switch (width) {
-  case 4:
-    filter_4x4_avx2(dst, ref_main, sample_disp, vertical_mode);
-    break;
-  case 8:
-    filter_8x8_avx2(dst, ref_main, sample_disp, vertical_mode);
-    break;
-  case 16:
-    filter_16x16_avx2(dst, ref_main, sample_disp, vertical_mode);
-    break;
-  default:
-    filter_NxN_avx2(dst, ref_main, sample_disp, vertical_mode, width);
-    break;
-  }
+    for (int i = 0; i <= width + 1; i++) {
+      temp_main[width + i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]);
+      temp_side[width + i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]);
+    }
+
+    ref_main = temp_main + width;
+    ref_side = temp_side + width;
+
+    for (int i = -width; i <= -1; i++) {
+      ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, width)];
+    }
+
+    //const uint32_t index_offset = width + 1;
+    //const int32_t last_index = width;
+    //const int_fast32_t most_negative_index = (width * sample_disp) >> 5;
+    //// Negative sample_disp means, we need to use both references.
+
+    //// TODO: update refs to take into account variating block size and shapes
+    //// (height is not always equal to width)
+    //ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
+    //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
+
+    //// Move the reference pixels to start from the middle to the later half of
+    //// the tmp_ref, so there is room for negative indices.
+    //for (int_fast32_t x = -1; x < width; ++x) {
+    //  tmp_ref[x + index_offset] = ref_main[x];
+    //}
+    //// Get a pointer to block index 0 in tmp_ref.
+    //ref_main = &tmp_ref[index_offset];
+    //tmp_ref[index_offset -1] = tmp_ref[index_offset];
+
+    //// Extend the side reference to the negative indices of main reference.
+    //int_fast32_t col_sample_disp = 128; // rounding for the ">> 8"
+    //int_fast16_t inv_abs_sample_disp = modedisp2invsampledisp[abs(mode_disp)];
+    //// TODO: add 'vertical_mode ? height : width' instead of 'width'
+    //
+    //for (int_fast32_t x = -1; x > most_negative_index; x--) {
+    //  col_sample_disp += inv_abs_sample_disp;
+    //  int_fast32_t side_index = col_sample_disp >> 8;
+    //  tmp_ref[x + index_offset - 1] = ref_side[side_index - 1];
+    //}
+    //tmp_ref[last_index + index_offset] = tmp_ref[last_index + index_offset - 1];
+    //tmp_ref[most_negative_index + index_offset - 1] = tmp_ref[most_negative_index + index_offset];
+  }
+  else {
+    for (int i = 0; i <= (width << 1); i++) {
+      temp_main[i] = (vertical_mode ? in_ref_above[i] : in_ref_left[i]);
+      temp_side[i] = (vertical_mode ? in_ref_left[i] : in_ref_above[i]);
+    }
+
+    const int log2_ratio = 0;
+    const int s = 0;
+    const int max_index = (0 << s) + 2;
+    const int ref_length = width << 1;
+    const kvz_pixel val = temp_main[ref_length];
+    for (int j = 0; j <= max_index; j++) {
+      temp_main[ref_length + j] = val;
+    }
+
+    ref_main = temp_main;
+    ref_side = temp_side;
+    //// sample_disp >= 0 means we don't need to refer to negative indices,
+    //// which means we can just use the references as is.
+    //ref_main = (vertical_mode ? in_ref_above : in_ref_left) + 1;
+    //ref_side = (vertical_mode ? in_ref_left : in_ref_above) + 1;
+
+    //memcpy(tmp_ref + width, ref_main, (width*2) * sizeof(kvz_pixel));
+    //ref_main = &tmp_ref[width];
+    //tmp_ref[width-1] = tmp_ref[width];
+    //int8_t last_index = 1 + width*2;
+    //tmp_ref[width + last_index] = tmp_ref[width + last_index - 1];
+  }
+
+  if (sample_disp != 0) {
+    // The mode is not horizontal or vertical, we have to do interpolation.
+
+    int_fast32_t delta_pos = 0;
+    for (int_fast32_t y = 0; y < width; ++y) {
+      delta_pos += sample_disp;
+      int_fast32_t delta_int = delta_pos >> 5;
+      int_fast32_t delta_fract = delta_pos & (32 - 1);
+
+      if ((abs(sample_disp) & 0x1F) != 0) {
+
+        // Luma Channel
+        if (channel_type == 0) {
+          int32_t ref_main_index = delta_int;
+          kvz_pixel p[4];
+          bool use_cubic = true; // Default to cubic filter
+          static const int kvz_intra_hor_ver_dist_thres[8] = { 24, 24, 24, 14, 2, 0, 0, 0 };
+          int filter_threshold = kvz_intra_hor_ver_dist_thres[log2_width];
+          int dist_from_vert_or_hor = MIN(abs(pred_mode - 50), abs(pred_mode - 18));
+          if (dist_from_vert_or_hor > filter_threshold) {
+            static const int16_t modedisp2sampledisp[32] = { 0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35, 39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024 };
+            const int_fast8_t mode_disp = (pred_mode >= 34) ? pred_mode - 50 : 18 - pred_mode;
+            const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
+            if ((abs(sample_disp) & 0x1F) != 0)
+            {
+              use_cubic = false;
+            }
+          }
+          const int16_t filter_coeff[4] = { 16 - (delta_fract >> 1), 32 - (delta_fract >> 1), 16 + (delta_fract >> 1), delta_fract >> 1 };
+          int16_t const * const f = use_cubic ? cubic_filter[delta_fract] : filter_coeff;
+          // Do 4-tap intra interpolation filtering
+          for (int_fast32_t x = 0; x < width; x++, ref_main_index++) {
+            p[0] = ref_main[ref_main_index];
+            p[1] = ref_main[ref_main_index + 1];
+            p[2] = ref_main[ref_main_index + 2];
+            p[3] = ref_main[ref_main_index + 3];
+
+            dst[y * width + x] = CLIP_TO_PIXEL(((int32_t)(f[0] * p[0]) + (int32_t)(f[1] * p[1]) + (int32_t)(f[2] * p[2]) + (int32_t)(f[3] * p[3]) + 32) >> 6);
+
+          }
+        }
+        else {
+
+          // Do linear filtering
+          for (int_fast32_t x = 0; x < width; ++x) {
+            kvz_pixel ref1 = ref_main[x + delta_int + 1];
+            kvz_pixel ref2 = ref_main[x + delta_int + 2];
+            dst[y * width + x] = ref1 + ((delta_fract * (ref2-ref1) + 16) >> 5);
+          }
+        }
+      }
+      else {
+        // Just copy the integer samples
+        for (int_fast32_t x = 0; x < width; x++) {
+          dst[y * width + x] = ref_main[x + delta_int + 1];
+        }
+      }
+
+      // PDPC
+      bool PDPC_filter = (width >= 4 || channel_type != 0);
+      if (pred_mode > 1 && pred_mode < 67) {
+        if (mode_disp < 0) {
+          PDPC_filter = false;
+        }
+        else if (mode_disp > 0) {
+          PDPC_filter = (scale >= 0);
+        }
+      }
+      if(PDPC_filter) {
+        int inv_angle_sum = 256;
+        for (int x = 0; x < MIN(3 << scale, width); x++) {
+          inv_angle_sum += modedisp2invsampledisp[abs(mode_disp)];
+
+          int wL = 32 >> (2 * x >> scale);
+          const kvz_pixel left = ref_side[y + (inv_angle_sum >> 9) + 1];
+          dst[y * width + x] = dst[y * width + x] + ((wL * (left - dst[y * width + x]) + 32) >> 6);
+        }
+      }
+
+      /*
+      if (pred_mode == 2 || pred_mode == 66) {
+        int wT = 16 >> MIN(31, ((y << 1) >> scale));
+        for (int x = 0; x < width; x++) {
+          int wL = 16 >> MIN(31, ((x << 1) >> scale));
+          if (wT + wL == 0) break;
+          int c = x + y + 1;
+          if (c >= 2 * width) { wL = 0; }
+          if (c >= 2 * width) { wT = 0; }
+          const kvz_pixel left = (wL != 0) ? ref_side[c] : 0;
+          const kvz_pixel top = (wT != 0) ? ref_main[c] : 0;
+          dst[y * width + x] = CLIP_TO_PIXEL((wL * left + wT * top + (64 - wL - wT) * dst[y * width + x] + 32) >> 6);
+        }
+      } else if (sample_disp == 0 || sample_disp >= 12) {
+        int inv_angle_sum_0 = 2;
+        for (int x = 0; x < width; x++) {
+          inv_angle_sum_0 += modedisp2invsampledisp[abs(mode_disp)];
+          int delta_pos_0 = inv_angle_sum_0 >> 2;
+          int delta_frac_0 = delta_pos_0 & 63;
+          int delta_int_0 = delta_pos_0 >> 6;
+          int delta_y = y + delta_int_0 + 1;
+          // TODO: convert to JVET_K0500_WAIP
+          if (delta_y > width + width - 1) break;
+
+          int wL = 32 >> MIN(31, ((x << 1) >> scale));
+          if (wL == 0) break;
+          const kvz_pixel *p = ref_side + delta_y - 1;
+          kvz_pixel left = p[delta_frac_0 >> 5];
+          dst[y * width + x] = CLIP_TO_PIXEL((wL * left + (64 - wL) * dst[y * width + x] + 32) >> 6);
+        }
+      }*/
+    }
+  }
+  else {
+    // Mode is horizontal or vertical, just copy the pixels.
+
+    // TODO: update outer loop to use height instead of width
+    for (int_fast32_t y = 0; y < width; ++y) {
+      for (int_fast32_t x = 0; x < width; ++x) {
+        dst[y * width + x] = ref_main[x + 1];
+      }
+      if ((width >= 4 || channel_type != 0) && sample_disp >= 0) {
+        int scale = (log2_width + log2_width - 2) >> 2;
+        const kvz_pixel top_left = ref_main[0];
+        const kvz_pixel left = ref_side[1 + y];
+        for (int i = 0; i < MIN(3 << scale, width); i++) {
+          const int wL = 32 >> (2 * i >> scale);
+          const kvz_pixel val = dst[y * width + i];
+          dst[y * width + i] = CLIP_TO_PIXEL(val + ((wL * (left - top_left) + 32) >> 6));
+        }
+      }
+    }
+  }
+
+  // Flip the block if this was a horizontal mode.
+  if (!vertical_mode) {
+    for (int_fast32_t y = 0; y < width - 1; ++y) {
+      for (int_fast32_t x = y + 1; x < width; ++x) {
+        SWAP(dst[y * width + x], dst[x * width + y], kvz_pixel);
+      }
+    }
+  }
 }
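
With the VVC mode range (2..66) copied in here, the displacement tables now index up to 31. For example, mode 40 is a vertical-side mode, so mode_disp = 40 - 50 = -10 and sample_disp = -modedisp2sampledisp[10] = -16, i.e. the projection moves half a sample per row. A small sketch of that mapping (illustrative only, mirrors the tables above; assumes <stdint.h> and <stdlib.h> for abs):

    // Per-row displacement of a VVC angular mode, in 1/32-sample units.
    static int angular_sample_disp_sketch(int pred_mode)
    {
      static const int16_t modedisp2sampledisp[32] = {
        0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 23, 26, 29, 32, 35,
        39, 45, 51, 57, 64, 73, 86, 102, 128, 171, 256, 341, 512, 1024
      };
      const int vertical  = pred_mode >= 34;
      const int mode_disp = vertical ? pred_mode - 50 : -(pred_mode - 18);
      return (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
    }
    // Example: angular_sample_disp_sketch(40) == -16  (half a sample per row).
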

@@ -916,7 +833,7 @@ int kvz_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth)
 #if COMPILE_INTEL_AVX2 && defined X86_64
 #if KVZ_BIT_DEPTH == 8
   if (bitdepth == 8) {
-    //success &= kvz_strategyselector_register(opaque, "angular_pred", "avx2", 40, &kvz_angular_pred_avx2);
+    success &= kvz_strategyselector_register(opaque, "angular_pred", "avx2", 40, &kvz_angular_pred_avx2);
     success &= kvz_strategyselector_register(opaque, "intra_pred_planar", "avx2", 40, &kvz_intra_pred_planar_avx2);
     success &= kvz_strategyselector_register(opaque, "intra_pred_filtered_dc", "avx2", 40, &kvz_intra_pred_filtered_dc_avx2);
   }