some modifications made to prevent reading too much memory in the AVX2 bipred reconstruction
parent 1d935ee888
commit 508b218a12
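The old code always widened a full vector's worth of pixels, even when the block was narrower, so the loads could read past the data that was actually needed. The hunks below add width-dependent loads and stores. A minimal sketch of the difference, with illustrative names that are not taken from the commit:

```c
#include <immintrin.h>
#include <stdint.h>

/* Illustrative sketch only, not code from the commit. */
static __m256i widen_row_before(const uint8_t *src)
{
  /* Old pattern: always read 16 bytes, even for a 4- or 8-pixel-wide block. */
  return _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)src));
}

static __m256i widen_row_after_w8(const uint8_t *src)
{
  /* New pattern for width 8: read only the 8 bytes the block actually covers. */
  return _mm256_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)src));
}
```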
@@ -729,6 +729,8 @@ static void inter_recon_bipred_no_mov_avx2(
   kvz_pixel* temp_lcu_u,
   kvz_pixel* temp_lcu_v) {
 
+  // This function is used only when kvazaar can't find any movement from the current block
+
   int y_in_lcu, x_in_lcu;
   __m256i sample0_epi8, sample1_epi8, temp_y_epi8;
@@ -933,6 +935,8 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
   int offset = 1 << (shift - 1);
   int shift_left = 14 - KVZ_BIT_DEPTH;
   __m256i temp_epi16_y, temp_epi8, temp_y_epi16, sample0_epi16, sample1_epi16;
+  __m256i temp_u_epi16 = _mm256_setzero_si256();
+  __m256i temp_v_epi16 = _mm256_setzero_si256();
   int start_point = 0;
   int start_point_uv = 0;
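Every branch that follows implements the same per-pixel operation noted in the comments, (sample1 + sample2 + offset) >> shift, with the result clamped to the 8-bit range when it is packed. A scalar sketch of that blend, with illustrative names; `shift` and the sample buffers are set up earlier in the function, outside the hunks shown here:

```c
#include <stdint.h>

/* Scalar sketch of the bipred blend the AVX2 branches vectorize.
 * Names are illustrative, not taken from the source. */
static void bipred_average_row(uint8_t *dst, const int16_t *sample0,
                               const int16_t *sample1, int width, int shift)
{
  const int offset = 1 << (shift - 1);        /* same rounding term as in the diff */
  for (int x = 0; x < width; ++x) {
    int val = (sample0[x] + sample1[x] + offset) >> shift;
    if (val < 0)   val = 0;                   /* packus does this clamping in the SIMD code */
    if (val > 255) val = 255;
    dst[x] = (uint8_t)val;
  }
}
```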
@@ -953,6 +957,56 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
     for (int temp_x = 0; temp_x < width; temp_x += 16) {
       x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
 
+      switch (width)
+      {
+
+      case 4:
+
+        // Load total of 8 elements from memory to vector (4 are used)
+        sample0_epi16 = hi_prec_luma_rec0 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+          _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        sample1_epi16 = hi_prec_luma_rec1 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+          _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        // (sample1 + sample2 + offset)>>shift
+        temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
+        temp_y_epi16 = _mm256_srai_epi16(temp_y_epi16, shift);
+
+        temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
+
+        pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]);
+        *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8));
+        break;
+
+      case 8:
+
+        // Load total of 8 elements from memory to vector
+        sample0_epi16 = hi_prec_luma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+          _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        sample1_epi16 = hi_prec_luma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+          _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        // (sample1 + sample2 + offset)>>shift
+        temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
+        temp_y_epi16 = _mm256_srai_epi16(temp_y_epi16, shift);
+
+        // Pack the bits from 1-bit to 8-bit
+        temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
+
+        // Store 64-bits from vector to memory
+        _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
+
+        break;
+
+      case 16:
+
         // Load total of 16 elements from memory to vector
         sample0_epi16 = hi_prec_luma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu])) :
           _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadu_si128((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
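The switch added above picks a load that covers only the current block width instead of always reading a full 16 source bytes. A standalone sketch of that pattern follows; the helper name is illustrative, and unlike the commit (which still loads 8 bytes and simply ignores half of them in the 4-wide case) it reads exactly as many bytes as the width requires:

```c
#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/* Illustrative helper, not code from the commit: widen up to 16 pixels to
 * 16-bit lanes while reading only as many bytes as the block width needs. */
static __m256i load_row_epi16(const uint8_t *src, int width)
{
  __m128i bytes;
  switch (width) {
  case 4: {
    int32_t tmp;
    memcpy(&tmp, src, sizeof(tmp));                   /* read exactly 4 bytes */
    bytes = _mm_cvtsi32_si128(tmp);
    break;
  }
  case 8:
    bytes = _mm_loadl_epi64((const __m128i *)src);    /* read exactly 8 bytes */
    break;
  default:
    bytes = _mm_loadu_si128((const __m128i *)src);    /* read 16 bytes */
    break;
  }
  return _mm256_cvtepu8_epi16(bytes);                 /* zero-extend to 16-bit lanes */
}
```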
@@ -965,28 +1019,6 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
         temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
         temp_y_epi16 = _mm256_srai_epi16(temp_y_epi16, shift);
 
-      switch (width)
-      {
-
-      case 4:
-        temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
-
-        pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]);
-        *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8));
-        break;
-
-      case 8:
-
-        // Pack the bits from 1-bit to 8-bit
-        temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
-
-        // Store 64-bits from vector to memory
-        _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
-
-        break;
-
-      case 16:
         temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_y_epi16, temp_y_epi16), _MM_SHUFFLE(1, 3, 2, 0));
         _mm_storeu_si128((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
 
@@ -994,6 +1026,20 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
 
 
       default:
+
+        // Load total of 16 elements from memory to vector
+        sample0_epi16 = hi_prec_luma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu])) :
+          _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadu_si128((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        sample1_epi16 = hi_prec_luma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu])) :
+          _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadu_si128((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        // (sample1 + sample2 + offset)>>shift
+        temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
+        temp_y_epi16 = _mm256_srai_epi16(temp_y_epi16, shift);
+
+
         if (temp == 0) {
           temp_epi16_y = temp_y_epi16;
@@ -1016,39 +1062,39 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
         y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
         x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
 
-        sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
-          _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
-
-        sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
-          _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
-
-        // (sample1 + sample2 + offset)>>shift
-        __m256i temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
-        temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
-        temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
-
-        sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
-          _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
-
-        sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
-          _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
-
-        // (sample1 + sample2 + offset)>>shift
-        __m256i temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
-        temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
-        temp_v_epi16 = _mm256_srai_epi16(temp_v_epi16, shift);
-
-
         switch (width) {
 
         case 4:
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample1 + sample2 + offset)>>shift
+          temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+          temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample1 + sample2 + offset)>>shift
+          temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
+          temp_v_epi16 = _mm256_srai_epi16(temp_v_epi16, shift);
+
+
           temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
           lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + 0] = _mm256_extract_epi8(temp_epi8, 0);
           lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + 1] = _mm256_extract_epi8(temp_epi8, 1);
 
           temp_epi8 = _mm256_packus_epi16(temp_v_epi16, temp_v_epi16);
           lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + 0] = _mm256_extract_epi8(temp_epi8, 0);
           lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + 1] = _mm256_extract_epi8(temp_epi8, 1);
@@ -1057,6 +1103,31 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
 
 
         case 8:
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample1 + sample2 + offset)>>shift
+          temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+          temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample1 + sample2 + offset)>>shift
+          temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
+          temp_v_epi16 = _mm256_srai_epi16(temp_v_epi16, shift);
+
+
           temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
 
           pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]);
@@ -1071,11 +1142,34 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
 
         case 16:
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample1 + sample2 + offset)>>shift
+          temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+          temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample1 + sample2 + offset)>>shift
+          temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
+          temp_v_epi16 = _mm256_srai_epi16(temp_v_epi16, shift);
+
           temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
 
           // Store 64-bit integer into memory
           _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
 
 
           temp_epi8 = _mm256_packus_epi16(temp_v_epi16, temp_v_epi16);
 
           // Store 64-bit integer into memory
@@ -1085,6 +1179,39 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
 
         case 32:
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample1 + sample2 + offset)>>shift
+          temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+          temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample1 + sample2 + offset)>>shift
+          temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+          temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample1 + sample2 + offset)>>shift
+          temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
+          temp_v_epi16 = _mm256_srai_epi16(temp_v_epi16, shift);
+
           temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_u_epi16, temp_u_epi16), _MM_SHUFFLE(1, 3, 2, 0));
           _mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
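On the store side the same idea is visible in the cases above: the 16-bit results are clamped to 8 bits with _mm256_packus_epi16 and then only as many bytes as the block covers are written back (two pixels via _mm256_extract_epi8, four via a 32-bit store, eight via _mm_storel_epi64, sixteen via _mm_storeu_si128 after a lane permute). A sketch of that pattern with illustrative names, not taken from the commit:

```c
#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/* Illustrative helper: clamp 16-bit results to 8 bits and store only `width`
 * pixels, so narrow blocks never touch memory past their own row. */
static void store_row_epu8(uint8_t *dst, __m256i vals_epi16, int width)
{
  __m256i packed = _mm256_packus_epi16(vals_epi16, vals_epi16);
  switch (width) {
  case 2:
    dst[0] = (uint8_t)_mm256_extract_epi8(packed, 0);
    dst[1] = (uint8_t)_mm256_extract_epi8(packed, 1);
    break;
  case 4: {
    int32_t tmp = _mm_cvtsi128_si32(_mm256_castsi256_si128(packed));
    memcpy(dst, &tmp, sizeof(tmp));                       /* write exactly 4 bytes */
    break;
  }
  case 8:
    _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(packed));
    break;
  default:
    /* Gather the two useful 64-bit halves into the low lane, then store 16 bytes. */
    packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(packed));
    break;
  }
}
```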
@@ -1094,6 +1221,40 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
           break;
 
         default:
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample1 + sample2 + offset)>>shift
+          temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+          temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample1 + sample2 + offset)>>shift
+          temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+          temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample1 + sample2 + offset)>>shift
+          temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
+          temp_v_epi16 = _mm256_srai_epi16(temp_v_epi16, shift);
+
           if (temp_uv == 0) {
 
             // Store to temporary vector
@@ -32,9 +32,9 @@ static lcu_t lcu1;
 
   int temp1, temp2, temp3, temp4;
 
-  int16_t mv_param[2][2] = { { 3,3 },{ 3,3 } };
-  int width = 8;
-  int height = 8;
+  int16_t mv_param[2][2] = { { 7,7 },{ 7,7 } };
+  int width = 16;
+  int height = 16;
   int xpos = 0;
   int ypos = 0;