From 508b218a127be8b0a20a06df1af2e7d23b9fb293 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Reima=20Hyv=C3=B6nen?=
Date: Tue, 14 Aug 2018 10:50:39 +0300
Subject: [PATCH] Prevent reads past the block in AVX2 bi-prediction by sizing
 loads to the block width

---
 src/strategies/avx2/picture-avx2.c | 229 ++++++++++++++++++++++++-----
 tests/inter_recon_bipred_tests.c   |   6 +-
 2 files changed, 198 insertions(+), 37 deletions(-)

diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c
index 10159371..78c33dba 100644
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@@ -729,6 +729,8 @@ static void inter_recon_bipred_no_mov_avx2(
   kvz_pixel* temp_lcu_u,
   kvz_pixel* temp_lcu_v)
 {
+  // This function is used only when kvazaar finds no movement in the current block
+
   int y_in_lcu, x_in_lcu;
 
   __m256i sample0_epi8, sample1_epi8, temp_y_epi8;
@@ -933,6 +935,8 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
   int offset = 1 << (shift - 1);
   int shift_left = 14 - KVZ_BIT_DEPTH;
   __m256i temp_epi16_y, temp_epi8, temp_y_epi16, sample0_epi16, sample1_epi16;
+  __m256i temp_u_epi16 = _mm256_setzero_si256();
+  __m256i temp_v_epi16 = _mm256_setzero_si256();
 
   int start_point = 0;
   int start_point_uv = 0;
@@ -953,22 +957,25 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
     for (int temp_x = 0; temp_x < width; temp_x += 16) {
       x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
 
-      // Load total of 16 elements from memory to vector
-      sample0_epi16 = hi_prec_luma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu])) :
-       _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadu_si128((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
-
-      sample1_epi16 = hi_prec_luma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu])) :
-       _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadu_si128((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
-
-      // (sample1 + sample2 + offset)>>shift
-      temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
-      temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
-      temp_y_epi16 = _mm256_srai_epi16(temp_y_epi16, shift);
 
       switch (width)
       {
       case 4:
+
+        // Load a total of 8 elements from memory to vector (only 4 are used)
+        sample0_epi16 = hi_prec_luma_rec0 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        sample1_epi16 = hi_prec_luma_rec1 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
+        temp_y_epi16 = _mm256_srai_epi16(temp_y_epi16, shift);
+
         temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
 
         pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]);
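Every case of the width switch computes the same per-pixel blend; only the load and store widths differ. For reference, a minimal scalar sketch of that arithmetic (assuming 8-bit output pixels and the offset/shift convention from the function body; the helper name is illustrative, not kvazaar API):

    #include <stdint.h>

    // One bi-prediction output pixel: the rounding average of two 16-bit
    // prediction samples, saturated to 8 bits as _mm256_packus_epi16 does.
    static uint8_t bipred_pixel(int16_t sample0, int16_t sample1, int shift)
    {
      int offset = 1 << (shift - 1);
      int val = (sample0 + sample1 + offset) >> shift;
      return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
    }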
@@ -976,6 +983,18 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
 
         break;
 
       case 8:
+
+        // Load a total of 8 elements from memory to vector
+        sample0_epi16 = hi_prec_luma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        sample1_epi16 = hi_prec_luma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
+        temp_y_epi16 = _mm256_srai_epi16(temp_y_epi16, shift);
 
         // Pack the 16-bit values into 8-bit values
         temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
@@ -987,6 +1006,19 @@
 
         break;
 
       case 16:
+
+        // Load a total of 16 elements from memory to vector
+        sample0_epi16 = hi_prec_luma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu])) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadu_si128((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        sample1_epi16 = hi_prec_luma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu])) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadu_si128((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
+        temp_y_epi16 = _mm256_srai_epi16(temp_y_epi16, shift);
+
         temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_y_epi16, temp_y_epi16), _MM_SHUFFLE(1, 3, 2, 0));
 
         _mm_storeu_si128((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
@@ -994,6 +1026,20 @@
 
       default:
+
+        // Load a total of 16 elements from memory to vector
+        sample0_epi16 = hi_prec_luma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu])) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadu_si128((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        sample1_epi16 = hi_prec_luma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu])) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadu_si128((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
+        temp_y_epi16 = _mm256_srai_epi16(temp_y_epi16, shift);
+
         if (temp == 0) {
 
           temp_epi16_y = temp_y_epi16;
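The point of the per-width cases above is that the load width now matches the block width: the old code always read 16 luma pixels (32 bytes in the high-precision paths), so a 4- or 8-pixel block read well past its own right edge. A sketch of the load-size selection idea (illustrative helper, not kvazaar API):

    #include <immintrin.h>
    #include <stdint.h>
    #include <string.h>

    // Read no more 8-bit pixels than the block is wide.
    static __m128i load_pixels(const uint8_t *src, int width)
    {
      if (width >= 16) return _mm_loadu_si128((const __m128i *)src); // 16 bytes
      if (width >= 8)  return _mm_loadl_epi64((const __m128i *)src); //  8 bytes
      int32_t tmp;                     // width 4: copy to avoid a wide read
      memcpy(&tmp, src, sizeof(tmp));
      return _mm_cvtsi32_si128(tmp);   //  4 bytes
    }

The patch itself keeps an 8-byte _mm_loadl_epi64 for 4-pixel luma rows ("only 4 are used"), which is not an exact-width read but is still far tighter than the previous 16- and 32-byte loads.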
@@ -1016,39 +1062,39 @@
       y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
       x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
 
-      sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
-       _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
-
-      sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
-       _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
-
-      // (sample1 + sample2 + offset)>>shift
-      __m256i temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
-      temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
-      temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
-
-      sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
-       _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
-
-      sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
-       _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
-
-      // (sample1 + sample2 + offset)>>shift
-      __m256i temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
-      temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
-      temp_v_epi16 = _mm256_srai_epi16(temp_v_epi16, shift);
-
       switch (width)
       {
       case 4:
+
+        sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+        temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
+
+        sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
+        temp_v_epi16 = _mm256_srai_epi16(temp_v_epi16, shift);
+
         temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
 
         lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + 0] = _mm256_extract_epi8(temp_epi8, 0);
         lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + 1] = _mm256_extract_epi8(temp_epi8, 1);
-
-
         temp_epi8 = _mm256_packus_epi16(temp_v_epi16, temp_v_epi16);
 
         lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + 0] = _mm256_extract_epi8(temp_epi8, 0);
         lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + 1] = _mm256_extract_epi8(temp_epi8, 1);
@@ -1057,6 +1103,31 @@
 
       case 8:
+
+        sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+        temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
+
+        sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
+        temp_v_epi16 = _mm256_srai_epi16(temp_v_epi16, shift);
+
         temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
 
         pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]);
@@ -1071,11 +1142,34 @@
 
       case 16:
+
+        sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+        temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
+
+        sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
+        temp_v_epi16 = _mm256_srai_epi16(temp_v_epi16, shift);
+
         temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
 
         // Store 64-bit integer into memory
         _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
+
         temp_epi8 = _mm256_packus_epi16(temp_v_epi16, temp_v_epi16);
 
         // Store 64-bit integer into memory
         _mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
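The 32-wide chroma case below (like the 16-wide luma case above) needs an extra _mm256_permute4x64_epi64 because _mm256_packus_epi16 packs within each 128-bit lane: packing a vector with itself leaves bytes 0-7 duplicated in lane 0 and bytes 8-15 duplicated in lane 1. Selecting qwords 0 and 2 restores row order. A sketch of the idea (illustrative helper, not kvazaar API):

    #include <immintrin.h>

    // Pack 16 x int16 down to 16 x uint8, in row order, in the low 128 bits.
    static __m128i pack_row(__m256i v_epi16)
    {
      __m256i packed = _mm256_packus_epi16(v_epi16, v_epi16);
      // qword layout of packed: q0 = bytes 0-7, q1 = bytes 0-7,
      // q2 = bytes 8-15, q3 = bytes 8-15; pick q0 then q2.
      __m256i ordered = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(1, 3, 2, 0));
      return _mm256_castsi256_si128(ordered);
    }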
@@ -1085,6 +1179,39 @@
 
       case 32:
+
+        sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+        temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
+
+        sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
+        temp_v_epi16 = _mm256_srai_epi16(temp_v_epi16, shift);
+
         temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_u_epi16, temp_u_epi16), _MM_SHUFFLE(1, 3, 2, 0));
 
         _mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
@@ -1094,6 +1221,40 @@
         break;
 
       default:
+
+        sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+        temp_u_epi16 = _mm256_srai_epi16(temp_u_epi16, shift);
+
+        sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+         _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+        // (sample0 + sample1 + offset) >> shift
+        temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+        temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
+        temp_v_epi16 = _mm256_srai_epi16(temp_v_epi16, shift);
+
         if (temp_uv == 0) {
 
           // Store to temporary vector
diff --git a/tests/inter_recon_bipred_tests.c b/tests/inter_recon_bipred_tests.c
index fc85e40a..9874a582 100644
--- a/tests/inter_recon_bipred_tests.c
+++ b/tests/inter_recon_bipred_tests.c
@@ -32,9 +32,9 @@ static lcu_t lcu1;
 
 int temp1, temp2, temp3, temp4;
 
-int16_t mv_param[2][2] = { { 3,3 },{ 3,3 } };
-int width = 8;
-int height = 8;
+int16_t mv_param[2][2] = { { 7,7 },{ 7,7 } };
+int width = 16;
+int height = 16;
 int xpos = 0;
 int ypos = 0;
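The test now uses a 16x16 block, which exercises the 16-pixel luma path and the 8-pixel chroma path rather than only the narrow cases. If a plain-C oracle is wanted next to the AVX2 kernel, a sketch along these lines could serve (names and layout are illustrative, not the kvazaar test API; assumes 8-bit pixels and the kernel's offset/shift convention):

    #include <stdint.h>

    // Reference bi-prediction blend over one plane, for comparison in tests.
    static void bipred_blend_ref(const int16_t *pred0, const int16_t *pred1,
                                 uint8_t *dst, int width, int height,
                                 int stride, int shift)
    {
      const int offset = 1 << (shift - 1);
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
          int val = (pred0[y * stride + x] + pred1[y * stride + x] + offset) >> shift;
          dst[y * stride + x] = (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
        }
      }
    }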