diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c
index 6f0ce0ed..91c4715e 100644
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@@ -887,6 +887,320 @@ static void inter_recon_bipred_no_mov_avx2(
 }
 
+static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
+  const int hi_prec_luma_rec1,
+  const int hi_prec_chroma_rec0,
+  const int hi_prec_chroma_rec1,
+  const int height,
+  const int width,
+  const int ypos,
+  const int xpos,
+  const hi_prec_buf_t* high_precision_rec0,
+  const hi_prec_buf_t* high_precision_rec1,
+  lcu_t* lcu,
+  kvz_pixel* temp_lcu_y,
+  kvz_pixel* temp_lcu_u,
+  kvz_pixel* temp_lcu_v)
+{
+  if (hi_prec_luma_rec0 == 0 && hi_prec_luma_rec1 == 0 && hi_prec_chroma_rec0 == 0 && hi_prec_chroma_rec1 == 0)
+  {
+    inter_recon_bipred_no_mov_avx2(height, width, ypos, xpos, high_precision_rec0, high_precision_rec1, lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v);
+  }
+  else
+  {
+    int y_in_lcu, x_in_lcu;
+    int shift = 15 - KVZ_BIT_DEPTH;
+    int offset = 1 << (shift - 1);
+    int8_t shift_left = 14 - KVZ_BIT_DEPTH;
+
+    __m256i temp_epi32_y, temp_epi8, temp_y_epi32, sample0_epi32, sample1_epi32, temp_epi16;
+
+    int32_t *pointer = 0;
+
+    __m256i offset_epi32 = _mm256_set1_epi32(offset);
+
+    /*
+    printf("%d ", hi_prec_luma_rec0);
+    for (int temp_y = 0; temp_y < height; ++temp_y) {
+      int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+      for (int temp_x = 0; temp_x < width; ++temp_x) {
+        int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+        int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+        int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+        uint8_t luku = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
+        lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = luku;
+      }
+    }
+    */
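+
+    // Luma: blend one row at a time. Each inner iteration computes
+    // (sample0 + sample1 + offset) >> shift for 8 pixels; the width-4
+    // case uses narrower loads and a single 32-bit store instead.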
+    for (int temp_y = 0; temp_y < height; ++temp_y) {
+      temp_epi32_y = _mm256_setzero_si256();
+      y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+
+      for (int temp_x = 0; temp_x < width; temp_x += 8) {
+        x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+
+        switch (width)
+        {
+        case 4:
+          // Load a total of 4 elements from memory into the vector
+          sample0_epi32 = hi_prec_luma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
+
+          sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
+
+          // (sample0 + sample1 + offset) >> shift
+          temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
+          temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32);
+          temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift);
+
+          // Pack the values from 32 bits down to 8 bits
+          temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32);
+          temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
+          temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);
+
+          pointer = (int32_t*) &(lcu->rec.y[(y_in_lcu) * LCU_WIDTH + x_in_lcu]);
+          *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8));
+          break;
+
+        default:
+          // Load a total of 8 elements from memory into the vector
+          sample0_epi32 = hi_prec_luma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
+
+          sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
+
+          // (sample0 + sample1 + offset) >> shift
+          temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
+          temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32);
+          temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift);
+
+          // Pack the values from 32 bits down to 8 bits
+          temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32);
+          temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
+          temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);
+
+          // Store 64 bits from the vector to memory
+          _mm_storel_epi64((__m128i*) &(lcu->rec.y[(y_in_lcu) * LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
+          break;
+        }
+      }
+    }
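+
+    // Chroma: U and V are blended the same way at half resolution
+    // (width >> 1 by height >> 1), with one load/store path per width.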
+    __m256i sample0_epi16, sample1_epi16;
+    int temp_uv = 0;
+    int start_point_uv = 0;
+
+    __m256i temp_u_epi16 = _mm256_setzero_si256();
+    __m256i temp_v_epi16 = _mm256_setzero_si256();
+
+    __m256i offset_epi16 = _mm256_set1_epi16(offset);
+    __m256i temp_epi16_u = _mm256_setzero_si256();
+    __m256i temp_epi16_v = _mm256_setzero_si256();
+
+    for (int temp_y = 0; temp_y < height >> 1; ++temp_y) {
+      int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+
+      for (int temp_x = 0; temp_x < width >> 1; temp_x += 16) {
+        int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+
+        switch (width) {
+        case 8:
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample0 + sample1 + offset) >> shift
+          temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
+          temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample0 + sample1 + offset) >> shift
+          temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);
+          temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
+
+          temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
+          pointer = (int32_t*) &(lcu->rec.u[(y_in_lcu) * LCU_WIDTH_C + x_in_lcu]);
+          *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8));
+
+          temp_epi8 = _mm256_packus_epi16(temp_v_epi16, temp_v_epi16);
+          pointer = (int32_t*) &(lcu->rec.v[(y_in_lcu) * LCU_WIDTH_C + x_in_lcu]);
+          *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8));
+          break;
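+
+        // Widths 16 and 32 repeat the same blend with wider loads/stores.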
+        case 16:
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample0 + sample1 + offset) >> shift
+          temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
+          temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample0 + sample1 + offset) >> shift
+          temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);
+          temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
+
+          temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
+          // Store a 64-bit integer into memory
+          _mm_storel_epi64((__m128i*) &(lcu->rec.u[(y_in_lcu) * LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
+
+          temp_epi8 = _mm256_packus_epi16(temp_v_epi16, temp_v_epi16);
+          // Store a 64-bit integer into memory
+          _mm_storel_epi64((__m128i*) &(lcu->rec.v[(y_in_lcu) * LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
+          break;
+
+        case 32:
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample0 + sample1 + offset) >> shift
+          temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
+          temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample0 + sample1 + offset) >> shift
+          temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);
+          temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
+
+          temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_u_epi16, temp_u_epi16), _MM_SHUFFLE(3, 1, 2, 0));
+          _mm_storeu_si128((__m128i*) &(lcu->rec.u[(y_in_lcu) * LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
+
+          temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_v_epi16, temp_v_epi16), _MM_SHUFFLE(3, 1, 2, 0));
+          _mm_storeu_si128((__m128i*) &(lcu->rec.v[(y_in_lcu) * LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
+          break;
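+
+        // Width 64: each chroma row is 32 samples, i.e. two iterations.
+        // The first result pair is buffered in temp_epi16_u/temp_epi16_v
+        // so the row can be flushed with one 256-bit store per plane.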
+        default:
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample0 + sample1 + offset) >> shift
+          temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
+          temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
+
+          sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+            _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+          // (sample0 + sample1 + offset) >> shift
+          temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);
+          temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
+
+          if (temp_uv == 0) {
+            // Buffer the first 16 results of the row
+            temp_epi16_u = temp_u_epi16;
+            temp_epi16_v = temp_v_epi16;
+
+            // Remember where the combined store must start
+            start_point_uv = (y_in_lcu) * LCU_WIDTH_C + x_in_lcu;
+
+            temp_uv++;
+          }
+          else {
+            temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_u, temp_u_epi16), _MM_SHUFFLE(3, 1, 2, 0));
+            _mm256_storeu_si256((__m256i*) &(lcu->rec.u[start_point_uv]), temp_epi8);
+
+            temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_v, temp_v_epi16), _MM_SHUFFLE(3, 1, 2, 0));
+            _mm256_storeu_si256((__m256i*) &(lcu->rec.v[start_point_uv]), temp_epi8);
+
+            temp_uv = 0;
+          }
+          break;
+        }
+      }
+    }
+  }
+}
+
+/*
 static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
   const int hi_prec_luma_rec1,
   const int hi_prec_chroma_rec0,
   const int hi_prec_chroma_rec1,
@@ -919,8 +1233,6 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
   __m256i temp_u_epi16 = _mm256_setzero_si256();
   __m256i temp_v_epi16 = _mm256_setzero_si256();
   __m256i max_values = _mm256_setzero_si256();
-  __m256i compare_epi16 = _mm256_setzero_si256();
-  __m256i and_epi16 = _mm256_set1_epi16(-1);
 
   int start_point = 0;
   int start_point_uv = 0;
@@ -960,14 +1272,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
           max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);
 
           // (sample1 + sample2 + offset)>>shift
-          temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_y_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
 
-          temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
-
-          compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_y_epi16), max_values);
+          temp_y_epi16 = _mm256_adds_epi16(temp_y_epi16, offset_epi16);
 
           temp_y_epi16 = _mm256_srli_epi16(temp_y_epi16, shift);
-          temp_y_epi16 = _mm256_and_si256(temp_y_epi16, _mm256_subs_epi16(and_epi16, compare_epi16));
 
           // Pack the bits from 16-bit to 8-bit
           temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
@@ -989,17 +1298,13 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
           sample1_epi16 = hi_prec_luma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
             _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
 
-          max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);
 
           // (sample1 + sample2 + offset)>>shift
           temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
-          temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
-
-          compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_y_epi16), max_values);
-
+          temp_y_epi16 = _mm256_adds_epu16(temp_y_epi16, offset_epi16);
+
           temp_y_epi16 = _mm256_srli_epi16(temp_y_epi16, shift);
-          temp_y_epi16 = _mm256_and_si256(temp_y_epi16, _mm256_subs_epi16(and_epi16, compare_epi16));
 
 
           // Pack the bits from 16-bit to 8-bit
@@ -1023,14 +1328,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
           max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);
 
           // (sample1 + sample2 + offset)>>shift
-          temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
-
-          temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
-
-          compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_y_epi16), max_values);
+          temp_y_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+          temp_y_epi16 = _mm256_adds_epi16(temp_y_epi16, offset_epi16);
+
           temp_y_epi16 = _mm256_srli_epi16(temp_y_epi16, shift);
-          temp_y_epi16 = _mm256_and_si256(temp_y_epi16, _mm256_subs_epi16(and_epi16, compare_epi16));
 
           // Pack the bits from 16-bit to 8-bit
           temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
@@ -1054,14 +1356,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
           max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);
 
           // (sample1 + sample2 + offset)>>shift
-          temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_y_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
 
-          temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
-
-          compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_y_epi16), max_values);
+          temp_y_epi16 = _mm256_adds_epi16(temp_y_epi16, offset_epi16);
 
           temp_y_epi16 = _mm256_srli_epi16(temp_y_epi16, shift);
-          temp_y_epi16 = _mm256_and_si256(temp_y_epi16, _mm256_subs_epi16(and_epi16, compare_epi16));
 
           // Pack the bits from 16-bit to 8-bit
           temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
@@ -1103,15 +1402,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
           max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);
 
           // (sample1 + sample2 + offset)>>shift
-          temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
 
-          temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
-
-          compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_u_epi16), max_values);
+          temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
 
           temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
-          temp_u_epi16 = _mm256_and_si256(temp_u_epi16, _mm256_subs_epu16(and_epi16, compare_epi16));
-
           sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
             _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
@@ -1122,15 +1417,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
           max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);
 
           // (sample1 + sample2 + offset)>>shift
-          temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
 
-          temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
-
-          compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_v_epi16), max_values);
+          temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);
 
           temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
-          temp_v_epi16 = _mm256_and_si256(temp_v_epi16, _mm256_subs_epu16(and_epi16, compare_epi16));
-
 
           temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
@@ -1155,15 +1446,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
           max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);
 
           // (sample1 + sample2 + offset)>>shift
-          temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
 
-          temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
-
-          compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_u_epi16), max_values);
+          temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
 
           temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
-          temp_u_epi16 = _mm256_and_si256(temp_u_epi16, _mm256_subs_epu16(and_epi16, compare_epi16));
-
           sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
             _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
@@ -1174,14 +1461,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
           max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);
 
           // (sample1 + sample2 + offset)>>shift
-          temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
 
-          temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
-
-          compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_v_epi16), max_values);
+          temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);
 
           temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
-          temp_v_epi16 = _mm256_and_si256(temp_v_epi16, _mm256_subs_epu16(and_epi16, compare_epi16));
 
           temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
@@ -1207,15 +1491,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
           max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);
 
           // (sample1 + sample2 + offset)>>shift
-          temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
 
-          temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
-
-          compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_u_epi16), max_values);
+          temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
 
           temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
-          temp_u_epi16 = _mm256_and_si256(temp_u_epi16, _mm256_subs_epu16(and_epi16, compare_epi16));
-
           sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
             _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
@@ -1226,14 +1506,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
           max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);
 
           // (sample1 + sample2 + offset)>>shift
-          temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_adds_epu16(sample0_epi16, sample1_epi16);
 
-          temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
-
-          compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_v_epi16), max_values);
+          temp_v_epi16 = _mm256_adds_epu16(temp_v_epi16, offset_epi16);
 
           temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
-          temp_v_epi16 = _mm256_and_si256(temp_v_epi16, _mm256_subs_epu16(and_epi16, compare_epi16));
 
           temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_u_epi16, temp_u_epi16), _MM_SHUFFLE(3, 1, 2, 0));
           _mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
@@ -1252,8 +1529,8 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
             _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
 
           // (sample1 + sample2 + offset)>>shift
-          temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
-          temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+          temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+          temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
           temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
 
           sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
@@ -1263,8 +1540,8 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
             _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
 
           // (sample1 + sample2 + offset)>>shift
-          temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
-          temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
+          temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+          temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);
           temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
 
           if (temp_uv == 0) {
@@ -1302,7 +1579,7 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
     }
   }
 }
-
+*/
 
 #endif //COMPILE_INTEL_AVX2
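The recurring change in the hunks above replaces the wrap-around _mm256_add_epi16 plus the manual max_epu16/cmpeq_epi16/and_si256 overflow masking with saturating adds, after which _mm256_packus_epi16 alone clips the result. A minimal sketch of that idiom for one vector of sixteen 16-bit samples (a standalone illustration with made-up names, not kvazaar API):

#include <immintrin.h>

// Blend sixteen 16-bit samples: (s0 + s1 + offset) >> shift, then pack
// to unsigned bytes. Saturating adds cannot wrap past INT16_MAX, so the
// srli + packus sequence clips correctly without the removed masking.
static inline __m128i blend16_to_u8(__m256i sample0, __m256i sample1,
                                    __m256i offset, int shift)
{
  __m256i sum = _mm256_adds_epi16(sample0, sample1);
  sum = _mm256_adds_epi16(sum, offset);
  sum = _mm256_srli_epi16(sum, shift);

  // packus interleaves the 128-bit lanes; the permute puts the sixteen
  // result bytes back in order in the low half of the register.
  __m256i packed = _mm256_packus_epi16(sum, sum);
  packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
  return _mm256_castsi256_si128(packed);
}

With 14-bit inputs the saturation can only engage when the true sum exceeds INT16_MAX, and any such value already maps to 255 after the shift and pack, so the shortcut leaves the result unchanged.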
diff --git a/tests/inter_recon_bipred_tests.c b/tests/inter_recon_bipred_tests.c
index da8a2a00..20d6fdd6 100644
--- a/tests/inter_recon_bipred_tests.c
+++ b/tests/inter_recon_bipred_tests.c
@@ -32,9 +32,9 @@ static lcu_t lcu1;
 
 int temp1, temp2, temp3, temp4;
 
-int16_t mv_param[2][2] = { { 7,7 },{ 7,7 } };
-int width = 8;
-int height = 8;
+int16_t mv_param[2][2] = { { 3,3 },{ 3,3 } };
+int width = 16;
+int height = 16;
 int xpos = 0;
 int ypos = 0;
 
@@ -125,27 +125,10 @@ TEST test_inter_recon_bipred()
   memcpy(result.rec.y, lcu1.rec.y, sizeof(kvz_pixel) * 64 * 64);
   memcpy(result.rec.u, lcu1.rec.u, sizeof(kvz_pixel) * 32 * 32);
   memcpy(result.rec.v, lcu1.rec.v, sizeof(kvz_pixel) * 32 * 32);
-  /*
-  for (temp_y = 0; temp_y < height; ++temp_y) {
-    int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
-    for (temp_x = 0; temp_x < width; temp_x += 1) {
-      int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
-      printf("%d ", expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]);
-    }
-  }
-  printf("\n");
-  /*
-  for (temp_y = 0; temp_y < height >> 1; ++temp_y) {
-    int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
-    for (temp_x = 0; temp_x < width >> 1; ++temp_x) {
-      int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
-      printf("%d ", expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]);
-    }
-  }
-  printf("\n");*/
+
   kvz_inter_recon_bipred_blend(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, width, height, xpos, ypos, high_precision_rec0, high_precision_rec1, &result, temp_lcu_y, temp_lcu_u, temp_lcu_v);
-  /*
+
   for (temp_y = 0; temp_y < height; ++temp_y) {
     int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
     for (temp_x = 0; temp_x < width; temp_x += 1) {
@@ -154,8 +137,8 @@ TEST test_inter_recon_bipred()
     }
   }
   printf("\n");
-  */
-
+
+  /*
   for (temp_y = 0; temp_y < height >> 1; ++temp_y) {
     int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
     for (temp_x = 0; temp_x < width >> 1; ++temp_x) {
       int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
       printf("%d ", expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]);
     }
   }
   printf("\n");
-
+  */
 
   for (temp_y = 0; temp_y < height; ++temp_y) {
     int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
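For reference, the expected values the test compares against follow the plain scalar blend that all the AVX2 paths vectorize. A minimal scalar sketch, assuming 8-bit kvz_pixel and 14-bit input samples (bipred_blend_row and clip_to_pixel are illustrative names, not kvazaar API):

#include <stdint.h>

#define BIT_DEPTH 8

static uint8_t clip_to_pixel(int32_t value)
{
  // Clamp the rounded sum to the valid 8-bit pixel range, as
  // kvz_fast_clip_32bit_to_pixel does in the reference loop.
  if (value < 0)   return 0;
  if (value > 255) return 255;
  return (uint8_t)value;
}

static void bipred_blend_row(const int16_t *sample0, // 14-bit samples, prediction 0
                             const int16_t *sample1, // 14-bit samples, prediction 1
                             uint8_t *dst,
                             int width)
{
  const int shift  = 15 - BIT_DEPTH;   // 7 for 8-bit content
  const int offset = 1 << (shift - 1); // rounding term, 64

  for (int x = 0; x < width; ++x) {
    // (sample0 + sample1 + offset) >> shift, then clip -- the same
    // operation the vector code performs 8 or 16 samples at a time.
    dst[x] = clip_to_pixel((sample0[x] + sample1[x] + offset) >> shift);
  }
}

Blending per row like this over the luma block, and over the half-resolution chroma blocks, reproduces the values the test's comparison loops expect.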