diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c
index 8af0a7b3..02b3e96f 100644
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@@ -756,256 +756,6 @@ static unsigned pixels_calc_ssd_avx2(const kvz_pixel *const ref, const kvz_pixel
   }
 }
 
-static void inter_recon_bipred_no_mov_avx2(
-  const int height,
-  const int width,
-  const int ypos,
-  const int xpos,
-  const hi_prec_buf_t*high_precision_rec0,
-  const hi_prec_buf_t*high_precision_rec1,
-  lcu_t* lcu,
-  kvz_pixel* temp_lcu_y,
-  kvz_pixel* temp_lcu_u,
-  kvz_pixel* temp_lcu_v,
-  bool predict_luma,
-  bool predict_chroma) {
-
-  // This function is used only when kvazaar can't find any movement from the current block
-  int y_in_lcu, x_in_lcu;
-  __m256i sample0_epi8, sample1_epi8, temp_y_epi8;
-  int32_t * pointer = 0;
-
-  for (int temp_y = 0; temp_y < height; temp_y += 1) {
-    y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
-
-    for (int temp_x = 0; temp_x < width; temp_x += 32) {
-
-      x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
-
-      if (predict_luma) {
-        switch (width)
-        {
-
-        case 4:
-
-          sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]));
-          sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]));
-
-          temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-          pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]);
-          *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_y_epi8));
-
-          break;
-
-        case 8:
-
-          sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]));
-          sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]));
-
-          temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-          // Store 64-bits from vector to memory
-          _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8));
-
-          break;
-
-        case 12:
-          sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]));
-          sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]));
-
-          temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-          // Store 64-bits from vector to memory
-          _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8));
-
-          x_in_lcu = ((xpos + temp_x + 8) & ((LCU_WIDTH)-1));
-
-          sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]));
-          sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]));
-
-          temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-          pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]);
-          *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_y_epi8));
-          break;
-
-
-        case 16:
-
-          sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]));
-          sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]));
-
-          temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-          // Store 128-bit to memory
-          _mm_storeu_si128((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8));
-
-          break;
-
-        case 32:
-
-          sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]));
-          sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]));
-
-          temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-
-          // Store 256-bit integers to memory
-          _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_y_epi8);
-          break;
-
-        default:
-          // If width is something strange size, use this
-          for (int temp_i = 0; temp_i < width; ++temp_i) {
-            x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1));
-
-            int sample0_y = (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH));
-            int sample1_y = (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH));
-
-            lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y) >> 1);
-          }
-
-
-        }
-      }
-
-      if (temp_x < width >> 1 && temp_y < height >> 1) {
-        y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
-        x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
-
-        __m256i temp_u_epi8;
-        __m256i temp_v_epi8;
-
-        if (predict_chroma) {
-          switch (width)
-          {
-
-          case 8:
-
-
-            sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-            sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-            pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]);
-            *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_u_epi8));
-
-            pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]);
-            *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_v_epi8));
-
-            break;
-
-          case 12:
-
-            sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-            sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-            pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]);
-            *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_u_epi8));
-
-            pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]);
-            *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_v_epi8));
-
-            // This is used only with odd shaped objects
-            for (int temp_i = 4; temp_i < width >> 1; ++temp_i) {
-              int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1));
-              int16_t sample0_u = (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH));
-              int16_t sample1_u = (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH));
-              lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u) >> 1);
-
-              int16_t sample0_v = (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH));
-              int16_t sample1_v = (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH));
-              lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v) >> 1);
-            }
-
-            break;
-
-          case 16:
-
-            sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-            sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-            // Store 64-bit integer into memory
-            _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_u_epi8));
-
-            // Store 64-bit integer into memory
-            _mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_v_epi8));
-
-            break;
-
-          case 32:
-
-            sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-            sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-            // Fill 128 bit vector with packed data and store it to memory
-            _mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_u_epi8));
-
-            // Fill 128 bit vector with packed data and store it to memory
-            _mm_storeu_si128((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_v_epi8));
-
-
-            break;
-
-          case 64:
-
-            sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-            sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]));
-            temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8);
-
-            _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_u_epi8);
-            _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_v_epi8);
-            break;
-
-          default:
-            // This is used only with odd shaped objects
-            for (int temp_i = 0; temp_i < width >> 1; ++temp_i) {
-              int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1));
-              int16_t sample0_u = (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH));
-              int16_t sample1_u = (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH));
-              lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u) >> 1);
-
-              int16_t sample0_v = (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH));
-              int16_t sample1_v = (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH));
-              lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v) >> 1);
-            }
-
-            break;
-
-          }
-        }
-        y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
-      }
-    }
-  }
-
-
-}
-
 static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
   const int hi_prec_luma_rec1,
   const int hi_prec_chroma_rec0,
@@ -1023,17 +773,6 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
 bool predict_luma,
 bool predict_chroma)
 {
-  if(hi_prec_luma_rec0 == 0 && hi_prec_luma_rec1 == 0 && hi_prec_chroma_rec0 == 0 && hi_prec_chroma_rec1 == 0)
-  {
-    inter_recon_bipred_no_mov_avx2(height, width, ypos, xpos,
-      high_precision_rec0, high_precision_rec1,
-      lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v,
-      predict_luma, predict_chroma);
-  }
-
-  else
-  {
-
   int y_in_lcu, x_in_lcu;
   int shift = 15 - KVZ_BIT_DEPTH;
   int offset = 1 << (shift - 1);
@@ -1268,7 +1007,6 @@ bool predict_chroma)
       }
     }
   }
-  }
 }
 
 static optimized_sad_func_ptr_t get_optimized_sad_avx2(int32_t width)
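Reviewer note (not part of the patch): the deleted inter_recon_bipred_no_mov_avx2
was a fast path for the case where all four hi_prec_* flags are zero, i.e. both
predictions are plain 8-bit samples and bi-prediction reduces to a rounded
average, which _mm256_avg_epu8 computes as (a + b + 1) >> 1. Note also that its
chroma "case 64" appears to store both the U and V averages into lcu->rec.y with
the luma stride, which the removal makes moot. The standalone sketch below is my
own illustration, not part of this patch (BIT_DEPTH stands in for KVZ_BIT_DEPTH,
assuming the 8-bpp build); it shows why the generic shift/offset path, whose
shift = 15 - KVZ_BIT_DEPTH and offset = 1 << (shift - 1) are visible in the
context above, is bit-exact with the removed fast path for 8-bit content.

/* Compile with -mavx2. Verifies that the byte-wise rounded average
 * (a + b + 1) >> 1 from _mm256_avg_epu8 equals the generic bi-prediction
 * rounding ((a << (14 - B)) + (b << (14 - B)) + offset) >> shift, where
 * shift = 15 - B and offset = 1 << (shift - 1), for B = 8. */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define BIT_DEPTH 8  /* assumption: KVZ_BIT_DEPTH == 8 */

int main(void)
{
  uint8_t a[32], b[32], avg[32];
  for (int i = 0; i < 32; ++i) {
    a[i] = (uint8_t)(i * 7);
    b[i] = (uint8_t)(255 - i * 5);
  }

  /* Fast-path computation: one packed rounded average over 32 pixels. */
  __m256i va = _mm256_loadu_si256((const __m256i *)a);
  __m256i vb = _mm256_loadu_si256((const __m256i *)b);
  _mm256_storeu_si256((__m256i *)avg, _mm256_avg_epu8(va, vb));

  /* Generic-path computation, scalar: upscale to 14 bits, add the
   * rounding offset, shift back down. */
  const int shift  = 15 - BIT_DEPTH;   /* 7 at 8 bpp */
  const int offset = 1 << (shift - 1); /* 64 at 8 bpp */
  for (int i = 0; i < 32; ++i) {
    int generic = ((a[i] << (14 - BIT_DEPTH)) +
                   (b[i] << (14 - BIT_DEPTH)) + offset) >> shift;
    if (generic != avg[i]) {
      printf("mismatch at %d: %d != %d\n", i, generic, avg[i]);
      return 1;
    }
  }
  printf("_mm256_avg_epu8 matches the generic bi-prediction rounding\n");
  return 0;
}

Since ((a << 6) + (b << 6) + 64) >> 7 simplifies to (a + b + 1) >> 1, dropping
the fast path trades a few instructions for a single code path with no change
in output at 8 bpp.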