Remove avx2 blending

2024-11-24 02:24:07 +00:00 · 2020-04-04 14:58:06 +03:00 · 2020-04-04 14:58:06 +03:00 · e7857cbb24
parent 55bb8ee17f
commit e7857cbb24
1 changed files with 1 additions and 236 deletions
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@ -769,241 +769,6 @@ static unsigned pixels_calc_ssd_avx2(const uint8_t *const ref, const uint8_t *co
  }
 }

-static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
- const int hi_prec_luma_rec1,
- const int hi_prec_chroma_rec0,
- const int hi_prec_chroma_rec1,
- const int height,
- const int width,
- const int ypos,
- const int xpos,
- const hi_prec_buf_t*high_precision_rec0,
- const hi_prec_buf_t*high_precision_rec1,
- lcu_t* lcu,
- uint8_t* temp_lcu_y,
- uint8_t* temp_lcu_u,
- uint8_t* temp_lcu_v,
-bool predict_luma,
-bool predict_chroma)
-{
-  int y_in_lcu, x_in_lcu;
-  int shift = 15 - KVZ_BIT_DEPTH;
-  int offset = 1 << (shift - 1);
-  __m256i temp_epi8, temp_y_epi32, sample0_epi32, sample1_epi32, temp_epi16;
-  int32_t * pointer = 0;
-  __m256i offset_epi32 = _mm256_set1_epi32(offset);
-
-  for (int temp_y = 0; temp_y < height; ++temp_y) {
-
-   y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
-
-   for (int temp_x = 0; temp_x < width; temp_x += 8) {
-    x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
-
-    if (predict_luma) {
-      bool use_8_elements = ((temp_x + 8) <= width);
-
-      if (!use_8_elements) {
-        if (width < 4) {
-          // If width is smaller than 4 there's no need to use SIMD
-          for (int temp_i = 0; temp_i < width; ++temp_i) {
-            x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1));
-
-            int sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-            int sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-
-            lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
-          }
-        }
-
-        else {
-          // Load total of 4 elements from memory to vector
-          sample0_epi32 = hi_prec_luma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
-            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)&(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
-
-
-          sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
-            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
-
-
-          // (sample1 + sample2 + offset)>>shift 
-          temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
-          temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32);
-          temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift);
-
-          // Pack the bits from 32-bit to 8-bit
-          temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32);
-          temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
-          temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);
-
-          pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]);
-          *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8));
-
-
-
-          for (int temp_i = temp_x + 4; temp_i < width; ++temp_i) {
-            x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1));
-
-            int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-            int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-
-            lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
-          }
-
-        }
-      } else {
-        // Load total of 8 elements from memory to vector
-        sample0_epi32 = hi_prec_luma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
-          _mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), 14 - KVZ_BIT_DEPTH);
-
-        sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
-          _mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), 14 - KVZ_BIT_DEPTH);
-
-        // (sample1 + sample2 + offset)>>shift 
-        temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
-        temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32);
-        temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift);
-
-        // Pack the bits from 32-bit to 8-bit
-        temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32);
-        temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
-        temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);
-
-        // Store 64-bits from vector to memory
-        _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
-      }
-    }
-   }
-  }
-  for (int temp_y = 0; temp_y < height >> 1; ++temp_y) {
-   int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
-   
-   for (int temp_x = 0; temp_x < width >> 1; temp_x += 8) {
-
-    int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
-
-    if (predict_chroma) {
-      if ((width >> 1) < 4) {
-        // If width>>1 is smaller than 4 there's no need to use SIMD
-
-        for (int temp_i = 0; temp_i < width >> 1; ++temp_i) {
-          int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1));
-          int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-          int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-          lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);
-
-          int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-          int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-          lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
-        }
-      }
-
-      else {
-
-        bool use_8_elements = ((temp_x + 8) <= (width >> 1));
-
-        __m256i temp_u_epi32, temp_v_epi32;
-
-        if (!use_8_elements) {
-          // Load 4 pixels to vector
-          sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
-            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
-
-          sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
-            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
-
-          // (sample1 + sample2 + offset)>>shift 
-          temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
-          temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32);
-          temp_u_epi32 = _mm256_srai_epi32(temp_u_epi32, shift);
-
-
-
-          sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
-            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
-
-          sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
-            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
-
-
-          // (sample1 + sample2 + offset)>>shift 
-          temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
-          temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32);
-          temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift);
-
-
-          temp_epi16 = _mm256_packs_epi32(temp_u_epi32, temp_u_epi32);
-          temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
-          temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);
-
-          pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]);
-          *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8));
-
-
-          temp_epi16 = _mm256_packs_epi32(temp_v_epi32, temp_v_epi32);
-          temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
-          temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);
-
-          pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]);
-          *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8));
-
-          for (int temp_i = 4; temp_i < width >> 1; ++temp_i) {
-
-            // Use only if width>>1 is not divideble by 4
-            int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1));
-            int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-            int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-            lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);
-
-            int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-            int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-            lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
-          }
-        } else {
-          // Load 8 pixels to vector
-          sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
-            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
-
-          sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
-            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
-
-          // (sample1 + sample2 + offset)>>shift 
-          temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
-          temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32);
-          temp_u_epi32 = _mm256_srai_epi32(temp_u_epi32, shift);
-
-          sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
-            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
-
-          sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
-            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
-
-
-          // (sample1 + sample2 + offset)>>shift 
-          temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
-          temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32);
-          temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift);
-
-          temp_epi16 = _mm256_packs_epi32(temp_u_epi32, temp_u_epi32);
-          temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
-          temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);
-
-          // Store 64-bit integer into memory
-          _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
-
-          temp_epi16 = _mm256_packs_epi32(temp_v_epi32, temp_v_epi32);
-          temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
-          temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);
-
-          // Store 64-bit integer into memory
-          _mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
-        }
-      }
-    }
-   }
-  }
-}
-
 static optimized_sad_func_ptr_t get_optimized_sad_avx2(int32_t width)
 {
  if (width == 0)
@ -1278,7 +1043,7 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
    success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2);

    success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2);
-    success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2);
+    //success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2);
    success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "avx2", 40, &get_optimized_sad_avx2);
    success &= kvz_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2);
    success &= kvz_strategyselector_register(opaque, "hor_sad", "avx2", 40, &hor_sad_avx2);