Updated some commands to match the standard

2024-11-24 02:24:07 +00:00 · 2019-03-20 15:42:02 +02:00 · 2019-03-20 15:42:02 +02:00 · c884c738b1
parent b412ed2f59
commit c884c738b1
1 changed files with 27 additions and 21 deletions
--- a/src/strategies/avx2/sao-avx2.c
+++ b/src/strategies/avx2/sao-avx2.c
@ -22,7 +22,7 @@

 #if COMPILE_INTEL_AVX2
 #include <immintrin.h>
-#include <mmintrin.h>
+#include <nmmintrin.h>

 #include "cu.h"
 #include "encoder.h"
@ -123,7 +123,7 @@ static int sao_edge_ddistortion_avx2(const kvz_pixel *orig_data,

   tmp_sum_epi32 = _mm256_add_epi32(tmp_sum_epi32, _mm256_andnot_si256(offset_zeros_epi32, _mm256_sub_epi32(tmp1_vec_epi32, tmp2_vec_epi32)));
  }
-
+  tmp_diff_epi32 = _mm256_setzero_si256();
  // Load the last 6 pixels to use

  const kvz_pixel *c_data = &rec_data[y * block_width + x];
@ -226,7 +226,7 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data,
   //--------------------------------------------------------------------------
   // v_cat == 0
   __m256i mask_epi32 = _mm256_cmpeq_epi32(zeros_epi32, v_cat_epi32);
-   int temp_cnt = __popcnt(_mm256_movemask_epi8(mask_epi32))/4;
+   int temp_cnt = _mm_popcnt_u32(_mm256_movemask_epi8(mask_epi32))/4;
   cat_sum_cnt[1][0] += temp_cnt;
   temp_mem_epi32 = _mm256_load_si256((__m256i*)&orig_data[y * block_width + x] - c);
   temp_epi32 = _mm256_and_si256(mask_epi32, temp_mem_epi32);
@ -235,7 +235,7 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data,
   // v_cat == 1

   mask_epi32 = _mm256_cmpeq_epi32(ones_epi32, v_cat_epi32);
-   temp_cnt = __popcnt(_mm256_movemask_epi8(mask_epi32)) / 4;
+   temp_cnt = _mm_popcnt_u32(_mm256_movemask_epi8(mask_epi32)) / 4;
   cat_sum_cnt[1][1] += temp_cnt;
   temp_epi32 = _mm256_and_si256(mask_epi32, temp_mem_epi32);
   tmp_one_values_epi32 = _mm256_add_epi32(tmp_zero_values_epi32, temp_epi32);
@ -243,7 +243,7 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data,
   // v_cat == 2

   mask_epi32 = _mm256_cmpeq_epi32(twos_epi32, v_cat_epi32);
-   temp_cnt = __popcnt(_mm256_movemask_epi8(mask_epi32)) / 4;
+   temp_cnt = _mm_popcnt_u32(_mm256_movemask_epi8(mask_epi32)) / 4;
   cat_sum_cnt[1][2] += temp_cnt;
   temp_epi32 = _mm256_and_si256(mask_epi32, temp_mem_epi32);
   tmp_two_values_epi32 = _mm256_add_epi32(tmp_zero_values_epi32, temp_epi32);
@ -251,7 +251,7 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data,
   // v_cat == 3

   mask_epi32 = _mm256_cmpeq_epi32(threes_epi32, v_cat_epi32);
-   temp_cnt = __popcnt(_mm256_movemask_epi8(mask_epi32)) / 4;
+   temp_cnt = _mm_popcnt_u32(_mm256_movemask_epi8(mask_epi32)) / 4;
   cat_sum_cnt[1][3] += temp_cnt;
   temp_epi32 = _mm256_and_si256(mask_epi32, temp_mem_epi32);
   tmp_three_values_epi32 = _mm256_add_epi32(tmp_zero_values_epi32, temp_epi32);
@ -259,7 +259,7 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data,
   // v_cat == 4

   mask_epi32 = _mm256_cmpeq_epi32(fours_epi32, v_cat_epi32);
-   temp_cnt = __popcnt(_mm256_movemask_epi8(mask_epi32)) / 4;
+   temp_cnt = _mm_popcnt_u32(_mm256_movemask_epi8(mask_epi32)) / 4;
   cat_sum_cnt[1][4] += temp_cnt;
   temp_epi32 = _mm256_and_si256(mask_epi32, temp_mem_epi32);
   tmp_four_values_epi32 = _mm256_add_epi32(tmp_zero_values_epi32, temp_epi32);
@ -315,41 +315,42 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data,
  // It's done for every single value that cat could get {1, 2, 0, 3, 4}
  //--------------------------------------------------------------------------
  __m256i mask_epi32 = _mm256_cmpeq_epi32(zeros_epi32, v_cat_epi32);
-  int temp_cnt = __popcnt(_mm256_movemask_epi8(mask_epi32)) / 4 - 2;
+  int temp_cnt = _mm_popcnt_u32(_mm256_movemask_epi8(mask_epi32)) / 4 - 2;
  cat_sum_cnt[1][0] += temp_cnt;
  temp_epi32 = _mm256_and_si256(mask_epi32, temp_mem_epi32);
  tmp_zero_values_epi32 = _mm256_add_epi32(tmp_zero_values_epi32, temp_epi32);
  //--------------------------------------------------------------------------

  mask_epi32 = _mm256_cmpeq_epi32(ones_epi32, v_cat_epi32);
-  temp_cnt = __popcnt(_mm256_movemask_epi8(mask_epi32)) / 4;
+  temp_cnt = _mm_popcnt_u32(_mm256_movemask_epi8(mask_epi32)) / 4;
  cat_sum_cnt[1][1] += temp_cnt;
  temp_epi32 = _mm256_and_si256(mask_epi32, temp_mem_epi32);
  tmp_one_values_epi32 = _mm256_add_epi32(tmp_zero_values_epi32, temp_epi32);
  //--------------------------------------------------------------------------

  mask_epi32 = _mm256_cmpeq_epi32(twos_epi32, v_cat_epi32);
-  temp_cnt = __popcnt(_mm256_movemask_epi8(mask_epi32)) / 4;
+  temp_cnt = _mm_popcnt_u32(_mm256_movemask_epi8(mask_epi32)) / 4;
  cat_sum_cnt[1][2] += temp_cnt;
  temp_epi32 = _mm256_and_si256(mask_epi32, temp_mem_epi32);
  tmp_two_values_epi32 = _mm256_add_epi32(tmp_zero_values_epi32, temp_epi32);
  //--------------------------------------------------------------------------

  mask_epi32 = _mm256_cmpeq_epi32(threes_epi32, v_cat_epi32);
-  temp_cnt = __popcnt(_mm256_movemask_epi8(mask_epi32)) / 4;
+  temp_cnt = _mm_popcnt_u32(_mm256_movemask_epi8(mask_epi32)) / 4;
  cat_sum_cnt[1][3] += temp_cnt;
  temp_epi32 = _mm256_and_si256(mask_epi32, temp_mem_epi32);
  tmp_three_values_epi32 = _mm256_add_epi32(tmp_zero_values_epi32, temp_epi32);
  //--------------------------------------------------------------------------

  mask_epi32 = _mm256_cmpeq_epi32(fours_epi32, v_cat_epi32);
-  temp_cnt = __popcnt(_mm256_movemask_epi8(mask_epi32)) / 4;
+  temp_cnt = _mm_popcnt_u32(_mm256_movemask_epi8(mask_epi32)) / 4;
  cat_sum_cnt[1][4] += temp_cnt;
  temp_epi32 = _mm256_and_si256(mask_epi32, temp_mem_epi32);
  tmp_four_values_epi32 = _mm256_add_epi32(tmp_zero_values_epi32, temp_epi32);
  //--------------------------------------------------------------------------

  temp_epi32 = _mm256_hadd_epi32(tmp_zero_values_epi32, tmp_one_values_epi32);
+
  temp_mem_epi32 = _mm256_hadd_epi32(tmp_two_values_epi32, tmp_three_values_epi32);
  temp_mem_epi32 = _mm256_hadd_epi32(temp_epi32, temp_mem_epi32);

@ -386,12 +387,15 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
 int block_height,
 color_t color_i)
 {
+
 // Arrays orig_data and rec_data are quarter size for chroma.
 int offset_v = color_i == COLOR_V ? 5 : 0;
 
 if (sao->type == SAO_TYPE_BAND) {
  int offsets[1 << KVZ_BIT_DEPTH];
  kvz_calc_sao_offset_array(encoder, sao, offsets, color_i);
+  unsigned char*temp;
+
  for (int y = 0; y < block_height; ++y) {
   for (int x = 0; x < block_width; x+=32) {

@ -406,10 +410,11 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
    switch (choose)
    {

-    case 2:
+    case 2:;

     __m256i rec_data_256_epi8 = _mm256_loadu_si256((__m256i*)&rec_data[y * stride + x]);
-     uint8_t*temp = (uint8_t*)&rec_data_256_epi8;
+     temp = (unsigned char*)&rec_data_256_epi8;
+
     __m256i offsets_256_epi8 = _mm256_set_epi8(offsets[temp[31]], offsets[temp[30]], offsets[temp[29]], offsets[temp[28]], offsets[temp[27]], offsets[temp[26]], offsets[temp[25]],
      offsets[temp[24]], offsets[temp[23]], offsets[temp[22]], offsets[temp[21]], offsets[temp[20]], offsets[temp[19]], offsets[temp[18]], offsets[temp[17]], offsets[temp[16]],
      offsets[temp[15]], offsets[temp[14]], offsets[temp[13]], offsets[temp[12]], offsets[temp[11]], offsets[temp[10]], offsets[temp[9]],
@ -417,10 +422,10 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
     _mm256_storeu_si256((__m256i*)& new_rec_data[y * new_stride + x], offsets_256_epi8);
     break;

-    case 1:
+    case 1:;

     __m128i rec_data_128_epi8 = _mm_loadu_si128((__m128i*)&rec_data[y * stride + x]);
-     temp = (uint8_t*)&rec_data_128_epi8;
+     temp = (unsigned char*)&rec_data_128_epi8;
     __m128i offsets_128_epi8 = _mm_set_epi8(offsets[temp[15]], offsets[temp[14]], offsets[temp[13]], offsets[temp[12]], offsets[temp[11]], offsets[temp[10]], offsets[temp[9]],
      offsets[temp[8]], offsets[temp[7]], offsets[temp[6]], offsets[temp[5]], offsets[temp[4]], offsets[temp[3]], offsets[temp[2]], offsets[temp[1]], offsets[temp[0]]);
     _mm_storeu_si128((__m128i*)& new_rec_data[y * new_stride + x], offsets_128_epi8);
@ -431,7 +436,7 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,

     break;

-    default:
+    default:;

     for (int i = x; i < block_width; i++) {
      new_rec_data[y * new_stride + i] = offsets[rec_data[y * stride + i]];
@ -452,14 +457,14 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
  vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1];

  for (int y = 0; y < block_height; ++y) {
-   int x = 0;
-   for (x;  x < block_width; x += 8) {
+   int x;
+   for (x = 0; x < block_width; x += 8) {

    bool use_8_elements = (block_width - x) >= 8;

    switch (use_8_elements)
    {
-    case true:
+    case true:;
     const kvz_pixel *c_data = &rec_data[y * stride + x];

     __m128i vector_a_epi8 = _mm_loadl_epi64((__m128i*)&c_data[a_ofs.y * stride + a_ofs.x]);
@ -488,7 +493,8 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
     // Store 64-bits from vector to memory
     _mm_storel_epi64((__m128i*)&(new_rec_data[y * new_stride + x]), _mm256_castsi256_si128(temp_epi8));
     break;
-    default:
+
+    default:;
     for (int i = x; i < (block_width); ++i) {

      const kvz_pixel *c_data = &rec_data[y * stride + i];