first working optimization

2024-11-30 12:44:07 +00:00 · 2019-03-04 16:55:39 +02:00 · 2019-03-04 16:55:39 +02:00 · 96dc60a1ed
parent c148aff9fb
commit 96dc60a1ed
1 changed files with 76 additions and 72 deletions
--- a/src/strategies/avx2/sao-avx2.c
+++ b/src/strategies/avx2/sao-avx2.c
@ -394,56 +394,6 @@ static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data,
 }


-static void sao_reconstruct_color_avx(const encoder_control_t * const encoder,
- const kvz_pixel *rec_data,
- kvz_pixel *new_rec_data,
- const sao_info_t *sao,
- int stride,
- int new_stride,
- int block_width,
- int block_height,
- color_t color_i)
-{
- // Arrays orig_data and rec_data are quarter size for chroma.
- int offset_v = color_i == COLOR_V ? 5 : 0;
-
- if (sao->type == SAO_TYPE_BAND) {
-  int offsets[1 << KVZ_BIT_DEPTH];
-  kvz_calc_sao_offset_array(encoder, sao, offsets, color_i);
-  for (int y = 0; y < block_height; ++y) {
-   for (int x = 0; x < block_width; ++x) {
-    new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]];
-   }
-  }
- }
- else {
-  // Don't sample the edge pixels because this function doesn't have access to
-  // their neighbours.
-  for (int y = 0; y < block_height; ++y) {
-   for (int x = 0; x < block_width; x += 8) {
-
-    for (int i = 0; i < 8; ++i) {
-
-     int test = x + i;
-     vector2d_t a_ofs = g_sao_edge_offsets[sao->eo_class][0];
-     vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1];
-     const kvz_pixel *c_data = &rec_data[y * stride + test];
-     kvz_pixel *new_data = &new_rec_data[y * new_stride + test];
-     kvz_pixel a = c_data[a_ofs.y * stride + a_ofs.x];
-     kvz_pixel c = c_data[0];
-     kvz_pixel b = c_data[b_ofs.y * stride + b_ofs.x];
-
-     int eo_cat = sao_calc_eo_cat(a, b, c);
-
-     new_data[0] = (kvz_pixel)CLIP(0, (1 << KVZ_BIT_DEPTH) - 1, c_data[0] + sao->offsets[eo_cat + offset_v]);
-
-    }
-
-   }
-  }
- }
-}
-
 static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
 const kvz_pixel *rec_data,
 kvz_pixel *new_rec_data,
@ -458,11 +408,15 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
 int offset_v = color_i == COLOR_V ? 5 : 0;
 

+
+ /* Optimize this
+ */
 if (sao->type == SAO_TYPE_BAND) {
  int offsets[1 << KVZ_BIT_DEPTH];
  kvz_calc_sao_offset_array(encoder, sao, offsets, color_i);
  for (int y = 0; y < block_height; ++y) {
   for (int x = 0; x < block_width; ++x) {
+
    new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]];
   }
  }
@ -502,33 +456,20 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
    __m256i vector_sao_offsets_epi32 = _mm256_set_epi32(sao->offsets[temp[7]], sao->offsets[temp[6]], sao->offsets[temp[5]], sao->offsets[temp[4]], sao->offsets[temp[3]], sao->offsets[temp[2]], sao->offsets[temp[1]], sao->offsets[temp[0]]);
    vector_sao_offsets_epi32 = _mm256_add_epi32(vector_sao_offsets_epi32, vector_c_data0_epi32);

+
+    // Pack the 32-bit sums down to unsigned 8-bit values (packus saturates to 0..255)
    __m256i temp_epi16 = _mm256_packus_epi32(vector_sao_offsets_epi32, vector_sao_offsets_epi32);
+    temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
    __m256i temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);

+    // Store 64-bits from vector to memory
+    _mm_storel_epi64((__m128i*)&(new_rec_data[y * new_stride + x]), _mm256_castsi256_si128(temp_epi8));

-
-    int*temp2 = (int*)&vector_sao_offsets_epi32;
-    
-    for (int i = 0; i < 8; ++i) {
-
-     const kvz_pixel *c_data = &rec_data[y * stride + x + i];
-
-     kvz_pixel *new_data = &new_rec_data[y * new_stride + x + i];
-
-     //printf("%d ", c_data[0] + sao->offsets[temp[i]]);
-     //printf("%d \n", temp2[i]);
-
-
-     new_data[0] = (kvz_pixel)CLIP(0, (1 << KVZ_BIT_DEPTH) - 1, temp2[i]);//c_data[0] + sao->offsets[temp[i]]);
    test = x;
   }
-    //Low = 0
-    //High = (1 << KVZ_BIT_DEPTH)
-    //Value = c_data[0] + sao->offsets[eo_cat + offset_v]
-    //new_data[0] = (kvz_pixel)CLIP(0, (1 << KVZ_BIT_DEPTH) - 1, c_data[0] + sao->offsets[eo_cat + offset_v]);
-   }
-

+   /* Some optimization still needs to be done, because this function uses only 6 pixels
+   */
   for (int i = 0; i < (block_width - test); ++i) {

    const kvz_pixel *c_data = &rec_data[y * stride + test + i];
@ -550,7 +491,8 @@ static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
 }
 }

-
+//--------------------------------------------------------------------------------------
+// Remove when done
 static int sao_band_ddistortion_avx2(const encoder_state_t * const state,
 const kvz_pixel *orig_data,
 const kvz_pixel *rec_data,
@ -580,6 +522,68 @@ static int sao_band_ddistortion_avx2(const encoder_state_t * const state,

 return sum;
 }
+//--------------------------------------------------------------------------------------
+/**
+ * \brief Calculate the change in squared error caused by applying SAO band
+ *        offsets to a block, eight pixels at a time.
+ *
+ * For every pixel the band index is (rec >> shift) - band_pos, where shift is
+ * bitdepth - 5. A pixel whose band index is in [0, 3] contributes
+ * (diff - offset)^2 - diff^2, where diff = orig - rec and
+ * offset = sao_bands[band]. Every other pixel contributes nothing.
+ *
+ * \param state         Encoder state; only encoder_control->bitdepth is read.
+ * \param orig_data     Original pixels, rows of block_width pixels.
+ * \param rec_data      Reconstructed pixels, same layout as orig_data.
+ * \param block_width   Width of the block in pixels; any positive value.
+ * \param block_height  Height of the block in pixels.
+ * \param band_pos      Band index that maps to sao_bands[0].
+ * \param sao_bands     Offsets for the four bands starting at band_pos.
+ * \return Sum of the per-pixel changes in squared error.
+ */
+static int sao_band_ddistortion_avx(const encoder_state_t * const state,
+ const kvz_pixel *orig_data,
+ const kvz_pixel *rec_data,
+ int block_width,
+ int block_height,
+ int band_pos,
+ int sao_bands[4])
+{
+ const int shift = state->encoder_control->bitdepth - 5;
+ // Only whole groups of eight pixels go through the vector path; the rest of
+ // each row is handled by the scalar loop so nothing past the row is read.
+ const int vector_width = block_width & ~7;
+ int sum = 0;
+
+ const __m128i shift_count = _mm_cvtsi32_si128(shift);
+ const __m256i band_pos_epi32 = _mm256_set1_epi32(band_pos);
+ const __m256i minus_one_epi32 = _mm256_set1_epi32(-1);
+ const __m256i four_epi32 = _mm256_set1_epi32(4);
+ // Lookup table for the permute below. Lanes 4-7 are padding; they can only
+ // be selected by out-of-range bands, which the mask clears anyway.
+ const __m256i bands_epi32 = _mm256_setr_epi32(sao_bands[0], sao_bands[1],
+  sao_bands[2], sao_bands[3], 0, 0, 0, 0);
+ __m256i sum_epi32 = _mm256_setzero_si256();
+
+ for (int y = 0; y < block_height; ++y) {
+  const kvz_pixel *orig_row = &orig_data[y * block_width];
+  const kvz_pixel *rec_row = &rec_data[y * block_width];
+  int x = 0;
+
+  for (; x < vector_width; x += 8) {
+   // Load eight pixels (64 bits) and zero-extend each one to 32 bits.
+   // NOTE(review): assumes kvz_pixel is 8 bits wide, like the 64-bit store
+   // of eight pixels in sao_reconstruct_color_avx2 -- confirm for the build.
+   __m128i orig_epi8 = _mm_loadl_epi64((const __m128i*)&orig_row[x]);
+   __m128i rec_epi8 = _mm_loadl_epi64((const __m128i*)&rec_row[x]);
+   __m256i orig_epi32 = _mm256_cvtepu8_epi32(orig_epi8);
+   __m256i rec_epi32 = _mm256_cvtepu8_epi32(rec_epi8);
+
+   // band = (rec >> shift) - band_pos
+   __m256i band_epi32 = _mm256_srl_epi32(rec_epi32, shift_count);
+   band_epi32 = _mm256_sub_epi32(band_epi32, band_pos_epi32);
+
+   // mask = (band >= 0) && (band < 4)
+   __m256i not_negative = _mm256_cmpgt_epi32(band_epi32, minus_one_epi32);
+   __m256i below_four = _mm256_cmpgt_epi32(four_epi32, band_epi32);
+   __m256i mask_epi32 = _mm256_and_si256(not_negative, below_four);
+
+   // offset = sao_bands[band] for in-range bands, 0 otherwise. The permute
+   // only looks at the low three bits of each index, so out-of-range lanes
+   // fetch an arbitrary entry that the mask then zeroes.
+   __m256i offset_epi32 = _mm256_permutevar8x32_epi32(bands_epi32, band_epi32);
+   offset_epi32 = _mm256_and_si256(offset_epi32, mask_epi32);
+
+   // Offset is applied to the reconstruction, so it is subtracted from diff.
+   // With offset == 0 the term below is exactly zero, so no further masking
+   // is needed.
+   __m256i diff_epi32 = _mm256_sub_epi32(orig_epi32, rec_epi32);
+   __m256i new_diff_epi32 = _mm256_sub_epi32(diff_epi32, offset_epi32);
+   __m256i new_err_epi32 = _mm256_mullo_epi32(new_diff_epi32, new_diff_epi32);
+   __m256i old_err_epi32 = _mm256_mullo_epi32(diff_epi32, diff_epi32);
+
+   sum_epi32 = _mm256_add_epi32(sum_epi32,
+    _mm256_sub_epi32(new_err_epi32, old_err_epi32));
+  }
+
+  // Scalar tail for widths that are not a multiple of eight.
+  for (; x < block_width; ++x) {
+   int band = (rec_row[x] >> shift) - band_pos;
+   if (band >= 0 && band < 4) {
+    int offset = sao_bands[band];
+    int diff = orig_row[x] - rec_row[x];
+    sum += (diff - offset) * (diff - offset) - diff * diff;
+   }
+  }
+ }
+
+ // Horizontal sum of all eight lanes. hadd works within each 128-bit half,
+ // so the two halves are added together first and then reduced.
+ __m128i sum_lo_epi32 = _mm256_castsi256_si128(sum_epi32);
+ __m128i sum_hi_epi32 = _mm256_extracti128_si256(sum_epi32, 1);
+ __m128i total_epi32 = _mm_add_epi32(sum_lo_epi32, sum_hi_epi32);
+ total_epi32 = _mm_hadd_epi32(total_epi32, total_epi32);
+ total_epi32 = _mm_hadd_epi32(total_epi32, total_epi32);
+
+ sum += _mm_cvtsi128_si32(total_epi32);
+
+ return sum;
+}

 #endif //COMPILE_INTEL_AVX2