Trying to find the bug in luma

2024-11-23 18:14:06 +00:00 · 2018-10-11 18:08:41 +03:00 · 2018-10-11 18:08:41 +03:00 · 381e786e10
parent 2f5f81bac3
commit 381e786e10
2 changed files with 348 additions and 88 deletions
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@ -887,6 +887,320 @@ static void inter_recon_bipred_no_mov_avx2(

 }

+static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
+ const int hi_prec_luma_rec1,
+ const int hi_prec_chroma_rec0,
+ const int hi_prec_chroma_rec1,
+ const int height,
+ const int width,
+ const int ypos,
+ const int xpos,
+ const hi_prec_buf_t*high_precision_rec0,
+ const hi_prec_buf_t*high_precision_rec1,
+ lcu_t* lcu,
+ kvz_pixel* temp_lcu_y,
+ kvz_pixel* temp_lcu_u,
+ kvz_pixel* temp_lcu_v)
+{
+ if (hi_prec_luma_rec0 == 0 && hi_prec_luma_rec1 == 0 && hi_prec_chroma_rec0 == 0 && hi_prec_chroma_rec1 == 0)
+ {
+  inter_recon_bipred_no_mov_avx2(height, width, ypos, xpos, high_precision_rec0, high_precision_rec1, lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v);
+ }
+
+ else
+ {
+  
+  int y_in_lcu, x_in_lcu;
+  int shift = 15 - KVZ_BIT_DEPTH;
+  int offset = 1 << (shift - 1);
+  int8_t shift_left = 14 - KVZ_BIT_DEPTH;
+
+  __m256i temp_epi32_y, temp_epi8, temp_y_epi32, sample0_epi32, sample1_epi32, temp_epi16;
+
+  int32_t * pointer = 0;
+
+  __m256i offset_epi32 = _mm256_set1_epi32(offset);
+  /*
+  printf("%d ", hi_prec_luma_rec0);
+  for (int temp_y = 0; temp_y < height; ++temp_y) {
+   int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+   for (int temp_x = 0; temp_x < width; ++temp_x) {
+    int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+    int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+    int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+    uint8_t luku = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
+    lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = luku;
+   }
+  }
+  */
+  
+  for (int temp_y = 0; temp_y < height; ++temp_y) {
+   temp_epi32_y = _mm256_setzero_si256();
+
+
+   y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+
+
+   for (int temp_x = 0; temp_x < width; temp_x += 8) {
+    x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+
+
+    switch (width)
+    {
+
+    case 4:
+
+     // Load total of 4 elements from memory to vector
+
+     sample0_epi32 = hi_prec_luma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+      _mm256_slli_epi32(_mm256_cvtepi8_epi32(_mm_cvtsi32_si128(*(int32_t*)&(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
+
+     sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+      _mm256_slli_epi32(_mm256_cvtepi8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
+
+
+     // (sample1 + sample2 + offset)>>shift 
+     temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
+
+     temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32);
+
+     temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift);
+
+     // Pack the bits from 32-bit to 8-bit
+     temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32);
+     temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
+     temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);
+
+
+     pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]);
+     *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8));
+     break;
+
+    default:
+
+     // Load total of 8 elements from memory to vector
+     sample0_epi32 = hi_prec_luma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+      _mm256_slli_epi32(_mm256_cvtepi8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), 14 - KVZ_BIT_DEPTH);
+
+     sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
+      _mm256_slli_epi32(_mm256_cvtepi8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), 14 - KVZ_BIT_DEPTH);
+
+
+     // (sample1 + sample2 + offset)>>shift 
+     temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
+
+     temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32);
+
+     temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift);
+
+     // Pack the bits from 32-bit to 8-bit
+     temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32);
+     temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
+     temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);
+
+
+     // Store 64-bits from vector to memory
+     _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
+
+     break;
+    }
+   }
+  }
+  
+
+  __m256i sample0_epi16, sample1_epi16;
+  int temp_uv = 0;
+
+  int start_point_uv = 0;
+
+  __m256i temp_u_epi16 = _mm256_setzero_si256();
+  __m256i temp_v_epi16 = _mm256_setzero_si256();
+
+  __m256i offset_epi16 = _mm256_set1_epi16(offset);
+  __m256i temp_epi16_u = _mm256_setzero_si256();
+  __m256i temp_epi16_v = _mm256_setzero_si256();
+
+  for (int temp_y = 0; temp_y < height >> 1; ++temp_y) {
+   int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+   for (int temp_x = 0; temp_x < width >> 1; temp_x+=16) {
+    int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+    switch (width) {
+
+    case 8:
+
+     sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     // (sample1 + sample2 + offset)>>shift 
+     temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+
+     temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
+
+     temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
+
+     sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+
+     // (sample1 + sample2 + offset)>>shift 
+     temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+
+     temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);
+
+     temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
+
+     temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
+
+     pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]);
+     *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8));
+
+     temp_epi8 = _mm256_packus_epi16(temp_v_epi16, temp_v_epi16);
+
+     pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]);
+     *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8));
+
+     break;
+
+    case 16:
+
+     sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     // (sample1 + sample2 + offset)>>shift 
+     temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+
+     temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
+
+     temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
+
+     sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+
+     // (sample1 + sample2 + offset)>>shift 
+     temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+
+     temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);
+
+     temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
+
+     temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);
+
+     // Store 64-bit integer into memory
+     _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
+
+
+     temp_epi8 = _mm256_packus_epi16(temp_v_epi16, temp_v_epi16);
+
+     // Store 64-bit integer into memory
+     _mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
+
+     break;
+
+    case 32:
+
+     sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     // (sample1 + sample2 + offset)>>shift 
+     temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+
+     temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
+
+     temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
+
+     sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     // (sample1 + sample2 + offset)>>shift 
+     temp_v_epi16 = _mm256_adds_epu16(sample0_epi16, sample1_epi16);
+
+     temp_v_epi16 = _mm256_adds_epu16(temp_v_epi16, offset_epi16);
+
+     temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
+
+     temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_u_epi16, temp_u_epi16), _MM_SHUFFLE(3, 1, 2, 0));
+     _mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
+
+     temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_v_epi16, temp_v_epi16), _MM_SHUFFLE(3, 1, 2, 0));
+     _mm_storeu_si128((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
+
+     break;
+
+    default:
+
+     sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     // (sample1 + sample2 + offset)>>shift 
+     temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+     temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
+     temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
+
+     sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     sample1_epi16 = hi_prec_chroma_rec1 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
+      _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
+
+     // (sample1 + sample2 + offset)>>shift 
+     temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+     temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);
+     temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
+
+     if (temp_uv == 0) {
+
+      // Store to temporary vector
+      temp_epi16_u = temp_u_epi16;
+      temp_epi16_v = temp_v_epi16;
+
+      // Save starting point to memory
+      start_point_uv = (y_in_lcu)* LCU_WIDTH_C + x_in_lcu;
+
+      temp_uv++;
+     }
+
+     else {
+      temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_u, temp_u_epi16), _MM_SHUFFLE(3, 1, 2, 0));
+
+      _mm256_storeu_si256((__m256i*)&(lcu->rec.u[start_point_uv]), temp_epi8);
+
+      temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_v, temp_v_epi16), _MM_SHUFFLE(3, 1, 2, 0));
+
+      _mm256_storeu_si256((__m256i*)&(lcu->rec.v[start_point_uv]), temp_epi8);
+
+      temp_uv = 0;
+
+     }
+     break;
+    }
+   }
+  }
+ }
+}
+
+/*
 static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
 	const int hi_prec_luma_rec1,
 	const int hi_prec_chroma_rec0,
@ -919,8 +1233,6 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
  __m256i temp_u_epi16 = _mm256_setzero_si256();
  __m256i temp_v_epi16 = _mm256_setzero_si256();
  __m256i max_values = _mm256_setzero_si256();
-  __m256i compare_epi16 = _mm256_setzero_si256();
-  __m256i and_epi16 = _mm256_set1_epi16(-1);

  int start_point = 0;
  int start_point_uv = 0;
@ -960,14 +1272,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
     max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);

     // (sample1 + sample2 + offset)>>shift 
-     temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+     temp_y_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);

-     temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
-
-     compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_y_epi16), max_values);
+     temp_y_epi16 = _mm256_adds_epi16(temp_y_epi16, offset_epi16);

     temp_y_epi16 = _mm256_srli_epi16(temp_y_epi16, shift);
-     temp_y_epi16 = _mm256_and_si256(temp_y_epi16, _mm256_subs_epi16(and_epi16, compare_epi16));

     // Pack the bits from 16-bit to 8-bit
     temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
@ -989,17 +1298,13 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
     sample1_epi16 = hi_prec_luma_rec1 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
      _mm256_slli_epi16(_mm256_cvtepu8_epi16((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);

-     max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);

     // (sample1 + sample2 + offset)>>shift 
     temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);

-     temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
-
-     compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_y_epi16), max_values);
-     
+     temp_y_epi16 = _mm256_adds_epu16(temp_y_epi16, offset_epi16);
+    
     temp_y_epi16 = _mm256_srli_epi16(temp_y_epi16, shift);
-     temp_y_epi16 = _mm256_and_si256(temp_y_epi16, _mm256_subs_epi16(and_epi16, compare_epi16));


     // Pack the bits from 16-bit to 8-bit
@ -1023,14 +1328,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
     max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);

     // (sample1 + sample2 + offset)>>shift 
-     temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
-
-     temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
-
-     compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_y_epi16), max_values);
+     temp_y_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);

+     temp_y_epi16 = _mm256_adds_epi16(temp_y_epi16, offset_epi16);
+     
     temp_y_epi16 = _mm256_srli_epi16(temp_y_epi16, shift);
-     temp_y_epi16 = _mm256_and_si256(temp_y_epi16, _mm256_subs_epi16(and_epi16, compare_epi16));

     // Pack the bits from 16-bit to 8-bit
     temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
@ -1054,14 +1356,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
     max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);

     // (sample1 + sample2 + offset)>>shift 
-     temp_y_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+     temp_y_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);

-     temp_y_epi16 = _mm256_add_epi16(temp_y_epi16, offset_epi16);
-
-     compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_y_epi16), max_values);
+     temp_y_epi16 = _mm256_adds_epi16(temp_y_epi16, offset_epi16);

     temp_y_epi16 = _mm256_srli_epi16(temp_y_epi16, shift);
-     temp_y_epi16 = _mm256_and_si256(temp_y_epi16, _mm256_subs_epi16(and_epi16, compare_epi16));

     // Pack the bits from 16-bit to 8-bit
     temp_epi8 = _mm256_packus_epi16(temp_y_epi16, temp_y_epi16);
@ -1103,15 +1402,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
      max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);

      // (sample1 + sample2 + offset)>>shift 
-      temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+      temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);

-      temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
-
-      compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_u_epi16), max_values);
+      temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);

      temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
-      temp_u_epi16 = _mm256_and_si256(temp_u_epi16, _mm256_subs_epu16(and_epi16, compare_epi16));
-

      sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
       _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
@ -1122,15 +1417,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
      max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);

      // (sample1 + sample2 + offset)>>shift 
-      temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+      temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);

-      temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
-
-      compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_v_epi16), max_values);
+      temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);

      temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
-      temp_v_epi16 = _mm256_and_si256(temp_v_epi16, _mm256_subs_epu16(and_epi16, compare_epi16));
-

      temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);

@ -1155,15 +1446,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
      max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);

      // (sample1 + sample2 + offset)>>shift 
-      temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+      temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);

-      temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
-
-      compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_u_epi16), max_values);
+      temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);

      temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
-      temp_u_epi16 = _mm256_and_si256(temp_u_epi16, _mm256_subs_epu16(and_epi16, compare_epi16));
-

      sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
       _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
@ -1174,14 +1461,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
      max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);

      // (sample1 + sample2 + offset)>>shift 
-      temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+      temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);

-      temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
-
-      compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_v_epi16), max_values);
+      temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);

      temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
-      temp_v_epi16 = _mm256_and_si256(temp_v_epi16, _mm256_subs_epu16(and_epi16, compare_epi16));

      temp_epi8 = _mm256_packus_epi16(temp_u_epi16, temp_u_epi16);

@ -1207,15 +1491,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
      max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);

      // (sample1 + sample2 + offset)>>shift 
-      temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+      temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);

-      temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
-
-      compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_u_epi16), max_values);
+      temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);

      temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);
-      temp_u_epi16 = _mm256_and_si256(temp_u_epi16, _mm256_subs_epu16(and_epi16, compare_epi16));
-

      sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_castsi128_si256(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
       _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
@ -1226,14 +1506,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
      max_values = _mm256_max_epu16(sample0_epi16, sample1_epi16);

      // (sample1 + sample2 + offset)>>shift 
-      temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
+      temp_v_epi16 = _mm256_adds_epu16(sample0_epi16, sample1_epi16);

-      temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
-
-      compare_epi16 = _mm256_cmpeq_epi16(_mm256_max_epu16(max_values, temp_v_epi16), max_values);
+      temp_v_epi16 = _mm256_adds_epu16(temp_v_epi16, offset_epi16);

      temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);
-      temp_v_epi16 = _mm256_and_si256(temp_v_epi16, _mm256_subs_epu16(and_epi16, compare_epi16));

      temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_u_epi16, temp_u_epi16), _MM_SHUFFLE(3, 1, 2, 0));
      _mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
@ -1252,8 +1529,8 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
       _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);

      // (sample1 + sample2 + offset)>>shift 
-      temp_u_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
-      temp_u_epi16 = _mm256_add_epi16(temp_u_epi16, offset_epi16);
+      temp_u_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+      temp_u_epi16 = _mm256_adds_epi16(temp_u_epi16, offset_epi16);
      temp_u_epi16 = _mm256_srli_epi16(temp_u_epi16, shift);

      sample0_epi16 = hi_prec_chroma_rec0 ? _mm256_loadu_si256((__m256i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])) :
@ -1263,8 +1540,8 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
       _mm256_slli_epi16(_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);

      // (sample1 + sample2 + offset)>>shift 
-      temp_v_epi16 = _mm256_add_epi16(sample0_epi16, sample1_epi16);
-      temp_v_epi16 = _mm256_add_epi16(temp_v_epi16, offset_epi16);
+      temp_v_epi16 = _mm256_adds_epi16(sample0_epi16, sample1_epi16);
+      temp_v_epi16 = _mm256_adds_epi16(temp_v_epi16, offset_epi16);
      temp_v_epi16 = _mm256_srli_epi16(temp_v_epi16, shift);

      if (temp_uv == 0) {
@ -1302,7 +1579,7 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,

 }
 }
-
+*/
 #endif //COMPILE_INTEL_AVX2


--- a/tests/inter_recon_bipred_tests.c
+++ b/tests/inter_recon_bipred_tests.c
@ -32,9 +32,9 @@ static lcu_t lcu1;

 int temp1, temp2, temp3, temp4;

-int16_t mv_param[2][2] = { { 7,7 },{ 7,7 } };
-int width = 8;
-int height = 8;
+int16_t mv_param[2][2] = { { 3,3 },{ 3,3 } };
+int width = 16;
+int height = 16;
 int xpos = 0;
 int ypos = 0;

@ -125,27 +125,10 @@ TEST test_inter_recon_bipred()
 	memcpy(result.rec.y, lcu1.rec.y, sizeof(kvz_pixel) * 64 * 64);
 	memcpy(result.rec.u, lcu1.rec.u, sizeof(kvz_pixel) * 32 * 32);
 	memcpy(result.rec.v, lcu1.rec.v, sizeof(kvz_pixel) * 32 * 32);
- /*
- for (temp_y = 0; temp_y < height; ++temp_y) {
-  int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
-  for (temp_x = 0; temp_x < width; temp_x += 1) {
-   int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
-   printf("%d ", expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]);
-  }
- }
- printf("\n");
- /*
- for (temp_y = 0; temp_y < height >> 1; ++temp_y) {
-  int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
-  for (temp_x = 0; temp_x < width >> 1; ++temp_x) {
-   int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
-   printf("%d ", expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]);
-  }
- }
- printf("\n");*/
+
 	
 	kvz_inter_recon_bipred_blend(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, width, height, xpos, ypos, high_precision_rec0, high_precision_rec1, &result, temp_lcu_y, temp_lcu_u, temp_lcu_v); 
- /*
+ 
 for (temp_y = 0; temp_y < height; ++temp_y) {
  int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
  for (temp_x = 0; temp_x < width; temp_x += 1) {
@ -154,8 +137,8 @@ TEST test_inter_recon_bipred()
  }
 }
 printf("\n");
- */
-
+ 
+ /*
 for (temp_y = 0; temp_y < height >> 1; ++temp_y) {
  int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
  for (temp_x = 0; temp_x < width >> 1; ++temp_x) {
@ -164,7 +147,7 @@ TEST test_inter_recon_bipred()
  }
 }
 printf("\n");
- 
+ */

 	for (temp_y = 0; temp_y < height; ++temp_y) {
 		int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));