25.6 working optimization, ~50% faster than original

2024-11-23 18:14:06 +00:00 · 2018-06-25 17:06:16 +03:00 · 2018-06-25 17:06:16 +03:00 · 17babfffa4
parent 9dfd72628a
commit 17babfffa4
8 changed files with 2146 additions and 62 deletions
--- a/src/inter.c
+++ b/src/inter.c
@ -464,46 +464,21 @@ void kvz_inter_recon_bipred(const encoder_state_t * const state,
  hi_prec_buf_t* high_precision_rec1 = 0;
  if (hi_prec_chroma_rec0) high_precision_rec0 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH);
  if (hi_prec_chroma_rec1) high_precision_rec1 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH);
+
+
  //Reconstruct both predictors
  inter_recon_unipred(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0);
  if (!hi_prec_luma_rec0){
-    memcpy(temp_lcu_y, lcu->rec.y, sizeof(kvz_pixel) * 64 * 64);
+    memcpy(temp_lcu_y, lcu->rec.y, sizeof(kvz_pixel) * 64 * 64); // copy to temp_lcu_y
  }
  if (!hi_prec_chroma_rec0){
-    memcpy(temp_lcu_u, lcu->rec.u, sizeof(kvz_pixel) * 32 * 32);
-    memcpy(temp_lcu_v, lcu->rec.v, sizeof(kvz_pixel) * 32 * 32);
+    memcpy(temp_lcu_u, lcu->rec.u, sizeof(kvz_pixel) * 32 * 32); // copy to temp_lcu_u
+    memcpy(temp_lcu_v, lcu->rec.v, sizeof(kvz_pixel) * 32 * 32); // copy to temp_lcu_v
  }
  inter_recon_unipred(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1);

-  kvz_inter_recon_bipred_test(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, height, width, ypos, xpos, high_precision_rec0, high_precision_rec1, lcu);
-
- /*
  // After reconstruction, merge the predictors by taking an average of each pixel
-  for (temp_y = 0; temp_y < height; ++temp_y) {
-    int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
-    for (temp_x = 0; temp_x < width; ++temp_x) {
-      int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
-      int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-      int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-      lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
-    }
-
-  }
-
-  for (temp_y = 0; temp_y < height >> 1; ++temp_y) {
-    int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
-    for (temp_x = 0; temp_x < width >> 1; ++temp_x) {
-      int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
-      int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-      int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-      lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);
-
-      int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-      int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-      lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
-    }
-  }
-  */
+  kvz_inter_recon_bipred_generic(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, height, width, ypos, xpos, high_precision_rec0, high_precision_rec1, lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v);
 
  if (high_precision_rec0 != 0) kvz_hi_prec_buf_t_free(high_precision_rec0);
  if (high_precision_rec1 != 0) kvz_hi_prec_buf_t_free(high_precision_rec1);
--- a/src/strategies/avx2/picture-avx2.c
+++ b/src/strategies/avx2/picture-avx2.c
@ -25,6 +25,9 @@

 #if COMPILE_INTEL_AVX2
 #include <immintrin.h>
+#include <emmintrin.h>
+#include <mmintrin.h>
+#include <xmmintrin.h>
 #include <string.h>

 #include "kvazaar.h"
@ -714,6 +717,403 @@ static unsigned pixels_calc_ssd_avx2(const kvz_pixel *const ref, const kvz_pixel
  }
 }

+
+/*
+ * AVX2 version of the bi-prediction merge step.
+ *
+ * For every sample of the block the two predictors are averaged and the
+ * result is written to lcu->rec:
+ *
+ *     rec = saturate_u8((sample0 + sample1 + offset) >> shift)
+ *
+ * with shift = 15 - KVZ_BIT_DEPTH and offset = 1 << (shift - 1).
+ *
+ * hi_prec_luma_rec0 / hi_prec_luma_rec1
+ *     Nonzero: the luma samples of predictor 0 / 1 are read from
+ *     high_precision_rec0->y / high_precision_rec1->y.
+ *     Zero: they are read from temp_lcu_y / lcu->rec.y and shifted left by
+ *     14 - KVZ_BIT_DEPTH.
+ * hi_prec_chroma_rec0 / hi_prec_chroma_rec1
+ *     Same selection for the u and v planes (temp_lcu_u / temp_lcu_v for
+ *     predictor 0, lcu->rec.u / lcu->rec.v for predictor 1).
+ * height, width
+ *     Block size in luma samples. Chroma is processed for the first
+ *     height >> 1 rows and width >> 1 columns.
+ * ypos, xpos
+ *     Luma position of the block; wrapped into the LCU with
+ *     & (LCU_WIDTH - 1), and (pos >> 1) & (LCU_WIDTH_C - 1) for chroma.
+ * lcu
+ *     Holds predictor 1 on entry (when not high precision) and receives
+ *     the merged result.
+ *
+ * The pixel buffers are loaded and stored as bytes, i.e. this code treats
+ * kvz_pixel as an 8-bit type.
+ *
+ * Width handling: width 4 has its own path. All other widths are processed
+ * in steps of 8 samples; results are written once 8 (width 8), 16 (width 16)
+ * or 32 (any other width) samples have been gathered.
+ * NOTE(review): for widths that are not 4, 8, 16 or a multiple of 32 the last
+ * gathered samples of a row are never written - confirm such widths cannot
+ * reach this function.
+ */
+static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
+	const int hi_prec_luma_rec1,
+	const int hi_prec_chroma_rec0,
+	const int hi_prec_chroma_rec1,
+	const int height,
+	const int width,
+	const int ypos,
+	const int xpos,
+	const hi_prec_buf_t*high_precision_rec0,
+	const hi_prec_buf_t*high_precision_rec1,
+	lcu_t* lcu,
+	kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH],
+	kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C],
+	kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]) {
+
+	int y_in_lcu;
+	int x_in_lcu;
+
+	const int shift = 15 - KVZ_BIT_DEPTH;
+	const int offset = 1 << (shift - 1);
+	const int shift_left = 14 - KVZ_BIT_DEPTH;
+
+	const __m256i offset_epi32 = _mm256_set1_epi32(offset);
+	const __m128i offset_4 = _mm_set1_epi32(offset);
+	const __m256i temp_zeros_256 = _mm256_setzero_si256();
+	const __m128i temp_zeros_128 = _mm_setzero_si128();
+
+	// The 256-bit pack instructions work on each 128-bit lane separately, so
+	// packed dwords come out as {lane0 results, lane1 results}. This dword
+	// permutation interleaves the two lanes to restore sample order.
+	const __m256i idx = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+
+	__m256i temp_epi32;
+	__m256i temp_epi16;
+	__m256i temp_epi8;
+	__m256i temp_y_epi32, temp_u_epi32, temp_v_epi32;
+
+	__m256i temp_epi32_u = _mm256_setzero_si256();
+	__m256i temp_epi32_v = _mm256_setzero_si256();
+	__m256i temp_epi16_u = _mm256_setzero_si256();
+	__m256i temp_epi16_v = _mm256_setzero_si256();
+
+	__m256i sample0_epi32;
+	__m256i sample1_epi32;
+	__m256i final_epi8_256;
+
+	__m128i sample_epi32;
+	__m128i sample0_y_epi32, sample1_y_epi32, sample0_u_epi32, sample1_u_epi32, sample0_v_epi32, sample1_v_epi32;
+	__m128i final_epi8_128;
+
+	switch (width)
+	{
+
+	case 4: {
+
+		for (int temp_y = 0; temp_y < height; ++temp_y) {
+
+			y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+			x_in_lcu = ((xpos) & ((LCU_WIDTH)-1));
+			const int luma_pos = y_in_lcu * LCU_WIDTH + x_in_lcu;
+
+			// High-precision samples are sign-extended, matching the int16_t
+			// samples used by the generic implementation.
+			sample0_y_epi32 = hi_prec_luma_rec0 ?
+				_mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&(high_precision_rec0->y[luma_pos]))) :
+				_mm_slli_epi32(_mm_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(temp_lcu_y[luma_pos]))), shift_left);
+
+			sample1_y_epi32 = hi_prec_luma_rec1 ?
+				_mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&(high_precision_rec1->y[luma_pos]))) :
+				_mm_slli_epi32(_mm_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(lcu->rec.y[luma_pos]))), shift_left);
+
+			// (sample0 + sample1 + offset) >> shift
+			sample_epi32 = _mm_add_epi32(_mm_add_epi32(sample0_y_epi32, sample1_y_epi32), offset_4);
+			sample_epi32 = _mm_srai_epi32(sample_epi32, shift);
+
+			// Saturating pack 32-bit -> 16-bit -> 8-bit; the 4 results are the low 4 bytes
+			final_epi8_128 = _mm_packus_epi16(_mm_packus_epi32(sample_epi32, temp_zeros_128), temp_zeros_128);
+
+			const uint8_t *packed_y = (const uint8_t*)&final_epi8_128;
+
+			for (int i = 0; i < 4; i++) {
+				lcu->rec.y[luma_pos + i] = packed_y[i];
+			}
+
+			if (temp_y < height >> 1) {
+				y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+				x_in_lcu = (((xpos >> 1)) & (LCU_WIDTH_C - 1));
+				const int chroma_pos = y_in_lcu * LCU_WIDTH_C + x_in_lcu;
+
+				sample0_u_epi32 = hi_prec_chroma_rec0 ?
+					_mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&(high_precision_rec0->u[chroma_pos]))) :
+					_mm_slli_epi32(_mm_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(temp_lcu_u[chroma_pos]))), shift_left);
+
+				sample1_u_epi32 = hi_prec_chroma_rec1 ?
+					_mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&(high_precision_rec1->u[chroma_pos]))) :
+					_mm_slli_epi32(_mm_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(lcu->rec.u[chroma_pos]))), shift_left);
+
+				// (sample0 + sample1 + offset) >> shift
+				sample_epi32 = _mm_add_epi32(_mm_add_epi32(sample0_u_epi32, sample1_u_epi32), offset_4);
+				sample_epi32 = _mm_srai_epi32(sample_epi32, shift);
+
+				__m128i temp_u = _mm_packus_epi16(_mm_packus_epi32(sample_epi32, temp_zeros_128), temp_zeros_128);
+				const uint8_t *packed_u = (const uint8_t*)&temp_u;
+
+				sample0_v_epi32 = hi_prec_chroma_rec0 ?
+					_mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&(high_precision_rec0->v[chroma_pos]))) :
+					_mm_slli_epi32(_mm_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(temp_lcu_v[chroma_pos]))), shift_left);
+
+				sample1_v_epi32 = hi_prec_chroma_rec1 ?
+					_mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*)&(high_precision_rec1->v[chroma_pos]))) :
+					_mm_slli_epi32(_mm_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(lcu->rec.v[chroma_pos]))), shift_left);
+
+				// (sample0 + sample1 + offset) >> shift
+				sample_epi32 = _mm_add_epi32(_mm_add_epi32(sample0_v_epi32, sample1_v_epi32), offset_4);
+				sample_epi32 = _mm_srai_epi32(sample_epi32, shift);
+
+				__m128i temp_v = _mm_packus_epi16(_mm_packus_epi32(sample_epi32, temp_zeros_128), temp_zeros_128);
+				const uint8_t *packed_v = (const uint8_t*)&temp_v;
+
+				// Chroma is half the luma width: 2 samples per row
+				for (int i = 0; i < 2; i++) {
+					lcu->rec.u[chroma_pos + i] = packed_u[i];
+					lcu->rec.v[chroma_pos + i] = packed_v[i];
+				}
+			}
+		}
+		break;
+	}
+
+	default: {
+		// The braces give the declarations below their own block; C11 does
+		// not allow a declaration to directly follow a case/default label.
+		int start_point = 0;
+		int start_point_uv = 0;
+
+		for (int temp_y = 0; temp_y < height; temp_y += 1) {
+			temp_epi32 = _mm256_setzero_si256();
+			temp_epi16 = _mm256_setzero_si256();
+			int temp = 0;     // number of 8-sample luma groups gathered so far
+			int temp_uv = 0;  // number of 8-sample chroma groups gathered so far
+
+			for (int temp_x = 0; temp_x < width; temp_x += 8) {
+
+				y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+				x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+				const int luma_pos = y_in_lcu * LCU_WIDTH + x_in_lcu;
+
+				// Load 8 samples per predictor and widen them to 32 bits.
+				// Unaligned loads are used because nothing here guarantees
+				// 16-byte alignment of the addressed samples.
+				sample0_epi32 = hi_prec_luma_rec0 ?
+					_mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&(high_precision_rec0->y[luma_pos]))) :
+					_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(temp_lcu_y[luma_pos]))), shift_left);
+
+				sample1_epi32 = hi_prec_luma_rec1 ?
+					_mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&(high_precision_rec1->y[luma_pos]))) :
+					_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(lcu->rec.y[luma_pos]))), shift_left);
+
+				// (sample0 + sample1 + offset) >> shift
+				temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
+				temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32);
+				temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift);
+
+				switch (width)
+				{
+				case 8:
+
+					// Pack 32-bit -> 8-bit, restore sample order, store 8 bytes
+					temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_y_epi32, temp_zeros_256), temp_zeros_256);
+					temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
+					final_epi8_128 = _mm256_castsi256_si128(temp_epi8);
+
+					_mm_storel_epi64((__m128i*)&(lcu->rec.y[luma_pos]), final_epi8_128);
+
+					break;
+
+				case 16:
+
+					if (temp == 0) {
+
+						// First half of the row: keep it and remember where the row starts
+						temp_epi32 = temp_y_epi32;
+						temp++;
+
+						start_point = luma_pos;
+					}
+
+					else if (temp == 1) {
+
+						// Second half: pack both halves 32-bit -> 8-bit and store 16 bytes
+						temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_epi32, temp_y_epi32), temp_zeros_256);
+						temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
+
+						_mm_storeu_si128((__m128i*)&(lcu->rec.y[start_point]), _mm256_castsi256_si128(temp_epi8));
+
+						temp = 0;
+					}
+
+					break;
+
+				default:
+					// Gather four groups of 8 samples, then store 32 bytes at once
+					if (temp == 0) {
+
+						temp_epi32 = temp_y_epi32;
+						temp++;
+
+						start_point = luma_pos;
+					}
+
+					else if (temp == 1) {
+
+						// Pack groups 0 and 1 from 32-bit to 16-bit
+						temp_epi16 = _mm256_packus_epi32(temp_epi32, temp_y_epi32);
+						temp++;
+					}
+
+					else if (temp == 2) {
+						temp_epi32 = temp_y_epi32;
+						temp++;
+					}
+
+					else {
+
+						// Pack groups 2 and 3 to 16-bit, then all four groups to 8-bit
+						temp_epi8 = _mm256_packus_epi16(temp_epi16, _mm256_packus_epi32(temp_epi32, temp_y_epi32));
+
+						// Restore sample order before storing
+						final_epi8_256 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
+
+						_mm256_storeu_si256((__m256i*)&(lcu->rec.y[start_point]), final_epi8_256);
+						temp = 0;
+					}
+				}
+
+				if (temp_x < width >> 1 && temp_y < height >> 1) {
+					y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+					x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+					const int chroma_pos = y_in_lcu * LCU_WIDTH_C + x_in_lcu;
+
+					sample0_epi32 = hi_prec_chroma_rec0 ?
+						_mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&(high_precision_rec0->u[chroma_pos]))) :
+						_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(temp_lcu_u[chroma_pos]))), shift_left);
+
+					sample1_epi32 = hi_prec_chroma_rec1 ?
+						_mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&(high_precision_rec1->u[chroma_pos]))) :
+						_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(lcu->rec.u[chroma_pos]))), shift_left);
+
+					// (sample0 + sample1 + offset) >> shift
+					// The lanes are 32 bits wide, so the shift must be the 32-bit one.
+					temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
+					temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32);
+					temp_u_epi32 = _mm256_srai_epi32(temp_u_epi32, shift);
+
+					sample0_epi32 = hi_prec_chroma_rec0 ?
+						_mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&(high_precision_rec0->v[chroma_pos]))) :
+						_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(temp_lcu_v[chroma_pos]))), shift_left);
+
+					sample1_epi32 = hi_prec_chroma_rec1 ?
+						_mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&(high_precision_rec1->v[chroma_pos]))) :
+						_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i*)&(lcu->rec.v[chroma_pos]))), shift_left);
+
+					// (sample0 + sample1 + offset) >> shift
+					temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
+					temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32);
+					temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift);
+
+					switch (width) {
+
+					case 8: {
+						// Chroma width is 4: after packing, the first four samples
+						// are the low 4 bytes, so no permutation is needed.
+						__m256i temp_epi8u = _mm256_packus_epi16(_mm256_packus_epi32(temp_u_epi32, temp_zeros_256), temp_zeros_256);
+						__m256i temp_epi8v = _mm256_packus_epi16(_mm256_packus_epi32(temp_v_epi32, temp_zeros_256), temp_zeros_256);
+
+						const uint8_t *packed_u = (const uint8_t*)&temp_epi8u;
+						const uint8_t *packed_v = (const uint8_t*)&temp_epi8v;
+
+						for (int i = 0; i < 4; i++) {
+							lcu->rec.u[chroma_pos + i] = packed_u[i];
+							lcu->rec.v[chroma_pos + i] = packed_v[i];
+						}
+
+						break;
+					}
+
+					case 16:
+
+						// Chroma width is 8: pack, restore order and store 8 bytes per plane
+						temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_u_epi32, temp_zeros_256), temp_zeros_256);
+						temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
+						_mm_storel_epi64((__m128i*)&(lcu->rec.u[chroma_pos]), _mm256_castsi256_si128(temp_epi8));
+
+						temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_v_epi32, temp_zeros_256), temp_zeros_256);
+						temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
+						_mm_storel_epi64((__m128i*)&(lcu->rec.v[chroma_pos]), _mm256_castsi256_si128(temp_epi8));
+
+						break;
+
+					case 32:
+
+						if (temp_uv == 0) {
+
+							// First half of the chroma row: keep it and remember the row start
+							temp_epi32_u = temp_u_epi32;
+							temp_epi32_v = temp_v_epi32;
+
+							start_point_uv = chroma_pos;
+
+							temp_uv++;
+						}
+
+						else {
+
+							// Second half: pack both halves 32-bit -> 8-bit and store 16 bytes per plane
+							__m256i temp_epi8_u = _mm256_packus_epi16(_mm256_packus_epi32(temp_epi32_u, temp_u_epi32), temp_zeros_256);
+							__m256i temp_epi8_v = _mm256_packus_epi16(_mm256_packus_epi32(temp_epi32_v, temp_v_epi32), temp_zeros_256);
+
+							temp_epi8_u = _mm256_permutevar8x32_epi32(temp_epi8_u, idx);
+							temp_epi8_v = _mm256_permutevar8x32_epi32(temp_epi8_v, idx);
+
+							_mm_storeu_si128((__m128i*)&(lcu->rec.u[start_point_uv]), _mm256_castsi256_si128(temp_epi8_u));
+							_mm_storeu_si128((__m128i*)&(lcu->rec.v[start_point_uv]), _mm256_castsi256_si128(temp_epi8_v));
+
+							temp_uv = 0;
+						}
+						break;
+
+					default:
+						// Gather four groups of 8 chroma samples, then store 32 bytes per plane
+						if (temp_uv == 0) {
+
+							temp_epi32_u = temp_u_epi32;
+							temp_epi32_v = temp_v_epi32;
+
+							start_point_uv = chroma_pos;
+
+							temp_uv++;
+						}
+
+						else if (temp_uv == 1) {
+
+							// Pack groups 0 and 1 from 32-bit to 16-bit
+							temp_epi16_u = _mm256_packus_epi32(temp_epi32_u, temp_u_epi32);
+							temp_epi16_v = _mm256_packus_epi32(temp_epi32_v, temp_v_epi32);
+							temp_uv++;
+						}
+
+						else if (temp_uv == 2) {
+
+							temp_epi32_u = temp_u_epi32;
+							temp_epi32_v = temp_v_epi32;
+							temp_uv++;
+						}
+
+						else {
+							// Pack groups 2 and 3 to 16-bit, then all four groups to 8-bit
+							__m256i temp_epi8_u = _mm256_packus_epi16(temp_epi16_u, _mm256_packus_epi32(temp_epi32_u, temp_u_epi32));
+							__m256i temp_epi8_v = _mm256_packus_epi16(temp_epi16_v, _mm256_packus_epi32(temp_epi32_v, temp_v_epi32));
+
+							// Restore sample order before storing
+							final_epi8_256 = _mm256_permutevar8x32_epi32(temp_epi8_u, idx);
+							_mm256_storeu_si256((__m256i*)&(lcu->rec.u[start_point_uv]), final_epi8_256);
+
+							final_epi8_256 = _mm256_permutevar8x32_epi32(temp_epi8_v, idx);
+							_mm256_storeu_si256((__m256i*)&(lcu->rec.v[start_point_uv]), final_epi8_256);
+
+							temp_uv = 0;
+						}
+					}
+				}
+			}
+		}
+		break;
+	}
+
+	}
+}
+
 #endif //COMPILE_INTEL_AVX2


@ -746,6 +1146,8 @@ int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth)
    success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2);

    success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2);
+	success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2);
+
  }
 #endif
  return success;
--- a/src/strategies/generic/picture-generic.c
+++ b/src/strategies/generic/picture-generic.c
@ -539,53 +539,79 @@ static void inter_recon_bipred_generic(const int hi_prec_luma_rec0,
 	const int hi_prec_luma_rec1,
 	const int hi_prec_chroma_rec0,
 	const int hi_prec_chroma_rec1,
-	int height,
-	int width,
-	int ypos,
-	int xpos,
+	int32_t height,
+	int32_t width,
+	int32_t ypos,
+	int32_t xpos,
 	const hi_prec_buf_t*high_precision_rec0,
 	const hi_prec_buf_t*high_precision_rec1,
-	lcu_t* lcu) {
+	lcu_t* lcu,
+	kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH],
+	kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C],
+	kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]) {

-	kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH];
-	kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C];
-	kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C];
-
-	int temp_x, temp_y;
 	int shift = 15 - KVZ_BIT_DEPTH;
 	int offset = 1 << (shift - 1);
-	int y_in_lcu1;
-	int y_in_lcu2;
+
+	int y_in_lcu;
+	int x_in_lcu;


 	//After reconstruction, merge the predictors by taking an average of each pixel
-	for (temp_y = 0; temp_y < height; ++temp_y) {
-		y_in_lcu1 = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+	for (int temp_y = 0; temp_y < height; ++temp_y) {

-		for (temp_x = 0; temp_x < width; ++temp_x) {

-			int x_in_lcu1 = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+		for (int temp_x = 0; temp_x < width; ++temp_x) {
+			y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+			x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));

-			int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu1 * LCU_WIDTH + x_in_lcu1] : (temp_lcu_y[y_in_lcu1 * LCU_WIDTH + x_in_lcu1] << (14 - KVZ_BIT_DEPTH)));
-			int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu1 * LCU_WIDTH + x_in_lcu1] : (lcu->rec.y[y_in_lcu1 * LCU_WIDTH + x_in_lcu1] << (14 - KVZ_BIT_DEPTH)));
-			lcu->rec.y[y_in_lcu1 * LCU_WIDTH + x_in_lcu1] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
+			int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+			int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+
+			lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);

 			if (temp_x < width >> 1 && temp_y < height >> 1) {

-				y_in_lcu2 = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
-				int x_in_lcu2 = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+				y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+				x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));

-				int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu2 * LCU_WIDTH_C + x_in_lcu2] : (temp_lcu_u[y_in_lcu2 * LCU_WIDTH_C + x_in_lcu2] << (14 - KVZ_BIT_DEPTH)));
-				int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu2 * LCU_WIDTH_C + x_in_lcu2] : (lcu->rec.u[y_in_lcu2 * LCU_WIDTH_C + x_in_lcu2] << (14 - KVZ_BIT_DEPTH)));
-				lcu->rec.u[y_in_lcu2 * LCU_WIDTH_C + x_in_lcu2] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);
+				int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+				int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+				lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);

-				int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu2 * LCU_WIDTH_C + x_in_lcu2] : (temp_lcu_v[y_in_lcu2 * LCU_WIDTH_C + x_in_lcu2] << (14 - KVZ_BIT_DEPTH)));
-				int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu2 * LCU_WIDTH_C + x_in_lcu2] : (lcu->rec.v[y_in_lcu2 * LCU_WIDTH_C + x_in_lcu2] << (14 - KVZ_BIT_DEPTH)));
-				lcu->rec.v[y_in_lcu2 * LCU_WIDTH_C + x_in_lcu2] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
+				int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+				int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+				lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
 			}
 		}
+	}
+	/*
+	for (int temp_y = 0; temp_y < height; ++temp_y) {
+		int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+		for (int temp_x = 0; temp_x < width; ++temp_x) {
+			int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+			int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+			int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+			lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
+		}

 	}
+	for (int temp_y = 0; temp_y < height >> 1; ++temp_y) {
+		int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+		for (int temp_x = 0; temp_x < width >> 1; ++temp_x) {
+			int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+			int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+			int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+			lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);
+
+			int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+			int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+			lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
+
+
+
+		}
+	}*/
 }


--- a/src/strategies/strategies-picture.c
+++ b/src/strategies/strategies-picture.c
@ -61,7 +61,7 @@ cost_pixel_any_size_multi_func * kvz_satd_any_size_quad = 0;

 pixels_calc_ssd_func * kvz_pixels_calc_ssd = 0;

-inter_recon_bipred_func * kvz_inter_recon_bipred_test = 0;
+inter_recon_bipred_func * kvz_inter_recon_bipred_generic = 0;


 int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) {
--- a/src/strategies/strategies-picture.h
+++ b/src/strategies/strategies-picture.h
@ -124,7 +124,10 @@ typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0,
 	int xpos,
 	const hi_prec_buf_t*high_precision_rec0,
 	const hi_prec_buf_t*high_precision_rec1,
-	lcu_t* lcu);
+	lcu_t* lcu,
+	kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH],
+	kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C],
+	kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]);
 	
 	

@ -160,7 +163,7 @@ extern cost_pixel_any_size_multi_func *kvz_satd_any_size_quad;

 extern pixels_calc_ssd_func *kvz_pixels_calc_ssd;

-extern inter_recon_bipred_func * kvz_inter_recon_bipred_test;
+extern inter_recon_bipred_func * kvz_inter_recon_bipred_generic;

 int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth);
 cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n);
@ -192,7 +195,8 @@ cost_pixel_nxn_multi_func * kvz_pixels_get_sad_dual_func(unsigned n);
  {"satd_32x32_dual", (void**) &kvz_satd_32x32_dual}, \
  {"satd_64x64_dual", (void**) &kvz_satd_64x64_dual}, \
  {"satd_any_size_quad", (void**) &kvz_satd_any_size_quad}, \
-  {"inter_recon_bipred", (void**) &kvz_inter_recon_bipred_test}, \
+  {"pixels_calc_ssd", (void**) &kvz_pixels_calc_ssd}, \
+  {"inter_recon_bipred", (void**) &kvz_inter_recon_bipred_generic}, \



--- a/src/strategies/x86_asm/x86inc.asm
+++ b/src/strategies/x86_asm/x86inc.asm
--- a/tests/bipred_generic_tests.c
+++ b/tests/bipred_generic_tests.c
@ -0,0 +1,218 @@
+/*****************************************************************************
+* This file is part of Kvazaar HEVC encoder.
+*
+* Copyright (C) 2017 Tampere University of Technology and others (see
+* COPYING file).
+*
+* Kvazaar is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Lesser General Public License version 2.1 as
+* published by the Free Software Foundation.
+*
+* Kvazaar is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+****************************************************************************/
+
+#include "greatest/greatest.h"
+
+#include "test_strategies.h"
+#include "strategies/generic/picture-generic.h"
+#include <string.h>
+#include <stdlib.h>
+
+
+// Reference output: setup() copies lcu1 into it and then averages the block in place.
+static lcu_t expected_test_result;
+// Output buffer of the test; the test starts by copying lcu1 into it.
+static lcu_t result;
+
+// Input LCU; setup() fills its rec.y/u/v planes with rand() % 256 values.
+static lcu_t lcu1;
+
+// Not referenced in the visible part of this file.
+int temp1, temp2, temp3, temp4;
+
+// Only referenced from commented-out expressions in setup().
+int16_t mv_param[2][2] = { { 7,7 },{ 7,7 } };
+// Size and position of the block that setup() and the test iterate over.
+int width = 4;
+int height = 4;
+int xpos = 0;
+int ypos = 0;
+
+// Pixels of the first predictor; setup() fills them with rand() % 256 values.
+kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH];
+kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C];
+kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C];
+
+// High-precision selectors; setup() sets all four to 0.
+int hi_prec_luma_rec0;
+int hi_prec_luma_rec1;
+int hi_prec_chroma_rec0;
+int hi_prec_chroma_rec1;
+
+// High-precision buffers; setup() allocates one only when the matching
+// chroma selector is nonzero, so they stay null with the current setup().
+hi_prec_buf_t* high_precision_rec0 = 0;
+hi_prec_buf_t* high_precision_rec1 = 0;
+
+// Loop counters shared by setup() and the test.
+int temp_x, temp_y;
+
+
+
+static void setup()
+{
+	// Prepare the inputs: clear the working LCU, then fill it and the
+	// first-predictor buffers with pseudo-random 8-bit values.
+	memset(lcu1.rec.y, 0, sizeof(kvz_pixel) * 64 * 64);
+	memset(lcu1.rec.u, 0, sizeof(kvz_pixel) * 32 * 32);
+	memset(lcu1.rec.v, 0, sizeof(kvz_pixel) * 32 * 32);
+
+	for (int px = 0; px < LCU_WIDTH*LCU_WIDTH; px++) {
+		temp_lcu_y[px] = rand() %256;
+		lcu1.rec.y[px] = rand() % 256;
+	}
+
+	for (int px = 0; px < LCU_WIDTH_C*LCU_WIDTH_C; px++) {
+		temp_lcu_u[px] = rand() % 256;
+		temp_lcu_v[px] = rand() % 256;
+		lcu1.rec.v[px] = rand() % 256;
+		lcu1.rec.u[px] = rand() % 256;
+	}
+
+	// The expected result starts out as a copy of the input LCU and is then
+	// averaged in place below.
+	memset(expected_test_result.rec.y, 0, sizeof(kvz_pixel) * 64 * 64);
+	memset(expected_test_result.rec.u, 0, sizeof(kvz_pixel) * 32 * 32);
+	memset(expected_test_result.rec.v, 0, sizeof(kvz_pixel) * 32 * 32);
+
+	memcpy(expected_test_result.rec.y, lcu1.rec.y, sizeof(kvz_pixel) * 64 * 64);
+	memcpy(expected_test_result.rec.u, lcu1.rec.u, sizeof(kvz_pixel) * 32 * 32);
+	memcpy(expected_test_result.rec.v, lcu1.rec.v, sizeof(kvz_pixel) * 32 * 32);
+
+	// Reference computation: plain scalar averaging with separate luma and
+	// chroma loops.
+	int shift = 15 - KVZ_BIT_DEPTH;
+	int offset = 1 << (shift - 1);
+	const int up_shift = 14 - KVZ_BIT_DEPTH;
+
+	// All high-precision selectors are forced off; mv_param is not consulted.
+	hi_prec_luma_rec0 = 0;
+	hi_prec_luma_rec1 = 0;
+	hi_prec_chroma_rec0 = 0;
+	hi_prec_chroma_rec1 = 0;
+
+	if (hi_prec_chroma_rec0) {
+		high_precision_rec0 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH);
+	}
+	if (hi_prec_chroma_rec1) {
+		high_precision_rec1 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH);
+	}
+
+	// Luma: full-resolution block.
+	for (temp_y = 0; temp_y < height; ++temp_y) {
+		int row = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+		for (temp_x = 0; temp_x < width; ++temp_x) {
+			int col = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+			int pos = row * LCU_WIDTH + col;
+
+			int16_t pred0_y;
+			int16_t pred1_y;
+			if (hi_prec_luma_rec0) {
+				pred0_y = high_precision_rec0->y[pos];
+			} else {
+				pred0_y = temp_lcu_y[pos] << up_shift;
+			}
+			if (hi_prec_luma_rec1) {
+				pred1_y = high_precision_rec1->y[pos];
+			} else {
+				pred1_y = expected_test_result.rec.y[pos] << up_shift;
+			}
+			expected_test_result.rec.y[pos] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((pred0_y + pred1_y + offset) >> shift);
+		}
+	}
+
+	// Chroma: half resolution in both directions, u and v handled together.
+	for (temp_y = 0; temp_y < height >> 1; ++temp_y) {
+		int row = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+		for (temp_x = 0; temp_x < width >> 1; ++temp_x) {
+			int col = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+			int pos = row * LCU_WIDTH_C + col;
+
+			int16_t pred0_u;
+			int16_t pred1_u;
+			if (hi_prec_chroma_rec0) {
+				pred0_u = high_precision_rec0->u[pos];
+			} else {
+				pred0_u = temp_lcu_u[pos] << up_shift;
+			}
+			if (hi_prec_chroma_rec1) {
+				pred1_u = high_precision_rec1->u[pos];
+			} else {
+				pred1_u = expected_test_result.rec.u[pos] << up_shift;
+			}
+			expected_test_result.rec.u[pos] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((pred0_u + pred1_u + offset) >> shift);
+
+			int16_t pred0_v;
+			int16_t pred1_v;
+			if (hi_prec_chroma_rec0) {
+				pred0_v = high_precision_rec0->v[pos];
+			} else {
+				pred0_v = temp_lcu_v[pos] << up_shift;
+			}
+			if (hi_prec_chroma_rec1) {
+				pred1_v = high_precision_rec1->v[pos];
+			} else {
+				pred1_v = expected_test_result.rec.v[pos] << up_shift;
+			}
+			expected_test_result.rec.v[pos] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((pred0_v + pred1_v + offset) >> shift);
+		}
+	}
+}
+
+
+// Checks the strategy currently assigned to kvz_inter_recon_bipred_generic
+// against the reference values stored in expected_test_result.
+//
+// The test copies the lcu1 reconstruction planes into `result`, runs the
+// strategy on `result` with temp_lcu_y/u/v as the other predictor, and then
+// compares every luma and chroma sample of the block (width x height at
+// xpos, ypos) with the expected result. Fails on the first mismatch.
+TEST test_inter_recon_bipred_generic()
+{
+	// Every run starts from the lcu1 planes. The copies overwrite the whole
+	// 64x64 luma and 32x32 chroma areas, so no separate zeroing is needed.
+	memcpy(result.rec.y, lcu1.rec.y, sizeof(kvz_pixel) * 64 * 64);
+	memcpy(result.rec.u, lcu1.rec.u, sizeof(kvz_pixel) * 32 * 32);
+	memcpy(result.rec.v, lcu1.rec.v, sizeof(kvz_pixel) * 32 * 32);
+
+	// Run the strategy under test; it writes its output into result.rec.
+	kvz_inter_recon_bipred_generic(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, width, height, xpos, ypos, high_precision_rec0, high_precision_rec1, &result, temp_lcu_y, temp_lcu_u, temp_lcu_v);
+
+	// Luma: compare each sample of the block, wrapping coordinates to the LCU.
+	for (temp_y = 0; temp_y < height; ++temp_y) {
+		int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
+		for (temp_x = 0; temp_x < width; ++temp_x) {
+			int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+			ASSERT_EQ_FMT(expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu], result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu], "%d");
+		}
+	}
+
+	// Chroma: the block position and size are halved in both directions
+	// before indexing the U and V planes.
+	for (temp_y = 0; temp_y < height >> 1; ++temp_y) {
+		int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
+		for (temp_x = 0; temp_x < width >> 1; ++temp_x) {
+			int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+			ASSERT_EQ_FMT(expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu], result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu], "%d");
+			ASSERT_EQ_FMT(expected_test_result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu], result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu], "%d");
+		}
+	}
+
+	PASS();
+}
+
+// Test suite for bi-prediction reconstruction. After a single setup() call,
+// it walks through every entry in `strategies` and runs
+// test_inter_recon_bipred_generic once for each entry whose type string is
+// "inter_recon_bipred", with that entry's function pointer installed as
+// kvz_inter_recon_bipred_generic.
+SUITE(bipred_generic_tests)
+{
+	setup();
+
+	for (volatile int idx = 0; idx < strategies.count; ++idx) {
+		// Entries of any other strategy type are ignored.
+		if (strcmp(strategies.strategies[idx].type, "inter_recon_bipred") == 0) {
+			kvz_inter_recon_bipred_generic = strategies.strategies[idx].fptr;
+			RUN_TEST(test_inter_recon_bipred_generic);
+		}
+	}
+}
--- a/tests/tests_main.c
+++ b/tests/tests_main.c
@ -32,6 +32,7 @@ extern SUITE(dct_tests);

 extern SUITE(coeff_sum_tests);
 extern SUITE(mv_cand_tests);
+extern SUITE(bipred_generic_tests);

 int main(int argc, char **argv)
 {
@ -57,5 +58,7 @@ int main(int argc, char **argv)

  RUN_SUITE(mv_cand_tests);

+  RUN_SUITE(bipred_generic_tests);
+
  GREATEST_MAIN_END();
 }