Toimiva ratkaisu

This commit is contained in:
Reima Hyvönen 2018-07-03 11:18:51 +03:00
parent 17babfffa4
commit ea83ae45f0
6 changed files with 483 additions and 277 deletions

View file

@ -21,9 +21,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_cli", "kvazaar_cli\
EndProjectSection
EndProject
Global
GlobalSection(Performance) = preSolution
HasPerformanceSessions = true
EndGlobalSection
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
Debug|x64 = Debug|x64
@ -70,4 +67,28 @@ Global
GlobalSection(Performance) = preSolution
HasPerformanceSessions = true
EndGlobalSection
GlobalSection(Performance) = preSolution
HasPerformanceSessions = true
EndGlobalSection
GlobalSection(Performance) = preSolution
HasPerformanceSessions = true
EndGlobalSection
GlobalSection(Performance) = preSolution
HasPerformanceSessions = true
EndGlobalSection
GlobalSection(Performance) = preSolution
HasPerformanceSessions = true
EndGlobalSection
GlobalSection(Performance) = preSolution
HasPerformanceSessions = true
EndGlobalSection
GlobalSection(Performance) = preSolution
HasPerformanceSessions = true
EndGlobalSection
GlobalSection(Performance) = preSolution
HasPerformanceSessions = true
EndGlobalSection
GlobalSection(Performance) = preSolution
HasPerformanceSessions = true
EndGlobalSection
EndGlobal

View file

@ -98,6 +98,7 @@
</ProjectReference>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\tests\bipred_generic_tests.c" />
<ClCompile Include="..\..\tests\coeff_sum_tests.c" />
<ClCompile Include="..\..\tests\dct_tests.c" />
<ClCompile Include="..\..\tests\test_strategies.c" />

View file

@ -42,6 +42,9 @@
<ClCompile Include="..\..\tests\coeff_sum_tests.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\tests\bipred_generic_tests.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\tests\sad_tests.h">

View file

@ -731,387 +731,545 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
lcu_t* lcu,
kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH],
kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C],
kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]) {
kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C])
{
int y_in_lcu;
int x_in_lcu;
int y_in_lcu;
int x_in_lcu;
int shift = 15 - KVZ_BIT_DEPTH;
int offset = 1 << (shift-1);
int shift_left = 14 - KVZ_BIT_DEPTH;
int shift = 15 - KVZ_BIT_DEPTH;
int offset = 1 << (shift-1);
int shift_left = 14 - KVZ_BIT_DEPTH;
__m256i temp_epi32;
__m256i temp_epi16;
__m256i offset_epi32 = _mm256_set1_epi32(offset);
__m256i temp_epi32;
__m256i temp_epi16;
__m256i temp_epi8;
__m256i temp_y_epi32, temp_u_epi32, temp_v_epi32;
__m256i temp_epi8;
__m256i temp_zeros_256 = _mm256_setzero_si256();
__m256i temp_y_epi32, temp_u_epi32, temp_v_epi32;
__m256i temp_epi32_u = _mm256_setzero_si256();
__m256i temp_epi32_v = _mm256_setzero_si256();
__m256i temp_epi32_u = _mm256_setzero_si256();
__m256i temp_epi32_v = _mm256_setzero_si256();
__m256i sample0_epi32;
__m256i sample1_epi32;
__m256i sample0_epi32;
__m256i sample1_epi32;
__m256i temp_epi8_u, temp_epi8_v;
__m256i temp_epi16_u = _mm256_setzero_si256();
__m256i temp_epi16_v = _mm256_setzero_si256();
__m128i offset_4 = _mm_set1_epi32(offset);
__m128i sample_epi32;
__m128i sample0_y_epi32, sample1_y_epi32, sample0_u_epi32, sample1_u_epi32, sample0_v_epi32, sample1_v_epi32;
__m256i final_epi8_256 = _mm256_setzero_si256();
__m128i final_epi8_128;
__m128i offset_4 = _mm_set1_epi32(offset);
__m128i sample_epi32;
__m128i sample0_y_epi32, sample1_y_epi32, sample0_u_epi32, sample1_u_epi32, sample0_v_epi32, sample1_v_epi32;
__m128i temp_zeros_128 = _mm_setzero_si128();
__m256i idx = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
switch (width)
{
__m128i final_epi8_128;
case 4:
x_in_lcu = ((xpos) & ((LCU_WIDTH)-1));
switch (width)
{
for (int temp_y = 0; temp_y < height; ++temp_y) {
y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
case 4:
sample0_y_epi32 = hi_prec_luma_rec0 ? _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)&(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
_mm_slli_epi32(_mm_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
for (int temp_y = 0; temp_y < height; ++temp_y) {
sample1_y_epi32 = hi_prec_luma_rec1 ? _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)&(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
_mm_slli_epi32(_mm_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
x_in_lcu = ((xpos) & ((LCU_WIDTH)-1));
// (sample1 + sample2 + offset)>>shift
sample_epi32 = _mm_add_epi32(_mm_add_epi32(sample0_y_epi32, sample1_y_epi32), offset_4);
sample_epi32 = _mm_srai_epi32(sample_epi32, shift);
final_epi8_128 = _mm_packus_epi16(_mm_packus_epi32(sample_epi32, sample_epi32), sample_epi32);
sample0_y_epi32 = hi_prec_luma_rec0 ? _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)&(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
_mm_slli_epi32(_mm_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
int8_t*temp_int_y = (int8_t*)&final_epi8_128;
sample1_y_epi32 = hi_prec_luma_rec1 ? _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)&(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
_mm_slli_epi32(_mm_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
for (int i = 0; i < 4; i++) {
lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu + i] = temp_int_y[i];
}
//lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = _mm_cvtsi128_si32(final_epi8_128);
// (sample1 + sample2 + offset)>>shift
sample_epi32 = _mm_add_epi32(_mm_add_epi32(sample0_y_epi32, sample1_y_epi32), offset_4);
sample_epi32 = _mm_srai_epi32(sample_epi32, shift);
final_epi8_128 = _mm_packus_epi16(_mm_packus_epi32(sample_epi32, temp_zeros_128), temp_zeros_128);
if (temp_y < height >> 1) {
int8_t*temp_int_y = (int8_t*)&final_epi8_128;
y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
x_in_lcu = (((xpos >> 1)) & (LCU_WIDTH_C - 1));
for (int i = 0; i < 4; i++) {
lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu + i] = temp_int_y[i];
}
sample0_u_epi32 = hi_prec_chroma_rec0 ? _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)&(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
_mm_slli_epi32(_mm_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))), shift_left);
sample1_u_epi32 = hi_prec_chroma_rec1 ? _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)&(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
_mm_slli_epi32(_mm_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))), shift_left);
if (temp_y < height >> 1) {
y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
x_in_lcu = (((xpos >> 1)) & (LCU_WIDTH_C - 1));
// (sample1 + sample2 + offset)>>shift
sample_epi32 = _mm_add_epi32(_mm_add_epi32(sample0_u_epi32, sample1_u_epi32), offset_4);
sample_epi32 = _mm_srai_epi32(sample_epi32, shift);
sample0_u_epi32 = hi_prec_chroma_rec0 ? _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)&(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
_mm_slli_epi32(_mm_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))), shift_left);
__m128i temp_u = _mm_packus_epi16(_mm_packus_epi32(sample_epi32, sample_epi32), sample_epi32);
int8_t*temp_int_u = (int8_t*)&temp_u;
sample1_u_epi32 = hi_prec_chroma_rec1 ? _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)&(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
_mm_slli_epi32(_mm_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))), shift_left);
sample0_v_epi32 = hi_prec_chroma_rec0 ? _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)&(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
_mm_slli_epi32(_mm_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))), shift_left);
// (sample1 + sample2 + offset)>>shift
sample_epi32 = _mm_add_epi32(_mm_add_epi32(sample0_u_epi32, sample1_u_epi32), offset_4);
sample_epi32 = _mm_srai_epi32(sample_epi32, shift);
sample1_v_epi32 = hi_prec_chroma_rec1 ? _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)&(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
_mm_slli_epi32(_mm_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))), shift_left);
__m128i temp_u = _mm_packus_epi16(_mm_packus_epi32(sample_epi32, temp_zeros_128), temp_zeros_128);
int8_t*temp_int_u = (int8_t*)&temp_u;
// (sample1 + sample2 + offset)>>shift
sample_epi32 = _mm_add_epi32(_mm_add_epi32(sample0_v_epi32, sample1_v_epi32), offset_4);
sample_epi32 = _mm_srai_epi32(sample_epi32, shift);
sample0_v_epi32 = hi_prec_chroma_rec0 ? _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)&(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
_mm_slli_epi32(_mm_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))), shift_left);
__m128i temp_v = _mm_packus_epi16(_mm_packus_epi32(sample_epi32, sample_epi32), sample_epi32);
int8_t*temp_int_v = (int8_t*)&temp_v;
sample1_v_epi32 = hi_prec_chroma_rec1 ? _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)&(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) :
_mm_slli_epi32(_mm_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))), shift_left);
for (int i = 0; i < 2; i++)
{
lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu + i] = temp_int_u[i];
lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu + i] = temp_int_v[i];
}
}
}
break;
// (sample1 + sample2 + offset)>>shift
sample_epi32 = _mm_add_epi32(_mm_add_epi32(sample0_v_epi32, sample1_v_epi32), offset_4);
sample_epi32 = _mm_srai_epi32(sample_epi32, shift);
__m128i temp_v = _mm_packus_epi16(_mm_packus_epi32(sample_epi32, temp_zeros_128), temp_zeros_128);
int8_t*temp_int_v = (int8_t*)&temp_v;
default:
for (int i = 0; i < 2; i++) {
lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu + i] = temp_int_u[i];
lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu + i] = temp_int_v[i];
int start_point = 0;
int start_point_uv = 0;
}
}
if (hi_prec_luma_rec0 ==0 && hi_prec_luma_rec1 == 0 && hi_prec_chroma_rec0 == 0 && hi_prec_chroma_rec1 == 0)
{
for (int temp_y = 0; temp_y < height; temp_y += 1) {
temp_epi32 = _mm256_setzero_si256();
temp_epi16 = _mm256_setzero_si256();
int temp = 0;
int temp_uv = 0;
y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
}
break;
for (int temp_x = 0; temp_x < width; temp_x += 16) {
default:
x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
int start_point = 0;
int start_point_uv = 0;
sample0_epi32 = (_mm256_cvtepu8_epi16((_mm_loadu_si128((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))));
for (int temp_y = 0; temp_y < height; temp_y += 1) {
temp_epi32 = _mm256_setzero_si256();
temp_epi16 = _mm256_setzero_si256();
int temp = 0;
int temp_uv = 0;
sample1_epi32 = (_mm256_cvtepu8_epi16((_mm_loadu_si128((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))));
temp_y_epi32 = _mm256_avg_epu16(sample0_epi32, sample1_epi32);
for (int temp_x = 0; temp_x < width; temp_x += 8) {
switch (width)
{
case 8:
// Pack the bits from 16-bit to 8-bit
temp_epi8 = _mm256_packus_epi16(temp_y_epi32, temp_y_epi32);
y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
// Store 64-bits from vector to memory
_mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
// Load total of 8 elements from memory to vector and convert all to 32-bit
sample0_epi32 = hi_prec_luma_rec0 ? (_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu])))) :
_mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
break;
sample1_epi32 = hi_prec_luma_rec1 ? (_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu])))) :
_mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
case 16:
// (sample1 + sample2 + offset)>>shift
temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32);
temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift);
// Pack the bits from 32-bit to 8-bit
temp_epi8 = _mm256_packus_epi16(temp_y_epi32, temp_y_epi32);
switch (width)
{
case 8:
temp_epi8 = _mm256_permute4x64_epi64(temp_epi8, 216);
// Pack the bits from 32-bit to 8-bit
temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_y_epi32, temp_zeros_256), temp_zeros_256);
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
final_epi8_128 = _mm_loadu_si128((__m128i*)&temp_epi8);
// Store 128-bit to memory
_mm_storeu_si128((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
// Store 64-bits from vector to memory
_mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), final_epi8_128);
break;
break;
default:
if (temp == 0) {
temp_epi32 = temp_y_epi32;
temp = 1;
start_point = y_in_lcu* LCU_WIDTH + x_in_lcu;
}
case 16:
else {
if (temp == 0) {
// Convert packed 16-bit integers to packed 8-bit integers and store result to vector
temp_epi8 = _mm256_packus_epi16(temp_epi32, temp_y_epi32);
// Store to temporary vector
temp_epi32 = temp_y_epi32;
temp++;
// Arrange the vector to right order before inserting it
temp_epi8 = _mm256_permute4x64_epi64(temp_epi8, 216);
// Save starting point to memory
start_point = (y_in_lcu)* LCU_WIDTH + x_in_lcu;
}
else if (temp == 1) {
// Store 256-bits of integer data into memory
_mm256_storeu_si256((__m256i*)&(lcu->rec.y[start_point]), temp_epi8);
temp = 0;
}
}
// Pack the bits from 32-bit to 8-bit
temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_epi32, temp_y_epi32), temp_zeros_256);
if (temp_x < width >> 1 && temp_y < height >> 1) {
y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
sample0_epi32 = (_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))));
// Fill 128 bit vector with packed data and store it to memory
__m128i final_epi8_16 = _mm_loadu_si128((__m128i*)&temp_epi8);
sample1_epi32 = (_mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))));
_mm_storeu_si128((__m128i*)&(lcu->rec.y[start_point]), final_epi8_16);
// (sample1 + sample2 + offset)>>shift
temp_u_epi32 = _mm256_avg_epu16(sample0_epi32, sample1_epi32);
temp = 0;
}
sample0_epi32 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])));
sample1_epi32 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])));
break;
// (sample1 + sample2 + offset)>>shift
temp_v_epi32 = _mm256_avg_epu16(sample0_epi32, sample1_epi32);
default:
if (temp == 0) {
temp_epi32 = temp_y_epi32;
temp++;
switch (width)
{
case 8:
start_point = y_in_lcu* LCU_WIDTH + x_in_lcu;
}
temp_epi8_u = _mm256_packus_epi16(temp_u_epi32, temp_u_epi32);
temp_epi8_v = _mm256_packus_epi16(temp_v_epi32, temp_v_epi32);
else if (temp == 1) {
int8_t*temp_int_u = (int8_t*)&temp_epi8_u;
int8_t*temp_int_v = (int8_t*)&temp_epi8_v;
// Convert packed 16-bit integers to packed 8-bit integers and store result to vector
temp_epi16 = _mm256_packus_epi32(temp_epi32, temp_y_epi32);
temp++;
}
for (int i = 0; i < 4; i++) {
lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + i] = temp_int_u[i];
lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + i] = temp_int_v[i];
}
else if (temp == 2) {
temp_epi32 = temp_y_epi32;
temp++;
}
break;
else {
case 16:
temp_epi8 = _mm256_packus_epi16(temp_u_epi32, temp_u_epi32);
// Convert packed 32-bit integers to packed 8-bit integers and store result to vector
temp_epi8 = _mm256_packus_epi16(temp_epi16, _mm256_packus_epi32(temp_epi32, temp_y_epi32));
// Store 64-bit integer into memory
_mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
// Arrange the vector to right order before inserting it
final_epi8_256 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
temp_epi8 = _mm256_packus_epi16(temp_v_epi32, temp_v_epi32);
// Store 64-bit integer into memory
_mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
// Store 256-bits of integer data into memory
_mm256_storeu_si256((__m256i*)&(lcu->rec.y[start_point]), final_epi8_256);
temp = 0;
}
}
break;
if (temp_x < width >> 1 && temp_y < height >> 1) {
y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
case 32:
sample0_epi32 = hi_prec_chroma_rec0 ? (_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))) :
_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
// Pack the bits from 32-bit to 8-bit
temp_epi8_u = _mm256_packus_epi16(temp_u_epi32, temp_u_epi32);
temp_epi8_v = _mm256_packus_epi16(temp_v_epi32, temp_v_epi32);
sample1_epi32 = hi_prec_chroma_rec1 ? (_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))) :
_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
temp_epi8_u = _mm256_permute4x64_epi64(temp_epi8_u, 216);
temp_epi8_v = _mm256_permute4x64_epi64(temp_epi8_v, 216);
// (sample1 + sample2 + offset)>>shift
temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32);
temp_u_epi32 = _mm256_srai_epi16(temp_u_epi32, shift);
// Fill 128 bit vector with packed data and store it to memory
_mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8_u));
sample0_epi32 = hi_prec_chroma_rec0 ? (_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))) :
_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
// Fill 128 bit vector with packed data and store it to memory
_mm_storeu_si128((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8_v));
sample1_epi32 = hi_prec_chroma_rec1 ? (_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))) :
_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
// (sample1 + sample2 + offset)>>shift
temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32);
temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift);
break;
default:
if (temp_uv == 0) {
switch (width) {
// Store to temporary vector
temp_epi32_u = temp_u_epi32;
temp_epi32_v = temp_v_epi32;
case 8:
// Save starting point to memory
start_point_uv = (y_in_lcu)* LCU_WIDTH_C + x_in_lcu;
__m256i temp_epi8u = _mm256_packus_epi16(_mm256_packus_epi32(temp_u_epi32, temp_zeros_256), temp_zeros_256);
temp_uv = 1;
}
int8_t *temp_int_8_u = (int8_t*)&temp_epi8u;
else {
// Pack 32 bit to 8 bit
temp_epi8_u = _mm256_packus_epi16(temp_epi32_u, temp_u_epi32);
temp_epi8_v = _mm256_packus_epi16(temp_epi32_v, temp_v_epi32);
__m256i temp_epi8v = _mm256_packus_epi16(_mm256_packus_epi32(temp_v_epi32, temp_zeros_256), temp_zeros_256);
// Arrange the vector to right order before inserting it
temp_epi8_u = _mm256_permute4x64_epi64(temp_epi8_u, 216);
int8_t *temp_int_8_v = (int8_t*)&temp_epi8v;
// Store 256-bits of integer data into memory
_mm256_storeu_si256((__m256i*)&(lcu->rec.u[start_point_uv]), temp_epi8_u);
// Arrange the vector to right order before inserting it
temp_epi8_v = _mm256_permute4x64_epi64(temp_epi8_v, 216);
// Store 256-bits of integer data into memory
_mm256_storeu_si256((__m256i*)&(lcu->rec.v[start_point_uv]), temp_epi8_v);
temp_uv = 0;
}
}
y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
}
}
}
}
for (int i = 0; i < 4; i++) {
lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + i] = temp_int_8_u[i];
lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + i] = temp_int_8_v[i];
}
else
{
for (int temp_y = 0; temp_y < height; temp_y += 1) {
temp_epi32 = _mm256_setzero_si256();
temp_epi16 = _mm256_setzero_si256();
int temp = 0;
int temp_uv = 0;
__m256i idx = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
__m256i offset_epi32 = _mm256_set1_epi32(offset);
__m256i zeros_256 = _mm256_setzero_si256();
__m256i temp_epi16_u = _mm256_setzero_si256();
__m256i temp_epi16_v = _mm256_setzero_si256();
break;
case 16:
for (int temp_x = 0; temp_x < width; temp_x += 8) {
temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_u_epi32, temp_zeros_256), temp_zeros_256);
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
final_epi8_128 = _mm_loadu_si128((__m128i*)&temp_epi8);
y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
// Store 64-bit integer into memory
_mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), final_epi8_128);
// Load total of 8 elements from memory to vector and convert all to 32-bit
sample0_epi32 = hi_prec_luma_rec0 ? (_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu])))) :
_mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_v_epi32, temp_zeros_256), temp_zeros_256);
sample1_epi32 = hi_prec_luma_rec1 ? (_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu])))) :
_mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), shift_left);
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
final_epi8_128 = _mm_loadu_si128((__m128i*)&temp_epi8);
// (sample1 + sample2 + offset)>>shift
temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32);
temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift);
// Store 64-bit integer into memory
_mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), final_epi8_128);
switch (width)
{
case 8:
break;
// Pack the bits from 32-bit to 8-bit
temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_y_epi32, zeros_256), zeros_256);
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
case 32:
// Store 64-bits from vector to memory
_mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
if (temp_uv == 0) {
break;
// Store to temporary vector
temp_epi32_u = temp_u_epi32;
temp_epi32_v = temp_v_epi32;
case 16:
// Save starting point to memory
start_point_uv = (y_in_lcu)* LCU_WIDTH_C + x_in_lcu;
if (temp == 0) {
temp_uv++;
}
// Store to temporary vector
temp_epi32 = temp_y_epi32;
temp++;
else{
// Save starting point to memory
start_point = (y_in_lcu)* LCU_WIDTH + x_in_lcu;
}
// Pack the bits from 32-bit to 8-bit
__m256i temp_epi8_u = _mm256_packus_epi16(_mm256_packus_epi32(temp_epi32_u, temp_u_epi32), temp_zeros_256);
__m256i temp_epi8_v = _mm256_packus_epi16(_mm256_packus_epi32(temp_epi32_v, temp_v_epi32), temp_zeros_256);
else if (temp == 1) {
temp_epi8_u = _mm256_permutevar8x32_epi32(temp_epi8_u, idx);
temp_epi8_v = _mm256_permutevar8x32_epi32(temp_epi8_v, idx);
// Pack the bits from 32-bit to 8-bit
temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_epi32, temp_y_epi32), zeros_256);
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
// Fill 128 bit vector with packed data and store it to memory
__m128i final_epi8_u = _mm_loadu_si128((__m128i*)&temp_epi8_u);
_mm_storeu_si128((__m128i*)&(lcu->rec.u[start_point_uv]), final_epi8_u);
_mm_storeu_si128((__m128i*)&(lcu->rec.y[start_point]), _mm256_castsi256_si128(temp_epi8));
// Fill 128 bit vector with packed data and store it to memory
__m128i final_epi8_v = _mm_loadu_si128((__m128i*)&temp_epi8_v);
_mm_storeu_si128((__m128i*)&(lcu->rec.v[start_point_uv]), final_epi8_v);
temp = 0;
}
temp_uv = 0;
}
break;
default:
if (temp_uv == 0) {
break;
// Store to temporary vector
temp_epi32_u = temp_u_epi32;
temp_epi32_v = temp_v_epi32;
default:
if (temp == 0) {
// Save starting point to memory
start_point_uv = (y_in_lcu)* LCU_WIDTH_C + x_in_lcu;
temp_epi32 = temp_y_epi32;
temp++;
temp_uv++;
}
start_point = y_in_lcu* LCU_WIDTH + x_in_lcu;
}
else if (temp_uv == 1) {
else if (temp == 1) {
// Convert packed 16-bit integers to packed 8-bit integers and store result to vector
temp_epi16_u = _mm256_packus_epi32(temp_epi32_u, temp_u_epi32);
temp_epi16_v = _mm256_packus_epi32(temp_epi32_v, temp_v_epi32);
temp_uv++;
}
// Convert packed 16-bit integers to packed 8-bit integers and store result to vector
temp_epi16 = _mm256_packus_epi32(temp_epi32, temp_y_epi32);
temp++;
}
else if (temp_uv == 2) {
else if (temp == 2) {
temp_epi32 = temp_y_epi32;
temp++;
}
temp_epi32_u = temp_u_epi32;
temp_epi32_v = temp_v_epi32;
temp_uv++;
}
else {
else {
// Pack 32 bit to 8 bit
__m256i temp_epi8_u = _mm256_packus_epi16(temp_epi16_u, _mm256_packus_epi32(temp_epi32_u, temp_u_epi32));
__m256i temp_epi8_v = _mm256_packus_epi16(temp_epi16_v, _mm256_packus_epi32(temp_epi32_v, temp_v_epi32));
// Convert packed 32-bit integers to packed 8-bit integers and store result to vector
temp_epi8 = _mm256_packus_epi16(temp_epi16, _mm256_packus_epi32(temp_epi32, temp_y_epi32));
// Arrange the vector to right order before inserting it
final_epi8_256 = _mm256_permutevar8x32_epi32(temp_epi8_u, idx);
// Arrange the vector to right order before inserting it
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
// Store 256-bits of integer data into memory
_mm256_storeu_si256((__m256i*)&(lcu->rec.u[start_point_uv]), final_epi8_256);
// Arrange the vector to right order before inserting it
final_epi8_256 = _mm256_permutevar8x32_epi32(temp_epi8_v, idx);
// Store 256-bits of integer data into memory
_mm256_storeu_si256((__m256i*)&(lcu->rec.y[start_point]), temp_epi8);
temp = 0;
}
}
// Store 256-bits of integer data into memory
_mm256_storeu_si256((__m256i*)&(lcu->rec.v[start_point_uv]), final_epi8_256);
if (temp_x < width >> 1 && temp_y < height >> 1) {
y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
temp_uv = 0;
}
sample0_epi32 = hi_prec_chroma_rec0 ? (_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))) :
_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
sample1_epi32 = hi_prec_chroma_rec1 ? (_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))) :
_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
}
}
// (sample1 + sample2 + offset)>>shift
temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32);
temp_u_epi32 = _mm256_srai_epi16(temp_u_epi32, shift);
sample0_epi32 = hi_prec_chroma_rec0 ? (_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))) :
_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
}
}
sample1_epi32 = hi_prec_chroma_rec1 ? (_mm256_cvtepu16_epi32(_mm_load_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])))) :
_mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), shift_left);
}
// (sample1 + sample2 + offset)>>shift
temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32);
temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift);
switch (width) {
case 8:
__m256i temp_epi8_u = _mm256_packus_epi16(_mm256_packus_epi32(temp_u_epi32, zeros_256), zeros_256);
int8_t *temp_int_8_u = (int8_t*)&temp_epi8_u;
__m256i temp_epi8_v = _mm256_packus_epi16(_mm256_packus_epi32(temp_v_epi32, zeros_256), zeros_256);
int8_t *temp_int_8_v = (int8_t*)&temp_epi8_v;
for (int i = 0; i < 4; i++) {
lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + i] = temp_int_8_u[i];
lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu + i] = temp_int_8_v[i];
}
break;
case 16:
temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_u_epi32, zeros_256), zeros_256);
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
// Store 64-bit integer into memory
_mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_v_epi32, zeros_256), zeros_256);
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
// Store 64-bit integer into memory
_mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
break;
case 32:
if (temp_uv == 0) {
// Store to temporary vector
temp_epi32_u = temp_u_epi32;
temp_epi32_v = temp_v_epi32;
// Save starting point to memory
start_point_uv = (y_in_lcu)* LCU_WIDTH_C + x_in_lcu;
temp_uv++;
}
else {
// Pack the bits from 32-bit to 8-bit
temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_epi32_u, temp_u_epi32), zeros_256);
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
_mm_storeu_si128((__m128i*)&(lcu->rec.u[start_point_uv]), _mm256_castsi256_si128(temp_epi8));
temp_epi8 = _mm256_packus_epi16(_mm256_packus_epi32(temp_epi32_v, temp_v_epi32), zeros_256);
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
_mm_storeu_si128((__m128i*)&(lcu->rec.v[start_point_uv]), _mm256_castsi256_si128(temp_epi8));
temp_uv = 0;
}
break;
default:
if (temp_uv == 0) {
// Store to temporary vector
temp_epi32_u = temp_u_epi32;
temp_epi32_v = temp_v_epi32;
// Save starting point to memory
start_point_uv = (y_in_lcu)* LCU_WIDTH_C + x_in_lcu;
temp_uv++;
}
else if (temp_uv == 1) {
// Convert packed 16-bit integers to packed 8-bit integers and store result to vector
temp_epi16_u = _mm256_packus_epi32(temp_epi32_u, temp_u_epi32);
temp_epi16_v = _mm256_packus_epi32(temp_epi32_v, temp_v_epi32);
temp_uv++;
}
else if (temp_uv == 2) {
temp_epi32_u = temp_u_epi32;
temp_epi32_v = temp_v_epi32;
temp_uv++;
}
else {
// Pack 32 bit to 8 bit
temp_epi8 = _mm256_packus_epi16(temp_epi16_u, _mm256_packus_epi32(temp_epi32_u, temp_u_epi32));
// Arrange the vector to right order before inserting it
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
// Store 256-bits of integer data into memory
_mm256_storeu_si256((__m256i*)&(lcu->rec.u[start_point_uv]), temp_epi8);
temp_epi8 = _mm256_packus_epi16(temp_epi16_v, _mm256_packus_epi32(temp_epi32_v, temp_v_epi32));
// Arrange the vector to right order before inserting it
temp_epi8 = _mm256_permutevar8x32_epi32(temp_epi8, idx);
// Store 256-bits of integer data into memory
_mm256_storeu_si256((__m256i*)&(lcu->rec.v[start_point_uv]), temp_epi8);
temp_uv = 0;
}
}
}
}
}
}
}
}
#endif //COMPILE_INTEL_AVX2

View file

@ -555,7 +555,30 @@ static void inter_recon_bipred_generic(const int hi_prec_luma_rec0,
int y_in_lcu;
int x_in_lcu;
/*
y_in_lcu = ((ypos) & ((LCU_WIDTH)-1));
x_in_lcu = ((xpos) & ((LCU_WIDTH)-1));
int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
printf("%d ", sample0_y);
printf("\n");
printf("%d ", sample1_y);
printf("\n");
printf("%d ", shift);
printf("\n");
printf("%d ", offset);
printf("\n");
printf("%d ", (sample0_y+sample1_y+64) >> 7);
printf("\n");
printf("%d ", (20+15+1) >> 1);
printf("\n");*/
//After reconstruction, merge the predictors by taking an average of each pixel
for (int temp_y = 0; temp_y < height; ++temp_y) {

View file

@ -32,9 +32,9 @@ static lcu_t lcu1;
int temp1, temp2, temp3, temp4;
int16_t mv_param[2][2] = { { 7,7 },{ 7,7 } };
int width = 4;
int height = 4;
int16_t mv_param[2][2] = { { 3,3 },{ 3,3 } };
int width = 32;
int height = 32;
int xpos = 0;
int ypos = 0;
@ -89,11 +89,11 @@ static void setup()
int shift = 15 - KVZ_BIT_DEPTH;
int offset = 1 << (shift - 1);
hi_prec_luma_rec0 = 0;// mv_param[0][0] & 3 || mv_param[0][1] & 3;
hi_prec_luma_rec1 = 0;// mv_param[1][0] & 3 || mv_param[1][1] & 3;
hi_prec_luma_rec0 = 0;// mv_param[0][0] & 3 || mv_param[0][1] & 3;
hi_prec_luma_rec1 = 0;// mv_param[1][0] & 3 || mv_param[1][1] & 3;
hi_prec_chroma_rec0 = 0; // mv_param[0][0] & 7 || mv_param[0][1] & 7;
hi_prec_chroma_rec1 =0; // mv_param[1][0] & 7 || mv_param[1][1] & 7;
hi_prec_chroma_rec0 = mv_param[0][0] & 7 || mv_param[0][1] & 7;
hi_prec_chroma_rec1 = mv_param[1][0] & 7 || mv_param[1][1] & 7;
if (hi_prec_chroma_rec0) high_precision_rec0 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH);
if (hi_prec_chroma_rec1) high_precision_rec1 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH);