some more optimization for bipred

This commit is contained in:
Reima Hyvönen 2018-07-11 11:27:54 +03:00
parent 9a339eef89
commit cc064da143
3 changed files with 10 additions and 25 deletions

View file

@ -84,11 +84,7 @@
</Lib> </Lib>
<YASM> <YASM>
<Defines>ARCH_X86_64=1;%(Defines)</Defines> <Defines>ARCH_X86_64=1;%(Defines)</Defines>
<<<<<<< HEAD
<IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86_asm;</IncludePaths> <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86_asm;</IncludePaths>
=======
<IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths>
>>>>>>> cbb5b20449e091471e8608616b30f3b199b29bfd
</YASM> </YASM>
<ClCompile> <ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)..\..\pthreads.2\include;$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories);$(SolutionDir)..\src\strategies;</AdditionalIncludeDirectories> <AdditionalIncludeDirectories>$(SolutionDir)..\..\pthreads.2\include;$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories);$(SolutionDir)..\src\strategies;</AdditionalIncludeDirectories>
@ -97,11 +93,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<YASM> <YASM>
<Defines>ARCH_X86_64=0;PREFIX</Defines> <Defines>ARCH_X86_64=0;PREFIX</Defines>
<<<<<<< HEAD
<IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86_asm;</IncludePaths> <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86_asm;</IncludePaths>
=======
<IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths>
>>>>>>> cbb5b20449e091471e8608616b30f3b199b29bfd
</YASM> </YASM>
<Lib> <Lib>
<AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x86</AdditionalLibraryDirectories> <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x86</AdditionalLibraryDirectories>
@ -116,11 +108,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<YASM> <YASM>
<Defines>ARCH_X86_64=0;PREFIX</Defines> <Defines>ARCH_X86_64=0;PREFIX</Defines>
<<<<<<< HEAD
<IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86_asm;</IncludePaths> <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86_asm;</IncludePaths>
=======
<IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths>
>>>>>>> cbb5b20449e091471e8608616b30f3b199b29bfd
</YASM> </YASM>
<Lib> <Lib>
<AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x86</AdditionalLibraryDirectories> <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x86</AdditionalLibraryDirectories>
@ -135,11 +123,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<YASM> <YASM>
<Defines>ARCH_X86_64=1;%(Defines)</Defines> <Defines>ARCH_X86_64=1;%(Defines)</Defines>
<<<<<<< HEAD
<IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86_asm;</IncludePaths> <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86_asm;</IncludePaths>
=======
<IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths>
>>>>>>> cbb5b20449e091471e8608616b30f3b199b29bfd
</YASM> </YASM>
<Lib> <Lib>
<AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x64</AdditionalLibraryDirectories> <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x64</AdditionalLibraryDirectories>

View file

@ -939,11 +939,12 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
case 16: case 16:
temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_y_epi16, temp_y_epi16), 0b10011100); temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_y_epi16, temp_y_epi16), 0b11011000);
_mm_storeu_si128((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); _mm_storeu_si128((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8));
break; break;
default: default:
if (temp == 0) { if (temp == 0) {
@ -954,7 +955,7 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
} }
else { else {
temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16, temp_y_epi16), 0b10011100); temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16, temp_y_epi16), 0b11011000);
// Store 256-bits of integer data into memory // Store 256-bits of integer data into memory
_mm256_storeu_si256((__m256i*)&(lcu->rec.y[start_point]), temp_epi8); _mm256_storeu_si256((__m256i*)&(lcu->rec.y[start_point]), temp_epi8);
@ -1036,10 +1037,10 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
case 32: case 32:
temp_epi8_u = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_u_epi16, temp_u_epi16), 0b10011100); temp_epi8_u = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_u_epi16, temp_u_epi16), 0b11011000);
_mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8_u)); _mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8_u));
temp_epi8_v = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_v_epi16, temp_v_epi16), 0b10011100); temp_epi8_v = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_v_epi16, temp_v_epi16), 0b11011000);
_mm_storeu_si128((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8_v)); _mm_storeu_si128((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8_v));
break; break;
@ -1058,11 +1059,11 @@ static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
} }
else { else {
temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_u, temp_u_epi16), 0b10011100); temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_u, temp_u_epi16), 0b11011000);
_mm256_storeu_si256((__m256i*)&(lcu->rec.u[start_point_uv]), temp_epi8); _mm256_storeu_si256((__m256i*)&(lcu->rec.u[start_point_uv]), temp_epi8);
temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_v, temp_v_epi16), 0b10011100); temp_epi8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(temp_epi16_v, temp_v_epi16), 0b11011000);
_mm256_storeu_si256((__m256i*)&(lcu->rec.v[start_point_uv]), temp_epi8); _mm256_storeu_si256((__m256i*)&(lcu->rec.v[start_point_uv]), temp_epi8);

View file

@ -33,8 +33,8 @@ static lcu_t lcu1;
int temp1, temp2, temp3, temp4; int temp1, temp2, temp3, temp4;
int16_t mv_param[2][2] = { { 3,3 },{ 3,3 } }; int16_t mv_param[2][2] = { { 3,3 },{ 3,3 } };
int width = 64; int width = 32;
int height = 64; int height = 32;
int xpos = 0; int xpos = 0;
int ypos = 0; int ypos = 0;